diff --git a/docs/doctor/articles/_TEMPLATE.md b/docs/doctor/articles/_TEMPLATE.md new file mode 100644 index 000000000..30a4397d0 --- /dev/null +++ b/docs/doctor/articles/_TEMPLATE.md @@ -0,0 +1,47 @@ +--- +checkId: check.. +plugin: stellaops.doctor. +severity: fail | warn | info +tags: [tag1, tag2] +--- +# + +## What It Checks + + +## Why It Matters + + +## Common Causes +- Cause 1 (specific: exact misconfiguration, missing file, wrong env var) +- Cause 2 +- Cause 3 + +## How to Fix + +### Docker Compose +```bash +# Step-by-step commands for docker-compose deployments +# Use exact env var names with __ separator +# Reference exact file paths relative to devops/compose/ +``` + +### Bare Metal / systemd +```bash +# Step-by-step commands for bare-metal / systemd deployments +# Reference exact config file paths (e.g., /etc/stellaops/appsettings.json) +``` + +### Kubernetes / Helm +```bash +# Step-by-step commands for Kubernetes/Helm deployments +# Reference exact Helm values, ConfigMap keys, Secret names +``` + +## Verification +```bash +stella doctor run --check check.. +``` + +## Related Checks +- `check.related.id` - brief explanation of relationship diff --git a/docs/doctor/articles/agent/capacity.md b/docs/doctor/articles/agent/capacity.md new file mode 100644 index 000000000..c2649c7a3 --- /dev/null +++ b/docs/doctor/articles/agent/capacity.md @@ -0,0 +1,86 @@ +--- +checkId: check.agent.capacity +plugin: stellaops.doctor.agent +severity: warn +tags: [agent, capacity, performance] +--- +# Agent Capacity + +## What It Checks + +Verifies that agents have sufficient capacity to handle incoming tasks. The check queries the agent store for the current tenant and categorizes agents by status: + +1. **Fail** if zero agents have `AgentStatus.Active` -- no agents are available to run tasks. +2. **Pass** if at least one active agent exists, reporting the active-vs-total count. + +Evidence collected: `ActiveAgents`, `TotalAgents`. 
+ +Thresholds defined in source (not yet wired to the simplified implementation): +- High utilization: >= 90% +- Warning utilization: >= 75% + +The check skips with a warning if the tenant ID is missing or unparseable. + +## Why It Matters + +When no active agents are available, the platform cannot execute deployment tasks, scans, or any agent-dispatched work. Releases stall, scan queues grow, and SLA timers expire silently. Detecting zero-capacity before a promotion attempt prevents failed deployments and on-call pages. + +## Common Causes + +- All agents are offline (host crash, network partition, maintenance window) +- No agents have been registered for this tenant +- Agents exist but are in `Revoked` or `Inactive` status and none remain `Active` +- Agent bootstrap was started but never completed + +## How to Fix + +### Docker Compose + +```bash +# Check agent container health +docker compose -f devops/compose/docker-compose.stella-ops.yml ps | grep agent + +# View agent container logs +docker compose -f devops/compose/docker-compose.stella-ops.yml logs agent --tail 100 + +# Restart agent container +docker compose -f devops/compose/docker-compose.stella-ops.yml restart agent +``` + +### Bare Metal / systemd + +```bash +# Check agent service status +systemctl status stella-agent + +# Restart agent service +sudo systemctl restart stella-agent + +# Bootstrap a new agent if none registered +stella agent bootstrap --name agent-01 --env production --platform linux +``` + +### Kubernetes / Helm + +```bash +# Check agent pods +kubectl get pods -l app.kubernetes.io/component=agent -n stellaops + +# Describe agent deployment +kubectl describe deployment stellaops-agent -n stellaops + +# Scale agent replicas +kubectl scale deployment stellaops-agent --replicas=2 -n stellaops +``` + +## Verification + +``` +stella doctor run --check check.agent.capacity +``` + +## Related Checks + +- `check.agent.heartbeat.freshness` -- agents may be registered but not sending heartbeats +- 
`check.agent.stale` -- agents offline for extended periods may need decommissioning +- `check.agent.resource.utilization` -- active agents may be resource-constrained diff --git a/docs/doctor/articles/agent/certificate-expiry.md b/docs/doctor/articles/agent/certificate-expiry.md new file mode 100644 index 000000000..4436bfde0 --- /dev/null +++ b/docs/doctor/articles/agent/certificate-expiry.md @@ -0,0 +1,95 @@ +--- +checkId: check.agent.certificate.expiry +plugin: stellaops.doctor.agent +severity: fail +tags: [agent, certificate, security, quick] +--- +# Agent Certificate Expiry + +## What It Checks + +Inspects the `CertificateExpiresAt` field on every non-revoked, non-inactive agent and classifies each into one of four buckets: + +1. **Expired** -- `CertificateExpiresAt` is in the past. Result: **Fail**. +2. **Critical** -- certificate expires within **1 day** (24 hours). Result: **Fail**. +3. **Warning** -- certificate expires within **7 days**. Result: **Warn**. +4. **Healthy** -- certificate has more than 7 days remaining. Result: **Pass**. + +The check short-circuits to the most severe bucket found. Evidence includes per-agent names with time-since-expiry or time-until-expiry, plus counts of `TotalActive`, `Expired`, `Critical`, and `Warning` agents. + +Agents whose `CertificateExpiresAt` is null or default are silently skipped (certificate info not available). If no active agents exist the check is skipped entirely. + +## Why It Matters + +Agent mTLS certificates authenticate the agent to the orchestrator. An expired certificate causes the agent to fail heartbeats, reject task assignments, and drop out of the fleet. In production this means deployments and scans silently stop being dispatched to that agent, potentially leaving environments unserviced. 
+ +## Common Causes + +- Certificate auto-renewal is disabled on the agent +- Agent was offline when renewal was due (missed the renewal window) +- Certificate authority is unreachable from the agent host +- Agent bootstrap was incomplete (certificate provisioned but auto-renewal not configured) +- Certificate renewal threshold not yet reached (warning-level) +- Certificate authority rate limiting prevented renewal (critical-level) + +## How to Fix + +### Docker Compose + +```bash +# Check certificate expiry for agent containers +docker compose -f devops/compose/docker-compose.stella-ops.yml exec agent \ + stella agent health --show-cert + +# Force certificate renewal +docker compose -f devops/compose/docker-compose.stella-ops.yml exec agent \ + stella agent renew-cert --force + +# Verify auto-renewal configuration +docker compose -f devops/compose/docker-compose.stella-ops.yml exec agent \ + stella agent config show | grep auto_renew +``` + +### Bare Metal / systemd + +```bash +# Force certificate renewal on an affected agent +stella agent renew-cert --agent-id <agent-id> --force + +# If agent is unreachable, re-bootstrap +stella agent bootstrap --name <agent-name> --env <environment> + +# Verify auto-renewal is enabled +stella agent config --agent-id <agent-id> | grep auto_renew + +# Check agent logs for renewal failures +stella agent logs --agent-id <agent-id> --level warn +``` + +### Kubernetes / Helm + +```bash +# Check cert expiry across agent pods +kubectl exec -it deploy/stellaops-agent -n stellaops -- \ + stella agent health --show-cert + +# Force renewal via pod exec +kubectl exec -it deploy/stellaops-agent -n stellaops -- \ + stella agent renew-cert --force + +# If using cert-manager, check Certificate resource +kubectl get certificate -n stellaops +kubectl describe certificate stellaops-agent-tls -n stellaops +``` + +## Verification + +``` +stella doctor run --check check.agent.certificate.expiry +``` + +## Related Checks + +- `check.agent.certificate.validity` -- verifies certificate chain of trust (not just 
expiry) +- `check.agent.heartbeat.freshness` -- expired certs cause heartbeat failures +- `check.agent.stale` -- agents with expired certs often show as stale diff --git a/docs/doctor/articles/agent/certificate-validity.md b/docs/doctor/articles/agent/certificate-validity.md new file mode 100644 index 000000000..9fd2f656a --- /dev/null +++ b/docs/doctor/articles/agent/certificate-validity.md @@ -0,0 +1,84 @@ +--- +checkId: check.agent.certificate.validity +plugin: stellaops.doctor.agent +severity: fail +tags: [agent, certificate, security] +--- +# Agent Certificate Validity + +## What It Checks + +Validates the full certificate chain of trust for agent mTLS certificates. The check is designed to verify: + +1. Certificate is signed by a trusted CA +2. Certificate chain is complete (no missing intermediates) +3. No revoked certificates in the chain (CRL/OCSP check) +4. Certificate subject matches the agent's registered identity + +**Current status:** implementation pending -- the check always returns Pass with a placeholder message. The framework and metadata are wired; the chain-validation logic is not yet connected. + +Evidence collected: none yet (pending implementation). + +The check requires `IAgentStore` to be registered in DI; otherwise it will not run. + +## Why It Matters + +A valid certificate expiry date (checked by `check.agent.certificate.expiry`) is necessary but not sufficient. An agent could present a non-expired certificate that was signed by an untrusted CA, has a broken chain, or has been revoked. Any of these conditions would allow an impersonating agent to receive task dispatches or exfiltrate deployment secrets. 
+ +## Common Causes + +- CA certificate rotated but agent still presents cert signed by old CA +- Intermediate certificate missing from agent's cert bundle +- Certificate revoked via CRL but agent not yet re-provisioned +- Agent identity mismatch after hostname change or migration + +## How to Fix + +### Docker Compose + +```bash +# Inspect agent certificate chain +docker compose -f devops/compose/docker-compose.stella-ops.yml exec agent \ + openssl x509 -in /etc/stellaops/agent/tls.crt -text -noout + +# Verify chain against CA bundle +docker compose -f devops/compose/docker-compose.stella-ops.yml exec agent \ + openssl verify -CAfile /etc/stellaops/ca/ca.crt /etc/stellaops/agent/tls.crt +``` + +### Bare Metal / systemd + +```bash +# Inspect agent certificate +openssl x509 -in /etc/stellaops/agent/tls.crt -text -noout + +# Verify certificate chain +openssl verify -CAfile /etc/stellaops/ca/ca.crt -untrusted /etc/stellaops/ca/intermediate.crt \ + /etc/stellaops/agent/tls.crt + +# Re-bootstrap if chain is broken +stella agent bootstrap --name --env +``` + +### Kubernetes / Helm + +```bash +# Check certificate in agent pod +kubectl exec -it deploy/stellaops-agent -n stellaops -- \ + openssl x509 -in /etc/stellaops/agent/tls.crt -text -noout + +# If using cert-manager, check CertificateRequest status +kubectl get certificaterequest -n stellaops +kubectl describe certificaterequest -n stellaops +``` + +## Verification + +``` +stella doctor run --check check.agent.certificate.validity +``` + +## Related Checks + +- `check.agent.certificate.expiry` -- checks expiry dates (complementary to chain validation) +- `check.agent.heartbeat.freshness` -- invalid certs prevent heartbeat communication diff --git a/docs/doctor/articles/agent/cluster-health.md b/docs/doctor/articles/agent/cluster-health.md new file mode 100644 index 000000000..dfe14c1d3 --- /dev/null +++ b/docs/doctor/articles/agent/cluster-health.md @@ -0,0 +1,97 @@ +--- +checkId: check.agent.cluster.health +plugin: 
stellaops.doctor.agent +severity: fail +tags: [agent, cluster, ha, resilience] +--- +# Agent Cluster Health + +## What It Checks + +Monitors the health of the agent cluster when clustering is enabled. The check only runs when the configuration key `Agent:Cluster:Enabled` is set to `true`. It is designed to verify: + +1. All cluster members are reachable +2. A leader is elected and healthy +3. State synchronization is working across members +4. Failover is possible if the current leader goes down + +**Current status:** implementation pending -- the check returns Skip with a placeholder message. The `CanRun` gate is functional (reads cluster config), but `RunAsync` does not yet perform cluster health probes. + +## Why It Matters + +In high-availability deployments, agents form a cluster to provide redundancy and automatic failover. If cluster health degrades -- members become unreachable, leader election fails, or state sync stalls -- task dispatch can stop entirely or produce split-brain scenarios where two agents execute the same task concurrently, leading to deployment conflicts. 
+ +## Common Causes + +- Network partition between cluster members +- Leader node crashed without triggering failover +- State sync backlog due to high task volume +- Clock skew between cluster members causing consensus protocol failures +- Insufficient cluster members for quorum (see `check.agent.cluster.quorum`) + +## How to Fix + +### Docker Compose + +```bash +# Check cluster member containers +docker compose -f devops/compose/docker-compose.stella-ops.yml ps | grep agent + +# View cluster-specific logs +docker compose -f devops/compose/docker-compose.stella-ops.yml logs agent --tail 200 | grep -i cluster + +# Restart all agent containers to force re-election +docker compose -f devops/compose/docker-compose.stella-ops.yml restart agent +``` + +Set clustering configuration in your `.env` or compose override: + +``` +AGENT__CLUSTER__ENABLED=true +AGENT__CLUSTER__MEMBERS=agent-1:8500,agent-2:8500,agent-3:8500 +``` + +### Bare Metal / systemd + +```bash +# Check cluster status +stella agent cluster status + +# View cluster member health +stella agent cluster members + +# Force leader re-election if leader is unhealthy +stella agent cluster elect --force + +# Restart agent to rejoin cluster +sudo systemctl restart stella-agent +``` + +### Kubernetes / Helm + +```bash +# Check agent StatefulSet pods +kubectl get pods -l app.kubernetes.io/component=agent -n stellaops + +# View cluster gossip logs +kubectl logs -l app.kubernetes.io/component=agent -n stellaops --tail=100 | grep -i cluster + +# Helm values for clustering +# agent: +# cluster: +# enabled: true +# replicas: 3 +helm upgrade stellaops stellaops/stellaops --set agent.cluster.enabled=true --set agent.cluster.replicas=3 +``` + +## Verification + +``` +stella doctor run --check check.agent.cluster.health +``` + +## Related Checks + +- `check.agent.cluster.quorum` -- verifies minimum members for consensus +- `check.agent.heartbeat.freshness` -- individual agent connectivity +- `check.agent.capacity` -- 
fleet-level task capacity diff --git a/docs/doctor/articles/agent/cluster-quorum.md b/docs/doctor/articles/agent/cluster-quorum.md new file mode 100644 index 000000000..598b6e36a --- /dev/null +++ b/docs/doctor/articles/agent/cluster-quorum.md @@ -0,0 +1,97 @@ +--- +checkId: check.agent.cluster.quorum +plugin: stellaops.doctor.agent +severity: fail +tags: [agent, cluster, quorum, ha] +--- +# Agent Cluster Quorum + +## What It Checks + +Verifies that the agent cluster has sufficient members online to maintain quorum for leader election and consensus operations. The check only runs when `Agent:Cluster:Enabled` is `true`. It is designed to verify: + +1. Minimum members are online (a strict majority -- floor(n/2) + 1 of n members, e.g. 2 of 3 -- or the configured minimum) +2. Leader election is possible with current membership +3. Split-brain prevention mechanisms are active + +**Current status:** implementation pending -- the check returns Skip with a placeholder message. The `CanRun` gate is functional (reads cluster config), but `RunAsync` does not yet query cluster membership. + +## Why It Matters + +Without quorum, the agent cluster cannot elect a leader, which means no task dispatch, no failover, and potentially a complete halt of agent-driven operations. Losing quorum is often the step before a full cluster outage. Monitoring quorum proactively allows operators to add members or fix partitions before the cluster becomes non-functional. 
+ +## Common Causes + +- Too many cluster members went offline simultaneously (maintenance, host failure) +- Network partition isolating a minority of members from the majority +- Cluster scaled down below quorum threshold +- New deployment removed members without draining them first + +## How to Fix + +### Docker Compose + +```bash +# Verify all agent containers are running +docker compose -f devops/compose/docker-compose.stella-ops.yml ps | grep agent + +# Scale agents to restore quorum (minimum 3 for quorum of 2) +docker compose -f devops/compose/docker-compose.stella-ops.yml up -d --scale agent=3 +``` + +Ensure the cluster quorum settings are correct in `.env`: + +``` +AGENT__CLUSTER__ENABLED=true +AGENT__CLUSTER__MINMEMBERS=2 +``` + +### Bare Metal / systemd + +```bash +# Check how many cluster members are online +stella agent cluster members --status online + +# If a member is down, restart it +ssh <member-host> 'sudo systemctl restart stella-agent' + +# Verify quorum status +stella agent cluster quorum +``` + +### Kubernetes / Helm + +```bash +# Check agent pod count vs desired +kubectl get statefulset stellaops-agent -n stellaops + +# Scale up if below quorum +kubectl scale statefulset stellaops-agent --replicas=3 -n stellaops + +# Check pod disruption budget +kubectl get pdb -n stellaops +``` + +Set a PodDisruptionBudget to prevent quorum loss during rollouts: + +```yaml +# values.yaml +agent: + cluster: + enabled: true + replicas: 3 + podDisruptionBudget: + minAvailable: 2 +``` + +## Verification + +``` +stella doctor run --check check.agent.cluster.quorum +``` + +## Related Checks + +- `check.agent.cluster.health` -- overall cluster health including leader and sync status +- `check.agent.capacity` -- even with quorum, capacity may be insufficient +- `check.agent.heartbeat.freshness` -- individual member connectivity diff --git a/docs/doctor/articles/agent/heartbeat-freshness.md b/docs/doctor/articles/agent/heartbeat-freshness.md new file mode 100644 index 000000000..eae05958d 
--- /dev/null +++ b/docs/doctor/articles/agent/heartbeat-freshness.md @@ -0,0 +1,104 @@ +--- +checkId: check.agent.heartbeat.freshness +plugin: stellaops.doctor.agent +severity: fail +tags: [agent, heartbeat, connectivity, quick] +--- +# Agent Heartbeat Freshness + +## What It Checks + +Queries all non-revoked, non-inactive agents for the current tenant and classifies each by the age of its last heartbeat: + +1. **Stale** (> 5 minutes since last heartbeat): Result is **Fail**. Evidence lists each stale agent with the time since its last heartbeat in minutes. +2. **Warning** (> 2 minutes but <= 5 minutes): Result is **Warn**. Evidence lists each delayed agent with time since heartbeat in seconds. +3. **Healthy** (<= 2 minutes): Result is **Pass**. + +If no active agents are registered, the check returns **Warn** with a prompt to bootstrap agents. If the tenant ID is missing, it warns about being unable to check. + +Evidence collected: `TotalActive`, `Stale` count, `Warning` count, `Healthy` count, per-agent names and heartbeat ages. + +## Why It Matters + +Heartbeats are the primary signal that an agent is alive and accepting work. A stale heartbeat means the agent has stopped communicating with the orchestrator -- it may have crashed, lost network connectivity, or had its mTLS certificate expire. Tasks dispatched to a stale agent will time out, and the lack of timely detection causes deployment delays and alert fatigue. 
+ +## Common Causes + +- Agent process has crashed or stopped +- Network connectivity issue between agent and orchestrator +- Firewall blocking agent heartbeat traffic (typically HTTPS on port 8443) +- Agent host is unreachable or powered off +- mTLS certificate has expired (see `check.agent.certificate.expiry`) +- Agent is under heavy load (warning-level) +- Network latency between agent and orchestrator (warning-level) +- Agent is processing long-running tasks that block the heartbeat loop (warning-level) + +## How to Fix + +### Docker Compose + +```bash +# Check agent container status +docker compose -f devops/compose/docker-compose.stella-ops.yml ps agent + +# View agent logs for crash or error messages +docker compose -f devops/compose/docker-compose.stella-ops.yml logs agent --tail 200 + +# Restart agent container +docker compose -f devops/compose/docker-compose.stella-ops.yml restart agent + +# Verify network connectivity from agent to orchestrator +docker compose -f devops/compose/docker-compose.stella-ops.yml exec agent \ + curl -k https://orchestrator:8443/health +``` + +### Bare Metal / systemd + +```bash +# Check agent service status +systemctl status stella-agent + +# View recent agent logs +journalctl -u stella-agent --since '10 minutes ago' + +# Run agent diagnostics +stella agent doctor + +# Check network connectivity to orchestrator +curl -k https://orchestrator:8443/health + +# If certificate expired, renew it +stella agent renew-cert --force + +# Restart the service +sudo systemctl restart stella-agent +``` + +### Kubernetes / Helm + +```bash +# Check agent pod status and restarts +kubectl get pods -l app.kubernetes.io/component=agent -n stellaops + +# View agent pod logs +kubectl logs -l app.kubernetes.io/component=agent -n stellaops --tail=200 + +# Check network policy allowing agent -> orchestrator traffic +kubectl get networkpolicy -n stellaops + +# Restart agent pods via rollout +kubectl rollout restart deployment/stellaops-agent -n 
stellaops +``` + +## Verification + +``` +stella doctor run --check check.agent.heartbeat.freshness +``` + +## Related Checks + +- `check.agent.stale` -- detects agents offline for hours/days (longer threshold than heartbeat freshness) +- `check.agent.certificate.expiry` -- expired certificates cause heartbeat authentication failures +- `check.agent.capacity` -- heartbeat failures reduce effective fleet capacity +- `check.agent.resource.utilization` -- overloaded agents may delay heartbeats diff --git a/docs/doctor/articles/agent/resource-utilization.md b/docs/doctor/articles/agent/resource-utilization.md new file mode 100644 index 000000000..56710b4fb --- /dev/null +++ b/docs/doctor/articles/agent/resource-utilization.md @@ -0,0 +1,103 @@ +--- +checkId: check.agent.resource.utilization +plugin: stellaops.doctor.agent +severity: warn +tags: [agent, resource, performance, capacity] +--- +# Agent Resource Utilization + +## What It Checks + +Monitors CPU, memory, and disk utilization across the agent fleet. The check is designed to verify: + +1. CPU utilization per agent +2. Memory utilization per agent +3. Disk space per agent (for task workspace, logs, and cached artifacts) +4. Resource usage trends (increasing/stable/decreasing) + +**Current status:** implementation pending -- the check always returns Pass with a placeholder message. The `CanRun` method always returns true, so the check will always appear in results. + +## Why It Matters + +Agents that exhaust CPU, memory, or disk become unable to execute tasks reliably. CPU saturation causes task timeouts; memory exhaustion triggers OOM kills that look like intermittent crashes; disk exhaustion prevents artifact downloads and log writes. Proactive monitoring prevents these cascading failures before they impact deployment SLAs. 
+ +## Common Causes + +- Agent running too many concurrent tasks for its resource allocation +- Disk filled by accumulated scan artifacts, logs, or cached images +- Memory leak in long-running agent process +- Noisy neighbor on shared infrastructure consuming resources +- Resource limits not configured (no cgroup/container memory cap) + +## How to Fix + +### Docker Compose + +```bash +# Check agent container resource usage +docker stats --no-stream $(docker compose -f devops/compose/docker-compose.stella-ops.yml ps -q agent) + +# Set resource limits in compose override +# docker-compose.override.yml: +# services: +# agent: +# deploy: +# resources: +# limits: +# cpus: '2.0' +# memory: 4G + +# Clean up old task artifacts +docker compose -f devops/compose/docker-compose.stella-ops.yml exec agent \ + stella agent cleanup --older-than 7d +``` + +### Bare Metal / systemd + +```bash +# Check resource usage +stella agent health + +# View system resources on agent host +top -bn1 | head -20 +df -h /var/lib/stellaops + +# Clean up old task artifacts +stella agent cleanup --older-than 7d + +# Adjust concurrent task limit +stella agent config --agent-id --set max_concurrent_tasks=4 +``` + +### Kubernetes / Helm + +```bash +# Check agent pod resource usage +kubectl top pods -l app.kubernetes.io/component=agent -n stellaops + +# Set resource requests and limits in Helm values +# agent: +# resources: +# requests: +# cpu: "500m" +# memory: "1Gi" +# limits: +# cpu: "2000m" +# memory: "4Gi" +helm upgrade stellaops stellaops/stellaops -f values.yaml + +# Check if pods are being OOM-killed +kubectl get events -n stellaops --field-selector reason=OOMKilling +``` + +## Verification + +``` +stella doctor run --check check.agent.resource.utilization +``` + +## Related Checks + +- `check.agent.capacity` -- resource exhaustion reduces effective capacity +- `check.agent.heartbeat.freshness` -- resource saturation can delay heartbeats +- `check.agent.task.backlog` -- high utilization combined 
with backlog indicates need to scale diff --git a/docs/doctor/articles/agent/stale.md b/docs/doctor/articles/agent/stale.md new file mode 100644 index 000000000..cfb906e26 --- /dev/null +++ b/docs/doctor/articles/agent/stale.md @@ -0,0 +1,91 @@ +--- +checkId: check.agent.stale +plugin: stellaops.doctor.agent +severity: warn +tags: [agent, maintenance, cleanup] +--- +# Stale Agent Detection + +## What It Checks + +Identifies agents that have been offline (no heartbeat) for extended periods and may need investigation or decommissioning. The check inspects all non-revoked, non-inactive agents and categorizes them: + +1. **Decommission candidates** -- offline for more than **7 days**. Result: **Warn** listing each agent with days offline. +2. **Stale** -- offline for more than **1 hour** but less than 7 days. Result: **Warn** listing each agent with hours offline. +3. **All healthy** -- no agents exceed the 1-hour stale threshold. Result: **Pass**. + +The check uses `LastHeartbeatAt` from the agent store. Agents with no recorded heartbeat (`null`) are treated as having `TimeSpan.MaxValue` offline duration. + +Evidence collected: `DecommissionCandidates` count, `StaleAgents` count, per-agent names with offline durations. + +## Why It Matters + +Stale agents consume fleet management overhead, confuse capacity planning, and may hold allocated resources (IP addresses, certificates, license seats) that could be reclaimed. An agent that has been offline for 7+ days is unlikely to return without intervention and should be explicitly deactivated or investigated. Ignoring stale agents leads to a growing inventory of ghost entries that obscure the true fleet state. 
+ +## Common Causes + +- Agent host has been permanently removed (decommissioned hardware, terminated cloud instance) +- Agent was replaced by a new instance but the old registration was not deactivated +- Infrastructure change (network re-architecture, datacenter migration) without cleanup +- Agent host is undergoing extended maintenance +- Network partition isolating the agent +- Agent process crash without auto-restart configured (systemd restart policy missing) + +## How to Fix + +### Docker Compose + +```bash +# List all agent registrations with status +docker compose -f devops/compose/docker-compose.stella-ops.yml exec agent \ + stella agent list --all + +# Deactivate a stale agent +docker compose -f devops/compose/docker-compose.stella-ops.yml exec agent \ + stella agent deactivate --agent-id <agent-id> +``` + +### Bare Metal / systemd + +```bash +# Review stale agents +stella agent list --status stale + +# Deactivate agents that are no longer needed +stella agent deactivate --agent-id <agent-id> + +# If the agent should still be active, investigate the host +ssh <agent-host> 'systemctl status stella-agent' + +# Check network connectivity from the agent host +ssh <agent-host> 'curl -k https://orchestrator:8443/health' + +# Restart agent on the host +ssh <agent-host> 'sudo systemctl restart stella-agent' +``` + +### Kubernetes / Helm + +```bash +# Check for terminated or evicted agent pods +kubectl get pods -l app.kubernetes.io/component=agent -n stellaops --field-selector=status.phase!=Running + +# Remove stale agent registrations via API +stella agent deactivate --agent-id <agent-id> + +# If pod was evicted, check node status +kubectl get nodes +kubectl describe node <node-name> | grep -A5 Conditions +``` + +## Verification + +``` +stella doctor run --check check.agent.stale +``` + +## Related Checks + +- `check.agent.heartbeat.freshness` -- short-term heartbeat staleness (minutes vs. 
hours/days) +- `check.agent.capacity` -- stale agents do not contribute to capacity +- `check.agent.certificate.expiry` -- long-offline agents likely have expired certificates diff --git a/docs/doctor/articles/agent/task-backlog.md b/docs/doctor/articles/agent/task-backlog.md new file mode 100644 index 000000000..e68ec04fa --- /dev/null +++ b/docs/doctor/articles/agent/task-backlog.md @@ -0,0 +1,92 @@ +--- +checkId: check.agent.task.backlog +plugin: stellaops.doctor.agent +severity: warn +tags: [agent, task, queue, capacity] +--- +# Task Queue Backlog + +## What It Checks + +Monitors the pending task queue depth across the agent fleet to detect capacity issues. The check is designed to evaluate: + +1. Total queued tasks across the entire fleet +2. Age of the oldest queued task (how long tasks wait before dispatch) +3. Queue growth rate trend (growing, stable, or draining) + +**Current status:** implementation pending -- the check always returns Pass with a placeholder message. The `CanRun` method always returns true. + +## Why It Matters + +A growing task backlog means agents cannot keep up with incoming work. Tasks age in the queue, SLA timers expire, and users experience delayed deployments and scan results. If the backlog grows unchecked, it can cascade: delayed scans block policy gates, which block promotions, which block release trains. Detecting backlog growth early allows operators to scale the fleet or prioritize the queue. 
+ +## Common Causes + +- Insufficient agent count for current workload +- One or more agents offline, reducing effective fleet capacity +- Task burst from bulk operations (mass rescans, environment-wide deployments) +- Slow tasks monopolizing agent slots (large image scans, complex builds) +- Task dispatch paused due to configuration or freeze window + +## How to Fix + +### Docker Compose + +```bash +# Check current queue depth +docker compose -f devops/compose/docker-compose.stella-ops.yml exec agent \ + stella agent tasks --status queued --count + +# Scale agents to reduce backlog +docker compose -f devops/compose/docker-compose.stella-ops.yml up -d --scale agent=3 + +# Increase concurrent task limit per agent +# Set environment variable in compose override: +# AGENT__MAXCONCURRENTTASKS=8 +``` + +### Bare Metal / systemd + +```bash +# Check queue depth and oldest task +stella agent tasks --status queued + +# Increase concurrent task limit +stella agent config --agent-id --set max_concurrent_tasks=8 + +# Add more agents to the fleet +stella agent bootstrap --name agent-03 --env production --platform linux +``` + +### Kubernetes / Helm + +```bash +# Check queue depth +kubectl exec -it deploy/stellaops-agent -n stellaops -- \ + stella agent tasks --status queued --count + +# Scale agent deployment +kubectl scale deployment stellaops-agent --replicas=5 -n stellaops + +# Or use HPA for auto-scaling +# agent: +# autoscaling: +# enabled: true +# minReplicas: 2 +# maxReplicas: 10 +# targetCPUUtilizationPercentage: 70 +helm upgrade stellaops stellaops/stellaops -f values.yaml +``` + +## Verification + +``` +stella doctor run --check check.agent.task.backlog +``` + +## Related Checks + +- `check.agent.capacity` -- backlog grows when capacity is insufficient +- `check.agent.task.failure.rate` -- failed tasks may be re-queued, inflating the backlog +- `check.agent.resource.utilization` -- saturated agents process tasks slowly +- `check.agent.heartbeat.freshness` -- offline 
agents reduce dispatch targets diff --git a/docs/doctor/articles/agent/task-failure-rate.md b/docs/doctor/articles/agent/task-failure-rate.md new file mode 100644 index 000000000..2dda7427a --- /dev/null +++ b/docs/doctor/articles/agent/task-failure-rate.md @@ -0,0 +1,82 @@ +--- +checkId: check.agent.task.failure.rate +plugin: stellaops.doctor.agent +severity: warn +tags: [agent, task, failure, reliability] +--- +# Task Failure Rate + +## What It Checks + +Monitors the task failure rate across the agent fleet to detect systemic issues. The check is designed to evaluate: + +1. Overall task failure rate over the last hour +2. Per-agent failure rate to isolate problematic agents +3. Failure rate trend (increasing, decreasing, or stable) +4. Common failure reasons to guide remediation + +**Current status:** implementation pending -- the check always returns Pass with a placeholder message. The `CanRun` method always returns true. + +## Why It Matters + +A rising task failure rate is an early indicator of systemic problems: infrastructure issues, misconfigured environments, expired credentials, or agent software bugs. Catching a spike before it reaches 100% failure allows operators to intervene, roll back, or redirect tasks to healthy agents before an outage fully materializes. 
+ +## Common Causes + +- Registry or artifact store unreachable (tasks cannot pull images) +- Expired credentials used by tasks (registry tokens, cloud provider keys) +- Agent software bug introduced by recent update +- Target environment misconfigured (wrong endpoints, firewall rules) +- Disk exhaustion on agent hosts preventing artifact staging +- OOM kills during resource-intensive tasks (scans, builds) + +## How to Fix + +### Docker Compose + +```bash +# Check agent logs for task failures +docker compose -f devops/compose/docker-compose.stella-ops.yml logs agent --tail 500 | \ + grep -i "task.*fail\|error\|exception" + +# Review recent task history +docker compose -f devops/compose/docker-compose.stella-ops.yml exec agent \ + stella agent tasks --status failed --last 1h +``` + +### Bare Metal / systemd + +```bash +# View failed tasks +stella agent tasks --status failed --last 1h + +# Check per-agent failure rates +stella agent health --show-tasks + +# Review agent logs for failure patterns +journalctl -u stella-agent --since '1 hour ago' | grep -i 'fail\|error' +``` + +### Kubernetes / Helm + +```bash +# Check agent pod logs for task errors +kubectl logs -l app.kubernetes.io/component=agent -n stellaops --tail=500 | \ + grep -i "task.*fail\|error" + +# Check pod events for OOM or crash signals +kubectl get events -n stellaops --sort-by='.lastTimestamp' | grep -i agent +``` + +## Verification + +``` +stella doctor run --check check.agent.task.failure.rate +``` + +## Related Checks + +- `check.agent.resource.utilization` -- resource exhaustion causes task failures +- `check.agent.task.backlog` -- high failure rate combined with backlog indicates systemic issue +- `check.agent.heartbeat.freshness` -- crashing agents fail tasks and go stale +- `check.agent.version.consistency` -- version skew can cause task compatibility failures diff --git a/docs/doctor/articles/agent/version-consistency.md b/docs/doctor/articles/agent/version-consistency.md new file mode 100644 
index 000000000..b5f55979e --- /dev/null +++ b/docs/doctor/articles/agent/version-consistency.md @@ -0,0 +1,81 @@ +--- +checkId: check.agent.version.consistency +plugin: stellaops.doctor.agent +severity: warn +tags: [agent, version, maintenance] +--- +# Agent Version Consistency + +## What It Checks + +Groups all non-revoked, non-inactive agents by their reported `Version` field and evaluates version skew: + +1. **Single version** across all agents: **Pass** -- all agents are consistent. +2. **Two versions** with skew affecting less than half the fleet: **Pass** (minor skew acceptable). +3. **Significant skew** (more than 2 distinct versions, or outdated agents exceed half the fleet): **Warn** with evidence listing the version distribution and up to 10 outdated agent names. +4. **No active agents**: **Skip**. + +The "majority version" is the version running on the most agents. All other versions are considered outdated. Evidence collected: `MajorityVersion`, `VersionDistribution` (e.g., "1.5.0: 8, 1.4.2: 2"), `OutdatedAgents` (list of names with their versions). + +## Why It Matters + +Version skew across the agent fleet can cause subtle compatibility issues: newer agents may support task types that older agents reject, protocol changes may cause heartbeat or dispatch failures, and mixed versions make incident triage harder because behavior differs across agents. Keeping the fleet consistent reduces operational surprises. 
+ +## Common Causes + +- Auto-update is disabled on some agents +- Some agents failed to update (download failure, permission issue, disk full) +- Phased rollout in progress (expected, temporary skew) +- Agents on isolated networks that cannot reach the update server + +## How to Fix + +### Docker Compose + +```bash +# Check agent image versions +docker compose -f devops/compose/docker-compose.stella-ops.yml ps agent --format json | \ + jq '.[] | {name: .Name, image: .Image}' + +# Pull latest image and recreate +docker compose -f devops/compose/docker-compose.stella-ops.yml pull agent +docker compose -f devops/compose/docker-compose.stella-ops.yml up -d agent +``` + +### Bare Metal / systemd + +```bash +# Update outdated agents to target version +stella agent update --version <version> --agent-id <agent-id> + +# Enable auto-update +stella agent config --agent-id <agent-id> --set auto_update.enabled=true + +# Batch update all agents +stella agent update --version <version> --all +``` + +### Kubernetes / Helm + +```bash +# Check running image versions across pods +kubectl get pods -l app.kubernetes.io/component=agent -n stellaops \ + -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.spec.containers[0].image}{"\n"}{end}' + +# Update image tag in Helm values and rollout +helm upgrade stellaops stellaops/stellaops --set agent.image.tag=<version> + +# Monitor rollout +kubectl rollout status deployment/stellaops-agent -n stellaops +``` + +## Verification + +``` +stella doctor run --check check.agent.version.consistency +``` + +## Related Checks + +- `check.agent.heartbeat.freshness` -- version mismatch can cause heartbeat protocol failures +- `check.agent.capacity` -- outdated agents may be unable to accept newer task types diff --git a/docs/doctor/articles/attestor/clock-skew.md b/docs/doctor/articles/attestor/clock-skew.md new file mode 100644 index 000000000..74cf1d39c --- /dev/null +++ b/docs/doctor/articles/attestor/clock-skew.md @@ -0,0 +1,142 @@ +--- +checkId: check.attestation.clock.skew +plugin: 
stellaops.doctor.attestor +severity: fail +tags: [attestation, time, ntp, quick, setup] +--- +# Clock Skew + +## What It Checks + +Verifies that the system clock is synchronized accurately enough for attestation validity by comparing local time against the Rekor transparency log server's `Date` response header. Additionally collects NTP daemon status and virtual machine detection as discriminating evidence for root-cause analysis. + +**Threshold:** maximum allowed skew is **5 seconds** (`MaxSkewSeconds`). + +The check performs these steps: + +1. Collects NTP status: daemon type (chronyd, ntpd, systemd-timesyncd, w32time), running state, configured servers, last sync time, and sync age. +2. Detects virtual machine environment: VMware, Hyper-V, KVM, Xen, or container. Checks whether VM clock synchronization is enabled. +3. Sends HTTP GET to `{rekorUrl}/api/v1/log` (configured via `Attestor:Rekor:Url` or `Transparency:Rekor:Url`, defaults to `https://rekor.sigstore.dev`) with 5-second timeout. +4. Extracts server time from the HTTP `Date` header. +5. Computes skew as `localTime - serverTime`. + +Results: + +- **Skew <= 5s**: **Pass** with exact skew value. +- **Skew > 5s**: **Fail** with skew, NTP status, and VM detection evidence. Remediation steps are platform-specific (Linux: chronyd/ntpd/timesyncd; Windows: w32time; VM: clock sync integration). +- **Server unreachable or non-2xx**: **Warn** (cannot verify, includes NTP evidence). +- **No Date header**: **Skip**. +- **HTTP exception**: **Warn** with classified error type (ssl_error, dns_failure, refused, timeout, connection_failed). +- **Timeout**: **Warn** with 5-second timeout note. + +Evidence collected: `local_time_utc`, `server_time_utc`, `skew_seconds`, `max_allowed_skew`, `ntp_daemon_running`, `ntp_daemon_type`, `ntp_servers_configured`, `last_sync_time_utc`, `sync_age_seconds`, `is_virtual_machine`, `vm_type`, `vm_clock_sync_enabled`, `connection_error_type`. 
+ +## Why It Matters + +Attestation timestamps must be accurate for signature validity. Rekor transparency log entries include timestamps that are verified against the signing time. If the system clock is skewed beyond the tolerance, attestations may be rejected as invalid, signatures may fail verification, and OIDC tokens used in keyless signing will be rejected for having future or expired timestamps. Even a few seconds of skew can cause intermittent failures that are difficult to diagnose. + +## Common Causes + +- NTP service not running (stopped, disabled, or not installed) +- NTP server unreachable (firewall, DNS, or network issue) +- System clock manually set incorrectly +- Virtual machine clock drift (common when VM clock sync is disabled) +- Container relying on host clock which is itself drifted +- Hibernation/resume causing sudden clock jump + +## How to Fix + +### Docker Compose + +Docker containers inherit the host clock. Fix the host time: + +```bash +# Check host time +date -u + +# Linux host: ensure NTP is running +sudo timedatectl set-ntp true +sudo systemctl start systemd-timesyncd + +# Windows host: resync time +w32tm /resync /nowait +``` + +### Bare Metal / systemd + +**Linux with chronyd:** +```bash +# Start NTP service +sudo systemctl start chronyd + +# Enable NTP synchronization +sudo timedatectl set-ntp true + +# Force immediate sync +sudo chronyc -a makestep + +# Check status +timedatectl status +chronyc tracking +``` + +**Linux with ntpd:** +```bash +# Start NTP service +sudo systemctl start ntpd + +# Enable NTP synchronization +sudo timedatectl set-ntp true + +# Force immediate sync +sudo ntpdate -u pool.ntp.org +``` + +**Linux with systemd-timesyncd:** +```bash +# Start and enable +sudo systemctl start systemd-timesyncd +sudo timedatectl set-ntp true +``` + +**Windows:** +```bash +# Start Windows Time service +net start w32time + +# Force time resync +w32tm /resync /nowait + +# Check status +w32tm /query /status +``` + +**Virtual machine 
with clock sync disabled:** +``` +Enable time synchronization in Hyper-V Integration Services or VMware Tools settings. +``` + +### Kubernetes / Helm + +Kubernetes pods inherit the node clock. Fix the node: + +```bash +# Check node time +kubectl debug node/<node-name> -it --image=busybox -- date -u + +# Ensure NTP is configured on all nodes (varies by OS) +# For systemd-based nodes: +ssh <node-name> 'sudo timedatectl set-ntp true' +``` + +## Verification + +``` +stella doctor run --check check.attestation.clock.skew +``` + +## Related Checks + +- `check.attestation.rekor.connectivity` -- clock skew check requires Rekor connectivity +- `check.attestation.rekor.verification.job` -- verification job can fail due to clock skew +- `check.attestation.transparency.consistency` -- timestamp accuracy affects consistency proofs diff --git a/docs/doctor/articles/attestor/cosign-keymaterial.md b/docs/doctor/articles/attestor/cosign-keymaterial.md new file mode 100644 index 000000000..27fe47d8a --- /dev/null +++ b/docs/doctor/articles/attestor/cosign-keymaterial.md @@ -0,0 +1,128 @@ +--- +checkId: check.attestation.cosign.keymaterial +plugin: stellaops.doctor.attestor +severity: fail +tags: [attestation, cosign, signing, setup] +--- +# Cosign Key Material + +## What It Checks + +Verifies that signing key material is available for container image attestation. The check reads the signing mode from configuration (`Attestor:Signing:Mode` or `Signing:Mode`, defaulting to `keyless`) and validates the appropriate key material for that mode: + +### Keyless mode +Checks that Fulcio URL is configured (defaults to `https://fulcio.sigstore.dev`). Uses OIDC identity for signing -- no persistent key material required. Result: **Pass** if configured. + +### File mode +1. If `KeyPath` is not configured: **Fail** with "KeyPath not configured". +2. If the key file does not exist at the configured path: **Fail** with "Signing key file not found". +3. 
If the key file cannot be read (permission error): **Fail** with the error message. +4. If the key file exists and is readable: **Pass** with file size and last modification time. + +### KMS mode +1. If `KmsKeyRef` is not configured: **Fail** with "KmsKeyRef not configured". +2. If configured, the check parses the KMS provider from the key reference URI prefix (`awskms://`, `gcpkms://`, `azurekms://`, `hashivault://`) and reports it. Result: **Pass** with provider name and key reference. + +### Unknown mode +**Fail** with "Unknown signing mode" and the list of supported modes. + +Evidence collected varies by mode: `SigningMode`, `FulcioUrl`, `KeyPath`, `FileExists`, `FileSize`, `LastModified`, `KmsKeyRef`, `Provider`. + +## Why It Matters + +Without valid signing key material, the Attestor cannot sign container images, SBOMs, or provenance attestations. Unsigned artifacts cannot pass policy gates that require signature verification, blocking the entire release pipeline. This check ensures the signing infrastructure is correctly configured before any signing operations are attempted. 
+ +## Common Causes + +- KeyPath not set in configuration (file mode, incomplete setup) +- Configuration file not loaded (missing appsettings, environment variable not set) +- Key file was moved or deleted from the configured path +- Wrong path configured (typo, path changed during migration) +- Key file not yet generated (first-run setup incomplete) +- KmsKeyRef not configured (KMS mode, missing configuration) +- Unknown or misspelled signing mode in configuration + +## How to Fix + +### Docker Compose + +For **file** mode: +```bash +# Generate a new Cosign key pair (writes /etc/stellaops/cosign.key and cosign.pub) +docker compose -f devops/compose/docker-compose.stella-ops.yml exec attestor \ + cosign generate-key-pair --output-key-prefix /etc/stellaops/cosign + +# Set key path in environment +# ATTESTOR__SIGNING__MODE=file +# ATTESTOR__SIGNING__KEYPATH=/etc/stellaops/cosign.key + +# Restart attestor +docker compose -f devops/compose/docker-compose.stella-ops.yml restart attestor +``` + +For **keyless** mode: +```bash +# Set signing mode to keyless (default) +# ATTESTOR__SIGNING__MODE=keyless +# ATTESTOR__FULCIO__URL=https://fulcio.sigstore.dev +``` + +For **KMS** mode: +```bash +# Set KMS key reference +# ATTESTOR__SIGNING__MODE=kms +# ATTESTOR__SIGNING__KMSKEYREF=awskms:///arn:aws:kms:us-east-1:123456789:key/abcd-1234 +``` + +### Bare Metal / systemd + +```bash +# Configure signing mode +stella attestor signing configure --mode keyless + +# For file mode, generate keys (writes /etc/stellaops/cosign.key and cosign.pub) +cosign generate-key-pair --output-key-prefix /etc/stellaops/cosign +stella attestor signing configure --mode file --key-path /etc/stellaops/cosign.key + +# For KMS mode (AWS example) +stella attestor signing configure --mode kms \ + --kms-key-ref 'awskms:///arn:aws:kms:us-east-1:123456789:key/abcd-1234' + +# For KMS mode (GCP example) +stella attestor signing configure --mode kms \ + --kms-key-ref 'gcpkms://projects/my-project/locations/global/keyRings/my-ring/cryptoKeys/my-key' + +# Check if key exists at another location +find /etc/stellaops -name '*.key' -o -name 
'cosign*' +``` + +### Kubernetes / Helm + +```bash +# For file mode, create secret with key material +kubectl create secret generic stellaops-cosign-key -n stellaops \ + --from-file=cosign.key=/path/to/cosign.key \ + --from-file=cosign.pub=/path/to/cosign.pub + +# Set signing configuration in Helm values +# attestor: +# signing: +# mode: "kms" # or "file" or "keyless" +# kmsKeyRef: "awskms:///arn:aws:kms:..." +# # For file mode: +# # keyPath: "/etc/stellaops/cosign.key" +# # keySecret: "stellaops-cosign-key" +helm upgrade stellaops stellaops/stellaops -f values.yaml +``` + +## Verification + +``` +stella doctor run --check check.attestation.cosign.keymaterial +``` + +## Related Checks + +- `check.attestation.keymaterial` -- signing key expiration monitoring +- `check.attestation.rekor.connectivity` -- Rekor required for keyless signing verification +- `check.attestation.clock.skew` -- clock accuracy required for keyless OIDC tokens diff --git a/docs/doctor/articles/attestor/keymaterial.md b/docs/doctor/articles/attestor/keymaterial.md new file mode 100644 index 000000000..a614257ee --- /dev/null +++ b/docs/doctor/articles/attestor/keymaterial.md @@ -0,0 +1,110 @@ +--- +checkId: check.attestation.keymaterial +plugin: stellaops.doctor.attestor +severity: warn +tags: [attestation, signing, security, expiration] +--- +# Signing Key Expiration + +## What It Checks + +Monitors the expiration timeline of attestation signing keys. The check reads the signing mode from configuration and, for modes that use expiring keys (file, kms, certificate), retrieves key information and classifies each key: + +1. **Expired** -- key has already expired (`daysUntilExpiry < 0`). Result: **Fail** with list of expired key IDs. +2. **Critical** -- key expires within **7 days**. Result: **Fail** with key IDs and days remaining. +3. **Warning** -- key expires within **30 days**. Result: **Warn** with key IDs and days remaining. +4. 
**Healthy** -- all keys have more than 30 days until expiration. Result: **Pass** with key count and per-key expiry dates (up to 5 keys shown). + +For **keyless** signing mode, the check returns **Skip** because keyless signing does not use expiring key material. + +If no signing keys are found, the check returns **Skip** with a note that no file-based or certificate-based keys were found. + +Evidence collected: `ExpiredKeys` (list of IDs), `CriticalKeys` (ID + days), `WarningKeys` (ID + days), `TotalKeys`, `HealthyKeys`, per-key entries showing `Key:` with expiry date and days remaining. + +Thresholds: +- Warning: 30 days (`WarningDays`) +- Critical: 7 days (`CriticalDays`) + +## Why It Matters + +Expired signing keys make it impossible to create new attestations, blocking the release pipeline at policy gates that require signed artifacts. Keys approaching expiration should be rotated proactively to ensure overlap between old and new keys, allowing verifiers to accept signatures from both during the transition period. Without monitoring, key expiration causes a sudden, hard outage. 
+ +## Common Causes + +- Keys were not rotated before expiration (manual process forgotten) +- Scheduled rotation job failed (permissions, connectivity) +- Key expiration not monitored (no alerting configured) +- Normal lifecycle -- keys approaching the warning threshold (plan rotation) +- Rotation reminders not configured + +## How to Fix + +### Docker Compose + +```bash +# Check key status +docker compose -f devops/compose/docker-compose.stella-ops.yml exec attestor \ + stella keys status + +# Rotate expired or critical keys +docker compose -f devops/compose/docker-compose.stella-ops.yml exec attestor \ + stella keys rotate + +# Set up expiration monitoring +docker compose -f devops/compose/docker-compose.stella-ops.yml exec attestor \ + stella notify channels add --type email --event key.expiring --threshold-days 30 +``` + +### Bare Metal / systemd + +```bash +# Rotate expired keys immediately +stella keys rotate + +# Set up key expiration monitoring +stella notify channels add --type email --event key.expiring --threshold-days 30 + +# Schedule immediate key rotation for critical keys (with overlap) +stella keys rotate --overlap-days 7 + +# Plan rotation for warning-level keys (dry run first) +stella keys rotate --dry-run + +# Execute rotation with overlap period +stella keys rotate --overlap-days 14 + +# Review all key status +stella keys status +``` + +### Kubernetes / Helm + +```bash +# Check key status +kubectl exec -it deploy/stellaops-attestor -n stellaops -- \ + stella keys status + +# Rotate keys +kubectl exec -it deploy/stellaops-attestor -n stellaops -- \ + stella keys rotate --overlap-days 14 + +# Configure automatic key rotation in Helm values +# attestor: +# signing: +# autoRotate: true +# rotationBeforeDays: 30 +# overlapDays: 14 +helm upgrade stellaops stellaops/stellaops -f values.yaml +``` + +## Verification + +``` +stella doctor run --check check.attestation.keymaterial +``` + +## Related Checks + +- `check.attestation.cosign.keymaterial` -- 
verifies key material availability (existence, not expiration) +- `check.auth.signing-key` -- auth signing key health (separate from attestation keys) +- `check.attestation.rekor.verification.job` -- expired keys cause verification failures diff --git a/docs/doctor/articles/attestor/rekor-connectivity.md b/docs/doctor/articles/attestor/rekor-connectivity.md new file mode 100644 index 000000000..676506961 --- /dev/null +++ b/docs/doctor/articles/attestor/rekor-connectivity.md @@ -0,0 +1,115 @@ +--- +checkId: check.attestation.rekor.connectivity +plugin: stellaops.doctor.attestor +severity: fail +tags: [attestation, rekor, transparency, quick, setup] +--- +# Rekor Connectivity + +## What It Checks + +Tests connectivity to the Rekor transparency log by sending an HTTP GET request to the log info endpoint (`{rekorUrl}/api/v1/log`). The Rekor URL is read from configuration (`Attestor:Rekor:Url` or `Transparency:Rekor:Url`, defaulting to `https://rekor.sigstore.dev`). Request timeout is 10 seconds. + +Results: + +1. **HTTP 2xx success**: **Pass**. Parses the response JSON for `treeSize` and reports the endpoint, response latency (ms), and current tree size. +2. **HTTP non-2xx**: **Fail** with status code and latency. +3. **Connection timeout** (`TaskCanceledException`): **Fail** with "Connection timeout (10s)". +4. **HTTP request exception** (DNS failure, SSL error, connection refused): **Fail** with the exception message. + +Evidence collected: `Endpoint`, `Latency` (ms), `TreeSize`, `StatusCode`, `Error`. + +The check always runs (`CanRun` returns true) because Rekor connectivity is essential for attestation. + +## Why It Matters + +Rekor is the transparency log that records attestation entries, providing tamper-evident proof that signatures were created at a specific time. Without Rekor connectivity, the Attestor cannot submit new log entries, and verifiers cannot confirm that attestations were properly logged. 
In non-air-gapped deployments, Rekor connectivity is a hard requirement for the signing and verification pipeline. + +## Common Causes + +- Rekor service is down or undergoing maintenance +- Network connectivity issue (proxy not configured, routing problem) +- Firewall blocking outbound HTTPS (port 443) +- DNS resolution failure for `rekor.sigstore.dev` +- Wrong Rekor endpoint configured +- SSL/TLS handshake failure (expired CA cert, corporate MITM proxy) +- Air-gapped environment without offline bundle configured + +## How to Fix + +### Docker Compose + +```bash +# Test Rekor connectivity from the attestor container +docker compose -f devops/compose/docker-compose.stella-ops.yml exec attestor \ + curl -s https://rekor.sigstore.dev/api/v1/log | jq . + +# Check DNS resolution +docker compose -f devops/compose/docker-compose.stella-ops.yml exec attestor \ + nslookup rekor.sigstore.dev + +# Set Rekor URL in environment +# ATTESTOR__REKOR__URL=https://rekor.sigstore.dev + +# For air-gapped environments, configure offline mode +# ATTESTOR__OFFLINE__ENABLED=true +``` + +### Bare Metal / systemd + +```bash +# Test Rekor connectivity manually +curl -s https://rekor.sigstore.dev/api/v1/log | jq . + +# Check network connectivity +nc -zv rekor.sigstore.dev 443 + +# Check DNS resolution +nslookup rekor.sigstore.dev + +# Check SSL certificates +openssl s_client -connect rekor.sigstore.dev:443 -brief + +# Verify Rekor URL configuration +grep -r 'rekor' /etc/stellaops/*.yaml + +# For air-gapped environments, download offline bundle +stella attestor offline-bundle download --output /var/lib/stellaops/rekor-offline + +# Enable offline mode +stella attestor config set --key offline.enabled --value true +``` + +### Kubernetes / Helm + +```bash +# Test from attestor pod +kubectl exec -it deploy/stellaops-attestor -n stellaops -- \ + curl -s https://rekor.sigstore.dev/api/v1/log | jq . 
+ +# Check egress NetworkPolicy +kubectl get networkpolicy -n stellaops -o yaml | grep -A10 egress + +# Set Rekor URL in Helm values +# attestor: +# rekor: +# url: "https://rekor.sigstore.dev" +# # For air-gapped: +# # offline: +# # enabled: true +# # bundlePath: "/var/lib/stellaops/rekor-offline" +helm upgrade stellaops stellaops/stellaops -f values.yaml +``` + +## Verification + +``` +stella doctor run --check check.attestation.rekor.connectivity +``` + +## Related Checks + +- `check.attestation.clock.skew` -- clock accuracy affects Rekor entry timestamps +- `check.attestation.transparency.consistency` -- consistency check requires Rekor connectivity +- `check.attestation.rekor.verification.job` -- verification job depends on Rekor access +- `check.attestation.cosign.keymaterial` -- keyless signing requires Rekor for transparency logging diff --git a/docs/doctor/articles/attestor/rekor-verification-job.md b/docs/doctor/articles/attestor/rekor-verification-job.md new file mode 100644 index 000000000..da68b912e --- /dev/null +++ b/docs/doctor/articles/attestor/rekor-verification-job.md @@ -0,0 +1,138 @@ +--- +checkId: check.attestation.rekor.verification.job +plugin: stellaops.doctor.attestor +severity: warn +tags: [attestation, rekor, verification, background] +--- +# Rekor Verification Job + +## What It Checks + +Monitors the health of the periodic background job that re-verifies attestation entries stored in Rekor. The check queries `IRekorVerificationStatusProvider` from DI and evaluates several conditions in priority order: + +1. **Service not registered**: **Skip** if `IRekorVerificationStatusProvider` is not in the DI container. +2. **Never run**: **Warn** if `LastRunAt` is null (job has never executed). +3. **Critical alerts**: **Fail** if `CriticalAlertCount > 0` (possible log tampering, root hash mismatch, mass signature failures). +4. **Root consistency failed**: **Fail** if `RootConsistent` is false (stored checkpoint disagrees with remote log state). 
+5. **Stale run**: **Warn** if the job has not run in more than **48 hours**. +6. **High failure rate**: **Warn** if `FailureRate > 10%` (more than 10% of verified entries failed). +7. **Healthy**: **Pass** with last run time, status, entries verified, failure rate, root consistency, and duration. + +The check only runs when verification is enabled (`Attestor:Verification:Enabled` or `Transparency:Verification:Enabled` is not set to `false`). + +Evidence collected: `LastRun`, `LastRunStatus`, `IsRunning`, `NextScheduledRun`, `CriticalAlerts`, `RootConsistent`, `LastConsistencyCheck`, `HoursSinceLastRun`, `EntriesVerified`, `EntriesFailed`, `FailureRate`, `TimeSkewViolations`, `Duration`. + +## Why It Matters + +The verification job is the integrity watchdog for the attestation pipeline. It periodically re-checks that Rekor log entries have not been tampered with, that the root hash is consistent, and that signatures remain valid. Without this job running, an attacker could modify transparency log entries without detection, undermining the entire attestation trust model. A high failure rate may indicate clock skew, key rotation issues, or data corruption. 
+ +## Common Causes + +- Job was just deployed and has not run yet (first-run delay) +- Job is disabled in configuration +- Background service failed to start (DI error, missing dependency) +- Transparency log tampering detected (critical alert) +- Root hash mismatch with stored checkpoints +- Mass signature verification failures after key rotation +- Background service stopped or scheduler not running (stale run) +- Job stuck or failed repeatedly +- Clock skew causing timestamp validation failures (high failure rate) +- Invalid signatures from previous key rotations +- Corrupted entries in local database + +## How to Fix + +### Docker Compose + +```bash +# Check attestor container status +docker compose -f devops/compose/docker-compose.stella-ops.yml ps attestor + +# View verification job logs +docker compose -f devops/compose/docker-compose.stella-ops.yml logs attestor --tail 300 | \ + grep -i 'verification\|rekor' + +# Trigger manual verification run +docker compose -f devops/compose/docker-compose.stella-ops.yml exec attestor \ + stella attestor verification run --now + +# Enable verification if disabled +# ATTESTOR__VERIFICATION__ENABLED=true + +# Restart attestor service +docker compose -f devops/compose/docker-compose.stella-ops.yml restart attestor +``` + +### Bare Metal / systemd + +```bash +# Check if the job is scheduled +stella attestor verification status + +# Trigger a manual verification run +stella attestor verification run --now + +# Check application logs for errors +journalctl -u stellaops-attestor --since '1 hour ago' | grep -i 'verification\|rekor' + +# Review critical alerts +stella attestor verification alerts --severity critical + +# Check transparency log status +stella attestor transparency status + +# Review failed entries (high failure rate) +stella attestor verification failures --last-run + +# Check system clock synchronization (if time skew violations) +timedatectl status + +# Re-sync failed entries from Rekor +stella attestor 
verification resync --failed-only + +# Restart the service if job is stale +sudo systemctl restart stellaops-attestor + +# Review recent error logs (stale job) +journalctl -u stellaops-attestor --since '48 hours ago' | grep -i error +``` + +### Kubernetes / Helm + +```bash +# Check attestor pod status +kubectl get pods -l app.kubernetes.io/component=attestor -n stellaops + +# View verification logs +kubectl logs -l app.kubernetes.io/component=attestor -n stellaops --tail=300 | \ + grep -i 'verification\|rekor' + +# Trigger manual verification +kubectl exec -it deploy/stellaops-attestor -n stellaops -- \ + stella attestor verification run --now + +# Enable verification in Helm values +# attestor: +# verification: +# enabled: true +# intervalHours: 24 +helm upgrade stellaops stellaops/stellaops -f values.yaml + +# Restart attestor pods +kubectl rollout restart deployment/stellaops-attestor -n stellaops +``` + +If critical alerts indicate possible log tampering, this may be a security incident. Review evidence carefully before dismissing alerts. 
+ +## Verification + +``` +stella doctor run --check check.attestation.rekor.verification.job +``` + +## Related Checks + +- `check.attestation.rekor.connectivity` -- verification job requires Rekor connectivity +- `check.attestation.transparency.consistency` -- complementary consistency check against stored checkpoints +- `check.attestation.clock.skew` -- clock skew causes verification timestamp failures +- `check.attestation.keymaterial` -- expired signing keys cause verification failures diff --git a/docs/doctor/articles/attestor/transparency-consistency.md b/docs/doctor/articles/attestor/transparency-consistency.md new file mode 100644 index 000000000..b1d4be5ad --- /dev/null +++ b/docs/doctor/articles/attestor/transparency-consistency.md @@ -0,0 +1,151 @@ +--- +checkId: check.attestation.transparency.consistency +plugin: stellaops.doctor.attestor +severity: fail +tags: [attestation, transparency, security] +--- +# Transparency Log Consistency + +## What It Checks + +Verifies that locally stored transparency log checkpoints are consistent with the remote Rekor log. This is a critical security check that detects log rollback or tampering. + +The check only runs if a checkpoint path is configured (`Attestor:Transparency:CheckpointPath` or `Transparency:CheckpointPath`) or a checkpoint file exists at the default path (`{AppData}/stellaops/transparency/checkpoint.json`). + +Steps performed: + +1. **Read stored checkpoint** -- parses the local `checkpoint.json` file containing `TreeSize`, `RootHash`, `UpdatedAt`, and `LogId`. + - If the file does not exist: **Skip** (checkpoint will be created on first verification run). + - If the JSON is invalid: **Fail** with remediation to remove the corrupted file and re-sync. + - If the file is empty/null: **Fail**. + +2. **Fetch remote log state** -- HTTP GET to `{rekorUrl}/api/v1/log` (10-second timeout). Parses `treeSize` and `rootHash` from the response. 
+ - If Rekor is unreachable: **Skip** (cannot verify consistency without remote state). + +3. **Compare tree sizes**: + - If remote tree size < stored tree size: **Fail** with "possible fork/rollback" (the log should only grow, never shrink). This is a CRITICAL security finding. + - If tree sizes match but root hashes differ: **Fail** with "possible tampering" (same size but different content). This is a CRITICAL security finding. + - If remote tree size >= stored tree size and hashes are consistent: **Pass** with entries-behind count and checkpoint age. + +Evidence collected: `CheckpointPath`, `Exists`, `StoredTreeSize`, `RemoteTreeSize`, `StoredRootHash`, `RemoteRootHash`, `EntriesBehind`, `CheckpointAge`, `ConsistencyVerified`, `Error`. + +## Why It Matters + +The transparency log is the tamper-evident backbone of the attestation system. If an attacker modifies or rolls back the log, they could hide revoked signatures, alter attestation records, or forge provenance data. This check is the primary defense against such attacks. A root hash mismatch at the same tree size is one of the strongest indicators of log tampering and should trigger an immediate security investigation. 
+ +## Common Causes + +**For log rollback (remote < stored):** +- Transparency log was actually rolled back (CRITICAL security event) +- Stored checkpoint is from a different Rekor instance +- Man-in-the-middle attack on log queries (network interception) +- Configuration changed to point at a different Rekor server + +**For root hash mismatch:** +- Transparency log was modified (CRITICAL security event) +- Man-in-the-middle attack returning forged log state +- Checkpoint file corruption (disk error, incomplete write) + +**For corrupted checkpoint file:** +- Disk failure during checkpoint write +- Process killed during checkpoint update +- Manual editing of checkpoint file + +## How to Fix + +### Docker Compose + +```bash +# Check stored checkpoint +docker compose -f devops/compose/docker-compose.stella-ops.yml exec attestor \ + cat /app/data/transparency/checkpoint.json | jq . + +# Verify you are connecting to the correct Rekor instance +docker compose -f devops/compose/docker-compose.stella-ops.yml exec attestor \ + curl -s https://rekor.sigstore.dev/api/v1/log | jq . + +# If corrupted checkpoint, remove and re-sync +docker compose -f devops/compose/docker-compose.stella-ops.yml exec attestor \ + rm /app/data/transparency/checkpoint.json + +docker compose -f devops/compose/docker-compose.stella-ops.yml exec attestor \ + stella attestor transparency sync +``` + +### Bare Metal / systemd + +For **corrupted checkpoint**: +```bash +# Back up the corrupted checkpoint first +cp /path/to/checkpoint.json /path/to/checkpoint.json.bak + +# Remove corrupted checkpoint +rm /path/to/checkpoint.json + +# Trigger re-sync +stella attestor transparency sync +``` + +For **log rollback or hash mismatch** (CRITICAL): +```bash +# CRITICAL: This may indicate a security incident. Do not dismiss without investigation. 
+ +# Get current root hash from Rekor +curl -s https://rekor.sigstore.dev/api/v1/log | jq .rootHash + +# Compare with stored checkpoint +stella attestor transparency checkpoint show + +# Verify you are connecting to the correct Rekor instance +curl -s https://rekor.sigstore.dev/api/v1/log | jq . + +# Check stored checkpoint +cat /path/to/checkpoint.json | jq . + +# If using wrong log instance, reset checkpoint (DESTRUCTIVE -- only after confirming wrong instance) +rm /path/to/checkpoint.json +stella attestor transparency sync + +# If mismatch persists with correct log, escalate to security team +``` + +### Kubernetes / Helm + +```bash +# Check stored checkpoint +kubectl exec -it deploy/stellaops-attestor -n stellaops -- \ + cat /app/data/transparency/checkpoint.json | jq . + +# Verify Rekor connectivity +kubectl exec -it deploy/stellaops-attestor -n stellaops -- \ + curl -s https://rekor.sigstore.dev/api/v1/log | jq . + +# If corrupted, remove checkpoint and re-sync +kubectl exec -it deploy/stellaops-attestor -n stellaops -- \ + rm /app/data/transparency/checkpoint.json + +kubectl exec -it deploy/stellaops-attestor -n stellaops -- \ + stella attestor transparency sync + +# Check checkpoint persistence (PVC) +kubectl get pvc -l app.kubernetes.io/component=attestor -n stellaops + +# Set checkpoint path in Helm values +# attestor: +# transparency: +# checkpointPath: "/app/data/transparency/checkpoint.json" +``` + +Root hash mismatches or log rollbacks should be treated as potential security incidents. Do not reset the checkpoint without first investigating whether the remote log was actually compromised. 
+ +## Verification + +``` +stella doctor run --check check.attestation.transparency.consistency +``` + +## Related Checks + +- `check.attestation.rekor.connectivity` -- consistency check requires Rekor access +- `check.attestation.rekor.verification.job` -- verification job also checks root consistency +- `check.attestation.clock.skew` -- clock accuracy affects consistency proof timestamps diff --git a/docs/doctor/articles/auth/config.md b/docs/doctor/articles/auth/config.md new file mode 100644 index 000000000..1a9e7bbf6 --- /dev/null +++ b/docs/doctor/articles/auth/config.md @@ -0,0 +1,108 @@ +--- +checkId: check.auth.config +plugin: stellaops.doctor.auth +severity: fail +tags: [auth, security, core, config] +--- +# Auth Configuration + +## What It Checks + +Validates the overall authentication configuration by inspecting three layers in sequence (items 1-3); the check passes only when all three are healthy (item 4): + +1. **Authentication configured** -- verifies that the auth subsystem has been set up (issuer URL present, basic config loaded). If not: **Fail** with "Authentication not configured". +2. **Signing keys available** -- checks whether signing keys exist for token issuance. If configured but no keys: **Fail** with "No signing keys available". +3. **Signing key expiration** -- checks if the active signing key is approaching expiration. If it will expire soon: **Warn** with the number of days remaining. +4. **All healthy** -- issuer URL configured, signing keys available, key not near expiry. Result: **Pass**. + +Evidence collected: `AuthConfigured` (YES/NO), `IssuerConfigured` (YES/NO), `IssuerUrl`, `SigningKeysConfigured`/`SigningKeysAvailable` (YES/NO), `KeyExpiration` (days), `ActiveClients` count, `ActiveScopes` count. + +The check always runs (`CanRun` returns true). + +## Why It Matters + +Authentication is the foundation of every API call in Stella Ops. If the auth subsystem is not configured, no user can log in, no service-to-service call can authenticate, and the entire platform is non-functional. 
Missing signing keys mean tokens cannot be issued, and an expiring key that is not rotated will cause a hard outage when it expires. + +## Common Causes + +- Authority service not configured (fresh installation without `stella setup auth`) +- Missing issuer URL configuration in environment variables or config files +- Signing keys not yet generated (first-run setup incomplete) +- Key material corrupted (disk failure, accidental deletion) +- HSM/PKCS#11 module not accessible (hardware key store offline) +- Signing key approaching expiration without scheduled rotation + +## How to Fix + +### Docker Compose + +```bash +# Check Authority service configuration +docker compose -f devops/compose/docker-compose.stella-ops.yml exec authority \ + cat /app/appsettings.json | grep -A5 "Issuer\|Signing" + +# Set issuer URL via environment variable +# In .env or docker-compose.override.yml: +# AUTHORITY__ISSUER__URL=https://stella-ops.local/authority + +# Restart Authority service after config changes +docker compose -f devops/compose/docker-compose.stella-ops.yml restart authority + +# Generate signing keys +docker compose -f devops/compose/docker-compose.stella-ops.yml exec authority \ + stella keys generate --type rsa +``` + +### Bare Metal / systemd + +```bash +# Run initial auth setup +stella setup auth + +# Configure issuer URL +stella auth configure --issuer https://auth.yourdomain.com + +# Generate signing keys +stella keys generate --type rsa + +# Rotate signing keys (if approaching expiration) +stella keys rotate + +# Schedule automatic key rotation +stella keys rotate --schedule 30d + +# Check key store health +stella doctor run --check check.crypto.keystore +``` + +### Kubernetes / Helm + +```bash +# Check authority pod configuration +kubectl get configmap stellaops-authority-config -n stellaops -o yaml + +# Set issuer URL in Helm values +# authority: +# issuer: +# url: "https://auth.yourdomain.com" +helm upgrade stellaops stellaops/stellaops -f values.yaml + +# 
Generate keys via job +kubectl exec -it deploy/stellaops-authority -n stellaops -- \ + stella keys generate --type rsa + +# Check secrets for key material +kubectl get secret stellaops-signing-keys -n stellaops +``` + +## Verification + +``` +stella doctor run --check check.auth.config +``` + +## Related Checks + +- `check.auth.signing-key` -- deeper signing key health (algorithm, size, rotation schedule) +- `check.auth.token-service` -- verifies token endpoint is responsive +- `check.auth.oidc` -- external OIDC provider connectivity diff --git a/docs/doctor/articles/auth/oidc.md b/docs/doctor/articles/auth/oidc.md new file mode 100644 index 000000000..703023ccc --- /dev/null +++ b/docs/doctor/articles/auth/oidc.md @@ -0,0 +1,100 @@ +--- +checkId: check.auth.oidc +plugin: stellaops.doctor.auth +severity: warn +tags: [auth, oidc, connectivity] +--- +# OIDC Provider Connectivity + +## What It Checks + +Tests connectivity to an external OIDC provider by performing real HTTP requests. The check reads the issuer URL from configuration keys (in priority order): `Authentication:Oidc:Issuer`, `Auth:Oidc:Authority`, `Oidc:Issuer`. If none is configured, the check passes immediately (local authority mode). + +When an external provider is configured, the check performs a multi-step validation: + +1. **Fetch discovery document** -- HTTP GET to `{issuerUrl}/.well-known/openid-configuration` with a 10-second timeout. If unreachable: **Fail** with connection error type classification (ssl_error, dns_failure, refused, timeout, connection_failed). +2. **Validate discovery fields** -- Parses the discovery JSON and verifies presence of `authorization_endpoint`, `token_endpoint`, and `jwks_uri`. If any are missing: **Warn** listing the missing fields. +3. **Fetch JWKS** -- HTTP GET to the `jwks_uri` from the discovery document. Counts the number of keys in the `keys` array. If zero keys: **Warn** (token validation may fail). +4. 
**All healthy** -- provider reachable, discovery valid, JWKS has keys. Result: **Pass**. + +Evidence collected: `issuer_url`, `discovery_reachable`, `discovery_response_ms`, `authorization_endpoint_present`, `token_endpoint_present`, `jwks_uri_present`, `jwks_key_count`, `jwks_fetch_ms`, `http_status_code`, `error_message`, `connection_error_type`. + +## Why It Matters + +When Stella Ops is configured to delegate authentication to an external OIDC provider (Azure AD, Keycloak, Okta, etc.), all user logins and token validations depend on that provider being reachable and correctly configured. A connectivity failure means users cannot log in, and services cannot validate tokens, leading to a platform-wide authentication outage. + +## Common Causes + +- OIDC provider is down or undergoing maintenance +- Network connectivity issue (proxy misconfiguration, firewall rule change) +- DNS resolution failure for the provider hostname +- Firewall blocking outbound HTTPS to the provider +- Discovery document missing required fields (misconfigured provider) +- Token endpoint misconfigured after provider upgrade +- JWKS endpoint returning empty key set (key rotation in progress) +- OIDC provider rate limiting or returning errors + +## How to Fix + +### Docker Compose + +```bash +# Test OIDC provider connectivity from the authority container +docker compose -f devops/compose/docker-compose.stella-ops.yml exec authority \ + curl -s https://<oidc-provider-host>/.well-known/openid-configuration | jq . + +# Check DNS resolution +docker compose -f devops/compose/docker-compose.stella-ops.yml exec authority \ + nslookup <oidc-provider-host> + +# Set OIDC configuration via environment +# AUTHENTICATION__OIDC__ISSUER=https://login.microsoftonline.com/<tenant-id>/v2.0 +``` + +### Bare Metal / systemd + +```bash +# Test provider connectivity +curl -s https://<oidc-provider-host>/.well-known/openid-configuration | jq . 
+ +# Check DNS resolution +nslookup <oidc-provider-host> + +# Validate OIDC configuration +stella auth oidc validate + +# Check JWKS endpoint +curl -s $(curl -s https://<oidc-provider-host>/.well-known/openid-configuration | jq -r .jwks_uri) | jq . + +# Check network connectivity +stella doctor run --check check.network.dns +``` + +### Kubernetes / Helm + +```bash +# Test from authority pod +kubectl exec -it deploy/stellaops-authority -n stellaops -- \ + curl -s https://<oidc-provider-host>/.well-known/openid-configuration | jq . + +# Check NetworkPolicy allows egress to OIDC provider +kubectl get networkpolicy -n stellaops -o yaml | grep -A10 egress + +# Set OIDC configuration in Helm values +# authority: +# oidc: +# issuer: "https://login.microsoftonline.com/<tenant-id>/v2.0" +helm upgrade stellaops stellaops/stellaops -f values.yaml +``` + +## Verification + +``` +stella doctor run --check check.auth.oidc +``` + +## Related Checks + +- `check.auth.config` -- overall auth configuration health +- `check.auth.signing-key` -- local signing key health (used when not delegating to external OIDC) +- `check.auth.token-service` -- token endpoint availability diff --git a/docs/doctor/articles/auth/signing-key.md b/docs/doctor/articles/auth/signing-key.md new file mode 100644 index 000000000..184b09fbf --- /dev/null +++ b/docs/doctor/articles/auth/signing-key.md @@ -0,0 +1,106 @@ +--- +checkId: check.auth.signing-key +plugin: stellaops.doctor.auth +severity: fail +tags: [auth, security, keys] +--- +# Signing Key Health + +## What It Checks + +Verifies the health of the active signing key used for token issuance. The check evaluates three conditions in sequence: + +1. **No active key** -- if `HasActiveKey` is false: **Fail** with "No active signing key available". Evidence includes `ActiveKey: NONE` and total key count. +2. **Approaching expiration** -- if the active key expires within **30 days** (`ExpirationWarningDays`): **Warn** with the number of days remaining. 
Evidence includes key ID, algorithm, days until expiration, and whether rotation is scheduled. +3. **Healthy** -- active key exists with more than 30 days until expiration. Result: **Pass**. Evidence includes key ID, algorithm, key size (bits), days until expiration, and rotation schedule status. + +The check always runs (`CanRun` returns true). + +Evidence collected: `ActiveKeyId`, `Algorithm`, `KeySize`, `DaysUntilExpiration`, `RotationScheduled` (YES/NO), `TotalKeys`. + +## Why It Matters + +The signing key is used to sign every JWT token issued by the Authority service. If no active key exists, no tokens can be issued, and the entire platform's authentication stops working. If the key is approaching expiration without a rotation plan, the platform faces a hard outage on the expiration date -- all tokens signed with the key become unverifiable. + +## Common Causes + +- Signing keys not generated (incomplete setup) +- All keys expired without rotation +- Key store corrupted (file system issue, accidental deletion) +- Key rotation not scheduled (manual process that was forgotten) +- Previous rotation attempt failed (permissions, HSM connectivity) + +## How to Fix + +### Docker Compose + +```bash +# Check current key status +docker compose -f devops/compose/docker-compose.stella-ops.yml exec authority \ + stella keys status + +# Generate new signing key +docker compose -f devops/compose/docker-compose.stella-ops.yml exec authority \ + stella keys generate --type rsa --bits 4096 + +# Activate the new key +docker compose -f devops/compose/docker-compose.stella-ops.yml exec authority \ + stella keys activate + +# Rotate keys +docker compose -f devops/compose/docker-compose.stella-ops.yml exec authority \ + stella keys rotate +``` + +### Bare Metal / systemd + +```bash +# Generate new signing key +stella keys generate --type rsa --bits 4096 + +# Activate the key +stella keys activate + +# Rotate signing key +stella keys rotate + +# Schedule automatic rotation (every 30 
days) +stella keys rotate --schedule 30d + +# Check key status +stella keys status +``` + +### Kubernetes / Helm + +```bash +# Check key status +kubectl exec -it deploy/stellaops-authority -n stellaops -- \ + stella keys status + +# Generate a new signing key (activate it afterwards with `stella keys activate`, as in the Docker Compose steps) +kubectl exec -it deploy/stellaops-authority -n stellaops -- \ + stella keys generate --type rsa --bits 4096 + +# Set automatic rotation in Helm values +# authority: +# signing: +# autoRotate: true +# rotationIntervalDays: 30 +helm upgrade stellaops stellaops/stellaops -f values.yaml + +# Check signing key secret (lists entry names and sizes without printing key material) +kubectl describe secret stellaops-signing-keys -n stellaops +``` + +## Verification + +``` +stella doctor run --check check.auth.signing-key +``` + +## Related Checks + +- `check.auth.config` -- overall auth configuration including signing key presence +- `check.auth.token-service` -- token issuance depends on a healthy signing key +- `check.attestation.keymaterial` -- attestor signing keys (separate from auth signing keys) diff --git a/docs/doctor/articles/auth/token-service.md b/docs/doctor/articles/auth/token-service.md new file mode 100644 index 000000000..7954bde57 --- /dev/null +++ b/docs/doctor/articles/auth/token-service.md @@ -0,0 +1,114 @@ +--- +checkId: check.auth.token-service +plugin: stellaops.doctor.auth +severity: fail +tags: [auth, service, health] +--- +# Token Service Health + +## What It Checks + +Verifies the availability and performance of the token service endpoint (`/connect/token`). The check evaluates four conditions: + +1. **Service unavailable** -- token endpoint is not responding. Result: **Fail** with the endpoint URL and error message. +2. **Critically slow** -- response time exceeds **2000ms**. Result: **Fail** with actual response time and threshold. +3. **Slow** -- response time exceeds **500ms** but is under 2000ms. Result: **Warn** with response time, threshold, and token issuance count. +4. 
**Healthy** -- service is available and response time is under 500ms. Result: **Pass** with response time, tokens issued in last 24 hours, and active session count. + +Evidence collected: `ServiceAvailable` (YES/NO), `Endpoint`, `ResponseTimeMs`, `CriticalThreshold` (2000), `WarningThreshold` (500), `TokensIssuedLast24h`, `ActiveSessions`, `Error`. + +The check always runs (`CanRun` returns true). + +## Why It Matters + +The token service is the single point through which all access tokens are issued. If it is unavailable, no user can log in, no service can authenticate, and every API call fails with 401. Even if the service is available but slow, user login experiences degrade, automated integrations time out, and the platform feels unresponsive. This check is typically the first to detect Authority database issues or resource starvation. + +## Common Causes + +- Authority service not running (container stopped, process crashed) +- Token endpoint misconfigured (wrong path, wrong port) +- Database connectivity issue (Authority cannot query clients/keys) +- Database performance issues (slow queries for token validation) +- Service overloaded (high authentication request volume) +- Resource contention (CPU/memory pressure on Authority host) +- Higher than normal load (warning-level) +- Database query performance degraded (warning-level) + +## How to Fix + +### Docker Compose + +```bash +# Check Authority service status +docker compose -f devops/compose/docker-compose.stella-ops.yml ps authority + +# View Authority service logs +docker compose -f devops/compose/docker-compose.stella-ops.yml logs authority --tail 200 + +# Restart Authority service +docker compose -f devops/compose/docker-compose.stella-ops.yml restart authority + +# Test token endpoint directly +docker compose -f devops/compose/docker-compose.stella-ops.yml exec authority \ + curl -s -o /dev/null -w "%{http_code} %{time_total}s" http://localhost:80/connect/token + +# Check database connectivity +docker 
compose -f devops/compose/docker-compose.stella-ops.yml exec authority \ + stella doctor run --check check.storage.postgres +``` + +### Bare Metal / systemd + +```bash +# Check authority service status +stella auth status + +# Restart authority service +stella service restart authority + +# Check database connectivity +stella doctor run --check check.storage.postgres + +# Monitor service metrics +stella auth metrics --period 1h + +# Review database performance +stella doctor run --check check.storage.performance + +# Watch metrics in real-time (warning-level slowness) +stella auth metrics --watch +``` + +### Kubernetes / Helm + +```bash +# Check authority pod status +kubectl get pods -l app.kubernetes.io/component=authority -n stellaops + +# View pod logs +kubectl logs -l app.kubernetes.io/component=authority -n stellaops --tail=200 + +# Check resource usage +kubectl top pods -l app.kubernetes.io/component=authority -n stellaops + +# Restart authority pods +kubectl rollout restart deployment/stellaops-authority -n stellaops + +# Scale up if under load +kubectl scale deployment stellaops-authority --replicas=3 -n stellaops + +# Check liveness/readiness probe status +kubectl describe pod -l app.kubernetes.io/component=authority -n stellaops | grep -A5 "Liveness\|Readiness" +``` + +## Verification + +``` +stella doctor run --check check.auth.token-service +``` + +## Related Checks + +- `check.auth.config` -- auth must be configured before the token service can function +- `check.auth.signing-key` -- token issuance requires a valid signing key +- `check.auth.oidc` -- if delegating to external OIDC, that provider must also be healthy diff --git a/docs/doctor/articles/binary-analysis/buildinfo-cache.md b/docs/doctor/articles/binary-analysis/buildinfo-cache.md new file mode 100644 index 000000000..070158bdf --- /dev/null +++ b/docs/doctor/articles/binary-analysis/buildinfo-cache.md @@ -0,0 +1,74 @@ +--- +checkId: check.binaryanalysis.buildinfo.cache +plugin: 
stellaops.doctor.binaryanalysis +severity: warn +tags: [binaryanalysis, buildinfo, debian, cache, security] +--- +# Debian Buildinfo Cache + +## What It Checks +Verifies Debian buildinfo service accessibility and local cache directory configuration. The check: + +- Tests HTTPS connectivity to `buildinfos.debian.net` and `reproduce.debian.net` via HEAD requests. +- Checks the local cache directory (default `/var/cache/stella/buildinfo`, configurable via `BinaryAnalysis:BuildinfoCache:Directory`) for existence and writability by creating and deleting a temp file. +- Fails if both services are unreachable AND the cache directory does not exist. +- Warns if services are unreachable but the cache exists (offline mode possible), or if services are reachable but the cache directory is missing or not writable. + +## Why It Matters +Buildinfo files from Debian are used for reproducible-build verification. Without access to buildinfo services or a local cache, binary analysis cannot verify whether packages were built reproducibly, degrading supply-chain assurance for Debian-based container images. 
+ +## Common Causes +- Firewall blocking HTTPS access to Debian buildinfo services +- Network connectivity issues or DNS resolution failure +- Proxy configuration required but not set +- Cache directory not created +- Insufficient permissions on cache directory + +## How to Fix + +### Docker Compose +```yaml +environment: + BinaryAnalysis__BuildinfoCache__Directory: "/var/cache/stella/buildinfo" +volumes: + - buildinfo-cache:/var/cache/stella/buildinfo +``` + +Test connectivity: +```bash +docker exec <binaryindex-container> curl -I https://buildinfos.debian.net +``` + +### Bare Metal / systemd +```bash +# Create cache directory +sudo mkdir -p /var/cache/stella/buildinfo +sudo chmod 755 /var/cache/stella/buildinfo + +# Test connectivity +curl -I https://buildinfos.debian.net + +# If behind a proxy +export HTTPS_PROXY=http://proxy.example.com:8080 +``` + +### Kubernetes / Helm +```yaml +binaryAnalysis: + buildinfo: + cacheDirectory: "/var/cache/stella/buildinfo" + persistence: + enabled: true + size: 5Gi +``` + +For air-gapped environments, pre-populate the buildinfo cache with required files or disable this check. 
+ +## Verification +``` +stella doctor run --check check.binaryanalysis.buildinfo.cache +``` + +## Related Checks +- `check.binaryanalysis.symbol.recovery.fallback` — meta-check ensuring at least one symbol recovery path is available +- `check.binaryanalysis.debuginfod.available` — verifies debuginfod service connectivity diff --git a/docs/doctor/articles/binary-analysis/corpus-mirror-freshness.md b/docs/doctor/articles/binary-analysis/corpus-mirror-freshness.md new file mode 100644 index 000000000..bb64fced8 --- /dev/null +++ b/docs/doctor/articles/binary-analysis/corpus-mirror-freshness.md @@ -0,0 +1,67 @@ +--- +checkId: check.binaryanalysis.corpus.mirror.freshness +plugin: stellaops.doctor.binaryanalysis +severity: warn +tags: [binaryanalysis, corpus, mirrors, freshness, security, groundtruth] +--- +# Corpus Mirror Freshness + +## What It Checks +Verifies that local corpus mirrors are not stale. The check: + +- Reads the mirrors root directory (default `/var/lib/stella/mirrors`, configurable via `BinaryAnalysis:Corpus:MirrorsDirectory`). +- Inspects five known mirror subdirectories: `debian/archive`, `debian/snapshot`, `ubuntu/usn-index`, `alpine/secdb`, and `osv`. +- For each existing mirror, finds the most recent file modification time (sampling up to 1000 files) and compares it against a staleness threshold (default 7 days, configurable via `BinaryAnalysis:Corpus:StalenessThresholdDays`). +- Fails if no mirrors exist or all mirrors are stale. Warns if some mirrors are stale. Reports info if all present mirrors are fresh but optional mirrors are missing. + +## Why It Matters +Corpus mirrors provide ground-truth vulnerability and package data for binary analysis. Stale mirrors mean symbol recovery operates on outdated data, leading to missed vulnerabilities and inaccurate matching in security scans. 
+ +## Common Causes +- Corpus mirrors have not been initialized +- Mirror sync job has not run recently or is disabled +- Network connectivity issues preventing sync +- Air-gapped setup incomplete (mirrors not pre-populated) + +## How to Fix + +### Docker Compose +```bash +# Initialize all mirrors +docker exec <binaryindex-container> stella groundtruth mirror sync --all +``` + +### Bare Metal / systemd +```bash +# Create mirrors directory +sudo mkdir -p /var/lib/stella/mirrors + +# Sync all mirrors +stella groundtruth mirror sync --all + +# Set up a timer for automatic sync +sudo systemctl enable stella-mirror-sync.timer +sudo systemctl start stella-mirror-sync.timer +``` + +### Kubernetes / Helm +```yaml +binaryAnalysis: + corpus: + mirrorsDirectory: "/var/lib/stella/mirrors" + syncSchedule: "0 2 * * *" # daily at 2am + persistence: + enabled: true + size: 50Gi +``` + +For air-gapped environments, transfer pre-populated mirrors from an online system. + +## Verification +``` +stella doctor run --check check.binaryanalysis.corpus.mirror.freshness +``` + +## Related Checks +- `check.binaryanalysis.corpus.kpi.baseline` — verifies KPI baseline exists for regression detection +- `check.binaryanalysis.symbol.recovery.fallback` — meta-check for symbol recovery path availability diff --git a/docs/doctor/articles/binary-analysis/ddeb-repo-enabled.md b/docs/doctor/articles/binary-analysis/ddeb-repo-enabled.md new file mode 100644 index 000000000..87e2a7516 --- /dev/null +++ b/docs/doctor/articles/binary-analysis/ddeb-repo-enabled.md @@ -0,0 +1,61 @@ +--- +checkId: check.binaryanalysis.ddeb.enabled +plugin: stellaops.doctor.binaryanalysis +severity: warn +tags: [binaryanalysis, ddeb, ubuntu, symbols, security] +--- +# Ubuntu Ddeb Repository + +## What It Checks +Verifies Ubuntu debug symbol repository (ddebs.ubuntu.com) is configured and accessible. 
The check (Linux only): + +- Parses `/etc/apt/sources.list` and `/etc/apt/sources.list.d/*.list` (and `.sources` DEB822 files) for entries containing `ddebs.ubuntu.com`. +- Tests HTTP connectivity to `http://ddebs.ubuntu.com` via a HEAD request. +- Detects the distribution codename from `/etc/lsb-release` or `/etc/os-release`. +- Reports different warnings based on whether the repo is configured, reachable, or both. +- Skips on non-Linux platforms. + +## Why It Matters +The Ubuntu ddeb repository provides debug symbol packages (`-dbgsym`) needed for binary analysis of Ubuntu-based container images. Without debug symbols, binary matching accuracy is significantly reduced, weakening vulnerability detection for Ubuntu workloads. + +## Common Causes +- Ddeb repository not added to apt sources +- Network connectivity issues preventing access to ddebs.ubuntu.com +- Firewall blocking HTTP access +- Running on a non-Ubuntu Linux distribution + +## How to Fix + +### Docker Compose +Add ddeb repository inside the binary analysis container: + +```bash +docker exec <binaryindex-container> bash -c \ + 'echo "deb http://ddebs.ubuntu.com $(lsb_release -cs) main restricted universe multiverse" > /etc/apt/sources.list.d/ddebs.list' +docker exec <binaryindex-container> apt-key adv --keyserver keyserver.ubuntu.com \ + --recv-keys F2EDC64DC5AEE1F6B9C621F0C8CAB6595FDFF622 +docker exec <binaryindex-container> apt update +``` + +### Bare Metal / systemd +```bash +echo "deb http://ddebs.ubuntu.com $(lsb_release -cs) main restricted universe multiverse" \ + | sudo tee /etc/apt/sources.list.d/ddebs.list +sudo apt-key adv --keyserver keyserver.ubuntu.com \ + --recv-keys F2EDC64DC5AEE1F6B9C621F0C8CAB6595FDFF622 +sudo apt update +``` + +### Kubernetes / Helm +Include the ddeb repository in your container image's Dockerfile or use an init container to configure it at startup. + +For air-gapped environments, set up a local ddeb mirror or use offline symbol packages. 
+ +## Verification +``` +stella doctor run --check check.binaryanalysis.ddeb.enabled +``` + +## Related Checks +- `check.binaryanalysis.debuginfod.available` — verifies debuginfod service availability +- `check.binaryanalysis.symbol.recovery.fallback` — meta-check for symbol recovery path availability diff --git a/docs/doctor/articles/binary-analysis/debuginfod-availability.md b/docs/doctor/articles/binary-analysis/debuginfod-availability.md new file mode 100644 index 000000000..fdae1e5b6 --- /dev/null +++ b/docs/doctor/articles/binary-analysis/debuginfod-availability.md @@ -0,0 +1,71 @@ +--- +checkId: check.binaryanalysis.debuginfod.available +plugin: stellaops.doctor.binaryanalysis +severity: warn +tags: [binaryanalysis, debuginfod, symbols, security] +--- +# Debuginfod Availability + +## What It Checks +Verifies DEBUGINFOD_URLS environment variable and debuginfod service connectivity. The check: + +- Reads the `DEBUGINFOD_URLS` environment variable (space-separated list of URLs). +- If not set, falls back to the default Fedora debuginfod at `https://debuginfod.fedoraproject.org`. +- Tests HTTP connectivity to each URL via HEAD requests. +- Reports info if DEBUGINFOD_URLS is not set but the default is reachable. +- Warns if some configured URLs are unreachable. Fails if none are reachable. + +## Why It Matters +Debuginfod provides on-demand debug information (DWARF, source) for ELF binaries. It is the primary mechanism for symbol recovery in binary analysis. Without a reachable debuginfod endpoint, binary matching accuracy drops significantly, reducing the effectiveness of vulnerability correlation and reachability analysis. 
+ +## Common Causes +- `DEBUGINFOD_URLS` environment variable is not set +- Configured debuginfod servers may be down +- Firewall blocking HTTPS access to debuginfod servers +- Proxy configuration required but not set +- DNS resolution failure for debuginfod hostnames + +## How to Fix + +### Docker Compose +```yaml +environment: + DEBUGINFOD_URLS: "https://debuginfod.fedoraproject.org" +``` + +Test connectivity: +```bash +docker exec <binaryindex-container> curl -I https://debuginfod.fedoraproject.org +``` + +### Bare Metal / systemd +```bash +# Set the environment variable +export DEBUGINFOD_URLS="https://debuginfod.fedoraproject.org" + +# Or add to service file +sudo systemctl edit stellaops-binaryindex +# Add: Environment=DEBUGINFOD_URLS=https://debuginfod.fedoraproject.org + +# Verify connectivity +curl -I https://debuginfod.fedoraproject.org +``` + +### Kubernetes / Helm +```yaml +binaryAnalysis: + debuginfod: + urls: "https://debuginfod.fedoraproject.org" +``` + +For air-gapped environments, deploy a local debuginfod instance or use offline symbol bundles. See `docs/modules/binary-index/ground-truth-corpus.md` for offline setup. 
+ +## Verification +``` +stella doctor run --check check.binaryanalysis.debuginfod.available +``` + +## Related Checks +- `check.binaryanalysis.ddeb.enabled` — verifies Ubuntu ddeb repository availability +- `check.binaryanalysis.buildinfo.cache` — verifies Debian buildinfo service and cache +- `check.binaryanalysis.symbol.recovery.fallback` — meta-check aggregating all symbol sources diff --git a/docs/doctor/articles/binary-analysis/kpi-baseline-exists.md b/docs/doctor/articles/binary-analysis/kpi-baseline-exists.md new file mode 100644 index 000000000..4ef06b631 --- /dev/null +++ b/docs/doctor/articles/binary-analysis/kpi-baseline-exists.md @@ -0,0 +1,75 @@ +--- +checkId: check.binaryanalysis.corpus.kpi.baseline +plugin: stellaops.doctor.binaryanalysis +severity: warn +tags: [binaryanalysis, corpus, kpi, baseline, regression, ci, groundtruth, security] +--- +# KPI Baseline Configuration + +## What It Checks +Verifies that a KPI baseline file exists for regression detection in CI gates. The check: + +- Looks for a baseline file at the configured directory (default `/var/lib/stella/baselines`) and filename (default `current.json`), configurable via `BinaryAnalysis:Corpus:BaselineDirectory` and `BinaryAnalysis:Corpus:BaselineFilename`. +- If the directory does not exist, warns. +- If the default baseline file is missing but other `.json` files exist in the directory, warns and identifies the latest one. +- Validates the baseline file as JSON and checks for expected KPI fields: `precision`, `recall`, `falseNegativeRate`, `deterministicReplayRate`, `ttfrpP95Ms`. +- Fails if the file exists but is invalid JSON or has no recognized KPI fields. +- Warns if some recommended fields are missing. + +## Why It Matters +Without a KPI baseline, CI gates cannot detect regressions in binary matching accuracy. A regression in precision or recall means vulnerability detection quality has degraded without anyone being alerted. 
The baseline enables automated quality gates that block releases when binary analysis accuracy drops. + +## Common Causes +- KPI baseline has never been established (first run of corpus validation not yet completed) +- Baseline directory path misconfigured +- Baseline file was deleted or corrupted +- Baseline created with an older tool version missing newer KPI fields + +## How to Fix + +### Docker Compose +```bash +# Create baseline directory +docker exec <binaryindex-container> mkdir -p /var/lib/stella/baselines + +# Run corpus validation to establish baseline +docker exec <binaryindex-container> stella groundtruth validate run \ + --corpus datasets/golden-corpus/seed/ --output-baseline +``` + +### Bare Metal / systemd +```bash +sudo mkdir -p /var/lib/stella/baselines + +# Run validation and save baseline +stella groundtruth validate run \ + --corpus datasets/golden-corpus/seed/ \ + --output /var/lib/stella/baselines/current.json + +# Or promote latest results +stella groundtruth baseline update --from-latest \ + --output /var/lib/stella/baselines/current.json +``` + +### Kubernetes / Helm +```yaml +binaryAnalysis: + corpus: + baselineDirectory: "/var/lib/stella/baselines" + persistence: + enabled: true +``` + +Run a one-time job to establish the baseline: +```bash +kubectl exec -it <binaryindex-pod> -- stella groundtruth validate run --output-baseline +``` + +## Verification +``` +stella doctor run --check check.binaryanalysis.corpus.kpi.baseline +``` + +## Related Checks +- `check.binaryanalysis.corpus.mirror.freshness` — verifies corpus mirror data is not stale +- `check.binaryanalysis.symbol.recovery.fallback` — meta-check for symbol recovery availability diff --git a/docs/doctor/articles/binary-analysis/symbol-recovery-fallback.md b/docs/doctor/articles/binary-analysis/symbol-recovery-fallback.md new file mode 100644 index 000000000..d94f9da8e --- /dev/null +++ b/docs/doctor/articles/binary-analysis/symbol-recovery-fallback.md @@ -0,0 +1,69 @@ +--- +checkId: check.binaryanalysis.symbol.recovery.fallback +plugin: 
stellaops.doctor.binaryanalysis +severity: warn +tags: [binaryanalysis, symbols, fallback, security, meta] +--- +# Symbol Recovery Fallback + +## What It Checks +Meta-check that ensures at least one symbol recovery path is available. The check aggregates results from three child checks: + +- **Debuginfod Availability** (`check.binaryanalysis.debuginfod.available`) +- **Ubuntu Ddeb Repository** (`check.binaryanalysis.ddeb.enabled`) -- skipped on non-Linux +- **Debian Buildinfo Cache** (`check.binaryanalysis.buildinfo.cache`) + +Fails if zero sources are available. Reports info if some but not all sources are available. Passes if all sources are operational. + +## Why It Matters +Symbol recovery is critical for binary analysis accuracy. If all symbol sources are unavailable, binary matching operates without debug information, severely degrading vulnerability detection quality. Having at least one source ensures a minimum level of binary analysis capability; having multiple sources provides redundancy. 
+ +## Common Causes +- All symbol recovery endpoints unreachable +- Network connectivity issues affecting all sources +- Firewall blocking access to symbol servers +- Air-gapped environment without offline symbol cache configured + +## How to Fix + +### Docker Compose +Configure at least one symbol source: + +```yaml +environment: + DEBUGINFOD_URLS: "https://debuginfod.fedoraproject.org" + BinaryAnalysis__BuildinfoCache__Directory: "/var/cache/stella/buildinfo" +``` + +### Bare Metal / systemd +```bash +# Option 1: Configure debuginfod +export DEBUGINFOD_URLS="https://debuginfod.fedoraproject.org" + +# Option 2: Set up buildinfo cache +sudo mkdir -p /var/cache/stella/buildinfo + +# Option 3: Configure ddeb repository (Ubuntu) +echo "deb http://ddebs.ubuntu.com $(lsb_release -cs) main" | sudo tee /etc/apt/sources.list.d/ddebs.list +``` + +### Kubernetes / Helm +```yaml +binaryAnalysis: + debuginfod: + urls: "https://debuginfod.fedoraproject.org" + buildinfo: + cacheDirectory: "/var/cache/stella/buildinfo" +``` + +For air-gapped environments, set up an offline symbol bundle. See `docs/modules/binary-index/ground-truth-corpus.md` for instructions on creating and importing offline symbol packs. 
+ +## Verification +``` +stella doctor run --check check.binaryanalysis.symbol.recovery.fallback +``` + +## Related Checks +- `check.binaryanalysis.debuginfod.available` — individual debuginfod connectivity check +- `check.binaryanalysis.ddeb.enabled` — individual Ubuntu ddeb repository check +- `check.binaryanalysis.buildinfo.cache` — individual Debian buildinfo cache check diff --git a/docs/doctor/articles/compliance/attestation-signing.md b/docs/doctor/articles/compliance/attestation-signing.md new file mode 100644 index 000000000..d75894e21 --- /dev/null +++ b/docs/doctor/articles/compliance/attestation-signing.md @@ -0,0 +1,101 @@ +--- +checkId: check.compliance.attestation-signing +plugin: stellaops.doctor.compliance +severity: fail +tags: [compliance, attestation, signing, crypto] +--- +# Attestation Signing Health + +## What It Checks +Monitors attestation signing capability by querying the Attestor service at `/api/v1/signing/status`. The check validates: + +- **Key availability**: whether a signing key is loaded and accessible (via `keyAvailable` in the response). +- **Key expiration**: if the key has an `expiresAt` timestamp, the check fails when the key is already expired, warns when expiry is within 30 days, and passes otherwise. +- **Signing activity**: reports the key type and the number of signatures produced in the last 24 hours. + +The check only runs when `Attestor:Url` or `Services:Attestor:Url` is configured. It uses a 10-second HTTP timeout. + +| Condition | Result | +|---|---| +| Attestor unreachable or HTTP error | Fail | +| Key not available | Fail | +| Key expired | Fail | +| Key expires within 30 days | Warn | +| Key available and not expiring soon | Pass | + +## Why It Matters +Attestation signing is the foundation of Stella Ops' evidence chain. Without a working signing key, the system cannot create attestations for releases, SBOM scans, or policy decisions. This breaks the entire compliance audit trail and makes releases unverifiable. 
Key expiration without timely rotation causes the same downstream impact as a missing key, but with no advance warning unless monitored. + +## Common Causes +- HSM/KMS connectivity issue preventing key access +- Key rotation in progress (brief window of unavailability) +- Key expired or revoked without replacement +- Permission denied on the key management backend +- Attestor service unavailable or misconfigured endpoint URL + +## How to Fix + +### Docker Compose +Verify the Attestor service is running and the URL is correct: + +```bash +# Check attestor container health +docker compose ps attestor + +# Verify signing key status +docker compose exec attestor stella attestor key status + +# If key is expired, rotate it +docker compose exec attestor stella attestor key rotate + +# Ensure the URL is correct in your .env or compose override +# Attestor__Url=http://attestor:5082 +``` + +### Bare Metal / systemd +Check the Attestor service and key configuration: + +```bash +# Check service status +sudo systemctl status stellaops-attestor + +# Verify key status +stella attestor key status + +# Test HSM/KMS connectivity +stella attestor hsm test + +# Rotate an expired key +stella attestor key rotate + +# If using appsettings.json, verify Attestor:Url is correct +cat /etc/stellaops/appsettings.json | jq '.Attestor' +``` + +### Kubernetes / Helm +```bash +# Check attestor pod status +kubectl get pods -l app=stellaops-attestor + +# Check signing key status +kubectl exec deploy/stellaops-attestor -- stella attestor key status + +# Verify HSM/KMS connectivity from the pod +kubectl exec deploy/stellaops-attestor -- stella attestor hsm test + +# Schedule key rotation via Helm values +helm upgrade stellaops ./charts/stellaops \ + --set attestor.keyRotation.enabled=true \ + --set attestor.keyRotation.scheduleBeforeExpiryDays=30 +``` + +## Verification +``` +stella doctor run --check check.compliance.attestation-signing +``` + +## Related Checks +- `check.compliance.evidence-rate` — 
monitors evidence generation success rate, which depends on signing +- `check.compliance.provenance-completeness` — verifies provenance records exist for releases (requires working signing) +- `check.compliance.evidence-integrity` — verifies signatures on stored evidence +- `check.crypto.hsm` — validates HSM/PKCS#11 module availability used by the signing key diff --git a/docs/doctor/articles/compliance/audit-readiness.md b/docs/doctor/articles/compliance/audit-readiness.md new file mode 100644 index 000000000..d0c5fcc14 --- /dev/null +++ b/docs/doctor/articles/compliance/audit-readiness.md @@ -0,0 +1,100 @@ +--- +checkId: check.compliance.audit-readiness +plugin: stellaops.doctor.compliance +severity: warn +tags: [compliance, audit, evidence] +--- +# Audit Readiness + +## What It Checks +Verifies the system is ready for compliance audits by querying the Evidence Locker at `/api/v1/evidence/audit-readiness`. The check evaluates four readiness criteria: + +- **Retention policy configured**: whether a data retention policy is active. +- **Audit logging enabled**: whether audit log capture is turned on. +- **Backup verified**: whether the most recent backup has been validated. +- **Evidence retention age**: whether the oldest evidence meets the required retention period (default 365 days). + +| Condition | Result | +|---|---| +| Evidence Locker unreachable | Warn | +| 3 or more issues found | Fail | +| 1-2 issues found | Warn | +| All criteria satisfied | Pass | + +Evidence collected: `issues_count`, `retention_policy_configured`, `audit_log_enabled`, `backup_verified`, `evidence_count`, `oldest_evidence_days`. + +The check only runs when `EvidenceLocker:Url` or `Services:EvidenceLocker:Url` is configured. It uses a 15-second HTTP timeout. + +## Why It Matters +Compliance audits (SOC2, FedRAMP, HIPAA, PCI-DSS) require verifiable evidence retention, continuous audit logging, and validated backups. 
If any of these controls is missing, the organization cannot demonstrate compliance during an audit. A missing retention policy means evidence may be silently deleted. Disabled audit logging creates gaps in the chain of custody. Unverified backups risk data loss during incident recovery. + +## Common Causes +- No retention policy configured (default is not set) +- Audit logging disabled in configuration or by error +- Backup verification job not running or failing silently +- Evidence retention shorter than the required period (e.g., 90 days configured but 365 required) + +## How to Fix + +### Docker Compose +```bash +# Configure retention policy +docker compose exec evidence-locker stella evidence retention set --days 365 + +# Enable audit logging +docker compose exec platform stella audit enable + +# Verify backup status +docker compose exec evidence-locker stella evidence backup verify + +# Set environment variables if needed +# EvidenceLocker__Retention__Days=365 +# AuditLog__Enabled=true +``` + +### Bare Metal / systemd +```bash +# Configure retention policy +stella evidence retention set --days 365 + +# Enable audit logging +stella audit enable + +# Verify backup status +stella evidence backup verify + +# Edit appsettings.json +# "EvidenceLocker": { "Retention": { "Days": 365 } } +# "AuditLog": { "Enabled": true } + +sudo systemctl restart stellaops-evidence-locker +``` + +### Kubernetes / Helm +```yaml +# values.yaml +evidenceLocker: + retention: + days: 365 + backup: + enabled: true + schedule: "0 2 * * *" + verifyAfterBackup: true +auditLog: + enabled: true +``` + +```bash +helm upgrade stellaops ./charts/stellaops -f values.yaml +``` + +## Verification +``` +stella doctor run --check check.compliance.audit-readiness +``` + +## Related Checks +- `check.compliance.evidence-integrity` — verifies evidence has not been tampered with +- `check.compliance.export-readiness` — verifies evidence can be exported for auditors +- `check.compliance.evidence-rate` — 
monitors evidence generation health +- `check.compliance.framework` — verifies compliance framework controls are passing diff --git a/docs/doctor/articles/compliance/evidence-integrity.md b/docs/doctor/articles/compliance/evidence-integrity.md new file mode 100644 index 000000000..30186a4f8 --- /dev/null +++ b/docs/doctor/articles/compliance/evidence-integrity.md @@ -0,0 +1,100 @@ +--- +checkId: check.compliance.evidence-integrity +plugin: stellaops.doctor.compliance +severity: fail +tags: [compliance, security, integrity, signatures] +--- +# Evidence Integrity + +## What It Checks +Detects evidence tampering or integrity issues by querying the Evidence Locker at `/api/v1/evidence/integrity-check`. The check verifies cryptographic signatures and hash chains across all stored evidence records. It evaluates: + +- **Tampered records**: evidence records where the signature or hash does not match the stored content. +- **Verification errors**: records that could not be verified (e.g., missing certificates, unsupported algorithms). +- **Hash chain validity**: whether the sequential hash chain linking evidence records is intact. + +| Condition | Result | +|---|---| +| Evidence Locker unreachable | Warn | +| Any tampered records detected (tamperedCount > 0) | Fail (CRITICAL) | +| Verification errors but no tampering | Warn | +| All records verified, no tampering | Pass | + +Evidence collected: `tampered_count`, `verified_count`, `total_checked`, `first_tampered_id`, `verification_errors`, `hash_chain_valid`. + +The check only runs when `EvidenceLocker:Url` or `Services:EvidenceLocker:Url` is configured. It uses a 60-second HTTP timeout due to the intensive nature of the integrity scan. + +## Why It Matters +Evidence integrity is the cornerstone of compliance and audit trust. Tampered evidence records indicate either storage corruption, a security breach, or malicious modification of release decisions. 
Any tampering invalidates the entire evidence chain and must be treated as a security incident. Verification errors, while less severe, mean some evidence cannot be independently validated, weakening the audit posture. + +## Common Causes +- Evidence modification after signing (accidental or malicious) +- Storage corruption (disk errors, incomplete writes) +- Malicious tampering by an attacker with storage access +- Key or certificate mismatch after key rotation +- Missing signing certificates needed for verification +- Certificate expiration rendering signatures unverifiable +- Unsupported signature algorithm in older evidence records + +## How to Fix + +### Docker Compose +```bash +# List tampered evidence (DO NOT DELETE - preserve for investigation) +docker compose exec evidence-locker stella evidence audit --tampered + +# Check for storage corruption +docker compose exec evidence-locker stella evidence integrity-check --verbose + +# If tampering is confirmed, escalate to security team +# Preserve all logs and evidence for forensic analysis +docker compose logs evidence-locker > evidence-locker-forensic.log + +# For verification errors (missing certs), import the required certificates +docker compose exec evidence-locker stella evidence certs import --path /certs/ +``` + +### Bare Metal / systemd +```bash +# List tampered evidence +stella evidence audit --tampered + +# Full integrity check with details +stella evidence integrity-check --verbose + +# Check for disk errors +sudo smartctl -H /dev/sda +sudo fsck -n /dev/sda1 + +# Import missing certificates for verification +stella evidence certs import --path /etc/stellaops/certs/ + +# DO NOT delete tampered evidence - preserve for investigation +``` + +### Kubernetes / Helm +```bash +# List tampered evidence +kubectl exec deploy/stellaops-evidence-locker -- stella evidence audit --tampered + +# Full integrity check +kubectl exec deploy/stellaops-evidence-locker -- stella evidence integrity-check --verbose + +# 
Check persistent volume health +kubectl describe pvc stellaops-evidence-data + +# Export forensic logs +kubectl logs deploy/stellaops-evidence-locker --all-containers > forensic.log +``` + +## Verification +``` +stella doctor run --check check.compliance.evidence-integrity +``` + +## Related Checks +- `check.compliance.attestation-signing` — signing key health affects evidence signature creation +- `check.compliance.evidence-rate` — evidence generation failures may relate to integrity issues +- `check.evidencelocker.merkle` — Merkle anchor verification provides additional integrity guarantees +- `check.evidencelocker.provenance` — provenance chain integrity validates the evidence chain +- `check.compliance.audit-readiness` — overall audit readiness depends on evidence integrity diff --git a/docs/doctor/articles/compliance/evidence-rate.md b/docs/doctor/articles/compliance/evidence-rate.md new file mode 100644 index 000000000..b9a9dfb1d --- /dev/null +++ b/docs/doctor/articles/compliance/evidence-rate.md @@ -0,0 +1,94 @@ +--- +checkId: check.compliance.evidence-rate +plugin: stellaops.doctor.compliance +severity: fail +tags: [compliance, evidence, attestation] +--- +# Evidence Generation Rate + +## What It Checks +Monitors evidence generation success rate by querying the Evidence Locker at `/api/v1/evidence/metrics`. The check computes the success rate as `(totalGenerated - failed) / totalGenerated` over the last 24 hours and compares it against two thresholds: + +| Condition | Result | +|---|---| +| Evidence Locker unreachable | Warn | +| Success rate < 95% | Fail | +| Success rate >= 95% and < 99% | Warn | +| Success rate >= 99% | Pass | + +Evidence collected: `success_rate`, `total_generated_24h`, `failed_24h`, `pending_24h`, `avg_generation_time_ms`. + +The check only runs when `EvidenceLocker:Url` or `Services:EvidenceLocker:Url` is configured. It uses a 10-second HTTP timeout. If no evidence has been generated (`totalGenerated == 0`), the success rate defaults to 100%. 
+ +## Why It Matters +Evidence generation is a critical path in the release pipeline. Every release decision, scan result, and policy evaluation produces evidence that feeds compliance audits and attestation chains. A dropping success rate means evidence records are being lost, which creates gaps in the audit trail. Below 95%, the system is losing more than 1 in 20 evidence records, making compliance reporting unreliable and potentially invalidating release approvals that lack supporting evidence. + +## Common Causes +- Evidence generation service failures (internal errors, OOM) +- Database connectivity issues preventing evidence persistence +- Signing key unavailable, blocking signed evidence creation +- Storage quota exceeded on the evidence backend +- Intermittent failures due to high load or resource contention + +## How to Fix + +### Docker Compose +```bash +# Check evidence locker logs for errors +docker compose logs evidence-locker --since 1h | grep -i error + +# Verify signing keys +docker compose exec evidence-locker stella evidence keys status + +# Check database connectivity +docker compose exec evidence-locker stella evidence db check + +# Check storage capacity +docker compose exec evidence-locker df -h /data/evidence + +# If storage is full, clean up or expand volume +docker compose exec evidence-locker stella evidence cleanup --older-than 90d --dry-run +``` + +### Bare Metal / systemd +```bash +# Check service logs +journalctl -u stellaops-evidence-locker --since "1 hour ago" | grep -i error + +# Verify signing keys +stella evidence keys status + +# Check database connectivity +stella evidence db check + +# Check storage usage +df -h /var/lib/stellaops/evidence + +sudo systemctl restart stellaops-evidence-locker +``` + +### Kubernetes / Helm +```bash +# Check evidence locker pod logs +kubectl logs deploy/stellaops-evidence-locker --since=1h | grep -i error + +# Verify signing keys +kubectl exec deploy/stellaops-evidence-locker -- stella evidence keys 
status + +# Check persistent volume usage +kubectl exec deploy/stellaops-evidence-locker -- df -h /data/evidence + +# Check for OOMKilled pods +kubectl get events --field-selector reason=OOMKilled -n stellaops +``` + +## Verification +``` +stella doctor run --check check.compliance.evidence-rate +``` + +## Related Checks +- `check.compliance.attestation-signing` — signing key health affects evidence generation +- `check.compliance.evidence-integrity` — integrity of generated evidence +- `check.compliance.provenance-completeness` — provenance depends on evidence generation +- `check.compliance.audit-readiness` — overall audit readiness depends on evidence availability diff --git a/docs/doctor/articles/compliance/export-readiness.md b/docs/doctor/articles/compliance/export-readiness.md new file mode 100644 index 000000000..ca843a981 --- /dev/null +++ b/docs/doctor/articles/compliance/export-readiness.md @@ -0,0 +1,104 @@ +--- +checkId: check.compliance.export-readiness +plugin: stellaops.doctor.compliance +severity: warn +tags: [compliance, export, audit] +--- +# Evidence Export Readiness + +## What It Checks +Verifies that evidence can be exported in auditor-ready formats by querying the Evidence Locker at `/api/v1/evidence/export/capabilities`. The check evaluates four export capabilities: + +- **PDF export**: ability to generate PDF evidence reports. +- **JSON export**: ability to export evidence as structured JSON. +- **Signed bundle export**: ability to create cryptographically signed evidence bundles. +- **Chain of custody report**: ability to generate chain-of-custody documentation. + +| Condition | Result | +|---|---| +| Evidence Locker unreachable | Warn | +| 2 or more export formats unavailable | Fail | +| 1 export format unavailable | Warn | +| All 4 export formats available | Pass | + +Evidence collected: `pdf_export`, `json_export`, `signed_bundle`, `chain_of_custody`, `available_formats`. 
+ +The check only runs when `EvidenceLocker:Url` or `Services:EvidenceLocker:Url` is configured. It uses a 10-second HTTP timeout. + +## Why It Matters +Auditors require evidence in specific formats. PDF reports are the most common delivery format for compliance reviews. Signed bundles provide cryptographic proof of evidence authenticity. The chain of custody report demonstrates that evidence has not been modified since collection. If these export capabilities are not available when an auditor requests them, it delays the audit process and may raise concerns about evidence integrity. + +## Common Causes +- Export dependencies not installed (e.g., PDF rendering libraries) +- Signing keys not configured for evidence bundle signing +- Template files missing for PDF report generation +- Evidence Locker deployed without export module enabled + +## How to Fix + +### Docker Compose +```bash +# Check export configuration +docker compose exec evidence-locker stella evidence export --check + +# Verify export dependencies are installed +docker compose exec evidence-locker dpkg -l | grep -i wkhtmltopdf + +# Enable export features in environment +# EvidenceLocker__Export__PdfEnabled=true +# EvidenceLocker__Export__SignedBundleEnabled=true +# EvidenceLocker__Export__ChainOfCustodyEnabled=true + +# Restart after configuration changes +docker compose restart evidence-locker +``` + +### Bare Metal / systemd +```bash +# Check export configuration +stella evidence export --check + +# Install PDF rendering dependencies if missing +sudo apt install wkhtmltopdf + +# Configure export in appsettings.json +# "EvidenceLocker": { +# "Export": { +# "PdfEnabled": true, +# "SignedBundleEnabled": true, +# "ChainOfCustodyEnabled": true +# } +# } + +sudo systemctl restart stellaops-evidence-locker +``` + +### Kubernetes / Helm +```yaml +# values.yaml +evidenceLocker: + export: + pdfEnabled: true + jsonEnabled: true + signedBundleEnabled: true + chainOfCustodyEnabled: true + signingKeySecret: 
"stellaops-export-signing-key" +``` + +```bash +# Create signing key secret for bundles +kubectl create secret generic stellaops-export-signing-key \ + --from-file=key.pem=./export-signing-key.pem + +helm upgrade stellaops ./charts/stellaops -f values.yaml +``` + +## Verification +``` +stella doctor run --check check.compliance.export-readiness +``` + +## Related Checks +- `check.compliance.audit-readiness` — overall audit readiness including retention and logging +- `check.compliance.attestation-signing` — signing key health required for signed bundle export +- `check.compliance.evidence-integrity` — integrity of the evidence being exported diff --git a/docs/doctor/articles/compliance/framework.md b/docs/doctor/articles/compliance/framework.md new file mode 100644 index 000000000..07be6a914 --- /dev/null +++ b/docs/doctor/articles/compliance/framework.md @@ -0,0 +1,90 @@ +--- +checkId: check.compliance.framework +plugin: stellaops.doctor.compliance +severity: warn +tags: [compliance, framework, soc2, fedramp] +--- +# Compliance Framework + +## What It Checks +Verifies that configured compliance framework requirements are met by querying the Policy service at `/api/v1/compliance/status`. The check supports SOC2, FedRAMP, HIPAA, PCI-DSS, and custom frameworks. It evaluates: + +- **Failing controls**: any compliance controls in a failed state trigger a fail result. +- **Compliance score**: a score below 100% (but with zero failing controls) triggers a warning. +- **Control counts**: reports total, passing, and failing control counts along with the framework name. + +| Condition | Result | +|---|---| +| Policy service unreachable | Warn | +| Any controls failing (failingControls > 0) | Fail | +| Compliance score < 100% | Warn | +| All controls passing, score = 100% | Pass | + +The check only runs when `Compliance:Frameworks` is configured. It uses a 15-second HTTP timeout. 
+ +## Why It Matters +Compliance frameworks define the security and operational controls your organization must satisfy. Failing controls mean the system is not meeting regulatory requirements, which can result in audit findings, failed certifications, or legal exposure. Even partial non-compliance (score below 100%) indicates controls that need attention before the next audit cycle. + +## Common Causes +- Control requirements not implemented in the platform configuration +- Evidence gaps where expected artifacts are missing +- Policy violations detected by the policy engine +- Configuration drift from the established compliance baseline +- New controls added to the framework that have not been addressed + +## How to Fix + +### Docker Compose +```bash +# List all failing controls +docker compose exec policy stella compliance audit --failing + +# Generate remediation plan +docker compose exec policy stella compliance remediate --plan + +# Review compliance status in detail +docker compose exec policy stella compliance status --framework soc2 + +# Configure frameworks in your .env +# Compliance__Frameworks=soc2,hipaa +``` + +### Bare Metal / systemd +```bash +# List failing controls +stella compliance audit --failing + +# Generate remediation plan +stella compliance remediate --plan + +# Configure frameworks in appsettings.json +# "Compliance": { "Frameworks": "soc2,hipaa" } + +sudo systemctl restart stellaops-policy +``` + +### Kubernetes / Helm +```yaml +# values.yaml +compliance: + frameworks: "soc2,hipaa" + autoRemediate: false + reportSchedule: "0 6 * * 1" # Weekly Monday 6am +``` + +```bash +# Apply and check +helm upgrade stellaops ./charts/stellaops -f values.yaml +kubectl exec deploy/stellaops-policy -- stella compliance audit --failing +``` + +## Verification +``` +stella doctor run --check check.compliance.framework +``` + +## Related Checks +- `check.compliance.audit-readiness` — verifies the system is ready for compliance audits +- 
`check.compliance.evidence-integrity` — verifies evidence integrity for compliance evidence +- `check.compliance.provenance-completeness` — verifies provenance records support compliance claims +- `check.compliance.export-readiness` — verifies evidence can be exported for auditor review diff --git a/docs/doctor/articles/compliance/provenance-completeness.md b/docs/doctor/articles/compliance/provenance-completeness.md new file mode 100644 index 000000000..cd3d8b8a7 --- /dev/null +++ b/docs/doctor/articles/compliance/provenance-completeness.md @@ -0,0 +1,102 @@ +--- +checkId: check.compliance.provenance-completeness +plugin: stellaops.doctor.compliance +severity: fail +tags: [compliance, provenance, slsa] +--- +# Provenance Completeness + +## What It Checks +Verifies that provenance records exist for all releases by querying the Provenance service at `/api/v1/provenance/completeness`. The check computes a completeness rate as `(totalReleases - missingCount) / totalReleases` and evaluates the SLSA (Supply-chain Levels for Software Artifacts) level: + +| Condition | Result | +|---|---| +| Provenance service unreachable | Warn | +| Completeness rate < 99% | Fail | +| SLSA level < 2 (but completeness >= 99%) | Warn | +| Completeness >= 99% and SLSA level >= 2 | Pass | + +Evidence collected: `completeness_rate`, `total_releases`, `missing_count`, `slsa_level`. + +The check only runs when `Provenance:Url` or `Services:Provenance:Url` is configured. It uses a 15-second HTTP timeout. If no releases exist (`totalReleases == 0`), completeness defaults to 100%. + +## Why It Matters +Provenance records document the complete history of how a software artifact was built, including the source code, build system, and build steps. Without provenance, there is no verifiable link between source code and the deployed artifact. This is a foundational requirement for SLSA compliance and supply-chain security. 
Missing provenance for even a small percentage of releases creates audit gaps that undermine the trustworthiness of the entire release pipeline. + +## Common Causes +- Build pipeline not configured to generate provenance attestations +- Provenance upload failures due to network or authentication issues +- Legacy releases created before provenance generation was enabled +- Manual deployments that bypass the standard build pipeline +- Build system not meeting SLSA level 2+ requirements + +## How to Fix + +### Docker Compose +```bash +# List releases missing provenance +docker compose exec provenance stella provenance audit --missing + +# Generate backfill provenance for existing releases (dry run first) +docker compose exec provenance stella provenance backfill --dry-run + +# If dry run looks correct, run the actual backfill +docker compose exec provenance stella provenance backfill + +# Check SLSA level +docker compose exec provenance stella provenance slsa-level + +# Ensure provenance generation is enabled in the pipeline +# Provenance__Enabled=true +# Provenance__SlsaLevel=2 +``` + +### Bare Metal / systemd +```bash +# List releases missing provenance +stella provenance audit --missing + +# Backfill provenance (dry run first) +stella provenance backfill --dry-run + +# Check SLSA level configuration +stella provenance slsa-level + +# Configure in appsettings.json +# "Provenance": { "Enabled": true, "SlsaLevel": 2 } + +sudo systemctl restart stellaops-provenance +``` + +### Kubernetes / Helm +```yaml +# values.yaml +provenance: + enabled: true + slsaLevel: 2 + backfill: + enabled: true + schedule: "0 3 * * 0" # Weekly Sunday 3am +``` + +```bash +# List missing provenance +kubectl exec deploy/stellaops-provenance -- stella provenance audit --missing + +# Backfill +kubectl exec deploy/stellaops-provenance -- stella provenance backfill --dry-run + +helm upgrade stellaops ./charts/stellaops -f values.yaml +``` + +## Verification +``` +stella doctor run --check 
check.compliance.provenance-completeness +``` + +## Related Checks +- `check.compliance.attestation-signing` — signing key required for provenance attestations +- `check.compliance.evidence-rate` — evidence generation rate includes provenance records +- `check.compliance.evidence-integrity` — integrity of provenance evidence +- `check.evidencelocker.provenance` — provenance chain integrity at the storage level +- `check.compliance.framework` — compliance frameworks may require specific SLSA levels diff --git a/docs/doctor/articles/core/auth-config.md b/docs/doctor/articles/core/auth-config.md new file mode 100644 index 000000000..534c22abe --- /dev/null +++ b/docs/doctor/articles/core/auth-config.md @@ -0,0 +1,97 @@ +--- +checkId: check.core.auth.config +plugin: stellaops.doctor.core +severity: warn +tags: [security, authentication, configuration] +--- +# Authentication Configuration + +## What It Checks +Verifies that authentication and authorization configuration is valid. The check inspects three configuration sections (`Authentication`, `Authority`, `Identity`) and validates: + +- **JWT settings** (under `Authentication:Jwt`): ensures `Issuer` and `Audience` are set, the `SecretKey` is at least 32 characters long, and the key does not contain common weak values such as "secret" or "changeme". +- **OpenID Connect settings** (under `Authentication:OpenIdConnect`): ensures the `Authority` URL is configured. +- **Authority provider settings** (under `Authority`): reports which providers are enabled via `EnabledProviders`. + +The check only runs when at least one of the three auth configuration sections exists. If none exist, it reports an informational result noting that authentication may not be configured. + +## Why It Matters +Misconfigured authentication allows unauthorized access to the Stella Ops control plane. A missing JWT issuer or audience disables token validation. A short or default signing key can be brute-forced or guessed, enabling token forgery. 
Without a properly configured OIDC authority, federated login flows will fail entirely. + +## Common Causes +- JWT Issuer not configured +- JWT Audience not configured +- JWT SecretKey is shorter than 32 characters +- JWT SecretKey contains common weak values like "secret" or "changeme" +- OpenIdConnect Authority URL is missing +- Using development defaults in production + +## How to Fix + +### Docker Compose +Set the appropriate environment variables in your service definition inside `docker-compose.yml` or the `.env` file: + +```yaml +environment: + Authentication__Jwt__Issuer: "https://stella-ops.local" + Authentication__Jwt__Audience: "stellaops-api" + Authentication__Jwt__SecretKey: "<generated-key-at-least-32-characters>" + Authentication__OpenIdConnect__Authority: "https://authority.stella-ops.local" +``` + +Generate a strong key: +```bash +openssl rand -base64 48 +``` + +### Bare Metal / systemd +Edit `appsettings.json` or `appsettings.Production.json`: + +```json +{ + "Authentication": { + "Jwt": { + "Issuer": "https://stella-ops.yourdomain.com", + "Audience": "stellaops-api", + "SecretKey": "<generated-key-at-least-32-characters>" + }, + "OpenIdConnect": { + "Authority": "https://authority.yourdomain.com" + } + } +} +``` + +Restart the service: +```bash +sudo systemctl restart stellaops-platform +``` + +### Kubernetes / Helm +Set values in your Helm `values.yaml`: + +```yaml +authentication: + jwt: + issuer: "https://stella-ops.yourdomain.com" + audience: "stellaops-api" + signingKeySecret: "stellaops-jwt-secret" # reference a Kubernetes Secret + oidc: + authority: "https://authority.yourdomain.com" +``` + +Create the signing key secret: +```bash +kubectl create secret generic stellaops-jwt-secret \ + --from-literal=key="$(openssl rand -base64 48)" +``` + +## Verification +``` +stella doctor run --check check.core.auth.config +``` + +## Related Checks +- `check.security.jwt.config` — deep validation of JWT signing, algorithm, and expiration settings +- `check.security.secrets` — ensures secrets like JWT keys are not stored as 
plain text in config +- `check.security.password.policy` — validates password complexity requirements diff --git a/docs/doctor/articles/core/config-loaded.md b/docs/doctor/articles/core/config-loaded.md new file mode 100644 index 000000000..e7a2470ab --- /dev/null +++ b/docs/doctor/articles/core/config-loaded.md @@ -0,0 +1,85 @@ +--- +checkId: check.core.config.loaded +plugin: stellaops.doctor.core +severity: fail +tags: [quick, configuration, startup] +--- +# Configuration Loaded + +## What It Checks +Verifies that the application configuration system is properly loaded and accessible. The check calls `IConfiguration.GetChildren()` and counts the number of root configuration sections. It collects: + +- **SectionCount**: total number of top-level configuration sections found. +- **RootSections**: names of up to 10 root sections (e.g., `Logging`, `ConnectionStrings`, `Authentication`). +- **Environment**: the current hosting environment name. + +If zero sections are found, the check fails. If the configuration object throws an exception when accessed, the check also fails with the exception details. + +## Why It Matters +Configuration is the foundation of every Stella Ops service. Without a loaded configuration, connection strings, authentication settings, feature flags, and service endpoints are all missing. The service will fail to connect to databases, message brokers, and upstream services. This check catches the scenario where config files are missing from the container image, environment variables are not injected, or a configuration provider failed to initialize. 
+ +## Common Causes +- Configuration file (`appsettings.json`) is missing or empty +- Configuration provider not registered in `Program.cs` +- Environment variables not set in the deployment +- Config file not included in the Docker image build +- Volume mount overwriting the config directory with an empty directory + +## How to Fix + +### Docker Compose +Verify the configuration file exists inside the container: + +```bash +docker compose exec <service-name> ls -la /app/appsettings.json +``` + +If missing, check your `Dockerfile` to ensure the file is copied. Alternatively, mount it as a volume: + +```yaml +volumes: + - ./config/appsettings.json:/app/appsettings.json:ro +``` + +Check that environment variables are being injected: + +```bash +docker compose exec <service-name> printenv | grep -i stella +``` + +### Bare Metal / systemd +Verify the config file exists in the application directory: + +```bash +ls -la /opt/stellaops/appsettings.json +cat /opt/stellaops/appsettings.json | head -5 +``` + +Check environment variables: + +```bash +printenv | grep -i stella +``` + +### Kubernetes / Helm +Check that the ConfigMap is mounted: + +```bash +kubectl exec -it <pod-name> -- cat /app/appsettings.json +kubectl exec -it <pod-name> -- printenv | grep -i STELLA +``` + +Verify the ConfigMap exists: + +```bash +kubectl get configmap stellaops-config -o yaml +``` + +## Verification +``` +stella doctor run --check check.core.config.loaded +``` + +## Related Checks +- `check.core.config.required` — verifies specific required settings are present +- `check.core.env.variables` — verifies environment variables are set diff --git a/docs/doctor/articles/core/config-required.md b/docs/doctor/articles/core/config-required.md new file mode 100644 index 000000000..9abaa4aad --- /dev/null +++ b/docs/doctor/articles/core/config-required.md @@ -0,0 +1,104 @@ +--- +checkId: check.core.config.required +plugin: stellaops.doctor.core +severity: fail +tags: [quick, configuration, startup] +--- +# Required Settings + +## What It Checks 
+Verifies that required configuration settings are present and have non-empty values. The check supports multiple key variants to accommodate both `appsettings.json` (colon-separated) and environment variable (double-underscore-separated) configuration styles. + +**Required settings** (at least one variant must be present): + +| Canonical Name | Accepted Variants | +|---|---| +| `ConnectionStrings:DefaultConnection` | `ConnectionStrings:DefaultConnection`, `ConnectionStrings:Default`, `CONNECTIONSTRINGS__DEFAULTCONNECTION`, `CONNECTIONSTRINGS__DEFAULT` | + +**Recommended settings** (warn if missing, not fail): + +| Setting | Purpose | +|---|---| +| `Logging:LogLevel:Default` | Default log level | + +The check also reads `PluginConfig:RequiredSettings` for additional plugin-specific required settings configured at runtime. For each required setting, it checks both the `IConfiguration` value and the direct environment variable (converting `:` to `__`). + +## Why It Matters +The database connection string is the most critical setting for any Stella Ops service. Without it, the service cannot connect to PostgreSQL, auto-migration cannot run, and every database-dependent operation will fail with a 500 error. This check catches the most common deployment mistake: forgetting to set the connection string. 
+ +## Common Causes +- Database connection string not configured in environment variables or appsettings +- Environment variables not set (check Docker compose `.env` or service environment section) +- Typo in the environment variable name (e.g., `CONNECTIONSTRING` instead of `CONNECTIONSTRINGS`) +- Config file present but missing the `ConnectionStrings` section + +## How to Fix + +### Docker Compose +Add the connection string to your `.env` file or directly in `docker-compose.yml`: + +```bash +# In .env file +CONNECTIONSTRINGS__DEFAULTCONNECTION=Host=127.1.1.1;Port=5432;Database=stellaops_platform;Username=stellaops;Password=stellaops +``` + +Or in the service environment section: + +```yaml +services: + platform: + environment: + ConnectionStrings__DefaultConnection: "Host=postgres;Port=5432;Database=stellaops_platform;Username=stellaops;Password=stellaops" +``` + +### Bare Metal / systemd +Add to `appsettings.json`: + +```json +{ + "ConnectionStrings": { + "DefaultConnection": "Host=localhost;Port=5432;Database=stellaops_platform;Username=stellaops;Password=stellaops" + }, + "Logging": { + "LogLevel": { + "Default": "Information" + } + } +} +``` + +Or set as an environment variable in the systemd unit: + +```ini +[Service] +Environment=CONNECTIONSTRINGS__DEFAULTCONNECTION=Host=localhost;Port=5432;Database=stellaops_platform;Username=stellaops;Password=stellaops +``` + +### Kubernetes / Helm +Set connection string via a Kubernetes Secret: + +```bash +kubectl create secret generic stellaops-db \ + --from-literal=connection-string="Host=postgres;Port=5432;Database=stellaops_platform;Username=stellaops;Password=stellaops" +``` + +Reference in Helm values: + +```yaml +env: + - name: ConnectionStrings__DefaultConnection + valueFrom: + secretKeyRef: + name: stellaops-db + key: connection-string +``` + +## Verification +``` +stella doctor run --check check.core.config.required +``` + +## Related Checks +- `check.core.config.loaded` — verifies the configuration system 
itself is loaded +- `check.core.env.variables` — verifies environment variables are set +- `check.core.services.health` — database health checks will fail if the connection string is missing diff --git a/docs/doctor/articles/core/crypto-available.md b/docs/doctor/articles/core/crypto-available.md new file mode 100644 index 000000000..5a650d710 --- /dev/null +++ b/docs/doctor/articles/core/crypto-available.md @@ -0,0 +1,88 @@ +--- +checkId: check.core.crypto.available +plugin: stellaops.doctor.core +severity: fail +tags: [quick, security, crypto] +--- +# Cryptography Providers + +## What It Checks +Verifies that required cryptographic algorithms are available on the host system. The check tests six algorithms by actually executing them: + +| Algorithm | Test | +|-----------|------| +| **SHA-256** | Hashes a 4-byte test payload | +| **SHA-384** | Hashes a 4-byte test payload | +| **SHA-512** | Hashes a 4-byte test payload | +| **RSA** | Creates an RSA key pair and reads the key size | +| **ECDSA** | Creates an ECDSA key pair and reads the key size | +| **AES** | Creates an AES cipher and reads the key size | + +The check also detects whether FIPS mode is enforced on the system via `CryptoConfig.AllowOnlyFipsAlgorithms` and reports the OS platform. + +If any algorithm fails to execute, the check reports `fail` with the list of unavailable algorithms. + +## Why It Matters +Stella Ops relies on these cryptographic primitives for: +- **SHA-256/384/512**: SBOM digests, evidence hashing, content-addressable storage, DSSE payloads. +- **RSA/ECDSA**: JWT signing, TLS certificates, code signing, attestation signatures. +- **AES**: Data-at-rest encryption, data protection keys. + +If any algorithm is unavailable, core features like evidence signing, token validation, and encrypted storage will fail at runtime. 
+ +## Common Causes +- Operating system does not support required algorithms (minimal or stripped-down containers) +- FIPS mode restrictions preventing non-FIPS algorithms +- Missing cryptographic libraries (e.g., OpenSSL not installed in Alpine images) +- Running on a platform with limited crypto support + +## How to Fix + +### Docker Compose +If using Alpine-based images, ensure OpenSSL is installed: + +```dockerfile +RUN apk add --no-cache openssl +``` + +Or switch to a Debian/Ubuntu-based image that includes full crypto support: + +```dockerfile +FROM mcr.microsoft.com/dotnet/aspnet:8.0 +``` + +### Bare Metal / systemd +Install required crypto libraries: + +```bash +# Debian/Ubuntu +sudo apt-get install -y openssl libssl-dev + +# RHEL/CentOS +sudo yum install -y openssl openssl-devel +``` + +If FIPS mode is required, ensure all algorithms used are FIPS-compliant: + +```bash +# Check FIPS status +cat /proc/sys/crypto/fips_enabled +``` + +### Kubernetes / Helm +Use a base image with full cryptographic support. In your Helm values: + +```yaml +image: + repository: stellaops/platform + tag: latest # Uses Debian-based runtime with full crypto +``` + +## Verification +``` +stella doctor run --check check.core.crypto.available +``` + +## Related Checks +- `check.security.encryption` — validates encryption key configuration and algorithms +- `check.security.tls.certificate` — validates TLS certificate availability and validity diff --git a/docs/doctor/articles/core/env-diskspace.md b/docs/doctor/articles/core/env-diskspace.md new file mode 100644 index 000000000..34ed4b4b6 --- /dev/null +++ b/docs/doctor/articles/core/env-diskspace.md @@ -0,0 +1,101 @@ +--- +checkId: check.core.env.diskspace +plugin: stellaops.doctor.core +severity: fail +tags: [quick, environment, resources] +--- +# Disk Space + +## What It Checks +Verifies sufficient disk space is available on the drive where the application is running. 
The check reads the drive information for the current working directory and applies two thresholds: + +| Threshold | Value | Result | +|---|---|---| +| Critical (fail) | Less than **1 GB** free | `fail` | +| Warning | Less than **5 GB** free | `warn` | +| Healthy | 5 GB or more free | `pass` | + +Evidence collected includes: drive name, free space, total space, and used percentage. + +## Why It Matters +Stella Ops services write logs, evidence files, SBOM data, scan results, and temporary processing artifacts to disk. When disk space is critically low: + +- Database writes fail (PostgreSQL requires WAL space). +- Container images cannot be pulled or built. +- Log files cannot be written, causing silent data loss. +- Evidence locker writes fail, breaking the audit trail. +- Temporary scan artifacts fill up, causing scanner crashes. + +## Common Causes +- Log files consuming disk space without rotation +- Temporary files not cleaned up after processing +- Application data growth (evidence locker, SBOM storage, scan results) +- Docker images and volumes consuming space on the same partition +- Database WAL files growing due to long-running transactions + +## How to Fix + +### Docker Compose +Check disk usage and clean up: + +```bash +# Check overall disk usage +df -h + +# Find large files +du -sh /var/lib/docker/* | sort -hr | head -20 + +# Clean Docker artifacts +docker system prune -a --volumes + +# Clean application temp files (sh -c so the glob expands inside the container, not on the host) +docker compose exec <service-name> sh -c 'rm -rf /tmp/*' + +# Set up log rotation in compose +# Add to your service definition: +logging: + driver: json-file + options: + max-size: "10m" + max-file: "3" +``` + +### Bare Metal / systemd +```bash +# Find large files +du -sh /* | sort -hr | head -20 + +# Clean temp files +rm -rf /tmp/* + +# Rotate logs +sudo logrotate -f /etc/logrotate.conf + +# Check and clean old journal logs +sudo journalctl --vacuum-size=100M +``` + +### Kubernetes / Helm +```bash +# Check node disk usage +kubectl top nodes + +# Check PVC usage 
+kubectl exec -it <pod-name> -- df -h + +# Set ephemeral storage limits in Helm values: +resources: + limits: + ephemeral-storage: "2Gi" + requests: + ephemeral-storage: "1Gi" +``` + +## Verification +``` +stella doctor run --check check.core.env.diskspace +``` + +## Related Checks +- `check.docker.storage` — checks Docker-specific storage driver and disk usage +- `check.core.env.memory` — checks process memory usage diff --git a/docs/doctor/articles/core/env-memory.md b/docs/doctor/articles/core/env-memory.md new file mode 100644 index 000000000..bc0542bb1 --- /dev/null +++ b/docs/doctor/articles/core/env-memory.md @@ -0,0 +1,113 @@ +--- +checkId: check.core.env.memory +plugin: stellaops.doctor.core +severity: warn +tags: [quick, environment, resources] +--- +# Memory Usage + +## What It Checks +Verifies that the application process memory usage is within acceptable limits. The check reads the current process metrics and applies two thresholds: + +| Threshold | Value (Working Set) | Result | +|---|---|---| +| Critical (fail) | Greater than **2 GB** | `fail` | +| Warning | Greater than **1 GB** | `warn` | +| Healthy | 1 GB or less | `pass` | + +Evidence collected includes: +- **WorkingSet**: physical memory currently allocated to the process. +- **PrivateBytes**: total private memory allocated. +- **GCHeapSize**: managed heap size reported by the GC. +- **GCMemory**: total managed memory from `GC.GetTotalMemory()`. +- **Gen0/Gen1/Gen2 Collections**: garbage collection counts for each generation. + +## Why It Matters +Excessive memory usage leads to out-of-memory kills (OOM), especially in containerized deployments where memory limits are enforced. When a Stella Ops service is OOM-killed: + +- In-flight requests are dropped. +- Evidence writes may be incomplete, compromising the audit trail. +- Scan results in progress are lost. +- The container restarts, causing a brief outage and potential data corruption. 
+ +High Gen2 GC counts can also indicate a memory leak, where objects are promoted to the long-lived generation faster than they can be collected. + +## Common Causes +- Memory leak in application code (undisposed resources, growing caches) +- Large data sets loaded entirely into memory (SBOM graphs, scan results) +- Insufficient memory limits configured for the container +- Normal operation with high load (many concurrent scans or requests) +- Memory-intensive operations in progress (large SBOM diff, graph analysis) + +## How to Fix + +### Docker Compose +Set memory limits for the service in `docker-compose.yml`: + +```yaml +services: + platform: + deploy: + resources: + limits: + memory: 2G + reservations: + memory: 512M +``` + +Analyze memory usage: + +```bash +# Check container memory stats +docker stats --no-stream + +# Capture a memory dump for analysis +docker compose exec <service-name> dotnet-dump collect -p 1 +``` + +### Bare Metal / systemd +Set memory limits in the systemd unit: + +```ini +[Service] +MemoryMax=2G +MemoryHigh=1536M +``` + +Analyze memory: + +```bash +# Install diagnostics tools +dotnet tool install -g dotnet-gcdump + +# Capture GC dump +dotnet-gcdump collect -p <pid> + +# Analyze with dotnet-dump +dotnet-dump analyze <dump-file> +``` + +### Kubernetes / Helm +Set resource limits in Helm values: + +```yaml +resources: + limits: + memory: "2Gi" + requests: + memory: "512Mi" +``` + +Monitor memory: +```bash +kubectl top pods -l app=stellaops-platform +``` + +## Verification +``` +stella doctor run --check check.core.env.memory +``` + +## Related Checks +- `check.core.env.diskspace` — checks available disk space +- `check.core.services.health` — overall service health which can degrade under memory pressure diff --git a/docs/doctor/articles/core/env-variables.md b/docs/doctor/articles/core/env-variables.md new file mode 100644 index 000000000..10eb0cc75 --- /dev/null +++ b/docs/doctor/articles/core/env-variables.md @@ -0,0 +1,88 @@ +--- +checkId: check.core.env.variables 
+plugin: stellaops.doctor.core +severity: warn +tags: [quick, environment, configuration] +--- +# Environment Variables + +## What It Checks +Verifies that expected environment variables are configured for the runtime environment. The check looks for two recommended variables: + +| Variable | Purpose | +|---|---| +| `ASPNETCORE_ENVIRONMENT` | Sets the ASP.NET Core hosting environment (Development, Staging, Production) | +| `DOTNET_ENVIRONMENT` | Sets the .NET hosting environment (fallback for non-ASP.NET hosts) | + +The check also counts all platform-related environment variables matching these prefixes: `STELLA*`, `ASPNETCORE*`, `DOTNET*`, `CONNECTIONSTRINGS*`. + +**Result logic:** +- If neither recommended variable is set but other platform variables exist (e.g., `STELLAOPS_*`, `CONNECTIONSTRINGS__*`), the check **passes** with a note that the environment defaults are being used. +- If no platform variables at all are found, the check **warns** that the service may not be running in a configured deployment. +- If at least one recommended variable is set, the check **passes** and reports the current environment name and total platform variable count. + +## Why It Matters +The hosting environment controls which configuration files are loaded (`appsettings.Development.json` vs. `appsettings.Production.json`), whether developer exception pages are shown, and how logging is configured. Running in the wrong environment can expose detailed error information in production or apply development-only settings that degrade performance. + +## Common Causes +- No StellaOps, ASP.NET, or .NET environment variables found in the process +- The service is not running in a configured deployment (e.g., running directly without Docker or systemd) +- Docker compose `.env` file missing or not loaded +- Environment variables defined in the wrong scope (host-level vs. 
container-level) + +## How to Fix + +### Docker Compose +Add the environment variable to your service in `docker-compose.yml`: + +```yaml +services: + platform: + environment: + ASPNETCORE_ENVIRONMENT: Production +``` + +Or in the `.env` file: + +```bash +ASPNETCORE_ENVIRONMENT=Production +``` + +### Bare Metal / systemd +Set the variable in the systemd unit file: + +```ini +[Service] +Environment=ASPNETCORE_ENVIRONMENT=Production +``` + +Or export it in the shell: + +```bash +export ASPNETCORE_ENVIRONMENT=Production +sudo systemctl restart stellaops-platform +``` + +### Kubernetes / Helm +Set in Helm values: + +```yaml +env: + - name: ASPNETCORE_ENVIRONMENT + value: "Production" +``` + +Or in the pod spec directly: + +```bash +kubectl set env deployment/stellaops-platform ASPNETCORE_ENVIRONMENT=Production +``` + +## Verification +``` +stella doctor run --check check.core.env.variables +``` + +## Related Checks +- `check.core.config.loaded` — verifies the configuration system is loaded (environment affects which config files load) +- `check.core.config.required` — verifies specific required settings are present diff --git a/docs/doctor/articles/core/services-dependencies.md b/docs/doctor/articles/core/services-dependencies.md new file mode 100644 index 000000000..08496f354 --- /dev/null +++ b/docs/doctor/articles/core/services-dependencies.md @@ -0,0 +1,70 @@ +--- +checkId: check.core.services.dependencies +plugin: stellaops.doctor.core +severity: fail +tags: [quick, services, di] +--- +# Required Services + +## What It Checks +Verifies that required infrastructure services are registered in the .NET dependency injection (DI) container. The check resolves the following service types from the `IServiceProvider`: + +| Service Type | Purpose | +|---|---| +| `TimeProvider` | Abstracts system clock for testability and time-based logic | +| `ILoggerFactory` | Provides structured logging across all components | + +For each service type, the check attempts `GetService()`. 
If the service resolves to `null` or throws, it is recorded as missing. + +The check reports the count of registered vs. missing services and lists the missing ones by name. + +## Why It Matters +These services are foundational dependencies used by nearly every Stella Ops component. If `TimeProvider` is missing, time-based features (token expiration, certificate validity, scheduling) will not work. If `ILoggerFactory` is missing, no structured logging is produced, making troubleshooting impossible. A missing DI registration usually indicates a misconfigured `Program.cs` or a missing `AddStellaOps*()` call during startup. + +## Common Causes +- Services not registered in the DI container during application startup +- Missing `builder.Services.AddXxx()` call in `Program.cs` or `Startup.cs` +- Incorrect service registration order causing dependency resolution failures +- Custom host builder that skips default service registrations + +## How to Fix + +### Docker Compose +This is a code-level issue, not a deployment configuration problem. Ensure the service's `Program.cs` includes the standard Stella Ops service registration: + +```csharp +builder.Services.AddSingleton(TimeProvider.System); +builder.Services.AddLogging(); +``` + +Rebuild the container after code changes: +```bash +docker compose build --no-cache +docker compose up -d +``` + +### Bare Metal / systemd +Verify the application is using the standard Stella Ops host builder. Check `Program.cs` for the required registrations. + +Restart after any code changes: +```bash +sudo systemctl restart stellaops-<service-name> +``` + +### Kubernetes / Helm +This issue requires a code fix and new container image. After fixing the registration, build and push a new image: + +```bash +docker build -t stellaops/<service-name>:latest . 
+docker push stellaops/<service-name>:latest +kubectl rollout restart deployment/<deployment-name> +``` + +## Verification +``` +stella doctor run --check check.core.services.dependencies +``` + +## Related Checks +- `check.core.services.health` — aggregates health check results from registered health check services +- `check.core.config.loaded` — verifies the configuration system is loaded (a prerequisite for service registration) diff --git a/docs/doctor/articles/core/services-health.md b/docs/doctor/articles/core/services-health.md new file mode 100644 index 000000000..042462d7d --- /dev/null +++ b/docs/doctor/articles/core/services-health.md @@ -0,0 +1,110 @@ +--- +checkId: check.core.services.health +plugin: stellaops.doctor.core +severity: fail +tags: [health, services] +--- +# Service Health + +## What It Checks +Aggregates health status from all registered ASP.NET Core `IHealthCheck` services. The check resolves `HealthCheckService` from the DI container and calls `CheckHealthAsync()`. It then categorizes each registered health check as Healthy, Degraded, or Unhealthy. + +| Overall Status | Result | +|---|---| +| **Unhealthy** (any check unhealthy) | `fail` — lists the failing checks by name with error details for up to 5 | +| **Degraded** (any check degraded, none unhealthy) | `warn` | +| **Healthy** (all checks healthy) | `pass` — reports total count and duration | + +If `HealthCheckService` is not registered in the DI container, the check is skipped. + +Evidence collected includes: overall status, total checks count, healthy/degraded/unhealthy counts, failed check names, and execution duration. + +## Why It Matters +Health checks are the primary mechanism for detecting infrastructure problems: database connectivity, message broker availability, external API reachability, and internal service dependencies. An unhealthy result means at least one critical dependency is down, and the service cannot function correctly. 
Load balancers and orchestrators use health check endpoints to route traffic away from unhealthy instances. + +## Common Causes +- Dependent service unavailable (database, Valkey, external API) +- Database connection failed or timed out +- External API unreachable (network partition, DNS failure) +- Health check timeout exceeded (default check estimated duration is 5 seconds) +- Configuration error preventing a dependency from connecting + +## How to Fix + +### Docker Compose +Check the health endpoint directly: + +```bash +# Hit the health endpoint +curl -s http://localhost:5000/health | jq + +# Check dependent service connectivity +docker compose exec <service-name> curl -s http://postgres:5432 +docker compose exec <service-name> curl -s http://valkey:6379 + +# Restart unhealthy services +docker compose restart <service-name> +``` + +Ensure dependent services are healthy before starting: + +```yaml +services: + platform: + depends_on: + postgres: + condition: service_healthy + valkey: + condition: service_healthy +``` + +### Bare Metal / systemd +```bash +# Check the health endpoint +curl -s http://localhost:5000/health | jq + +# Check database connectivity +pg_isready -h localhost -p 5432 + +# Check service logs for errors +journalctl -u stellaops-platform --since "5 minutes ago" | grep -i error +``` + +### Kubernetes / Helm +```bash +# Check pod health +kubectl describe pod <pod-name> | grep -A 5 "Conditions" + +# Check health endpoint inside the pod +kubectl exec -it <pod-name> -- curl -s http://localhost:5000/health | jq + +# Check events for restart loops +kubectl get events --field-selector involvedObject.name=<pod-name> --sort-by='.lastTimestamp' +``` + +Configure health check probes in Helm values: + +```yaml +livenessProbe: + httpGet: + path: /health + port: 5000 + initialDelaySeconds: 30 + periodSeconds: 10 +readinessProbe: + httpGet: + path: /health + port: 5000 + initialDelaySeconds: 10 + periodSeconds: 5 +``` + +## Verification +``` +stella doctor run --check check.core.services.health +``` + +## Related Checks +- 
`check.core.services.dependencies` — verifies required DI services are registered (prerequisite for health checks) +- `check.core.config.required` — verifies required settings like connection strings are present +- `check.docker.daemon` — verifies Docker daemon is running (relevant when health checks include Docker connectivity) diff --git a/docs/doctor/articles/crypto/certchain.md b/docs/doctor/articles/crypto/certchain.md new file mode 100644 index 000000000..241121bdc --- /dev/null +++ b/docs/doctor/articles/crypto/certchain.md @@ -0,0 +1,127 @@ +--- +checkId: check.crypto.certchain +plugin: stellaops.doctor.crypto +severity: warn +tags: [crypto, certificate, tls, security] +--- +# Certificate Chain Validation + +## What It Checks +Verifies certificate chain completeness, trust anchor validity, and expiration for the configured TLS certificate. The check reads the certificate path from `Crypto:TlsCertPath`, `Kestrel:Certificates:Default:Path`, or `Server:TlsCertificate` and validates: + +- **File existence**: whether the configured certificate file exists on disk. +- **Chain completeness**: whether all intermediate certificates are present (no missing links). +- **Trust anchor validity**: whether the root CA is trusted by the system trust store. +- **Expiration**: days until the certificate expires, with tiered severity. + +| Condition | Result | +|---|---| +| No TLS certificate configured | Skip | +| Certificate file not found | Fail | +| Certificate chain incomplete (missing intermediates) | Fail | +| Trust anchor not valid (unknown root CA) | Fail | +| Certificate already expired | Fail | +| Certificate expires within 7 days | Fail | +| Certificate expires within 30 days | Warn | +| Chain complete, trust anchor valid, not expiring soon | Pass | + +Evidence collected: `CertPath`, `ChainLength`, `MissingIntermediates`, `TrustAnchorValid`, `TrustAnchorIssuer`, `ExpirationDate`, `DaysRemaining`. 
+ +This check always runs (no precondition), but skips if no TLS certificate path is configured. + +## Why It Matters +An incomplete certificate chain causes TLS handshake failures for clients that do not have intermediate certificates cached. An untrusted root CA triggers browser and API client warnings or outright connection refusal. An expired certificate causes immediate service outage for all HTTPS connections. Certificate issues affect every component that communicates over TLS, including the UI, API, inter-service communication, and external integrations. + +## Common Causes +- Certificate file was moved or deleted from the configured path +- Incorrect certificate path in configuration +- Missing intermediate certificates in the certificate bundle +- Incomplete certificate bundle (only leaf certificate, no intermediates) +- Root CA not added to the system trust store +- Self-signed certificate not explicitly trusted +- Certificate not renewed before expiration +- Automated renewal process failed silently + +## How to Fix + +### Docker Compose +```bash +# Check if certificate file exists at configured path +docker compose exec gateway ls -la /certs/ + +# Verify certificate details +docker compose exec gateway openssl x509 -in /certs/server.crt -noout -dates -subject -issuer + +# Verify certificate chain +docker compose exec gateway openssl verify -untrusted /certs/chain.pem /certs/server.crt + +# Bundle certificates correctly (leaf + intermediates) +cat server.crt intermediate.crt > fullchain.pem + +# Update configuration in .env or compose override +# Crypto__TlsCertPath=/certs/fullchain.pem + +# Set up automated renewal notification +# Notify__CertExpiry__ThresholdDays=14 +``` + +### Bare Metal / systemd +```bash +# Verify certificate file exists +ls -la /etc/stellaops/certs/server.crt + +# Check certificate expiration +openssl x509 -in /etc/stellaops/certs/server.crt -noout -enddate + +# Download missing intermediates +stella crypto cert fetch-chain --cert 
/etc/stellaops/certs/server.crt --output /etc/stellaops/certs/fullchain.pem + +# Add CA to system trust store (Debian/Ubuntu) +sudo cp root-ca.crt /usr/local/share/ca-certificates/ +sudo update-ca-certificates + +# Or configure explicit trust anchor +stella crypto trust-anchors add --type ca --cert root-ca.crt + +# Renew certificate +stella crypto cert renew --cert /etc/stellaops/certs/server.crt + +# Update appsettings.json +# "Crypto": { "TlsCertPath": "/etc/stellaops/certs/fullchain.pem" } + +sudo systemctl restart stellaops-gateway +``` + +### Kubernetes / Helm +```bash +# Check certificate secret +kubectl get secret stellaops-tls-cert -o jsonpath='{.data.tls\.crt}' | base64 -d | openssl x509 -noout -dates + +# Verify certificate chain +kubectl get secret stellaops-tls-cert -o jsonpath='{.data.tls\.crt}' | base64 -d | openssl verify + +# Update TLS certificate secret +kubectl create secret tls stellaops-tls-cert \ + --cert=fullchain.pem \ + --key=server.key \ + --dry-run=client -o yaml | kubectl apply -f - +``` + +```yaml +# values.yaml - use cert-manager for automated renewal +certManager: + enabled: true + issuer: letsencrypt-prod + renewBefore: 360h # 15 days before expiry +``` + +## Verification +``` +stella doctor run --check check.crypto.certchain +``` + +## Related Checks +- `check.crypto.fips` — FIPS compliance may impose certificate algorithm constraints +- `check.crypto.eidas` — eIDAS compliance requires specific signature algorithms on certificates +- `check.crypto.hsm` — HSM may store the private key associated with the certificate +- `check.compliance.attestation-signing` — attestation signing uses related key material diff --git a/docs/doctor/articles/crypto/eidas.md b/docs/doctor/articles/crypto/eidas.md new file mode 100644 index 000000000..f47a3eefb --- /dev/null +++ b/docs/doctor/articles/crypto/eidas.md @@ -0,0 +1,101 @@ +--- +checkId: check.crypto.eidas +plugin: stellaops.doctor.crypto +severity: fail +tags: [crypto, eidas, eu, compliance, 
signature] +--- +# eIDAS Compliance + +## What It Checks +Verifies that eIDAS-compliant signature algorithms are available for EU deployments. The check references ETSI TS 119 312 (Cryptographic Suites) and validates availability of the following required algorithms: + +- **RSA-PSS-SHA256** (RSA-PSS with SHA-256) +- **RSA-PSS-SHA384** (RSA-PSS with SHA-384) +- **RSA-PSS-SHA512** (RSA-PSS with SHA-512) +- **ECDSA-P256-SHA256** (ECDSA with P-256 and SHA-256) +- **ECDSA-P384-SHA384** (ECDSA with P-384 and SHA-384) +- **Ed25519** (EdDSA with Curve25519) + +The check also validates the minimum RSA key size. Per eIDAS guidelines post-2024, RSA keys must be at least 3072 bits. The configured minimum is read from `Crypto:MinRsaKeySize` (default 2048). + +| Condition | Result | +|---|---| +| Any required algorithms missing | Fail | +| All algorithms available but RSA key size < 3072 | Warn | +| All algorithms available and key size >= 3072 | Pass | + +Evidence collected: `CryptoProfile`, `AvailableAlgorithms`, `MissingAlgorithms`, `MinRsaKeySize`, `RequiredMinRsaKeySize`. + +The check only runs when `Crypto:Profile` or `Cryptography:Profile` contains "eidas", "eu", or "european". + +## Why It Matters +eIDAS (Electronic Identification, Authentication and Trust Services) is an EU regulation that establishes standards for electronic signatures and trust services. Deployments in the EU that create qualified electronic signatures or seals must use algorithms approved by ETSI. Using non-compliant algorithms means signatures may not be legally recognized, and the deployment may fail regulatory requirements. RSA keys below 3072 bits are considered insufficient for long-term security under current eIDAS guidelines. 
+ +## Common Causes +- OpenSSL version too old to support all required algorithms +- Crypto libraries compiled without required algorithm support +- Configuration restricting the set of available algorithms +- Legacy RSA key size configuration not updated for post-2024 requirements +- Using LibreSSL instead of OpenSSL (missing some algorithms) + +## How to Fix + +### Docker Compose +```bash +# Check OpenSSL version and available algorithms +docker compose exec gateway openssl version +docker compose exec gateway openssl list -signature-algorithms + +# Update minimum RSA key size +# Crypto__MinRsaKeySize=3072 +# Crypto__Profile=eu + +# Restart services after configuration change +docker compose restart gateway +``` + +### Bare Metal / systemd +```bash +# Check OpenSSL version +openssl version + +# Verify available signature algorithms +openssl list -signature-algorithms + +# Update OpenSSL if algorithms are missing +sudo apt update && sudo apt install openssl libssl-dev + +# Configure eIDAS crypto profile +stella crypto profile set --profile eu + +# Set minimum RSA key size in appsettings.json +# "Crypto": { "Profile": "eu", "MinRsaKeySize": 3072 } + +sudo systemctl restart stellaops-platform +``` + +### Kubernetes / Helm +```yaml +# values.yaml +crypto: + profile: eu + minRsaKeySize: 3072 +``` + +```bash +# Verify algorithm support in pod +kubectl exec deploy/stellaops-gateway -- openssl list -signature-algorithms + +helm upgrade stellaops ./charts/stellaops -f values.yaml +``` + +## Verification +``` +stella doctor run --check check.crypto.eidas +``` + +## Related Checks +- `check.crypto.certchain` — certificate chain must use eIDAS-compliant algorithms +- `check.crypto.fips` — FIPS and eIDAS have overlapping but distinct algorithm requirements +- `check.crypto.hsm` — HSM may be required for qualified eIDAS signatures +- `check.compliance.attestation-signing` — attestation signing should use eIDAS-compliant algorithms in EU deployments diff --git 
a/docs/doctor/articles/crypto/fips.md b/docs/doctor/articles/crypto/fips.md new file mode 100644 index 000000000..bf33c6e83 --- /dev/null +++ b/docs/doctor/articles/crypto/fips.md @@ -0,0 +1,141 @@ +--- +checkId: check.crypto.fips +plugin: stellaops.doctor.crypto +severity: fail +tags: [crypto, fips, compliance, security] +--- +# FIPS 140-2 Compliance + +## What It Checks +Verifies that FIPS 140-2 mode is enabled and that FIPS-compliant algorithms are functional. The check performs two phases: + +**Phase 1 - FIPS mode detection:** +- On Linux: reads `/proc/sys/crypto/fips_enabled` (expects "1"). +- On Windows: checks the registry at `HKLM\System\CurrentControlSet\Control\Lsa\FipsAlgorithmPolicy\Enabled` and the `DOTNET_SYSTEM_NET_SECURITY_USEFIPSVALIDATED` environment variable. +- Reports the platform, crypto provider (OpenSSL/bcrypt/CoreCrypto), and whether the OpenSSL FIPS module is loaded. + +**Phase 2 - Algorithm verification** (actual crypto operations, not just configuration): +- **AES-256**: creates key, encrypts test data, verifies output. +- **SHA-256**: hashes test data, verifies 32-byte output. +- **SHA-384**: hashes test data, verifies 48-byte output. +- **SHA-512**: hashes test data, verifies 64-byte output. +- **RSA-2048**: generates key pair, signs and verifies test data. +- **ECDSA-P256**: generates key pair, signs and verifies test data. + +| Condition | Result | +|---|---| +| FIPS mode not enabled at OS level | Fail | +| FIPS mode enabled but some algorithms fail testing | Warn | +| FIPS mode enabled and all algorithms pass | Pass | + +Evidence collected: `fips_mode_enabled`, `platform`, `crypto_provider`, `openssl_fips_module_loaded`, `crypto_profile`, `algorithms_tested`, `algorithms_available`, `algorithms_missing`, per-algorithm test results. + +The check only runs when `Crypto:Profile` or `Cryptography:Profile` contains "fips", "fedramp", or equals "us-gov". 
+ +## Why It Matters +FIPS 140-2 compliance is mandatory for US government deployments (FedRAMP, DoD, ITAR) and many regulated industries (finance, healthcare). Running without FIPS mode means cryptographic operations may use non-validated implementations, which violates federal security requirements. Even with FIPS mode enabled, individual algorithm failures indicate a broken crypto subsystem that could silently produce invalid signatures or weak encryption. + +## Common Causes +- FIPS mode not enabled in the operating system +- OpenSSL FIPS provider not loaded or not installed +- .NET runtime not configured for FIPS-validated algorithms +- FIPS module version incompatible with the OpenSSL version +- Algorithm test failure due to incomplete FIPS provider installation + +## How to Fix + +### Docker Compose +```bash +# Check if FIPS mode is enabled in the container +docker compose exec gateway cat /proc/sys/crypto/fips_enabled + +# Enable FIPS mode in the host OS first (container inherits host FIPS) +# Then restart the compose stack + +# Set crypto profile +# Crypto__Profile=fips + +# Verify algorithms inside container +docker compose exec gateway openssl list -providers +docker compose exec gateway openssl list -digest-algorithms +``` + +### Bare Metal / systemd + +**Linux (RHEL/CentOS/Fedora):** +```bash +# Enable FIPS mode +sudo fips-mode-setup --enable + +# Verify FIPS status +fips-mode-setup --check + +# Reboot required after enabling +sudo reboot + +# After reboot, verify +cat /proc/sys/crypto/fips_enabled # Should output "1" + +# Restart StellaOps services +sudo systemctl restart stellaops +``` + +**Linux (Ubuntu/Debian):** +```bash +# Install FIPS packages +sudo apt install ubuntu-fips +sudo ua enable fips + +# Reboot required +sudo reboot +``` + +**Windows:** +``` +Enable via Local Security Policy: + Security Settings > Local Policies > Security Options > + "System cryptography: Use FIPS compliant algorithms" = Enabled + +Or via registry (requires reboot): 
+ reg add HKLM\System\CurrentControlSet\Control\Lsa\FipsAlgorithmPolicy /v Enabled /t REG_DWORD /d 1 /f +``` + +```bash +# Configure StellaOps +# "Crypto": { "Profile": "fips" } + +sudo systemctl restart stellaops-platform +``` + +### Kubernetes / Helm +```yaml +# values.yaml +crypto: + profile: fips + +# FIPS must be enabled at the node level +# For EKS: use Amazon Linux 2 FIPS AMI +# For AKS: use FIPS-enabled node pools +# For GKE: use Container-Optimized OS with FIPS +``` + +```bash +# Verify FIPS in pod +kubectl exec deploy/stellaops-gateway -- cat /proc/sys/crypto/fips_enabled + +# Check OpenSSL FIPS provider +kubectl exec deploy/stellaops-gateway -- openssl list -providers + +helm upgrade stellaops ./charts/stellaops -f values.yaml +``` + +## Verification +``` +stella doctor run --check check.crypto.fips +``` + +## Related Checks +- `check.crypto.certchain` — certificates must use FIPS-approved algorithms +- `check.crypto.eidas` — eIDAS has overlapping but distinct requirements from FIPS +- `check.crypto.hsm` — FIPS 140-2 Level 3+ may require HSM for key storage +- `check.compliance.attestation-signing` — signing must use FIPS-validated algorithms in FIPS deployments diff --git a/docs/doctor/articles/crypto/gost.md b/docs/doctor/articles/crypto/gost.md new file mode 100644 index 000000000..0affbef8d --- /dev/null +++ b/docs/doctor/articles/crypto/gost.md @@ -0,0 +1,120 @@ +--- +checkId: check.crypto.gost +plugin: stellaops.doctor.crypto +severity: fail +tags: [crypto, gost, russia, compliance] +--- +# GOST Algorithm Availability + +## What It Checks +Verifies that GOST cryptographic algorithms are available for Russian deployments. 
The check validates two layers: + +**Layer 1 - GOST engine detection:** +Checks whether the OpenSSL GOST engine is loaded by looking for the engine shared object at: +- A custom path configured via `Crypto:Gost:EnginePath` +- Common system paths: `/usr/lib/x86_64-linux-gnu/engines-{3,1.1}/gost.so`, `/usr/lib64/engines-{3,1.1}/gost.so` + +**Layer 2 - Algorithm availability** (only if engine is loaded): +Verifies the following GOST algorithms are accessible: +- **GOST R 34.10-2012-256** (digital signature, 256-bit) +- **GOST R 34.10-2012-512** (digital signature, 512-bit) +- **GOST R 34.11-2012-256** (Stribog hash, 256-bit) +- **GOST R 34.11-2012-512** (Stribog hash, 512-bit) +- **GOST R 34.12-2015** (Kuznyechik block cipher) +- **GOST 28147-89** (Magma legacy block cipher) + +| Condition | Result | +|---|---| +| GOST engine not loaded | Fail | +| Engine loaded but some algorithms missing | Warn | +| Engine loaded and all algorithms available | Pass | + +Evidence collected: `CryptoProfile`, `GostEngineLoaded`, `AvailableAlgorithms`, `MissingAlgorithms`, `RequiredAlgorithms`. + +The check only runs when `Crypto:Profile` or `Cryptography:Profile` contains "gost", "russia", or equals "ru". + +## Why It Matters +Russian regulatory requirements mandate the use of GOST cryptographic algorithms for government and many commercial deployments. Without GOST algorithm support, the platform cannot create compliant digital signatures or encrypt data according to Russian standards. This blocks deployment in regulated Russian environments and may violate data protection requirements. 
+ +## Common Causes +- OpenSSL GOST engine not installed on the system +- GOST engine not configured in `openssl.cnf` +- Missing `gost-engine` package +- GOST engine version too old (missing newer algorithms) +- GOST engine installed but algorithm disabled in configuration +- Incomplete GOST engine installation + +## How to Fix + +### Docker Compose +```bash +# Check if GOST engine is available +docker compose exec gateway openssl engine gost -c 2>/dev/null || echo "GOST engine not found" + +# Install GOST engine in the container (add to Dockerfile for persistence) +# For Debian/Ubuntu based images: +# RUN apt-get install -y libengine-gost-openssl1.1 + +# Set crypto profile +# Crypto__Profile=ru +# Crypto__Gost__EnginePath=/usr/lib/x86_64-linux-gnu/engines-3/gost.so + +docker compose restart gateway +``` + +### Bare Metal / systemd +```bash +# Install GOST engine (Debian/Ubuntu) +sudo apt install libengine-gost-openssl1.1 + +# Or install from source +git clone https://github.com/gost-engine/engine +cd engine && mkdir build && cd build +cmake .. 
&& make && sudo make install + +# Configure OpenSSL to load GOST engine +# Add to /etc/ssl/openssl.cnf: +# [gost_section] +# engine_id = gost +# default_algorithms = ALL + +# Verify engine is loaded +openssl engine gost -c + +# Configure StellaOps GOST profile +stella crypto profile set --profile ru + +# In appsettings.json: +# "Crypto": { "Profile": "ru" } + +sudo systemctl restart stellaops-platform +``` + +### Kubernetes / Helm +```yaml +# values.yaml +crypto: + profile: ru + gost: + enginePath: /usr/lib/x86_64-linux-gnu/engines-3/gost.so +``` + +```bash +# Verify in pod +kubectl exec deploy/stellaops-gateway -- openssl engine gost -c + +# Use a base image that includes GOST engine support +# Or mount the engine as a volume +helm upgrade stellaops ./charts/stellaops -f values.yaml +``` + +## Verification +``` +stella doctor run --check check.crypto.gost +``` + +## Related Checks +- `check.crypto.certchain` — certificates in GOST deployments should use GOST signature algorithms +- `check.crypto.fips` — FIPS and GOST are mutually exclusive regional crypto profiles +- `check.crypto.sm` — SM (Chinese) is another regional crypto profile with similar structure +- `check.crypto.hsm` — GOST keys may be stored in an HSM with GOST support diff --git a/docs/doctor/articles/crypto/hsm.md b/docs/doctor/articles/crypto/hsm.md new file mode 100644 index 000000000..3f705ac4e --- /dev/null +++ b/docs/doctor/articles/crypto/hsm.md @@ -0,0 +1,131 @@ +--- +checkId: check.crypto.hsm +plugin: stellaops.doctor.crypto +severity: warn +tags: [crypto, hsm, pkcs11, security] +--- +# HSM/PKCS#11 Availability + +## What It Checks +Verifies HSM (Hardware Security Module) availability via PKCS#11 interface. The check validates four layers: + +1. **Module configuration**: whether a PKCS#11 module path is configured via `Crypto:Hsm:ModulePath` or `Cryptography:Pkcs11:ModulePath`. +2. **Module file existence**: whether the configured `.so` (Linux) or `.dll` (Windows) file exists on disk. +3. 
**Slot access**: whether the PKCS#11 module can enumerate slots and access the configured slot. +4. **Token presence**: whether a token is initialized in the slot and accessible (login test). + +| Condition | Result | +|---|---| +| Module path not configured | Fail | +| Module file not found at configured path | Fail | +| Slot access failed (init error, no slots, permission denied) | Fail | +| Token not accessible (not initialized, login failure) | Warn | +| Module loaded, slot accessible, token present | Pass | + +Evidence collected: `ModulePath`, `ModuleExists`, `SlotId`, `SlotLabel`, `SlotAccess`, `TokenPresent`, `TokenLabel`. + +The check only runs when `Crypto:Hsm:Enabled` or `Cryptography:Pkcs11:Enabled` is set to "true". + +## Why It Matters +HSMs provide tamper-resistant hardware protection for cryptographic keys. When HSM is enabled, all signing operations (attestations, evidence seals, certificate signing) depend on the HSM being accessible. An unavailable HSM means no signing can occur, which blocks evidence generation, attestation creation, and release approvals. HSM connectivity issues can silently degrade to software-based signing if fallback is enabled, which may violate compliance requirements for FIPS 140-2 Level 3 or eIDAS qualified signatures. 
+ +## Common Causes +- PKCS#11 module path not configured in application settings +- Module file was moved or deleted from the configured path +- HSM software not installed (e.g., SoftHSM2 not installed for development) +- PKCS#11 module initialization failure (driver compatibility issues) +- No slots available in the HSM +- Permission denied accessing the PKCS#11 module or device +- Token not initialized in the configured slot +- Token login required but PIN not configured or incorrect + +## How to Fix + +### Docker Compose +```bash +# Verify HSM module is accessible +docker compose exec gateway ls -la /usr/lib/softhsm/libsofthsm2.so + +# Initialize a token if needed (SoftHSM2 for development) +docker compose exec gateway softhsm2-util --init-token --slot 0 --label "stellaops" --pin 1234 --so-pin 0000 + +# List available slots +docker compose exec gateway softhsm2-util --show-slots + +# Set environment variables +# Crypto__Hsm__Enabled=true +# Crypto__Hsm__ModulePath=/usr/lib/softhsm/libsofthsm2.so +# Crypto__Hsm__Pin=1234 + +docker compose restart gateway +``` + +### Bare Metal / systemd +```bash +# Install SoftHSM2 (for development/testing) +sudo apt install softhsm2 + +# Configure PKCS#11 module path +stella crypto config set --hsm-module /usr/lib/softhsm/libsofthsm2.so + +# Initialize token +softhsm2-util --init-token --slot 0 --label "stellaops" --pin 1234 --so-pin 0000 + +# List slots +softhsm2-util --show-slots + +# Verify module permissions +ls -la /usr/lib/softhsm/libsofthsm2.so + +# Configure token PIN +stella crypto config set --hsm-pin <your-pin> + +# For Windows with SoftHSM2: +# stella crypto config set --hsm-module C:\SoftHSM2\lib\softhsm2.dll + +# In appsettings.json: +# "Crypto": { +# "Hsm": { +# "Enabled": true, +# "ModulePath": "/usr/lib/softhsm/libsofthsm2.so" +# } +# } + +sudo systemctl restart stellaops-platform +``` + +### Kubernetes / Helm +```yaml +# values.yaml +crypto: + hsm: + enabled: true + modulePath: /usr/lib/softhsm/libsofthsm2.so + 
pinSecret: stellaops-hsm-pin + slotId: 0 +``` + +```bash +# Create HSM PIN secret +kubectl create secret generic stellaops-hsm-pin \ + --from-literal=pin=<your-pin> + +# For hardware HSMs, mount the device into the pod +# Add to pod spec: devices: ["/dev/pkcs11"] + +# Initialize token +kubectl exec deploy/stellaops-gateway -- softhsm2-util --init-token --slot 0 --label stellaops --pin 1234 --so-pin 0000 + +helm upgrade stellaops ./charts/stellaops -f values.yaml +``` + +## Verification +``` +stella doctor run --check check.crypto.hsm +``` + +## Related Checks +- `check.crypto.fips` — FIPS 140-2 Level 3+ requires key storage in a validated HSM +- `check.crypto.eidas` — qualified eIDAS signatures may require HSM-backed keys +- `check.crypto.certchain` — the TLS certificate private key may reside in the HSM +- `check.compliance.attestation-signing` — attestation signing keys may be HSM-protected diff --git a/docs/doctor/articles/crypto/sm.md b/docs/doctor/articles/crypto/sm.md new file mode 100644 index 000000000..772dc57f5 --- /dev/null +++ b/docs/doctor/articles/crypto/sm.md @@ -0,0 +1,112 @@ +--- +checkId: check.crypto.sm +plugin: stellaops.doctor.crypto +severity: fail +tags: [crypto, sm2, sm3, sm4, china, compliance] +--- +# SM2/SM3/SM4 Availability + +## What It Checks +Verifies that Chinese national cryptographic algorithms (GM/T standards) are available for CN deployments. The check validates: + +1. **OpenSSL version**: SM algorithms are natively supported in OpenSSL 1.1.1+. If the version is older, the check fails immediately. +2. **Algorithm availability**: tests each required algorithm: + - **SM2**: Elliptic curve cryptography (signature, key exchange) + - **SM3**: Cryptographic hash function (256-bit output) + - **SM4**: Block cipher (128-bit blocks, 128-bit key) +3. **SM2 curve parameters**: verifies the SM2 elliptic curve is properly initialized. 
+ +| Condition | Result | +|---|---| +| OpenSSL < 1.1.1 and algorithms missing | Fail | +| Any SM algorithms unavailable | Fail | +| All algorithms available but SM2 curve cannot be verified | Warn | +| All algorithms available and SM2 curve verified | Pass | + +Evidence collected: `CryptoProfile`, `OpenSslVersion`, `NativeSmSupport`, `AvailableAlgorithms`, `MissingAlgorithms`, `SM2CurveVerified`. + +The check only runs when `Crypto:Profile` or `Cryptography:Profile` contains "sm", "china", or equals "cn". + +## Why It Matters +Chinese regulatory requirements (GB/T standards) mandate the use of SM2, SM3, and SM4 algorithms for government systems, financial services, and critical infrastructure. Without SM algorithm support, the platform cannot create compliant digital signatures or encrypt data according to Chinese national standards. This blocks deployment in regulated Chinese environments and may violate the Cryptography Law of the People's Republic of China. + +## Common Causes +- OpenSSL version too old (pre-1.1.1) to include native SM support +- Using LibreSSL instead of OpenSSL (lacks SM algorithm support) +- System OpenSSL not updated to a version with SM support +- OpenSSL compiled without SM algorithm support (custom builds) +- SM algorithms disabled in OpenSSL configuration +- SM2 curve not properly initialized in the crypto provider +- Missing external SM crypto provider (e.g., GmSSL) + +## How to Fix + +### Docker Compose +```bash +# Check OpenSSL version (must be 1.1.1+) +docker compose exec gateway openssl version + +# Verify SM algorithm support +docker compose exec gateway openssl list -cipher-algorithms | grep -i sm +docker compose exec gateway openssl ecparam -list_curves | grep -i sm2 + +# Set crypto profile +# Crypto__Profile=cn + +# If OpenSSL is too old, rebuild with a newer base image +# FROM ubuntu:22.04 (includes OpenSSL 3.0+) + +docker compose restart gateway +``` + +### Bare Metal / systemd +```bash +# Check current OpenSSL version 
+openssl version + +# Update OpenSSL to 1.1.1+ if needed +sudo apt update && sudo apt install openssl + +# Verify SM algorithm support +openssl list -cipher-algorithms | grep -i sm +openssl ecparam -list_curves | grep -i sm2 + +# Configure SM crypto profile +stella crypto profile set --profile cn + +# Or use external SM provider (GmSSL) +stella crypto config set --sm-provider gmssl + +# In appsettings.json: +# "Crypto": { "Profile": "cn" } + +sudo systemctl restart stellaops-platform +``` + +### Kubernetes / Helm +```yaml +# values.yaml +crypto: + profile: cn + # Optionally specify external SM provider + smProvider: native # or "gmssl" for GmSSL +``` + +```bash +# Verify SM support in pod +kubectl exec deploy/stellaops-gateway -- openssl version +kubectl exec deploy/stellaops-gateway -- openssl ecparam -list_curves | grep -i sm2 + +helm upgrade stellaops ./charts/stellaops -f values.yaml +``` + +## Verification +``` +stella doctor run --check check.crypto.sm +``` + +## Related Checks +- `check.crypto.certchain` — certificates in CN deployments should use SM2 signatures +- `check.crypto.gost` — GOST (Russian) is another regional crypto profile with similar structure +- `check.crypto.fips` — FIPS and SM are mutually exclusive regional crypto profiles +- `check.crypto.hsm` — SM keys may be stored in an HSM with SM algorithm support diff --git a/docs/doctor/articles/docker/apiversion.md b/docs/doctor/articles/docker/apiversion.md new file mode 100644 index 000000000..6986fcbba --- /dev/null +++ b/docs/doctor/articles/docker/apiversion.md @@ -0,0 +1,94 @@ +--- +checkId: check.docker.apiversion +plugin: stellaops.doctor.docker +severity: warn +tags: [docker, api, compatibility] +--- +# Docker API Version + +## What It Checks +Validates that the Docker API version meets minimum requirements for Stella Ops. The check connects to the Docker daemon (using `Docker:Host` configuration or the platform default) and queries the API version via `System.GetVersionAsync()`. 
+ +| API Version | Result | +|---|---| +| Below **1.41** | `warn` — below minimum required | +| **1.41** or higher, but below **1.43** | `warn` — below recommended | +| **1.43** or higher | `pass` | + +The minimum API version 1.41 corresponds to Docker Engine 20.10+. The recommended version 1.43 corresponds to Docker Engine 24.0+. + +Evidence collected includes: API version, Docker version, minimum required version, recommended version, OS, build time, and git commit. + +Default Docker host: +- **Linux**: `unix:///var/run/docker.sock` +- **Windows**: `npipe://./pipe/docker_engine` + +## Why It Matters +Stella Ops uses Docker API features for container management, image inspection, and network configuration. Older API versions may not support required features such as: + +- BuildKit-based image builds (API 1.39+). +- Multi-platform image inspection (API 1.41+). +- Container resource management improvements (API 1.43+). + +Running an outdated Docker version also means missing security patches and bug fixes. + +## Common Causes +- Docker Engine is outdated (version < 20.10) +- Docker Engine is functional but below recommended version (< 24.0) +- Using a Docker-compatible runtime (Podman, containerd) that reports a lower API version +- Docker not updated after OS upgrade + +## How to Fix + +### Docker Compose +Update Docker Engine to the latest stable version: + +```bash +# Ubuntu/Debian +sudo apt-get update +sudo apt-get install docker-ce docker-ce-cli containerd.io + +# RHEL/CentOS +sudo yum update docker-ce docker-ce-cli containerd.io + +# Verify version +docker version +``` + +### Bare Metal / systemd +```bash +# Check current version +docker version + +# Update Docker +curl -fsSL https://get.docker.com | sh + +# Restart Docker +sudo systemctl restart docker + +# Verify +docker version +``` + +### Kubernetes / Helm +Update the container runtime on cluster nodes. 
The method depends on your Kubernetes distribution: + +```bash +# Check node runtime version +kubectl get nodes -o wide + +# For kubeadm clusters, update containerd on each node +sudo apt-get update && sudo apt-get install containerd.io + +# Verify +sudo crictl version +``` + +## Verification +``` +stella doctor run --check check.docker.apiversion +``` + +## Related Checks +- `check.docker.daemon` — verifies Docker daemon is running (prerequisite for version check) +- `check.docker.socket` — verifies Docker socket is accessible diff --git a/docs/doctor/articles/docker/daemon.md b/docs/doctor/articles/docker/daemon.md new file mode 100644 index 000000000..26837d104 --- /dev/null +++ b/docs/doctor/articles/docker/daemon.md @@ -0,0 +1,124 @@ +--- +checkId: check.docker.daemon +plugin: stellaops.doctor.docker +severity: fail +tags: [docker, daemon, container] +--- +# Docker Daemon + +## What It Checks +Validates that the Docker daemon is running and responsive. The check connects to the Docker daemon (using `Docker:Host` configuration or the platform default) and performs two operations: + +1. **Ping**: Sends a ping request to verify the daemon is alive (with a configurable timeout, default 10 seconds via `Docker:TimeoutSeconds`). +2. **Version**: Retrieves version information to confirm the daemon is fully operational. + +Evidence collected on success: host address, Docker version, API version, OS, architecture, and kernel version. + +On failure, the check distinguishes between: +- **DockerApiException**: The daemon is running but returned an error (reports status code and response body). +- **Connection failure**: Cannot connect to the daemon at all (Docker not installed, not running, or socket inaccessible). + +Default Docker host: +- **Linux**: `unix:///var/run/docker.sock` +- **Windows**: `npipe://./pipe/docker_engine` + +## Why It Matters +The Docker daemon is the core runtime for all Stella Ops containers. 
If the daemon is down: + +- No containers can start, stop, or restart. +- Health checks for all containerized services fail. +- Image pulls and builds are impossible. +- Docker Compose operations fail entirely. +- The entire Stella Ops platform is offline in container-based deployments. + +## Common Causes +- Docker daemon is not running or not accessible +- Docker is not installed on the host +- Docker service crashed or was stopped +- Docker daemon returned an error response (resource exhaustion, configuration error) +- Timeout connecting to the daemon (overloaded host, slow disk) + +## How to Fix + +### Docker Compose +Check and restart the Docker daemon: + +```bash +# Check daemon status +sudo systemctl status docker + +# Start the daemon +sudo systemctl start docker + +# Enable auto-start on boot +sudo systemctl enable docker + +# Verify +docker info +``` + +If Docker is not installed: +```bash +curl -fsSL https://get.docker.com | sh +sudo usermod -aG docker $USER +``` + +### Bare Metal / systemd +```bash +# Check status +sudo systemctl status docker + +# View daemon logs +sudo journalctl -u docker --since "10 minutes ago" + +# Restart the daemon +sudo systemctl restart docker + +# Verify connectivity +docker version +docker info +``` + +If the daemon crashes repeatedly, check for resource exhaustion: +```bash +# Check disk space (Docker requires space for images/containers) +df -h /var/lib/docker + +# Check memory +free -h + +# Clean up Docker resources +docker system prune -a +``` + +### Kubernetes / Helm +On Kubernetes nodes, the container runtime (containerd/CRI-O) replaces Docker daemon. 
Check the runtime: + +```bash +# Check containerd status +sudo systemctl status containerd + +# Check CRI-O status +sudo systemctl status crio + +# Restart if needed +sudo systemctl restart containerd +``` + +For Docker Desktop (development): +```bash +# Restart Docker Desktop +# macOS: killall Docker && open -a Docker +# Windows: Restart-Service docker +``` + +## Verification +``` +stella doctor run --check check.docker.daemon +``` + +## Related Checks +- `check.docker.socket` — verifies the Docker socket exists and has correct permissions +- `check.docker.apiversion` — verifies the Docker API version is compatible +- `check.docker.storage` — verifies Docker storage is healthy (requires running daemon) +- `check.docker.network` — verifies Docker networks are configured (requires running daemon) diff --git a/docs/doctor/articles/docker/network.md b/docs/doctor/articles/docker/network.md new file mode 100644 index 000000000..7c4600352 --- /dev/null +++ b/docs/doctor/articles/docker/network.md @@ -0,0 +1,104 @@ +--- +checkId: check.docker.network +plugin: stellaops.doctor.docker +severity: warn +tags: [docker, network, connectivity] +--- +# Docker Network + +## What It Checks +Validates Docker network configuration and connectivity. The check connects to the Docker daemon and lists all networks, then verifies: + +1. **Required networks exist**: Checks that each network listed in `Docker:RequiredNetworks` configuration is present. Defaults to `["bridge"]` if not configured. +2. **Bridge driver available**: Verifies at least one network using the `bridge` driver exists. + +Evidence collected includes: total network count, available network drivers, found/missing required networks, and bridge network name. + +If the Docker daemon is unreachable, the check is skipped. + +## Why It Matters +Docker networks provide isolated communication channels between containers. 
Stella Ops services communicate over dedicated networks for: + +- **Service-to-service communication**: Platform, Authority, Gateway, and other services need to reach each other. +- **Database access**: PostgreSQL and Valkey are on specific networks. +- **Network isolation**: Separating frontend, backend, and data tiers. + +Missing networks cause container DNS resolution failures and connection refused errors between services. + +## Common Causes +- Required network not found (not yet created or was deleted) +- No bridge network driver available (Docker networking misconfigured) +- Docker Compose network not created (compose project not started) +- Network name mismatch between configuration and actual Docker networks + +## How to Fix + +### Docker Compose +Docker Compose normally creates networks automatically. If they are missing: + +```bash +# List existing networks +docker network ls + +# Start compose to create networks +docker compose -f devops/compose/docker-compose.stella-ops.yml up -d + +# Create a network manually if needed +docker network create stellaops-network + +# Inspect a network +docker network inspect <network-name> +``` + +Configure required networks for the check: +```yaml +environment: + Docker__RequiredNetworks__0: "stellaops-network" + Docker__RequiredNetworks__1: "bridge" +``` + +### Bare Metal / systemd +For bare metal deployments, Docker networks must be created manually: + +```bash +# Create required networks +docker network create --driver bridge stellaops-frontend +docker network create --driver bridge stellaops-backend +docker network create --driver bridge stellaops-data + +# List networks +docker network ls + +# Inspect network details +docker network inspect stellaops-backend +``` + +### Kubernetes / Helm +Docker networks are not used in Kubernetes; instead, Kubernetes networking (Services, NetworkPolicies) handles inter-pod communication. 
Configure the check to skip Docker network requirements: + +```yaml +doctor: + docker: + requiredNetworks: [] # Not applicable in Kubernetes +``` + +Or verify Kubernetes networking: +```bash +# Check services +kubectl get svc -n stellaops + +# Check network policies +kubectl get networkpolicy -n stellaops + +# Test connectivity between pods +kubectl exec -it <pod-name> -- curl http://<service-name>:5000/health +``` + +## Verification +``` +stella doctor run --check check.docker.network +``` + +## Related Checks +- `check.docker.daemon` — Docker daemon must be running to query networks +- `check.docker.socket` — Docker socket must be accessible to communicate with the daemon diff --git a/docs/doctor/articles/docker/socket.md b/docs/doctor/articles/docker/socket.md new file mode 100644 index 000000000..900227f97 --- /dev/null +++ b/docs/doctor/articles/docker/socket.md @@ -0,0 +1,125 @@ +--- +checkId: check.docker.socket +plugin: stellaops.doctor.docker +severity: fail +tags: [docker, socket, permissions] +--- +# Docker Socket + +## What It Checks +Validates that the Docker socket exists and is accessible with correct permissions. The check behavior differs by platform: + +### Linux / Unix +Checks the Unix socket at the path extracted from `Docker:Host` (default: `/var/run/docker.sock`): + +| Condition | Result | +|---|---| +| Socket does not exist + running inside a container | `pass` — socket mount is optional for most services | +| Socket does not exist + not inside a container | `warn` | +| Socket exists but not readable or writable | `warn` — insufficient permissions | +| Socket exists and is readable + writable | `pass` | + +The check detects whether the process is running inside a Docker container by checking for `/.dockerenv` or `/proc/1/cgroup`. When running inside a container without a mounted socket, this is considered normal for services that don't need direct Docker access. 
+ +### Windows +On Windows, the check verifies that the named pipe path is configured (default: `npipe://./pipe/docker_engine`). The actual connectivity is deferred to the daemon check since named pipe access testing differs from Unix sockets. + +Evidence collected includes: socket path, existence, readability, writability, and whether the process is running inside a container. + +## Why It Matters +The Docker socket is the communication channel between clients (CLI, SDKs, Stella Ops services) and the Docker daemon. Without socket access: + +- Docker CLI commands fail. +- Services that manage containers (scanner, job engine) cannot create or inspect containers. +- Docker Compose operations fail. +- Health checks that query Docker state cannot run. + +Note that most Stella Ops services do NOT need direct Docker socket access. Only services that manage containers (e.g., scanner, job engine) require the socket to be mounted. + +## Common Causes +- Docker socket not found at the expected path +- Docker not installed or daemon not running +- Insufficient permissions on the socket file (user not in `docker` group) +- Docker socket not mounted into the container (for containerized services that need it) +- SELinux or AppArmor blocking socket access + +## How to Fix + +### Docker Compose +Mount the Docker socket for services that need container management: + +```yaml +services: + scanner: + volumes: + - /var/run/docker.sock:/var/run/docker.sock + + # Most services do NOT need the socket: + platform: + # No socket mount needed +``` + +Fix socket permissions on the host: +```bash +# Add your user to the docker group +sudo usermod -aG docker $USER + +# Log out and back in, then verify +docker ps +``` + +### Bare Metal / systemd +```bash +# Check if Docker is installed +which docker + +# Check socket existence +ls -la /var/run/docker.sock + +# Check socket permissions +stat /var/run/docker.sock + +# Add user to docker group +sudo usermod -aG docker $USER +logout # Must log out 
and back in + +# If socket is missing, start Docker +sudo systemctl start docker + +# Verify +docker ps +``` + +If SELinux is blocking access: +```bash +# Check SELinux denials +sudo ausearch -m avc -ts recent | grep docker + +# Allow Docker socket access (create a policy module) +sudo setsebool -P container_manage_cgroup on +``` + +### Kubernetes / Helm +In Kubernetes, the Docker socket is typically not available. Use the container runtime socket instead: + +```yaml +# For containerd +volumes: + - name: containerd-sock + hostPath: + path: /run/containerd/containerd.sock + type: Socket +``` + +Most Stella Ops services should NOT mount any runtime socket in Kubernetes. Only the scanner or job engine may need it for container-in-container operations. + +## Verification +``` +stella doctor run --check check.docker.socket +``` + +## Related Checks +- `check.docker.daemon` — verifies the Docker daemon is running and responsive (uses the socket) +- `check.docker.apiversion` — verifies Docker API version compatibility (requires socket access) +- `check.docker.network` — verifies Docker networks (requires socket access) +- `check.docker.storage` — verifies Docker storage (requires socket access) diff --git a/docs/doctor/articles/docker/storage.md b/docs/doctor/articles/docker/storage.md new file mode 100644 index 000000000..1d726aa7b --- /dev/null +++ b/docs/doctor/articles/docker/storage.md @@ -0,0 +1,123 @@ +--- +checkId: check.docker.storage +plugin: stellaops.doctor.docker +severity: warn +tags: [docker, storage, disk] +--- +# Docker Storage + +## What It Checks +Validates Docker storage driver and disk space usage. 
The check connects to the Docker daemon and retrieves system information, then inspects: + +| Condition | Result | +|---|---| +| Storage driver is not `overlay2`, `btrfs`, or `zfs` | `warn` — non-recommended driver | +| Free disk space on Docker root partition < **10 GB** (configurable via `Docker:MinFreeSpaceGb`) | `warn` | +| Disk usage > **85%** (configurable via `Docker:MaxStorageUsagePercent`) | `warn` | + +The check reads the Docker root directory (typically `/var/lib/docker`) and queries drive info for that partition. On platforms where disk info is unavailable, the check still validates the storage driver. + +Evidence collected includes: storage driver, Docker root directory, total space, free space, usage percentage, and whether the driver is recommended. + +## Why It Matters +Docker storage issues are a leading cause of container deployment failures: + +- **Non-recommended storage drivers** (e.g., `vfs`, `devicemapper`) have performance and reliability problems. `overlay2` is the recommended driver for most workloads. +- **Low disk space** prevents image pulls, container creation, and volume writes. Docker images and layers consume significant space. +- **High disk usage** can cause container crashes, database corruption, and evidence write failures. + +The Docker root directory often shares a partition with the OS, so storage exhaustion affects the entire host. 
+ +## Common Causes +- Storage driver is not overlay2, btrfs, or zfs (e.g., using legacy `devicemapper` or `vfs`) +- Low disk space on the Docker root partition (less than 10 GB free) +- Disk usage exceeds the 85% threshold +- Unused images, containers, and volumes consuming space +- Large build caches not pruned + +## How to Fix + +### Docker Compose +Check and clean Docker storage: + +```bash +# Check disk usage +docker system df + +# Detailed disk usage +docker system df -v + +# Prune unused data (images, containers, networks, build cache) +docker system prune -a + +# Prune volumes too (WARNING: removes data volumes) +docker system prune -a --volumes + +# Check storage driver +docker info | grep "Storage Driver" +``` + +Configure storage thresholds: +```yaml +environment: + Docker__MinFreeSpaceGb: "10" + Docker__MaxStorageUsagePercent: "85" +``` + +### Bare Metal / systemd +Switch to overlay2 storage driver if not already using it: + +```bash +# Check current driver +docker info | grep "Storage Driver" + +# Configure overlay2 in /etc/docker/daemon.json +{ + "storage-driver": "overlay2" +} + +# Restart Docker (WARNING: may require re-pulling images) +sudo systemctl restart docker +``` + +Free up disk space: +```bash +# Find large Docker directories +du -sh /var/lib/docker/* + +# Clean unused resources +docker system prune -a + +# Set up automatic cleanup via cron (WARNING: piping into `crontab -` replaces root's entire crontab; merge with the output of `sudo crontab -l` first if entries already exist) +echo "0 2 * * 0 docker system prune -f --filter 'until=168h'" | sudo crontab - +``` + +### Kubernetes / Helm +Monitor node disk usage: + +```bash +# Check node disk pressure +kubectl describe node | grep -A 5 "Conditions" + +# Check for DiskPressure condition +kubectl get nodes -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{range .status.conditions[?(@.type=="DiskPressure")]}{.status}{"\n"}{end}{end}' +``` + +Configure kubelet garbage collection thresholds: +```yaml +# In kubelet config +imageGCHighThresholdPercent: 85 +imageGCLowThresholdPercent: 80 +evictionHard: + nodefs.available: "10%" + 
imagefs.available: "15%" +``` + +## Verification +``` +stella doctor run --check check.docker.storage +``` + +## Related Checks +- `check.core.env.diskspace` — checks general disk space (not Docker-specific) +- `check.docker.daemon` — daemon must be running to query storage info diff --git a/docs/doctor/articles/environment/environment-capacity.md b/docs/doctor/articles/environment/environment-capacity.md new file mode 100644 index 000000000..f1ceeb8d5 --- /dev/null +++ b/docs/doctor/articles/environment/environment-capacity.md @@ -0,0 +1,84 @@ +--- +checkId: check.environment.capacity +plugin: stellaops.doctor.environment +severity: warn +tags: [environment, capacity, resources, cpu, memory, storage] +--- +# Environment Capacity + +## What It Checks +Queries the Release Orchestrator API (`/api/v1/environments/capacity`) and evaluates CPU, memory, storage, and deployment slot usage for every configured environment. Each resource is compared against two thresholds: +- **Warn** when usage >= 75% +- **Fail** when usage >= 90% + +Deployment slot utilization is calculated as `activeDeployments / maxConcurrentDeployments * 100`. If no environments exist, the check passes with a note. If the orchestrator is unreachable, the check returns warn. + +## Why It Matters +Resource exhaustion in a target environment blocks deployments and can cause running services to crash or degrade. Detecting capacity pressure early gives operators time to scale up, clean up unused deployments, or redistribute workloads before an outage occurs. In production environments, exceeding 90% on any resource dimension is a leading indicator of imminent service disruption. 
+ +## Common Causes +- Gradual organic growth without corresponding resource scaling +- Runaway or leaked processes consuming CPU/memory +- Accumulated old deployments that were never cleaned up +- Resource limits set too tightly relative to actual workload +- Unexpected traffic spike or batch job saturating storage + +## How to Fix + +### Docker Compose +```bash +# Check current resource usage on the host +docker stats --no-stream + +# Increase resource limits in docker-compose.stella-ops.yml +# Edit the target service under deploy.resources.limits: +# cpus: '4.0' +# memory: 8G + +# Remove stopped containers to free deployment slots +docker container prune -f + +# Restart with updated limits +docker compose -f docker-compose.stella-ops.yml up -d +``` + +### Bare Metal / systemd +```bash +# Check system resource usage +free -h && df -h && top -bn1 | head -20 + +# Increase memory/CPU limits in systemd unit overrides +sudo systemctl edit stellaops-environment-agent.service +# Add under [Service]: +# MemoryMax=8G +# CPUQuota=400% + +sudo systemctl daemon-reload && sudo systemctl restart stellaops-environment-agent.service + +# Clean up old deployments +stella env cleanup +``` + +### Kubernetes / Helm +```bash +# Check node resource usage +kubectl top nodes +kubectl top pods -n stellaops + +# Scale up resources via Helm values +helm upgrade stellaops stellaops/stellaops \ + --set environments.resources.limits.cpu=4 \ + --set environments.resources.limits.memory=8Gi \ + --set environments.maxConcurrentDeployments=20 + +# Or add more nodes to the cluster for horizontal scaling +``` + +## Verification +```bash +stella doctor run --check check.environment.capacity +``` + +## Related Checks +- `check.environment.deployments` - checks deployed service health, which may degrade under capacity pressure +- `check.environment.connectivity` - verifies agents are reachable, which capacity exhaustion can prevent diff --git 
a/docs/doctor/articles/environment/environment-connectivity.md b/docs/doctor/articles/environment/environment-connectivity.md new file mode 100644 index 000000000..a2bed256f --- /dev/null +++ b/docs/doctor/articles/environment/environment-connectivity.md @@ -0,0 +1,98 @@ +--- +checkId: check.environment.connectivity +plugin: stellaops.doctor.environment +severity: warn +tags: [environment, connectivity, agent, network] +--- +# Environment Connectivity + +## What It Checks +Retrieves the list of environments from the Release Orchestrator (`/api/v1/environments`), then probes each environment agent's `/health` endpoint. For each agent the check measures: +- **Reachability** -- whether the health endpoint returns a success status code +- **Latency** -- warns if the response takes more than 500ms +- **TLS certificate validity** -- warns if the agent's TLS certificate expires within 30 days +- **Authentication** -- detects 401/403 responses indicating credential issues + +If any agent is unreachable, the check fails. High latency or expiring certificates produce a warning. + +## Why It Matters +Environment agents are the control surface through which Stella Ops manages deployments, collects telemetry, and enforces policy. An unreachable agent means the platform cannot deploy to, monitor, or roll back services in that environment. TLS certificate expiry causes hard connectivity failures with no graceful degradation. High latency slows deployment pipelines and can cause timeouts in approval workflows. 
+ +## Common Causes +- Environment agent service is stopped or crashed +- Firewall rule change blocking the agent port +- Network partition between Stella Ops control plane and target environment +- TLS certificate not renewed before expiry +- Agent authentication credentials rotated without updating Stella Ops configuration +- DNS resolution failure for the agent hostname + +## How to Fix + +### Docker Compose +```bash +# Check if the environment agent container is running +docker ps --filter "name=environment-agent" + +# View agent logs for errors +docker logs stellaops-environment-agent --tail 100 + +# Restart the agent +docker compose -f docker-compose.stella-ops.yml restart environment-agent + +# If TLS cert is expiring, replace the certificate files +# mounted into the agent container and restart +cp /path/to/new/cert.pem devops/compose/certs/agent.pem +cp /path/to/new/key.pem devops/compose/certs/agent-key.pem +docker compose -f docker-compose.stella-ops.yml restart environment-agent +``` + +### Bare Metal / systemd +```bash +# Check agent service status +sudo systemctl status stellaops-environment-agent + +# View logs +sudo journalctl -u stellaops-environment-agent --since "1 hour ago" + +# Restart agent +sudo systemctl restart stellaops-environment-agent + +# Renew TLS certificate +sudo cp /path/to/new/cert.pem /etc/stellaops/certs/agent.pem +sudo cp /path/to/new/key.pem /etc/stellaops/certs/agent-key.pem +sudo systemctl restart stellaops-environment-agent + +# Test network connectivity from control plane +curl -v https://<agent-host>:<agent-port>/health +``` + +### Kubernetes / Helm +```bash +# Check agent pod status +kubectl get pods -n stellaops -l app=environment-agent + +# View agent logs +kubectl logs -n stellaops -l app=environment-agent --tail=100 + +# Restart agent pods +kubectl rollout restart deployment/environment-agent -n stellaops + +# Renew TLS certificate via cert-manager or manual secret update +kubectl create secret tls agent-tls \ + --cert=/path/to/cert.pem \ + 
--key=/path/to/key.pem \ + -n stellaops --dry-run=client -o yaml | kubectl apply -f - + +# Check network policies +kubectl get networkpolicies -n stellaops +``` + +## Verification +```bash +stella doctor run --check check.environment.connectivity +``` + +## Related Checks +- `check.environment.deployments` - checks health of services deployed via agents +- `check.environment.network.policy` - verifies network policies that may block agent connectivity +- `check.environment.secrets` - agent credentials may need rotation diff --git a/docs/doctor/articles/environment/environment-deployment-health.md b/docs/doctor/articles/environment/environment-deployment-health.md new file mode 100644 index 000000000..9a38e5262 --- /dev/null +++ b/docs/doctor/articles/environment/environment-deployment-health.md @@ -0,0 +1,90 @@ +--- +checkId: check.environment.deployments +plugin: stellaops.doctor.environment +severity: warn +tags: [environment, deployment, services, health] +--- +# Environment Deployment Health + +## What It Checks +Queries the Release Orchestrator (`/api/v1/environments/deployments`) for all deployed services across all environments. Each service is evaluated for: +- **Status** -- `failed`, `stopped`, `degraded`, or healthy +- **Replica health** -- compares `healthyReplicas` against total `replicas`; partial health triggers degraded status + +Severity escalation: +- **Fail** if any production service has status `failed` (production detected by environment name containing "prod") +- **Fail** if any non-production service has status `failed` +- **Warn** if services are `degraded` (partial replica health) +- **Warn** if services are `stopped` +- **Pass** if all services are healthy + +## Why It Matters +Failed services in production directly impact end users and violate SLA commitments. Degraded services with partial replica health reduce fault tolerance and can cascade into full outages under load. 
Stopped services may indicate incomplete deployments or maintenance windows that were never closed. This check provides the earliest signal that a deployment rollout needs intervention. + +## Common Causes +- Service crashed due to unhandled exception or OOM kill +- Deployment rolled out a bad image version +- Dependency (database, cache, message broker) became unavailable +- Resource exhaustion preventing replicas from starting +- Health check endpoint misconfigured, causing false failures +- Node failure taking down co-located replicas + +## How to Fix + +### Docker Compose +```bash +# Identify failed containers +docker ps -a --filter "status=exited" --filter "status=dead" + +# View logs for the failed service +docker logs --tail 200 + +# Restart the failed service +docker compose -f docker-compose.stella-ops.yml restart + +# If the image is bad, roll back to previous version +# Edit docker-compose.stella-ops.yml to pin the previous image tag +docker compose -f docker-compose.stella-ops.yml up -d +``` + +### Bare Metal / systemd +```bash +# Check service status +sudo systemctl status stellaops- + +# View logs for crash details +sudo journalctl -u stellaops- --since "30 minutes ago" --no-pager + +# Restart the service +sudo systemctl restart stellaops- + +# Roll back to previous binary +sudo cp /opt/stellaops/backup/ /opt/stellaops/bin/ +sudo systemctl restart stellaops- +``` + +### Kubernetes / Helm +```bash +# Check pod status across environments +kubectl get pods -n stellaops- --field-selector=status.phase!=Running + +# View events and logs for failing pods +kubectl describe pod -n stellaops- +kubectl logs -n stellaops- --previous + +# Rollback a deployment +kubectl rollout undo deployment/ -n stellaops- + +# Or via Helm +helm rollback stellaops -n stellaops- +``` + +## Verification +```bash +stella doctor run --check check.environment.deployments +``` + +## Related Checks +- `check.environment.capacity` - resource exhaustion can cause deployment failures +- 
`check.environment.connectivity` - agent must be reachable to report deployment health +- `check.environment.drift` - configuration drift can cause services to fail after redeployment diff --git a/docs/doctor/articles/environment/environment-drift.md b/docs/doctor/articles/environment/environment-drift.md new file mode 100644 index 000000000..72dac29d9 --- /dev/null +++ b/docs/doctor/articles/environment/environment-drift.md @@ -0,0 +1,86 @@ +--- +checkId: check.environment.drift +plugin: stellaops.doctor.environment +severity: warn +tags: [environment, drift, configuration, consistency] +--- +# Environment Drift Detection + +## What It Checks +Queries the Release Orchestrator drift report API (`/api/v1/environments/drift`) and compares configuration snapshots across environments. The check requires at least 2 environments to perform comparison. Each drift item carries a severity classification: +- **Fail** if any drift is classified as `critical` (e.g., security-relevant configuration differences between staging and production) +- **Warn** if drifts exist but none are critical +- **Pass** if no configuration drift is detected between environments + +Evidence includes the specific configuration keys that drifted and which environments are affected. + +## Why It Matters +Configuration drift between environments undermines the core promise of promotion-based releases: that what you test in staging is what runs in production. Drift can cause subtle behavioral differences that only manifest under production load, making bugs nearly impossible to reproduce. Critical drift in security-related configuration (TLS settings, authentication, network policies) can create compliance violations and security exposures. 
+ +## Common Causes +- Manual configuration changes applied directly to one environment (bypassing the release pipeline) +- Failed deployment that left partial configuration in one environment +- Configuration sync job that did not propagate to all environments +- Environment restored from an outdated backup +- Intentional per-environment overrides that were not tracked as accepted exceptions + +## How to Fix + +### Docker Compose +```bash +# View the current drift report +stella env drift show + +# Compare specific configuration between environments +diff <(docker exec stellaops-staging cat /app/appsettings.json) \ + <(docker exec stellaops-prod cat /app/appsettings.json) + +# Reconcile by redeploying from the canonical source +docker compose -f docker-compose.stella-ops.yml up -d --force-recreate + +# If drift is intentional, mark it as accepted +stella env drift accept +``` + +### Bare Metal / systemd +```bash +# View drift report +stella env drift show + +# Compare config files between environments +diff /etc/stellaops/staging/appsettings.json /etc/stellaops/prod/appsettings.json + +# Reconcile by copying from source of truth +sudo cp /etc/stellaops/staging/appsettings.json /etc/stellaops/prod/appsettings.json +sudo systemctl restart stellaops- + +# Or accept drift as intentional +stella env drift accept +``` + +### Kubernetes / Helm +```bash +# View drift between environments +stella env drift show + +# Compare Helm values between environments +diff <(helm get values stellaops -n stellaops-staging -o yaml) \ + <(helm get values stellaops -n stellaops-prod -o yaml) + +# Reconcile by redeploying with consistent values +helm upgrade stellaops stellaops/stellaops -n stellaops-prod \ + -f values-prod.yaml + +# Compare ConfigMaps +kubectl diff -f configmap.yaml -n stellaops-prod +``` + +## Verification +```bash +stella doctor run --check check.environment.drift +``` + +## Related Checks +- `check.environment.deployments` - drift can cause service failures after 
redeployment +- `check.environment.secrets` - secret configuration differences between environments +- `check.environment.network.policy` - network policy drift is a security concern diff --git a/docs/doctor/articles/environment/environment-network-policy.md b/docs/doctor/articles/environment/environment-network-policy.md new file mode 100644 index 000000000..49086db3c --- /dev/null +++ b/docs/doctor/articles/environment/environment-network-policy.md @@ -0,0 +1,107 @@ +--- +checkId: check.environment.network.policy +plugin: stellaops.doctor.environment +severity: warn +tags: [environment, network, policy, security, isolation] +--- +# Environment Network Policy + +## What It Checks +Retrieves network policies from the Release Orchestrator (`/api/v1/environments/network-policies`) and evaluates isolation posture for each environment. The check enforces these rules: +- **Production environments must not allow ingress from dev** -- detected as critical violation +- **Production environments should use default-deny policies** -- missing default-deny is a warning +- **No environment should have wildcard ingress** (`*` or `0.0.0.0/0`) -- critical for production, warning for others +- **Wildcard egress** (`*` or `0.0.0.0/0`) is flagged as informational + +Severity: +- **Fail** if any critical violations exist (prod ingress from dev, wildcard ingress on prod) +- **Warn** if only warning-level violations exist (missing default-deny, wildcard ingress on non-prod) +- **Warn** if no network policies are configured at all +- **Pass** if all policies are correctly configured + +## Why It Matters +Network isolation between environments is a fundamental security control. Allowing dev-to-production ingress means compromised development infrastructure can directly attack production services. Missing default-deny policies mean any new service added to the environment is implicitly network-accessible. Wildcard ingress exposes services to the entire network or internet. 
These misconfigurations are common audit findings that can block compliance certifications. + +## Common Causes +- Network policies not yet defined for a new environment +- Legacy policy left in place from initial setup +- Production policy copied from dev without tightening rules +- Manual firewall rule change not reflected in Stella Ops policy +- Policy update deployed to staging but not promoted to production + +## How to Fix + +### Docker Compose +```bash +# Review current network policies +stella env network-policy list + +# Create a default-deny policy for production +stella env network-policy create prod --default-deny + +# Allow only staging ingress to production +stella env network-policy update prod --default-deny --allow-from staging + +# Restrict egress to specific destinations +stella env network-policy update prod --egress-allow "10.0.0.0/8,registry.internal" + +# In Docker Compose, use network isolation +# Define separate networks in docker-compose.stella-ops.yml: +# networks: +# prod-internal: +# internal: true +# staging-internal: +# internal: true +``` + +### Bare Metal / systemd +```bash +# Review current iptables/nftables rules +sudo iptables -L -n -v +# or +sudo nft list ruleset + +# Apply default-deny for production network interface +sudo iptables -A INPUT -i prod0 -j DROP +sudo iptables -I INPUT -i prod0 -s -j ACCEPT + +# Or configure via stellaops policy +stella env network-policy update prod --default-deny --allow-from staging + +# Persist firewall rules +sudo netfilter-persistent save +``` + +### Kubernetes / Helm +```bash +# Review existing network policies +kubectl get networkpolicies -n stellaops-prod + +# Apply default-deny via Helm +helm upgrade stellaops stellaops/stellaops \ + --set environments.prod.networkPolicy.defaultDeny=true \ + --set environments.prod.networkPolicy.allowFrom[0]=stellaops-staging + +# Or apply a NetworkPolicy manifest directly +cat < + +# Check secret provider connectivity +stella secrets provider status + +# 
Update secret in .env file for compose deployments +# Edit devops/compose/.env with the new secret value +# Then restart affected services +docker compose -f docker-compose.stella-ops.yml restart +``` + +### Bare Metal / systemd +```bash +# List secrets with expiry details +stella env secrets list --expiring + +# Rotate expired secret +stella env secrets rotate + +# If using file-based secrets, update the file +sudo vi /etc/stellaops/secrets/ +sudo chmod 600 /etc/stellaops/secrets/ +sudo systemctl restart stellaops- + +# Schedule automated rotation +stella env secrets rotate-scheduled --days 7 +``` + +### Kubernetes / Helm +```bash +# List expiring secrets +stella env secrets list --expiring + +# Rotate secret and update Kubernetes secret +stella env secrets rotate + +# Or update manually +kubectl create secret generic \ + --from-literal=value= \ + -n stellaops- --dry-run=client -o yaml | kubectl apply -f - + +# Restart pods to pick up new secret +kubectl rollout restart deployment/ -n stellaops- + +# For external-secrets-operator, trigger a refresh +kubectl annotate externalsecret -n stellaops force-sync=$(date +%s) +``` + +## Verification +```bash +stella doctor run --check check.environment.secrets +``` + +## Related Checks +- `check.environment.connectivity` - expired agent credentials cause connectivity failures +- `check.environment.deployments` - services fail when their secrets expire +- `check.integration.secrets.manager` - verifies the secrets manager itself is healthy diff --git a/docs/doctor/articles/evidence-locker/index.md b/docs/doctor/articles/evidence-locker/index.md new file mode 100644 index 000000000..d0b059695 --- /dev/null +++ b/docs/doctor/articles/evidence-locker/index.md @@ -0,0 +1,120 @@ +--- +checkId: check.evidencelocker.index +plugin: stellaops.doctor.evidencelocker +severity: warn +tags: [evidence, index, consistency] +--- +# Evidence Index Consistency + +## What It Checks +Verifies that the evidence index is consistent with the 
artifacts stored on disk. The check operates on the local evidence locker path (`EvidenceLocker:Path`) and performs: + +1. **Index existence**: looks for `index.json` or an `index/` directory at the locker root. +2. **Artifact counting**: counts `.json` files across five artifact directories: `attestations/`, `sboms/`, `vex/`, `verdicts/`, `provenance/`. +3. **Cross-reference validation**: for each entry in `index.json`, verifies the referenced artifact file exists on disk. Records any artifacts that are indexed but missing from disk. +4. **Drift detection**: compares the total indexed count against the total disk artifact count. Flags a warning if drift exceeds 10% of total artifacts. + +| Condition | Result | +|---|---| +| Evidence locker path not configured or missing | Skip | +| Index file and index directory both missing | Warn | +| Artifacts indexed but missing from disk | Fail | +| Index count drifts > 10% from disk count | Warn | +| Index consistent with disk artifacts | Pass | + +Evidence collected: `IndexedCount`, `DiskArtifactCount`, `MissingFromDisk`, `MissingSamples`, `Drift`, per-directory counts (`attestationsCount`, `sbomsCount`, `vexCount`, `verdictsCount`, `provenanceCount`). + +The check only runs when `EvidenceLocker:Path` is configured and the directory exists. + +## Why It Matters +The evidence index provides fast lookup for attestations, SBOMs, VEX documents, and provenance records. An inconsistent index means queries may return stale references to deleted artifacts (causing retrieval errors) or miss artifacts that exist on disk (causing incomplete audit reports). Index drift accumulates over time and degrades the reliability of evidence searches, compliance exports, and release verification lookups. 
+ +## Common Causes +- Index never created (evidence locker not initialized) +- Index file was deleted or corrupted +- Artifacts deleted without updating the index (manual cleanup) +- Disk corruption causing artifact loss +- Background indexer not running or crashed +- Race condition during concurrent writes +- Incomplete cleanup operations removing files but not index entries + +## How to Fix + +### Docker Compose +```bash +# Check index status +docker compose exec evidence-locker ls -la /data/evidence/index.json + +# Rebuild evidence index +docker compose exec evidence-locker stella evidence index rebuild + +# Fix orphaned index entries +docker compose exec evidence-locker stella evidence index rebuild --fix-orphans + +# Verify evidence integrity after rebuild +docker compose exec evidence-locker stella evidence verify --all + +# Refresh index (less aggressive than rebuild) +docker compose exec evidence-locker stella evidence index refresh + +# Check disk health +docker compose exec evidence-locker df -h /data/evidence +``` + +### Bare Metal / systemd +```bash +# Check index file +ls -la /var/lib/stellaops/evidence/index.json + +# Rebuild evidence index +stella evidence index rebuild + +# Fix orphaned entries +stella evidence index rebuild --fix-orphans + +# Refresh index +stella evidence index refresh + +# Check for disk errors +sudo fsck -n /dev/sda1 + +# Verify evidence integrity +stella evidence verify --all + +sudo systemctl restart stellaops-evidence-locker +``` + +### Kubernetes / Helm +```bash +# Check index in pod +kubectl exec deploy/stellaops-evidence-locker -- ls -la /data/evidence/index.json + +# Rebuild index +kubectl exec deploy/stellaops-evidence-locker -- stella evidence index rebuild --fix-orphans + +# Verify evidence +kubectl exec deploy/stellaops-evidence-locker -- stella evidence verify --all + +# Check persistent volume health +kubectl describe pvc stellaops-evidence-data +``` + +```yaml +# values.yaml - enable background indexer 
+evidenceLocker: + indexer: + enabled: true + intervalMinutes: 15 + repairOnDrift: true +``` + +## Verification +``` +stella doctor run --check check.evidencelocker.index +``` + +## Related Checks +- `check.evidencelocker.retrieval` — retrieval depends on index accuracy for lookups +- `check.evidencelocker.provenance` — provenance records are one of the indexed artifact types +- `check.evidencelocker.merkle` — Merkle anchors reference indexed artifacts +- `check.compliance.evidence-integrity` — evidence integrity includes index consistency diff --git a/docs/doctor/articles/evidence-locker/merkle.md b/docs/doctor/articles/evidence-locker/merkle.md new file mode 100644 index 000000000..48bdff9bf --- /dev/null +++ b/docs/doctor/articles/evidence-locker/merkle.md @@ -0,0 +1,122 @@ +--- +checkId: check.evidencelocker.merkle +plugin: stellaops.doctor.evidencelocker +severity: fail +tags: [evidence, merkle, anchoring, integrity] +--- +# Merkle Anchor Verification + +## What It Checks +Verifies Merkle root anchoring integrity when anchoring is enabled. The check operates on anchor records stored in the `anchors/` subdirectory of the evidence locker path. It validates: + +1. **Anchor record presence**: checks for `.json` anchor files in the anchors directory. +2. **Anchor structural validity**: each anchor must contain `merkleRoot`, `timestamp`, and `signature` fields with non-empty values. +3. **Anchor integrity**: validates the most recent 5 anchor records for structural completeness and signature presence. +4. **Anchor freshness**: compares the latest anchor timestamp against the configured interval (`EvidenceLocker:Anchoring:IntervalHours`, default 24). Warns if the latest anchor is older than 2x the configured interval. 
+ +| Condition | Result | +|---|---| +| Anchoring not enabled | Skip | +| Evidence locker path not configured | Skip | +| No anchor records found | Warn | +| Any anchor records invalid (missing fields, corrupt) | Fail | +| Latest anchor older than 2x configured interval | Warn | +| All checked anchors valid and fresh | Pass | + +Evidence collected: `CheckedCount`, `ValidCount`, `InvalidCount`, `InvalidAnchors`, `LatestAnchorTime`, `AnchorAgeHours`, `ExpectedIntervalHours`, `LatestRoot`. + +The check only runs when `EvidenceLocker:Anchoring:Enabled` is set to "true". + +## Why It Matters +Merkle anchoring provides cryptographic proof that evidence has not been tampered with since the anchor was created. Each anchor captures the Merkle root hash of all evidence at a point in time, creating an immutable checkpoint. Invalid anchors mean the integrity chain is broken and evidence cannot be independently verified against its anchor. Stale anchors indicate the anchoring job has stopped running, creating a window where evidence changes are not captured by any checkpoint. 
+ +## Common Causes +- Anchoring job not run yet (new deployment) +- Anchoring job scheduler not running or misconfigured +- Anchor record corrupted on disk +- Merkle root hash mismatch due to evidence modification after anchoring +- Evidence tampered or modified after the anchor was created +- Anchors directory deleted during maintenance +- Anchor creation failing silently (check job logs) + +## How to Fix + +### Docker Compose +```bash +# Create an initial anchor +docker compose exec evidence-locker stella evidence anchor create + +# Check anchor job status +docker compose exec evidence-locker stella evidence anchor status + +# Audit anchor integrity +docker compose exec evidence-locker stella evidence anchor audit --full + +# Verify a specific anchor +docker compose exec evidence-locker stella evidence anchor verify + +# Enable anchoring in configuration +# EvidenceLocker__Anchoring__Enabled=true +# EvidenceLocker__Anchoring__IntervalHours=24 + +docker compose restart evidence-locker +``` + +### Bare Metal / systemd +```bash +# Create an anchor +stella evidence anchor create + +# Check anchor status +stella evidence anchor status + +# Full anchor audit +stella evidence anchor audit --full + +# Configure anchoring in appsettings.json +# "EvidenceLocker": { +# "Anchoring": { +# "Enabled": true, +# "IntervalHours": 24 +# } +# } + +# Verify the anchor job is scheduled +stella jobs list --filter anchor + +sudo systemctl restart stellaops-evidence-locker +``` + +### Kubernetes / Helm +```yaml +# values.yaml +evidenceLocker: + anchoring: + enabled: true + intervalHours: 24 + schedule: "0 */24 * * *" # Every 24 hours +``` + +```bash +# Create initial anchor +kubectl exec deploy/stellaops-evidence-locker -- stella evidence anchor create + +# Check anchor status +kubectl exec deploy/stellaops-evidence-locker -- stella evidence anchor status + +# Audit anchors +kubectl exec deploy/stellaops-evidence-locker -- stella evidence anchor audit --full + +helm upgrade stellaops 
./charts/stellaops -f values.yaml +``` + +## Verification +``` +stella doctor run --check check.evidencelocker.merkle +``` + +## Related Checks +- `check.evidencelocker.provenance` — provenance chain integrity complements Merkle anchoring +- `check.evidencelocker.index` — index consistency ensures anchored artifacts are still present +- `check.evidencelocker.retrieval` — retrieval health required to validate anchored artifacts +- `check.compliance.evidence-integrity` — evidence integrity is the broader check that includes anchoring diff --git a/docs/doctor/articles/evidence-locker/provenance.md b/docs/doctor/articles/evidence-locker/provenance.md new file mode 100644 index 000000000..af325ec1c --- /dev/null +++ b/docs/doctor/articles/evidence-locker/provenance.md @@ -0,0 +1,116 @@ +--- +checkId: check.evidencelocker.provenance +plugin: stellaops.doctor.evidencelocker +severity: fail +tags: [evidence, provenance, integrity, chain] +--- +# Provenance Chain Integrity + +## What It Checks +Validates provenance chain integrity using random sample verification. The check operates on provenance records stored in the `provenance/` subdirectory of the evidence locker path. It performs: + +1. **Random sampling**: selects up to 5 random provenance records from the pool for validation (configurable via `SampleSize` constant). +2. **Hash verification**: for each sampled record, reads the `contentHash` and `payload` fields, recomputes the SHA-256 hash of the payload, and compares it against the declared hash. Supports `sha256:` prefixed hash values. +3. **Structural validation**: verifies that each record contains required `contentHash` and `payload` fields. 
+ +| Condition | Result | +|---|---| +| Evidence locker path not configured | Skip | +| No provenance directory or no records | Pass (nothing to verify) | +| Any sampled records fail hash verification | Fail | +| All sampled records pass hash verification | Pass | + +Evidence collected: `TotalRecords`, `SamplesChecked`, `ValidCount`, `InvalidCount`, `InvalidRecords`. + +The check only runs when `EvidenceLocker:Path` is configured and the directory exists. + +## Why It Matters +Provenance records link each software artifact to its build source, build system, and build steps. The content hash ensures that the provenance payload has not been modified since it was created. A broken hash indicates the provenance record was corrupted or tampered with, which invalidates the supply-chain integrity guarantee for the associated release. Even a single invalid provenance record undermines trust in the entire provenance chain and should be investigated as a potential security incident. + +## Common Causes +- Provenance record corrupted on disk (storage errors, incomplete writes) +- Hash verification failure after accidental file modification +- Chain link broken due to missing predecessor records +- Data tampered or modified by unauthorized access +- Hash format mismatch (missing or extra `sha256:` prefix) +- Character encoding differences during payload serialization + +## How to Fix + +### Docker Compose +```bash +# Run full provenance audit +docker compose exec evidence-locker stella evidence audit --type provenance --full + +# Check specific invalid records +docker compose exec evidence-locker stella evidence verify --id + +# Review evidence locker integrity +docker compose exec evidence-locker stella evidence integrity-check + +# Check for storage errors +docker compose exec evidence-locker dmesg | grep -i error + +# Check disk health +docker compose exec evidence-locker df -h /data/evidence/provenance/ +``` + +### Bare Metal / systemd +```bash +# Full provenance audit 
+stella evidence audit --type provenance --full + +# Verify specific records +stella evidence verify --id + +# Full integrity check +stella evidence integrity-check + +# Check filesystem health +sudo fsck -n /dev/sda1 + +# Check for disk I/O errors +dmesg | grep -i "i/o error" + +# List provenance records +ls -la /var/lib/stellaops/evidence/provenance/ +``` + +### Kubernetes / Helm +```bash +# Full provenance audit +kubectl exec deploy/stellaops-evidence-locker -- stella evidence audit --type provenance --full + +# Verify specific record +kubectl exec deploy/stellaops-evidence-locker -- stella evidence verify --id + +# Integrity check +kubectl exec deploy/stellaops-evidence-locker -- stella evidence integrity-check + +# Check persistent volume health +kubectl describe pvc stellaops-evidence-data + +# Check for pod restarts that might indicate storage issues +kubectl get events --field-selector involvedObject.name=stellaops-evidence-locker -n stellaops +``` + +```yaml +# values.yaml - schedule periodic integrity checks +evidenceLocker: + integrityCheck: + enabled: true + schedule: "0 4 * * *" # Daily at 4am + sampleSize: 10 +``` + +## Verification +``` +stella doctor run --check check.evidencelocker.provenance +``` + +## Related Checks +- `check.evidencelocker.merkle` — Merkle anchoring provides checkpoint-level integrity on top of per-record verification +- `check.evidencelocker.index` — index consistency ensures provenance records are discoverable +- `check.evidencelocker.retrieval` — retrieval health is required to access provenance records +- `check.compliance.provenance-completeness` — verifies provenance exists for all releases (completeness vs. 
integrity) +- `check.compliance.evidence-integrity` — broader evidence integrity check including provenance diff --git a/docs/doctor/articles/evidence-locker/retrieval.md b/docs/doctor/articles/evidence-locker/retrieval.md new file mode 100644 index 000000000..8713d3b8d --- /dev/null +++ b/docs/doctor/articles/evidence-locker/retrieval.md @@ -0,0 +1,139 @@ +--- +checkId: check.evidencelocker.retrieval +plugin: stellaops.doctor.evidencelocker +severity: fail +tags: [evidence, attestation, retrieval, core] +--- +# Attestation Retrieval + +## What It Checks +Verifies that attestation artifacts can be retrieved from the evidence locker. The check supports two modes depending on the deployment: + +**HTTP mode** (when `IHttpClientFactory` is available): +Sends a GET request to `{endpoint}/v1/attestations/sample` with a 5-second timeout and measures response latency. + +**Local file mode** (fallback): +Checks the local evidence locker path at `EvidenceLocker:Path`, verifies the `attestations/` subdirectory exists, and attempts to read a sample attestation JSON file. + +| Condition | Result | +|---|---| +| Endpoint not configured | Skip | +| HTTP request times out (> 5000ms) | Fail | +| HTTP error status code | Fail | +| Connection error | Fail | +| HTTP success but latency > 500ms | Warn | +| Local attestations directory missing | Warn | +| HTTP success with latency <= 500ms | Pass | +| Local file read successful | Pass | + +Evidence collected: `Endpoint`, `StatusCode`, `LatencyMs`, `Threshold`, `Path`, `SampleAttestation`, `ContentLength`. + +The check only runs when `EvidenceLocker:Endpoint` or `Services:EvidenceLocker` is configured. + +## Why It Matters +Attestation retrieval is a core operation used throughout the release pipeline. Release approvals, audit queries, compliance reports, and evidence exports all depend on being able to retrieve attestation artifacts from the evidence locker. 
If retrieval is slow or failing, release approvals may time out, audit queries will fail, and compliance reports cannot be generated. Latency above 500ms indicates performance degradation that will compound when retrieving multiple attestations during a release or audit. + +## Common Causes +- Evidence locker service unavailable or not running +- Authentication failure when accessing the evidence locker API +- Artifact not found (empty or uninitialized evidence locker) +- Evidence locker under heavy load causing elevated latency +- Network latency between services +- Storage backend slow (disk I/O bottleneck) +- Local evidence locker path not configured or directory missing +- File permission issues on local attestation files + +## How to Fix + +### Docker Compose +```bash +# Check evidence locker service status +docker compose ps evidence-locker + +# Test evidence retrieval +docker compose exec evidence-locker stella evidence status + +# Test authentication +docker compose exec evidence-locker stella evidence auth-test + +# Check service logs for errors +docker compose logs evidence-locker --since 5m + +# If local mode, verify the evidence path and permissions +docker compose exec evidence-locker ls -la /data/evidence/attestations/ + +# Initialize evidence locker if needed +docker compose exec evidence-locker stella evidence init + +# Set endpoint configuration +# EvidenceLocker__Endpoint=http://evidence-locker:5080 +``` + +### Bare Metal / systemd +```bash +# Check service status +sudo systemctl status stellaops-evidence-locker + +# Test evidence retrieval +stella evidence status + +# Test connectivity +stella evidence ping + +# Check attestations directory +ls -la /var/lib/stellaops/evidence/attestations/ + +# Initialize if empty +stella evidence init + +# Check disk I/O +iostat -x 1 5 + +# In appsettings.json: +# "EvidenceLocker": { "Endpoint": "http://localhost:5080" } + +sudo systemctl restart stellaops-evidence-locker +``` + +### Kubernetes / Helm +```bash 
+# Check evidence locker pod status +kubectl get pods -l app=stellaops-evidence-locker + +# Check pod logs +kubectl logs deploy/stellaops-evidence-locker --since=5m + +# Test retrieval from within cluster +kubectl exec deploy/stellaops-evidence-locker -- stella evidence status + +# Check persistent volume +kubectl describe pvc stellaops-evidence-data + +# Check for resource constraints +kubectl top pod -l app=stellaops-evidence-locker +``` + +```yaml +# values.yaml +evidenceLocker: + endpoint: http://stellaops-evidence-locker:5080 + resources: + requests: + memory: 256Mi + cpu: 100m + limits: + memory: 512Mi + cpu: 500m +``` + +## Verification +``` +stella doctor run --check check.evidencelocker.retrieval +``` + +## Related Checks +- `check.evidencelocker.index` — evidence index consistency affects retrieval accuracy +- `check.evidencelocker.provenance` — provenance chain integrity depends on reliable retrieval +- `check.evidencelocker.merkle` — Merkle anchor verification requires attestation access +- `check.compliance.evidence-rate` — evidence generation feeds the retrieval pipeline +- `check.compliance.evidence-integrity` — integrity verification requires successful retrieval diff --git a/docs/doctor/articles/integration/ci-system-connectivity.md b/docs/doctor/articles/integration/ci-system-connectivity.md new file mode 100644 index 000000000..2ffff4598 --- /dev/null +++ b/docs/doctor/articles/integration/ci-system-connectivity.md @@ -0,0 +1,71 @@ +--- +checkId: check.integration.ci.system +plugin: stellaops.doctor.integration +severity: warn +tags: [integration, ci, cd, jenkins, gitlab, github] +--- +# CI System Connectivity + +## What It Checks +Iterates over all CI/CD systems defined under `CI:Systems` (or the legacy `CI:Url` single-system key). 
For each system it sends an HTTP GET to a type-specific health endpoint (Jenkins `/api/json`, GitLab `/api/v4/version`, GitHub `/rate_limit`, Azure DevOps `/_apis/connectionData`, or generic `/health`), sets the appropriate auth header (Bearer for GitHub/generic, `PRIVATE-TOKEN` for GitLab), and records reachability, authentication success, and latency. If the system is reachable and authenticated, it optionally queries runner/agent status (Jenkins `/computer/api/json`, GitLab `/api/v4/runners?status=online`). The check **fails** when any system is unreachable or returns 401/403, **warns** when all systems are reachable but one or more has zero available runners (out of a non-zero total), and **passes** otherwise. + +## Why It Matters +CI/CD systems are the trigger point for automated builds, tests, and release pipelines. If a CI system is unreachable or its credentials have expired, new commits will not be built, security scans will not run, and promotions will stall. Runner exhaustion has the same effect: pipelines queue indefinitely, delaying releases and blocking evidence collection. 
+ +## Common Causes +- CI system is down or undergoing maintenance +- Network connectivity issue between Stella Ops and the CI host +- API credentials (token or password) have expired or been rotated +- Firewall or security group blocking the CI API port +- All CI runners/agents are offline or busy + +## How to Fix + +### Docker Compose +```bash +# Verify the CI URL is correct in your environment file +grep -E '^CI__' .env + +# Test connectivity from within the Docker network +docker compose exec gateway curl -sv https://ci.example.com/api/json + +# Rotate or set a new API token +echo 'CI__Systems__0__ApiToken=<new-api-token>' >> .env +docker compose restart gateway +``` + +### Bare Metal / systemd +```bash +# Check config in appsettings +cat /etc/stellaops/appsettings.Production.json | jq '.CI' + +# Test connectivity +curl -H "Authorization: Bearer $CI_TOKEN" https://ci.example.com/api/json + +# Update the token +sudo nano /etc/stellaops/appsettings.Production.json +sudo systemctl restart stellaops-platform +``` + +### Kubernetes / Helm +```yaml +# values.yaml +ci: + systems: + - name: jenkins-prod + url: https://ci.example.com + type: jenkins + apiToken: <api-token> # or use existingSecret +``` +```bash +helm upgrade stellaops ./chart -f values.yaml +``` + +## Verification +``` +stella doctor run --check check.integration.ci.system +``` + +## Related Checks +- `check.integration.webhooks` -- validates webhook delivery from CI events +- `check.integration.git` -- validates Git provider reachability (often same host as CI) diff --git a/docs/doctor/articles/integration/git-provider-api.md b/docs/doctor/articles/integration/git-provider-api.md new file mode 100644 index 000000000..4856341e9 --- /dev/null +++ b/docs/doctor/articles/integration/git-provider-api.md @@ -0,0 +1,66 @@ +--- +checkId: check.integration.git +plugin: stellaops.doctor.integration +severity: warn +tags: [connectivity, git, scm] +--- +# Git Provider API + +## What It Checks +Resolves the configured Git provider URL from 
`Git:Url`, `Scm:Url`, `GitHub:Url`, `GitLab:Url`, or `Gitea:Url`. Auto-detects the provider type (GitHub, GitLab, Gitea, Bitbucket, Azure DevOps) from the URL and sends an HTTP GET to the corresponding API endpoint (e.g., GitHub -> `api.github.com`, GitLab -> `/api/v4/version`, Gitea -> `/api/v1/version`, Bitbucket -> `/rest/api/1.0/application-properties`). The check **passes** if the response is 2xx, 401, or 403 (reachable even if auth is needed), **warns** on other non-error status codes, and **fails** on connection errors or exceptions. + +## Why It Matters +Git provider connectivity is essential for source-code scanning, SBOM ingestion, webhook event reception, and commit-status reporting. A misconfigured or unreachable Git URL silently breaks SCM-triggered workflows and prevents evidence collection from source repositories. + +## Common Causes +- Git provider URL is incorrect or has a trailing-path typo +- Network connectivity issues or DNS failure +- Git provider service is down or undergoing maintenance +- Provider uses a non-standard API path + +## How to Fix + +### Docker Compose +```bash +# Check current Git URL (case-insensitive: keys may be written as e.g. Git__Url) +grep -i 'GIT__URL\|SCM__URL\|GITHUB__URL\|GITLAB__URL\|GITEA__URL' .env + +# Test from inside the network +docker compose exec gateway curl -sv https://git.example.com/api/v4/version + +# Update the URL +echo 'Git__Url=https://git.example.com' >> .env +docker compose restart gateway +``` + +### Bare Metal / systemd +```bash +# Verify configuration +cat /etc/stellaops/appsettings.Production.json | jq '.Git' + +# Test connectivity +curl -v https://git.example.com/api/v4/version + +# Fix the URL +sudo nano /etc/stellaops/appsettings.Production.json +sudo systemctl restart stellaops-platform +``` + +### Kubernetes / Helm +```yaml +# values.yaml +git: + url: https://git.example.com +``` +```bash +helm upgrade stellaops ./chart -f values.yaml +``` + +## Verification +``` +stella doctor run --check check.integration.git +``` + +## Related Checks +- `check.integration.ci.system` 
-- CI systems often share the same Git host +- `check.integration.webhooks` -- webhook endpoints receive events from Git providers diff --git a/docs/doctor/articles/integration/ldap-connectivity.md b/docs/doctor/articles/integration/ldap-connectivity.md new file mode 100644 index 000000000..67137730c --- /dev/null +++ b/docs/doctor/articles/integration/ldap-connectivity.md @@ -0,0 +1,72 @@ +--- +checkId: check.integration.ldap +plugin: stellaops.doctor.integration +severity: warn +tags: [connectivity, ldap, directory, auth] +--- +# LDAP/AD Connectivity + +## What It Checks +Reads the LDAP host from `Ldap:Host`, `ActiveDirectory:Host`, or `Authority:Ldap:Host` and the port from the corresponding `:Port` key (defaulting to 389, or 636 when `UseSsl` is true). Opens a raw TCP connection to the host and port with a 5-second timeout. The check **passes** if the TCP connection succeeds and **fails** on timeout, socket error, or connection refusal. + +## Why It Matters +LDAP or Active Directory integration is used for user authentication, group synchronization, and role mapping. If the LDAP server is unreachable, users cannot log in via directory credentials, group-based access policies cannot be evaluated, and new user provisioning stops. This directly impacts operator access to the control plane. 
+ +## Common Causes +- LDAP/AD server is not running or is being restarted +- Firewall blocking LDAP port (389) or LDAPS port (636) +- DNS resolution failure for the LDAP hostname +- Network unreachable between Stella Ops and the directory server +- Incorrect host or port in configuration + +## How to Fix + +### Docker Compose +```bash +# Check LDAP configuration (case-insensitive: keys may be written as e.g. Ldap__Host) +grep -i 'LDAP__\|ACTIVEDIRECTORY__' .env + +# Test TCP connectivity from the gateway container +docker compose exec gateway bash -c "echo > /dev/tcp/ldap.example.com/389 && echo OK || echo FAIL" + +# Update LDAP host/port +echo 'Ldap__Host=ldap.example.com' >> .env +echo 'Ldap__Port=636' >> .env +echo 'Ldap__UseSsl=true' >> .env +docker compose restart gateway +``` + +### Bare Metal / systemd +```bash +# Verify configuration +cat /etc/stellaops/appsettings.Production.json | jq '.Ldap' + +# Test connectivity +telnet ldap.example.com 389 +# If the connection fails, check DNS resolution +nslookup ldap.example.com + +# Update configuration +sudo nano /etc/stellaops/appsettings.Production.json +sudo systemctl restart stellaops-platform +``` + +### Kubernetes / Helm +```yaml +# values.yaml +ldap: + host: ldap.example.com + port: 636 + useSsl: true +``` +```bash +helm upgrade stellaops ./chart -f values.yaml +``` + +## Verification +``` +stella doctor run --check check.integration.ldap +``` + +## Related Checks +- `check.integration.oidc` -- OIDC provider connectivity (alternative auth mechanism) diff --git a/docs/doctor/articles/integration/object-storage.md b/docs/doctor/articles/integration/object-storage.md new file mode 100644 index 000000000..476e6bbdb --- /dev/null +++ b/docs/doctor/articles/integration/object-storage.md @@ -0,0 +1,73 @@ +--- +checkId: check.integration.s3.storage +plugin: stellaops.doctor.integration +severity: warn +tags: [connectivity, s3, storage] +--- +# Object Storage Connectivity + +## What It Checks +Reads the S3 endpoint from `S3:Endpoint`, `Storage:S3:Endpoint`, or `AWS:S3:ServiceURL`. 
Parses the URI to extract host and port (defaulting to 443 for HTTPS, 80 for HTTP). Opens a raw TCP connection with a 5-second timeout. The check **passes** if the TCP connection succeeds and **fails** on timeout, socket error, invalid URI format, or connection refusal. + +## Why It Matters +S3-compatible object storage is used for evidence packet archival, SBOM storage, offline kit distribution, and large artifact persistence. If the storage endpoint is unreachable, evidence export fails, SBOM uploads are rejected, and offline kit generation cannot complete. This blocks audit compliance workflows and air-gap distribution. + +## Common Causes +- S3 endpoint (MinIO, AWS S3, or compatible) is unreachable +- Network connectivity issues or DNS failure +- Firewall blocking the storage port +- Invalid endpoint URL format in configuration +- MinIO or S3-compatible service is not running + +## How to Fix + +### Docker Compose +```bash +# Check S3 configuration (case-insensitive: keys may be written as e.g. Storage__S3__Endpoint) +grep -i 'S3__\|STORAGE__S3\|AWS__S3' .env + +# Test connectivity to MinIO +docker compose exec gateway curl -v http://minio:9000/minio/health/live + +# Restart MinIO if stopped +docker compose up -d minio + +# Update endpoint +echo 'S3__Endpoint=http://minio:9000' >> .env +docker compose restart platform +``` + +### Bare Metal / systemd +```bash +# Verify S3 configuration +cat /etc/stellaops/appsettings.Production.json | jq '.S3' + +# Test connectivity +curl -v http://minio.example.com:9000/minio/health/live + +# Check if MinIO is running +sudo systemctl status minio + +# Update configuration +sudo nano /etc/stellaops/appsettings.Production.json +sudo systemctl restart stellaops-platform +``` + +### Kubernetes / Helm +```yaml +# values.yaml +s3: + endpoint: http://minio.storage.svc.cluster.local:9000 + bucket: stellaops-evidence +``` +```bash +helm upgrade stellaops ./chart -f values.yaml +``` + +## Verification +``` +stella doctor run --check check.integration.s3.storage +``` + +## Related Checks +- 
`check.integration.oci.registry` -- OCI registries may also store artifacts diff --git a/docs/doctor/articles/integration/oci-registry-connectivity.md b/docs/doctor/articles/integration/oci-registry-connectivity.md new file mode 100644 index 000000000..2ca02ec05 --- /dev/null +++ b/docs/doctor/articles/integration/oci-registry-connectivity.md @@ -0,0 +1,70 @@ +--- +checkId: check.integration.oci.registry +plugin: stellaops.doctor.integration +severity: warn +tags: [connectivity, oci, registry] +--- +# OCI Registry Connectivity + +## What It Checks +Reads the registry URL from `OCI:RegistryUrl` or `Registry:Url`. Sends an HTTP GET to `/v2/` (the OCI Distribution Spec base endpoint). The check **passes** if the response is 200 (open registry) or 401 (registry reachable, auth required), **warns** on any other status code, and **fails** on connection errors. + +## Why It Matters +The OCI registry is the central artifact store for container images, SBOMs, attestations, and signatures. If the registry is unreachable, image pulls fail during deployment, SBOM scans cannot fetch manifests, attestation verification cannot retrieve signatures, and promotions are blocked. This is a foundational dependency for nearly every Stella Ops workflow. 
+ +## Common Causes +- Registry URL is incorrect (typo, wrong port, wrong scheme) +- Network connectivity issues between Stella Ops and the registry +- Registry service is down or restarting +- Registry does not support the OCI Distribution spec at `/v2/` +- Registry endpoint is misconfigured (path prefix required) + +## How to Fix + +### Docker Compose +```bash +# Check registry configuration (case-insensitive: keys may be written as e.g. OCI__RegistryUrl) +grep -i 'OCI__REGISTRYURL\|REGISTRY__URL' .env + +# Test the /v2/ endpoint from inside the network +docker compose exec gateway curl -sv https://registry.example.com/v2/ + +# Update registry URL +echo 'OCI__RegistryUrl=https://registry.example.com' >> .env +docker compose restart platform +``` + +### Bare Metal / systemd +```bash +# Verify configuration +cat /etc/stellaops/appsettings.Production.json | jq '.OCI' + +# Test connectivity +curl -v https://registry.example.com/v2/ + +# Fix configuration +sudo nano /etc/stellaops/appsettings.Production.json +sudo systemctl restart stellaops-platform +``` + +### Kubernetes / Helm +```yaml +# values.yaml +oci: + registryUrl: https://registry.example.com +``` +```bash +helm upgrade stellaops ./chart -f values.yaml +``` + +## Verification +``` +stella doctor run --check check.integration.oci.registry +``` + +## Related Checks +- `check.integration.oci.credentials` -- validates registry credentials +- `check.integration.oci.pull` -- verifies pull authorization +- `check.integration.oci.push` -- verifies push authorization +- `check.integration.oci.referrers` -- checks OCI 1.1 referrers API support +- `check.integration.oci.capabilities` -- probes full capability matrix diff --git a/docs/doctor/articles/integration/oidc-provider.md b/docs/doctor/articles/integration/oidc-provider.md new file mode 100644 index 000000000..141b83e32 --- /dev/null +++ b/docs/doctor/articles/integration/oidc-provider.md @@ -0,0 +1,75 @@ +--- +checkId: check.integration.oidc +plugin: stellaops.doctor.integration +severity: warn +tags: [connectivity, oidc, auth, 
identity] +--- +# OIDC Provider + +## What It Checks +Reads the OIDC issuer URL from `Oidc:Issuer`, `Authentication:Oidc:Issuer`, or `Authority:Oidc:Issuer`. Fetches the OpenID Connect discovery document at `/.well-known/openid-configuration`. On a successful response, parses the JSON for three required endpoints: `authorization_endpoint`, `token_endpoint`, and `jwks_uri`. The check **passes** if all three are present, **warns** if the discovery document is incomplete (missing one or more endpoints), **fails** if the discovery endpoint returns a non-success status code, and **fails** on connection errors. + +## Why It Matters +OIDC authentication is the primary identity mechanism for Stella Ops operators and API clients. If the OIDC provider is unreachable or misconfigured, users cannot log in, API tokens cannot be validated, and all authenticated workflows halt. An incomplete discovery document causes subtle failures where some auth flows work but others (e.g., token refresh) silently break. + +## Common Causes +- OIDC issuer URL is incorrect or has a trailing slash issue +- OIDC provider (Authority, Keycloak, Azure AD, etc.) 
is down +- Network connectivity issues between Stella Ops and the identity provider +- Provider does not support OpenID Connect discovery +- Discovery document is missing required endpoints + +## How to Fix + +### Docker Compose +```bash +# Check OIDC configuration +grep -i 'OIDC__ISSUER\|AUTHENTICATION__OIDC' .env + +# Test discovery endpoint +docker compose exec gateway curl -sv \ + https://auth.example.com/.well-known/openid-configuration + +# Verify the Authority service is running +docker compose ps authority + +# Update issuer URL +echo 'Oidc__Issuer=https://auth.example.com' >> .env +docker compose restart gateway platform +``` + +### Bare Metal / systemd +```bash +# Verify configuration +cat /etc/stellaops/appsettings.Production.json | jq '.Oidc' + +# Test discovery +curl -v https://auth.example.com/.well-known/openid-configuration + +# Check required fields in the response +curl -s https://auth.example.com/.well-known/openid-configuration \ + | jq '{authorization_endpoint, token_endpoint, jwks_uri}' + +# Fix configuration +sudo nano /etc/stellaops/appsettings.Production.json +sudo systemctl restart stellaops-platform +``` + +### Kubernetes / Helm +```yaml +# values.yaml +oidc: + issuer: https://auth.example.com + clientId: stellaops-ui +``` +```bash +helm upgrade stellaops ./chart -f values.yaml +``` + +## Verification +``` +stella doctor run --check check.integration.oidc +``` + +## Related Checks +- `check.integration.ldap` -- alternative directory-based authentication diff --git a/docs/doctor/articles/integration/registry-capability-probe.md b/docs/doctor/articles/integration/registry-capability-probe.md new file mode 100644 index 000000000..1c63598c5 --- /dev/null +++ b/docs/doctor/articles/integration/registry-capability-probe.md @@ -0,0 +1,89 @@ +--- +checkId: check.integration.oci.capabilities +plugin: stellaops.doctor.integration +severity: info +tags: [registry, oci, capabilities, compatibility] +--- +# OCI Registry Capability Matrix + +## What It 
Checks +Probes the configured OCI registry for five capabilities using a test repository (`OCI:TestRepository`, default `library/alpine`): + +1. **Distribution version** -- GET `/v2/`, reads `OCI-Distribution-API-Version` or `Docker-Distribution-API-Version` header. +2. **Referrers API** -- GET `/v2/{repository}/referrers/{digest}` with OCI accept header; passes if 200 or if a 404 response contains OCI index JSON. +3. **Chunked upload** -- POST `/v2/{repository}/blobs/uploads/`; passes on 202 Accepted (upload session is immediately cancelled). +4. **Cross-repo mount** -- POST `/v2/{repository}/blobs/uploads/?mount={digest}&from=library/alpine`; passes on 201 Created or 202 Accepted. +5. **Delete support** (manifests and blobs) -- OPTIONS request to check if `DELETE` appears in the `Allow` header. + +Calculates a capability score (N/5). **Warns** if referrers API is unsupported, **info** if any other capability is missing, **passes** if all 5 are supported. **Fails** on connection errors. + +## Why It Matters +Different OCI registries support different subsets of the OCI Distribution Spec. Stella Ops uses referrers for attestation linking, chunked uploads for large SBOMs, cross-repo mounts for efficient promotion, and deletes for garbage collection. Knowing the capability matrix upfront prevents mysterious failures during release operations and allows operators to configure appropriate fallbacks. 
+ +## Common Causes +- Registry does not implement OCI Distribution Spec v1.1 (no referrers API) +- Registry has delete operations disabled by policy +- Chunked upload is disabled in registry configuration +- Cross-repo mount is not supported by the registry implementation +- Registry version is too old for newer OCI features + +## How to Fix + +### Docker Compose +```bash +# Check registry type and version +docker compose exec gateway curl -sv https://registry.example.com/v2/ \ + -o /dev/null 2>&1 | grep -i 'distribution-api-version' + +# If referrers API is missing, consider upgrading the registry +# Harbor 2.6+, Quay 3.12+, ACR, ECR, GCR/Artifact Registry support referrers + +# Enable delete in Harbor +# Update harbor.yml: delete_enabled: true +# Restart Harbor +``` + +### Bare Metal / systemd +```bash +# Test referrers API directly +curl -H "Accept: application/vnd.oci.image.index.v1+json" \ + https://registry.example.com/v2/library/alpine/referrers/sha256:abc... + +# Test chunked upload +curl -X POST https://registry.example.com/v2/test/blobs/uploads/ + +# Enable delete in Docker Distribution +# In /etc/docker/registry/config.yml: +# storage: +# delete: +# enabled: true +sudo systemctl restart docker-registry +``` + +### Kubernetes / Helm +```yaml +# values.yaml (for Harbor) +harbor: + registry: + deleteEnabled: true + +# values.yaml (for Stella Ops) +oci: + registryUrl: https://registry.example.com + testRepository: library/alpine +``` +```bash +helm upgrade stellaops ./chart -f values.yaml +``` + +## Verification +``` +stella doctor run --check check.integration.oci.capabilities +``` + +## Related Checks +- `check.integration.oci.registry` -- basic registry connectivity +- `check.integration.oci.referrers` -- focused referrers API check with digest resolution +- `check.integration.oci.credentials` -- credential validation +- `check.integration.oci.pull` -- pull authorization +- `check.integration.oci.push` -- push authorization diff --git 
a/docs/doctor/articles/integration/registry-credentials.md b/docs/doctor/articles/integration/registry-credentials.md new file mode 100644 index 000000000..802125d4b --- /dev/null +++ b/docs/doctor/articles/integration/registry-credentials.md @@ -0,0 +1,76 @@ +--- +checkId: check.integration.oci.credentials +plugin: stellaops.doctor.integration +severity: fail +tags: [registry, oci, credentials, secrets, auth] +--- +# OCI Registry Credentials + +## What It Checks +Determines the authentication method from configuration: bearer token (`OCI:Token` / `Registry:Token`), basic auth (`OCI:Username` + `OCI:Password` / `Registry:Username` + `Registry:Password`), or anonymous. Immediately **fails** if a username is provided without a password. Then validates credentials by sending an authenticated HTTP GET to `/v2/`. The check **passes** on 200 OK, or on 401 if the response includes a `WWW-Authenticate: Bearer` challenge and basic credentials are configured (OAuth2 token exchange scenario). It **fails** on 401 (invalid credentials) or 403 (forbidden), and **fails** on connection errors or timeouts. + +## Why It Matters +Invalid or expired registry credentials cause image pull/push failures across all deployment pipelines. Because credentials are often rotated on a schedule, this check provides early detection of expired tokens before they silently break promotions, SBOM ingestion, or attestation storage. A username-without-password misconfiguration indicates a secret reference that failed to resolve. 
+ +## Common Causes +- Credentials are invalid or have been rotated without updating the configuration +- Token has been revoked by the registry administrator +- Username provided without a corresponding password (broken secret reference) +- Service account token expired +- IP address or network not in the registry's allowlist + +## How to Fix + +### Docker Compose +```bash +# Check credential configuration +grep -i 'OCI__USERNAME\|OCI__PASSWORD\|OCI__TOKEN\|REGISTRY__' .env + +# Test credentials manually +docker login registry.example.com + +# Rotate credentials +echo 'OCI__Username=stellaops-svc' >> .env +echo 'OCI__Password={password}' >> .env +docker compose restart platform +``` + +### Bare Metal / systemd +```bash +# Check credential configuration +cat /etc/stellaops/appsettings.Production.json | jq '.OCI | {Username, Password: (if .Password then "****" else null end), Token: (if .Token then "****" else null end)}' + +# Test with curl +curl -u stellaops-svc:{password} https://registry.example.com/v2/ + +# Update credentials +sudo nano /etc/stellaops/appsettings.Production.json +sudo systemctl restart stellaops-platform +``` + +### Kubernetes / Helm +```yaml +# values.yaml +oci: + registryUrl: https://registry.example.com + existingSecret: stellaops-registry-creds # Secret with username/password keys +``` +```bash +# Create or update the secret +kubectl create secret generic stellaops-registry-creds \ + --from-literal=username=stellaops-svc \ + --from-literal=password={password} \ + --dry-run=client -o yaml | kubectl apply -f - + +helm upgrade stellaops ./chart -f values.yaml +``` + +## Verification +``` +stella doctor run --check check.integration.oci.credentials +``` + +## Related Checks +- `check.integration.oci.registry` -- basic connectivity (does not test auth) +- `check.integration.oci.pull` -- verifies pull authorization with these credentials +- `check.integration.oci.push` -- verifies push authorization with these credentials diff --git 
a/docs/doctor/articles/integration/registry-pull-authorization.md b/docs/doctor/articles/integration/registry-pull-authorization.md new file mode 100644 index 000000000..c0e970273 --- /dev/null +++ b/docs/doctor/articles/integration/registry-pull-authorization.md @@ -0,0 +1,72 @@ +--- +checkId: check.integration.oci.pull +plugin: stellaops.doctor.integration +severity: fail +tags: [registry, oci, pull, authorization, credentials] +--- +# OCI Registry Pull Authorization + +## What It Checks +Sends an authenticated HTTP HEAD request to `/v2/{repository}/manifests/{tag}` with OCI and Docker manifest accept headers. Uses the test repository from `OCI:TestRepository` (default `library/alpine`) and test tag from `OCI:TestTag` (default `latest`). The check **passes** on 2xx (records manifest digest and content type), returns **info** on 404 (test image not found -- cannot verify), **fails** on 401 (invalid credentials), **fails** on 403 (valid credentials but no pull permission), and **fails** on connection errors or timeouts. + +## Why It Matters +Pull authorization is the most fundamental registry operation. Stella Ops pulls images for scanning, SBOM extraction, attestation verification, and deployment. If pull authorization fails, the entire image-based workflow is blocked. This check tests actual pull permissions rather than just credential validity, catching permission misconfigurations that `check.integration.oci.credentials` cannot detect. 
+ +## Common Causes +- Credentials are invalid or expired +- Token has been revoked +- Anonymous pull is not allowed and no credentials are configured +- Service account has been removed from the repository's access list +- Repository access restricted by IP, network, or organization policy +- Test image does not exist in the registry (404 -- configure `OCI:TestRepository`) + +## How to Fix + +### Docker Compose +```bash +# Test pull manually +docker pull registry.example.com/library/alpine:latest + +# Check configured test repository +grep -i 'OCI__TESTREPOSITORY\|REGISTRY__TESTREPOSITORY' .env + +# Set a valid test image that exists in your registry +echo 'OCI__TestRepository=myorg/base-image' >> .env +echo 'OCI__TestTag=latest' >> .env +docker compose restart platform +``` + +### Bare Metal / systemd +```bash +# Test pull authorization with curl +curl -I -H "Accept: application/vnd.oci.image.manifest.v1+json" \ + -u stellaops-svc:{password} \ + https://registry.example.com/v2/library/alpine/manifests/latest + +# Configure a test image that exists in your registry +sudo nano /etc/stellaops/appsettings.Production.json +# Set OCI:TestRepository and OCI:TestTag +sudo systemctl restart stellaops-platform +``` + +### Kubernetes / Helm +```yaml +# values.yaml +oci: + registryUrl: https://registry.example.com + testRepository: myorg/base-image + testTag: latest +``` +```bash +helm upgrade stellaops ./chart -f values.yaml +``` + +## Verification +``` +stella doctor run --check check.integration.oci.pull +``` + +## Related Checks +- `check.integration.oci.credentials` -- validates credential configuration and token validity +- `check.integration.oci.push` -- verifies push authorization +- `check.integration.oci.registry` -- basic registry connectivity diff --git a/docs/doctor/articles/integration/registry-push-authorization.md b/docs/doctor/articles/integration/registry-push-authorization.md new file mode 100644 index 000000000..c91d79105 --- /dev/null +++ 
b/docs/doctor/articles/integration/registry-push-authorization.md @@ -0,0 +1,74 @@ +--- +checkId: check.integration.oci.push +plugin: stellaops.doctor.integration +severity: fail +tags: [registry, oci, push, authorization, credentials] +--- +# OCI Registry Push Authorization + +## What It Checks +Sends an authenticated HTTP POST to `/v2/{repository}/blobs/uploads/` to initiate a blob upload session. Uses the test repository from `OCI:TestRepository` or `OCI:PushTestRepository` (default `stellaops/doctor-test`). Only runs if credentials are configured. The check **passes** on 202 Accepted (the upload session is immediately cancelled by sending a DELETE to the returned Location header), **fails** on 401 (invalid credentials), **fails** on 403 (valid credentials but no push permission), and **fails** on connection errors or timeouts. No data is actually written to the registry. + +## Why It Matters +Push authorization is required for storing attestations, SBOMs, signatures, and promoted images in the registry. Without push access, Stella Ops cannot attach evidence artifacts to releases, sign images, or complete promotion workflows. This check verifies the actual push permission grant, not just credential validity, using a non-destructive probe that leaves no artifacts behind. 
+ +## Common Causes +- Credentials are valid but lack push (write) permissions +- Repository does not exist and the registry does not support auto-creation +- Service account has read-only access +- Organization or team policy restricts push to specific accounts +- Token has been revoked or expired +- IP or network restrictions prevent write operations + +## How to Fix + +### Docker Compose +```bash +# Test push manually (docker push needs a locally tagged image; it does not read stdin) +docker tag alpine:latest registry.example.com/stellaops/doctor-test:probe && docker push registry.example.com/stellaops/doctor-test:probe + +# Grant push permissions to the service account in your registry UI + +# Set a writable test repository +echo 'OCI__PushTestRepository=myorg/stellaops-test' >> .env +docker compose restart platform +``` + +### Bare Metal / systemd +```bash +# Test push authorization with curl +curl -X POST \ + -u stellaops-svc:{password} \ + https://registry.example.com/v2/stellaops/doctor-test/blobs/uploads/ + +# Expected: 202 Accepted with Location header + +# Fix permissions in registry +# Harbor: Add stellaops-svc as Developer/Admin to the project +# GitLab: Grant Developer+ role to the service account +# ECR: Attach ecr:InitiateLayerUpload policy + +sudo systemctl restart stellaops-platform +``` + +### Kubernetes / Helm +```yaml +# values.yaml +oci: + registryUrl: https://registry.example.com + pushTestRepository: myorg/stellaops-test + existingSecret: stellaops-registry-creds +``` +```bash +helm upgrade stellaops ./chart -f values.yaml +``` + +## Verification +``` +stella doctor run --check check.integration.oci.push +``` + +## Related Checks +- `check.integration.oci.credentials` -- validates credential configuration and token validity +- `check.integration.oci.pull` -- verifies pull authorization +- `check.integration.oci.registry` -- basic registry connectivity diff --git a/docs/doctor/articles/integration/registry-referrers-api.md 
b/docs/doctor/articles/integration/registry-referrers-api.md new file mode 100644 index 000000000..50b0e992d --- /dev/null +++ 
b/docs/doctor/articles/integration/registry-referrers-api.md @@ -0,0 +1,82 @@ +--- +checkId: check.integration.oci.referrers +plugin: stellaops.doctor.integration +severity: warn +tags: [registry, oci, referrers, compatibility, oci-1.1] +--- +# OCI Registry Referrers API Support + +## What It Checks +First resolves the manifest digest for the test image (`OCI:TestRepository`:`OCI:TestTag`, defaults to `library/alpine:latest`) by sending a HEAD request to the manifests endpoint and reading the `Docker-Content-Digest` header. Then probes the referrers API at `/v2/{repository}/referrers/{digest}` with the `application/vnd.oci.image.index.v1+json` accept header. The check **passes** on 200 OK or on 404 if the response body contains OCI index JSON (valid response meaning no referrers exist yet). It **warns** on 404 without OCI index (API not supported, tag-based fallback required) or 405 Method Not Allowed. Returns **info** if the test image is not found (cannot verify). **Fails** on connection errors. + +## Why It Matters +The OCI 1.1 referrers API enables artifact linking: attaching SBOMs, signatures, attestations, and VEX documents directly to container image manifests. Without it, Stella Ops must fall back to the tag-based referrer pattern (`sha256-{digest}.{artifactType}`), which is less efficient, harder to discover, and may conflict with registry tag naming policies. Knowing referrers API availability determines which linking strategy is used. 
+ +## Common Causes +- Registry does not implement OCI Distribution Spec v1.1 +- Registry version is too old (pre-referrers API) +- Referrers API disabled in registry configuration +- Test image does not exist in registry (cannot resolve digest to probe) +- Credentials lack pull permissions for the test image + +## How to Fix + +### Docker Compose +```bash +# Check registry version and referrers support +docker compose exec gateway curl -sv \ + -H "Accept: application/vnd.oci.image.index.v1+json" \ + https://registry.example.com/v2/library/alpine/referrers/sha256:abc... + +# Upgrade registry to a version supporting OCI 1.1 referrers: +# - Harbor 2.6+ +# - Quay 3.12+ +# - ACR (default) +# - ECR (default) +# - GCR/Artifact Registry (default) +# - Distribution 2.8+ +``` + +### Bare Metal / systemd +```bash +# Verify registry version +curl -I https://registry.example.com/v2/ 2>&1 | grep -i distribution + +# Test referrers API +DIGEST=$(curl -sI -H "Accept: application/vnd.oci.image.manifest.v1+json" \ + https://registry.example.com/v2/library/alpine/manifests/latest \ + | grep Docker-Content-Digest | awk '{print $2}' | tr -d '\r') + +curl -H "Accept: application/vnd.oci.image.index.v1+json" \ + https://registry.example.com/v2/library/alpine/referrers/$DIGEST + +# Upgrade the registry package +sudo apt upgrade docker-registry # or equivalent +sudo systemctl restart docker-registry +``` + +### Kubernetes / Helm +```yaml +# Upgrade Harbor chart +helm upgrade harbor harbor/harbor --set registry.referrers.enabled=true + +# Or configure Stella Ops with a test image that exists +# values.yaml +oci: + registryUrl: https://registry.example.com + testRepository: myorg/base-image + testTag: latest +``` +```bash +helm upgrade stellaops ./chart -f values.yaml +``` + +## Verification +``` +stella doctor run --check check.integration.oci.referrers +``` + +## Related Checks +- `check.integration.oci.capabilities` -- broader capability matrix including referrers +- 
`check.integration.oci.registry` -- basic registry connectivity +- `check.integration.oci.pull` -- pull authorization (needed to resolve test image digest) diff --git a/docs/doctor/articles/integration/secrets-manager-connectivity.md b/docs/doctor/articles/integration/secrets-manager-connectivity.md new file mode 100644 index 000000000..9c4faa476 --- /dev/null +++ b/docs/doctor/articles/integration/secrets-manager-connectivity.md @@ -0,0 +1,89 @@ +--- +checkId: check.integration.secrets.manager +plugin: stellaops.doctor.integration +severity: fail +tags: [integration, secrets, vault, security, keyvault] +--- +# Secrets Manager Connectivity + +## What It Checks +Iterates over all secrets managers defined under `Secrets:Managers` (or the legacy `Secrets:Vault:Url` / `Vault:Url` single-manager key). For each manager it sends an HTTP GET to a type-specific health endpoint: Vault uses `/v1/sys/health?standbyok=true&sealedcode=200&uninitcode=200`, Azure Key Vault uses `/healthstatus`, and others use `/health`. Sets the appropriate auth header (`X-Vault-Token` for Vault, `Bearer` for others). Records reachability, authentication success, and latency. For Vault, parses the response JSON for `sealed`, `initialized`, and `version` fields. The check **fails** if any manager is unreachable or returns 401/403, **fails** if any Vault instance is sealed, and **passes** if all managers are healthy and unsealed. + +## Why It Matters +Secrets managers store registry credentials, signing keys, API tokens, and encryption keys. If a secrets manager is unreachable, Stella Ops cannot retrieve credentials for deployments, cannot sign attestations, and cannot decrypt sensitive configuration. A sealed Vault is equally critical: all secret reads fail until it is manually unsealed. This is a hard blocker for any release operation. 
+ +## Common Causes +- Secrets manager service is down or restarting +- Network connectivity issue between Stella Ops and the secrets manager +- Authentication token has expired or been revoked +- TLS certificate issue (expired, untrusted CA) +- Vault was restarted and needs manual unseal +- Vault auto-seal triggered due to HSM connectivity loss + +## How to Fix + +### Docker Compose +```bash +# Check secrets manager configuration +grep -i 'SECRETS__\|VAULT__' .env + +# Test Vault health +docker compose exec gateway curl -sv \ + http://vault:8200/v1/sys/health + +# Unseal Vault if sealed +docker compose exec vault vault operator unseal +docker compose exec vault vault operator unseal +docker compose exec vault vault operator unseal + +# Refresh Vault token +docker compose exec vault vault token create -policy=stellaops +echo 'Secrets__Managers__0__Token={vault-token}' >> .env +docker compose restart platform +``` + +### Bare Metal / systemd +```bash +# Check Vault status +vault status + +# Unseal if needed +vault operator unseal + +# Renew the Vault token +vault token renew + +# Check Azure Key Vault health +curl -v https://myvault.vault.azure.net/healthstatus + +# Update configuration +sudo nano /etc/stellaops/appsettings.Production.json +sudo systemctl restart stellaops-platform +``` + +### Kubernetes / Helm +```yaml +# values.yaml +secrets: + managers: + - name: vault-prod + url: http://vault.vault.svc.cluster.local:8200 + type: vault + existingSecret: stellaops-vault-token +``` +```bash +# Update Vault token secret +kubectl create secret generic stellaops-vault-token \ + --from-literal=token={vault-token} \ + --dry-run=client -o yaml | kubectl apply -f - + +helm upgrade stellaops ./chart -f values.yaml +``` + +## Verification +``` +stella doctor run --check check.integration.secrets.manager +``` + +## Related Checks +- `check.integration.oci.credentials` -- registry credentials that may be sourced from the secrets manager diff --git 
a/docs/doctor/articles/integration/slack-webhook.md 
b/docs/doctor/articles/integration/slack-webhook.md new file mode 100644 index 000000000..cff4c0b27 --- /dev/null +++ b/docs/doctor/articles/integration/slack-webhook.md @@ -0,0 +1,74 @@ +--- +checkId: check.integration.slack +plugin: stellaops.doctor.integration +severity: info +tags: [notification, slack, webhook] +--- +# Slack Webhook + +## What It Checks +Reads the Slack webhook URL from `Slack:WebhookUrl` or `Notify:Slack:WebhookUrl`. First validates the URL format: **warns** if the URL does not start with `https://hooks.slack.com/`. Then tests host reachability by sending an HTTP GET to the base URL (`https://hooks.slack.com`). The check **passes** if the Slack host is reachable, **warns** if the host is unreachable or if the URL format is suspicious. Does not send an actual webhook payload to avoid generating noise in the Slack channel. + +## Why It Matters +Slack notifications keep operators informed about deployment status, policy violations, security findings, and approval requests in real time. A misconfigured or unreachable Slack webhook means critical alerts go undelivered, potentially delaying incident response, approval workflows, or security remediation. 
+ +## Common Causes +- Network connectivity issues between Stella Ops and Slack +- Firewall blocking outbound HTTPS to `hooks.slack.com` +- Proxy misconfiguration preventing external HTTPS +- Webhook URL is malformed or points to the wrong service +- Slack webhook URL has been regenerated (old URL invalidated) + +## How to Fix + +### Docker Compose +```bash +# Check Slack webhook configuration +grep -i 'SLACK__WEBHOOKURL\|NOTIFY__SLACK' .env + +# Test connectivity to Slack +docker compose exec gateway curl -sv https://hooks.slack.com/ -o /dev/null + +# Update webhook URL +echo 'Slack__WebhookUrl=https://hooks.slack.com/services/T.../B.../xxx' >> .env +docker compose restart platform + +# If behind a proxy +echo 'HTTP_PROXY=http://proxy:8080' >> .env +echo 'HTTPS_PROXY=http://proxy:8080' >> .env +docker compose restart platform +``` + +### Bare Metal / systemd +```bash +# Verify configuration +cat /etc/stellaops/appsettings.Production.json | jq '.Slack' + +# Test connectivity +curl -sv https://hooks.slack.com/ -o /dev/null + +# Update webhook URL +sudo nano /etc/stellaops/appsettings.Production.json +sudo systemctl restart stellaops-platform +``` + +### Kubernetes / Helm +```yaml +# values.yaml +slack: + webhookUrl: https://hooks.slack.com/services/T.../B.../xxx + # or use an existing secret + existingSecret: stellaops-slack-webhook +``` +```bash +helm upgrade stellaops ./chart -f values.yaml +``` + +## Verification +``` +stella doctor run --check check.integration.slack +``` + +## Related Checks +- `check.integration.teams` -- Microsoft Teams webhook (alternative notification channel) +- `check.integration.webhooks` -- general webhook health monitoring diff --git a/docs/doctor/articles/integration/smtp-connectivity.md b/docs/doctor/articles/integration/smtp-connectivity.md new file mode 100644 index 000000000..3168d8488 --- /dev/null +++ b/docs/doctor/articles/integration/smtp-connectivity.md @@ -0,0 +1,76 @@ +--- +checkId: check.integration.smtp +plugin: 
stellaops.doctor.integration +severity: warn +tags: [connectivity, email, smtp] +--- +# SMTP Email Connectivity + +## What It Checks +Reads the SMTP host from `Smtp:Host`, `Email:Smtp:Host`, or `Notify:Email:Host` and the port from the corresponding `:Port` key (defaulting to 587). Opens a raw TCP connection to the SMTP server with a 5-second timeout. The check **passes** if the TCP connection succeeds, **fails** on timeout, socket error, DNS failure, or connection refusal. + +## Why It Matters +Email notifications deliver approval requests, security alerts, deployment summaries, and audit reports to operators who may not be monitoring Slack or the web UI. If the SMTP server is unreachable, these notifications silently fail. For organizations with compliance requirements, email delivery may be the mandated audit notification channel. + +## Common Causes +- SMTP server is not running or is being restarted +- Firewall blocking SMTP port (25, 465, or 587) +- DNS resolution failure for the SMTP hostname +- Network unreachable between Stella Ops and the mail server +- Incorrect host or port in configuration +- ISP/cloud provider blocking outbound SMTP + +## How to Fix + +### Docker Compose +```bash +# Check SMTP configuration +grep -i 'SMTP__\|EMAIL__SMTP\|NOTIFY__EMAIL' .env + +# Test TCP connectivity +docker compose exec gateway bash -c \ + "echo > /dev/tcp/smtp.example.com/587 && echo OK || echo FAIL" + +# Update SMTP settings +echo 'Smtp__Host=smtp.example.com' >> .env +echo 'Smtp__Port=587' >> .env +echo 'Smtp__UseSsl=true' >> .env +docker compose restart platform +``` + +### Bare Metal / systemd +```bash +# Verify configuration +cat /etc/stellaops/appsettings.Production.json | jq '.Smtp' + +# Test connectivity +telnet smtp.example.com 587 +# If the connection fails, verify DNS resolution +nslookup smtp.example.com + +# Update configuration +sudo nano /etc/stellaops/appsettings.Production.json +sudo systemctl restart stellaops-platform +``` + +### Kubernetes / Helm 
+```yaml +# values.yaml +smtp: + host: 
smtp.example.com + port: 587 + useSsl: true + existingSecret: stellaops-smtp-creds # Secret with username/password +``` +```bash +helm upgrade stellaops ./chart -f values.yaml +``` + +## Verification +``` +stella doctor run --check check.integration.smtp +``` + +## Related Checks +- `check.integration.slack` -- Slack notifications (alternative channel) +- `check.integration.teams` -- Teams notifications (alternative channel) diff --git a/docs/doctor/articles/integration/teams-webhook.md b/docs/doctor/articles/integration/teams-webhook.md new file mode 100644 index 000000000..fcca6257d --- /dev/null +++ b/docs/doctor/articles/integration/teams-webhook.md @@ -0,0 +1,75 @@ +--- +checkId: check.integration.teams +plugin: stellaops.doctor.integration +severity: info +tags: [notification, teams, webhook] +--- +# Teams Webhook + +## What It Checks +Reads the Microsoft Teams webhook URL from `Teams:WebhookUrl` or `Notify:Teams:WebhookUrl`. First validates the URL format: **warns** if the URL does not contain `webhook.office.com` or `teams.microsoft.com`. Then tests host reachability by sending an HTTP GET to the base URL of the webhook host. The check **passes** if the Teams host is reachable, **warns** if the host is unreachable or if the URL format is suspicious. Does not send an actual webhook payload to avoid generating noise in the Teams channel. + +## Why It Matters +Microsoft Teams notifications keep operators informed about deployment status, policy violations, security findings, and approval requests. A misconfigured or unreachable Teams webhook means critical alerts go undelivered, potentially delaying incident response and approval workflows. For organizations standardized on Microsoft 365, Teams may be the primary notification channel. 
+ +## Common Causes +- Network connectivity issues between Stella Ops and Microsoft services +- Firewall blocking outbound HTTPS to `webhook.office.com` +- Proxy misconfiguration preventing external HTTPS +- Webhook URL is malformed or was copied incorrectly +- Teams webhook connector has been removed or regenerated +- Microsoft has migrated to a new webhook URL domain + +## How to Fix + +### Docker Compose +```bash +# Check Teams webhook configuration +grep -i 'TEAMS__WEBHOOKURL\|NOTIFY__TEAMS' .env + +# Test connectivity to Teams webhook host +docker compose exec gateway curl -sv https://webhook.office.com/ -o /dev/null + +# Update webhook URL +echo 'Teams__WebhookUrl=https://webhook.office.com/webhookb2/...' >> .env +docker compose restart platform + +# If behind a proxy +echo 'HTTP_PROXY=http://proxy:8080' >> .env +echo 'HTTPS_PROXY=http://proxy:8080' >> .env +docker compose restart platform +``` + +### Bare Metal / systemd +```bash +# Verify configuration +cat /etc/stellaops/appsettings.Production.json | jq '.Teams' + +# Test connectivity +curl -sv https://webhook.office.com/ -o /dev/null + +# Update webhook URL +sudo nano /etc/stellaops/appsettings.Production.json +sudo systemctl restart stellaops-platform +``` + +### Kubernetes / Helm +```yaml +# values.yaml +teams: + webhookUrl: https://webhook.office.com/webhookb2/... 
+ # or use an existing secret + existingSecret: stellaops-teams-webhook +``` +```bash +helm upgrade stellaops ./chart -f values.yaml +``` + +## Verification +``` +stella doctor run --check check.integration.teams +``` + +## Related Checks +- `check.integration.slack` -- Slack webhook (alternative notification channel) +- `check.integration.webhooks` -- general webhook health monitoring diff --git a/docs/doctor/articles/integration/webhook-health.md b/docs/doctor/articles/integration/webhook-health.md new file mode 100644 index 000000000..ffbabde40 --- /dev/null +++ b/docs/doctor/articles/integration/webhook-health.md @@ -0,0 +1,77 @@ +--- +checkId: check.integration.webhooks +plugin: stellaops.doctor.integration +severity: warn +tags: [integration, webhooks, notifications, events] +--- +# Integration Webhook Health + +## What It Checks +Iterates over all webhook endpoints defined under `Webhooks:Endpoints`. For **outbound** webhooks it sends an HTTP HEAD request to the target URL and considers the endpoint reachable if the response status code is below 500. For **inbound** webhooks it marks reachability as true (endpoint is local). It then calculates the delivery failure rate from `TotalDeliveries` and `SuccessfulDeliveries` counters. The check **fails** if any outbound endpoint is unreachable or if any webhook's failure rate exceeds 20%, **warns** if any webhook's failure rate is between 5% and 20%, and **passes** otherwise. + +## Why It Matters +Webhooks are the primary event-driven communication channel between Stella Ops and external systems. Unreachable outbound endpoints mean notifications, CI triggers, and audit event deliveries silently fail. A rising failure rate is an early warning of endpoint degradation that can cascade into missed alerts, delayed approvals, and incomplete audit trails. 
+ +## Common Causes +- Webhook endpoint is down or returning 5xx errors +- Network connectivity issue or DNS resolution failure +- TLS certificate expired or untrusted +- Payload format changed causing receiver to reject events +- Rate limiting by the receiving service +- Intermittent timeouts under load + +## How to Fix + +### Docker Compose +```bash +# List configured webhooks (case-insensitive: keys are written as Webhooks__Endpoints__N__Url) +grep -i 'WEBHOOKS__' .env + +# Test an outbound webhook endpoint +docker compose exec gateway curl -I https://hooks.example.com/stellaops + +# View webhook delivery logs +docker compose logs platform | grep -i webhook + +# Update a webhook URL +echo 'Webhooks__Endpoints__0__Url=https://hooks.example.com/v2/stellaops' >> .env +docker compose restart platform +``` + +### Bare Metal / systemd +```bash +# Check webhook configuration +cat /etc/stellaops/appsettings.Production.json | jq '.Webhooks' + +# Test endpoint connectivity +curl -I https://hooks.example.com/stellaops + +# Review delivery history +stella webhooks logs --status failed + +# Retry failed deliveries +stella webhooks retry +``` + +### Kubernetes / Helm +```yaml +# values.yaml +webhooks: + endpoints: + - name: slack-releases + url: https://hooks.example.com/stellaops + direction: outbound +``` +```bash +helm upgrade stellaops ./chart -f values.yaml +``` + +## Verification +``` +stella doctor run --check check.integration.webhooks +``` + +## Related Checks +- `check.integration.slack` -- Slack-specific webhook validation +- `check.integration.teams` -- Teams-specific webhook validation +- `check.integration.ci.system` -- CI systems that receive webhook events diff --git a/docs/doctor/articles/notify/email-configured.md b/docs/doctor/articles/notify/email-configured.md new file mode 100644 index 000000000..09e704d70 --- /dev/null +++ b/docs/doctor/articles/notify/email-configured.md @@ -0,0 +1,86 @@ +--- +checkId: check.notify.email.configured +plugin: stellaops.doctor.notify +severity: warn +tags: [notify, email, smtp, quick, 
configuration] +--- +# Email Configuration + +## What It Checks +Verifies that the email (SMTP) notification channel is properly configured. The check reads the `Notify:Channels:Email` configuration section and validates: + +- **SMTP host** (`SmtpHost` or `Host`): must be set and non-empty. +- **SMTP port** (`SmtpPort` or `Port`): must be a valid number between 1 and 65535. +- **From address** (`FromAddress` or `From`): must be set so outbound emails have a valid sender. +- **Enabled flag** (`Enabled`): if explicitly set to `false`, reports a warning that the channel is configured but disabled. + +The check only runs when the `Notify:Channels:Email` configuration section exists. + +## Why It Matters +Email notifications deliver critical alerts for release gate failures, policy violations, and security findings. Without a properly configured SMTP host, no email notifications can be sent, leaving operators blind to events that require immediate action. A missing from-address causes emails to be rejected by receiving mail servers. 
+ +## Common Causes +- SMTP host not set in configuration +- Missing `Notify:Channels:Email:SmtpHost` setting +- SMTP port not specified or set to an invalid value +- From address not configured +- Email channel explicitly disabled in configuration + +## How to Fix + +### Docker Compose +Add environment variables to your service definition: + +```yaml +environment: + Notify__Channels__Email__SmtpHost: "smtp.example.com" + Notify__Channels__Email__SmtpPort: "587" + Notify__Channels__Email__FromAddress: "noreply@example.com" + Notify__Channels__Email__UseSsl: "true" +``` + +### Bare Metal / systemd +Edit `appsettings.json`: + +```json +{ + "Notify": { + "Channels": { + "Email": { + "SmtpHost": "smtp.example.com", + "SmtpPort": 587, + "FromAddress": "noreply@example.com", + "UseSsl": true + } + } + } +} +``` + +Restart the service: +```bash +sudo systemctl restart stellaops-notify +``` + +### Kubernetes / Helm +Set values in your Helm `values.yaml`: + +```yaml +notify: + channels: + email: + smtpHost: "smtp.example.com" + smtpPort: 587 + fromAddress: "noreply@example.com" + useSsl: true + credentialsSecret: "stellaops-smtp-credentials" +``` + +## Verification +``` +stella doctor run --check check.notify.email.configured +``` + +## Related Checks +- `check.notify.email.connectivity` — tests whether the configured SMTP server is reachable +- `check.notify.queue.health` — verifies the notification delivery queue is healthy diff --git a/docs/doctor/articles/notify/email-connectivity.md b/docs/doctor/articles/notify/email-connectivity.md new file mode 100644 index 000000000..da157bc35 --- /dev/null +++ b/docs/doctor/articles/notify/email-connectivity.md @@ -0,0 +1,78 @@ +--- +checkId: check.notify.email.connectivity +plugin: stellaops.doctor.notify +severity: warn +tags: [notify, email, smtp, connectivity, network] +--- +# Email Connectivity + +## What It Checks +Verifies that the configured SMTP server is reachable by opening a TCP connection to the SMTP host and port. 
The check: + +- Opens a TCP socket to `SmtpHost:SmtpPort` with a 10-second timeout. +- Reads the SMTP banner and verifies it starts with `220` (standard SMTP greeting). +- Reports an info-level result if the connection succeeds but the banner is not a recognized SMTP response. +- Fails if the connection times out, is refused, or encounters a socket error. + +The check only runs when both `SmtpHost` and `SmtpPort` are configured with valid values. + +## Why It Matters +A configured but unreachable SMTP server means email notifications will silently fail. Release gate alerts, security finding notifications, and approval requests will never reach operators, potentially delaying incident response. + +## Common Causes +- SMTP server not running +- Wrong host or port in configuration +- Firewall blocking outbound SMTP connections +- DNS resolution failure for the SMTP hostname +- Network latency too high (exceeding 10-second timeout) + +## How to Fix + +### Docker Compose +Verify network connectivity from the container: + +```bash +docker exec <notify-container> nc -zv smtp.example.com 587 +docker exec <notify-container> nslookup smtp.example.com +``` + +Ensure the container network can reach the SMTP server. 
If behind a proxy, configure it: +```yaml +environment: + HTTP_PROXY: "http://proxy.example.com:8080" +``` + +### Bare Metal / systemd +Test connectivity manually: + +```bash +nc -zv smtp.example.com 587 +telnet smtp.example.com 587 +nslookup smtp.example.com +``` + +Check firewall rules: +```bash +sudo iptables -L -n | grep 587 +``` + +### Kubernetes / Helm +Verify connectivity from the pod: + +```bash +kubectl exec -it <notify-pod> -- nc -zv smtp.example.com 587 +``` + +Check NetworkPolicy resources that might block egress: +```bash +kubectl get networkpolicy -n stellaops +``` + +## Verification +``` +stella doctor run --check check.notify.email.connectivity +``` + +## Related Checks +- `check.notify.email.configured` — verifies SMTP configuration is complete +- `check.notify.queue.health` — verifies the notification delivery queue is healthy diff --git a/docs/doctor/articles/notify/queue-health.md b/docs/doctor/articles/notify/queue-health.md new file mode 100644 index 000000000..4cb67ab02 --- /dev/null +++ b/docs/doctor/articles/notify/queue-health.md @@ -0,0 +1,93 @@ +--- +checkId: check.notify.queue.health +plugin: stellaops.doctor.notify +severity: fail +tags: [notify, queue, redis, nats, infrastructure] +--- +# Notification Queue Health + +## What It Checks +Verifies that the notification event and delivery queues are healthy. The check: + +- Reads the `Notify:Queue:Transport` (or `Kind`) setting to determine the queue transport type (Redis/Valkey or NATS). +- Resolves `NotifyQueueHealthCheck` and `NotifyDeliveryQueueHealthCheck` from the DI container. +- Invokes each registered health check and aggregates the results. +- Fails if any queue reports an `Unhealthy` status; warns if degraded; passes if all are healthy. + +The check only runs when a queue transport is configured in `Notify:Queue:Transport`. + +## Why It Matters +The notification queue is the backbone of the notification pipeline. If the event queue is unhealthy, new notification events are lost. 
If the delivery queue is unhealthy, pending notifications to email, Slack, Teams, and webhook channels will not be delivered. This is a severity-fail check because queue failure means complete notification blackout. + +## Common Causes +- Queue server (Redis/Valkey/NATS) not running +- Network connectivity issues between the Notify service and the queue server +- Authentication failure (wrong password or credentials) +- Incorrect connection string in configuration + +## How to Fix + +### Docker Compose +For Redis/Valkey transport: + +```bash +# Check Redis health +docker exec <redis-container> redis-cli ping + +# Check connection string +docker exec <notify-container> env | grep Notify__Queue + +# Restart Redis if needed +docker restart <redis-container> +``` + +For NATS transport: + +```bash +# Check NATS server status +docker exec <nats-container> nats server ping + +# Check NATS logs +docker logs <nats-container> --tail 50 +``` + +### Bare Metal / systemd +```bash +# Redis/Valkey +redis-cli ping +redis-cli info server + +# NATS +nats server ping +systemctl status nats +``` + +Verify the connection string in `appsettings.json`: +```json +{ + "Notify": { + "Queue": { + "Transport": "redis", + "Redis": { + "ConnectionString": "127.1.1.2:6379" + } + } + } +} +``` + +### Kubernetes / Helm +```bash +kubectl exec -it <redis-pod> -- redis-cli ping +kubectl logs <notify-pod> --tail 50 | grep -i queue +``` + +## Verification +``` +stella doctor run --check check.notify.queue.health +``` + +## Related Checks +- `check.notify.email.configured` — verifies email channel configuration +- `check.notify.slack.configured` — verifies Slack channel configuration +- `check.notify.webhook.configured` — verifies webhook channel configuration diff --git a/docs/doctor/articles/notify/slack-configured.md b/docs/doctor/articles/notify/slack-configured.md new file mode 100644 index 000000000..67e2ba45d --- /dev/null +++ b/docs/doctor/articles/notify/slack-configured.md @@ -0,0 +1,72 @@ +--- +checkId: check.notify.slack.configured +plugin: stellaops.doctor.notify +severity: warn +tags: [notify, slack, 
quick, configuration] +--- +# Slack Configuration + +## What It Checks +Verifies that the Slack notification channel is properly configured. The check reads `Notify:Channels:Slack` and validates: + +- **Webhook URL** (`WebhookUrl`): must be set and non-empty. +- **Enabled flag** (`Enabled`): if explicitly `false`, reports a warning that Slack is configured but disabled. + +The check only runs when the `Notify:Channels:Slack` configuration section exists. + +## Why It Matters +Slack is a primary real-time notification channel for many operations teams. Without a configured webhook URL, security alerts, release gate notifications, and approval requests cannot reach Slack channels, delaying incident response. + +## Common Causes +- Slack webhook URL not set in configuration +- Missing `Notify:Channels:Slack:WebhookUrl` setting +- Environment variable not bound to configuration +- Slack notifications explicitly disabled + +## How to Fix + +### Docker Compose +```yaml +environment: + Notify__Channels__Slack__WebhookUrl: "https://hooks.slack.com/services/YOUR/WEBHOOK/URL" +``` + +> **Security note:** Slack webhook URLs are secrets. Store them in a secrets manager or Docker secrets, not in plain-text compose files. 
+ +### Bare Metal / systemd +Edit `appsettings.json`: + +```json +{ + "Notify": { + "Channels": { + "Slack": { + "WebhookUrl": "https://hooks.slack.com/services/YOUR/WEBHOOK/URL" + } + } + } +} +``` + +### Kubernetes / Helm +```yaml +notify: + channels: + slack: + webhookUrlSecret: "stellaops-slack-webhook" +``` + +Create the secret: +```bash +kubectl create secret generic stellaops-slack-webhook \ + --from-literal=url="https://hooks.slack.com/services/YOUR/WEBHOOK/URL" +``` + +## Verification +``` +stella doctor run --check check.notify.slack.configured +``` + +## Related Checks +- `check.notify.slack.connectivity` — tests whether the Slack webhook endpoint is reachable +- `check.notify.queue.health` — verifies the notification delivery queue is healthy diff --git a/docs/doctor/articles/notify/slack-connectivity.md b/docs/doctor/articles/notify/slack-connectivity.md new file mode 100644 index 000000000..da99c4bb4 --- /dev/null +++ b/docs/doctor/articles/notify/slack-connectivity.md @@ -0,0 +1,68 @@ +--- +checkId: check.notify.slack.connectivity +plugin: stellaops.doctor.notify +severity: warn +tags: [notify, slack, connectivity, network] +--- +# Slack Connectivity + +## What It Checks +Verifies that the configured Slack webhook endpoint is reachable. The check: + +- Sends an empty-text POST payload to the webhook URL with a 10-second timeout. +- Slack returns `no_text` for empty messages, which proves the endpoint is alive without posting a visible message. +- Passes if the response is successful or contains `no_text`. +- Warns if an unexpected HTTP status is returned (e.g., invalid or revoked webhook). +- Fails on connection timeout or HTTP request exceptions. + +The check only runs when `Notify:Channels:Slack:WebhookUrl` is set and is a valid absolute URL. + +## Why It Matters +A configured but unreachable Slack webhook means notifications are silently dropped. Teams relying on Slack for release alerts and security findings will miss critical events. 
+ +## Common Causes +- Invalid or expired webhook URL +- Slack workspace configuration changed +- Webhook URL revoked or regenerated +- Rate limiting by Slack +- Firewall blocking outbound HTTPS to hooks.slack.com +- Proxy configuration required but not set + +## How to Fix + +### Docker Compose +Test connectivity from the container: + +```bash +docker exec <notify-container> curl -v https://hooks.slack.com/ +``` + +If behind a proxy: +```yaml +environment: + HTTPS_PROXY: "http://proxy.example.com:8080" +``` + +### Bare Metal / systemd +```bash +curl -v https://hooks.slack.com/ +curl -X POST -H 'Content-type: application/json' \ + --data '{"text":"Doctor test"}' \ + 'https://hooks.slack.com/services/YOUR/WEBHOOK/URL' +``` + +### Kubernetes / Helm +```bash +kubectl exec -it <notify-pod> -- curl -v https://hooks.slack.com/ +``` + +If the webhook URL has been revoked, create a new one in the Slack App settings under **Incoming Webhooks** and update the configuration. + +## Verification +``` +stella doctor run --check check.notify.slack.connectivity +``` + +## Related Checks +- `check.notify.slack.configured` — verifies Slack webhook URL is set +- `check.notify.queue.health` — verifies the notification delivery queue is healthy diff --git a/docs/doctor/articles/notify/teams-configured.md b/docs/doctor/articles/notify/teams-configured.md new file mode 100644 index 000000000..5c6b9a2bc --- /dev/null +++ b/docs/doctor/articles/notify/teams-configured.md @@ -0,0 +1,67 @@ +--- +checkId: check.notify.teams.configured +plugin: stellaops.doctor.notify +severity: warn +tags: [notify, teams, quick, configuration] +--- +# Teams Configuration + +## What It Checks +Verifies that the Microsoft Teams notification channel is properly configured. The check reads `Notify:Channels:Teams` and validates: + +- **Webhook URL** (`WebhookUrl`): must be set and non-empty. +- **URL format**: validates that the URL belongs to a Microsoft domain (`webhook.office.com` or `microsoft.com`). 
+- **Enabled flag** (`Enabled`): if explicitly `false`, reports a warning. + +The check only runs when the `Notify:Channels:Teams` configuration section exists. + +## Why It Matters +Teams is a common enterprise notification channel. Without a valid webhook URL, notifications about release decisions, policy violations, and security findings cannot reach Teams channels. + +## Common Causes +- Teams webhook URL not set in configuration +- Webhook URL is not from a Microsoft domain (malformed or legacy URL) +- Teams notifications explicitly disabled +- Environment variable not bound to configuration + +## How to Fix + +### Docker Compose +```yaml +environment: + Notify__Channels__Teams__WebhookUrl: "https://YOUR_TENANT.webhook.office.com/webhookb2/..." +``` + +> **Security note:** Teams webhook URLs are secrets. Use Docker secrets or a vault. + +### Bare Metal / systemd +```json +{ + "Notify": { + "Channels": { + "Teams": { + "WebhookUrl": "https://YOUR_TENANT.webhook.office.com/webhookb2/..." + } + } + } +} +``` + +### Kubernetes / Helm +```yaml +notify: + channels: + teams: + webhookUrlSecret: "stellaops-teams-webhook" +``` + +To create the webhook in Teams: Channel > Connectors > Incoming Webhook > Create. 
+ +## Verification +``` +stella doctor run --check check.notify.teams.configured +``` + +## Related Checks +- `check.notify.teams.connectivity` — tests whether the Teams webhook endpoint is reachable +- `check.notify.queue.health` — verifies the notification delivery queue is healthy diff --git a/docs/doctor/articles/notify/teams-connectivity.md b/docs/doctor/articles/notify/teams-connectivity.md new file mode 100644 index 000000000..4fa6740d7 --- /dev/null +++ b/docs/doctor/articles/notify/teams-connectivity.md @@ -0,0 +1,60 @@ +--- +checkId: check.notify.teams.connectivity +plugin: stellaops.doctor.notify +severity: warn +tags: [notify, teams, connectivity, network] +--- +# Teams Connectivity + +## What It Checks +Verifies that the configured Microsoft Teams webhook endpoint is reachable. The check: + +- Sends a minimal Adaptive Card payload to the webhook URL with a 10-second timeout. +- Passes if the response is successful (HTTP 2xx). +- Warns if an unexpected HTTP status is returned (invalid, expired, or revoked webhook). +- Fails on connection timeout or HTTP request exceptions. + +The check only runs when `Notify:Channels:Teams:WebhookUrl` is set and is a valid absolute URL. + +## Why It Matters +An unreachable Teams webhook means notifications silently fail to deliver. Operations teams will miss release alerts and security findings if the webhook is broken. + +## Common Causes +- Invalid or expired webhook URL +- Teams connector disabled or deleted +- Microsoft 365 tenant configuration changed +- Firewall blocking outbound HTTPS to webhook.office.com +- Proxy configuration required + +## How to Fix + +### Docker Compose +```bash +docker exec curl -v https://webhook.office.com/ +``` + +### Bare Metal / systemd +```bash +curl -v https://webhook.office.com/ +curl -H 'Content-Type: application/json' \ + -d '{"text":"Doctor test"}' \ + 'https://YOUR_TENANT.webhook.office.com/webhookb2/...' +``` + +Check Microsoft 365 service status at https://status.office.com. 
+ +### Kubernetes / Helm +```bash +kubectl exec -it -- curl -v https://webhook.office.com/ +``` + +If the webhook is broken, recreate it: Teams channel > Connectors > Incoming Webhook > delete and recreate. + +## Verification +``` +stella doctor run --check check.notify.teams.connectivity +``` + +## Related Checks +- `check.notify.teams.configured` — verifies Teams webhook URL is set and valid +- `check.notify.queue.health` — verifies the notification delivery queue is healthy diff --git a/docs/doctor/articles/notify/webhook-configured.md b/docs/doctor/articles/notify/webhook-configured.md new file mode 100644 index 000000000..ae94c0b34 --- /dev/null +++ b/docs/doctor/articles/notify/webhook-configured.md @@ -0,0 +1,68 @@ +--- +checkId: check.notify.webhook.configured +plugin: stellaops.doctor.notify +severity: warn +tags: [notify, webhook, quick, configuration] +--- +# Webhook Configuration + +## What It Checks +Verifies that the generic webhook notification channel is properly configured. The check reads `Notify:Channels:Webhook` and validates: + +- **URL** (`Url` or `Endpoint`): must be set and be a valid HTTP or HTTPS URL. +- **Enabled flag** (`Enabled`): if explicitly `false`, reports a warning. +- Also reads `Method` (defaults to POST) and `ContentType` (defaults to application/json) for evidence. + +The check only runs when the `Notify:Channels:Webhook` configuration section exists. + +## Why It Matters +Generic webhooks integrate Stella Ops notifications with third-party systems (PagerDuty, OpsGenie, custom dashboards, SIEM tools). A missing or malformed URL prevents these integrations from receiving events. 
+ +## Common Causes +- Webhook URL not set in configuration +- Malformed URL (missing protocol `http://` or `https://`) +- Invalid characters in URL +- Webhook channel explicitly disabled + +## How to Fix + +### Docker Compose +```yaml +environment: + Notify__Channels__Webhook__Url: "https://your-endpoint/webhook" + Notify__Channels__Webhook__Method: "POST" + Notify__Channels__Webhook__ContentType: "application/json" +``` + +### Bare Metal / systemd +```json +{ + "Notify": { + "Channels": { + "Webhook": { + "Url": "https://your-endpoint/webhook", + "Method": "POST", + "ContentType": "application/json" + } + } + } +} +``` + +### Kubernetes / Helm +```yaml +notify: + channels: + webhook: + url: "https://your-endpoint/webhook" + method: "POST" +``` + +## Verification +``` +stella doctor run --check check.notify.webhook.configured +``` + +## Related Checks +- `check.notify.webhook.connectivity` — tests whether the webhook endpoint is reachable +- `check.notify.queue.health` — verifies the notification delivery queue is healthy diff --git a/docs/doctor/articles/notify/webhook-connectivity.md b/docs/doctor/articles/notify/webhook-connectivity.md new file mode 100644 index 000000000..bb4a08285 --- /dev/null +++ b/docs/doctor/articles/notify/webhook-connectivity.md @@ -0,0 +1,58 @@ +--- +checkId: check.notify.webhook.connectivity +plugin: stellaops.doctor.notify +severity: warn +tags: [notify, webhook, connectivity, network] +--- +# Webhook Connectivity + +## What It Checks +Verifies that the configured generic webhook endpoint is reachable. The check: + +- Sends a HEAD request to the webhook URL (falls back to OPTIONS if HEAD is unsupported) with a 10-second timeout. +- Any response with HTTP status < 500 is considered reachable (even 401/403, which indicate the endpoint exists but requires authentication). +- Warns on HTTP 5xx responses (server-side errors). +- Fails on connection timeout or HTTP request exceptions. 
+ +The check only runs when `Notify:Channels:Webhook:Url` (or `Endpoint`) is set and is a valid absolute URL. + +## Why It Matters +A configured but unreachable webhook endpoint means third-party integrations silently stop receiving notifications. Events that should trigger PagerDuty alerts, SIEM ingestion, or custom dashboard updates will be lost. + +## Common Causes +- Endpoint server not responding +- Network connectivity issue or firewall blocking connection +- DNS resolution failure +- TLS/SSL certificate problem on the endpoint +- Webhook endpoint service is down + +## How to Fix + +### Docker Compose +```bash +docker exec <notify-container> curl -v --max-time 10 https://your-endpoint/webhook +docker exec <notify-container> nslookup your-endpoint +``` + +### Bare Metal / systemd +```bash +curl -I https://your-endpoint/webhook +nslookup your-endpoint +nc -zv your-endpoint 443 +``` + +### Kubernetes / Helm +```bash +kubectl exec -it <notify-pod> -- curl -v https://your-endpoint/webhook +``` + +Check that egress NetworkPolicies allow traffic to the webhook destination. + +## Verification +``` +stella doctor run --check check.notify.webhook.connectivity +``` + +## Related Checks +- `check.notify.webhook.configured` — verifies webhook URL is set and valid +- `check.notify.queue.health` — verifies the notification delivery queue is healthy diff --git a/docs/doctor/articles/observability/log-directory-writable.md b/docs/doctor/articles/observability/log-directory-writable.md new file mode 100644 index 000000000..1f3be5e66 --- /dev/null +++ b/docs/doctor/articles/observability/log-directory-writable.md @@ -0,0 +1,68 @@ +--- +checkId: check.logs.directory.writable +plugin: stellaops.doctor.observability +severity: fail +tags: [observability, logs, quick] +--- +# Log Directory Writable + +## What It Checks +Verifies that the log directory exists and is writable. The check: + +- Reads the log path from `Logging:Path` configuration. 
Falls back to platform defaults: `/var/log/stellaops` on Linux, `%ProgramData%\StellaOps\logs` on Windows. +- Verifies the directory exists. +- Writes a temporary file to test write access, then deletes it. +- Fails if the directory does not exist, is not writable due to permissions, or encounters an I/O error. + +## Why It Matters +If the log directory is not writable, application logs are silently lost. Without logs, troubleshooting service failures, debugging policy evaluation issues, and performing security incident investigations become impossible. This is a severity-fail check because log loss breaks the auditability guarantee. + +## Common Causes +- Log directory not created during installation +- Directory was deleted +- Configuration points to wrong path +- Insufficient permissions or directory owned by different user +- Read-only file system +- Disk full + +## How to Fix + +### Docker Compose +```yaml +volumes: + - log-data:/var/log/stellaops +``` + +```bash +docker exec <container> mkdir -p /var/log/stellaops +``` + +### Bare Metal / systemd +```bash +# Create log directory +sudo mkdir -p /var/log/stellaops + +# Set ownership and permissions +sudo chown -R stellaops:stellaops /var/log/stellaops +sudo chmod 755 /var/log/stellaops +``` + +### Kubernetes / Helm +```yaml +logging: + path: "/var/log/stellaops" + persistence: + enabled: true + size: 10Gi +``` + +Or use an `emptyDir` volume for ephemeral log storage with a sidecar shipping logs to an external system. 
+ +## Verification +``` +stella doctor run --check check.logs.directory.writable +``` + +## Related Checks +- `check.logs.rotation.configured` — verifies log rotation is configured +- `check.storage.diskspace` — verifies sufficient disk space is available diff --git a/docs/doctor/articles/observability/log-rotation.md b/docs/doctor/articles/observability/log-rotation.md new file mode 100644 index 000000000..240d4b1ab --- /dev/null +++ b/docs/doctor/articles/observability/log-rotation.md @@ -0,0 +1,83 @@ +--- +checkId: check.logs.rotation.configured +plugin: stellaops.doctor.observability +severity: warn +tags: [observability, logs] +--- +# Log Rotation + +## What It Checks +Verifies that log rotation is configured to prevent disk exhaustion. The check: + +- Looks for application-level rotation via `Logging:RollingPolicy` configuration. +- Checks for Serilog rolling configuration at `Serilog:WriteTo:0:Args:rollingInterval`. +- On Linux, checks for system-level logrotate at `/etc/logrotate.d/stellaops`. +- Scans log files in the log directory and flags any file exceeding 100MB. +- Warns if rotation is not configured and large log files exist or total log size exceeds 200MB. +- Reports info if rotation is not configured but logs are still small. + +## Why It Matters +Without log rotation, log files grow unbounded until they exhaust disk space. Disk exhaustion causes cascading failures across all services. Even before exhaustion, very large log files are slow to search and analyze during incident response. 
+ +## Common Causes +- Log rotation not configured in application settings +- logrotate not installed or stellaops config missing from `/etc/logrotate.d/` +- Application-level rotation disabled +- Rotation threshold set too high +- Very high log volume overwhelming rotation schedule + +## How to Fix + +### Docker Compose +Set application-level log rotation: + +```yaml +environment: + Logging__RollingPolicy: "Size" + Serilog__WriteTo__0__Args__rollingInterval: "Day" + Serilog__WriteTo__0__Args__fileSizeLimitBytes: "104857600" # 100MB +``` + +### Bare Metal / systemd +Option 1 -- Application-level rotation in `appsettings.json`: +```json +{ + "Logging": { + "RollingPolicy": "Size" + } +} +``` + +Option 2 -- System-level logrotate: +```bash +sudo cp /usr/share/stellaops/logrotate.conf /etc/logrotate.d/stellaops + +# Or create manually: +cat < curl -v http://otel-collector:4317/v1/health +``` + +### Bare Metal / systemd +```bash +# Check collector status +systemctl status otel-collector + +# Test endpoint +curl -v http://localhost:4317/v1/health + +# Check port binding +netstat -an | grep 4317 +``` + +Edit `appsettings.json`: +```json +{ + "Telemetry": { + "OtlpEndpoint": "http://localhost:4317" + } +} +``` + +### Kubernetes / Helm +```yaml +telemetry: + otlpEndpoint: "http://otel-collector.monitoring.svc:4317" +``` + +```bash +kubectl get pods -n monitoring | grep otel +kubectl logs -n monitoring --tail 50 +``` + +## Verification +``` +stella doctor run --check check.telemetry.otlp.endpoint +``` + +## Related Checks +- `check.metrics.prometheus.scrape` — verifies Prometheus metrics endpoint accessibility +- `check.logs.directory.writable` — verifies log directory is writable diff --git a/docs/doctor/articles/observability/prometheus-scrape.md b/docs/doctor/articles/observability/prometheus-scrape.md new file mode 100644 index 000000000..a8eea283c --- /dev/null +++ b/docs/doctor/articles/observability/prometheus-scrape.md @@ -0,0 +1,90 @@ +--- +checkId: 
check.metrics.prometheus.scrape +plugin: stellaops.doctor.observability +severity: warn +tags: [observability, metrics, prometheus] +--- +# Prometheus Scrape + +## What It Checks +Verifies that the application metrics endpoint is accessible for Prometheus scraping. The check: + +- Reads `Metrics:Path` (default `/metrics`), `Metrics:Port` (default `8080`), and `Metrics:Host` (default `localhost`). +- Sends a GET request to `http://{host}:{port}{path}` with a 5-second timeout. +- Counts the number of Prometheus-formatted metric lines in the response. +- Passes if the endpoint returns a successful response with metrics. +- Warns on non-success status codes, timeouts, or connection failures. + +The check only runs when `Metrics:Enabled` is set to `true`. + +## Why It Matters +Prometheus metrics provide real-time visibility into service health, request latencies, error rates, and resource utilization. Without a scrapeable metrics endpoint, alerting rules cannot fire, dashboards go blank, and capacity planning has no data. 
+ +## Common Causes +- Metrics endpoint not enabled in configuration +- Wrong port configured +- Service not running on the expected port +- Authentication required but not configured for Prometheus +- Firewall blocking the metrics port + +## How to Fix + +### Docker Compose +```yaml +environment: + Metrics__Enabled: "true" + Metrics__Path: "/metrics" + Metrics__Port: "8080" +``` + +```bash +# Test metrics endpoint +docker exec curl -s http://localhost:8080/metrics | head -5 +``` + +### Bare Metal / systemd +Edit `appsettings.json`: +```json +{ + "Metrics": { + "Enabled": true, + "Path": "/metrics", + "Port": 8080 + } +} +``` + +```bash +# Verify metrics are exposed +curl -s http://localhost:8080/metrics | head -5 + +# Check port binding +netstat -an | grep 8080 +``` + +### Kubernetes / Helm +```yaml +metrics: + enabled: true + port: 8080 + path: "/metrics" + serviceMonitor: + enabled: true +``` + +Add Prometheus annotations to the pod: +```yaml +annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + prometheus.io/path: "/metrics" +``` + +## Verification +``` +stella doctor run --check check.metrics.prometheus.scrape +``` + +## Related Checks +- `check.telemetry.otlp.endpoint` — verifies OTLP collector endpoint reachability +- `check.logs.directory.writable` — verifies log directory is writable diff --git a/docs/doctor/articles/operations/dead-letter.md b/docs/doctor/articles/operations/dead-letter.md new file mode 100644 index 000000000..f20cf8681 --- /dev/null +++ b/docs/doctor/articles/operations/dead-letter.md @@ -0,0 +1,90 @@ +--- +checkId: check.operations.dead-letter +plugin: stellaops.doctor.operations +severity: warn +tags: [operations, queue, dead-letter] +--- +# Dead Letter Queue + +## What It Checks +Examines the dead letter queue for failed jobs that have exhausted their retry attempts and require manual review: + +- **Critical threshold**: fail when more than 50 failed jobs accumulate in the dead letter queue. 
+- **Warning threshold**: warn when more than 10 failed jobs are present. +- **Acceptable range**: 1-10 failed jobs pass with an informational note. + +Evidence collected: `FailedJobs`, `OldestFailure`, `MostCommonError`, `RetryableCount`. + +This check always runs (no configuration prerequisites). + +## Why It Matters +Dead letter queue entries represent work that the system was unable to complete after all retry attempts. Each entry is a job that may have had side effects (partial writes, notifications sent, resources allocated) and now sits in an inconsistent state. A growing dead letter queue indicates a systemic issue -- a downstream service outage, a configuration error, or a bug that is causing repeated failures. Left unattended, dead letters accumulate and can mask the root cause of operational issues. + +## Common Causes +- Persistent downstream service failures (registry unavailable, external API down) +- Configuration errors causing jobs to fail deterministically (wrong credentials, missing endpoints) +- Resource exhaustion (out of memory, disk full) during job execution +- Integration service outage (SCM, CI, secrets manager) +- Transient failures accumulating faster than the retry mechanism can clear them +- Jobs consistently failing on specific artifact types or inputs + +## How to Fix + +### Docker Compose +```bash +# List dead letter queue entries +stella orchestrator deadletter list --limit 20 + +# Analyze common failure patterns +stella orchestrator deadletter analyze + +# Retry jobs that are eligible for retry +stella orchestrator deadletter retry --filter retryable + +# Retry all failed jobs +stella orchestrator deadletter retry --all + +# View orchestrator logs for root cause +docker compose -f docker-compose.stella-ops.yml logs --tail 200 orchestrator | grep -i "error\|fail" +``` + +### Bare Metal / systemd +```bash +# List recent failures +stella orchestrator deadletter list --since 1h + +# Analyze failure patterns +stella orchestrator 
deadletter analyze + +# Retry retryable jobs +stella orchestrator deadletter retry --filter retryable + +# Check orchestrator service health +sudo systemctl status stellaops-orchestrator +sudo journalctl -u stellaops-orchestrator --since "4 hours ago" | grep -i "deadletter\|error" +``` + +### Kubernetes / Helm +```bash +# List dead letter entries +kubectl exec -it <orchestrator-pod> -- stella orchestrator deadletter list --limit 20 + +# Analyze failures +kubectl exec -it <orchestrator-pod> -- stella orchestrator deadletter analyze + +# Retry retryable jobs +kubectl exec -it <orchestrator-pod> -- stella orchestrator deadletter retry --filter retryable + +# Check orchestrator pod logs +kubectl logs -l app=stellaops-orchestrator --tail=200 | grep -i dead.letter +``` + +## Verification +``` +stella doctor run --check check.operations.dead-letter +``` + +## Related Checks +- `check.operations.job-queue` -- job queue backlog can indicate the same underlying issue +- `check.operations.scheduler` -- scheduler failures may produce dead letter entries +- `check.postgres.connectivity` -- database issues are a common root cause of job failures diff --git a/docs/doctor/articles/operations/job-queue.md b/docs/doctor/articles/operations/job-queue.md new file mode 100644 index 000000000..4778ecd6e --- /dev/null +++ b/docs/doctor/articles/operations/job-queue.md @@ -0,0 +1,113 @@ +--- +checkId: check.operations.job-queue +plugin: stellaops.doctor.operations +severity: fail +tags: [operations, queue, jobs, core] +--- +# Job Queue Health + +## What It Checks +Evaluates the platform job queue health across three dimensions: + +- **Worker availability**: fail immediately if no workers are active (zero active workers). +- **Queue depth**: warn at 100+ pending jobs, fail at 500+ pending jobs. +- **Processing rate**: warn if processing rate drops below 10 jobs/minute. + +Evidence collected: `QueueDepth`, `ActiveWorkers`, `TotalWorkers`, `ProcessingRate`, `OldestJobAge`, `CompletedLast24h`, `CriticalThreshold`, `WarningThreshold`, `RateStatus`. 
+ +This check always runs (no configuration prerequisites). + +## Why It Matters +The job queue is the backbone of asynchronous processing in Stella Ops. It handles scan jobs, SBOM generation, vulnerability matching, evidence collection, notification delivery, and many other background tasks. If no workers are available, all background processing stops. A deep queue means jobs are waiting longer than expected, which cascades into delayed scan results, stale findings, and blocked release gates. A low processing rate indicates a performance bottleneck that will only get worse under load. + +## Common Causes +- Worker service not running (crashed, not started, configuration error) +- All workers crashed or became unhealthy simultaneously +- Job processing slower than submission rate during high-activity periods +- Workers overloaded or misconfigured (too few workers for the workload) +- Downstream service bottleneck (database slow, external API rate-limited) +- Database performance issues slowing job dequeue operations +- Higher than normal job submission rate (bulk scan, new integration) + +## How to Fix + +### Docker Compose +```bash +# Check orchestrator service status +docker compose -f docker-compose.stella-ops.yml ps orchestrator + +# View worker logs +docker compose -f docker-compose.stella-ops.yml logs --tail 200 orchestrator + +# Restart the orchestrator service +docker compose -f docker-compose.stella-ops.yml restart orchestrator + +# Scale workers +docker compose -f docker-compose.stella-ops.yml up -d --scale orchestrator=4 +``` + +```yaml +services: + orchestrator: + environment: + Orchestrator__Workers__Count: "8" + Orchestrator__Workers__MaxConcurrent: "4" +``` + +### Bare Metal / systemd +```bash +# Check orchestrator service +sudo systemctl status stellaops-orchestrator + +# View logs for worker errors +sudo journalctl -u stellaops-orchestrator --since "1 hour ago" | grep -i "worker\|queue" + +# Restart workers +stella orchestrator workers restart + +# 
Scale workers +stella orchestrator workers scale --count 8 + +# Monitor queue depth trend +stella orchestrator queue watch +``` + +### Kubernetes / Helm +```bash +# Check orchestrator pods +kubectl get pods -l app=stellaops-orchestrator + +# View worker logs +kubectl logs -l app=stellaops-orchestrator --tail=200 + +# Scale workers +kubectl scale deployment stellaops-orchestrator --replicas=4 + +# Check for stuck jobs +kubectl exec -it <orchestrator-pod> -- stella orchestrator jobs list --status stuck +``` + +Set in Helm `values.yaml`: + +```yaml +orchestrator: + replicas: 4 + workers: + count: 8 + maxConcurrent: 4 + resources: + limits: + memory: 2Gi + cpu: "2" +``` + +## Verification +``` +stella doctor run --check check.operations.job-queue +``` + +## Related Checks +- `check.operations.dead-letter` -- failed jobs end up in the dead letter queue +- `check.operations.scheduler` -- scheduler feeds jobs into the queue +- `check.scanner.queue` -- scanner-specific queue health +- `check.postgres.connectivity` -- database issues affect job dequeue performance diff --git a/docs/doctor/articles/operations/scheduler.md b/docs/doctor/articles/operations/scheduler.md new file mode 100644 index 000000000..440b973dc --- /dev/null +++ b/docs/doctor/articles/operations/scheduler.md @@ -0,0 +1,108 @@ +--- +checkId: check.operations.scheduler +plugin: stellaops.doctor.operations +severity: warn +tags: [operations, scheduler, core] +--- +# Scheduler Health + +## What It Checks +Evaluates the scheduler service status, scheduled jobs, and execution history: + +- **Service status**: fail if the scheduler service is not running. +- **Missed executions**: warn if any scheduled job executions were missed (scheduled time passed without the job running). + +Evidence collected: `ServiceStatus`, `ScheduledJobs`, `MissedExecutions`, `LastExecution`, `NextExecution`, `CompletedToday`. + +This check always runs (no configuration prerequisites). 
+ +## Why It Matters +The scheduler is responsible for triggering time-based operations across the platform: vulnerability database syncs, periodic scans, evidence expiration, report generation, feed updates, and more. If the scheduler is down, none of these periodic tasks run, causing data staleness across the system. Missed executions indicate that the scheduler was unable to trigger a job at its scheduled time, which can cause cascading data freshness issues. + +## Common Causes +- Scheduler service crashed or was not started +- Service configuration error preventing startup +- System was down during a scheduled execution time (maintenance, outage) +- Scheduler overloaded with too many concurrent scheduled jobs +- Clock skew between the scheduler and other services +- Resource exhaustion preventing the scheduler from processing triggers + +## How to Fix + +### Docker Compose +```bash +# Check scheduler/orchestrator service status +docker compose -f docker-compose.stella-ops.yml ps orchestrator + +# View scheduler logs +docker compose -f docker-compose.stella-ops.yml logs --tail 200 orchestrator | grep -i "scheduler\|schedule" + +# Restart the service +docker compose -f docker-compose.stella-ops.yml restart orchestrator + +# Review missed executions +stella scheduler preview --missed + +# Trigger catch-up for missed jobs +stella scheduler catchup --dry-run +stella scheduler catchup +``` + +### Bare Metal / systemd +```bash +# Check scheduler service status +sudo systemctl status stellaops-scheduler + +# Start the scheduler if stopped +sudo systemctl start stellaops-scheduler + +# View scheduler logs +sudo journalctl -u stellaops-scheduler --since "4 hours ago" + +# Review missed executions +stella scheduler preview --missed + +# Trigger catch-up +stella scheduler catchup --dry-run + +# Verify system clock is synchronized +timedatectl status +``` + +### Kubernetes / Helm +```bash +# Check scheduler pod status +kubectl get pods -l app=stellaops-scheduler + +# View 
logs for the scheduler pod +kubectl logs -l app=stellaops-scheduler --tail=200 + +# Restart the scheduler +kubectl rollout restart deployment stellaops-scheduler + +# Print the scheduler pod's UTC clock and compare it with a trusted time source to spot clock skew +kubectl exec -it <scheduler-pod> -- date -u +``` + +Set in Helm `values.yaml`: + +```yaml +scheduler: + replicas: 1 # only one scheduler instance to avoid duplicate execution + resources: + limits: + memory: 512Mi + cpu: "0.5" + catchupOnStart: true # run missed jobs on startup +``` + +## Verification +``` +stella doctor run --check check.operations.scheduler +``` + +## Related Checks +- `check.operations.job-queue` -- scheduler feeds jobs into the queue +- `check.operations.dead-letter` -- scheduler-triggered jobs that fail end up in dead letter +- `check.release.schedule` -- release schedule depends on the scheduler service +- `check.scanner.vuln` -- vulnerability database sync is scheduler-driven diff --git a/docs/doctor/articles/policy/engine.md b/docs/doctor/articles/policy/engine.md new file mode 100644 index 000000000..cdd9ab640 --- /dev/null +++ b/docs/doctor/articles/policy/engine.md @@ -0,0 +1,124 @@ +--- +checkId: check.policy.engine +plugin: stellaops.doctor.policy +severity: fail +tags: [policy, core, health] +--- +# Policy Engine Health + +## What It Checks +Performs a three-part health check against the policy engine (OPA): + +1. **Compilation**: queries `/health` to verify the engine is responding, `/v1/policies` to count loaded policies and verify they compiled, and `/v1/status` for engine version and cache metrics. +2. **Evaluation**: sends a canary POST to `/v1/data/system/health` with a minimal input and measures response time. HTTP 200 or 404 are acceptable (no policy at that path is fine). HTTP 500 indicates an engine error. Evaluation latency above 100ms triggers a warning. +3. **Storage**: queries `/v1/data` to verify the policy data store is accessible and counts top-level data entries. + +If any of the three sub-checks fails, the overall result is fail. 
If all pass but evaluation latency exceeds 100ms, the result is warn. + +Evidence collected: `engine_type`, `engine_version`, `engine_url`, `compilation_status`, `evaluation_status`, `storage_status`, `policy_count`, `compilation_time_ms`, `evaluation_latency_p50_ms`, `cache_hit_ratio`, `last_compilation_error`, `evaluation_error`, `storage_error`. + +The check requires `Policy:Engine:Url` or `PolicyEngine:BaseUrl` to be configured. + +## Why It Matters +The policy engine is the decision authority for all release gates, promotion approvals, and security policy enforcement. If the policy engine is down, no release can pass its policy gate. If compilation fails, policies are not enforced. Slow evaluation delays release pipelines. A corrupt or inaccessible policy store means decisions are being made against stale or missing rules, which can result in either blocked releases or unintended policy bypasses. + +## Common Causes +- Policy engine service (OPA) not running or crashed +- Policy storage backend unavailable (bundled or external) +- OPA/Rego compilation error in a recently pushed policy +- Policy cache corrupted after abnormal shutdown +- Policy evaluation slower than expected due to complex rules +- Network connectivity issue between Stella Ops services and the policy engine +- Firewall blocking access to the policy engine port +- DNS resolution failure for the policy engine hostname + +## How to Fix + +### Docker Compose +```bash +# Check policy engine container status +docker compose -f docker-compose.stella-ops.yml ps policy-engine + +# View policy engine logs +docker compose -f docker-compose.stella-ops.yml logs --tail 200 policy-engine + +# Test engine health directly +curl -s http://localhost:8181/health + +# Recompile all policies +stella policy compile --all + +# Warm the policy cache +stella policy cache warm +``` + +```yaml +services: + policy-engine: + environment: + Policy__Engine__Url: "http://policy-engine:8181" + healthcheck: + test: ["CMD", 
"curl", "-f", "http://localhost:8181/health"] + interval: 30s + timeout: 10s + retries: 3 +``` + +### Bare Metal / systemd +```bash +# Check OPA service status +sudo systemctl status stellaops-policy-engine + +# View logs +sudo journalctl -u stellaops-policy-engine --since "1 hour ago" + +# Restart the service +sudo systemctl restart stellaops-policy-engine + +# Verify health +curl -s http://localhost:8181/health + +# Recompile policies +stella policy compile --all +``` + +### Kubernetes / Helm +```bash +# Check policy engine pods +kubectl get pods -l app=stellaops-policy-engine + +# View pod logs +kubectl logs -l app=stellaops-policy-engine --tail=200 + +# Restart policy engine +kubectl rollout restart deployment stellaops-policy-engine + +# Verify health from within the cluster +kubectl exec -it -- curl -s http://stellaops-policy-engine:8181/health +``` + +Set in Helm `values.yaml`: + +```yaml +policyEngine: + replicas: 2 + resources: + limits: + memory: 1Gi + cpu: "1" + livenessProbe: + httpGet: + path: /health + port: 8181 + initialDelaySeconds: 10 + periodSeconds: 30 +``` + +## Verification +``` +stella doctor run --check check.policy.engine +``` + +## Related Checks +- `check.release.promotion.gates` -- promotion gates depend on policy engine availability +- `check.postgres.connectivity` -- policy storage may depend on database connectivity diff --git a/docs/doctor/articles/postgres/connectivity.md b/docs/doctor/articles/postgres/connectivity.md new file mode 100644 index 000000000..65e05af38 --- /dev/null +++ b/docs/doctor/articles/postgres/connectivity.md @@ -0,0 +1,124 @@ +--- +checkId: check.postgres.connectivity +plugin: stellaops.doctor.postgres +severity: fail +tags: [database, postgres, connectivity, core] +--- +# PostgreSQL Connectivity + +## What It Checks +Opens a connection to PostgreSQL and executes `SELECT version(), current_timestamp` to verify the database is accessible and responsive. 
Measures round-trip latency: + +- **Critical latency**: fail if response time exceeds 500ms. +- **Warning latency**: warn if response time exceeds 100ms. +- **Connection timeout**: fail if the connection attempt exceeds 10 seconds. +- **Connection failure**: fail on authentication errors, DNS failures, or network issues. + +The connection string password is masked in all evidence output. + +Evidence collected: `ConnectionString` (masked), `LatencyMs`, `Version`, `ServerTime`, `Status`, `Threshold`, `ErrorCode`, `ErrorMessage`, `TimeoutSeconds`. + +The check requires `ConnectionStrings:StellaOps` or `Database:ConnectionString` to be configured. + +## Why It Matters +PostgreSQL is the primary data store for the entire Stella Ops platform. Every service depends on it for configuration, state, and transactional data. If the database is unreachable, the platform is effectively down. High latency propagates through every database operation, degrading the performance of all services, API endpoints, and background jobs simultaneously. This is the most fundamental infrastructure health check. 
+ +## Common Causes +- Database server not running or crashed +- Network connectivity issues between the application and database +- Firewall blocking the database port (5432) +- DNS resolution failure for the database hostname +- Invalid connection string (wrong host, port, or database name) +- Authentication failure (wrong username or password) +- Database does not exist +- Database server overloaded (high CPU, memory pressure, I/O saturation) +- Network latency between application and database hosts +- Slow queries blocking connections +- SSL/TLS certificate issues + +## How to Fix + +### Docker Compose +```bash +# Check postgres container status +docker compose -f docker-compose.stella-ops.yml ps postgres + +# Test direct connection +docker compose -f docker-compose.stella-ops.yml exec postgres \ + pg_isready -U stellaops -d stellaops_platform + +# View postgres logs +docker compose -f docker-compose.stella-ops.yml logs --tail 100 postgres + +# Restart postgres if needed +docker compose -f docker-compose.stella-ops.yml restart postgres +``` + +Verify connection string in environment: + +```yaml +services: + platform: + environment: + ConnectionStrings__StellaOps: "Host=postgres;Port=5432;Database=stellaops_platform;Username=stellaops;Password=stellaops" +``` + +### Bare Metal / systemd +```bash +# Check PostgreSQL service status +sudo systemctl status postgresql + +# Test connectivity +pg_isready -h localhost -p 5432 -U stellaops -d stellaops_platform + +# Check PostgreSQL logs +sudo tail -100 /var/log/postgresql/postgresql-*.log + +# Verify connection string +stella config get ConnectionStrings:StellaOps + +# Test connection manually +psql -h localhost -p 5432 -U stellaops -d stellaops_platform -c "SELECT 1;" +``` + +### Kubernetes / Helm +```bash +# Check PostgreSQL pod status +kubectl get pods -l app=postgresql + +# Test connectivity from an application pod +kubectl exec -it <platform-pod> -- pg_isready -h postgres -p 5432 + +# View PostgreSQL pod logs +kubectl logs -l 
app=postgresql --tail=100 + +# Check service DNS resolution +kubectl exec -it <platform-pod> -- nslookup postgres +``` + +Verify connection string in secret: + +```bash +kubectl get secret stellaops-db-credentials -o jsonpath='{.data.connection-string}' | base64 -d +``` + +Set in Helm `values.yaml`: + +```yaml +postgresql: + host: postgres + port: 5432 + database: stellaops_platform + auth: + existingSecret: stellaops-db-credentials +``` + +## Verification +``` +stella doctor run --check check.postgres.connectivity +``` + +## Related Checks +- `check.postgres.pool` -- pool exhaustion can masquerade as connectivity issues +- `check.postgres.migrations` -- migration checks depend on connectivity +- `check.operations.job-queue` -- database issues cause job queue failures diff --git a/docs/doctor/articles/postgres/migrations.md b/docs/doctor/articles/postgres/migrations.md new file mode 100644 index 000000000..f9c72527b --- /dev/null +++ b/docs/doctor/articles/postgres/migrations.md @@ -0,0 +1,127 @@ +--- +checkId: check.postgres.migrations +plugin: stellaops.doctor.postgres +severity: warn +tags: [database, postgres, migrations, schema] +--- +# PostgreSQL Migration Status + +## What It Checks +Connects to PostgreSQL and examines the EF Core migration history to identify pending migrations: + +1. **Migration table existence**: checks for the `__EFMigrationsHistory` table in the `public` schema. Warns if the table does not exist. +2. **Applied migrations**: queries the migration history table (ordered by `MigrationId` descending) to determine which migrations have been applied. +3. **Pending migrations**: compares applied migrations against the expected set to identify any unapplied migrations. Warns if pending migrations are found. + +Evidence collected: `TableExists`, `AppliedCount`, `PendingCount`, `LatestApplied`, `PendingMigrations`, `Status`. + +The check requires `ConnectionStrings:StellaOps` or `Database:ConnectionString` to be configured. 
+ +## Why It Matters +Pending database migrations mean the database schema does not match what the application code expects. This causes 500 errors when the application tries to access tables or columns that do not exist, or uses schema features that have not been applied. In Stella Ops, missing migrations are the number one cause of service failures after an upgrade. Services may start and appear healthy but fail on the first database operation that touches a missing table or column. + +## Common Causes +- New deployment with schema changes but migration not executed +- Migration was not run after a version update +- Previous migration attempt failed partway through +- Database initialized without EF Core (manual SQL scripts used instead) +- Migration history table was accidentally dropped +- First deployment to a fresh database with no migration history +- Auto-migration disabled or not configured in service startup + +## How to Fix + +### Docker Compose +```bash +# Check migration status +docker compose -f docker-compose.stella-ops.yml exec platform \ + stella db migrations status + +# Apply pending migrations +docker compose -f docker-compose.stella-ops.yml exec platform \ + stella db migrate + +# If auto-migration is configured, restart the service (it migrates on startup) +docker compose -f docker-compose.stella-ops.yml restart platform + +# Verify migration status after applying +docker compose -f docker-compose.stella-ops.yml exec platform \ + stella db migrations list +``` + +Ensure auto-migration is enabled: + +```yaml +services: + platform: + environment: + Platform__AutoMigrate: "true" +``` + +### Bare Metal / systemd +```bash +# List pending migrations +stella db migrations list --pending + +# Apply pending migrations +stella db migrate + +# Verify all migrations are applied +stella db migrations status + +# If auto-migration is configured, restart the service +sudo systemctl restart stellaops-platform +``` + +Edit 
`/etc/stellaops/platform/appsettings.json` to enable auto-migration: + +```json +{ + "Platform": { + "AutoMigrate": true + } +} +``` + +### Kubernetes / Helm +```bash +# Check migration status +kubectl exec -it <platform-pod> -- stella db migrations status + +# Apply pending migrations +kubectl exec -it <platform-pod> -- stella db migrate + +# Or use a migration Job +kubectl apply -f - < + postgres + -c max_connections=200 + -c shared_buffers=256MB +``` + +Increase Npgsql pool size via connection string: + +```yaml +services: + platform: + environment: + ConnectionStrings__StellaOps: "Host=postgres;Database=stellaops_platform;Username=stellaops;Password=stellaops;Maximum Pool Size=50;Minimum Pool Size=5" +``` + +### Bare Metal / systemd +```bash +# Check connection statistics +psql -U stellaops -d stellaops_platform -c \ + "SELECT state, count(*) FROM pg_stat_activity GROUP BY state;" + +# Check for long-running queries +psql -U stellaops -d stellaops_platform -c \ + "SELECT pid, now() - query_start AS duration, query FROM pg_stat_activity WHERE state = 'active' ORDER BY duration DESC LIMIT 10;" + +# Increase max connections +sudo -u postgres psql -c "ALTER SYSTEM SET max_connections = 200;" +sudo systemctl restart postgresql +``` + +### Kubernetes / Helm +```bash +# Check connection pool from inside a pod +kubectl exec -it <postgres-pod> -- psql -U stellaops -d stellaops_platform -c \ + "SELECT state, count(*) FROM pg_stat_activity GROUP BY state;" + +# Terminate connections that have been idle for more than 10 minutes +kubectl exec -it <postgres-pod> -- psql -U stellaops -d stellaops_platform -c \ + "SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE state = 'idle' AND state_change < now() - interval '10 minutes';" +``` + +Set in Helm `values.yaml`: + +```yaml +postgresql: + maxConnections: 200 + sharedBuffers: 256MB + +platform: + database: + connectionString: "Host=postgres;Database=stellaops_platform;Username=stellaops;Password=stellaops;Maximum Pool Size=50;Minimum Pool Size=5" +``` + +## Verification +``` +stella doctor run --check 
check.postgres.pool +``` + +## Related Checks +- `check.postgres.connectivity` -- connectivity issues compound pool problems +- `check.postgres.migrations` -- schema issues can cause queries to hang, consuming connections +- `check.operations.job-queue` -- database bottleneck slows job processing diff --git a/docs/doctor/articles/release/active.md b/docs/doctor/articles/release/active.md new file mode 100644 index 000000000..9110d6f5f --- /dev/null +++ b/docs/doctor/articles/release/active.md @@ -0,0 +1,85 @@ +--- +checkId: check.release.active +plugin: stellaops.doctor.release +severity: warn +tags: [release, pipeline, active, monitoring] +--- +# Active Release Health + +## What It Checks +Queries the Release Orchestrator at `/api/v1/releases?state=active` and evaluates the health of all currently active releases: + +- **Stuck releases**: warn if an executing or pending release has been running for more than 1 hour, fail after 4 hours. +- **Failed releases**: any release with an error triggers an immediate fail. +- **Pending approvals**: warn if an approval has been pending for more than 4 hours, fail after 24 hours. + +Evidence collected: `active_release_count`, `stuck_release_count`, `failed_release_count`, `pending_approval_count`, `oldest_active_release_age_minutes`, `stuck_releases`, `failed_releases`, `approval_pending_releases`. + +The check requires `ReleaseOrchestrator:Url` or `Release:Orchestrator:Url` to be configured. + +## Why It Matters +Active releases represent in-flight changes moving through the promotion pipeline. A stuck release blocks the target environment from receiving updates and can hold locks that prevent other releases. Failed releases indicate broken deployment workflows that need immediate attention. Stale approvals delay time-sensitive deployments and can indicate that approvers are unaware of pending requests or that notification delivery has failed. 
+ +## Common Causes +- Release workflow step failed (script error, timeout, integration failure) +- Approval bottleneck -- approvers not notified or unavailable +- Target environment became unreachable during deployment +- Resource contention between concurrent releases +- Release taking longer than expected due to large artifact size +- Environment slow to respond to health probes after deployment + +## How to Fix + +### Docker Compose +```bash +# Inspect a failed or stuck release +stella release inspect <release-id> + +# View release execution logs +stella release logs <release-id> + +# Check Release Orchestrator service health +docker compose -f docker-compose.stella-ops.yml logs --tail 200 orchestrator + +# List pending approvals +stella release approvals list +``` + +### Bare Metal / systemd +```bash +# Check Release Orchestrator service +sudo systemctl status stellaops-orchestrator + +# Inspect the stuck release +stella release inspect <release-id> + +# View release logs +stella release logs <release-id> + +# Review and action pending approvals +stella release approvals list +stella release approve <release-id> +``` + +### Kubernetes / Helm +```bash +# Check orchestrator pod status +kubectl get pods -l app=stellaops-orchestrator + +# View orchestrator logs +kubectl logs -l app=stellaops-orchestrator --tail=200 + +# Inspect stuck release +kubectl exec -it <orchestrator-pod> -- stella release inspect <release-id> +``` + +## Verification +``` +stella doctor run --check check.release.active +``` + +## Related Checks +- `check.release.environment.readiness` -- environment issues cause releases to get stuck +- `check.release.promotion.gates` -- misconfigured gates can block releases indefinitely +- `check.release.configuration` -- workflow configuration errors cause release failures +- `check.release.schedule` -- schedule conflicts can cause resource contention diff --git a/docs/doctor/articles/release/configuration.md b/docs/doctor/articles/release/configuration.md new file mode 100644 index 000000000..03c33b409 --- /dev/null +++ 
b/docs/doctor/articles/release/configuration.md @@ -0,0 +1,115 @@ +--- +checkId: check.release.configuration +plugin: stellaops.doctor.release +severity: warn +tags: [release, configuration, workflow, validation] +--- +# Release Configuration + +## What It Checks +Queries the Release Orchestrator at `/api/v1/workflows` and validates all release workflow definitions: + +- **Empty stages**: fail if a workflow has no stages defined. +- **Invalid transitions**: fail if a stage references a next stage that does not exist in the workflow. +- **Unreachable stages**: warn if a stage has no incoming transitions and is not the entry point (first stage). +- **Missing environment mapping**: fail if a stage has no target environment assigned. +- **No workflows**: warn if no release workflows are configured at all. + +Evidence collected: `workflow_count`, `active_workflow_count`, `total_stages`, `validation_error_count`, `errors`, `workflow_names`. + +The check requires `ReleaseOrchestrator:Url` or `Release:Orchestrator:Url` to be configured. + +## Why It Matters +Release workflows define the promotion path from development through production. Configuration errors in workflows are silent until a release attempts to use the broken path, at which point the release fails mid-flight. An invalid stage transition creates a dead end in the pipeline. A missing environment mapping means the orchestrator does not know where to deploy. These issues should be caught at configuration time, not during a production release. 
+ +## Common Causes +- Workflow configuration incomplete (created but not finished) +- Stage transition misconfigured after adding or removing stages +- Environment deleted from the system but workflow not updated to reflect the change +- Copy-paste errors when duplicating workflows +- Stages added but not connected to any transition path + +## How to Fix + +### Docker Compose +```bash +# List all workflows and their validation status +stella release workflow list + +# View details of a specific workflow +stella release workflow show <workflow-name> + +# Edit workflow to fix configuration +stella release workflow edit <workflow-name> + +# Create a new workflow from scratch +stella release workflow create --name "standard" --stages dev,staging,prod +``` + +### Bare Metal / systemd +```bash +# List workflows +stella release workflow list + +# Validate a specific workflow +stella release workflow validate <workflow-name> + +# Fix workflow configuration +stella release workflow edit <workflow-name> +``` + +Edit the workflow configuration file directly if needed: + +```json +{ + "workflows": [ + { + "name": "standard", + "isActive": true, + "stages": [ + { "name": "dev", "environmentId": "<dev-environment-id>", "nextStages": ["staging"] }, + { "name": "staging", "environmentId": "<staging-environment-id>", "nextStages": ["prod"] }, + { "name": "prod", "environmentId": "<prod-environment-id>", "nextStages": [] } + ] + } + ] +} +``` + +### Kubernetes / Helm +```bash +# Check workflow configuration in ConfigMap +kubectl get configmap stellaops-release-workflows -o yaml + +# Validate workflows +kubectl exec -it <orchestrator-pod> -- stella release workflow list +``` + +Set in Helm `values.yaml`: + +```yaml +releaseOrchestrator: + workflows: + - name: standard + isActive: true + stages: + - name: dev + environmentRef: dev + nextStages: [staging] + - name: staging + environmentRef: staging + nextStages: [prod] + - name: prod + environmentRef: prod + nextStages: [] +``` + +## Verification +``` +stella doctor run --check check.release.configuration +``` + +## Related Checks +- `check.release.active` -- active releases fail 
when workflow configuration is broken +- `check.release.environment.readiness` -- workflows reference environments that must exist and be healthy +- `check.release.promotion.gates` -- gates are associated with workflow stage transitions diff --git a/docs/doctor/articles/release/environment-readiness.md b/docs/doctor/articles/release/environment-readiness.md new file mode 100644 index 000000000..c92510783 --- /dev/null +++ b/docs/doctor/articles/release/environment-readiness.md @@ -0,0 +1,85 @@ +--- +checkId: check.release.environment.readiness +plugin: stellaops.doctor.release +severity: warn +tags: [release, environment, readiness, deployment] +--- +# Environment Readiness + +## What It Checks +Queries the Release Orchestrator at `/api/v1/environments` and evaluates the health and readiness of all configured target environments: + +- **Reachability**: environments must respond to health checks. +- **Health status**: environments must report as healthy. +- **Health check freshness**: warn if the last health check data is older than 1 hour. +- **Production priority**: production environment issues escalate to fail severity; non-production issues are warnings. + +Evidence collected: `environment_count`, `dev_environments`, `staging_environments`, `prod_environments`, `unreachable_count`, `unhealthy_count`, `unreachable_environments`, `unhealthy_environments`, `stale_health_check_count`. + +The check requires `ReleaseOrchestrator:Url` or `Release:Orchestrator:Url` to be configured. + +## Why It Matters +Environments are the deployment targets in the release pipeline. An unreachable or unhealthy environment will cause any release targeting it to fail, blocking the promotion chain. Production environment issues are critical because they can indicate that the currently deployed version is also impacted. Stale health data means the system is operating on outdated information, which can lead to deploying to an environment that is actually down. 
+ +## Common Causes +- Environment agent not responding (crashed, network partition) +- Network connectivity issue between the orchestrator and target environment +- Container runtime issue in the target environment (Docker daemon down) +- Resource exhaustion (disk full, memory pressure) on the target host +- Dev/staging environment intentionally powered down +- Health check scheduler not running, producing stale data +- Environment agent intermittent connectivity causing stale health reports + +## How to Fix + +### Docker Compose +```bash +# Ping the unreachable environment +stella env ping <env-name> + +# View environment agent logs +stella env logs <env-name> + +# Check environment health details +stella env health <env-name> + +# Refresh health data for all environments +stella env health --refresh-all +``` + +### Bare Metal / systemd +```bash +# Check the environment agent service +ssh <env-host> "systemctl status stellaops-agent" + +# Test network connectivity +stella env ping <env-name> + +# View agent logs on the target host +ssh <env-host> "journalctl -u stellaops-agent --since '1 hour ago'" + +# Restart agent if needed +ssh <env-host> "systemctl restart stellaops-agent" +``` + +### Kubernetes / Helm +```bash +# Check agent pods in the target cluster +kubectl --context <target-context> get pods -l app=stellaops-agent + +# View agent logs +kubectl --context <target-context> logs -l app=stellaops-agent --tail=200 + +# Check node resource availability +kubectl --context <target-context> top nodes +``` + +## Verification +``` +stella doctor run --check check.release.environment.readiness +``` + +## Related Checks +- `check.release.active` -- unreachable environments cause active releases to get stuck +- `check.release.rollback.readiness` -- environment health affects rollback capability +- `check.release.promotion.gates` -- environments must be reachable for gate checks to pass diff --git a/docs/doctor/articles/release/promotion-gates.md b/docs/doctor/articles/release/promotion-gates.md new file mode 100644 index 000000000..deb561681 --- /dev/null +++ 
b/docs/doctor/articles/release/promotion-gates.md @@ -0,0 +1,105 @@ +--- +checkId: check.release.promotion.gates +plugin: stellaops.doctor.release +severity: warn +tags: [release, promotion, gates, policy, attestation] +--- +# Promotion Gate Health + +## What It Checks +Queries the Release Orchestrator at `/api/v1/promotion-gates` and validates each promotion gate's dependencies: + +- **Policy availability**: if a gate requires policy pass, verifies that all required policies are loaded in the policy engine (queries OPA at `/v1/policies`). +- **Attestor availability**: if a gate requires attestations, verifies the attestor service is reachable at its health endpoint. +- **Approval configuration**: if a gate requires approval, verifies that at least one approver is configured. +- **Severity**: missing policies or missing approvers escalate to fail; attestor unavailability is a warning. + +Evidence collected: `gate_count`, `gates_with_policy`, `gates_with_attestation`, `gates_with_approval`, `issue_count`, `issues`. + +The check requires `ReleaseOrchestrator:Url` or `Release:Orchestrator:Url` to be configured. + +## Why It Matters +Promotion gates enforce the security and compliance requirements for environment promotions. If a gate references a policy that does not exist in the policy engine, releases will fail at promotion time with a cryptic error. If the attestor is down, attestation-gated promotions will block. If approval is required but no approvers are configured, releases will wait indefinitely. These issues are best caught proactively, not during a time-critical production deployment. 
+ +## Common Causes +- Required policies not loaded or compiled in the policy engine +- Attestor service unavailable (crashed, misconfigured, network issue) +- Approval workflow misconfigured (approvers removed, role changes) +- Environment was deleted but its promotion gate configuration remains +- Policy engine URL misconfigured so policy lookup fails +- Policy was renamed but gate still references the old name + +## How to Fix + +### Docker Compose +```bash +# List all promotion gates and their status +stella release gates list + +# Check policy engine for required policies +stella policy list + +# Verify attestor health +curl -s http://localhost:5090/health + +# Configure approvers for a gate +stella release gates configure <gate-id> --approvers <user1>,<user2> +``` + +```yaml +services: + orchestrator: + environment: + ReleaseOrchestrator__DefaultApprovers: "admin,release-manager" +``` + +### Bare Metal / systemd +```bash +# List gates +stella release gates list + +# Verify policy engine is running +sudo systemctl status stellaops-policy-engine + +# Verify attestor is running +sudo systemctl status stellaops-attestor + +# Reload policies +stella policy compile --all +``` + +### Kubernetes / Helm +```bash +# Check policy engine pods +kubectl get pods -l app=stellaops-policy-engine + +# Check attestor pods +kubectl get pods -l app=stellaops-attestor + +# Verify gate configuration +kubectl exec -it <orchestrator-pod> -- stella release gates list +``` + +Set in Helm `values.yaml`: + +```yaml +releaseOrchestrator: + promotionGates: + defaultApprovers: + - admin + - release-manager + policy: + engineUrl: "http://stellaops-policy-engine:8181" + attestor: + url: "http://stellaops-attestor:5090" +``` + +## Verification +``` +stella doctor run --check check.release.promotion.gates +``` + +## Related Checks +- `check.policy.engine` -- policy engine health affects gate policy checks +- `check.release.active` -- gate failures cause active releases to get stuck +- `check.release.configuration` -- workflow 
configuration defines which gates are used diff --git a/docs/doctor/articles/release/rollback-readiness.md b/docs/doctor/articles/release/rollback-readiness.md new file mode 100644 index 000000000..bbea3d1bb --- /dev/null +++ b/docs/doctor/articles/release/rollback-readiness.md @@ -0,0 +1,124 @@ +--- +checkId: check.release.rollback.readiness +plugin: stellaops.doctor.release +severity: warn +tags: [release, rollback, disaster-recovery, production] +--- +# Rollback Readiness + +## What It Checks +Queries the Release Orchestrator at `/api/v1/environments/rollback-status` (with fallback to `/api/v1/environments`) and evaluates rollback capability for production environments: + +- **Cannot rollback**: fail if a production environment has a previous version but cannot roll back (e.g., irreversible migration, artifacts purged). +- **No previous version**: warn if a production environment has no previous deployment to roll back to. +- **Missing health probe**: warn if a production environment lacks a health probe (prevents auto-rollback on failure). + +Only production environments (type "prod" or "production") are evaluated. Non-production environments are not checked. + +Evidence collected: `prod_environment_count`, `rollback_ready_count`, `cannot_rollback_count`, `no_previous_version_count`, `no_health_probe_count`, `cannot_rollback_environments`, `rollback_blocker`. + +The check requires `ReleaseOrchestrator:Url` or `Release:Orchestrator:Url` to be configured. + +## Why It Matters +Rollback is the primary recovery mechanism when a production deployment introduces a critical issue. If rollback is unavailable, the only options are an emergency forward-fix or extended downtime. Missing health probes prevent automatic rollback on deployment failure, requiring manual intervention during incidents. In regulated environments, rollback readiness is often a compliance requirement for change management. 
+ +## Common Causes +- Previous deployment artifacts not retained (artifact retention policy too aggressive) +- Database migration not reversible (destructive schema change) +- Breaking API change deployed that prevents running the previous version +- Rollback manually disabled for the environment +- First deployment to environment (no previous version exists) +- Deployment history cleared during maintenance +- Health probe URL not configured for auto-rollback +- Auto-rollback on failure not enabled + +## How to Fix + +### Docker Compose +```bash +# Check rollback status for a specific environment +stella env rollback-status <env-name> + +# View deployment history +stella env history <env-name> + +# Configure artifact retention to keep previous versions +``` + +```yaml +services: + orchestrator: + environment: + Release__ArtifactRetention__Count: "5" + Release__ArtifactRetention__Days: "30" +``` + +Configure health probes: + +```bash +# Set health probe for a production environment +stella env configure <env-name> --health-probe-url "http://<app-host>:8080/health" + +# Enable auto-rollback on failure +stella env configure <env-name> --auto-rollback-on-failure +``` + +### Bare Metal / systemd +```bash +# Check rollback blockers +stella env rollback-status <env-name> + +# View deployment history +stella env history <env-name> + +# Configure health probe +stella env configure <env-name> --health-probe-url "http://localhost:8080/health" + +# Enable auto-rollback +stella env configure <env-name> --auto-rollback-on-failure +``` + +Edit `/etc/stellaops/orchestrator/appsettings.json`: + +```json +{ + "Release": { + "ArtifactRetention": { + "Count": 5, + "Days": 30 + } + } +} +``` + +### Kubernetes / Helm +```bash +# Check rollback status +kubectl exec -it <orchestrator-pod> -- stella env rollback-status <env-name> + +# View deployment history +kubectl exec -it <orchestrator-pod> -- stella env history <env-name> +``` + +Set in Helm `values.yaml`: + +```yaml +releaseOrchestrator: + artifactRetention: + count: 5 + days: 30 + environments: + production: + healthProbeUrl: "http://app:8080/health" + autoRollbackOnFailure: true +``` + 
+## Verification +``` +stella doctor run --check check.release.rollback.readiness +``` + +## Related Checks +- `check.release.active` -- failed releases may require rollback +- `check.release.environment.readiness` -- environment health affects rollback execution +- `check.release.configuration` -- workflow configuration defines rollback behavior diff --git a/docs/doctor/articles/release/schedule.md b/docs/doctor/articles/release/schedule.md new file mode 100644 index 000000000..a001ff750 --- /dev/null +++ b/docs/doctor/articles/release/schedule.md @@ -0,0 +1,88 @@ +--- +checkId: check.release.schedule +plugin: stellaops.doctor.release +severity: info +tags: [release, schedule, upcoming, planning] +--- +# Release Schedule Health + +## What It Checks +Queries the Release Orchestrator at `/api/v1/releases/scheduled` and evaluates the health of scheduled releases: + +- **Missed schedules**: fail if any scheduled release with status "pending" has a scheduled time in the past. +- **Schedule conflicts**: warn if two pending releases target the same environment within 1 hour of each other. +- **Upcoming releases**: informational -- reports releases scheduled within the next 24 hours. + +Evidence collected: `scheduled_release_count`, `upcoming_24h_count`, `missed_schedule_count`, `conflict_count`, `missed_releases`, `conflicts`, `upcoming_releases`. + +The check requires `ReleaseOrchestrator:Url` or `Release:Orchestrator:Url` to be configured. + +## Why It Matters +Missed scheduled releases indicate that the release scheduler is not functioning or that prerequisites were not met at the scheduled time. This can delay time-critical deployments such as security patches or compliance deadlines. Schedule conflicts can cause deployment failures when two releases compete for the same environment simultaneously, potentially leaving the environment in an inconsistent state. 
+ +## Common Causes +- Release scheduler service not running or crashed +- Prerequisite conditions (policy gates, approvals) not met at scheduled time +- Target environment was unavailable when the schedule triggered +- Multiple teams scheduling releases to the same environment without coordination +- Manual schedule override without checking for existing schedules +- Clock skew between scheduler and orchestrator services + +## How to Fix + +### Docker Compose +```bash +# View missed schedules +stella release schedule list --missed + +# Run a missed release immediately +stella release schedule run <release-id> + +# View schedule conflicts +stella release schedule list --conflicts + +# Reschedule a conflicting release +stella release schedule update <release-id> --time "2026-03-27T14:00:00Z" + +# Check scheduler service +docker compose -f docker-compose.stella-ops.yml logs --tail 100 orchestrator | grep -i schedule +``` + +### Bare Metal / systemd +```bash +# Check scheduler status +stella release schedule status + +# List missed and conflicting schedules +stella release schedule list --missed +stella release schedule list --conflicts + +# Reschedule +stella release schedule update <release-id> --time "2026-03-27T14:00:00Z" + +# Check system clock synchronization +timedatectl status +``` + +### Kubernetes / Helm +```bash +# Check orchestrator pod time synchronization +kubectl exec -it <orchestrator-pod> -- date -u + +# View scheduled releases +kubectl exec -it <orchestrator-pod> -- stella release schedule list + +# Check for CronJob issues +kubectl get cronjobs -l app=stellaops-release-scheduler +kubectl describe cronjob stellaops-release-scheduler +``` + +## Verification +``` +stella doctor run --check check.release.schedule +``` + +## Related Checks +- `check.release.active` -- missed schedules may result in delayed active releases +- `check.release.environment.readiness` -- environment availability affects schedule execution +- `check.operations.scheduler` -- platform scheduler health affects release scheduling diff --git 
a/docs/doctor/articles/scanner/queue.md b/docs/doctor/articles/scanner/queue.md new file mode 100644 index 000000000..c091500fd --- /dev/null +++ b/docs/doctor/articles/scanner/queue.md @@ -0,0 +1,110 @@ +--- +checkId: check.scanner.queue +plugin: stellaops.doctor.scanner +severity: warn +tags: [scanner, queue, jobs, processing] +--- +# Scanner Queue Health + +## What It Checks +Queries the Scanner service at `/api/v1/queue/stats` and evaluates job queue health across four dimensions: + +- **Queue depth**: warn at 100+ pending jobs, fail at 500+. +- **Failure rate**: warn at 5%+ of processed jobs failing, fail at 15%+. +- **Stuck jobs**: any stuck jobs trigger an immediate fail. +- **Backlog growth**: a growing backlog triggers a warning. + +Evidence collected: `queue_depth`, `processing_rate_per_min`, `stuck_jobs`, `failed_jobs`, `failure_rate`, `oldest_job_age_min`, `backlog_growing`. + +The check requires `Scanner:Url` or `Services:Scanner:Url` to be configured; otherwise it is skipped. + +## Why It Matters +The scanner queue is the central work pipeline for SBOM generation, vulnerability scanning, and reachability analysis. A backlogged or stuck queue delays security findings, blocks release gates that depend on scan results, and can cascade into approval timeouts. Stuck jobs indicate a worker crash or resource failure that will not self-heal. 
+ +## Common Causes +- Scanner worker process crashed or was OOM-killed +- Job dependency (registry, database) became unavailable mid-scan +- Resource exhaustion (CPU, memory, disk) on the scanner host +- Database connection lost during job processing +- Sudden spike in image pushes overwhelming worker capacity +- Processing rate slower than ingest rate during bulk import + +## How to Fix + +### Docker Compose +Check scanner worker status and restart if needed: + +```bash +# View scanner container logs for errors +docker compose -f docker-compose.stella-ops.yml logs --tail 200 scanner + +# Restart the scanner service +docker compose -f docker-compose.stella-ops.yml restart scanner + +# Scale scanner workers (if using replicas) +docker compose -f docker-compose.stella-ops.yml up -d --scale scanner=4 +``` + +Adjust concurrency via environment variables: + +```yaml +environment: + Scanner__Queue__MaxConcurrentJobs: "4" + Scanner__Queue__StuckJobTimeoutMinutes: "30" +``` + +### Bare Metal / systemd +```bash +# Check scanner service status +sudo systemctl status stellaops-scanner + +# View recent logs +sudo journalctl -u stellaops-scanner --since "1 hour ago" + +# Restart the service +sudo systemctl restart stellaops-scanner +``` + +Edit `/etc/stellaops/scanner/appsettings.json`: + +```json +{ + "Queue": { + "MaxConcurrentJobs": 4, + "StuckJobTimeoutMinutes": 30 + } +} +``` + +### Kubernetes / Helm +```bash +# Check scanner pod status +kubectl get pods -l app=stellaops-scanner + +# View logs for crash loops +kubectl logs -l app=stellaops-scanner --tail=200 + +# Scale scanner deployment +kubectl scale deployment stellaops-scanner --replicas=4 +``` + +Set in Helm `values.yaml`: + +```yaml +scanner: + replicas: 4 + queue: + maxConcurrentJobs: 4 + stuckJobTimeoutMinutes: 30 +``` + +## Verification +``` +stella doctor run --check check.scanner.queue +``` + +## Related Checks +- `check.scanner.resources` -- scanner CPU/memory utilization affecting processing rate +- 
`check.scanner.sbom` -- SBOM generation failures may originate from queue issues +- `check.scanner.vuln` -- vulnerability scan health depends on queue throughput +- `check.operations.job-queue` -- platform-wide job queue health diff --git a/docs/doctor/articles/scanner/reachability.md b/docs/doctor/articles/scanner/reachability.md new file mode 100644 index 000000000..f45295a65 --- /dev/null +++ b/docs/doctor/articles/scanner/reachability.md @@ -0,0 +1,113 @@ +--- +checkId: check.scanner.reachability +plugin: stellaops.doctor.scanner +severity: warn +tags: [scanner, reachability, analysis, performance] +--- +# Reachability Computation Health + +## What It Checks +Queries the Scanner service at `/api/v1/reachability/stats` and evaluates reachability analysis performance and accuracy: + +- **Computation failures**: fail if failure rate exceeds 10% of total computations. +- **Average computation time**: warn at 5,000ms, fail at 30,000ms. +- **Vulnerability filtering effectiveness**: reported as evidence (ratio of unreachable to total vulnerabilities). + +Evidence collected: `total_computations`, `computation_failures`, `failure_rate`, `avg_computation_time_ms`, `p95_computation_time_ms`, `reachable_vulns`, `unreachable_vulns`, `filter_rate`. + +The check requires `Scanner:Url` or `Services:Scanner:Url` to be configured. + +## Why It Matters +Reachability analysis is what separates actionable vulnerability findings from noise. It determines which vulnerabilities are actually reachable in the call graph, filtering out false positives that would otherwise block releases or waste triage time. Slow computations delay security feedback loops, and failures mean vulnerabilities are reported without reachability context, inflating finding counts and eroding operator trust. 
+ +## Common Causes +- Invalid or incomplete call graph data from the SBOM/slice pipeline +- Missing slice cache entries forcing full recomputation +- Timeout on large codebases with deep dependency trees +- Memory exhaustion during graph traversal on complex projects +- Complex call graphs with high fan-out or cyclical references +- Insufficient CPU/memory allocated to scanner workers + +## How to Fix + +### Docker Compose +```bash +# Check scanner logs for reachability errors +docker compose -f docker-compose.stella-ops.yml logs scanner | grep -i "reachability\|computation" + +# Warm the slice cache to speed up subsequent computations +stella scanner cache warm + +# Increase scanner resources +``` + +```yaml +services: + scanner: + deploy: + resources: + limits: + memory: 4G + cpus: "4.0" + environment: + Scanner__Reachability__TimeoutMs: "60000" + Scanner__Reachability__MaxGraphDepth: "100" +``` + +### Bare Metal / systemd +```bash +# View reachability computation errors +sudo journalctl -u stellaops-scanner --since "1 hour ago" | grep -i reachability + +# Retry failed computations +stella scanner reachability retry --failed + +# Warm the slice cache +stella scanner cache warm +``` + +Edit `/etc/stellaops/scanner/appsettings.json`: + +```json +{ + "Reachability": { + "TimeoutMs": 60000, + "MaxGraphDepth": 100, + "MaxConcurrentComputations": 4 + } +} +``` + +### Kubernetes / Helm +```bash +# Check scanner pod resource usage +kubectl top pods -l app=stellaops-scanner + +# Scale scanner workers for parallel computation +kubectl scale deployment stellaops-scanner --replicas=4 +``` + +Set in Helm `values.yaml`: + +```yaml +scanner: + replicas: 4 + resources: + limits: + memory: 4Gi + cpu: "4" + reachability: + timeoutMs: 60000 + maxGraphDepth: 100 +``` + +## Verification +``` +stella doctor run --check check.scanner.reachability +``` + +## Related Checks +- `check.scanner.slice.cache` -- cache misses are a primary cause of slow computations +- 
`check.scanner.witness.graph` -- reachability depends on witness graph integrity +- `check.scanner.sbom` -- SBOM quality directly affects reachability accuracy +- `check.scanner.resources` -- resource constraints cause computation timeouts diff --git a/docs/doctor/articles/scanner/resources.md b/docs/doctor/articles/scanner/resources.md new file mode 100644 index 000000000..8d8d8fab2 --- /dev/null +++ b/docs/doctor/articles/scanner/resources.md @@ -0,0 +1,119 @@ +--- +checkId: check.scanner.resources +plugin: stellaops.doctor.scanner +severity: warn +tags: [scanner, resources, cpu, memory, workers] +--- +# Scanner Resource Utilization + +## What It Checks +Queries the Scanner service at `/api/v1/resources/stats` and evaluates CPU, memory, and worker pool health: + +- **CPU utilization**: warn at 75%, fail at 90%. +- **Memory utilization**: warn at 80%, fail at 95%. +- **Worker pool saturation**: warn when all workers are busy (zero idle workers). + +Evidence collected: `cpu_utilization`, `memory_utilization`, `memory_used_mb`, `active_workers`, `total_workers`, `idle_workers`. + +The check requires `Scanner:Url` or `Services:Scanner:Url` to be configured. + +## Why It Matters +The scanner is one of the most resource-intensive services in the Stella Ops stack. It processes container images, generates SBOMs, runs vulnerability matching, and performs reachability analysis. When scanner resources are exhausted, all downstream pipelines stall: queue depth grows, scan latency increases, and release gates time out waiting for scan results. Memory exhaustion can cause OOM kills that lose in-progress work. 
+ +## Common Causes +- High scan volume during bulk import or CI surge +- Memory leak from accumulated scan artifacts not being garbage collected +- Large container images (multi-GB layers) being processed concurrently +- Insufficient CPU/memory allocation relative to workload +- All workers busy with no capacity for new jobs +- Worker scaling not keeping up with demand + +## How to Fix + +### Docker Compose +```bash +# Check scanner resource usage +docker stats scanner --no-stream + +# Reduce concurrent jobs to lower resource pressure +# In docker-compose.stella-ops.yml: +``` + +```yaml +services: + scanner: + deploy: + resources: + limits: + memory: 4G + cpus: "4.0" + environment: + Scanner__MaxConcurrentJobs: "2" + Scanner__Workers__Count: "4" +``` + +```bash +# Restart scanner to apply new resource limits +docker compose -f docker-compose.stella-ops.yml up -d scanner +``` + +### Bare Metal / systemd +```bash +# Check current resource usage +top -p $(pgrep -f stellaops-scanner) + +# Reduce concurrent processing +stella scanner config set MaxConcurrentJobs 2 +``` + +Edit `/etc/stellaops/scanner/appsettings.json`: + +```json +{ + "Scanner": { + "MaxConcurrentJobs": 2, + "Workers": { + "Count": 4 + } + } +} +``` + +```bash +sudo systemctl restart stellaops-scanner +``` + +### Kubernetes / Helm +```bash +# Check pod resource usage +kubectl top pods -l app=stellaops-scanner + +# Scale horizontally instead of vertically +kubectl scale deployment stellaops-scanner --replicas=4 +``` + +Set in Helm `values.yaml`: + +```yaml +scanner: + replicas: 4 + resources: + requests: + memory: 2Gi + cpu: "2" + limits: + memory: 4Gi + cpu: "4" + maxConcurrentJobs: 2 +``` + +## Verification +``` +stella doctor run --check check.scanner.resources +``` + +## Related Checks +- `check.scanner.queue` -- resource exhaustion causes queue backlog growth +- `check.scanner.sbom` -- memory exhaustion causes SBOM generation failures +- `check.scanner.reachability` -- CPU constraints slow 
computation times +- `check.scanner.slice.cache` -- cache effectiveness reduces resource demand diff --git a/docs/doctor/articles/scanner/sbom.md b/docs/doctor/articles/scanner/sbom.md new file mode 100644 index 000000000..e2884bcff --- /dev/null +++ b/docs/doctor/articles/scanner/sbom.md @@ -0,0 +1,106 @@ +--- +checkId: check.scanner.sbom +plugin: stellaops.doctor.scanner +severity: warn +tags: [scanner, sbom, cyclonedx, spdx, compliance] +--- +# SBOM Generation Health + +## What It Checks +Queries the Scanner service at `/api/v1/sbom/stats` and evaluates SBOM generation health: + +- **Success rate**: warn when below 95%, fail when below 80%. +- **Validation failures**: any schema validation failures trigger a warning regardless of success rate. + +Evidence collected: `total_generated`, `successful_generations`, `failed_generations`, `success_rate`, `format_cyclonedx`, `format_spdx`, `validation_failures`. + +The check requires `Scanner:Url` or `Services:Scanner:Url` to be configured. + +## Why It Matters +SBOMs are the foundation of the entire Stella Ops security pipeline. Without valid SBOMs, vulnerability scanning produces incomplete results, reachability analysis cannot run, and release gates that require an SBOM attestation will block promotions. Compliance frameworks (e.g., EO 14028, EU CRA) mandate accurate SBOMs for every shipped artifact. 
+ +## Common Causes +- Invalid or corrupted source artifacts (truncated layers, missing manifests) +- Parser errors for specific ecosystems (e.g., unsupported lockfile format) +- Memory exhaustion on large monorepo or multi-module projects +- SBOM schema validation failures due to generator version mismatch +- Unsupported container base image format +- Minor parsing issues in transitive dependency resolution + +## How to Fix + +### Docker Compose +```bash +# View recent SBOM generation failures +docker compose -f docker-compose.stella-ops.yml logs scanner | grep -i "sbom.*fail" + +# Restart the scanner to clear any cached bad state +docker compose -f docker-compose.stella-ops.yml restart scanner + +# Increase memory limit if OOM is suspected +# In docker-compose.stella-ops.yml: +``` + +```yaml +services: + scanner: + deploy: + resources: + limits: + memory: 4G + environment: + Scanner__Sbom__ValidationMode: "Strict" + Scanner__Sbom__MaxArtifactSizeMb: "500" +``` + +### Bare Metal / systemd +```bash +# Check scanner logs for SBOM errors +sudo journalctl -u stellaops-scanner --since "1 hour ago" | grep -i sbom + +# Retry failed SBOMs +stella scanner sbom retry --failed +``` + +Edit `/etc/stellaops/scanner/appsettings.json`: + +```json +{ + "Sbom": { + "ValidationMode": "Strict", + "MaxArtifactSizeMb": 500 + } +} +``` + +### Kubernetes / Helm +```bash +# Check for OOMKilled scanner pods +kubectl get pods -l app=stellaops-scanner -o wide +kubectl describe pod <scanner-pod> | grep -A 5 "Last State" + +# View SBOM-related logs +kubectl logs -l app=stellaops-scanner --tail=200 | grep -i sbom +``` + +Set in Helm `values.yaml`: + +```yaml +scanner: + resources: + limits: + memory: 4Gi + sbom: + validationMode: Strict + maxArtifactSizeMb: 500 +``` + +## Verification +``` +stella doctor run --check check.scanner.sbom +``` + +## Related Checks +- `check.scanner.queue` -- queue backlog can delay SBOM generation +- `check.scanner.witness.graph` -- witness graphs depend on successful SBOM 
output +- `check.scanner.resources` -- resource exhaustion is a top cause of SBOM failures diff --git a/docs/doctor/articles/scanner/slice-cache.md b/docs/doctor/articles/scanner/slice-cache.md new file mode 100644 index 000000000..835481469 --- /dev/null +++ b/docs/doctor/articles/scanner/slice-cache.md @@ -0,0 +1,112 @@ +--- +checkId: check.scanner.slice.cache +plugin: stellaops.doctor.scanner +severity: warn +tags: [scanner, cache, slice, performance] +--- +# Slice Cache Health + +## What It Checks +Queries the Scanner service at `/api/v1/cache/stats` and evaluates slice cache effectiveness: + +- **Storage utilization**: warn at 80% full, fail at 95% full. +- **Hit rate**: warn below 50%, fail below 20%. +- **Eviction rate**: reported as evidence. + +Evidence collected: `hit_rate`, `hits`, `misses`, `entry_count`, `used_bytes`, `total_bytes`, `storage_utilization`, `eviction_rate`. + +The check requires `Scanner:Url` or `Services:Scanner:Url` to be configured. + +## Why It Matters +The slice cache stores pre-computed code slices used by the reachability engine. A healthy cache avoids re-analyzing the same dependency trees on every scan, reducing computation time from seconds to milliseconds. When the cache hit rate drops, reachability computations slow dramatically, causing queue backlog and delayed security feedback. When storage fills up, evictions accelerate and the cache thrashes, making it effectively useless. 
+ +## Common Causes +- Cache size limit too small for the working set of scanned artifacts +- TTL configured too long, preventing eviction of stale entries +- Eviction policy not working (configuration error) +- Unexpected growth in the number of unique slices (new projects onboarded) +- Cache was recently cleared (restart, volume reset) +- Working set larger than cache capacity + +## How to Fix + +### Docker Compose +```bash +# Clear stale cache entries +stella scanner cache prune --stale + +# Warm the cache for active projects +stella scanner cache warm +``` + +Increase cache size in `docker-compose.stella-ops.yml`: + +```yaml +services: + scanner: + environment: + Scanner__Cache__MaxSizeBytes: "4294967296" # 4 GB + Scanner__Cache__TtlHours: "72" + Scanner__Cache__EvictionPolicy: "LRU" + volumes: + - scanner-cache:/data/cache +``` + +### Bare Metal / systemd +```bash +# Check cache directory size +du -sh /var/lib/stellaops/scanner/cache + +# Prune stale entries +stella scanner cache prune --stale + +# Warm cache +stella scanner cache warm +``` + +Edit `/etc/stellaops/scanner/appsettings.json`: + +```json +{ + "Cache": { + "MaxSizeBytes": 4294967296, + "TtlHours": 72, + "EvictionPolicy": "LRU", + "DataPath": "/var/lib/stellaops/scanner/cache" + } +} +``` + +```bash +sudo systemctl restart stellaops-scanner +``` + +### Kubernetes / Helm +```bash +# Check PVC usage for cache volume +kubectl exec -it <scanner-pod> -- df -h /data/cache +``` + +Set in Helm `values.yaml`: + +```yaml +scanner: + cache: + maxSizeBytes: 4294967296 # 4 GB + ttlHours: 72 + evictionPolicy: LRU + persistence: + enabled: true + size: 10Gi + storageClass: fast-ssd +``` + +## Verification +``` +stella doctor run --check check.scanner.slice.cache +``` + +## Related Checks +- `check.scanner.reachability` -- cache misses directly increase computation time +- `check.scanner.resources` -- cache thrashing increases CPU and memory usage +- `check.scanner.queue` -- slow cache performance cascades into queue backlog 
diff --git a/docs/doctor/articles/scanner/vuln.md b/docs/doctor/articles/scanner/vuln.md new file mode 100644 index 000000000..33c6093c6 --- /dev/null +++ b/docs/doctor/articles/scanner/vuln.md @@ -0,0 +1,111 @@ +--- +checkId: check.scanner.vuln +plugin: stellaops.doctor.scanner +severity: warn +tags: [scanner, vulnerability, cve, database] +--- +# Vulnerability Scan Health + +## What It Checks +Queries the Scanner service at `/api/v1/vuln/stats` and evaluates vulnerability scanning health, focusing on database freshness: + +- **Database freshness**: warn when the vulnerability database is older than 24 hours, fail when older than 72 hours. +- **Scan failure rate**: warn when scan failure rate exceeds 10%. + +Evidence collected: `database_age_hours`, `last_db_update`, `total_cves`, `scans_completed`, `scan_failures`, `failure_rate`, `vulnerabilities_found`. + +The check requires `Scanner:Url` or `Services:Scanner:Url` to be configured. + +## Why It Matters +A stale vulnerability database means newly disclosed CVEs are not detected in scans, creating a false sense of security. Artifacts that pass policy gates with an outdated database may contain exploitable vulnerabilities that would have been caught with current data. In regulated environments, scan freshness is an auditable compliance requirement. High scan failure rates mean some artifacts are not being scanned at all. 
+ +## Common Causes +- Vulnerability database sync job failed or is not scheduled +- Feed source (NVD, OSV, vendor advisory) unavailable or rate-limited +- Network connectivity issue preventing feed downloads +- Scheduled sync delayed due to system overload +- Parsing errors on specific artifact formats +- Unsupported package ecosystem or lockfile format + +## How to Fix + +### Docker Compose +```bash +# Trigger an immediate database sync +stella scanner db sync + +# Check sync job status +stella scanner db status + +# View scanner logs for sync errors +docker compose -f docker-compose.stella-ops.yml logs scanner | grep -i "sync\|feed\|vuln" +``` + +Configure sync schedule in `docker-compose.stella-ops.yml`: + +```yaml +services: + scanner: + environment: + Scanner__VulnDb__SyncIntervalHours: "6" + Scanner__VulnDb__FeedSources: "nvd,osv,github" + Scanner__VulnDb__RetryCount: "3" +``` + +### Bare Metal / systemd +```bash +# Trigger manual sync +stella scanner db sync + +# Check sync schedule +stella scanner db schedule + +# View sync logs +sudo journalctl -u stellaops-scanner --since "24 hours ago" | grep -i "sync\|feed" +``` + +Edit `/etc/stellaops/scanner/appsettings.json`: + +```json +{ + "VulnDb": { + "SyncIntervalHours": 6, + "FeedSources": ["nvd", "osv", "github"], + "RetryCount": 3 + } +} +``` + +### Kubernetes / Helm +```bash +# Check scanner pod logs for sync status +kubectl logs -l app=stellaops-scanner --tail=100 | grep -i sync + +# Verify CronJob for database sync exists and is running +kubectl get cronjobs -l app=stellaops-scanner-sync +``` + +Set in Helm `values.yaml`: + +```yaml +scanner: + vulnDb: + syncIntervalHours: 6 + feedSources: + - nvd + - osv + - github + retryCount: 3 + syncCronJob: + schedule: "0 */6 * * *" +``` + +## Verification +``` +stella doctor run --check check.scanner.vuln +``` + +## Related Checks +- `check.scanner.queue` -- scan failures may originate from queue processing issues +- `check.scanner.sbom` -- vulnerability matching 
depends on SBOM quality +- `check.scanner.reachability` -- reachability analysis uses vulnerability data to filter findings diff --git a/docs/doctor/articles/scanner/witness-graph.md b/docs/doctor/articles/scanner/witness-graph.md new file mode 100644 index 000000000..c7b2c72c1 --- /dev/null +++ b/docs/doctor/articles/scanner/witness-graph.md @@ -0,0 +1,108 @@ +--- +checkId: check.scanner.witness.graph +plugin: stellaops.doctor.scanner +severity: warn +tags: [scanner, witness, graph, reachability, evidence] +--- +# Witness Graph Health + +## What It Checks +Queries the Scanner service at `/api/v1/witness/stats` and evaluates witness graph construction health: + +- **Construction failures**: fail if failure rate exceeds 10% of total constructions. +- **Incomplete graphs**: warn if any graphs are incomplete (missing nodes or edges). +- **Consistency errors**: warn if any consistency errors are detected (orphaned nodes, version mismatches). + +Evidence collected: `total_constructed`, `construction_failures`, `failure_rate`, `incomplete_graphs`, `avg_nodes_per_graph`, `avg_edges_per_graph`, `avg_completeness`, `consistency_errors`. + +The check requires `Scanner:Url` or `Services:Scanner:Url` to be configured. + +## Why It Matters +Witness graphs are the evidence artifacts that prove how a vulnerability reachability verdict was reached. They record the call chain from application entry point to vulnerable function. Without intact witness graphs, reachability findings lack provenance, attestation of scan results is weakened, and auditors cannot verify that "unreachable" verdicts are legitimate. Incomplete or inconsistent graphs can cause incorrect reachability conclusions. 
+ +## Common Causes +- Missing SBOM input (SBOM generation failed for the artifact) +- Parser error on specific artifact types or ecosystems +- Cyclical dependency detected causing infinite traversal +- Resource exhaustion during graph construction on large projects +- Partial SBOM data (some dependencies resolved, others missing) +- Missing transitive dependencies in the dependency tree +- Version mismatch between SBOM and slice data +- Orphaned nodes from stale cache entries + +## How to Fix + +### Docker Compose +```bash +# View recent construction failures +docker compose -f docker-compose.stella-ops.yml logs scanner | grep -i "witness.*fail\|graph.*error" + +# Rebuild failed graphs +stella scanner witness rebuild --failed + +# Check SBOM pipeline health (witness graphs depend on SBOMs) +stella doctor run --check check.scanner.sbom +``` + +```yaml +services: + scanner: + environment: + Scanner__WitnessGraph__MaxDepth: "50" + Scanner__WitnessGraph__TimeoutMs: "30000" + Scanner__WitnessGraph__ConsistencyCheckEnabled: "true" +``` + +### Bare Metal / systemd +```bash +# View construction errors +sudo journalctl -u stellaops-scanner --since "1 hour ago" | grep -i witness + +# Rebuild failed graphs +stella scanner witness rebuild --failed + +# View graph statistics +stella scanner witness stats +``` + +Edit `/etc/stellaops/scanner/appsettings.json`: + +```json +{ + "WitnessGraph": { + "MaxDepth": 50, + "TimeoutMs": 30000, + "ConsistencyCheckEnabled": true + } +} +``` + +### Kubernetes / Helm +```bash +# Check scanner logs for witness graph issues +kubectl logs -l app=stellaops-scanner --tail=200 | grep -i witness + +# Rebuild failed graphs +kubectl exec -it <scanner-pod> -- stella scanner witness rebuild --failed +``` + +Set in Helm `values.yaml`: + +```yaml +scanner: + witnessGraph: + maxDepth: 50 + timeoutMs: 30000 + consistencyCheckEnabled: true +``` + +## Verification +``` +stella doctor run --check check.scanner.witness.graph +``` + +## Related Checks +- `check.scanner.sbom`
-- witness graphs are constructed from SBOM data +- `check.scanner.reachability` -- reachability verdicts depend on witness graph integrity +- `check.scanner.slice.cache` -- stale cache entries can cause consistency errors +- `check.scanner.resources` -- resource exhaustion causes construction failures diff --git a/docs/doctor/articles/security/apikey.md b/docs/doctor/articles/security/apikey.md new file mode 100644 index 000000000..215cbe5ca --- /dev/null +++ b/docs/doctor/articles/security/apikey.md @@ -0,0 +1,83 @@ +--- +checkId: check.security.apikey +plugin: stellaops.doctor.security +severity: warn +tags: [security, apikey, authentication] +--- +# API Key Security + +## What It Checks +Validates API key configuration and security practices. The check only runs when an API key configuration section exists (`ApiKey`, `Authentication:ApiKey`, or `Security:ApiKey`). It inspects: + +| Setting | Threshold/Condition | Issue | +|---|---|---| +| `MinLength` | Less than 32 characters | Key too short (escalates to `fail` if < 16) | +| `AllowInQueryString` | `true` | Keys in query strings get logged in access logs | +| `HeaderName` | Equals `Authorization` | Conflicts with other auth schemes | +| `RateLimitPerKey` | `false` or not set | Compromised key could abuse the API without limits | +| `RotationDays` | Not set | No rotation policy configured | +| `RotationDays` | Greater than 365 | Rotation period is very long | + +If API key authentication is explicitly disabled (`Enabled: false`), the check reports an informational result and exits. + +## Why It Matters +API keys are the primary authentication mechanism for service-to-service communication and CI/CD integrations. Short keys can be brute-forced. Keys passed in query strings are recorded in web server access logs, proxy logs, and browser history, creating exposure vectors. Without per-key rate limiting, a compromised key allows unlimited API abuse. Without rotation, a leaked key remains valid indefinitely. 
+ +## Common Causes +- Minimum API key length configured below 32 characters +- API keys allowed in query strings (`AllowInQueryString: true`) +- Using the `Authorization` header for API keys, conflicting with JWT/OAuth +- Per-key rate limiting not enabled +- API key rotation policy not configured or set to more than 365 days + +## How to Fix + +### Docker Compose +Set API key security configuration in environment variables: + +```yaml +environment: + ApiKey__MinLength: "32" + ApiKey__AllowInQueryString: "false" + ApiKey__HeaderName: "X-API-Key" + ApiKey__RateLimitPerKey: "true" + ApiKey__RotationDays: "90" +``` + +### Bare Metal / systemd +Edit `appsettings.json`: + +```json +{ + "ApiKey": { + "Enabled": true, + "MinLength": 32, + "HeaderName": "X-API-Key", + "AllowInQueryString": false, + "RateLimitPerKey": true, + "RotationDays": 90 + } +} +``` + +### Kubernetes / Helm +Set in Helm values: + +```yaml +apiKey: + minLength: 32 + headerName: "X-API-Key" + allowInQueryString: false + rateLimitPerKey: true + rotationDays: 90 +``` + +## Verification +``` +stella doctor run --check check.security.apikey +``` + +## Related Checks +- `check.security.ratelimit` — validates global rate limiting configuration +- `check.security.secrets` — ensures API keys are not stored as plain text +- `check.core.auth.config` — validates overall authentication configuration diff --git a/docs/doctor/articles/security/audit-logging.md b/docs/doctor/articles/security/audit-logging.md new file mode 100644 index 000000000..7702512f7 --- /dev/null +++ b/docs/doctor/articles/security/audit-logging.md @@ -0,0 +1,93 @@ +--- +checkId: check.security.audit.logging +plugin: stellaops.doctor.security +severity: warn +tags: [security, audit, logging] +--- +# Audit Logging + +## What It Checks +Validates that audit logging is enabled and properly configured for security events. 
The check inspects configuration under `Audit:*`, `Security:Audit:*`, and `Logging:Audit:*` sections: + +| Setting | Expected | Issue if not met | +|---|---|---| +| `Enabled` | `true` | Audit logging explicitly disabled or not configured | +| `LogAuthenticationEvents` | `true` | Authentication events not being logged | +| `LogAdministrativeEvents` | `true` | Admin actions not being logged | +| `Destination` | Non-empty | Audit log destination not configured | + +The check also reads `LogAccessEvents` (data access logging) for reporting, but does not flag it as an issue since it defaults to `false` and is optional. + +If audit logging is explicitly disabled (`Enabled: false`), the check warns and recommends enabling it. If `Enabled` is not set at all, it flags this as a potential gap. + +## Why It Matters +Audit logging is a compliance requirement for security frameworks (SOC 2, ISO 27001, FedRAMP). Without audit logs: + +- Authentication failures and brute-force attempts go undetected. +- Administrative actions (user creation, permission changes, policy modifications) are untraceable. +- Incident response has no forensic evidence. +- Release decisions and approval workflows cannot be reconstructed. + +Stella Ops is a release control plane where every decision must be auditable. Missing audit logs undermine the core value proposition. 
+ +## Common Causes +- Audit logging disabled in configuration +- Audit logging configuration not found (never explicitly enabled) +- Authentication event logging turned off +- Administrative event logging turned off +- Audit log destination not configured (logs go nowhere) + +## How to Fix + +### Docker Compose +Add audit configuration to environment variables: + +```yaml +environment: + Audit__Enabled: "true" + Audit__LogAuthenticationEvents: "true" + Audit__LogAdministrativeEvents: "true" + Audit__LogAccessEvents: "true" + Audit__Destination: "database" +``` + +### Bare Metal / systemd +Edit `appsettings.json`: + +```json +{ + "Audit": { + "Enabled": true, + "LogAuthenticationEvents": true, + "LogAccessEvents": true, + "LogAdministrativeEvents": true, + "Destination": "database" + } +} +``` + +Restart the service: +```bash +sudo systemctl restart stellaops-platform +``` + +### Kubernetes / Helm +Set in Helm values: + +```yaml +audit: + enabled: true + logAuthenticationEvents: true + logAccessEvents: true + logAdministrativeEvents: true + destination: "database" +``` + +## Verification +``` +stella doctor run --check check.security.audit.logging +``` + +## Related Checks +- `check.security.secrets` — ensures audit log credentials are not exposed +- `check.core.config.loaded` — audit logging depends on configuration being loaded diff --git a/docs/doctor/articles/security/cors.md b/docs/doctor/articles/security/cors.md new file mode 100644 index 000000000..7d18923e0 --- /dev/null +++ b/docs/doctor/articles/security/cors.md @@ -0,0 +1,88 @@ +--- +checkId: check.security.cors +plugin: stellaops.doctor.security +severity: warn +tags: [security, cors, web] +--- +# CORS Configuration + +## What It Checks +Validates Cross-Origin Resource Sharing (CORS) security settings. 
The check inspects `Cors:*` and `Security:Cors:*` configuration sections: + +| Condition | Severity | Issue | +|---|---|---| +| `AllowAnyOrigin` is `true` | `fail` | Any origin can make cross-origin requests | +| `AllowAnyOrigin` + `AllowCredentials` both true | `fail` | Critical: any origin can send credentialed requests | +| Wildcard `*` in `AllowedOrigins` array | `warn` | Wildcard provides no protection | +| No allowed origins configured | `warn` | CORS origins not explicitly defined | +| Non-HTTPS origin (except localhost/127.0.0.1) | `warn` | Non-HTTPS origins are insecure | + +Evidence collected includes: allowed origins list (up to 5), `AllowCredentials` flag, `AllowAnyOrigin` flag, and configured methods. + +## Why It Matters +Overly permissive CORS configuration allows malicious websites to make authenticated API requests on behalf of logged-in users. If `AllowAnyOrigin` is combined with `AllowCredentials`, an attacker's site can read responses from the Stella Ops API using the victim's session cookies. This can lead to data exfiltration, unauthorized release approvals, or policy modifications. 
+ +## Common Causes +- CORS allows any origin (`AllowAnyOrigin: true`) -- common in development, dangerous in production +- CORS wildcard origin `*` configured in the allowed origins list +- CORS allows any origin with credentials enabled simultaneously +- Allowed origins include non-HTTPS URLs in production +- No CORS allowed origins configured at all + +## How to Fix + +### Docker Compose +Set explicit CORS origins in environment variables: + +```yaml +environment: + Cors__AllowAnyOrigin: "false" + Cors__AllowCredentials: "true" + Cors__AllowedOrigins__0: "https://stella-ops.local" + Cors__AllowedOrigins__1: "https://console.stella-ops.local" + Cors__AllowedMethods__0: "GET" + Cors__AllowedMethods__1: "POST" + Cors__AllowedMethods__2: "PUT" + Cors__AllowedMethods__3: "DELETE" +``` + +### Bare Metal / systemd +Edit `appsettings.json`: + +```json +{ + "Cors": { + "AllowAnyOrigin": false, + "AllowCredentials": true, + "AllowedOrigins": [ + "https://stella-ops.yourdomain.com" + ], + "AllowedMethods": ["GET", "POST", "PUT", "DELETE"] + } +} +``` + +### Kubernetes / Helm +Set in Helm values: + +```yaml +cors: + allowAnyOrigin: false + allowCredentials: true + allowedOrigins: + - "https://stella-ops.yourdomain.com" + allowedMethods: + - GET + - POST + - PUT + - DELETE +``` + +## Verification +``` +stella doctor run --check check.security.cors +``` + +## Related Checks +- `check.security.headers` — validates other HTTP security headers (HSTS, CSP, X-Frame-Options) +- `check.core.auth.config` — authentication must complement CORS to prevent unauthorized access diff --git a/docs/doctor/articles/security/encryption.md b/docs/doctor/articles/security/encryption.md new file mode 100644 index 000000000..c971640fa --- /dev/null +++ b/docs/doctor/articles/security/encryption.md @@ -0,0 +1,94 @@ +--- +checkId: check.security.encryption +plugin: stellaops.doctor.security +severity: warn +tags: [security, encryption, cryptography] +--- +# Encryption Keys + +## What It Checks 
+Validates encryption key configuration and algorithms. The check only runs when an encryption configuration section exists (`Encryption`, `DataProtection`, or `Cryptography`). It inspects: + +| Setting | Threshold/Condition | Severity | +|---|---|---| +| `Algorithm` | Contains DES, 3DES, RC4, MD5, or SHA1 | `fail` — weak algorithm | +| `KeySize` | Less than 128 bits | `fail` — key too small | +| `KeyRotationDays` | Greater than 365 | `warn` — infrequent rotation | +| `DataProtection:KeysPath` | Directory does not exist | `warn` — keys path missing | + +Defaults if not explicitly configured: algorithm is `AES-256`. + +Evidence collected includes: configured algorithm, key size, key rotation period, and data protection keys path. + +## Why It Matters +Encryption protects data at rest and data protection keys used by ASP.NET Core for cookie encryption, anti-forgery tokens, and TempData. Weak algorithms (DES, 3DES, RC4) have known vulnerabilities and can be broken with modern hardware. Small key sizes reduce the keyspace, making brute-force attacks feasible. Without key rotation, a compromised key provides indefinite access to all encrypted data. 
+ +## Common Causes +- Weak encryption algorithm configured (DES, 3DES, RC4, MD5, SHA1) +- Encryption key size too small (less than 128 bits) +- Key rotation period greater than 365 days or not configured +- Data protection keys directory does not exist on disk + +## How to Fix + +### Docker Compose +Set encryption configuration: + +```yaml +environment: + Encryption__Algorithm: "AES-256" + Encryption__KeySize: "256" + Encryption__KeyRotationDays: "90" + DataProtection__KeysPath: "/app/keys" + +volumes: + - stellaops-keys:/app/keys +``` + +### Bare Metal / systemd +Edit `appsettings.json`: + +```json +{ + "Encryption": { + "Algorithm": "AES-256", + "KeySize": 256, + "KeyRotationDays": 90 + }, + "DataProtection": { + "KeysPath": "/var/lib/stellaops/keys" + } +} +``` + +Create the keys directory: +```bash +sudo mkdir -p /var/lib/stellaops/keys +sudo chown stellaops:stellaops /var/lib/stellaops/keys +sudo chmod 700 /var/lib/stellaops/keys +``` + +### Kubernetes / Helm +Set in Helm values and use a PersistentVolume for key storage: + +```yaml +encryption: + algorithm: "AES-256" + keySize: 256 + keyRotationDays: 90 + +dataProtection: + persistentVolume: + enabled: true + size: "100Mi" +``` + +## Verification +``` +stella doctor run --check check.security.encryption +``` + +## Related Checks +- `check.core.crypto.available` — verifies cryptographic algorithms are available at the OS level +- `check.security.secrets` — ensures encryption keys are not stored as plain text in configuration +- `check.security.tls.certificate` — validates TLS certificate for encryption in transit diff --git a/docs/doctor/articles/security/evidence-integrity.md b/docs/doctor/articles/security/evidence-integrity.md new file mode 100644 index 000000000..967494125 --- /dev/null +++ b/docs/doctor/articles/security/evidence-integrity.md @@ -0,0 +1,111 @@ +--- +checkId: check.security.evidence.integrity +plugin: stellaops.doctor.security +severity: fail +tags: [security, evidence, integrity, dsse, 
rekor, offline] +--- +# Evidence Integrity + +## What It Checks +Validates DSSE signatures, Rekor inclusion proofs, and evidence hash consistency for files in the evidence locker. The check only runs when `EvidenceLocker:LocalPath` or `Evidence:BasePath` is configured and the directory exists. + +The check scans up to **100 evidence files** (`.json` and `.dsse`) and performs structural verification on three evidence formats: + +### DSSE Envelopes +- Payload must be valid base64. +- At least one signature must exist. +- Each signature must have `keyid` and `sig` fields, with `sig` being valid base64. +- If `payloadDigest` is present, verifies SHA-256 digest matches the payload bytes. + +### Evidence Bundles +- Manifest must have a `version` field. +- If `rekorReceipt` is present, validates the Rekor receipt structure. + +### Rekor Receipts +- Must have non-empty `uuid`. +- Must have numeric `logIndex`. +- Must have `inclusionProof` with a non-empty `hashes` array. + +### Content Digest +- Must have algorithm prefix (`sha256:` or `sha512:`). + +Files that don't match any known format are skipped. Files that fail to parse as JSON are marked invalid. + +## Why It Matters +Evidence integrity is the foundation of Stella Ops' auditability guarantee. Every release decision, scan result, and policy evaluation is recorded as signed evidence. If evidence files are tampered with, the entire audit trail becomes untrustworthy. Broken DSSE signatures mean attestations may have been modified after signing. Missing or invalid Rekor inclusion proofs mean the transparency log cannot verify the evidence was recorded. 
+ +## Common Causes +- Evidence files may have been tampered with or corrupted +- DSSE signatures are invalid (payload was modified after signing) +- Evidence digests do not match content (partial writes, disk corruption) +- Rekor inclusion proofs are invalid or missing required fields +- Evidence locker directory does not exist or has not been initialized + +## How to Fix + +### Docker Compose +Verify the evidence locker path is configured and accessible: + +```yaml +environment: + EvidenceLocker__LocalPath: "/data/evidence" + +volumes: + - stellaops-evidence:/data/evidence +``` + +Investigate invalid files: +```bash +# List evidence files +docker compose exec platform ls -la /data/evidence/ + +# Check a specific file +docker compose exec platform cat /data/evidence/<evidence-file>.json | jq +``` + +Re-generate affected evidence: +```bash +# Re-scan and re-sign evidence bundles +docker compose exec platform stella evidence regenerate --path /data/evidence/ +``` + +### Bare Metal / systemd +```bash +# Create the evidence directory if missing +mkdir -p /var/lib/stellaops/evidence +chown stellaops:stellaops /var/lib/stellaops/evidence + +# Verify file integrity +sha256sum /var/lib/stellaops/evidence/*.json + +# Check Rekor entries +rekor-cli get --uuid <entry-uuid> +``` + +### Kubernetes / Helm +Ensure evidence is stored on a persistent volume: + +```yaml +evidenceLocker: + localPath: "/data/evidence" + persistentVolume: + enabled: true + size: "10Gi" + storageClass: "standard" +``` + +Verify inside the pod: +```bash +kubectl exec -it <platform-pod> -- ls -la /data/evidence/ +kubectl exec -it <platform-pod> -- stella doctor run --check check.security.evidence.integrity +``` + +## Verification +``` +stella doctor run --check check.security.evidence.integrity +``` + +## Related Checks +- `check.security.encryption` — validates encryption keys used for evidence signing +- `check.core.crypto.available` — SHA-256 must be available for digest verification +- `check.core.env.diskspace` — insufficient disk space can cause
incomplete evidence writes diff --git a/docs/doctor/articles/security/headers.md b/docs/doctor/articles/security/headers.md new file mode 100644 index 000000000..6694cfc5e --- /dev/null +++ b/docs/doctor/articles/security/headers.md @@ -0,0 +1,109 @@ +--- +checkId: check.security.headers +plugin: stellaops.doctor.security +severity: warn +tags: [security, headers, web] +--- +# Security Headers + +## What It Checks +Validates that HTTP security headers are properly configured. The check inspects `Security:Headers:*` and `Headers:*` configuration sections for five critical headers: + +| Header | Setting | Issue if missing/wrong | +|---|---|---| +| **HSTS** | `Hsts:Enabled` | Not enabled — browsers won't enforce HTTPS | +| **X-Frame-Options** | `XFrameOptions` | Not configured — clickjacking vulnerability | +| **X-Frame-Options** | Set to `ALLOWALL` | Provides no protection | +| **Content-Security-Policy** | `ContentSecurityPolicy` / `Csp` | Not configured — XSS and injection risks | +| **X-Content-Type-Options** | `XContentTypeOptions` | Not enabled — MIME type sniffing vulnerability | +| **Referrer-Policy** | `ReferrerPolicy` | Not configured — referrer information leaks | + +The check reports a warning listing all unconfigured headers. + +## Why It Matters +Security headers are a defense-in-depth measure that protects against common web attacks: + +- **HSTS**: Forces browsers to use HTTPS, preventing SSL-stripping attacks. +- **X-Frame-Options**: Prevents the UI from being embedded in iframes on malicious sites (clickjacking). +- **Content-Security-Policy**: Prevents cross-site scripting (XSS) and other code injection attacks. +- **X-Content-Type-Options**: Prevents browsers from interpreting files as a different MIME type. +- **Referrer-Policy**: Controls how much referrer information is included with requests, preventing data leaks. 
+ +## Common Causes +- HSTS not enabled (common in development environments) +- X-Frame-Options header not configured or set to ALLOWALL +- Content-Security-Policy header not defined +- X-Content-Type-Options: nosniff not enabled +- Referrer-Policy header not configured +- Security headers middleware not added to the ASP.NET Core pipeline + +## How to Fix + +### Docker Compose +Set security headers via environment variables: + +```yaml +environment: + Security__Headers__Hsts__Enabled: "true" + Security__Headers__XFrameOptions: "DENY" + Security__Headers__ContentSecurityPolicy: "default-src 'self'; script-src 'self'; style-src 'self' 'unsafe-inline'" + Security__Headers__XContentTypeOptions__Enabled: "true" + Security__Headers__ReferrerPolicy: "strict-origin-when-cross-origin" +``` + +### Bare Metal / systemd +Edit `appsettings.json`: + +```json +{ + "Security": { + "Headers": { + "Hsts": { + "Enabled": true + }, + "XFrameOptions": "DENY", + "ContentSecurityPolicy": "default-src 'self'; script-src 'self'; style-src 'self' 'unsafe-inline'", + "XContentTypeOptions": { + "Enabled": true + }, + "ReferrerPolicy": "strict-origin-when-cross-origin" + } + } +} +``` + +### Kubernetes / Helm +Set in Helm values: + +```yaml +security: + headers: + hsts: + enabled: true + xFrameOptions: "DENY" + contentSecurityPolicy: "default-src 'self'; script-src 'self'; style-src 'self' 'unsafe-inline'" + xContentTypeOptions: + enabled: true + referrerPolicy: "strict-origin-when-cross-origin" +``` + +Alternatively, configure at the ingress level: + +```yaml +ingress: + annotations: + nginx.ingress.kubernetes.io/configuration-snippet: | + add_header X-Frame-Options "DENY" always; + add_header X-Content-Type-Options "nosniff" always; + add_header Referrer-Policy "strict-origin-when-cross-origin" always; + add_header Strict-Transport-Security "max-age=31536000; includeSubDomains" always; +``` + +## Verification +``` +stella doctor run --check check.security.headers +``` + +## Related Checks +- 
`check.security.cors` — CORS headers are another critical web security mechanism +- `check.security.tls.certificate` — HSTS requires a valid TLS certificate diff --git a/docs/doctor/articles/security/jwt-config.md b/docs/doctor/articles/security/jwt-config.md new file mode 100644 index 000000000..aee75239a --- /dev/null +++ b/docs/doctor/articles/security/jwt-config.md @@ -0,0 +1,104 @@ +--- +checkId: check.security.jwt.config +plugin: stellaops.doctor.security +severity: fail +tags: [security, jwt, authentication] +--- +# JWT Configuration + +## What It Checks +Validates JWT token signing and validation configuration. The check only runs when a JWT configuration section exists (`Jwt` or `Authentication:Jwt`). It inspects: + +| Setting | Threshold/Condition | Severity | +|---|---|---| +| `SigningKey` | Not configured | `fail` | +| `SigningKey` | Shorter than 32 characters | `fail` | +| `Issuer` | Not configured | `fail` | +| `Audience` | Not configured | `fail` | +| `ExpirationMinutes` | Greater than 1440 (24 hours) | `warn` | +| `Algorithm` | `none` | `fail` — completely insecure | +| `Algorithm` | `HS256` | `warn` — acceptable but RS256/ES256 recommended | + +Default values if not explicitly set: `ExpirationMinutes` = 60, `Algorithm` = HS256. + +Evidence collected includes: whether a signing key is configured, key length, issuer, audience, expiration minutes, and algorithm. + +## Why It Matters +JWT tokens are the primary authentication mechanism for API access. A missing or short signing key allows token forgery. The `none` algorithm disables signature verification entirely. Missing issuer or audience values disable critical validation claims, allowing tokens from other systems to be accepted. Long expiration times increase the window of opportunity if a token is compromised. 
+ +## Common Causes +- JWT signing key is not configured in the deployment +- JWT signing key is too short (fewer than 32 characters) +- JWT issuer or audience not configured +- JWT expiration time set too long (more than 24 hours) +- Using algorithm `none` which disables all signature verification +- Using HS256 symmetric algorithm when asymmetric (RS256/ES256) would be more secure + +## How to Fix + +### Docker Compose +Set JWT configuration as environment variables: + +```yaml +environment: + Jwt__SigningKey: "<your-signing-key>" + Jwt__Issuer: "https://stella-ops.local" + Jwt__Audience: "stellaops-api" + Jwt__ExpirationMinutes: "60" + Jwt__Algorithm: "RS256" +``` + +Generate a strong signing key: +```bash +openssl rand -base64 48 +``` + +### Bare Metal / systemd +Edit `appsettings.json`: + +```json +{ + "Jwt": { + "SigningKey": "<your-signing-key>", + "Issuer": "https://stella-ops.yourdomain.com", + "Audience": "stellaops-api", + "ExpirationMinutes": 60, + "Algorithm": "RS256" + } +} +``` + +For RS256, generate a key pair: +```bash +openssl genrsa -out jwt-private.pem 2048 +openssl rsa -in jwt-private.pem -pubout -out jwt-public.pem +``` + +### Kubernetes / Helm +Store the signing key as a Kubernetes Secret: + +```bash +kubectl create secret generic stellaops-jwt \ + --from-literal=signing-key="$(openssl rand -base64 48)" +``` + +Reference in Helm values: + +```yaml +jwt: + issuer: "https://stella-ops.yourdomain.com" + audience: "stellaops-api" + expirationMinutes: 60 + algorithm: "RS256" + signingKeySecret: "stellaops-jwt" +``` + +## Verification +``` +stella doctor run --check check.security.jwt.config +``` + +## Related Checks +- `check.core.auth.config` — validates broader authentication configuration including JWT +- `check.security.secrets` — ensures the JWT signing key is not stored as plain text +- `check.security.tls.certificate` — TLS protects JWT tokens in transit diff --git a/docs/doctor/articles/security/password-policy.md b/docs/doctor/articles/security/password-policy.md new file
mode 100644 index 000000000..e186d1e5f --- /dev/null +++ b/docs/doctor/articles/security/password-policy.md @@ -0,0 +1,95 @@ +--- +checkId: check.security.password.policy +plugin: stellaops.doctor.security +severity: warn +tags: [security, password, authentication] +--- +# Password Policy + +## What It Checks +Validates password requirements meet security standards. The check only runs when a password policy configuration section exists (`Identity:Password`, `Password`, or `Security:Password`). It inspects: + +| Setting | Threshold | Severity | +|---|---|---| +| `RequiredLength` / `MinLength` | Less than 8 | `fail` (if < 6), otherwise `warn` | +| `RequiredLength` / `MinLength` | Less than 12 | `warn` — 12+ recommended | +| `RequireDigit` | `false` | `warn` | +| `RequireLowercase` | `false` | `warn` | +| `RequireUppercase` | `false` | `warn` | +| `RequireNonAlphanumeric` / `RequireSpecialChar` | `false` | `warn` | +| `MaxFailedAccessAttempts` / `MaxAttempts` | Greater than 10 | `warn` | +| `DefaultLockoutTimeSpan` / `DurationMinutes` | Less than 1 minute | `warn` | + +Default values if not explicitly set: min length = 8, require digit/lowercase/uppercase/special = true, max failed attempts = 5, lockout duration = 5 minutes. + +## Why It Matters +Weak password policies enable brute-force and credential-stuffing attacks. Short passwords with low complexity can be cracked quickly with dictionary attacks. Without account lockout or with too many allowed attempts, automated attacks can run indefinitely. In a release control plane, compromised credentials could lead to unauthorized release approvals, policy changes, or data exfiltration. 
+ +## Common Causes +- Minimum password length set too short (below 8 characters) +- Password complexity requirements disabled (no digit, uppercase, lowercase, or special character requirement) +- Maximum failed login attempts too high (above 10), allowing extended brute-force +- Account lockout duration too short (less than 1 minute) + +## How to Fix + +### Docker Compose +Set password policy via environment variables: + +```yaml +environment: + Identity__Password__RequiredLength: "12" + Identity__Password__RequireDigit: "true" + Identity__Password__RequireLowercase: "true" + Identity__Password__RequireUppercase: "true" + Identity__Password__RequireNonAlphanumeric: "true" + Identity__Lockout__MaxFailedAccessAttempts: "5" + Identity__Lockout__DefaultLockoutTimeSpan: "15" +``` + +### Bare Metal / systemd +Edit `appsettings.json`: + +```json +{ + "Identity": { + "Password": { + "RequiredLength": 12, + "RequireDigit": true, + "RequireLowercase": true, + "RequireUppercase": true, + "RequireNonAlphanumeric": true + }, + "Lockout": { + "MaxFailedAccessAttempts": 5, + "DefaultLockoutTimeSpan": 15 + } + } +} +``` + +### Kubernetes / Helm +Set in Helm values: + +```yaml +identity: + password: + requiredLength: 12 + requireDigit: true + requireLowercase: true + requireUppercase: true + requireNonAlphanumeric: true + lockout: + maxFailedAccessAttempts: 5 + defaultLockoutTimeSpan: 15 +``` + +## Verification +``` +stella doctor run --check check.security.password.policy +``` + +## Related Checks +- `check.core.auth.config` — validates overall authentication configuration +- `check.security.audit.logging` — authentication failure events should be logged +- `check.security.ratelimit` — rate limiting provides an additional layer of brute-force protection diff --git a/docs/doctor/articles/security/ratelimit.md b/docs/doctor/articles/security/ratelimit.md new file mode 100644 index 000000000..eb95e451f --- /dev/null +++ b/docs/doctor/articles/security/ratelimit.md @@ -0,0 +1,99 @@ 
+--- +checkId: check.security.ratelimit +plugin: stellaops.doctor.security +severity: warn +tags: [security, ratelimit, api] +--- +# Rate Limiting + +## What It Checks +Validates that rate limiting is configured to prevent API abuse. The check inspects `RateLimiting:*` and `Security:RateLimiting:*` configuration sections: + +| Condition | Result | +|---|---| +| `Enabled` not set at all | `info` — rate limiting configuration not found | +| `Enabled` is `false` | `warn` — rate limiting explicitly disabled | +| `PermitLimit` > 10,000 | `warn` — permit count very high | +| `WindowSeconds` < 1 | `warn` — window too short | +| `WindowSeconds` > 3,600 | `warn` — window too long for burst prevention | +| Effective rate > 1,000 req/s | `warn` — rate may be too permissive | + +The effective rate is calculated as `PermitLimit / WindowSeconds`. + +Default values if not explicitly set: `PermitLimit` = 100, `WindowSeconds` = 60, `QueueLimit` = 0. + +Evidence collected includes: enabled state, permit limit, window seconds, queue limit, and effective requests per second. + +## Why It Matters +Without rate limiting, the API is vulnerable to denial-of-service attacks, credential-stuffing, and resource exhaustion. A single client or compromised API key can overwhelm the service, affecting all users. 
Rate limiting is especially important for: + +- Login endpoints (prevents brute-force attacks) +- Scan submission endpoints (prevents resource exhaustion) +- Evidence upload endpoints (prevents storage exhaustion) + +## Common Causes +- Rate limiting explicitly disabled in configuration +- Rate limiting configuration section not present +- Permit limit set too high (greater than 10,000 per window) +- Rate limit window too short (less than 1 second) or too long (greater than 1 hour) +- Effective rate too permissive (more than 1,000 requests per second) + +## How to Fix + +### Docker Compose +Set rate limiting configuration: + +```yaml +environment: + RateLimiting__Enabled: "true" + RateLimiting__PermitLimit: "100" + RateLimiting__WindowSeconds: "60" + RateLimiting__QueueLimit: "10" +``` + +### Bare Metal / systemd +Edit `appsettings.json`: + +```json +{ + "RateLimiting": { + "Enabled": true, + "PermitLimit": 100, + "WindowSeconds": 60, + "QueueLimit": 10 + } +} +``` + +### Kubernetes / Helm +Set in Helm values: + +```yaml +rateLimiting: + enabled: true + permitLimit: 100 + windowSeconds: 60 + queueLimit: 10 +``` + +For stricter per-endpoint limits, configure additional policies: + +```yaml +rateLimiting: + policies: + login: + permitLimit: 10 + windowSeconds: 300 + scan: + permitLimit: 20 + windowSeconds: 60 +``` + +## Verification +``` +stella doctor run --check check.security.ratelimit +``` + +## Related Checks +- `check.security.apikey` — per-key rate limiting for API key authentication +- `check.security.password.policy` — lockout policy provides complementary brute-force protection diff --git a/docs/doctor/articles/security/secrets.md b/docs/doctor/articles/security/secrets.md new file mode 100644 index 000000000..d3960c1a5 --- /dev/null +++ b/docs/doctor/articles/security/secrets.md @@ -0,0 +1,138 @@ +--- +checkId: check.security.secrets +plugin: stellaops.doctor.security +severity: fail +tags: [security, secrets, configuration] +--- +# Secrets Configuration + 
+## What It Checks +Validates that secrets are properly managed and not exposed as plain text in configuration. The check scans the following configuration keys for potential plain-text secrets: + +| Key | What it protects | +|---|---| +| `Jwt:SigningKey` | JWT token signing | +| `Jwt:Secret` | JWT secret (alternative key) | +| `ApiKey` | API authentication key | +| `ApiSecret` | API secret | +| `S3:SecretKey` | Object storage credentials | +| `Smtp:Password` | Email server credentials | +| `Ldap:Password` | Directory service credentials | +| `Redis:Password` | Cache/message broker credentials | +| `Valkey:Password` | Cache/message broker credentials | + +A value is considered a plain-text secret if it: +1. Is at least 8 characters long. +2. Contains both uppercase and lowercase letters. +3. Contains digits or special characters. +4. Does NOT start with a secrets provider prefix: `vault:`, `azurekv:`, `aws:`, `gcp:`, `${`, or `@Microsoft.KeyVault`. + +The check also examines whether a secrets management provider is configured (`Secrets:Provider`, `KeyVault:Provider`, `Secrets:VaultUrl`, `KeyVault:Url`, `Vault:Address`). A missing secrets manager is only flagged if plain-text secrets are also found. + +Note: Connection strings are intentionally excluded from this check as they are DSNs (host/port/db) and are expected in configuration. + +## Why It Matters +Plain-text secrets in configuration files are a critical security risk. Configuration files are often committed to version control, stored in CI artifacts, or readable by anyone with filesystem access. Leaked secrets enable: + +- Token forgery (JWT signing keys). +- Unauthorized API access (API keys). +- Data access via backend services (database, SMTP, LDAP passwords). +- Lateral movement within the infrastructure. 
+ +## Common Causes +- Secrets stored directly in `appsettings.json` instead of using a secrets provider +- Environment variables containing secrets not sourced from a secrets manager +- Development secrets left in production configuration +- No secrets management provider configured (HashiCorp Vault, Azure Key Vault, etc.) + +## How to Fix + +### Docker Compose +Use Docker secrets or reference an external secrets manager: + +```yaml +services: + platform: + environment: + Jwt__SigningKey: "vault:secret/data/stellaops/jwt#signing_key" + Secrets__Provider: "vault" + Secrets__VaultUrl: "http://vault:8200" + secrets: + - jwt_signing_key + +secrets: + jwt_signing_key: + file: ./secrets/jwt_signing_key.txt +``` + +Or use `dotnet user-secrets` for development: +```bash +dotnet user-secrets set "Jwt:SigningKey" "" +``` + +### Bare Metal / systemd +Configure a secrets provider in `appsettings.json`: + +```json +{ + "Secrets": { + "Provider": "vault", + "VaultUrl": "https://vault.internal:8200", + "UseSecretManager": true + } +} +``` + +Store secrets in the provider instead of config files: +```bash +# HashiCorp Vault +vault kv put secret/stellaops/jwt signing_key="" + +# dotnet user-secrets (development) +dotnet user-secrets set "Jwt:SigningKey" "" +``` + +### Kubernetes / Helm +Store secrets as Kubernetes Secrets: + +```bash +kubectl create secret generic stellaops-secrets \ + --from-literal=jwt-signing-key="" \ + --from-literal=smtp-password="" +``` + +Reference in Helm values: +```yaml +secrets: + provider: "kubernetes" + existingSecret: "stellaops-secrets" +``` + +Or use an external secrets operator (e.g., External Secrets Operator with Vault): +```yaml +apiVersion: external-secrets.io/v1beta1 +kind: ExternalSecret +metadata: + name: stellaops-secrets +spec: + secretStoreRef: + name: vault-backend + kind: ClusterSecretStore + target: + name: stellaops-secrets + data: + - secretKey: jwt-signing-key + remoteRef: + key: secret/stellaops/jwt + property: signing_key +``` + 
+## Verification +``` +stella doctor run --check check.security.secrets +``` + +## Related Checks +- `check.security.jwt.config` — JWT signing key security +- `check.security.encryption` — encryption key management +- `check.security.apikey` — API key security practices diff --git a/docs/doctor/articles/security/tls-certificate.md b/docs/doctor/articles/security/tls-certificate.md new file mode 100644 index 000000000..78d031c14 --- /dev/null +++ b/docs/doctor/articles/security/tls-certificate.md @@ -0,0 +1,114 @@ +--- +checkId: check.security.tls.certificate +plugin: stellaops.doctor.security +severity: fail +tags: [security, tls, certificate] +--- +# TLS Certificate + +## What It Checks +Validates TLS certificate validity and expiration. The check only runs when a certificate path is configured (`Tls:CertificatePath` or `Kestrel:Certificates:Default:Path`). It loads the certificate file and performs the following validations: + +| Condition | Result | +|---|---| +| Certificate file not found | `fail` | +| Certificate cannot be loaded (corrupt, wrong password) | `fail` | +| Certificate not yet valid (`NotBefore` in the future) | `fail` | +| Certificate has expired (`NotAfter` in the past) | `fail` | +| Certificate expires in less than **30 days** | `warn` | +| Certificate valid for 30+ days | `pass` | + +The check supports both PEM certificates and PKCS#12 (.pfx/.p12) files with optional passwords (`Tls:CertificatePassword` or `Kestrel:Certificates:Default:Password`). + +Evidence collected includes: subject, issuer, NotBefore, NotAfter, days until expiry, and thumbprint. + +## Why It Matters +An expired or invalid TLS certificate causes all HTTPS connections to fail. Browsers display security warnings, API clients reject responses, and inter-service communication breaks. In a release control plane, TLS failures prevent: + +- Console access for operators. +- API calls from CI/CD pipelines. +- Inter-service communication via HTTPS. 
+- OIDC authentication flows with the Authority. + +Certificate expiration is one of the most common causes of production outages, and it is entirely preventable with monitoring. + +## Common Causes +- Certificate file path is incorrect or the file was deleted +- Certificate has exceeded its validity period (expired) +- Certificate validity period has not started yet (clock skew or pre-dated certificate) +- Certificate file is corrupted +- Certificate password is incorrect (for PKCS#12 files) +- Certificate format not supported + +## How to Fix + +### Docker Compose +Mount the certificate and configure the path: + +```yaml +services: + platform: + environment: + Tls__CertificatePath: "/app/certs/stellaops.pfx" + Tls__CertificatePassword: "${TLS_CERT_PASSWORD}" + volumes: + - ./certs/stellaops.pfx:/app/certs/stellaops.pfx:ro +``` + +Generate a new self-signed certificate for development: +```bash +openssl req -x509 -newkey rsa:4096 -keyout key.pem -out cert.pem -days 365 -nodes \ + -subj "/CN=stella-ops.local" +openssl pkcs12 -export -out stellaops.pfx -inkey key.pem -in cert.pem +``` + +### Bare Metal / systemd +Renew the certificate (e.g., with Let's Encrypt): +```bash +sudo certbot renew +sudo systemctl restart stellaops-platform +``` + +Or update the configuration with a new certificate: +```bash +# Update appsettings.json +{ + "Tls": { + "CertificatePath": "/etc/ssl/stellaops/cert.pfx", + "CertificatePassword": "" + } +} +``` + +### Kubernetes / Helm +Use cert-manager for automatic certificate management: + +```yaml +apiVersion: cert-manager.io/v1 +kind: Certificate +metadata: + name: stellaops-tls +spec: + secretName: stellaops-tls-secret + issuerRef: + name: letsencrypt-prod + kind: ClusterIssuer + dnsNames: + - stella-ops.yourdomain.com +``` + +Reference in Helm values: +```yaml +tls: + secretName: "stellaops-tls-secret" +``` + +## Verification +``` +stella doctor run --check check.security.tls.certificate +``` + +## Related Checks +- `check.security.headers` — HSTS 
requires a valid TLS certificate +- `check.security.encryption` — validates encryption at rest (TLS handles encryption in transit) +- `check.core.crypto.available` — RSA/ECDSA must be available for certificate operations diff --git a/docs/doctor/articles/storage/backup-directory.md b/docs/doctor/articles/storage/backup-directory.md new file mode 100644 index 000000000..940548398 --- /dev/null +++ b/docs/doctor/articles/storage/backup-directory.md @@ -0,0 +1,80 @@ +--- +checkId: check.storage.backup +plugin: stellaops.doctor.storage +severity: warn +tags: [storage, backup, disaster-recovery] +--- +# Backup Directory Accessibility + +## What It Checks +Verifies backup directory accessibility and recent backup presence. The check: + +- Reads the backup path from `Backup:Path` or `Storage:BackupPath` configuration. +- Verifies the directory exists. +- Tests write access by creating and deleting a temp file. +- Scans for backup files (`.bak`, `.backup`, `.tar`, `.tar.gz`, `.tgz`, `.zip`, `.sql`, `.dump`) in the top-level directory. +- Warns if no backup files are found or if the most recent backup is older than 7 days. +- Fails if the directory exists but is not writable. + +The check only runs when a backup path is configured. + +## Why It Matters +Backups are the last line of defense against data loss. An inaccessible backup directory, missing backups, or stale backups mean the system cannot recover from database corruption, hardware failure, or accidental deletion. The 7-day staleness threshold ensures backups are kept reasonably current. 
+ +## Common Causes +- Backup directory not created yet +- Path misconfigured or remote mount not available +- Insufficient permissions (read-only mount, wrong ownership) +- Backup job has never run or is failing silently +- Backup schedule disabled + +## How to Fix + +### Docker Compose +```yaml +environment: + Backup__Path: "/var/backups/stellaops" +volumes: + - backup-data:/var/backups/stellaops +``` + +```bash +# Create backup directory +docker exec <container> mkdir -p /var/backups/stellaops + +# Run initial backup +docker exec <container> stella backup create --full +``` + +### Bare Metal / systemd +```bash +# Create backup directory +mkdir -p /var/backups/stellaops +chmod 750 /var/backups/stellaops + +# Run initial backup +stella backup create --full + +# Set up a schedule +stella backup schedule create --interval daily +``` + +### Kubernetes / Helm +```yaml +backup: + enabled: true + path: "/var/backups/stellaops" + schedule: "0 3 * * *" + persistence: + enabled: true + size: 100Gi +``` + +## Verification +``` +stella doctor run --check check.storage.backup +``` + +## Related Checks +- `check.storage.diskspace` — verifies sufficient disk space is available +- `check.storage.evidencelocker` — verifies evidence locker write access diff --git a/docs/doctor/articles/storage/disk-space.md b/docs/doctor/articles/storage/disk-space.md new file mode 100644 index 000000000..0026cbe86 --- /dev/null +++ b/docs/doctor/articles/storage/disk-space.md @@ -0,0 +1,84 @@ +--- +checkId: check.storage.diskspace +plugin: stellaops.doctor.storage +severity: fail +tags: [storage, disk, capacity, core] +--- +# Disk Space Availability + +## What It Checks +Verifies disk space availability on drives used by Stella Ops. The check: + +- Identifies paths to check from `Storage:DataPath`, `EvidenceLocker:Path`, `Backup:Path`, and `Logging:Path` configuration (falls back to platform defaults: `/var/lib/stellaops` on Linux, `%ProgramData%\StellaOps` on Windows). 
+- Gets the drive info for each path and calculates usage ratio. +- **Fails at 90%+ usage** (critical threshold) -- the system is at immediate risk of running out of space. +- **Warns at 80%+ usage** (warning threshold) -- approaching capacity. +- Reports the most critically used drive. + +## Why It Matters +Disk exhaustion causes cascading failures: database writes fail, evidence cannot be stored, log rotation breaks, and container operations halt. This is a severity-fail check because disk exhaustion can cause data loss and service outages that are difficult to recover from. + +## Common Causes +- Log files accumulating without rotation +- Evidence artifacts consuming space +- Backup files not rotated or pruned +- Large container images cached on disk +- Normal data growth approaching provisioned capacity + +## How to Fix + +### Docker Compose +```bash +# Check disk usage +docker exec <container> df -h + +# Cleanup old logs +stella storage cleanup --logs --older-than 7d + +# Prune Docker resources +docker system prune -a +docker volume prune +``` + +### Bare Metal / systemd +```bash +# Find large files +du -sh /var/lib/stellaops/* | sort -rh | head -20 + +# Cleanup logs +stella storage cleanup --logs --older-than 7d + +# Cleanup temporary files +stella storage cleanup --temp + +# Review Docker disk usage +docker system df +``` + +### Kubernetes / Helm +```bash +# Check PV usage +kubectl get pv +kubectl exec -it <pod> -- df -h + +# Expand PVC if needed +kubectl edit pvc stellaops-data # increase storage request +``` + +Consider setting up automated cleanup policies: +```yaml +storage: + cleanup: + enabled: true + logRetentionDays: 30 + tempCleanupSchedule: "0 4 * * *" +``` + +## Verification +``` +stella doctor run --check check.storage.diskspace +``` + +## Related Checks +- `check.storage.backup` — verifies backup directory accessibility +- `check.storage.evidencelocker` — verifies evidence locker write access diff --git a/docs/doctor/articles/storage/evidence-locker-write.md 
b/docs/doctor/articles/storage/evidence-locker-write.md new file mode 100644 index 000000000..6ff54eb74 --- /dev/null +++ b/docs/doctor/articles/storage/evidence-locker-write.md @@ -0,0 +1,83 @@ +--- +checkId: check.storage.evidencelocker +plugin: stellaops.doctor.storage +severity: fail +tags: [storage, evidence, write, permissions] +--- +# Evidence Locker Write Access + +## What It Checks +Verifies evidence locker write permissions and performance. The check: + +- Reads the evidence locker path from `EvidenceLocker:Path` or `Storage:EvidencePath`. +- Creates the directory if it does not exist. +- Writes a test file, reads it back to verify content integrity, and measures latency. +- **Fails** if the directory cannot be created, writes are denied (`UnauthorizedAccessException`), or content read-back does not match (storage corruption). +- **Warns** if write latency exceeds 100ms (elevated I/O latency, e.g., slow NFS/CIFS backend). +- Cleans up the test file after the check. + +The check only runs when an evidence locker path is configured. + +## Why It Matters +The evidence locker stores cryptographically signed release evidence -- attestations, SBOM snapshots, policy evaluation results, and audit trails. If the locker is not writable, releases cannot produce verifiable evidence, blocking policy-gated promotions and breaking auditability guarantees. This is a severity-fail check because evidence integrity is a core platform invariant. 
+ +## Common Causes +- Insufficient file system permissions +- Directory owned by a different user +- SELinux/AppArmor blocking writes +- Disk full +- Filesystem mounted read-only +- Slow network-attached storage (NFS/CIFS) causing high latency + +## How to Fix + +### Docker Compose +```yaml +environment: + EvidenceLocker__Path: "/var/lib/stellaops/evidence" +volumes: + - evidence-data:/var/lib/stellaops/evidence +``` + +```bash +# Check permissions inside container +docker exec ls -la /var/lib/stellaops/evidence + +# Fix permissions +docker exec chown -R stellaops:stellaops /var/lib/stellaops/evidence +``` + +### Bare Metal / systemd +```bash +# Create directory +mkdir -p /var/lib/stellaops/evidence + +# Set ownership and permissions +chown -R stellaops:stellaops /var/lib/stellaops/evidence +chmod 750 /var/lib/stellaops/evidence + +# Check disk space +df -h /var/lib/stellaops/evidence + +# Check mount status +mount | grep $(df --output=source /var/lib/stellaops/evidence | tail -1) +``` + +### Kubernetes / Helm +```yaml +evidenceLocker: + path: "/var/lib/stellaops/evidence" + persistence: + enabled: true + size: 50Gi + storageClass: "fast-ssd" # use fast storage to avoid latency warnings +``` + +## Verification +``` +stella doctor run --check check.storage.evidencelocker +``` + +## Related Checks +- `check.storage.diskspace` — verifies sufficient disk space is available +- `check.storage.backup` — verifies backup directory accessibility diff --git a/docs/doctor/articles/timestamping/crl-distribution.md b/docs/doctor/articles/timestamping/crl-distribution.md new file mode 100644 index 000000000..b5a7f14c3 --- /dev/null +++ b/docs/doctor/articles/timestamping/crl-distribution.md @@ -0,0 +1,53 @@ +--- +checkId: check.timestamp.crl.distribution +plugin: stellaops.doctor.timestamping +severity: warn +tags: [timestamping, crl, distribution, revocation] +--- +# CRL Distribution Point Availability + +## What It Checks +Checks that configured CRL distribution points are 
accessible. The check: + +- Gets configured CDPs from the registry. +- Sends a HEAD request to each CDP URL with a 30-second timeout. +- Reports response status, latency, and CRL size (from Content-Length). +- Fails if all CDPs are unavailable. Warns if some are unavailable. +- Passes (healthy) if no CDPs are configured (optional feature). + +## Why It Matters +CRL distribution points provide certificate revocation lists needed to verify that TSA certificates have not been revoked. If CDPs are unavailable, the system cannot download updated CRLs and may therefore accept timestamps from revoked certificates. + +## Common Causes +- CRL distribution point server is down +- Network connectivity issues +- Firewall blocking HTTP/HTTPS to CDP URLs +- CDP URL changed by the CA + +## How to Fix + +### Docker Compose +```bash +docker exec <container> curl -I http://crl.example.com/crl.pem +``` + +### Bare Metal / systemd +```bash +# Test CDP connectivity +curl -I http://crl.example.com/crl.pem + +# Check network and DNS +nslookup crl.example.com +``` + +### Kubernetes / Helm +Ensure egress NetworkPolicies allow traffic to CRL distribution point URLs. + +## Verification +``` +stella doctor run --check check.timestamp.crl.distribution +``` + +## Related Checks +- `check.timestamp.ocsp.responder` — checks OCSP responder availability +- `check.timestamp.revocation.cache-fresh` — checks revocation cache freshness diff --git a/docs/doctor/articles/timestamping/eu-trust-list-fresh.md b/docs/doctor/articles/timestamping/eu-trust-list-fresh.md new file mode 100644 index 000000000..2fb82480a --- /dev/null +++ b/docs/doctor/articles/timestamping/eu-trust-list-fresh.md @@ -0,0 +1,57 @@ +--- +checkId: check.timestamp.eidas.trustlist.fresh +plugin: stellaops.doctor.timestamping +severity: warn +tags: [timestamping, eidas, trustlist, lotl, compliance] +--- +# EU Trust List Freshness + +## What It Checks +Checks that the EU Trust List (LOTL -- List of Trusted Lists) is up-to-date. 
The check: + +- Queries the trust list cache for the last refresh timestamp. +- Fails if the cache does not exist (trust list never fetched). +- Fails if the cache is older than the critical threshold (default 7 days). +- Warns if older than the warning threshold (default 3 days). +- Reports the number of TSPs (Trust Service Providers) and QTS (Qualified TSAs) in the cache. + +## Why It Matters +The EU Trust List is the authoritative source for eIDAS-qualified trust service providers. A stale trust list may not reflect recent provider additions, withdrawals, or status changes, leading to incorrect qualification decisions for TSA providers operating under eIDAS regulation. + +## Common Causes +- Trust list refresh job not running +- Network issues preventing download from the EU publication endpoint +- Air-gapped environment without scheduled trust list updates + +## How to Fix + +### Docker Compose +```bash +docker exec stella trust-list refresh +``` + +### Bare Metal / systemd +```bash +stella trust-list refresh + +# Schedule automatic refresh +stella trust-list schedule --interval 24h +``` + +### Kubernetes / Helm +```yaml +timestamping: + eidas: + trustListRefreshSchedule: "0 6 * * *" + warningAgeDays: 3 + criticalAgeDays: 7 +``` + +## Verification +``` +stella doctor run --check check.timestamp.eidas.trustlist.fresh +``` + +## Related Checks +- `check.timestamp.eidas.qts.qualified` — checks QTS provider qualification status +- `check.timestamp.eidas.qts.status-change` — alerts on qualification status changes diff --git a/docs/doctor/articles/timestamping/evidence-staleness.md b/docs/doctor/articles/timestamping/evidence-staleness.md new file mode 100644 index 000000000..2dacf2e70 --- /dev/null +++ b/docs/doctor/articles/timestamping/evidence-staleness.md @@ -0,0 +1,75 @@ +--- +checkId: check.timestamp.evidence.staleness +plugin: stellaops.doctor.timestamping +severity: warn +tags: [timestamping, evidence, staleness, retimestamp] +--- +# Evidence Staleness + +## 
What It Checks +Aggregated check for timestamp evidence staleness across six dimensions: + +- **TST Expiry**: timestamps approaching signing certificate expiry (warn at 180 days, critical at 90 days). +- **Deprecated Algorithms**: timestamps using deprecated hash algorithms (e.g., SHA1). +- **Missing Stapling**: timestamps without stapled OCSP/CRL revocation data. +- **Retimestamp Queue**: artifacts pending re-timestamping. +- **OCSP Staleness**: OCSP responses approaching expiry (warn at 3 days). +- **CRL Staleness**: CRLs approaching expiry (warn at 7 days). + +Fails if any dimension is unhealthy (count exceeds `CriticalStaleCount`, default 10). Warns if any dimension is degraded. + +## Why It Matters +Stale evidence loses its verifiability over time. Expired timestamps, deprecated algorithms, and missing revocation data all weaken the chain of trust. Proactive detection enables scheduled re-timestamping before evidence becomes unverifiable. + +## Common Causes +- Re-timestamp jobs not running or failing +- TSA signing certificates approaching expiry +- OCSP/CRL cache not refreshed +- Legacy artifacts signed with SHA1 + +## How to Fix + +### Docker Compose +```bash +# Run evidence refresh +docker exec stella evidence refresh --all + +# Run retimestamp queue +docker exec stella retimestamp run +``` + +### Bare Metal / systemd +```bash +# Check evidence status +stella evidence audit --staleness + +# Refresh stale evidence +stella evidence refresh --all + +# Process retimestamp queue +stella retimestamp run + +# Schedule automatic refresh +stella retimestamp schedule create --interval daily +``` + +### Kubernetes / Helm +```yaml +timestamping: + evidenceStaleness: + tstWarnDays: 180 + tstCriticalDays: 90 + criticalStaleCount: 10 + retimestampSchedule: "0 1 * * *" +``` + +## Verification +``` +stella doctor run --check check.timestamp.evidence.staleness +``` + +## Related Checks +- `check.timestamp.evidence.tst.expiry` — focused check for expiring TSTs +- 
`check.timestamp.evidence.tst.deprecated-algo` — focused check for deprecated algorithms +- `check.timestamp.evidence.tst.missing-stapling` — focused check for missing stapling +- `check.timestamp.evidence.retimestamp.pending` — focused check for pending retimestamps diff --git a/docs/doctor/articles/timestamping/ocsp-responder.md b/docs/doctor/articles/timestamping/ocsp-responder.md new file mode 100644 index 000000000..78e87f7cd --- /dev/null +++ b/docs/doctor/articles/timestamping/ocsp-responder.md @@ -0,0 +1,53 @@ +--- +checkId: check.timestamp.ocsp.responder +plugin: stellaops.doctor.timestamping +severity: warn +tags: [timestamping, ocsp, responder, revocation] +--- +# OCSP Responder Availability + +## What It Checks +Checks that configured OCSP responders are accessible. The check: + +- Gets configured OCSP responders from the registry. +- Sends an OPTIONS request to each responder with a 10-second timeout. +- Considers 2xx and 405 (Method Not Allowed) responses as healthy. +- Fails if all responders are unavailable. Warns if some are unavailable. +- Reports degraded if no responders are configured. + +## Why It Matters +OCSP responders provide real-time certificate revocation status. If OCSP responders are unavailable, the system cannot verify whether TSA certificates have been revoked, potentially accepting timestamps from compromised certificates. + +## Common Causes +- OCSP responder server is down +- Network connectivity issues +- Firewall blocking HTTP/HTTPS to OCSP URLs +- OCSP responder URL changed by the CA + +## How to Fix + +### Docker Compose +```bash +# Test OCSP responder connectivity +docker exec curl -v http://ocsp.digicert.com +``` + +### Bare Metal / systemd +```bash +# Test OCSP responder +openssl ocsp -issuer /path/to/issuer.pem -cert /path/to/cert.pem \ + -url http://ocsp.digicert.com -resp_text +``` + +### Kubernetes / Helm +Ensure egress NetworkPolicies allow traffic to OCSP responder URLs. 
+ +## Verification +``` +stella doctor run --check check.timestamp.ocsp.responder +``` + +## Related Checks +- `check.timestamp.ocsp.stapling` — checks OCSP stapling configuration +- `check.timestamp.revocation.cache-fresh` — checks revocation cache freshness +- `check.timestamp.crl.distribution` — checks CRL distribution point availability diff --git a/docs/doctor/articles/timestamping/ocsp-stapling-enabled.md b/docs/doctor/articles/timestamping/ocsp-stapling-enabled.md new file mode 100644 index 000000000..89899944f --- /dev/null +++ b/docs/doctor/articles/timestamping/ocsp-stapling-enabled.md @@ -0,0 +1,53 @@ +--- +checkId: check.timestamp.ocsp.stapling +plugin: stellaops.doctor.timestamping +severity: warn +tags: [timestamping, ocsp, stapling, revocation] +--- +# OCSP Stapling Enabled + +## What It Checks +Checks whether TSA OCSP stapling is configured and fresh. The check: + +- Queries the stapling status provider for all TSA providers. +- Reports which providers have OCSP stapling enabled or disabled. +- Fails if all providers have stapling disabled. Warns if some are disabled. +- Reports degraded if no stapling status data is available. + +## Why It Matters +OCSP stapling embeds the OCSP response directly in the TLS handshake or timestamp token, eliminating the need for clients to perform live OCSP lookups. This is critical for air-gapped deployments where live OCSP lookups are impossible, and improves performance for all deployments. + +## Common Causes +- OCSP stapling not configured for TSA providers +- Stapling status monitoring not set up +- TSA provider does not support stapling + +## How to Fix + +### Docker Compose +Enable OCSP stapling in TSA provider configuration: + +```yaml +environment: + Timestamping__OcspStapling__Enabled: "true" +``` + +### Bare Metal / systemd +Configure OCSP stapling in `appsettings.json` and ensure TSA providers support it. 
+ +### Kubernetes / Helm +```yaml +timestamping: + ocspStapling: + enabled: true +``` + +## Verification +``` +stella doctor run --check check.timestamp.ocsp.stapling +``` + +## Related Checks +- `check.timestamp.ocsp.responder` — checks OCSP responder availability +- `check.timestamp.evidence.tst.missing-stapling` — detects timestamps without stapled data +- `check.timestamp.revocation.cache-fresh` — checks revocation cache freshness diff --git a/docs/doctor/articles/timestamping/qts-providers-qualified.md b/docs/doctor/articles/timestamping/qts-providers-qualified.md new file mode 100644 index 000000000..ce35fa13f --- /dev/null +++ b/docs/doctor/articles/timestamping/qts-providers-qualified.md @@ -0,0 +1,57 @@ +--- +checkId: check.timestamp.eidas.qts.qualified +plugin: stellaops.doctor.timestamping +severity: fail +tags: [timestamping, eidas, qts, qualification, compliance] +--- +# QTS Providers Qualification + +## What It Checks +Checks that configured qualified TSA providers are still listed on the EU Trust List. The check: + +- Gets qualified TSA providers from the registry. +- For each provider, queries the trust list cache for current qualification status. +- Fails if any provider is no longer qualified (withdrawn, suspended, or not found). +- Passes if no qualified providers are configured (optional feature) or all are still qualified. + +## Why It Matters +Under eIDAS regulation, only qualified TSA providers can produce timestamps with legal effect equivalent to handwritten signatures. If a provider loses qualification, timestamps from that provider no longer meet eIDAS compliance requirements, potentially invalidating evidence used for regulated releases. 
+ +## Common Causes +- TSA provider's qualified status withdrawn by a supervisory body +- Provider suspended due to compliance issues +- Provider not yet (re-)listed on the current trust list version +- Trust list cache is stale (check `check.timestamp.eidas.trustlist.fresh`) + +## How to Fix + +### Docker Compose +```bash +# Refresh trust list first +docker exec stella trust-list refresh + +# Check provider status +docker exec stella tsa qualification status +``` + +### Bare Metal / systemd +```bash +stella trust-list refresh +stella tsa qualification status + +# Replace non-qualified provider +stella tsa remove --name "Withdrawn Provider" +stella tsa add --name "New QTS" --url "https://new-qualified-tsa.eu/tsr" --qualified +``` + +### Kubernetes / Helm +Update TSA provider configuration to use only qualified providers. + +## Verification +``` +stella doctor run --check check.timestamp.eidas.qts.qualified +``` + +## Related Checks +- `check.timestamp.eidas.trustlist.fresh` — checks EU Trust List freshness +- `check.timestamp.eidas.qts.status-change` — alerts on qualification status changes diff --git a/docs/doctor/articles/timestamping/qts-status-change.md b/docs/doctor/articles/timestamping/qts-status-change.md new file mode 100644 index 000000000..af8040efb --- /dev/null +++ b/docs/doctor/articles/timestamping/qts-status-change.md @@ -0,0 +1,52 @@ +--- +checkId: check.timestamp.eidas.qts.status-change +plugin: stellaops.doctor.timestamping +severity: warn +tags: [timestamping, eidas, qts, status, monitoring] +--- +# QTS Status Changes + +## What It Checks +Alerts on TSA qualification status changes in the past 7 days. The check: + +- Queries the status change tracker for recent changes within a 7-day window. +- Reports each change (provider name, previous status, new status, change date). +- Warns if any withdrawals occurred (provider moved from Qualified to Withdrawn/Suspended/Deprecated). 
+- Passes if no changes occurred or all changes are positive (e.g., new qualification grants). + +## Why It Matters +Qualification status changes require operational response. A withdrawal means the provider's timestamps no longer satisfy eIDAS requirements, and traffic should be migrated to an alternative provider. Early detection of changes enables proactive migration before compliance deadlines. + +## Common Causes +- Supervisory body action against a TSA provider +- Provider voluntary withdrawal from qualification +- New provider achieving qualification (positive change) + +## How to Fix + +### Docker Compose +```bash +# Review recent changes +docker exec stella tsa qualification changes --days 7 + +# If a provider was withdrawn, add a replacement +docker exec stella tsa add --name "Replacement QTS" --url "https://new-tsa.eu/tsr" --qualified +``` + +### Bare Metal / systemd +```bash +stella tsa qualification changes --days 7 +stella tsa qualification status +``` + +### Kubernetes / Helm +Review changes and update provider configuration as needed. + +## Verification +``` +stella doctor run --check check.timestamp.eidas.qts.status-change +``` + +## Related Checks +- `check.timestamp.eidas.qts.qualified` — checks provider qualification status +- `check.timestamp.eidas.trustlist.fresh` — checks EU Trust List freshness diff --git a/docs/doctor/articles/timestamping/rekor-time-correlation.md b/docs/doctor/articles/timestamping/rekor-time-correlation.md new file mode 100644 index 000000000..0bd3bc5cc --- /dev/null +++ b/docs/doctor/articles/timestamping/rekor-time-correlation.md @@ -0,0 +1,63 @@ +--- +checkId: check.timestamp.timesync.rekor-correlation +plugin: stellaops.doctor.timestamping +severity: warn +tags: [timestamping, timesync, rekor, correlation, transparency] +--- +# TST-Rekor Time Correlation + +## What It Checks +Checks that TST genTime and Rekor integratedTime are properly correlated. 
The check: + +- Retrieves recent attestations from the lookback window (default 24 hours). +- For each attestation with both TST and Rekor timestamps, computes the gap. +- **Fails** (unhealthy) if any attestation has TST *after* Rekor (potential backdating -- TST should always precede Rekor integration). +- **Warns** (degraded) if the gap between TST and Rekor exceeds the maximum threshold (default 5 minutes). + +## Why It Matters +Proper time correlation between TST (Timestamp Token) and Rekor (transparency log) entries is a key integrity signal. If a TST is dated after its Rekor entry, it may indicate clock manipulation or backdating. Excessive gaps suggest pipeline delays that could indicate operational issues. + +## Common Causes +- System clock drift causing ordering violations +- Pipeline delays between timestamping and Rekor submission +- Rekor transparency log ingestion delays +- Network issues causing timestamp reordering + +## How to Fix + +### Docker Compose +```bash +# Check system time sync +timedatectl status + +# Verify pipeline timing +docker exec stella evidence audit --recent --show-timing +``` + +### Bare Metal / systemd +```bash +# Investigate time synchronization +chronyc tracking + +# Review attestation timing +stella evidence audit --recent --show-timing +``` + +### Kubernetes / Helm +```yaml +timestamping: + rekorCorrelation: + lookbackWindow: "24h" + maximumGap: "5m" +``` + +Investigate ordering violations immediately as they may indicate tampering. 
+ +## Verification +``` +stella doctor run --check check.timestamp.timesync.rekor-correlation +``` + +## Related Checks +- `check.timestamp.timesync.system` — checks system clock synchronization +- `check.timestamp.timesync.tsa-skew` — checks TSA time skew diff --git a/docs/doctor/articles/timestamping/retimestamp-pending.md b/docs/doctor/articles/timestamping/retimestamp-pending.md new file mode 100644 index 000000000..8f42dbc16 --- /dev/null +++ b/docs/doctor/articles/timestamping/retimestamp-pending.md @@ -0,0 +1,40 @@ +--- +checkId: check.timestamp.evidence.retimestamp.pending +plugin: stellaops.doctor.timestamping +severity: warn +tags: [timestamping, evidence, retimestamp, queue] +--- +# Retimestamp Pending + +## What It Checks +Detects artifacts pending re-timestamping. Fails if the pending count exceeds the critical threshold (default 10), otherwise warns. + +## Why It Matters +Pending retimestamp work means artifacts are at risk of losing their temporal proof. If retimestamping does not complete before the original timestamp expires, the evidence becomes unverifiable and may need to be regenerated from scratch. + +## Common Causes +- Retimestamp queue processor not running +- TSA endpoints unavailable during retimestamp attempts +- Queue backlog from a large batch of expiring timestamps +- Retimestamp job scheduling not configured + +## How to Fix +Process the retimestamp queue: + +```bash +stella retimestamp run + +# Schedule automatic processing +stella retimestamp schedule create --interval daily +``` + +If TSA endpoints are down, resolve connectivity first (see `check.timestamp.tsa.reachable`). 
+ +## Verification +``` +stella doctor run --check check.timestamp.evidence.retimestamp.pending +``` + +## Related Checks +- `check.timestamp.evidence.staleness` — aggregated evidence staleness check +- `check.timestamp.tsa.reachable` — verifies TSA endpoints are reachable diff --git a/docs/doctor/articles/timestamping/revocation-cache-fresh.md b/docs/doctor/articles/timestamping/revocation-cache-fresh.md new file mode 100644 index 000000000..cae5a0c15 --- /dev/null +++ b/docs/doctor/articles/timestamping/revocation-cache-fresh.md @@ -0,0 +1,57 @@ +--- +checkId: check.timestamp.revocation.cache-fresh +plugin: stellaops.doctor.timestamping +severity: warn +tags: [timestamping, revocation, cache, ocsp, crl] +--- +# Revocation Cache Freshness + +## What It Checks +Checks that cached OCSP responses and CRLs are not stale. The check: + +- Queries the revocation cache for OCSP response and CRL snapshot timestamps. +- Compares each cached item's age against thresholds: OCSP max age (default 12 hours), CRL max age (default 7 days). +- Fails if all cached items are stale. Warns if some are stale. +- Passes if no cached data exists (optional) or all items are fresh. + +## Why It Matters +Stale revocation data means the system may accept certificates that have been revoked since the cache was last updated. For air-gapped environments that cannot perform live lookups, a fresh cache is the only source of revocation information. 
+ +## Common Causes +- Revocation cache refresh job not running +- Network issues preventing OCSP/CRL fetches +- Cache storage issues + +## How to Fix + +### Docker Compose +```bash +# Refresh revocation cache +docker exec stella revocation cache refresh +``` + +### Bare Metal / systemd +```bash +stella revocation cache refresh + +# Schedule automatic refresh +stella revocation cache schedule --ocsp-interval 6h --crl-interval 24h +``` + +### Kubernetes / Helm +```yaml +timestamping: + revocationCache: + ocspMaxAgeHours: 12 + crlMaxAgeDays: 7 + refreshSchedule: "0 */6 * * *" +``` + +## Verification +``` +stella doctor run --check check.timestamp.revocation.cache-fresh +``` + +## Related Checks +- `check.timestamp.ocsp.responder` — checks OCSP responder availability +- `check.timestamp.crl.distribution` — checks CRL distribution point availability diff --git a/docs/doctor/articles/timestamping/system-time-sync.md b/docs/doctor/articles/timestamping/system-time-sync.md new file mode 100644 index 000000000..442729d2f --- /dev/null +++ b/docs/doctor/articles/timestamping/system-time-sync.md @@ -0,0 +1,75 @@ +--- +checkId: check.timestamp.timesync.system +plugin: stellaops.doctor.timestamping +severity: fail +tags: [timestamping, timesync, ntp, system] +--- +# System Time Synchronization + +## What It Checks +Checks that the system clock is synchronized with NTP servers. The check: + +- Queries configured NTP servers (defaults to `time.nist.gov` and `pool.ntp.org`) using the NTP protocol (UDP port 123). +- Computes the time skew between the local system clock and each NTP server. +- **Fails** (unhealthy) if skew exceeds the critical threshold (default 5 seconds). +- **Warns** (degraded) if skew exceeds the warning threshold (default 1 second). +- Reports degraded if no NTP servers can be reached. + +## Why It Matters +Accurate system time is fundamental to timestamping. 
Clock skew causes timestamp tokens to have incorrect genTime values, which can invalidate evidence during verification. Large skew can also cause TLS certificate validation failures, authentication token rejection, and incorrect audit log ordering. + +## Common Causes +- NTP service not running (chrony, ntpd, systemd-timesyncd) +- NTP servers unreachable (firewall blocking UDP 123) +- Virtual machine time drift (especially paused/resumed VMs) +- Hardware clock issues + +## How to Fix + +### Docker Compose +Docker containers inherit the host's clock. Fix time sync on the Docker host: + +```bash +# Check host time sync +timedatectl status + +# Enable NTP sync +sudo timedatectl set-ntp true + +# Or configure chrony +sudo systemctl restart chronyd +``` + +### Bare Metal / systemd +```bash +# Check time sync status +timedatectl status +chronyc tracking + +# Force sync +sudo chronyc makestep + +# Enable NTP +sudo timedatectl set-ntp true +sudo systemctl enable chronyd +``` + +### Kubernetes / Helm +Kubernetes nodes must have NTP configured. Verify on each node: + +```bash +# On each node +timedatectl status +chronyc tracking +``` + +Ensure NTP is part of your node provisioning configuration. 
+ +## Verification +``` +stella doctor run --check check.timestamp.timesync.system +``` + +## Related Checks +- `check.timestamp.timesync.tsa-skew` — checks skew between system clock and TSA genTime +- `check.timestamp.timesync.rekor-correlation` — checks TST-Rekor time correlation diff --git a/docs/doctor/articles/timestamping/tsa-availability.md b/docs/doctor/articles/timestamping/tsa-availability.md new file mode 100644 index 000000000..3e23b1237 --- /dev/null +++ b/docs/doctor/articles/timestamping/tsa-availability.md @@ -0,0 +1,72 @@ +--- +checkId: check.timestamp.tsa.reachable +plugin: stellaops.doctor.timestamping +severity: fail +tags: [timestamping, tsa, availability, connectivity] +--- +# TSA Availability + +## What It Checks +Verifies that configured TSA (Time Stamp Authority) endpoints are reachable and responding. The check: + +- Probes each endpoint from the `TsaEndpoints` configuration via HTTP HEAD requests. +- Considers HTTP 2xx and 405 (Method Not Allowed) as healthy responses (405 means the TSA is alive but only accepts POST). +- Reports the count of healthy vs. unhealthy endpoints. +- Degrades if no endpoints are configured. Fails if no endpoints are reachable. Warns if some are down. + +## Why It Matters +TSA endpoints provide RFC-3161 timestamps that anchor release evidence in time. If no TSA is reachable, new evidence cannot be timestamped, blocking policy-gated releases that require verifiable timestamps. This is a critical-severity check. 
+ +## Common Causes +- TSA endpoint server is down or unreachable +- Network connectivity issues or firewall blocking HTTPS +- DNS resolution failure +- TSA provider maintenance or outage + +## How to Fix + +### Docker Compose +```yaml +environment: + Timestamping__TsaEndpoints__0__Name: "FreeTSA" + Timestamping__TsaEndpoints__0__Url: "https://freetsa.org/tsr" + Timestamping__TsaEndpoints__1__Name: "DigiCert" + Timestamping__TsaEndpoints__1__Url: "http://timestamp.digicert.com" +``` + +### Bare Metal / systemd +```json +{ + "Timestamping": { + "TsaEndpoints": [ + { "Name": "FreeTSA", "Url": "https://freetsa.org/tsr" }, + { "Name": "DigiCert", "Url": "http://timestamp.digicert.com" } + ] + } +} +``` + +Test connectivity: +```bash +curl -I https://freetsa.org/tsr +``` + +### Kubernetes / Helm +```yaml +timestamping: + tsaEndpoints: + - name: "FreeTSA" + url: "https://freetsa.org/tsr" + - name: "DigiCert" + url: "http://timestamp.digicert.com" +``` + +## Verification +``` +stella doctor run --check check.timestamp.tsa.reachable +``` + +## Related Checks +- `check.timestamp.tsa.response-time` — measures TSA response latency +- `check.timestamp.tsa.valid-response` — verifies TSA returns valid RFC-3161 responses +- `check.timestamp.tsa.failover-ready` — confirms backup TSA endpoints for failover diff --git a/docs/doctor/articles/timestamping/tsa-cert-expiry.md b/docs/doctor/articles/timestamping/tsa-cert-expiry.md new file mode 100644 index 000000000..885bbad25 --- /dev/null +++ b/docs/doctor/articles/timestamping/tsa-cert-expiry.md @@ -0,0 +1,66 @@ +--- +checkId: check.timestamp.tsa.cert-expiry +plugin: stellaops.doctor.timestamping +severity: warn +tags: [timestamping, tsa, certificate, expiry] +--- +# TSA Certificate Expiry + +## What It Checks +Checks if TSA signing certificates are approaching expiry. The check: + +- Evaluates each certificate in the `TsaCertificates` configuration list. +- Calculates days remaining until expiry. 
+- **Fails** (unhealthy) if any certificate is expired or within the critical threshold (default 90 days). +- **Warns** (degraded) if within the warning threshold (default 180 days). +- Passes if all certificates have sufficient validity remaining. + +## Why It Matters +An expired TSA signing certificate means new timestamps cannot be validated by relying parties. Evidence signed with an expired certificate may be rejected during compliance audits. Early warning gives operators time to coordinate certificate renewal with the TSA provider before any disruption. + +## Common Causes +- TSA provider certificate approaching natural end-of-life +- Certificate renewal not tracked or scheduled +- Using a short-lived certificate without automated renewal + +## How to Fix + +### Docker Compose +Update the certificate configuration when renewed certificates are obtained from the TSA provider: + +```yaml +environment: + Timestamping__TsaCertificates__0__Name: "DigiCert TSA" + Timestamping__TsaCertificates__0__Subject: "CN=DigiCert TSA" + Timestamping__TsaCertificates__0__ExpiresAt: "2027-01-15T00:00:00Z" +``` + +### Bare Metal / systemd +Contact the TSA provider to obtain renewed certificates and update the trust configuration: + +```bash +stella tsa cert update --name "DigiCert TSA" --cert /path/to/new-cert.pem +``` + +### Kubernetes / Helm +```yaml +timestamping: + certificates: + warnDays: 180 + criticalDays: 90 +``` + +Update Kubernetes secrets when new certificates are obtained: +```bash +kubectl create secret generic tsa-certs --from-file=cert.pem=/path/to/new-cert.pem --dry-run=client -o yaml | kubectl apply -f - +``` + +## Verification +``` +stella doctor run --check check.timestamp.tsa.cert-expiry +``` + +## Related Checks +- `check.timestamp.tsa.root-expiry` — checks TSA root/trust anchor certificate expiry +- `check.timestamp.tsa.chain-valid` — validates TSA certificate chain integrity +- `check.timestamp.tsa.valid-response` — verifies TSA returns valid timestamp 
tokens diff --git a/docs/doctor/articles/timestamping/tsa-chain-valid.md b/docs/doctor/articles/timestamping/tsa-chain-valid.md new file mode 100644 index 000000000..ed201cbef --- /dev/null +++ b/docs/doctor/articles/timestamping/tsa-chain-valid.md @@ -0,0 +1,65 @@ +--- +checkId: check.timestamp.tsa.chain-valid +plugin: stellaops.doctor.timestamping +severity: fail +tags: [timestamping, tsa, certificate, chain, validation] +--- +# TSA Certificate Chain Validity + +## What It Checks +Ensures TSA certificate chains are valid and complete. The check: + +- Queries the certificate chain status provider for all configured TSA chains. +- Validates that each chain is complete (leaf to root) and has no errors. +- Fails if all chains are invalid. Warns if some chains are invalid. +- Reports degraded if no chain data is available (provider not configured). + +## Why It Matters +An incomplete or broken certificate chain means TSA timestamps cannot be verified end-to-end. Relying parties will reject evidence with unverifiable chains, causing compliance audit failures and blocking release promotions. This is a critical-severity check. 
+ +## Common Causes +- Missing intermediate certificates +- Intermediate certificate expired +- Trust store not updated after CA changes +- Misconfigured certificate chain ordering + +## How to Fix + +### Docker Compose +```bash +# Verify chain manually +openssl verify -CAfile /certs/root.pem -untrusted /certs/intermediate.pem /certs/tsa-leaf.pem + +# Update chain configuration +docker exec stella tsa chain update --name "Provider" \ + --cert /certs/tsa-leaf.pem --intermediate /certs/intermediate.pem +``` + +### Bare Metal / systemd +```bash +stella tsa chain validate --all +stella tsa chain update --name "Provider" \ + --cert /path/to/leaf.pem --intermediate /path/to/intermediate.pem +``` + +### Kubernetes / Helm +```yaml +timestamping: + chainValidation: + enabled: true +``` + +Update certificate chain secrets: +```bash +kubectl create secret generic tsa-chain \ + --from-file=leaf.pem --from-file=intermediate.pem --from-file=root.pem +``` + +## Verification +``` +stella doctor run --check check.timestamp.tsa.chain-valid +``` + +## Related Checks +- `check.timestamp.tsa.cert-expiry` — checks TSA signing certificate expiry +- `check.timestamp.tsa.root-expiry` — checks TSA root certificate expiry diff --git a/docs/doctor/articles/timestamping/tsa-failover-ready.md b/docs/doctor/articles/timestamping/tsa-failover-ready.md new file mode 100644 index 000000000..32b68a928 --- /dev/null +++ b/docs/doctor/articles/timestamping/tsa-failover-ready.md @@ -0,0 +1,71 @@ +--- +checkId: check.timestamp.tsa.failover-ready +plugin: stellaops.doctor.timestamping +severity: warn +tags: [timestamping, tsa, failover, redundancy] +--- +# TSA Failover Readiness + +## What It Checks +Confirms that backup TSA endpoints are reachable for failover. The check: + +- Fails if no TSA endpoints are configured at all. +- Warns (degraded) if only one endpoint is configured -- failover is not possible with a single endpoint. +- Probes all configured endpoints and counts reachable ones. 
+- Compares reachable count against `MinHealthyTsas` (default 2). +- Fails or degrades if fewer than the minimum are reachable. + +## Why It Matters +TSA providers can experience outages. Without backup endpoints, a single TSA failure blocks all timestamping operations, halting the evidence pipeline and release process. Failover readiness ensures the platform can automatically switch to an alternative TSA without manual intervention. + +## Common Causes +- Only one TSA endpoint configured (no backup) +- Backup TSA endpoint down or unreachable +- Network issues to secondary TSA providers + +## How to Fix + +### Docker Compose +Configure at least two TSA endpoints: + +```yaml +environment: + Timestamping__TsaEndpoints__0__Name: "Primary" + Timestamping__TsaEndpoints__0__Url: "https://freetsa.org/tsr" + Timestamping__TsaEndpoints__1__Name: "Backup" + Timestamping__TsaEndpoints__1__Url: "http://timestamp.digicert.com" + Timestamping__MinHealthyTsas: "2" +``` + +### Bare Metal / systemd +```json +{ + "Timestamping": { + "TsaEndpoints": [ + { "Name": "Primary", "Url": "https://freetsa.org/tsr" }, + { "Name": "Backup", "Url": "http://timestamp.digicert.com" } + ], + "MinHealthyTsas": 2 + } +} +``` + +### Kubernetes / Helm +```yaml +timestamping: + minHealthyTsas: 2 + tsaEndpoints: + - name: "Primary" + url: "https://freetsa.org/tsr" + - name: "Backup" + url: "http://timestamp.digicert.com" +``` + +## Verification +``` +stella doctor run --check check.timestamp.tsa.failover-ready +``` + +## Related Checks +- `check.timestamp.tsa.reachable` — verifies TSA endpoint reachability +- `check.timestamp.tsa.response-time` — measures TSA response latency diff --git a/docs/doctor/articles/timestamping/tsa-response-time.md b/docs/doctor/articles/timestamping/tsa-response-time.md new file mode 100644 index 000000000..4d0e816e9 --- /dev/null +++ b/docs/doctor/articles/timestamping/tsa-response-time.md @@ -0,0 +1,65 @@ +--- +checkId: check.timestamp.tsa.response-time +plugin: 
stellaops.doctor.timestamping +severity: warn +tags: [timestamping, tsa, latency, performance] +--- +# TSA Response Time + +## What It Checks +Measures TSA endpoint response times against configurable thresholds. The check: + +- Probes each configured TSA endpoint and measures round-trip latency. +- Compares latency against the warning threshold (default 5000ms) and the critical threshold (default 30000ms). +- Fails if any endpoint exceeds the critical latency threshold. +- Warns if any endpoint exceeds the warning threshold. +- Passes if all endpoints respond within acceptable latency. +- Reports degraded if no endpoints are configured. + +## Why It Matters +High TSA latency slows down the evidence generation pipeline. Every release artifact that needs a timestamp will be delayed by slow TSA responses. In high-throughput environments, TSA latency can become a bottleneck that blocks the entire release pipeline. + +## Common Causes +- TSA server under heavy load +- Network latency to remote TSA endpoints +- Firewall or proxy adding latency +- TSA provider experiencing service degradation + +## How to Fix + +### Docker Compose +Consider adding a geographically closer TSA endpoint or a local TSA. The latency thresholds used by this check are set with the following variables: + +```yaml +environment: + Timestamping__WarnLatencyMs: "5000" + Timestamping__CriticalLatencyMs: "30000" +``` + +### Bare Metal / systemd +```bash +# Test TSA latency manually +time curl -s -o /dev/null https://freetsa.org/tsr + +# Add a faster TSA endpoint +stella tsa add --name "LocalTSA" --url "https://tsa.internal.example.com/tsr" +``` + +### Kubernetes / Helm +```yaml +timestamping: + warnLatencyMs: 5000 + criticalLatencyMs: 30000 +``` + +Consider deploying a local TSA proxy or cache to reduce latency. 
+ +## Verification +``` +stella doctor run --check check.timestamp.tsa.response-time +``` + +## Related Checks +- `check.timestamp.tsa.reachable` — verifies TSA endpoints are reachable +- `check.timestamp.tsa.valid-response` — verifies valid RFC-3161 responses +- `check.timestamp.tsa.failover-ready` — confirms failover readiness diff --git a/docs/doctor/articles/timestamping/tsa-root-expiry.md b/docs/doctor/articles/timestamping/tsa-root-expiry.md new file mode 100644 index 000000000..95a92a617 --- /dev/null +++ b/docs/doctor/articles/timestamping/tsa-root-expiry.md @@ -0,0 +1,61 @@ +--- +checkId: check.timestamp.tsa.root-expiry +plugin: stellaops.doctor.timestamping +severity: warn +tags: [timestamping, tsa, root, certificate, expiry] +--- +# TSA Root Certificate Expiry + +## What It Checks +Checks if TSA trust anchor (root) certificates are approaching expiry. The check: + +- Evaluates each root certificate in the `RootCertificates` configuration list. +- Calculates days remaining until expiry. +- **Fails** (unhealthy) if any root certificate is expired or within the critical threshold (default 180 days). +- **Warns** (degraded) if within the warning threshold (default 365 days). +- Uses longer thresholds than leaf certificates because root renewal requires more coordination. + +## Why It Matters +Root certificates anchor the entire TSA trust chain. When a root expires, all timestamps signed by TSAs chained to that root become unverifiable. Root certificate renewal requires updating trust stores across the entire deployment, which takes significant lead time. 
+ +## Common Causes +- Root certificate approaching end-of-life (typically 10-20 year lifetime) +- Using a custom root CA with a shorter validity period +- Trust store not updated after provider rotated roots + +## How to Fix + +### Docker Compose +Update the root certificate trust store: + +```bash +# Update trust anchors +docker exec stella trust-anchor update --cert /certs/new-root.pem +``` + +### Bare Metal / systemd +```bash +# Update the trust anchor +stella trust-anchor update --cert /path/to/new-root.pem + +# Or update the system trust store (update-ca-certificates only picks up *.crt files) +sudo cp /path/to/new-root.pem /usr/local/share/ca-certificates/new-root.crt +sudo update-ca-certificates +``` + +### Kubernetes / Helm +```yaml +timestamping: + rootCertificates: + warnDays: 365 + criticalDays: 180 +``` + +## Verification +``` +stella doctor run --check check.timestamp.tsa.root-expiry +``` + +## Related Checks +- `check.timestamp.tsa.cert-expiry` — checks TSA signing certificate expiry +- `check.timestamp.tsa.chain-valid` — validates TSA certificate chain integrity diff --git a/docs/doctor/articles/timestamping/tsa-time-skew.md b/docs/doctor/articles/timestamping/tsa-time-skew.md new file mode 100644 index 000000000..d2e8d7f3e --- /dev/null +++ b/docs/doctor/articles/timestamping/tsa-time-skew.md @@ -0,0 +1,50 @@ +--- +checkId: check.timestamp.timesync.tsa-skew +plugin: stellaops.doctor.timestamping +severity: warn +tags: [timestamping, timesync, tsa, skew] +--- +# TSA Time Skew + +## What It Checks +Checks time skew between the system clock and TSA genTime. The check: + +- For each active TSA provider, requests a timestamp token with a random hash. +- Compares the TSA genTime against the local system clock, accounting for network round-trip time. +- **Fails** if skew exceeds the critical threshold (default 30 seconds). +- **Warns** if skew is elevated but below critical. + +## Why It Matters +Time skew between the system and the TSA indicates that either the local clock or the TSA clock is drifting. 
This can produce timestamps that appear out of order relative to other events, undermining the temporal integrity of release evidence and audit trails. + +## Common Causes +- Local system clock not synchronized with NTP +- TSA provider clock drifting +- High network latency distorting round-trip time estimation +- Proxy or load balancer adding variable latency + +## How to Fix + +### Docker Compose +Ensure the host clock is synchronized (see `check.timestamp.timesync.system`). If the TSA shows consistent skew, consider using a different provider. + +### Bare Metal / systemd +```bash +# Verify system time sync +chronyc tracking + +# Test TSA response time +curl -w "time_total: %{time_total}s\n" -s -o /dev/null https://freetsa.org/tsr +``` + +### Kubernetes / Helm +Verify node-level NTP synchronization. If a specific TSA consistently shows skew, switch to an alternative provider. + +## Verification +``` +stella doctor run --check check.timestamp.timesync.tsa-skew +``` + +## Related Checks +- `check.timestamp.timesync.system` — checks system clock synchronization with NTP +- `check.timestamp.timesync.rekor-correlation` — checks TST-Rekor time correlation diff --git a/docs/doctor/articles/timestamping/tsa-valid-response.md b/docs/doctor/articles/timestamping/tsa-valid-response.md new file mode 100644 index 000000000..654cdf652 --- /dev/null +++ b/docs/doctor/articles/timestamping/tsa-valid-response.md @@ -0,0 +1,65 @@ +--- +checkId: check.timestamp.tsa.valid-response +plugin: stellaops.doctor.timestamping +severity: fail +tags: [timestamping, tsa, validation, rfc3161] +--- +# TSA Valid Response + +## What It Checks +Verifies that TSA endpoints return valid RFC-3161 timestamp responses. The check: + +- Gets active TSA providers from the registry. +- Sends a dummy SHA-256 hash to each provider and requests a timestamp token. +- Validates that each response is a valid RFC-3161 timestamp token. +- Fails if no providers return valid responses. Warns if some providers fail validation. 
+- Reports degraded if no providers are configured. + +## Why It Matters +A reachable TSA that returns invalid timestamps is worse than no TSA at all -- it produces evidence that appears valid but cannot be verified. Invalid timestamps break the chain of trust for release evidence and can cause compliance audit failures. This is a critical-severity check. + +## Common Causes +- TSA provider configuration changed (algorithm, certificate) +- TSA provider returned an error response instead of a valid token +- Network issues causing corrupted responses +- TSA provider using an unsupported algorithm or format + +## How to Fix + +### Docker Compose +Verify TSA configuration and switch to a known-good provider: + +```yaml +environment: + Timestamping__TsaEndpoints__0__Name: "DigiCert" + Timestamping__TsaEndpoints__0__Url: "http://timestamp.digicert.com" +``` + +### Bare Metal / systemd +```bash +# Test TSA response manually with openssl +openssl ts -query -data /dev/null -sha256 -cert -no_nonce -out /tmp/ts.req +curl -H "Content-Type: application/timestamp-query" --data-binary @/tmp/ts.req \ + http://timestamp.digicert.com -o /tmp/ts.resp +openssl ts -reply -in /tmp/ts.resp -text +``` + +### Kubernetes / Helm +```yaml +timestamping: + tsaEndpoints: + - name: "DigiCert" + url: "http://timestamp.digicert.com" +``` + +If a TSA consistently returns invalid responses, remove it and add an alternative qualified provider. 
+ +## Verification +``` +stella doctor run --check check.timestamp.tsa.valid-response +``` + +## Related Checks +- `check.timestamp.tsa.reachable` — verifies TSA endpoint reachability +- `check.timestamp.tsa.cert-expiry` — checks TSA certificate expiry +- `check.timestamp.tsa.chain-valid` — validates TSA certificate chain diff --git a/docs/doctor/articles/timestamping/tst-approaching-expiry.md b/docs/doctor/articles/timestamping/tst-approaching-expiry.md new file mode 100644 index 000000000..5dfc2d6a2 --- /dev/null +++ b/docs/doctor/articles/timestamping/tst-approaching-expiry.md @@ -0,0 +1,35 @@ +--- +checkId: check.timestamp.evidence.tst.expiry +plugin: stellaops.doctor.timestamping +severity: warn +tags: [timestamping, evidence, tst, expiry] +--- +# TST Approaching Expiry + +## What It Checks +Detects timestamp tokens whose signing certificate is approaching expiry. Fails if any token's signing certificate expires within the critical window (default 90 days); warns if it expires within the warning window (default 180 days). + +## Why It Matters +Expired timestamp tokens cannot be validated by relying parties. Artifacts with expired timestamps lose their temporal proof, which may invalidate compliance evidence. + +## Common Causes +- TSA signing certificates approaching end-of-life +- Re-timestamp jobs not scheduled or failing + +## How to Fix +Run the retimestamp workflow to refresh expiring artifacts: + +```bash +stella retimestamp run --expiring-within 180d +``` + +Schedule automatic re-timestamping before expiry. 
+ +## Verification +``` +stella doctor run --check check.timestamp.evidence.tst.expiry +``` + +## Related Checks +- `check.timestamp.evidence.staleness` — aggregated evidence staleness check +- `check.timestamp.tsa.cert-expiry` — checks TSA certificate expiry diff --git a/docs/doctor/articles/timestamping/tst-deprecated-algorithms.md b/docs/doctor/articles/timestamping/tst-deprecated-algorithms.md new file mode 100644 index 000000000..777353f69 --- /dev/null +++ b/docs/doctor/articles/timestamping/tst-deprecated-algorithms.md @@ -0,0 +1,36 @@ +--- +checkId: check.timestamp.evidence.tst.deprecated-algo +plugin: stellaops.doctor.timestamping +severity: warn +tags: [timestamping, evidence, tst, algorithm, deprecated] +--- +# TST Deprecated Algorithms + +## What It Checks +Detects timestamps using deprecated hash algorithms (default: SHA1). Fails if the count exceeds the critical threshold (default 10), otherwise warns. + +## Why It Matters +Timestamps using deprecated algorithms like SHA1 are vulnerable to collision attacks. Compliance frameworks (eIDAS, FIPS) may reject evidence signed with deprecated algorithms, blocking release attestation verification. + +## Common Causes +- Legacy artifacts timestamped with older TSA configurations +- TSA provider still using SHA1 by default +- Migration to SHA-256 not yet completed + +## How to Fix +Re-timestamp affected artifacts using approved algorithms: + +```bash +stella retimestamp run --algorithm SHA256 --filter deprecated-algo +``` + +Ensure TSA providers are configured to use SHA-256 or stronger. 
+ +## Verification +``` +stella doctor run --check check.timestamp.evidence.tst.deprecated-algo +``` + +## Related Checks +- `check.timestamp.evidence.staleness` — aggregated evidence staleness check +- `check.timestamp.tsa.valid-response` — verifies TSA returns valid responses diff --git a/docs/doctor/articles/timestamping/tst-missing-stapling.md b/docs/doctor/articles/timestamping/tst-missing-stapling.md new file mode 100644 index 000000000..20ced8f5f --- /dev/null +++ b/docs/doctor/articles/timestamping/tst-missing-stapling.md @@ -0,0 +1,36 @@ +--- +checkId: check.timestamp.evidence.tst.missing-stapling +plugin: stellaops.doctor.timestamping +severity: warn +tags: [timestamping, evidence, tst, stapling, ocsp] +--- +# TST Missing Stapling + +## What It Checks +Detects timestamps without stapled OCSP/CRL revocation data. Fails if the count exceeds the critical threshold (default 10), otherwise warns. + +## Why It Matters +Without stapled revocation data, verifiers must perform live OCSP/CRL lookups to confirm certificate validity. In air-gapped environments, these lookups are impossible, making the timestamp unverifiable. Stapling embeds proof-of-non-revocation directly in the timestamp token for offline verification. + +## Common Causes +- TSA provider not configured to include stapled responses +- OCSP stapling disabled in TSA configuration +- Legacy timestamps created before stapling was enabled + +## How to Fix +Enable OCSP stapling and re-timestamp affected artifacts: + +```bash +stella retimestamp run --with-stapling --filter missing-stapling +``` + +Ensure TSA providers are configured with stapling enabled. 
+ +## Verification +``` +stella doctor run --check check.timestamp.evidence.tst.missing-stapling +``` + +## Related Checks +- `check.timestamp.evidence.staleness` — aggregated evidence staleness check +- `check.timestamp.ocsp.stapling` — checks OCSP stapling configuration diff --git a/docs/doctor/articles/vex/issuer-trust.md b/docs/doctor/articles/vex/issuer-trust.md new file mode 100644 index 000000000..50995080c --- /dev/null +++ b/docs/doctor/articles/vex/issuer-trust.md @@ -0,0 +1,110 @@ +--- +checkId: check.vex.issuer-trust +plugin: stellaops.doctor.vex +severity: warn +tags: [vex, trust, issuer, security] +--- +# VEX Issuer Trust Registry + +## What It Checks +Verifies that the VEX issuer trust registry is configured and that key material is available for signature verification. The check evaluates: + +1. **Registry configuration**: whether the issuer trust registry is set up and operational. +2. **Trusted issuer count**: the number of issuers currently in the trust registry. +3. **Key availability**: how many signing keys are available and how many are currently active. + +| Condition | Result | +|---|---| +| Registry not configured | Fail | +| Registry configured but no trusted issuers | Warn | +| Registry configured with trusted issuers and active keys | Pass | + +Evidence collected: `RegistryConfigured`, `TrustedIssuers`, `KeysAvailable`, `ActiveKeys`. + +This check always runs (no precondition). + +## Why It Matters +The issuer trust registry determines which VEX document sources are trusted. Without a configured registry, no VEX documents can have their signatures verified, which means all incoming vulnerability assessments are treated as unverified. Without any trusted issuers, even valid VEX documents from legitimate sources will be rejected or flagged. This undermines the VEX processing pipeline and means vulnerability status updates cannot be reliably applied to releases, potentially blocking compliant releases or allowing vulnerable ones. 
+ +## Common Causes +- Issuer directory not configured during initial setup +- Trust anchors not imported after deployment +- Configuration file missing or incorrect path +- All issuers expired or revoked without replacement +- No issuers added to the trust registry after installation + +## How to Fix + +### Docker Compose +```bash +# Configure issuer directory +docker compose exec vex-hub stella issuer directory configure + +# Import default trust anchors +docker compose exec vex-hub stella trust-anchors import --defaults + +# List available issuer keys +docker compose exec vex-hub stella issuer keys list --available + +# Trust a known issuer +docker compose exec vex-hub stella issuer trust --url https://example.com/.well-known/vex-issuer + +# Check current trust registry status +docker compose exec vex-hub stella issuer status +``` + +### Bare Metal / systemd +```bash +# Configure issuer directory +stella issuer directory configure + +# Import default trust anchors +stella trust-anchors import --defaults + +# List available keys +stella issuer keys list --available + +# Trust a specific issuer +stella issuer trust --url https://example.com/.well-known/vex-issuer + +# Check trust registry status +stella issuer status + +sudo systemctl restart stellaops-vex-hub +``` + +### Kubernetes / Helm +```yaml +# values.yaml +vexHub: + issuerTrust: + importDefaults: true + trustedIssuers: + - name: "upstream-vendor" + url: "https://vendor.example.com/.well-known/vex-issuer" + - name: "internal-security" + url: "https://security.internal/.well-known/vex-issuer" +``` + +```bash +# Configure issuer directory +kubectl exec deploy/stellaops-vex-hub -- stella issuer directory configure + +# Import trust anchors +kubectl exec deploy/stellaops-vex-hub -- stella trust-anchors import --defaults + +# Check status +kubectl exec deploy/stellaops-vex-hub -- stella issuer status + +helm upgrade stellaops ./charts/stellaops -f values.yaml +``` + +## Verification +``` +stella doctor run 
--check check.vex.issuer-trust +``` + +## Related Checks +- `check.vex.validation` — document validation depends on issuer trust for signature verification +- `check.vex.schema` — schema compliance is checked alongside issuer trust +- `check.compliance.attestation-signing` — attestation signing uses related trust infrastructure diff --git a/docs/doctor/articles/vex/schema.md b/docs/doctor/articles/vex/schema.md new file mode 100644 index 000000000..d3c4ae0ad --- /dev/null +++ b/docs/doctor/articles/vex/schema.md @@ -0,0 +1,105 @@ +--- +checkId: check.vex.schema +plugin: stellaops.doctor.vex +severity: warn +tags: [vex, schema, compliance] +--- +# VEX Schema Compliance + +## What It Checks +Verifies that VEX document schema definitions are available for all three supported formats: + +- **OpenVEX**: the open-source VEX document format (checks version availability). +- **CSAF** (Common Security Advisory Framework): the OASIS standard for security advisories with VEX profiles (checks version availability). +- **CycloneDX VEX**: VEX capabilities embedded in CycloneDX BOM format (checks version availability). + +| Condition | Result | +|---|---| +| Any of the three schemas missing or unavailable | Fail | +| All three schemas available | Pass | + +Evidence collected: `OpenVEX` (version or "MISSING"), `CSAF` (version or "MISSING"), `CycloneDX` (version or "MISSING"). + +This check always runs (no precondition). + +## Why It Matters +VEX documents arrive in multiple formats from different sources (upstream vendors, internal security teams, community advisories). If a schema definition is missing, documents in that format cannot be validated, which means they are either rejected (blocking legitimate vulnerability updates) or accepted without validation (creating a security risk). Supporting all three major formats ensures Stella Ops can process VEX documents from any source in the software supply chain. 
+ +## Common Causes +- Schema files not installed during deployment +- Schema version mismatch after an upgrade +- Configuration error pointing to wrong schema directory +- Incomplete installation missing one or more schema packages +- Schema files corrupted or deleted during maintenance + +## How to Fix + +### Docker Compose +```bash +# Update all VEX schemas +docker compose exec vex-hub stella vex schemas update + +# List installed schemas and versions +docker compose exec vex-hub stella vex schemas list + +# Check schema directory +docker compose exec vex-hub ls -la /data/vex/schemas/ + +# Verify specific format support +docker compose exec vex-hub stella vex schemas verify --format openvex +docker compose exec vex-hub stella vex schemas verify --format csaf +docker compose exec vex-hub stella vex schemas verify --format cyclonedx + +# Restart after schema update +docker compose restart vex-hub +``` + +### Bare Metal / systemd +```bash +# Update VEX schemas +stella vex schemas update + +# List installed schemas +stella vex schemas list + +# Verify schema directory +ls -la /var/lib/stellaops/vex/schemas/ + +# Verify each format +stella vex schemas verify --format openvex +stella vex schemas verify --format csaf +stella vex schemas verify --format cyclonedx + +sudo systemctl restart stellaops-vex-hub +``` + +### Kubernetes / Helm +```yaml +# values.yaml +vexHub: + schemas: + autoUpdate: true + formats: + openVex: true + csaf: true + cycloneDx: true +``` + +```bash +# Update schemas in pod +kubectl exec deploy/stellaops-vex-hub -- stella vex schemas update + +# Verify all formats +kubectl exec deploy/stellaops-vex-hub -- stella vex schemas list + +helm upgrade stellaops ./charts/stellaops -f values.yaml +``` + +## Verification +``` +stella doctor run --check check.vex.schema +``` + +## Related Checks +- `check.vex.validation` — document validation uses these schemas to verify incoming VEX documents +- `check.vex.issuer-trust` — issuer trust works alongside schema 
validation in the processing pipeline diff --git a/docs/doctor/articles/vex/validation.md b/docs/doctor/articles/vex/validation.md new file mode 100644 index 000000000..160a903c6 --- /dev/null +++ b/docs/doctor/articles/vex/validation.md @@ -0,0 +1,118 @@ +--- +checkId: check.vex.validation +plugin: stellaops.doctor.vex +severity: fail +tags: [vex, security, validation] +--- +# VEX Document Validation + +## What It Checks +Verifies the VEX document validation pipeline health by testing three subsystems: + +1. **Schema validation**: confirms the schema validation service can parse and validate VEX documents against supported schemas. Reports valid and invalid document counts. +2. **Signature verification**: confirms that VEX document signatures can be verified using available issuer key material. +3. **Processing pipeline**: confirms the VEX processing queue is operational and measures queue depth and processing rate. + +Results are aggregated across all three subsystems: + +| Condition | Result | +|---|---| +| Any subsystem failed (schema, signature, or pipeline) | Fail | +| All subsystems pass but warnings exist (e.g., high queue depth > 100) | Warn | +| All subsystems pass with no warnings | Pass | + +Evidence collected: `SchemaValidation`, `SignatureVerification`, `ProcessingPipeline`, `ValidDocuments`, `InvalidDocuments`, `QueueDepth`, `QueueStatus`. + +This check always runs (no precondition). + +## Why It Matters +VEX (Vulnerability Exploitability eXchange) documents are the primary mechanism for communicating vulnerability status in the release pipeline. If schema validation fails, invalid VEX documents may be accepted, leading to incorrect vulnerability assessments. If signature verification fails, forged or tampered VEX documents could influence release decisions. If the processing pipeline is down or backed up, vulnerability status updates are delayed, potentially allowing releases with unresolved vulnerabilities to proceed. 
+ +## Common Causes +- VEX schema validation service unavailable or crashed +- Invalid VEX document format detected in the incoming queue +- Signature verification key material missing or expired +- VEX processing queue backed up due to high volume +- Issuer keys not imported into the trust registry +- VEX processing worker not running + +## How to Fix + +### Docker Compose +```bash +# Check VEX processing status +docker compose exec vex-hub stella vex status + +# Verify VEX document schema compliance +docker compose exec vex-hub stella vex verify --schema + +# Check issuer key availability +docker compose exec vex-hub stella issuer keys list + +# Check processing queue status +docker compose exec vex-hub stella vex queue status + +# Review documents with validation warnings +docker compose exec vex-hub stella vex list --status warning + +# Restart VEX processing worker if stuck +docker compose restart vex-hub +``` + +### Bare Metal / systemd +```bash +# Check VEX processing status +stella vex status + +# Verify schema compliance +stella vex verify --schema + +# Check issuer keys +stella issuer keys list + +# Check queue status +stella vex queue status + +# List warning documents +stella vex list --status warning + +sudo systemctl restart stellaops-vex-hub +``` + +### Kubernetes / Helm +```bash +# Check VEX pod status +kubectl get pods -l app=stellaops-vex-hub + +# Check processing status +kubectl exec deploy/stellaops-vex-hub -- stella vex status + +# Verify schemas +kubectl exec deploy/stellaops-vex-hub -- stella vex verify --schema + +# Check queue depth +kubectl exec deploy/stellaops-vex-hub -- stella vex queue status + +# Check for OOM or resource issues +kubectl top pod -l app=stellaops-vex-hub +``` + +```yaml +# values.yaml +vexHub: + processing: + workers: 2 + maxQueueDepth: 500 + schema: + validation: strict +``` + +## Verification +``` +stella doctor run --check check.vex.validation +``` + +## Related Checks +- `check.vex.issuer-trust` — issuer trust 
registry provides keys for signature verification +- `check.vex.schema` — schema compliance for supported VEX formats +- `check.compliance.evidence-integrity` — VEX documents are evidence artifacts subject to integrity checks diff --git a/docs/implplan/SPRINT_20260326_001_DOCS_doctor_checks_documentation.md b/docs/implplan/SPRINT_20260326_001_DOCS_doctor_checks_documentation.md new file mode 100644 index 000000000..319efbacb --- /dev/null +++ b/docs/implplan/SPRINT_20260326_001_DOCS_doctor_checks_documentation.md @@ -0,0 +1,181 @@ +# Sprint 20260326_001 — Doctor Health Checks Documentation + +## Topic & Scope +- Document every Doctor health check (99 checks across 16 plugins) with precise, actionable remediation. +- Each check must have: what it tests, why it matters, exact fix steps, Docker compose specifics, and verification. +- Fix false-positive checks that fail on default Docker compose installations. +- Working directory: `docs/modules/doctor/`, `src/Doctor/__Plugins/` +- Expected evidence: docs, improved check messages, tests. + +## Dependencies & Concurrency +- No upstream dependencies. Can be parallelized by plugin. +- Depends on the 4 check code fixes already applied (RequiredSettings, EnvironmentVariables, SecretsConfiguration, DockerSocket). + +## Documentation Prerequisites +- `docs/modules/doctor/architecture.md` — existing Doctor architecture overview +- `docs/modules/doctor/registry-checks.md` — existing check registry reference +- `devops/compose/docker-compose.stella-ops.yml` — the reference deployment + +## Delivery Tracker + +### DOC-001 - Create check reference index +Status: TODO +Dependency: none +Owners: Documentation author +Task description: +- Create `docs/modules/doctor/checks/README.md` with a master table of all 99 checks +- Columns: Check ID, Plugin, Category, Severity, Summary, Docker Compose Status (Pass/Warn/Fail/N/A) +- Group by plugin (Core, Security, Docker, Agent, Attestor, Auth, etc.) 
+- Include quick-reference severity legend + +Completion criteria: +- [ ] All 99 checks listed with correct metadata +- [ ] Docker Compose Status column filled from actual test run + +### DOC-002 - Core Plugin checks documentation (9 checks) +Status: TODO +Dependency: DOC-001 +Owners: Documentation author +Task description: +- Create `docs/modules/doctor/checks/core.md` +- Document each check: + - **check.core.config.required**: What settings are checked, key variants (colon vs `__`), compose env var names, how to add missing settings + - **check.core.env.variables**: Which env vars are checked, why `ASPNETCORE_ENVIRONMENT` may not be set in compose, when this is OK + - **check.core.health.endpoint**: Health endpoint configuration + - **check.core.memory**: Memory threshold configuration + - **check.core.startup.time**: Expected startup time ranges + - Each remaining core check +- For each check: Symptom → Root Cause → Fix → Verify + +Completion criteria: +- [ ] Each check has: description, what it tests, severity, fix steps, Docker compose notes, verification command + +### DOC-003 - Security Plugin checks documentation +Status: TODO +Dependency: DOC-001 +Owners: Documentation author +Task description: +- Create `docs/modules/doctor/checks/security.md` +- Document: check.security.secrets, check.security.tls, check.security.cors, check.security.headers +- Include: which keys are considered "secrets" vs DSNs, vault provider configuration, development vs production guidance + +Completion criteria: +- [ ] Each check documented with fix steps and Docker compose notes + +### DOC-004 - Docker Plugin checks documentation +Status: TODO +Dependency: DOC-001 +Owners: Documentation author +Task description: +- Create `docs/modules/doctor/checks/docker.md` +- Document: check.docker.socket, check.docker.daemon, check.docker.images +- Include: container-vs-host detection, socket mount instructions, Windows named pipe notes + +Completion criteria: +- [ ] Each check documented 
with container-aware behavior explained + +### DOC-005 - Agent Plugin checks documentation (11 checks) +Status: TODO +Dependency: DOC-001 +Owners: Documentation author +Task description: +- Create `docs/modules/doctor/checks/agent.md` +- Document all 11 agent checks: capacity, certificates, cluster health/quorum, heartbeat, resources, versions, stale detection, task failure rate, task backlog + +Completion criteria: +- [ ] Each check documented with thresholds, configuration options, fix steps + +### DOC-006 - Attestor Plugin checks documentation (6 checks) +Status: TODO +Dependency: DOC-001 +Owners: Documentation author +Task description: +- Create `docs/modules/doctor/checks/attestor.md` +- Document: cosign key material, clock skew, Rekor connectivity/verification, signing key expiration, transparency log consistency + +Completion criteria: +- [ ] Each check documented including air-gap/offline scenarios + +### DOC-007 - Auth Plugin checks documentation (4 checks) +Status: TODO +Dependency: DOC-001 +Owners: Documentation author +Task description: +- Create `docs/modules/doctor/checks/auth.md` +- Document: auth configuration, OIDC provider connectivity, signing key health, token service health + +Completion criteria: +- [ ] Each check documented with OIDC troubleshooting steps + +### DOC-008 - Remaining plugins documentation +Status: TODO +Dependency: DOC-001 +Owners: Documentation author +Task description: +- Create one doc per remaining plugin: + - `docs/modules/doctor/checks/binary-analysis.md` (6 checks) + - `docs/modules/doctor/checks/compliance.md` (7 checks) + - `docs/modules/doctor/checks/crypto.md` (6 checks) + - `docs/modules/doctor/checks/environment.md` (6 checks) + - `docs/modules/doctor/checks/evidence-locker.md` (4 checks) + - `docs/modules/doctor/checks/observability.md` (4 checks) + - `docs/modules/doctor/checks/notify.md` (9 checks) + - `docs/modules/doctor/checks/operations.md` (3 checks) + - `docs/modules/doctor/checks/policy.md` (1 check) + - 
`docs/modules/doctor/checks/postgres.md` (3 checks) + - `docs/modules/doctor/checks/release.md` (6 checks) + - `docs/modules/doctor/checks/scanner.md` (7 checks) + - `docs/modules/doctor/checks/storage.md` (3 checks) + - `docs/modules/doctor/checks/timestamping.md` (9 checks) + - `docs/modules/doctor/checks/vex.md` (3 checks) + +Completion criteria: +- [ ] Every check across all 16 plugins documented + +### DOC-009 - Improve check remediation messages in code +Status: TODO +Dependency: DOC-002 through DOC-008 +Owners: Developer +Task description: +- For each check, update the `WithRemediation()` steps to include: + - Exact commands (not vague "configure X") + - Docker compose env var names (using `__` separator) + - File paths relative to the compose directory + - Link to the documentation page (e.g., "See docs/modules/doctor/checks/core.md") +- Update `WithCauses()` to be specific, not generic + +Completion criteria: +- [ ] All 99 checks have precise, copy-pasteable remediation steps +- [ ] No check reports a generic "configure X" without specifying how +- [ ] Docker compose installations pass all checks that should pass + +### DOC-010 - Docker compose default pass baseline +Status: TODO +Dependency: DOC-009 +Owners: QA / Test Automation +Task description: +- Run all 99 Doctor checks against a fresh `docker compose up` installation +- Document which checks MUST pass, which are expected warnings, which are N/A +- Create `docs/modules/doctor/compose-baseline.md` with the expected results +- Add any remaining code fixes for false positives + +Completion criteria: +- [ ] Baseline document created +- [ ] Zero false-positive FAILs on fresh Docker compose install +- [ ] All WARN checks documented as expected or fixed + +## Execution Log +| Date (UTC) | Update | Owner | +| --- | --- | --- | +| 2026-03-26 | Sprint created. 4 code fixes applied (RequiredSettings, EnvironmentVariables, SecretsConfiguration, DockerSocket). 
| Planning | + +## Decisions & Risks +- Risk: 99 checks is a large documentation surface. Parallelize by plugin. +- Decision: Each plugin gets its own doc file for maintainability. +- Decision: Remediation messages in code should link to docs, not duplicate full instructions. + +## Next Checkpoints +- DOC-001 (index): 1 day +- DOC-002 through DOC-008 (all plugin docs): 3-5 days +- DOC-009 (code remediation improvements): 2 days +- DOC-010 (baseline): 1 day diff --git a/src/AdvisoryAI/StellaOps.AdvisoryAI/KnowledgeSearch/doctor-search-seed.json b/src/AdvisoryAI/StellaOps.AdvisoryAI/KnowledgeSearch/doctor-search-seed.json index 61166b11e..2ea2feaac 100644 --- a/src/AdvisoryAI/StellaOps.AdvisoryAI/KnowledgeSearch/doctor-search-seed.json +++ b/src/AdvisoryAI/StellaOps.AdvisoryAI/KnowledgeSearch/doctor-search-seed.json @@ -1,170 +1,3604 @@ [ { - "checkCode": "check.core.disk.space", - "title": "Disk space availability", - "severity": "high", - "description": "Low disk space can block ingestion pipelines and worker execution.", - "remediation": "Free disk space and verify retention settings.", - "runCommand": "stella doctor run --check check.core.disk.space", + "checkCode": "check.agent.capacity", + "title": "Agent Capacity", + "severity": "medium", + "description": "Verifies that agents have sufficient capacity to handle incoming tasks.", + "remediation": "Review and resolve the issue described in the Agent Capacity doctor check article.", + "runCommand": "stella doctor run --check check.agent.capacity", "symptoms": [ - "no space left on device", - "disk full", - "write failure" + "all agents are offline (host", + "no agents have been registered", + "agents exist but are in", + "agent bootstrap was started but" ], "tags": [ "doctor", - "storage", - "core" + "agent", + "capacity", + "performance" ], "references": [ - "docs/operations/devops/runbooks/deployment-upgrade.md" + "docs/doctor/articles/agent/capacity.md" ] }, { - "checkCode": "check.core.db.connectivity", - 
"title": "PostgreSQL connectivity", + "checkCode": "check.agent.certificate.expiry", + "title": "Agent Certificate Expiry", "severity": "high", - "description": "Doctor failed to connect to PostgreSQL or connection health checks timed out.", - "remediation": "Validate credentials, network reachability, and TLS settings.", - "runCommand": "stella doctor run --check check.core.db.connectivity", + "description": "Inspects the CertificateExpiresAt field on every non-revoked, non-inactive agent and classifies each into one of four buckets.", + "remediation": "Review and resolve the issue described in the Agent Certificate Expiry doctor check article.", + "runCommand": "stella doctor run --check check.agent.certificate.expiry", "symptoms": [ - "database unavailable", - "connection refused", - "timeout expired" + "certificate auto-renewal is disabled on", + "agent was offline when renewal", + "certificate authority is unreachable from", + "agent bootstrap was incomplete (certificate" ], "tags": [ "doctor", - "database", + "agent", + "certificate", + "security", + "quick" + ], + "references": [ + "docs/doctor/articles/agent/certificate-expiry.md" + ] + }, + { + "checkCode": "check.agent.certificate.validity", + "title": "Agent Certificate Validity", + "severity": "high", + "description": "Validates the full certificate chain of trust for agent mTLS certificates.", + "remediation": "Review and resolve the issue described in the Agent Certificate Validity doctor check article.", + "runCommand": "stella doctor run --check check.agent.certificate.validity", + "symptoms": [ + "ca certificate rotated but agent", + "intermediate certificate missing from agent's", + "certificate revoked via crl but", + "agent identity mismatch after hostname" + ], + "tags": [ + "doctor", + "agent", + "certificate", + "security" + ], + "references": [ + "docs/doctor/articles/agent/certificate-validity.md" + ] + }, + { + "checkCode": "check.agent.cluster.health", + "title": "Agent Cluster Health", + 
"severity": "high", + "description": "Monitors the health of the agent cluster when clustering is enabled.", + "remediation": "Set clustering configuration in your .env or compose override.", + "runCommand": "stella doctor run --check check.agent.cluster.health", + "symptoms": [ + "network partition between cluster members", + "leader node crashed without triggering", + "state sync backlog due to", + "clock skew between cluster members" + ], + "tags": [ + "doctor", + "agent", + "cluster", + "ha", + "resilience" + ], + "references": [ + "docs/doctor/articles/agent/cluster-health.md" + ] + }, + { + "checkCode": "check.agent.cluster.quorum", + "title": "Agent Cluster Quorum", + "severity": "high", + "description": "Verifies that the agent cluster has sufficient members online to maintain quorum for leader election and consensus operations.", + "remediation": "Ensure cluster member list is correct in .env.", + "runCommand": "stella doctor run --check check.agent.cluster.quorum", + "symptoms": [ + "too many cluster members went", + "network partition isolating a minority", + "cluster scaled down below quorum", + "new deployment removed members without" + ], + "tags": [ + "doctor", + "agent", + "cluster", + "quorum", + "ha" + ], + "references": [ + "docs/doctor/articles/agent/cluster-quorum.md" + ] + }, + { + "checkCode": "check.agent.heartbeat.freshness", + "title": "Agent Heartbeat Freshness", + "severity": "high", + "description": "Queries all non-revoked, non-inactive agents for the current tenant and classifies each by the age of its last heartbeat.", + "remediation": "Review and resolve the issue described in the Agent Heartbeat Freshness doctor check article.", + "runCommand": "stella doctor run --check check.agent.heartbeat.freshness", + "symptoms": [ + "agent process has crashed or", + "network connectivity issue between agent", + "firewall blocking agent heartbeat traffic", + "agent host is unreachable or" + ], + "tags": [ + "doctor", + "agent", + "heartbeat", 
+ "connectivity", + "quick" + ], + "references": [ + "docs/doctor/articles/agent/heartbeat-freshness.md" + ] + }, + { + "checkCode": "check.agent.resource.utilization", + "title": "Agent Resource Utilization", + "severity": "medium", + "description": "Monitors CPU, memory, and disk utilization across the agent fleet.", + "remediation": "Review and resolve the issue described in the Agent Resource Utilization doctor check article.", + "runCommand": "stella doctor run --check check.agent.resource.utilization", + "symptoms": [ + "agent running too many concurrent", + "disk filled by accumulated scan", + "memory leak in long-running agent", + "noisy neighbor on shared infrastructure" + ], + "tags": [ + "doctor", + "agent", + "resource", + "performance", + "capacity" + ], + "references": [ + "docs/doctor/articles/agent/resource-utilization.md" + ] + }, + { + "checkCode": "check.agent.stale", + "title": "Stale Agent Detection", + "severity": "medium", + "description": "Identifies agents that have been offline (no heartbeat) for extended periods and may need investigation or decommissioning.", + "remediation": "Review and resolve the issue described in the Stale Agent Detection doctor check article.", + "runCommand": "stella doctor run --check check.agent.stale", + "symptoms": [ + "agent host has been permanently", + "agent was replaced by a", + "infrastructure change (network re-architecture, datacenter", + "agent host is undergoing extended" + ], + "tags": [ + "doctor", + "agent", + "maintenance", + "cleanup" + ], + "references": [ + "docs/doctor/articles/agent/stale.md" + ] + }, + { + "checkCode": "check.agent.task.backlog", + "title": "Task Queue Backlog", + "severity": "medium", + "description": "Monitors the pending task queue depth across the agent fleet to detect capacity issues.", + "remediation": "Review and resolve the issue described in the Task Queue Backlog doctor check article.", + "runCommand": "stella doctor run --check check.agent.task.backlog", + 
"symptoms": [ + "insufficient agent count for current", + "one or more agents offline", + "task burst from bulk operations", + "slow tasks monopolizing agent slots" + ], + "tags": [ + "doctor", + "agent", + "task", + "queue", + "capacity" + ], + "references": [ + "docs/doctor/articles/agent/task-backlog.md" + ] + }, + { + "checkCode": "check.agent.task.failure.rate", + "title": "Task Failure Rate", + "severity": "medium", + "description": "Monitors the task failure rate across the agent fleet to detect systemic issues.", + "remediation": "Review and resolve the issue described in the Task Failure Rate doctor check article.", + "runCommand": "stella doctor run --check check.agent.task.failure.rate", + "symptoms": [ + "registry or artifact store unreachable", + "expired credentials used by tasks", + "agent software bug introduced by", + "target environment misconfigured (wrong endpoints" + ], + "tags": [ + "doctor", + "agent", + "task", + "failure", + "reliability" + ], + "references": [ + "docs/doctor/articles/agent/task-failure-rate.md" + ] + }, + { + "checkCode": "check.agent.version.consistency", + "title": "Agent Version Consistency", + "severity": "medium", + "description": "Groups all non-revoked, non-inactive agents by their reported Version field and evaluates version skew.", + "remediation": "Review and resolve the issue described in the Agent Version Consistency doctor check article.", + "runCommand": "stella doctor run --check check.agent.version.consistency", + "symptoms": [ + "auto-update is disabled on some", + "some agents failed to update", + "phased rollout in progress (expected", + "agents on isolated networks that" + ], + "tags": [ + "doctor", + "agent", + "version", + "maintenance" + ], + "references": [ + "docs/doctor/articles/agent/version-consistency.md" + ] + }, + { + "checkCode": "check.attestation.clock.skew", + "title": "Clock Skew", + "severity": "high", + "description": "Verifies that the system clock is synchronized accurately enough 
for attestation validity by comparing local time against the Rekor transparency log server's Date response header.", + "remediation": "Docker containers inherit the host clock.", + "runCommand": "stella doctor run --check check.attestation.clock.skew", + "symptoms": [ + "ntp service not running (stopped", + "ntp server unreachable (firewall, dns", + "system clock manually set incorrectly", + "virtual machine clock drift (common" + ], + "tags": [ + "doctor", + "attestation", + "time", + "ntp", + "quick", + "setup" + ], + "references": [ + "docs/doctor/articles/attestor/clock-skew.md" + ] + }, + { + "checkCode": "check.attestation.cosign.keymaterial", + "title": "Cosign Key Material", + "severity": "high", + "description": "Verifies that signing key material is available for container image attestation.", + "remediation": "For keyless mode.", + "runCommand": "stella doctor run --check check.attestation.cosign.keymaterial", + "symptoms": [ + "keypath not set in configuration", + "configuration file not loaded (missing", + "key file was moved or", + "wrong path configured (typo, path" + ], + "tags": [ + "doctor", + "attestation", + "cosign", + "signing", + "setup" + ], + "references": [ + "docs/doctor/articles/attestor/cosign-keymaterial.md" + ] + }, + { + "checkCode": "check.attestation.keymaterial", + "title": "Signing Key Expiration", + "severity": "medium", + "description": "Monitors the expiration timeline of attestation signing keys.", + "remediation": "Review and resolve the issue described in the Signing Key Expiration doctor check article.", + "runCommand": "stella doctor run --check check.attestation.keymaterial", + "symptoms": [ + "keys were not rotated before", + "scheduled rotation job failed (permissions", + "key expiration not monitored (no", + "normal lifecycle -- keys approaching" + ], + "tags": [ + "doctor", + "attestation", + "signing", + "security", + "expiration" + ], + "references": [ + "docs/doctor/articles/attestor/keymaterial.md" + ] + }, + { + 
"checkCode": "check.attestation.rekor.connectivity", + "title": "Rekor Connectivity", + "severity": "high", + "description": "Tests connectivity to the Rekor transparency log by sending an HTTP GET request to the log info endpoint ({rekorUrl}/api/v1/log).", + "remediation": "Review and resolve the issue described in the Rekor Connectivity doctor check article.", + "runCommand": "stella doctor run --check check.attestation.rekor.connectivity", + "symptoms": [ + "rekor service is down or", + "network connectivity issue (proxy not", + "firewall blocking outbound https (port", + "dns resolution failure for rekor.sigstore.dev" + ], + "tags": [ + "doctor", + "attestation", + "rekor", + "transparency", + "quick", + "setup" + ], + "references": [ + "docs/doctor/articles/attestor/rekor-connectivity.md" + ] + }, + { + "checkCode": "check.attestation.rekor.verification.job", + "title": "Rekor Verification Job", + "severity": "medium", + "description": "Monitors the health of the periodic background job that re-verifies attestation entries stored in Rekor.", + "remediation": "If critical alerts indicate possible log tampering, this may be a security incident.", + "runCommand": "stella doctor run --check check.attestation.rekor.verification.job", + "symptoms": [ + "job was just deployed and", + "job is disabled in configuration", + "background service failed to start", + "transparency log tampering detected (critical" + ], + "tags": [ + "doctor", + "attestation", + "rekor", + "verification", + "background" + ], + "references": [ + "docs/doctor/articles/attestor/rekor-verification-job.md" + ] + }, + { + "checkCode": "check.attestation.transparency.consistency", + "title": "Transparency Log Consistency", + "severity": "high", + "description": "Verifies that locally stored transparency log checkpoints are consistent with the remote Rekor log.", + "remediation": "For corrupted checkpoint.", + "runCommand": "stella doctor run --check check.attestation.transparency.consistency", + 
"symptoms": [ + "transparency log was actually rolled", + "stored checkpoint is from a", + "man-in-the-middle attack on log queries", + "configuration changed to point at" + ], + "tags": [ + "doctor", + "attestation", + "transparency", + "security" + ], + "references": [ + "docs/doctor/articles/attestor/transparency-consistency.md" + ] + }, + { + "checkCode": "check.auth.config", + "title": "Auth Configuration", + "severity": "high", + "description": "Validates the overall authentication configuration by inspecting three layers in sequence.", + "remediation": "Review and resolve the issue described in the Auth Configuration doctor check article.", + "runCommand": "stella doctor run --check check.auth.config", + "symptoms": [ + "authority service not configured (fresh", + "missing issuer url configuration in", + "signing keys not yet generated", + "key material corrupted (disk failure" + ], + "tags": [ + "doctor", + "auth", + "security", + "core", + "config" + ], + "references": [ + "docs/doctor/articles/auth/config.md" + ] + }, + { + "checkCode": "check.auth.oidc", + "title": "OIDC Provider Connectivity", + "severity": "medium", + "description": "Tests connectivity to an external OIDC provider by performing real HTTP requests.", + "remediation": "Review and resolve the issue described in the OIDC Provider Connectivity doctor check article.", + "runCommand": "stella doctor run --check check.auth.oidc", + "symptoms": [ + "oidc provider is down or", + "network connectivity issue (proxy misconfiguration", + "dns resolution failure for the", + "firewall blocking outbound https to" + ], + "tags": [ + "doctor", + "auth", + "oidc", "connectivity" ], "references": [ - "docs/INSTALL_GUIDE.md" + "docs/doctor/articles/auth/oidc.md" ] }, { - "checkCode": "check.security.oidc.readiness", - "title": "OIDC readiness", - "severity": "warn", - "description": "OIDC prerequisites are missing or identity issuer metadata is not reachable.", - "remediation": "Verify issuer URL, JWKS 
availability, and Authority client configuration.", - "runCommand": "stella doctor run --check check.security.oidc.readiness", + "checkCode": "check.auth.signing-key", + "title": "Signing Key Health", + "severity": "high", + "description": "Verifies the health of the active signing key used for token issuance.", + "remediation": "Review and resolve the issue described in the Signing Key Health doctor check article.", + "runCommand": "stella doctor run --check check.auth.signing-key", "symptoms": [ - "oidc setup", - "invalid issuer", - "jwks fetch failed" + "signing keys not generated (incomplete", + "all keys expired without rotation", + "key store corrupted (file system", + "key rotation not scheduled (manual" + ], + "tags": [ + "doctor", + "auth", + "security", + "keys" + ], + "references": [ + "docs/doctor/articles/auth/signing-key.md" + ] + }, + { + "checkCode": "check.auth.token-service", + "title": "Token Service Health", + "severity": "high", + "description": "Verifies the availability and performance of the token service endpoint (/connect/token).", + "remediation": "Review and resolve the issue described in the Token Service Health doctor check article.", + "runCommand": "stella doctor run --check check.auth.token-service", + "symptoms": [ + "authority service not running (container", + "token endpoint misconfigured (wrong path", + "database connectivity issue (authority cannot", + "database performance issues (slow queries" + ], + "tags": [ + "doctor", + "auth", + "service", + "health" + ], + "references": [ + "docs/doctor/articles/auth/token-service.md" + ] + }, + { + "checkCode": "check.binaryanalysis.buildinfo.cache", + "title": "Debian Buildinfo Cache", + "severity": "medium", + "description": "Verifies Debian buildinfo service accessibility and local cache directory configuration.", + "remediation": "Test connectivity.", + "runCommand": "stella doctor run --check check.binaryanalysis.buildinfo.cache", + "symptoms": [ + "firewall blocking https access 
to", + "network connectivity issues or dns", + "proxy configuration required but not", + "cache directory not created" + ], + "tags": [ + "doctor", + "binaryanalysis", + "buildinfo", + "debian", + "cache", + "security" + ], + "references": [ + "docs/doctor/articles/binary-analysis/buildinfo-cache.md" + ] + }, + { + "checkCode": "check.binaryanalysis.corpus.kpi.baseline", + "title": "KPI Baseline Configuration", + "severity": "medium", + "description": "Verifies that a KPI baseline file exists for regression detection in CI gates.", + "remediation": "Run a one-time job to establish the baseline.", + "runCommand": "stella doctor run --check check.binaryanalysis.corpus.kpi.baseline", + "symptoms": [ + "kpi baseline has never been", + "baseline directory path misconfigured", + "baseline file was deleted or", + "baseline created with an older" + ], + "tags": [ + "doctor", + "binaryanalysis", + "corpus", + "kpi", + "baseline", + "regression", + "ci", + "groundtruth", + "security" + ], + "references": [ + "docs/doctor/articles/binary-analysis/kpi-baseline-exists.md" + ] + }, + { + "checkCode": "check.binaryanalysis.corpus.mirror.freshness", + "title": "Corpus Mirror Freshness", + "severity": "medium", + "description": "Verifies that local corpus mirrors are not stale.", + "remediation": "For air-gapped environments, transfer pre-populated mirrors from an online system.", + "runCommand": "stella doctor run --check check.binaryanalysis.corpus.mirror.freshness", + "symptoms": [ + "corpus mirrors have not been", + "mirror sync job has not", + "network connectivity issues preventing sync", + "air-gapped setup incomplete (mirrors not" + ], + "tags": [ + "doctor", + "binaryanalysis", + "corpus", + "mirrors", + "freshness", + "security", + "groundtruth" + ], + "references": [ + "docs/doctor/articles/binary-analysis/corpus-mirror-freshness.md" + ] + }, + { + "checkCode": "check.binaryanalysis.ddeb.enabled", + "title": "Ubuntu Ddeb Repository", + "severity": "medium", + 
"description": "Verifies Ubuntu debug symbol repository (ddebs.ubuntu.com) is configured and accessible.", + "remediation": "Add ddeb repository inside the binary analysis container.", + "runCommand": "stella doctor run --check check.binaryanalysis.ddeb.enabled", + "symptoms": [ + "ddeb repository not added to", + "network connectivity issues preventing access", + "firewall blocking http access", + "running on a non-ubuntu linux" + ], + "tags": [ + "doctor", + "binaryanalysis", + "ddeb", + "ubuntu", + "symbols", + "security" + ], + "references": [ + "docs/doctor/articles/binary-analysis/ddeb-repo-enabled.md" + ] + }, + { + "checkCode": "check.binaryanalysis.debuginfod.available", + "title": "Debuginfod Availability", + "severity": "medium", + "description": "Verifies DEBUGINFOD_URLS environment variable and debuginfod service connectivity.", + "remediation": "Test connectivity.", + "runCommand": "stella doctor run --check check.binaryanalysis.debuginfod.available", + "symptoms": [ + "debuginfod_urls environment variable is not", + "configured debuginfod servers may be", + "firewall blocking https access to", + "proxy configuration required but not" + ], + "tags": [ + "doctor", + "binaryanalysis", + "debuginfod", + "symbols", + "security" + ], + "references": [ + "docs/doctor/articles/binary-analysis/debuginfod-availability.md" + ] + }, + { + "checkCode": "check.binaryanalysis.symbol.recovery.fallback", + "title": "Symbol Recovery Fallback", + "severity": "medium", + "description": "Meta-check that ensures at least one symbol recovery path is available.", + "remediation": "Configure at least one symbol source.", + "runCommand": "stella doctor run --check check.binaryanalysis.symbol.recovery.fallback", + "symptoms": [ + "all symbol recovery endpoints unreachable", + "network connectivity issues affecting all", + "firewall blocking access to symbol", + "air-gapped environment without offline symbol" + ], + "tags": [ + "doctor", + "binaryanalysis", + "symbols", + 
"fallback", + "security", + "meta" + ], + "references": [ + "docs/doctor/articles/binary-analysis/symbol-recovery-fallback.md" + ] + }, + { + "checkCode": "check.compliance.attestation-signing", + "title": "Attestation Signing Health", + "severity": "high", + "description": "Monitors attestation signing capability by querying the Attestor service at /api/v1/signing/status.", + "remediation": "Verify the Attestor service is running and the URL is correct.", + "runCommand": "stella doctor run --check check.compliance.attestation-signing", + "symptoms": [ + "hsm/kms connectivity issue preventing key", + "key rotation in progress (brief", + "key expired or revoked without", + "permission denied on the key" + ], + "tags": [ + "doctor", + "compliance", + "attestation", + "signing", + "crypto" + ], + "references": [ + "docs/doctor/articles/compliance/attestation-signing.md" + ] + }, + { + "checkCode": "check.compliance.audit-readiness", + "title": "Audit Readiness", + "severity": "medium", + "description": "Verifies the system is ready for compliance audits by querying the Evidence Locker at /api/v1/evidence/audit-readiness.", + "remediation": "Review and resolve the issue described in the Audit Readiness doctor check article.", + "runCommand": "stella doctor run --check check.compliance.audit-readiness", + "symptoms": [ + "no retention policy configured (default", + "audit logging disabled in configuration", + "backup verification job not running", + "evidence retention shorter than the" + ], + "tags": [ + "doctor", + "compliance", + "audit", + "evidence" + ], + "references": [ + "docs/doctor/articles/compliance/audit-readiness.md" + ] + }, + { + "checkCode": "check.compliance.evidence-integrity", + "title": "Evidence Integrity", + "severity": "high", + "description": "Detects evidence tampering or integrity issues by querying the Evidence Locker at /api/v1/evidence/integrity-check.", + "remediation": "Review and resolve the issue described in the Evidence Integrity 
doctor check article.", + "runCommand": "stella doctor run --check check.compliance.evidence-integrity", + "symptoms": [ + "evidence modification after signing (accidental", + "storage corruption (disk errors, incomplete", + "malicious tampering by an attacker", + "key or certificate mismatch after" + ], + "tags": [ + "doctor", + "compliance", + "security", + "integrity", + "signatures" + ], + "references": [ + "docs/doctor/articles/compliance/evidence-integrity.md" + ] + }, + { + "checkCode": "check.compliance.evidence-rate", + "title": "Evidence Generation Rate", + "severity": "high", + "description": "Monitors evidence generation success rate by querying the Evidence Locker at /api/v1/evidence/metrics.", + "remediation": "Review and resolve the issue described in the Evidence Generation Rate doctor check article.", + "runCommand": "stella doctor run --check check.compliance.evidence-rate", + "symptoms": [ + "evidence generation service failures (internal", + "database connectivity issues preventing evidence", + "signing key unavailable, blocking signed", + "storage quota exceeded on the" + ], + "tags": [ + "doctor", + "compliance", + "evidence", + "attestation" + ], + "references": [ + "docs/doctor/articles/compliance/evidence-rate.md" + ] + }, + { + "checkCode": "check.compliance.export-readiness", + "title": "Evidence Export Readiness", + "severity": "medium", + "description": "Verifies that evidence can be exported in auditor-ready formats by querying the Evidence Locker at /api/v1/evidence/export/capabilities.", + "remediation": "Review and resolve the issue described in the Evidence Export Readiness doctor check article.", + "runCommand": "stella doctor run --check check.compliance.export-readiness", + "symptoms": [ + "export dependencies not installed (e.g", + "signing keys not configured for", + "template files missing for pdf", + "evidence locker deployed without export" + ], + "tags": [ + "doctor", + "compliance", + "export", + "audit" + ], + 
"references": [ + "docs/doctor/articles/compliance/export-readiness.md" + ] + }, + { + "checkCode": "check.compliance.framework", + "title": "Compliance Framework", + "severity": "medium", + "description": "Verifies that configured compliance framework requirements are met by querying the Policy service at /api/v1/compliance/status.", + "remediation": "Review and resolve the issue described in the Compliance Framework doctor check article.", + "runCommand": "stella doctor run --check check.compliance.framework", + "symptoms": [ + "control requirements not implemented in", + "evidence gaps where expected artifacts", + "policy violations detected by the", + "configuration drift from the established" + ], + "tags": [ + "doctor", + "compliance", + "framework", + "soc2", + "fedramp" + ], + "references": [ + "docs/doctor/articles/compliance/framework.md" + ] + }, + { + "checkCode": "check.compliance.provenance-completeness", + "title": "Provenance Completeness", + "severity": "high", + "description": "Verifies that provenance records exist for all releases by querying the Provenance service at /api/v1/provenance/completeness.", + "remediation": "Review and resolve the issue described in the Provenance Completeness doctor check article.", + "runCommand": "stella doctor run --check check.compliance.provenance-completeness", + "symptoms": [ + "build pipeline not configured to", + "provenance upload failures due to", + "legacy releases created before provenance", + "manual deployments that bypass the" + ], + "tags": [ + "doctor", + "compliance", + "provenance", + "slsa" + ], + "references": [ + "docs/doctor/articles/compliance/provenance-completeness.md" + ] + }, + { + "checkCode": "check.core.auth.config", + "title": "Authentication Configuration", + "severity": "medium", + "description": "Verifies that authentication and authorization configuration is valid.", + "remediation": "Set the appropriate environment variables in your service definition inside docker-compose.yml 
or the .env file.", + "runCommand": "stella doctor run --check check.core.auth.config", + "symptoms": [ + "jwt issuer not configured", + "jwt audience not configured", + "jwt secretkey is shorter than", + "jwt secretkey contains common weak" ], "tags": [ "doctor", "security", - "oidc" + "authentication", + "configuration" ], "references": [ - "docs/modules/authority/architecture.md" + "docs/doctor/articles/core/auth-config.md" ] }, { - "checkCode": "check.router.gateway.routes", - "title": "Router route registration", - "severity": "warn", - "description": "Expected gateway routes were not registered or health probes failed.", - "remediation": "Inspect route tables and refresh router registration.", - "runCommand": "stella doctor run --check check.router.gateway.routes", + "checkCode": "check.core.config.loaded", + "title": "Configuration Loaded", + "severity": "high", + "description": "Verifies that the application configuration system is properly loaded and accessible.", + "remediation": "Verify the configuration file exists inside the container.", + "runCommand": "stella doctor run --check check.core.config.loaded", "symptoms": [ - "route missing", - "404 on expected endpoint", - "gateway routing" + "configuration file (appsettings.json) is missing", + "configuration provider not registered in", + "environment variables not set in", + "config file not included in" ], "tags": [ "doctor", - "router", - "gateway" + "quick", + "configuration", + "startup" ], "references": [ - "docs/modules/router/README.md" + "docs/doctor/articles/core/config-loaded.md" ] }, { - "checkCode": "check.integrations.secrets.binding", - "title": "Integration secret binding", + "checkCode": "check.core.config.required", + "title": "Required Settings", + "severity": "high", + "description": "Verifies that required configuration settings are present and have non-empty values.", + "remediation": "Add the connection string to your .env file or directly in docker-compose.yml.", + "runCommand": 
"stella doctor run --check check.core.config.required", + "symptoms": [ + "database connection string not configured", + "environment variables not set (check", + "typo in the environment variable", + "config file present but missing" + ], + "tags": [ + "doctor", + "quick", + "configuration", + "startup" + ], + "references": [ + "docs/doctor/articles/core/config-required.md" + ] + }, + { + "checkCode": "check.core.crypto.available", + "title": "Cryptography Providers", + "severity": "high", + "description": "Verifies that required cryptographic algorithms are available on the host system.", + "remediation": "If using Alpine-based images, ensure OpenSSL is installed.", + "runCommand": "stella doctor run --check check.core.crypto.available", + "symptoms": [ + "operating system does not support", + "fips mode restrictions preventing non-fips", + "missing cryptographic libraries (e.g., openssl", + "running on a platform with" + ], + "tags": [ + "doctor", + "quick", + "security", + "crypto" + ], + "references": [ + "docs/doctor/articles/core/crypto-available.md" + ] + }, + { + "checkCode": "check.core.env.diskspace", + "title": "Disk Space", + "severity": "high", + "description": "Verifies sufficient disk space is available on the drive where the application is running.", + "remediation": "Check disk usage and clean up.", + "runCommand": "stella doctor run --check check.core.env.diskspace", + "symptoms": [ + "log files consuming disk space", + "temporary files not cleaned up", + "application data growth (evidence locker", + "docker images and volumes consuming" + ], + "tags": [ + "doctor", + "quick", + "environment", + "resources" + ], + "references": [ + "docs/doctor/articles/core/env-diskspace.md" + ] + }, + { + "checkCode": "check.core.env.memory", + "title": "Memory Usage", "severity": "medium", - "description": "Integration connectors cannot resolve configured secrets.", - "remediation": "Validate secret provider configuration and rotate invalid credentials.", - 
"runCommand": "stella doctor run --check check.integrations.secrets.binding", + "description": "Verifies that the application process memory usage is within acceptable limits.", + "remediation": "Set memory limits for the service in docker-compose.yml.", + "runCommand": "stella doctor run --check check.core.env.memory", "symptoms": [ - "secret missing", - "invalid credential", - "auth failed" + "memory leak in application code", + "large data sets loaded entirely", + "insufficient memory limits configured for", + "normal operation with high load" ], "tags": [ "doctor", - "integrations", - "secrets" + "quick", + "environment", + "resources" ], "references": [ - "docs/modules/platform/architecture-overview.md" + "docs/doctor/articles/core/env-memory.md" ] }, { - "checkCode": "check.release.policy.gate", - "title": "Policy gate prerequisites", - "severity": "warn", - "description": "Release policy gate prerequisites are incomplete for the target environment.", - "remediation": "Review required approvals, policy bundle versions, and attestations.", - "runCommand": "stella doctor run --check check.release.policy.gate", + "checkCode": "check.core.env.variables", + "title": "Environment Variables", + "severity": "medium", + "description": "Verifies that expected environment variables are configured for the runtime environment.", + "remediation": "Add the environment variable to your service in docker-compose.yml.", + "runCommand": "stella doctor run --check check.core.env.variables", "symptoms": [ - "policy gate failed", - "missing attestation", - "promotion blocked" + "no stellaops, asp.net, or .net", + "the service is not running", + "docker compose .env file missing", + "environment variables defined in the" + ], + "tags": [ + "doctor", + "quick", + "environment", + "configuration" + ], + "references": [ + "docs/doctor/articles/core/env-variables.md" + ] + }, + { + "checkCode": "check.core.services.dependencies", + "title": "Required Services", + "severity": "high", + 
"description": "Verifies that required infrastructure services are registered in the .NET dependency injection (DI) container.", + "remediation": "This is a code-level issue, not a deployment configuration problem.", + "runCommand": "stella doctor run --check check.core.services.dependencies", + "symptoms": [ + "services not registered in the", + "missing builder.services.addxxx() call in program.cs", + "incorrect service registration order causing", + "custom host builder that skips" + ], + "tags": [ + "doctor", + "quick", + "services", + "di" + ], + "references": [ + "docs/doctor/articles/core/services-dependencies.md" + ] + }, + { + "checkCode": "check.core.services.health", + "title": "Service Health", + "severity": "high", + "description": "Aggregates health status from all registered ASP.NET Core IHealthCheck services.", + "remediation": "Check the health endpoint directly.", + "runCommand": "stella doctor run --check check.core.services.health", + "symptoms": [ + "dependent service unavailable (database, valkey", + "database connection failed or timed", + "external api unreachable (network partition", + "health check timeout exceeded (default" + ], + "tags": [ + "doctor", + "health", + "services" + ], + "references": [ + "docs/doctor/articles/core/services-health.md" + ] + }, + { + "checkCode": "check.crypto.certchain", + "title": "Certificate Chain Validation", + "severity": "medium", + "description": "Verifies certificate chain completeness, trust anchor validity, and expiration for the configured TLS certificate.", + "remediation": "Review and resolve the issue described in the Certificate Chain Validation doctor check article.", + "runCommand": "stella doctor run --check check.crypto.certchain", + "symptoms": [ + "certificate file was moved or", + "incorrect certificate path in configuration", + "missing intermediate certificates in the", + "incomplete certificate bundle (only leaf" + ], + "tags": [ + "doctor", + "crypto", + "certificate", + "tls", + 
"security" + ], + "references": [ + "docs/doctor/articles/crypto/certchain.md" + ] + }, + { + "checkCode": "check.crypto.eidas", + "title": "eIDAS Compliance", + "severity": "high", + "description": "Verifies that eIDAS-compliant signature algorithms are available for EU deployments.", + "remediation": "Review and resolve the issue described in the eIDAS Compliance doctor check article.", + "runCommand": "stella doctor run --check check.crypto.eidas", + "symptoms": [ + "openssl version too old to", + "crypto libraries compiled without required", + "configuration restricting the set of", + "legacy rsa key size configuration" + ], + "tags": [ + "doctor", + "crypto", + "eidas", + "eu", + "compliance", + "signature" + ], + "references": [ + "docs/doctor/articles/crypto/eidas.md" + ] + }, + { + "checkCode": "check.crypto.fips", + "title": "FIPS 140-2 Compliance", + "severity": "high", + "description": "Verifies that FIPS 140-2 mode is enabled and that FIPS-compliant algorithms are functional.", + "remediation": "Linux (RHEL/CentOS/Fedora).", + "runCommand": "stella doctor run --check check.crypto.fips", + "symptoms": [ + "fips mode not enabled in", + "openssl fips provider not loaded", + ".net runtime not configured for", + "fips module version incompatible with" + ], + "tags": [ + "doctor", + "crypto", + "fips", + "compliance", + "security" + ], + "references": [ + "docs/doctor/articles/crypto/fips.md" + ] + }, + { + "checkCode": "check.crypto.gost", + "title": "GOST Algorithm Availability", + "severity": "high", + "description": "Verifies that GOST cryptographic algorithms are available for Russian deployments.", + "remediation": "Review and resolve the issue described in the GOST Algorithm Availability doctor check article.", + "runCommand": "stella doctor run --check check.crypto.gost", + "symptoms": [ + "openssl gost engine not installed", + "gost engine not configured in", + "missing gost-engine package", + "gost engine version too old" + ], + "tags": [ + 
"doctor", + "crypto", + "gost", + "russia", + "compliance" + ], + "references": [ + "docs/doctor/articles/crypto/gost.md" + ] + }, + { + "checkCode": "check.crypto.hsm", + "title": "HSM/PKCS#11 Availability", + "severity": "medium", + "description": "Verifies HSM (Hardware Security Module) availability via PKCS#11 interface.", + "remediation": "Review and resolve the issue described in the HSM/PKCS#11 Availability doctor check article.", + "runCommand": "stella doctor run --check check.crypto.hsm", + "symptoms": [ + "pkcs#11 module path not configured", + "module file was moved or", + "hsm software not installed (e.g", + "pkcs#11 module initialization failure (driver" + ], + "tags": [ + "doctor", + "crypto", + "hsm", + "pkcs11", + "security" + ], + "references": [ + "docs/doctor/articles/crypto/hsm.md" + ] + }, + { + "checkCode": "check.crypto.sm", + "title": "SM2/SM3/SM4 Availability", + "severity": "high", + "description": "Verifies that Chinese national cryptographic algorithms (GM/T standards) are available for CN deployments.", + "remediation": "Review and resolve the issue described in the SM2/SM3/SM4 Availability doctor check article.", + "runCommand": "stella doctor run --check check.crypto.sm", + "symptoms": [ + "openssl version too old (pre-1.1.1)", + "using libressl instead of openssl", + "system openssl not updated to", + "openssl compiled without sm algorithm" + ], + "tags": [ + "doctor", + "crypto", + "sm2", + "sm3", + "sm4", + "china", + "compliance" + ], + "references": [ + "docs/doctor/articles/crypto/sm.md" + ] + }, + { + "checkCode": "check.docker.apiversion", + "title": "Docker API Version", + "severity": "medium", + "description": "Validates that the Docker API version meets minimum requirements for Stella Ops.", + "remediation": "Update Docker Engine to the latest stable version.", + "runCommand": "stella doctor run --check check.docker.apiversion", + "symptoms": [ + "docker engine is outdated (version", + "docker engine is functional but", + 
"using a docker-compatible runtime (podman", + "docker not updated after os" + ], + "tags": [ + "doctor", + "docker", + "api", + "compatibility" + ], + "references": [ + "docs/doctor/articles/docker/apiversion.md" + ] + }, + { + "checkCode": "check.docker.daemon", + "title": "Docker Daemon", + "severity": "high", + "description": "Validates that the Docker daemon is running and responsive.", + "remediation": "Check and restart the Docker daemon.", + "runCommand": "stella doctor run --check check.docker.daemon", + "symptoms": [ + "docker daemon is not running", + "docker is not installed on", + "docker service crashed or was", + "docker daemon returned an error" + ], + "tags": [ + "doctor", + "docker", + "daemon", + "container" + ], + "references": [ + "docs/doctor/articles/docker/daemon.md" + ] + }, + { + "checkCode": "check.docker.network", + "title": "Docker Network", + "severity": "medium", + "description": "Validates Docker network configuration and connectivity.", + "remediation": "Docker Compose normally creates networks automatically.", + "runCommand": "stella doctor run --check check.docker.network", + "symptoms": [ + "required network not found (not", + "no bridge network driver available", + "docker compose network not created", + "network name mismatch between configuration" + ], + "tags": [ + "doctor", + "docker", + "network", + "connectivity" + ], + "references": [ + "docs/doctor/articles/docker/network.md" + ] + }, + { + "checkCode": "check.docker.socket", + "title": "Docker Socket", + "severity": "high", + "description": "Validates that the Docker socket exists and is accessible with correct permissions.", + "remediation": "Mount the Docker socket for services that need container management.", + "runCommand": "stella doctor run --check check.docker.socket", + "symptoms": [ + "docker socket not found at", + "docker not installed or daemon", + "insufficient permissions on the socket", + "docker socket not mounted into" + ], + "tags": [ + "doctor", + 
"docker", + "socket", + "permissions" + ], + "references": [ + "docs/doctor/articles/docker/socket.md" + ] + }, + { + "checkCode": "check.docker.storage", + "title": "Docker Storage", + "severity": "medium", + "description": "Validates Docker storage driver and disk space usage.", + "remediation": "Check and clean Docker storage.", + "runCommand": "stella doctor run --check check.docker.storage", + "symptoms": [ + "storage driver is not overlay2", + "low disk space on the", + "disk usage exceeds 85% threshold", + "unused images, containers, and volumes" + ], + "tags": [ + "doctor", + "docker", + "storage", + "disk" + ], + "references": [ + "docs/doctor/articles/docker/storage.md" + ] + }, + { + "checkCode": "check.environment.capacity", + "title": "Environment Capacity", + "severity": "medium", + "description": "Queries the Release Orchestrator API (/api/v1/environments/capacity) and evaluates CPU, memory, storage, and deployment slot usage for every configured environment.", + "remediation": "Review and resolve the issue described in the Environment Capacity doctor check article.", + "runCommand": "stella doctor run --check check.environment.capacity", + "symptoms": [ + "gradual organic growth without corresponding", + "runaway or leaked processes consuming", + "accumulated old deployments that were", + "resource limits set too tightly" + ], + "tags": [ + "doctor", + "environment", + "capacity", + "resources", + "cpu", + "memory", + "storage" + ], + "references": [ + "docs/doctor/articles/environment/environment-capacity.md" + ] + }, + { + "checkCode": "check.environment.connectivity", + "title": "Environment Connectivity", + "severity": "medium", + "description": "Retrieves the list of environments from the Release Orchestrator (/api/v1/environments), then probes each environment agent's /health endpoint.", + "remediation": "Review and resolve the issue described in the Environment Connectivity doctor check article.", + "runCommand": "stella doctor run --check 
check.environment.connectivity", + "symptoms": [ + "environment agent service is stopped", + "firewall rule change blocking the", + "network partition between stella ops", + "tls certificate not renewed before" + ], + "tags": [ + "doctor", + "environment", + "connectivity", + "agent", + "network" + ], + "references": [ + "docs/doctor/articles/environment/environment-connectivity.md" + ] + }, + { + "checkCode": "check.environment.deployments", + "title": "Environment Deployment Health", + "severity": "medium", + "description": "Queries the Release Orchestrator (/api/v1/environments/deployments) for all deployed services across all environments.", + "remediation": "Review and resolve the issue described in the Environment Deployment Health doctor check article.", + "runCommand": "stella doctor run --check check.environment.deployments", + "symptoms": [ + "service crashed due to unhandled", + "deployment rolled out a bad", + "dependency (database, cache, message broker)", + "resource exhaustion preventing replicas from" + ], + "tags": [ + "doctor", + "environment", + "deployment", + "services", + "health" + ], + "references": [ + "docs/doctor/articles/environment/environment-deployment-health.md" + ] + }, + { + "checkCode": "check.environment.drift", + "title": "Environment Drift Detection", + "severity": "medium", + "description": "Queries the Release Orchestrator drift report API (/api/v1/environments/drift) and compares configuration snapshots across environments.", + "remediation": "Review and resolve the issue described in the Environment Drift Detection doctor check article.", + "runCommand": "stella doctor run --check check.environment.drift", + "symptoms": [ + "manual configuration changes applied directly", + "failed deployment that left partial", + "configuration sync job that did", + "environment restored from an outdated" + ], + "tags": [ + "doctor", + "environment", + "drift", + "configuration", + "consistency" + ], + "references": [ + 
"docs/doctor/articles/environment/environment-drift.md" + ] + }, + { + "checkCode": "check.environment.network.policy", + "title": "Environment Network Policy", + "severity": "medium", + "description": "Retrieves network policies from the Release Orchestrator (/api/v1/environments/network-policies) and evaluates isolation posture for each environment.", + "remediation": "Review and resolve the issue described in the Environment Network Policy doctor check article.", + "runCommand": "stella doctor run --check check.environment.network.policy", + "symptoms": [ + "network policies not yet defined", + "legacy policy left in place", + "production policy copied from dev", + "manual firewall rule change not" + ], + "tags": [ + "doctor", + "environment", + "network", + "policy", + "security", + "isolation" + ], + "references": [ + "docs/doctor/articles/environment/environment-network-policy.md" + ] + }, + { + "checkCode": "check.environment.secrets", + "title": "Environment Secret Health", + "severity": "medium", + "description": "Queries the Release Orchestrator secrets status API (/api/v1/environments/secrets/status) for metadata about all configured secrets (no actual secret values are retrieved).", + "remediation": "Review and resolve the issue described in the Environment Secret Health doctor check article.", + "runCommand": "stella doctor run --check check.environment.secrets", + "symptoms": [ + "secret expired without automated rotation", + "rotation job failed silently (scheduler", + "secret provider (vault, key vault)", + "manual secret set with fixed" + ], + "tags": [ + "doctor", + "environment", + "secrets", + "security", + "rotation", + "expiry" + ], + "references": [ + "docs/doctor/articles/environment/environment-secret-health.md" + ] + }, + { + "checkCode": "check.evidencelocker.index", + "title": "Evidence Index Consistency", + "severity": "medium", + "description": "Verifies that the evidence index is consistent with the artifacts stored on disk.", + 
"remediation": "Review and resolve the issue described in the Evidence Index Consistency doctor check article.", + "runCommand": "stella doctor run --check check.evidencelocker.index", + "symptoms": [ + "index never created (evidence locker", + "index file was deleted or", + "artifacts deleted without updating the", + "disk corruption causing artifact loss" + ], + "tags": [ + "doctor", + "evidence", + "index", + "consistency" + ], + "references": [ + "docs/doctor/articles/evidence-locker/index.md" + ] + }, + { + "checkCode": "check.evidencelocker.merkle", + "title": "Merkle Anchor Verification", + "severity": "high", + "description": "Verifies Merkle root anchoring integrity when anchoring is enabled.", + "remediation": "Review and resolve the issue described in the Merkle Anchor Verification doctor check article.", + "runCommand": "stella doctor run --check check.evidencelocker.merkle", + "symptoms": [ + "anchoring job not run yet", + "anchoring job scheduler not running", + "anchor record corrupted on disk", + "merkle root hash mismatch due" + ], + "tags": [ + "doctor", + "evidence", + "merkle", + "anchoring", + "integrity" + ], + "references": [ + "docs/doctor/articles/evidence-locker/merkle.md" + ] + }, + { + "checkCode": "check.evidencelocker.provenance", + "title": "Provenance Chain Integrity", + "severity": "high", + "description": "Validates provenance chain integrity using random sample verification.", + "remediation": "Review and resolve the issue described in the Provenance Chain Integrity doctor check article.", + "runCommand": "stella doctor run --check check.evidencelocker.provenance", + "symptoms": [ + "provenance record corrupted on disk", + "hash verification failure after accidental", + "chain link broken due to", + "data tampered or modified by" + ], + "tags": [ + "doctor", + "evidence", + "provenance", + "integrity", + "chain" + ], + "references": [ + "docs/doctor/articles/evidence-locker/provenance.md" + ] + }, + { + "checkCode": 
"check.evidencelocker.retrieval", + "title": "Attestation Retrieval", + "severity": "high", + "description": "Verifies that attestation artifacts can be retrieved from the evidence locker.", + "remediation": "Review and resolve the issue described in the Attestation Retrieval doctor check article.", + "runCommand": "stella doctor run --check check.evidencelocker.retrieval", + "symptoms": [ + "evidence locker service unavailable or", + "authentication failure when accessing the", + "artifact not found (empty or", + "evidence locker under heavy load" + ], + "tags": [ + "doctor", + "evidence", + "attestation", + "retrieval", + "core" + ], + "references": [ + "docs/doctor/articles/evidence-locker/retrieval.md" + ] + }, + { + "checkCode": "check.integration.ci.system", + "title": "CI System Connectivity", + "severity": "medium", + "description": "Iterates over all CI/CD systems defined under CI:Systems (or the legacy CI:Url single-system key).", + "remediation": "Review and resolve the issue described in the CI System Connectivity doctor check article.", + "runCommand": "stella doctor run --check check.integration.ci.system", + "symptoms": [ + "ci system is down or", + "network connectivity issue between stella", + "api credentials (token or password)", + "firewall or security group blocking" + ], + "tags": [ + "doctor", + "integration", + "ci", + "cd", + "jenkins", + "gitlab", + "github" + ], + "references": [ + "docs/doctor/articles/integration/ci-system-connectivity.md" + ] + }, + { + "checkCode": "check.integration.git", + "title": "Git Provider API", + "severity": "medium", + "description": "Resolves the configured Git provider URL from Git:Url, Scm:Url, GitHub:Url, GitLab:Url, or Gitea:Url.", + "remediation": "Review and resolve the issue described in the Git Provider API doctor check article.", + "runCommand": "stella doctor run --check check.integration.git", + "symptoms": [ + "git provider url is incorrect", + "network connectivity issues or dns", + "git 
provider service is down", + "provider uses a non-standard api" + ], + "tags": [ + "doctor", + "connectivity", + "git", + "scm" + ], + "references": [ + "docs/doctor/articles/integration/git-provider-api.md" + ] + }, + { + "checkCode": "check.integration.ldap", + "title": "LDAP/AD Connectivity", + "severity": "medium", + "description": "Reads the LDAP host from Ldap:Host, ActiveDirectory:Host, or Authority:Ldap:Host and the port from the corresponding :Port key (defaulting to 389, or 636 when UseSsl is true).", + "remediation": "Review and resolve the issue described in the LDAP/AD Connectivity doctor check article.", + "runCommand": "stella doctor run --check check.integration.ldap", + "symptoms": [ + "ldap/ad server is not running", + "firewall blocking ldap port (389)", + "dns resolution failure for the", + "network unreachable between stella ops" + ], + "tags": [ + "doctor", + "connectivity", + "ldap", + "directory", + "auth" + ], + "references": [ + "docs/doctor/articles/integration/ldap-connectivity.md" + ] + }, + { + "checkCode": "check.integration.oci.capabilities", + "title": "OCI Registry Capability Matrix", + "severity": "low", + "description": "Probes the configured OCI registry for five capabilities using a test repository (OCI:TestRepository, default library/alpine).", + "remediation": "Review and resolve the issue described in the OCI Registry Capability Matrix doctor check article.", + "runCommand": "stella doctor run --check check.integration.oci.capabilities", + "symptoms": [ + "registry does not implement oci", + "registry has delete operations disabled", + "chunked upload is disabled in", + "cross-repo mount is not supported" + ], + "tags": [ + "doctor", + "registry", + "oci", + "capabilities", + "compatibility" + ], + "references": [ + "docs/doctor/articles/integration/registry-capability-probe.md" + ] + }, + { + "checkCode": "check.integration.oci.credentials", + "title": "OCI Registry Credentials", + "severity": "high", + "description": 
"Determines the authentication method from configuration: bearer token (OCI:Token / Registry:Token), basic auth (OCI:Username + OCI:Password / Registry:Username + Registry:Password), or anonymous.", + "remediation": "Review and resolve the issue described in the OCI Registry Credentials doctor check article.", + "runCommand": "stella doctor run --check check.integration.oci.credentials", + "symptoms": [ + "credentials are invalid or have", + "token has been revoked by", + "username provided without a corresponding", + "service account token expired" + ], + "tags": [ + "doctor", + "registry", + "oci", + "credentials", + "secrets", + "auth" + ], + "references": [ + "docs/doctor/articles/integration/registry-credentials.md" + ] + }, + { + "checkCode": "check.integration.oci.pull", + "title": "OCI Registry Pull Authorization", + "severity": "high", + "description": "Sends an authenticated HTTP HEAD request to /v2//manifests/ with OCI and Docker manifest accept headers.", + "remediation": "Review and resolve the issue described in the OCI Registry Pull Authorization doctor check article.", + "runCommand": "stella doctor run --check check.integration.oci.pull", + "symptoms": [ + "credentials are invalid or expired", + "token has been revoked", + "anonymous pull is not allowed", + "service account has been removed" + ], + "tags": [ + "doctor", + "registry", + "oci", + "pull", + "authorization", + "credentials" + ], + "references": [ + "docs/doctor/articles/integration/registry-pull-authorization.md" + ] + }, + { + "checkCode": "check.integration.oci.push", + "title": "OCI Registry Push Authorization", + "severity": "high", + "description": "Sends an authenticated HTTP POST to /v2//blobs/uploads/ to initiate a blob upload session.", + "remediation": "Review and resolve the issue described in the OCI Registry Push Authorization doctor check article.", + "runCommand": "stella doctor run --check check.integration.oci.push", + "symptoms": [ + "credentials are valid but lack", 
+ "repository does not exist and", + "service account has read-only access", + "organization or team policy restricts" + ], + "tags": [ + "doctor", + "registry", + "oci", + "push", + "authorization", + "credentials" + ], + "references": [ + "docs/doctor/articles/integration/registry-push-authorization.md" + ] + }, + { + "checkCode": "check.integration.oci.referrers", + "title": "OCI Registry Referrers API Support", + "severity": "medium", + "description": "First resolves the manifest digest for the test image (OCI:TestRepository:OCI:TestTag, defaults to library/alpine:latest) by sending a HEAD request to the manifests endpoint and reading the Docker-Content-Digest header.", + "remediation": "Review and resolve the issue described in the OCI Registry Referrers API Support doctor check article.", + "runCommand": "stella doctor run --check check.integration.oci.referrers", + "symptoms": [ + "registry does not implement oci", + "registry version is too old", + "referrers api disabled in registry", + "test image does not exist" + ], + "tags": [ + "doctor", + "registry", + "oci", + "referrers", + "compatibility", + "oci-1.1" + ], + "references": [ + "docs/doctor/articles/integration/registry-referrers-api.md" + ] + }, + { + "checkCode": "check.integration.oci.registry", + "title": "OCI Registry Connectivity", + "severity": "medium", + "description": "Reads the registry URL from OCI:RegistryUrl or Registry:Url.", + "remediation": "Review and resolve the issue described in the OCI Registry Connectivity doctor check article.", + "runCommand": "stella doctor run --check check.integration.oci.registry", + "symptoms": [ + "registry url is incorrect (typo", + "network connectivity issues between stella", + "registry service is down or", + "registry does not support the" + ], + "tags": [ + "doctor", + "connectivity", + "oci", + "registry" + ], + "references": [ + "docs/doctor/articles/integration/oci-registry-connectivity.md" + ] + }, + { + "checkCode": "check.integration.oidc", 
+ "title": "OIDC Provider", + "severity": "medium", + "description": "Reads the OIDC issuer URL from Oidc:Issuer, Authentication:Oidc:Issuer, or Authority:Oidc:Issuer.", + "remediation": "Review and resolve the issue described in the OIDC Provider doctor check article.", + "runCommand": "stella doctor run --check check.integration.oidc", + "symptoms": [ + "oidc issuer url is incorrect", + "oidc provider (authority, keycloak, azure", + "network connectivity issues between stella", + "provider does not support openid" + ], + "tags": [ + "doctor", + "connectivity", + "oidc", + "auth", + "identity" + ], + "references": [ + "docs/doctor/articles/integration/oidc-provider.md" + ] + }, + { + "checkCode": "check.integration.s3.storage", + "title": "Object Storage Connectivity", + "severity": "medium", + "description": "Reads the S3 endpoint from S3:Endpoint, Storage:S3:Endpoint, or AWS:S3:ServiceURL.", + "remediation": "Review and resolve the issue described in the Object Storage Connectivity doctor check article.", + "runCommand": "stella doctor run --check check.integration.s3.storage", + "symptoms": [ + "s3 endpoint (minio, aws s3", + "network connectivity issues or dns", + "firewall blocking the storage port", + "invalid endpoint url format in" + ], + "tags": [ + "doctor", + "connectivity", + "s3", + "storage" + ], + "references": [ + "docs/doctor/articles/integration/object-storage.md" + ] + }, + { + "checkCode": "check.integration.secrets.manager", + "title": "Secrets Manager Connectivity", + "severity": "high", + "description": "Iterates over all secrets managers defined under Secrets:Managers (or the legacy Secrets:Vault:Url / Vault:Url single-manager key).", + "remediation": "Review and resolve the issue described in the Secrets Manager Connectivity doctor check article.", + "runCommand": "stella doctor run --check check.integration.secrets.manager", + "symptoms": [ + "secrets manager service is down", + "network connectivity issue between stella", + 
"authentication token has expired or", + "tls certificate issue (expired, untrusted" + ], + "tags": [ + "doctor", + "integration", + "secrets", + "vault", + "security", + "keyvault" + ], + "references": [ + "docs/doctor/articles/integration/secrets-manager-connectivity.md" + ] + }, + { + "checkCode": "check.integration.slack", + "title": "Slack Webhook", + "severity": "low", + "description": "Reads the Slack webhook URL from Slack:WebhookUrl or Notify:Slack:WebhookUrl.", + "remediation": "Review and resolve the issue described in the Slack Webhook doctor check article.", + "runCommand": "stella doctor run --check check.integration.slack", + "symptoms": [ + "network connectivity issues between stella", + "firewall blocking outbound https to", + "proxy misconfiguration preventing external https", + "webhook url is malformed or" + ], + "tags": [ + "doctor", + "notification", + "slack", + "webhook" + ], + "references": [ + "docs/doctor/articles/integration/slack-webhook.md" + ] + }, + { + "checkCode": "check.integration.smtp", + "title": "SMTP Email Connectivity", + "severity": "medium", + "description": "Reads the SMTP host from Smtp:Host, Email:Smtp:Host, or Notify:Email:Host and the port from the corresponding :Port key (defaulting to 587).", + "remediation": "Review and resolve the issue described in the SMTP Email Connectivity doctor check article.", + "runCommand": "stella doctor run --check check.integration.smtp", + "symptoms": [ + "smtp server is not running", + "firewall blocking smtp port (25", + "dns resolution failure for the", + "network unreachable between stella ops" + ], + "tags": [ + "doctor", + "connectivity", + "email", + "smtp" + ], + "references": [ + "docs/doctor/articles/integration/smtp-connectivity.md" + ] + }, + { + "checkCode": "check.integration.webhooks", + "title": "Integration Webhook Health", + "severity": "medium", + "description": "Iterates over all webhook endpoints defined under Webhooks:Endpoints.", + "remediation": "Review and 
resolve the issue described in the Integration Webhook Health doctor check article.", + "runCommand": "stella doctor run --check check.integration.webhooks", + "symptoms": [ + "webhook endpoint is down or", + "network connectivity issue or dns", + "tls certificate expired or untrusted", + "payload format changed causing receiver" + ], + "tags": [ + "doctor", + "integration", + "webhooks", + "notifications", + "events" + ], + "references": [ + "docs/doctor/articles/integration/webhook-health.md" + ] + }, + { + "checkCode": "check.logs.directory.writable", + "title": "Log Directory Writable", + "severity": "high", + "description": "Verifies that the log directory exists and is writable.", + "remediation": "Or use an emptyDir volume for ephemeral log storage with a sidecar shipping logs to an external system.", + "runCommand": "stella doctor run --check check.logs.directory.writable", + "symptoms": [ + "log directory not created during", + "directory was deleted", + "configuration points to wrong path", + "insufficient permissions or directory owned" + ], + "tags": [ + "doctor", + "observability", + "logs", + "quick" + ], + "references": [ + "docs/doctor/articles/observability/log-directory-writable.md" + ] + }, + { + "checkCode": "check.logs.rotation.configured", + "title": "Log Rotation", + "severity": "medium", + "description": "Verifies that log rotation is configured to prevent disk exhaustion.", + "remediation": "Set application-level log rotation.", + "runCommand": "stella doctor run --check check.logs.rotation.configured", + "symptoms": [ + "log rotation not configured in", + "logrotate not installed or stellaops", + "application-level rotation disabled", + "rotation threshold set too high" + ], + "tags": [ + "doctor", + "observability", + "logs" + ], + "references": [ + "docs/doctor/articles/observability/log-rotation.md" + ] + }, + { + "checkCode": "check.metrics.prometheus.scrape", + "title": "Prometheus Scrape", + "severity": "medium", + "description": 
"Verifies that the application metrics endpoint is accessible for Prometheus scraping.", + "remediation": "Edit appsettings.json.", + "runCommand": "stella doctor run --check check.metrics.prometheus.scrape", + "symptoms": [ + "metrics endpoint not enabled in", + "wrong port configured", + "service not running on the", + "authentication required but not configured" + ], + "tags": [ + "doctor", + "observability", + "metrics", + "prometheus" + ], + "references": [ + "docs/doctor/articles/observability/prometheus-scrape.md" + ] + }, + { + "checkCode": "check.notify.email.configured", + "title": "Email Configuration", + "severity": "medium", + "description": "Verifies that the email (SMTP) notification channel is properly configured.", + "remediation": "Add environment variables to your service definition.", + "runCommand": "stella doctor run --check check.notify.email.configured", + "symptoms": [ + "smtp host not set in", + "missing notify:channels:email:smtphost setting", + "smtp port not specified or", + "from address not configured" + ], + "tags": [ + "doctor", + "notify", + "email", + "smtp", + "quick", + "configuration" + ], + "references": [ + "docs/doctor/articles/notify/email-configured.md" + ] + }, + { + "checkCode": "check.notify.email.connectivity", + "title": "Email Connectivity", + "severity": "medium", + "description": "Verifies that the configured SMTP server is reachable by opening a TCP connection to the SMTP host and port.", + "remediation": "Verify network connectivity from the container.", + "runCommand": "stella doctor run --check check.notify.email.connectivity", + "symptoms": [ + "smtp server not running", + "wrong host or port in", + "firewall blocking outbound smtp connections", + "dns resolution failure for the" + ], + "tags": [ + "doctor", + "notify", + "email", + "smtp", + "connectivity", + "network" + ], + "references": [ + "docs/doctor/articles/notify/email-connectivity.md" + ] + }, + { + "checkCode": "check.notify.queue.health", + 
"title": "Notification Queue Health", + "severity": "high", + "description": "Verifies that the notification event and delivery queues are healthy.", + "remediation": "For Redis/Valkey transport.", + "runCommand": "stella doctor run --check check.notify.queue.health", + "symptoms": [ + "queue server (redis/valkey/nats) not running", + "network connectivity issues between the", + "authentication failure (wrong password or", + "incorrect connection string in configuration" + ], + "tags": [ + "doctor", + "notify", + "queue", + "redis", + "nats", + "infrastructure" + ], + "references": [ + "docs/doctor/articles/notify/queue-health.md" + ] + }, + { + "checkCode": "check.notify.slack.configured", + "title": "Slack Configuration", + "severity": "medium", + "description": "Verifies that the Slack notification channel is properly configured.", + "remediation": "> Security note: Slack webhook URLs are secrets.", + "runCommand": "stella doctor run --check check.notify.slack.configured", + "symptoms": [ + "slack webhook url not set", + "missing notify:channels:slack:webhookurl setting", + "environment variable not bound to", + "slack notifications explicitly disabled" + ], + "tags": [ + "doctor", + "notify", + "slack", + "quick", + "configuration" + ], + "references": [ + "docs/doctor/articles/notify/slack-configured.md" + ] + }, + { + "checkCode": "check.notify.slack.connectivity", + "title": "Slack Connectivity", + "severity": "medium", + "description": "Verifies that the configured Slack webhook endpoint is reachable.", + "remediation": "Test connectivity from the container.", + "runCommand": "stella doctor run --check check.notify.slack.connectivity", + "symptoms": [ + "invalid or expired webhook url", + "slack workspace configuration changed", + "webhook url revoked or regenerated", + "rate limiting by slack" + ], + "tags": [ + "doctor", + "notify", + "slack", + "connectivity", + "network" + ], + "references": [ + "docs/doctor/articles/notify/slack-connectivity.md" + ] + 
}, + { + "checkCode": "check.notify.teams.configured", + "title": "Teams Configuration", + "severity": "medium", + "description": "Verifies that the Microsoft Teams notification channel is properly configured.", + "remediation": "> Security note: Teams webhook URLs are secrets.", + "runCommand": "stella doctor run --check check.notify.teams.configured", + "symptoms": [ + "teams webhook url not set", + "webhook url is not from", + "teams notifications explicitly disabled", + "environment variable not bound to" + ], + "tags": [ + "doctor", + "notify", + "teams", + "quick", + "configuration" + ], + "references": [ + "docs/doctor/articles/notify/teams-configured.md" + ] + }, + { + "checkCode": "check.notify.teams.connectivity", + "title": "Teams Connectivity", + "severity": "medium", + "description": "Verifies that the configured Microsoft Teams webhook endpoint is reachable.", + "remediation": "Check Microsoft 365 service status at https://status.office.com.", + "runCommand": "stella doctor run --check check.notify.teams.connectivity", + "symptoms": [ + "invalid or expired webhook url", + "teams connector disabled or deleted", + "microsoft 365 tenant configuration changed", + "firewall blocking outbound https to" + ], + "tags": [ + "doctor", + "notify", + "teams", + "connectivity", + "network" + ], + "references": [ + "docs/doctor/articles/notify/teams-connectivity.md" + ] + }, + { + "checkCode": "check.notify.webhook.configured", + "title": "Webhook Configuration", + "severity": "medium", + "description": "Verifies that the generic webhook notification channel is properly configured.", + "remediation": "Review and resolve the issue described in the Webhook Configuration doctor check article.", + "runCommand": "stella doctor run --check check.notify.webhook.configured", + "symptoms": [ + "webhook url not set in", + "malformed url (missing protocol http://", + "invalid characters in url", + "webhook channel explicitly disabled" + ], + "tags": [ + "doctor", + "notify", 
+ "webhook", + "quick", + "configuration" + ], + "references": [ + "docs/doctor/articles/notify/webhook-configured.md" + ] + }, + { + "checkCode": "check.notify.webhook.connectivity", + "title": "Webhook Connectivity", + "severity": "medium", + "description": "Verifies that the configured generic webhook endpoint is reachable.", + "remediation": "Check that egress NetworkPolicies allow traffic to the webhook destination.", + "runCommand": "stella doctor run --check check.notify.webhook.connectivity", + "symptoms": [ + "endpoint server not responding", + "network connectivity issue or firewall", + "dns resolution failure", + "tls/ssl certificate problem on the" + ], + "tags": [ + "doctor", + "notify", + "webhook", + "connectivity", + "network" + ], + "references": [ + "docs/doctor/articles/notify/webhook-connectivity.md" + ] + }, + { + "checkCode": "check.operations.dead-letter", + "title": "Dead Letter Queue", + "severity": "medium", + "description": "Examines the dead letter queue for failed jobs that have exhausted their retry attempts and require manual review.", + "remediation": "Review and resolve the issue described in the Dead Letter Queue doctor check article.", + "runCommand": "stella doctor run --check check.operations.dead-letter", + "symptoms": [ + "persistent downstream service failures (registry", + "configuration errors causing jobs to", + "resource exhaustion (out of memory", + "integration service outage (scm, ci" + ], + "tags": [ + "doctor", + "operations", + "queue", + "dead-letter" + ], + "references": [ + "docs/doctor/articles/operations/dead-letter.md" + ] + }, + { + "checkCode": "check.operations.job-queue", + "title": "Job Queue Health", + "severity": "high", + "description": "Evaluates the platform job queue health across three dimensions.", + "remediation": "Set in Helm values.yaml.", + "runCommand": "stella doctor run --check check.operations.job-queue", + "symptoms": [ + "worker service not running (crashed", + "all workers crashed or 
became", + "job processing slower than submission", + "workers overloaded or misconfigured (too" + ], + "tags": [ + "doctor", + "operations", + "queue", + "jobs", + "core" + ], + "references": [ + "docs/doctor/articles/operations/job-queue.md" + ] + }, + { + "checkCode": "check.operations.scheduler", + "title": "Scheduler Health", + "severity": "medium", + "description": "Evaluates the scheduler service status, scheduled jobs, and execution history.", + "remediation": "Set in Helm values.yaml.", + "runCommand": "stella doctor run --check check.operations.scheduler", + "symptoms": [ + "scheduler service crashed or was", + "service configuration error preventing startup", + "system was down during a", + "scheduler overloaded with too many" + ], + "tags": [ + "doctor", + "operations", + "scheduler", + "core" + ], + "references": [ + "docs/doctor/articles/operations/scheduler.md" + ] + }, + { + "checkCode": "check.policy.engine", + "title": "Policy Engine Health", + "severity": "high", + "description": "Performs a three-part health check against the policy engine (OPA).", + "remediation": "Set in Helm values.yaml.", + "runCommand": "stella doctor run --check check.policy.engine", + "symptoms": [ + "policy engine service (opa) not", + "policy storage backend unavailable (bundled", + "opa/rego compilation error in a", + "policy cache corrupted after abnormal" + ], + "tags": [ + "doctor", + "policy", + "core", + "health" + ], + "references": [ + "docs/doctor/articles/policy/engine.md" + ] + }, + { + "checkCode": "check.postgres.connectivity", + "title": "PostgreSQL Connectivity", + "severity": "high", + "description": "Opens a connection to PostgreSQL and executes SELECT version(), current_timestamp to verify the database is accessible and responsive.", + "remediation": "Verify connection string in environment.", + "runCommand": "stella doctor run --check check.postgres.connectivity", + "symptoms": [ + "database server not running or", + "network connectivity issues 
between the", + "firewall blocking the database port", + "dns resolution failure for the" + ], + "tags": [ + "doctor", + "database", + "postgres", + "connectivity", + "core" + ], + "references": [ + "docs/doctor/articles/postgres/connectivity.md" + ] + }, + { + "checkCode": "check.postgres.migrations", + "title": "PostgreSQL Migration Status", + "severity": "medium", + "description": "Connects to PostgreSQL and examines the EF Core migration history to identify pending migrations.", + "remediation": "Ensure auto-migration is enabled.", + "runCommand": "stella doctor run --check check.postgres.migrations", + "symptoms": [ + "new deployment with schema changes", + "migration was not run after", + "previous migration attempt failed partway", + "database initialized without ef core" + ], + "tags": [ + "doctor", + "database", + "postgres", + "migrations", + "schema" + ], + "references": [ + "docs/doctor/articles/postgres/migrations.md" + ] + }, + { + "checkCode": "check.postgres.pool", + "title": "PostgreSQL Connection Pool", + "severity": "medium", + "description": "Connects to PostgreSQL and queries pg_stat_activity and pg_settings to evaluate connection pool health.", + "remediation": "Increase Npgsql pool size via connection string.", + "runCommand": "stella doctor run --check check.postgres.pool", + "symptoms": [ + "connection leak in application code", + "long-running queries holding connections open", + "pool size too small for", + "sudden spike in database requests" + ], + "tags": [ + "doctor", + "database", + "postgres", + "pool", + "connections" + ], + "references": [ + "docs/doctor/articles/postgres/pool.md" + ] + }, + { + "checkCode": "check.release.active", + "title": "Active Release Health", + "severity": "medium", + "description": "Queries the Release Orchestrator at /api/v1/releases?state=active and evaluates the health of all currently active releases.", + "remediation": "Review and resolve the issue described in the Active Release Health doctor check 
article.", + "runCommand": "stella doctor run --check check.release.active", + "symptoms": [ + "release workflow step failed (script", + "approval bottleneck -- approvers not", + "target environment became unreachable during", + "resource contention between concurrent releases" ], "tags": [ "doctor", "release", - "policy" + "pipeline", + "active", + "monitoring" ], "references": [ - "docs/operations/upgrade-runbook.md" + "docs/doctor/articles/release/active.md" ] }, { - "checkCode": "check.airgap.bundle.integrity", - "title": "Air-gap bundle integrity", - "severity": "high", - "description": "Offline bundle integrity validation failed.", - "remediation": "Rebuild the bundle and verify signatures and checksums before import.", - "runCommand": "stella doctor run --check check.airgap.bundle.integrity", - "symptoms": [ - "checksum mismatch", - "signature invalid", - "offline import failed" - ], - "tags": [ - "doctor", - "airgap", - "integrity" - ], - "references": [ - "docs/operations/devops/runbooks/deployment-upgrade.md" - ] - }, - { - "checkCode": "check.telemetry.pipeline.delivery", - "title": "Telemetry delivery pipeline", + "checkCode": "check.release.configuration", + "title": "Release Configuration", "severity": "medium", - "description": "Telemetry queue backlog is growing or delivery workers are stalled.", - "remediation": "Scale workers, inspect queue depth, and validate downstream availability.", - "runCommand": "stella doctor run --check check.telemetry.pipeline.delivery", + "description": "Queries the Release Orchestrator at /api/v1/workflows and validates all release workflow definitions.", + "remediation": "Edit the workflow configuration file directly if needed.", + "runCommand": "stella doctor run --check check.release.configuration", "symptoms": [ - "telemetry lag", - "queue backlog", - "delivery timeout" + "workflow configuration incomplete (created but", + "stage transition misconfigured after adding", + "environment deleted from the system", + 
"copy-paste errors when duplicating workflows" ], "tags": [ "doctor", + "release", + "configuration", + "workflow", + "validation" + ], + "references": [ + "docs/doctor/articles/release/configuration.md" + ] + }, + { + "checkCode": "check.release.environment.readiness", + "title": "Environment Readiness", + "severity": "medium", + "description": "Queries the Release Orchestrator at /api/v1/environments and evaluates the health and readiness of all configured target environments.", + "remediation": "Review and resolve the issue described in the Environment Readiness doctor check article.", + "runCommand": "stella doctor run --check check.release.environment.readiness", + "symptoms": [ + "environment agent not responding (crashed", + "network connectivity issue between the", + "container runtime issue in the", + "resource exhaustion (disk full, memory" + ], + "tags": [ + "doctor", + "release", + "environment", + "readiness", + "deployment" + ], + "references": [ + "docs/doctor/articles/release/environment-readiness.md" + ] + }, + { + "checkCode": "check.release.promotion.gates", + "title": "Promotion Gate Health", + "severity": "medium", + "description": "Queries the Release Orchestrator at /api/v1/promotion-gates and validates each promotion gate's dependencies.", + "remediation": "Set in Helm values.yaml.", + "runCommand": "stella doctor run --check check.release.promotion.gates", + "symptoms": [ + "required policies not loaded or", + "attestor service unavailable (crashed, misconfigured", + "approval workflow misconfigured (approvers removed", + "environment was deleted but its" + ], + "tags": [ + "doctor", + "release", + "promotion", + "gates", + "policy", + "attestation" + ], + "references": [ + "docs/doctor/articles/release/promotion-gates.md" + ] + }, + { + "checkCode": "check.release.rollback.readiness", + "title": "Rollback Readiness", + "severity": "medium", + "description": "Queries the Release Orchestrator at /api/v1/environments/rollback-status (with 
fallback to /api/v1/environments) and evaluates rollback capability for production environments.", + "remediation": "Configure health probes.", + "runCommand": "stella doctor run --check check.release.rollback.readiness", + "symptoms": [ + "previous deployment artifacts not retained", + "database migration not reversible (destructive", + "breaking api change deployed that", + "rollback manually disabled for the" + ], + "tags": [ + "doctor", + "release", + "rollback", + "disaster-recovery", + "production" + ], + "references": [ + "docs/doctor/articles/release/rollback-readiness.md" + ] + }, + { + "checkCode": "check.release.schedule", + "title": "Release Schedule Health", + "severity": "low", + "description": "Queries the Release Orchestrator at /api/v1/releases/scheduled and evaluates the health of scheduled releases.", + "remediation": "Review and resolve the issue described in the Release Schedule Health doctor check article.", + "runCommand": "stella doctor run --check check.release.schedule", + "symptoms": [ + "release scheduler service not running", + "prerequisite conditions (policy gates, approvals)", + "target environment was unavailable when", + "multiple teams scheduling releases to" + ], + "tags": [ + "doctor", + "release", + "schedule", + "upcoming", + "planning" + ], + "references": [ + "docs/doctor/articles/release/schedule.md" + ] + }, + { + "checkCode": "check.scanner.queue", + "title": "Scanner Queue Health", + "severity": "medium", + "description": "Queries the Scanner service at /api/v1/queue/stats and evaluates job queue health across four dimensions.", + "remediation": "Check scanner worker status and restart if needed.", + "runCommand": "stella doctor run --check check.scanner.queue", + "symptoms": [ + "scanner worker process crashed or", + "job dependency (registry, database) became", + "resource exhaustion (cpu, memory, disk)", + "database connection lost during job" + ], + "tags": [ + "doctor", + "scanner", + "queue", + "jobs", + 
"processing" + ], + "references": [ + "docs/doctor/articles/scanner/queue.md" + ] + }, + { + "checkCode": "check.scanner.reachability", + "title": "Reachability Computation Health", + "severity": "medium", + "description": "Queries the Scanner service at /api/v1/reachability/stats and evaluates reachability analysis performance and accuracy.", + "remediation": "Edit /etc/stellaops/scanner/appsettings.json.", + "runCommand": "stella doctor run --check check.scanner.reachability", + "symptoms": [ + "invalid or incomplete call graph", + "missing slice cache entries forcing", + "timeout on large codebases with", + "memory exhaustion during graph traversal" + ], + "tags": [ + "doctor", + "scanner", + "reachability", + "analysis", + "performance" + ], + "references": [ + "docs/doctor/articles/scanner/reachability.md" + ] + }, + { + "checkCode": "check.scanner.resources", + "title": "Scanner Resource Utilization", + "severity": "medium", + "description": "Queries the Scanner service at /api/v1/resources/stats and evaluates CPU, memory, and worker pool health.", + "remediation": "Edit /etc/stellaops/scanner/appsettings.json.", + "runCommand": "stella doctor run --check check.scanner.resources", + "symptoms": [ + "high scan volume during bulk", + "memory leak from accumulated scan", + "large container images (multi-gb layers)", + "insufficient cpu/memory allocation relative to" + ], + "tags": [ + "doctor", + "scanner", + "resources", + "cpu", + "memory", + "workers" + ], + "references": [ + "docs/doctor/articles/scanner/resources.md" + ] + }, + { + "checkCode": "check.scanner.sbom", + "title": "SBOM Generation Health", + "severity": "medium", + "description": "Queries the Scanner service at /api/v1/sbom/stats and evaluates SBOM generation health.", + "remediation": "Edit /etc/stellaops/scanner/appsettings.json.", + "runCommand": "stella doctor run --check check.scanner.sbom", + "symptoms": [ + "invalid or corrupted source artifacts", + "parser errors for specific 
ecosystems", + "memory exhaustion on large monorepo", + "sbom schema validation failures due" + ], + "tags": [ + "doctor", + "scanner", + "sbom", + "cyclonedx", + "spdx", + "compliance" + ], + "references": [ + "docs/doctor/articles/scanner/sbom.md" + ] + }, + { + "checkCode": "check.scanner.slice.cache", + "title": "Slice Cache Health", + "severity": "medium", + "description": "Queries the Scanner service at /api/v1/cache/stats and evaluates slice cache effectiveness.", + "remediation": "Increase cache size in docker-compose.stella-ops.yml.", + "runCommand": "stella doctor run --check check.scanner.slice.cache", + "symptoms": [ + "cache size limit too small", + "ttl configured too long, preventing", + "eviction policy not working (configuration", + "unexpected growth in the number" + ], + "tags": [ + "doctor", + "scanner", + "cache", + "slice", + "performance" + ], + "references": [ + "docs/doctor/articles/scanner/slice-cache.md" + ] + }, + { + "checkCode": "check.scanner.vuln", + "title": "Vulnerability Scan Health", + "severity": "medium", + "description": "Queries the Scanner service at /api/v1/vuln/stats and evaluates vulnerability scanning health, focusing on database freshness.", + "remediation": "Configure sync schedule in docker-compose.stella-ops.yml.", + "runCommand": "stella doctor run --check check.scanner.vuln", + "symptoms": [ + "vulnerability database sync job failed", + "feed source (nvd, osv, vendor", + "network connectivity issue preventing feed", + "scheduled sync delayed due to" + ], + "tags": [ + "doctor", + "scanner", + "vulnerability", + "cve", + "database" + ], + "references": [ + "docs/doctor/articles/scanner/vuln.md" + ] + }, + { + "checkCode": "check.scanner.witness.graph", + "title": "Witness Graph Health", + "severity": "medium", + "description": "Queries the Scanner service at /api/v1/witness/stats and evaluates witness graph construction health.", + "remediation": "Edit /etc/stellaops/scanner/appsettings.json.", + "runCommand": 
"stella doctor run --check check.scanner.witness.graph", + "symptoms": [ + "missing sbom input (sbom generation", + "parser error on specific artifact", + "cyclical dependency detected causing infinite", + "resource exhaustion during graph construction" + ], + "tags": [ + "doctor", + "scanner", + "witness", + "graph", + "reachability", + "evidence" + ], + "references": [ + "docs/doctor/articles/scanner/witness-graph.md" + ] + }, + { + "checkCode": "check.security.apikey", + "title": "API Key Security", + "severity": "medium", + "description": "Validates API key configuration and security practices.", + "remediation": "Set API key security configuration in environment variables.", + "runCommand": "stella doctor run --check check.security.apikey", + "symptoms": [ + "minimum api key length configured", + "api keys allowed in query", + "using the authorization header for", + "per-key rate limiting not enabled" + ], + "tags": [ + "doctor", + "security", + "apikey", + "authentication" + ], + "references": [ + "docs/doctor/articles/security/apikey.md" + ] + }, + { + "checkCode": "check.security.audit.logging", + "title": "Audit Logging", + "severity": "medium", + "description": "Validates that audit logging is enabled and properly configured for security events.", + "remediation": "Add audit configuration to environment variables.", + "runCommand": "stella doctor run --check check.security.audit.logging", + "symptoms": [ + "audit logging disabled in configuration", + "audit logging configuration not found", + "authentication event logging turned off", + "administrative event logging turned off" + ], + "tags": [ + "doctor", + "security", + "audit", + "logging" + ], + "references": [ + "docs/doctor/articles/security/audit-logging.md" + ] + }, + { + "checkCode": "check.security.cors", + "title": "CORS Configuration", + "severity": "medium", + "description": "Validates Cross-Origin Resource Sharing (CORS) security settings.", + "remediation": "Set explicit CORS origins in 
environment variables.", + "runCommand": "stella doctor run --check check.security.cors", + "symptoms": [ + "cors allows any origin (allowanyorigin", + "cors wildcard origin * configured", + "cors allows any origin with", + "allowed origins include non-https urls" + ], + "tags": [ + "doctor", + "security", + "cors", + "web" + ], + "references": [ + "docs/doctor/articles/security/cors.md" + ] + }, + { + "checkCode": "check.security.encryption", + "title": "Encryption Keys", + "severity": "medium", + "description": "Validates encryption key configuration and algorithms.", + "remediation": "Set encryption configuration.", + "runCommand": "stella doctor run --check check.security.encryption", + "symptoms": [ + "weak encryption algorithm configured (des", + "encryption key size too small", + "key rotation period greater than", + "data protection keys directory does" + ], + "tags": [ + "doctor", + "security", + "encryption", + "cryptography" + ], + "references": [ + "docs/doctor/articles/security/encryption.md" + ] + }, + { + "checkCode": "check.security.evidence.integrity", + "title": "Evidence Integrity", + "severity": "high", + "description": "Validates DSSE signatures, Rekor inclusion proofs, and evidence hash consistency for files in the evidence locker.", + "remediation": "Verify the evidence locker path is configured and accessible.", + "runCommand": "stella doctor run --check check.security.evidence.integrity", + "symptoms": [ + "evidence files may have been", + "dsse signatures are invalid (payload", + "evidence digests do not match", + "rekor inclusion proofs are invalid" + ], + "tags": [ + "doctor", + "security", + "evidence", + "integrity", + "dsse", + "rekor", + "offline" + ], + "references": [ + "docs/doctor/articles/security/evidence-integrity.md" + ] + }, + { + "checkCode": "check.security.headers", + "title": "Security Headers", + "severity": "medium", + "description": "Validates that HTTP security headers are properly configured.", + "remediation": "Set 
security headers via environment variables.", + "runCommand": "stella doctor run --check check.security.headers", + "symptoms": [ + "hsts not enabled (common in", + "x-frame-options header not configured or", + "content-security-policy header not defined", + "x-content-type-options: nosniff not enabled" + ], + "tags": [ + "doctor", + "security", + "headers", + "web" + ], + "references": [ + "docs/doctor/articles/security/headers.md" + ] + }, + { + "checkCode": "check.security.jwt.config", + "title": "JWT Configuration", + "severity": "high", + "description": "Validates JWT token signing and validation configuration.", + "remediation": "Set JWT configuration as environment variables.", + "runCommand": "stella doctor run --check check.security.jwt.config", + "symptoms": [ + "jwt signing key is not", + "jwt signing key is too", + "jwt issuer or audience not", + "jwt expiration time set too" + ], + "tags": [ + "doctor", + "security", + "jwt", + "authentication" + ], + "references": [ + "docs/doctor/articles/security/jwt-config.md" + ] + }, + { + "checkCode": "check.security.password.policy", + "title": "Password Policy", + "severity": "medium", + "description": "Validates password requirements meet security standards.", + "remediation": "Set password policy via environment variables.", + "runCommand": "stella doctor run --check check.security.password.policy", + "symptoms": [ + "minimum password length set too", + "password complexity requirements disabled (no", + "maximum failed login attempts too", + "account lockout duration too short" + ], + "tags": [ + "doctor", + "security", + "password", + "authentication" + ], + "references": [ + "docs/doctor/articles/security/password-policy.md" + ] + }, + { + "checkCode": "check.security.ratelimit", + "title": "Rate Limiting", + "severity": "medium", + "description": "Validates that rate limiting is configured to prevent API abuse.", + "remediation": "Set rate limiting configuration.", + "runCommand": "stella doctor run 
--check check.security.ratelimit", + "symptoms": [ + "rate limiting explicitly disabled in", + "rate limiting configuration section not", + "permit limit set too high", + "rate limit window too short" + ], + "tags": [ + "doctor", + "security", + "ratelimit", + "api" + ], + "references": [ + "docs/doctor/articles/security/ratelimit.md" + ] + }, + { + "checkCode": "check.security.secrets", + "title": "Secrets Configuration", + "severity": "high", + "description": "Validates that secrets are properly managed and not exposed as plain text in configuration.", + "remediation": "Use Docker secrets or reference an external secrets manager.", + "runCommand": "stella doctor run --check check.security.secrets", + "symptoms": [ + "secrets stored directly in appsettings.json", + "environment variables containing secrets not", + "development secrets left in production", + "no secrets management provider configured" + ], + "tags": [ + "doctor", + "security", + "secrets", + "configuration" + ], + "references": [ + "docs/doctor/articles/security/secrets.md" + ] + }, + { + "checkCode": "check.security.tls.certificate", + "title": "TLS Certificate", + "severity": "high", + "description": "Validates TLS certificate validity and expiration.", + "remediation": "Mount the certificate and configure the path.", + "runCommand": "stella doctor run --check check.security.tls.certificate", + "symptoms": [ + "certificate file path is incorrect", + "certificate has exceeded its validity", + "certificate validity period has not", + "certificate file is corrupted" + ], + "tags": [ + "doctor", + "security", + "tls", + "certificate" + ], + "references": [ + "docs/doctor/articles/security/tls-certificate.md" + ] + }, + { + "checkCode": "check.storage.backup", + "title": "Backup Directory Accessibility", + "severity": "medium", + "description": "Verifies backup directory accessibility and recent backup presence.", + "remediation": "Review and resolve the issue described in the Backup Directory 
Accessibility doctor check article.", + "runCommand": "stella doctor run --check check.storage.backup", + "symptoms": [ + "backup directory not created yet", + "path misconfigured or remote mount", + "insufficient permissions (read-only mount, wrong", + "backup job never run or" + ], + "tags": [ + "doctor", + "storage", + "backup", + "disaster-recovery" + ], + "references": [ + "docs/doctor/articles/storage/backup-directory.md" + ] + }, + { + "checkCode": "check.storage.diskspace", + "title": "Disk Space Availability", + "severity": "high", + "description": "Verifies disk space availability on drives used by Stella Ops.", + "remediation": "Consider setting up automated cleanup policies.", + "runCommand": "stella doctor run --check check.storage.diskspace", + "symptoms": [ + "log files accumulating without rotation", + "evidence artifacts consuming space", + "backup files not rotated or", + "large container images cached on" + ], + "tags": [ + "doctor", + "storage", + "disk", + "capacity", + "core" + ], + "references": [ + "docs/doctor/articles/storage/disk-space.md" + ] + }, + { + "checkCode": "check.storage.evidencelocker", + "title": "Evidence Locker Write Access", + "severity": "high", + "description": "Verifies evidence locker write permissions and performance.", + "remediation": "Review and resolve the issue described in the Evidence Locker Write Access doctor check article.", + "runCommand": "stella doctor run --check check.storage.evidencelocker", + "symptoms": [ + "insufficient file system permissions", + "directory owned by a different", + "selinux/apparmor blocking writes", + "disk full" + ], + "tags": [ + "doctor", + "storage", + "evidence", + "write", + "permissions" + ], + "references": [ + "docs/doctor/articles/storage/evidence-locker-write.md" + ] + }, + { + "checkCode": "check.telemetry.otlp.endpoint", + "title": "OTLP Endpoint", + "severity": "medium", + "description": "Verifies that the OTLP (OpenTelemetry Protocol) collector endpoint is 
reachable.", + "remediation": "Edit appsettings.json.", + "runCommand": "stella doctor run --check check.telemetry.otlp.endpoint", + "symptoms": [ + "otlp collector not running", + "wrong endpoint configured", + "network connectivity issue or firewall", + "collector health endpoint not available" + ], + "tags": [ + "doctor", + "observability", "telemetry", + "otlp" + ], + "references": [ + "docs/doctor/articles/observability/otlp-endpoint.md" + ] + }, + { + "checkCode": "check.timestamp.crl.distribution", + "title": "CRL Distribution Point Availability", + "severity": "medium", + "description": "Checks that configured CRL distribution points are accessible.", + "remediation": "Ensure egress NetworkPolicies allow traffic to CRL distribution point URLs.", + "runCommand": "stella doctor run --check check.timestamp.crl.distribution", + "symptoms": [ + "crl distribution point server is", + "network connectivity issues", + "firewall blocking http/https to cdp", + "cdp url changed by the" + ], + "tags": [ + "doctor", + "timestamping", + "crl", + "distribution", + "revocation" + ], + "references": [ + "docs/doctor/articles/timestamping/crl-distribution.md" + ] + }, + { + "checkCode": "check.timestamp.eidas.qts.qualified", + "title": "QTS Providers Qualification", + "severity": "high", + "description": "Checks that configured qualified TSA providers are still listed on the EU Trust List.", + "remediation": "Update TSA provider configuration to use only qualified providers.", + "runCommand": "stella doctor run --check check.timestamp.eidas.qts.qualified", + "symptoms": [ + "tsa provider's qualified status withdrawn", + "provider suspended due to compliance", + "provider not yet (re-)listed on", + "trust list cache is stale" + ], + "tags": [ + "doctor", + "timestamping", + "eidas", + "qts", + "qualification", + "compliance" + ], + "references": [ + "docs/doctor/articles/timestamping/qts-providers-qualified.md" + ] + }, + { + "checkCode": 
"check.timestamp.eidas.qts.status-change", + "title": "QTS Status Changes", + "severity": "medium", + "description": "Alerts on TSA qualification status changes in the past 7 days.", + "remediation": "Review changes and update provider configuration as needed.", + "runCommand": "stella doctor run --check check.timestamp.eidas.qts.status-change", + "symptoms": [ + "supervisory body action against a", + "provider voluntary withdrawal from qualification", + "new provider achieving qualification (positive" + ], + "tags": [ + "doctor", + "timestamping", + "eidas", + "qts", + "status", + "monitoring" + ], + "references": [ + "docs/doctor/articles/timestamping/qts-status-change.md" + ] + }, + { + "checkCode": "check.timestamp.eidas.trustlist.fresh", + "title": "EU Trust List Freshness", + "severity": "medium", + "description": "Checks that the EU Trust List (LOTL -- List of Trusted Lists) is up-to-date.", + "remediation": "Review and resolve the issue described in the EU Trust List Freshness doctor check article.", + "runCommand": "stella doctor run --check check.timestamp.eidas.trustlist.fresh", + "symptoms": [ + "trust list refresh job not", + "network issues preventing download from", + "air-gapped environment without scheduled trust" + ], + "tags": [ + "doctor", + "timestamping", + "eidas", + "trustlist", + "lotl", + "compliance" + ], + "references": [ + "docs/doctor/articles/timestamping/eu-trust-list-fresh.md" + ] + }, + { + "checkCode": "check.timestamp.evidence.retimestamp.pending", + "title": "Retimestamp Pending", + "severity": "medium", + "description": "Detects artifacts pending re-timestamping.", + "remediation": "Process the retimestamp queue.", + "runCommand": "stella doctor run --check check.timestamp.evidence.retimestamp.pending", + "symptoms": [ + "retimestamp queue processor not running", + "tsa endpoints unavailable during retimestamp", + "queue backlog from a large", + "retimestamp job scheduling not configured" + ], + "tags": [ + "doctor", + 
"timestamping", + "evidence", + "retimestamp", "queue" ], "references": [ - "docs/modules/platform/architecture-overview.md" + "docs/doctor/articles/timestamping/retimestamp-pending.md" + ] + }, + { + "checkCode": "check.timestamp.evidence.staleness", + "title": "Evidence Staleness", + "severity": "medium", + "description": "Aggregated check for timestamp evidence staleness across six dimensions.", + "remediation": "Review and resolve the issue described in the Evidence Staleness doctor check article.", + "runCommand": "stella doctor run --check check.timestamp.evidence.staleness", + "symptoms": [ + "re-timestamp jobs not running or", + "tsa signing certificates approaching expiry", + "ocsp/crl cache not refreshed", + "legacy artifacts signed with sha1" + ], + "tags": [ + "doctor", + "timestamping", + "evidence", + "staleness", + "retimestamp" + ], + "references": [ + "docs/doctor/articles/timestamping/evidence-staleness.md" + ] + }, + { + "checkCode": "check.timestamp.evidence.tst.deprecated-algo", + "title": "TST Deprecated Algorithms", + "severity": "medium", + "description": "Detects timestamps using deprecated hash algorithms (default: SHA1).", + "remediation": "Re-timestamp affected artifacts using approved algorithms.", + "runCommand": "stella doctor run --check check.timestamp.evidence.tst.deprecated-algo", + "symptoms": [ + "legacy artifacts timestamped with older", + "tsa provider still using sha1", + "migration to sha-256 not yet" + ], + "tags": [ + "doctor", + "timestamping", + "evidence", + "tst", + "algorithm", + "deprecated" + ], + "references": [ + "docs/doctor/articles/timestamping/tst-deprecated-algorithms.md" + ] + }, + { + "checkCode": "check.timestamp.evidence.tst.expiry", + "title": "TST Approaching Expiry", + "severity": "medium", + "description": "Detects timestamp tokens approaching signing certificate expiry.", + "remediation": "Run the retimestamp workflow to refresh expiring artifacts.", + "runCommand": "stella doctor run --check 
check.timestamp.evidence.tst.expiry", + "symptoms": [ + "tsa signing certificates approaching end-of-life", + "re-timestamp jobs not scheduled or" + ], + "tags": [ + "doctor", + "timestamping", + "evidence", + "tst", + "expiry" + ], + "references": [ + "docs/doctor/articles/timestamping/tst-approaching-expiry.md" + ] + }, + { + "checkCode": "check.timestamp.evidence.tst.missing-stapling", + "title": "TST Missing Stapling", + "severity": "medium", + "description": "Detects timestamps without stapled OCSP/CRL revocation data.", + "remediation": "Enable OCSP stapling and re-timestamp affected artifacts.", + "runCommand": "stella doctor run --check check.timestamp.evidence.tst.missing-stapling", + "symptoms": [ + "tsa provider not configured to", + "ocsp stapling disabled in tsa", + "legacy timestamps created before stapling" + ], + "tags": [ + "doctor", + "timestamping", + "evidence", + "tst", + "stapling", + "ocsp" + ], + "references": [ + "docs/doctor/articles/timestamping/tst-missing-stapling.md" + ] + }, + { + "checkCode": "check.timestamp.ocsp.responder", + "title": "OCSP Responder Availability", + "severity": "medium", + "description": "Checks that configured OCSP responders are accessible.", + "remediation": "Ensure egress NetworkPolicies allow traffic to OCSP responder URLs.", + "runCommand": "stella doctor run --check check.timestamp.ocsp.responder", + "symptoms": [ + "ocsp responder server is down", + "network connectivity issues", + "firewall blocking http/https to ocsp", + "ocsp responder url changed by" + ], + "tags": [ + "doctor", + "timestamping", + "ocsp", + "responder", + "revocation" + ], + "references": [ + "docs/doctor/articles/timestamping/ocsp-responder.md" + ] + }, + { + "checkCode": "check.timestamp.ocsp.stapling", + "title": "OCSP Stapling Enabled", + "severity": "medium", + "description": "Checks whether TSA OCSP stapling is configured and fresh.", + "remediation": "Enable OCSP stapling in TSA provider configuration.", + "runCommand": "stella 
doctor run --check check.timestamp.ocsp.stapling", + "symptoms": [ + "ocsp stapling not configured for", + "stapling status monitoring not set", + "tsa provider does not support" + ], + "tags": [ + "doctor", + "timestamping", + "ocsp", + "stapling", + "revocation" + ], + "references": [ + "docs/doctor/articles/timestamping/ocsp-stapling-enabled.md" + ] + }, + { + "checkCode": "check.timestamp.revocation.cache-fresh", + "title": "Revocation Cache Freshness", + "severity": "medium", + "description": "Checks that cached OCSP responses and CRLs are not stale.", + "remediation": "Review and resolve the issue described in the Revocation Cache Freshness doctor check article.", + "runCommand": "stella doctor run --check check.timestamp.revocation.cache-fresh", + "symptoms": [ + "revocation cache refresh job not", + "network issues preventing ocsp/crl fetches", + "cache storage issues" + ], + "tags": [ + "doctor", + "timestamping", + "revocation", + "cache", + "ocsp", + "crl" + ], + "references": [ + "docs/doctor/articles/timestamping/revocation-cache-fresh.md" + ] + }, + { + "checkCode": "check.timestamp.timesync.rekor-correlation", + "title": "TST-Rekor Time Correlation", + "severity": "medium", + "description": "Checks that TST genTime and Rekor integratedTime are properly correlated.", + "remediation": "Investigate ordering violations immediately as they may indicate tampering.", + "runCommand": "stella doctor run --check check.timestamp.timesync.rekor-correlation", + "symptoms": [ + "system clock drift causing ordering", + "pipeline delays between timestamping and", + "rekor transparency log ingestion delays", + "network issues causing timestamp reordering" + ], + "tags": [ + "doctor", + "timestamping", + "timesync", + "rekor", + "correlation", + "transparency" + ], + "references": [ + "docs/doctor/articles/timestamping/rekor-time-correlation.md" + ] + }, + { + "checkCode": "check.timestamp.timesync.system", + "title": "System Time Synchronization", + "severity": 
"high", + "description": "Checks that the system clock is synchronized with NTP servers.", + "remediation": "Docker containers inherit the host's clock.", + "runCommand": "stella doctor run --check check.timestamp.timesync.system", + "symptoms": [ + "ntp service not running (chrony", + "ntp servers unreachable (firewall blocking", + "virtual machine time drift (especially", + "hardware clock issues" + ], + "tags": [ + "doctor", + "timestamping", + "timesync", + "ntp", + "system" + ], + "references": [ + "docs/doctor/articles/timestamping/system-time-sync.md" + ] + }, + { + "checkCode": "check.timestamp.timesync.tsa-skew", + "title": "TSA Time Skew", + "severity": "medium", + "description": "Checks time skew between the system clock and TSA genTime.", + "remediation": "Ensure the host clock is synchronized (see check.timestamp.timesync.system).", + "runCommand": "stella doctor run --check check.timestamp.timesync.tsa-skew", + "symptoms": [ + "local system clock not synchronized", + "tsa provider clock drifting", + "high network latency distorting round-trip", + "proxy or load balancer adding" + ], + "tags": [ + "doctor", + "timestamping", + "timesync", + "tsa", + "skew" + ], + "references": [ + "docs/doctor/articles/timestamping/tsa-time-skew.md" + ] + }, + { + "checkCode": "check.timestamp.tsa.cert-expiry", + "title": "TSA Certificate Expiry", + "severity": "medium", + "description": "Checks if TSA signing certificates are approaching expiry.", + "remediation": "Update the certificate configuration when renewed certificates are obtained from the TSA provider.", + "runCommand": "stella doctor run --check check.timestamp.tsa.cert-expiry", + "symptoms": [ + "tsa provider certificate approaching natural", + "certificate renewal not tracked or", + "using a short-lived certificate without" + ], + "tags": [ + "doctor", + "timestamping", + "tsa", + "certificate", + "expiry" + ], + "references": [ + "docs/doctor/articles/timestamping/tsa-cert-expiry.md" + ] + }, + { + 
"checkCode": "check.timestamp.tsa.chain-valid", + "title": "TSA Certificate Chain Validity", + "severity": "high", + "description": "Ensures TSA certificate chains are valid and complete.", + "remediation": "Update certificate chain secrets.", + "runCommand": "stella doctor run --check check.timestamp.tsa.chain-valid", + "symptoms": [ + "missing intermediate certificates", + "intermediate certificate expired", + "trust store not updated after", + "misconfigured certificate chain ordering" + ], + "tags": [ + "doctor", + "timestamping", + "tsa", + "certificate", + "chain", + "validation" + ], + "references": [ + "docs/doctor/articles/timestamping/tsa-chain-valid.md" + ] + }, + { + "checkCode": "check.timestamp.tsa.failover-ready", + "title": "TSA Failover Readiness", + "severity": "medium", + "description": "Confirms that backup TSA endpoints are reachable for failover.", + "remediation": "Configure at least two TSA endpoints.", + "runCommand": "stella doctor run --check check.timestamp.tsa.failover-ready", + "symptoms": [ + "only one tsa endpoint configured", + "backup tsa endpoint down or", + "network issues to secondary tsa" + ], + "tags": [ + "doctor", + "timestamping", + "tsa", + "failover", + "redundancy" + ], + "references": [ + "docs/doctor/articles/timestamping/tsa-failover-ready.md" + ] + }, + { + "checkCode": "check.timestamp.tsa.reachable", + "title": "TSA Availability", + "severity": "high", + "description": "Verifies that configured TSA (Time Stamp Authority) endpoints are reachable and responding.", + "remediation": "Test connectivity.", + "runCommand": "stella doctor run --check check.timestamp.tsa.reachable", + "symptoms": [ + "tsa endpoint server is down", + "network connectivity issues or firewall", + "dns resolution failure", + "tsa provider maintenance or outage" + ], + "tags": [ + "doctor", + "timestamping", + "tsa", + "availability", + "connectivity" + ], + "references": [ + "docs/doctor/articles/timestamping/tsa-availability.md" + ] + }, + { + 
"checkCode": "check.timestamp.tsa.response-time", + "title": "TSA Response Time", + "severity": "medium", + "description": "Measures TSA endpoint response times against configurable thresholds.", + "remediation": "Consider adding a geographically closer TSA endpoint or a local TSA.", + "runCommand": "stella doctor run --check check.timestamp.tsa.response-time", + "symptoms": [ + "tsa server under heavy load", + "network latency to remote tsa", + "firewall or proxy adding latency", + "tsa provider experiencing service degradation" + ], + "tags": [ + "doctor", + "timestamping", + "tsa", + "latency", + "performance" + ], + "references": [ + "docs/doctor/articles/timestamping/tsa-response-time.md" + ] + }, + { + "checkCode": "check.timestamp.tsa.root-expiry", + "title": "TSA Root Certificate Expiry", + "severity": "medium", + "description": "Checks if TSA trust anchor (root) certificates are approaching expiry.", + "remediation": "Update root certificate trust store.", + "runCommand": "stella doctor run --check check.timestamp.tsa.root-expiry", + "symptoms": [ + "root certificate approaching end-of-life (typically", + "using a custom root ca", + "trust store not updated after" + ], + "tags": [ + "doctor", + "timestamping", + "tsa", + "root", + "certificate", + "expiry" + ], + "references": [ + "docs/doctor/articles/timestamping/tsa-root-expiry.md" + ] + }, + { + "checkCode": "check.timestamp.tsa.valid-response", + "title": "TSA Valid Response", + "severity": "high", + "description": "Verifies that TSA endpoints return valid RFC-3161 timestamp responses.", + "remediation": "Verify TSA configuration and switch to a known-good provider.", + "runCommand": "stella doctor run --check check.timestamp.tsa.valid-response", + "symptoms": [ + "tsa provider configuration changed (algorithm", + "tsa provider returned an error", + "network issues causing corrupted responses", + "tsa provider using an unsupported" + ], + "tags": [ + "doctor", + "timestamping", + "tsa", + 
"validation", + "rfc3161" + ], + "references": [ + "docs/doctor/articles/timestamping/tsa-valid-response.md" + ] + }, + { + "checkCode": "check.vex.issuer-trust", + "title": "VEX Issuer Trust Registry", + "severity": "medium", + "description": "Verifies that the VEX issuer trust registry is configured and that key material is available for signature verification.", + "remediation": "Review and resolve the issue described in the VEX Issuer Trust Registry doctor check article.", + "runCommand": "stella doctor run --check check.vex.issuer-trust", + "symptoms": [ + "issuer directory not configured during", + "trust anchors not imported after", + "configuration file missing or incorrect", + "all issuers expired or revoked" + ], + "tags": [ + "doctor", + "vex", + "trust", + "issuer", + "security" + ], + "references": [ + "docs/doctor/articles/vex/issuer-trust.md" + ] + }, + { + "checkCode": "check.vex.schema", + "title": "VEX Schema Compliance", + "severity": "medium", + "description": "Verifies that VEX document schema definitions are available for all three supported formats.", + "remediation": "Review and resolve the issue described in the VEX Schema Compliance doctor check article.", + "runCommand": "stella doctor run --check check.vex.schema", + "symptoms": [ + "schema files not installed during", + "schema version mismatch after an", + "configuration error pointing to wrong", + "incomplete installation missing one or" + ], + "tags": [ + "doctor", + "vex", + "schema", + "compliance" + ], + "references": [ + "docs/doctor/articles/vex/schema.md" + ] + }, + { + "checkCode": "check.vex.validation", + "title": "VEX Document Validation", + "severity": "high", + "description": "Verifies the VEX document validation pipeline health by testing three subsystems.", + "remediation": "Review and resolve the issue described in the VEX Document Validation doctor check article.", + "runCommand": "stella doctor run --check check.vex.validation", + "symptoms": [ + "vex schema 
validation service unavailable", + "invalid vex document format detected", + "signature verification key material missing", + "vex processing queue backed up" + ], + "tags": [ + "doctor", + "vex", + "security", + "validation" + ], + "references": [ + "docs/doctor/articles/vex/validation.md" ] } ] diff --git a/src/AdvisoryAI/StellaOps.AdvisoryAI/KnowledgeSearch/knowledge-docs-allowlist.json b/src/AdvisoryAI/StellaOps.AdvisoryAI/KnowledgeSearch/knowledge-docs-allowlist.json index e0c26439b..0f17a5d73 100644 --- a/src/AdvisoryAI/StellaOps.AdvisoryAI/KnowledgeSearch/knowledge-docs-allowlist.json +++ b/src/AdvisoryAI/StellaOps.AdvisoryAI/KnowledgeSearch/knowledge-docs-allowlist.json @@ -11,6 +11,7 @@ "docs/modules/router", "docs/modules/scanner", "docs/operations", - "docs/operations/devops/runbooks" + "docs/operations/devops/runbooks", + "docs/doctor/articles" ] } diff --git a/src/Doctor/AGENTS.md b/src/Doctor/AGENTS.md index c8639281d..aabea241f 100644 --- a/src/Doctor/AGENTS.md +++ b/src/Doctor/AGENTS.md @@ -34,6 +34,22 @@ - Evidence logs are JSONL with deterministic ordering and include `doctor_command`. - DSSE summaries assume operator execution and include the same command note. +## Article Requirement (MANDATORY) + +Every `IDoctorCheck` implementation MUST have a corresponding documentation article. + +**When creating or modifying a Doctor check:** + +1. **Article**: Create/update `docs/doctor/articles/<plugin>/<check-name>.md` using the template at `docs/doctor/articles/_TEMPLATE.md`. The article must include deployment-specific fix steps for Docker Compose, bare metal, and Kubernetes. + +2. **RunbookUrl**: Call `.WithRunbookUrl("docs/doctor/articles/<plugin>/<check-name>.md")` in the check's `WithRemediation()` builder so the Doctor UI links to the article. + +3. 
**Search seed**: Add/update an entry in `src/AdvisoryAI/StellaOps.AdvisoryAI/KnowledgeSearch/doctor-search-seed.json` with `checkCode`, `title`, `severity`, `description`, `remediation`, `runCommand`, `symptoms`, `tags`, and `references` pointing to the article. + +**File naming**: `check.agent.heartbeat.freshness` → `docs/doctor/articles/agent/heartbeat-freshness.md` + +**PR reviewers MUST reject checks that are missing any of these three artifacts.** + ## Testing - Doctor engine tests: `src/__Libraries/__Tests/StellaOps.Doctor.Tests` - Plugin tests: `src/__Libraries/__Tests/StellaOps.Doctor.Plugins.*.Tests` diff --git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Agent/Checks/AgentCapacityCheck.cs b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Agent/Checks/AgentCapacityCheck.cs index 7ee408553..5f6f4aa0f 100644 --- a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Agent/Checks/AgentCapacityCheck.cs +++ b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Agent/Checks/AgentCapacityCheck.cs @@ -83,7 +83,8 @@ public sealed class AgentCapacityCheck : IDoctorCheck CommandType.Shell) .AddStep(2, "Bootstrap new agents if needed", "stella agent bootstrap --name --env ", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/agent/capacity.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } diff --git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Agent/Checks/AgentCertificateExpiryCheck.cs b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Agent/Checks/AgentCertificateExpiryCheck.cs index 65bf2e188..9df5e2585 100644 --- a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Agent/Checks/AgentCertificateExpiryCheck.cs +++ b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Agent/Checks/AgentCertificateExpiryCheck.cs @@ -158,7 +158,8 @@ public sealed class AgentCertificateExpiryCheck : IDoctorCheck CommandType.Shell) .AddStep(2, "Check agent logs for renewal failures", "stella agent logs --agent-id --level warn", - CommandType.Shell)) + 
CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/agent/certificate-expiry.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -184,7 +185,8 @@ public sealed class AgentCertificateExpiryCheck : IDoctorCheck CommandType.Shell) .AddStep(2, "Optionally force early renewal", "stella agent renew-cert --agent-id ", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/agent/certificate-expiry.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } diff --git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Agent/Checks/AgentHeartbeatFreshnessCheck.cs b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Agent/Checks/AgentHeartbeatFreshnessCheck.cs index 0c2a40136..b9e0bc8f0 100644 --- a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Agent/Checks/AgentHeartbeatFreshnessCheck.cs +++ b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Agent/Checks/AgentHeartbeatFreshnessCheck.cs @@ -83,7 +83,8 @@ public sealed class AgentHeartbeatFreshnessCheck : IDoctorCheck CommandType.Shell) .AddStep(2, "Check agent registration status", "stella agent list --all", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/agent/heartbeat-freshness.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -174,7 +175,8 @@ public sealed class AgentHeartbeatFreshnessCheck : IDoctorCheck CommandType.Shell) .AddStep(2, "Monitor heartbeat trend", "stella agent logs --agent-id --tail 50", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/agent/heartbeat-freshness.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } diff --git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Agent/Checks/AgentVersionConsistencyCheck.cs b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Agent/Checks/AgentVersionConsistencyCheck.cs index 29d2736b2..dced255ff 100644 --- a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Agent/Checks/AgentVersionConsistencyCheck.cs +++ 
b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Agent/Checks/AgentVersionConsistencyCheck.cs @@ -117,7 +117,8 @@ public sealed class AgentVersionConsistencyCheck : IDoctorCheck CommandType.Shell) .AddStep(2, "Enable auto-update if appropriate", "stella agent config --agent-id --set auto_update.enabled=true", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/agent/version-consistency.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } diff --git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Agent/Checks/StaleAgentCheck.cs b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Agent/Checks/StaleAgentCheck.cs index 1208cfa23..0ddbc9f10 100644 --- a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Agent/Checks/StaleAgentCheck.cs +++ b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Agent/Checks/StaleAgentCheck.cs @@ -110,7 +110,8 @@ public sealed class StaleAgentCheck : IDoctorCheck CommandType.Shell) .AddStep(3, "If agent should be active, investigate host", "ssh 'systemctl status stella-agent'", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/agent/stale.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -136,7 +137,8 @@ public sealed class StaleAgentCheck : IDoctorCheck CommandType.Shell) .AddStep(2, "Restart agent service", "ssh 'systemctl restart stella-agent'", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/agent/stale.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } diff --git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Attestor/Checks/CosignKeyMaterialCheck.cs b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Attestor/Checks/CosignKeyMaterialCheck.cs index e27755d7a..dfd83f797 100644 --- a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Attestor/Checks/CosignKeyMaterialCheck.cs +++ b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Attestor/Checks/CosignKeyMaterialCheck.cs @@ -79,7 +79,8 @@ public sealed class 
CosignKeyMaterialCheck : IDoctorCheck .WithRemediation(rb => rb .AddStep(1, "Configure signing mode", "stella attestor signing configure --mode keyless", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/attestor/cosign-keymaterial.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -128,7 +129,8 @@ public sealed class CosignKeyMaterialCheck : IDoctorCheck CommandType.Shell) .AddStep(2, "Configure the key path", "stella attestor signing configure --mode file --key-path /etc/stellaops/cosign.key", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/attestor/cosign-keymaterial.md")) .WithVerification($"stella doctor --check {CheckId}") .Build()); } @@ -154,7 +156,8 @@ public sealed class CosignKeyMaterialCheck : IDoctorCheck CommandType.Shell) .AddStep(3, "Update configuration with correct path", "stella attestor signing configure --key-path ", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/attestor/cosign-keymaterial.md")) .WithVerification($"stella doctor --check {CheckId}") .Build()); } @@ -207,7 +210,8 @@ public sealed class CosignKeyMaterialCheck : IDoctorCheck CommandType.Shell) .AddStep(2, "Or for GCP KMS", "stella attestor signing configure --mode kms --kms-key-ref 'gcpkms://projects/.../cryptoKeys/...'", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/attestor/cosign-keymaterial.md")) .WithVerification($"stella doctor --check {CheckId}") .Build()); } diff --git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Attestor/Checks/RekorClockSkewCheck.cs b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Attestor/Checks/RekorClockSkewCheck.cs index 32362227d..bd288ccca 100644 --- a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Attestor/Checks/RekorClockSkewCheck.cs +++ b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Attestor/Checks/RekorClockSkewCheck.cs @@ -95,7 +95,8 @@ public sealed class RekorClockSkewCheck : 
IDoctorCheck .WithRemediation(rb => rb .AddStep(1, "Check network connectivity", $"curl -s {rekorUrl}/api/v1/log", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/attestor/clock-skew.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -167,7 +168,8 @@ public sealed class RekorClockSkewCheck : IDoctorCheck "NTP server unreachable", "System clock manually set incorrectly", "Virtual machine clock drift") - .WithRemediation(rb => BuildPlatformSpecificRemediation(rb, ntpStatus, vmStatus)) + .WithRemediation(rb => BuildPlatformSpecificRemediation(rb, ntpStatus, vmStatus) + .WithRunbookUrl("docs/doctor/articles/attestor/clock-skew.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } diff --git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Attestor/Checks/RekorConnectivityCheck.cs b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Attestor/Checks/RekorConnectivityCheck.cs index 9e0168de6..0f7c5016d 100644 --- a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Attestor/Checks/RekorConnectivityCheck.cs +++ b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Attestor/Checks/RekorConnectivityCheck.cs @@ -112,7 +112,8 @@ public sealed class RekorConnectivityCheck : IDoctorCheck CommandType.Shell) .AddStep(4, "If air-gapped, configure offline bundle", "stella attestor offline-bundle download --output /var/lib/stellaops/rekor-offline", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/attestor/rekor-connectivity.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -137,7 +138,8 @@ public sealed class RekorConnectivityCheck : IDoctorCheck CommandType.Shell) .AddStep(3, "For air-gapped environments, configure offline mode", "stella attestor config set --key offline.enabled --value true", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/attestor/rekor-connectivity.md")) .WithVerification($"stella doctor --check {CheckId}") 
.Build(); } @@ -158,7 +160,8 @@ public sealed class RekorConnectivityCheck : IDoctorCheck CommandType.Shell) .AddStep(2, "Check SSL certificates", "openssl s_client -connect rekor.sigstore.dev:443 -brief", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/attestor/rekor-connectivity.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } diff --git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Attestor/Checks/RekorVerificationJobCheck.cs b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Attestor/Checks/RekorVerificationJobCheck.cs index c15c9a2f0..8f3264b48 100644 --- a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Attestor/Checks/RekorVerificationJobCheck.cs +++ b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Attestor/Checks/RekorVerificationJobCheck.cs @@ -89,7 +89,8 @@ public sealed class RekorVerificationJobCheck : IDoctorCheck CommandType.Shell) .AddStep(3, "Check application logs for errors", "journalctl -u stellaops-attestor --since '1 hour ago' | grep -i 'verification\\|rekor'", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/attestor/rekor-verification-job.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -118,7 +119,8 @@ public sealed class RekorVerificationJobCheck : IDoctorCheck CommandType.Shell) .AddStep(3, "Contact security team if tampering suspected", "# This may indicate a security incident. 
Review evidence carefully.", - CommandType.Manual)) + CommandType.Manual) + .WithRunbookUrl("docs/doctor/articles/attestor/rekor-verification-job.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -145,7 +147,8 @@ public sealed class RekorVerificationJobCheck : IDoctorCheck CommandType.Shell) .AddStep(3, "If mismatch persists, escalate to security team", "# Root hash mismatch may indicate log tampering", - CommandType.Manual)) + CommandType.Manual) + .WithRunbookUrl("docs/doctor/articles/attestor/rekor-verification-job.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -173,7 +176,8 @@ public sealed class RekorVerificationJobCheck : IDoctorCheck CommandType.Shell) .AddStep(3, "Review recent logs", "journalctl -u stellaops-attestor --since '48 hours ago' | grep -i error", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/attestor/rekor-verification-job.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -202,7 +206,8 @@ public sealed class RekorVerificationJobCheck : IDoctorCheck CommandType.Shell) .AddStep(3, "Re-sync from Rekor if needed", "stella attestor verification resync --failed-only", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/attestor/rekor-verification-job.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } diff --git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Attestor/Checks/SigningKeyExpirationCheck.cs b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Attestor/Checks/SigningKeyExpirationCheck.cs index 5087f26ff..53210b8e7 100644 --- a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Attestor/Checks/SigningKeyExpirationCheck.cs +++ b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Attestor/Checks/SigningKeyExpirationCheck.cs @@ -114,7 +114,8 @@ public sealed class SigningKeyExpirationCheck : IDoctorCheck CommandType.Shell) .AddStep(2, "Set up key expiration monitoring", "stella notify channels add 
--type email --event key.expiring --threshold-days 30", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/attestor/keymaterial.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -135,7 +136,8 @@ public sealed class SigningKeyExpirationCheck : IDoctorCheck CommandType.Shell) .AddStep(2, "Review all critical keys", "stella keys status", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/attestor/keymaterial.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -157,7 +159,8 @@ public sealed class SigningKeyExpirationCheck : IDoctorCheck CommandType.Shell) .AddStep(2, "Schedule rotation with overlap period", $"stella keys rotate {warningKeys[0].KeyId} --overlap-days 14", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/attestor/keymaterial.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } diff --git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Attestor/Checks/TransparencyLogConsistencyCheck.cs b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Attestor/Checks/TransparencyLogConsistencyCheck.cs index e32c93718..a2a2945c4 100644 --- a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Attestor/Checks/TransparencyLogConsistencyCheck.cs +++ b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Attestor/Checks/TransparencyLogConsistencyCheck.cs @@ -106,7 +106,8 @@ public sealed class TransparencyLogConsistencyCheck : IDoctorCheck $"cat {checkpointPath}") .AddStep(2, "Trigger re-sync", "stella attestor transparency sync", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/attestor/transparency-consistency.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -184,7 +185,8 @@ public sealed class TransparencyLogConsistencyCheck : IDoctorCheck CommandType.Shell) .AddDestructiveStep(4, "If using wrong log, reset checkpoint (DESTRUCTIVE)", $"rm {checkpointPath} && stella 
attestor transparency sync", - $"ls -la {checkpointPath}")) + $"ls -la {checkpointPath}") + .WithRunbookUrl("docs/doctor/articles/attestor/transparency-consistency.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -211,7 +213,8 @@ public sealed class TransparencyLogConsistencyCheck : IDoctorCheck CommandType.Manual) .AddStep(2, "Compare with independent source", "curl -s https://rekor.sigstore.dev/api/v1/log | jq .", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/attestor/transparency-consistency.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } diff --git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Auth/Checks/AuthConfigurationCheck.cs b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Auth/Checks/AuthConfigurationCheck.cs index 8cb0995db..7e1b87550 100644 --- a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Auth/Checks/AuthConfigurationCheck.cs +++ b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Auth/Checks/AuthConfigurationCheck.cs @@ -71,7 +71,8 @@ public sealed class AuthConfigurationCheck : IDoctorCheck CommandType.Shell) .AddStep(3, "Generate signing keys", "stella keys generate --type rsa", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/auth/config.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -96,7 +97,8 @@ public sealed class AuthConfigurationCheck : IDoctorCheck CommandType.Shell) .AddStep(2, "Check key store health", "stella doctor --check check.crypto.keystore", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/auth/config.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -122,7 +124,8 @@ public sealed class AuthConfigurationCheck : IDoctorCheck CommandType.Shell) .AddStep(2, "Schedule key rotation", "stella keys rotate --schedule 30d", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/auth/config.md")) 
.WithVerification($"stella doctor --check {CheckId}") .Build(); } diff --git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Auth/Checks/OidcProviderConnectivityCheck.cs b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Auth/Checks/OidcProviderConnectivityCheck.cs index e94ee90b1..ace20ef92 100644 --- a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Auth/Checks/OidcProviderConnectivityCheck.cs +++ b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Auth/Checks/OidcProviderConnectivityCheck.cs @@ -103,7 +103,8 @@ public sealed class OidcProviderConnectivityCheck : IDoctorCheck CommandType.Shell) .AddStep(3, "Check network configuration", "stella doctor --check check.network.dns", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/auth/oidc.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -132,7 +133,8 @@ public sealed class OidcProviderConnectivityCheck : IDoctorCheck CommandType.Shell) .AddStep(2, "Check OIDC provider configuration", "stella auth oidc validate", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/auth/oidc.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -159,7 +161,8 @@ public sealed class OidcProviderConnectivityCheck : IDoctorCheck .WithRemediation(rb => rb .AddStep(1, "Check JWKS endpoint", $"curl -s {oidcStatus.JwksUri} | jq .", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/auth/oidc.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } diff --git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Auth/Checks/SigningKeyHealthCheck.cs b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Auth/Checks/SigningKeyHealthCheck.cs index 051872458..8b9ee0cd0 100644 --- a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Auth/Checks/SigningKeyHealthCheck.cs +++ b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Auth/Checks/SigningKeyHealthCheck.cs @@ -69,7 +69,8 @@ public sealed class SigningKeyHealthCheck : 
IDoctorCheck CommandType.Shell) .AddStep(2, "Activate the key", "stella keys activate", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/auth/signing-key.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -94,7 +95,8 @@ public sealed class SigningKeyHealthCheck : IDoctorCheck CommandType.Shell) .AddStep(2, "Schedule automatic rotation", "stella keys rotate --schedule 30d", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/auth/signing-key.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } diff --git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Auth/Checks/TokenServiceHealthCheck.cs b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Auth/Checks/TokenServiceHealthCheck.cs index 734547190..d56d169db 100644 --- a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Auth/Checks/TokenServiceHealthCheck.cs +++ b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Auth/Checks/TokenServiceHealthCheck.cs @@ -74,7 +74,8 @@ public sealed class TokenServiceHealthCheck : IDoctorCheck CommandType.Shell) .AddStep(3, "Check database connectivity", "stella doctor --check check.storage.postgres", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/auth/token-service.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -99,7 +100,8 @@ public sealed class TokenServiceHealthCheck : IDoctorCheck CommandType.Shell) .AddStep(2, "Review database performance", "stella doctor --check check.storage.performance", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/auth/token-service.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -121,7 +123,8 @@ public sealed class TokenServiceHealthCheck : IDoctorCheck .WithRemediation(rb => rb .AddStep(1, "Monitor service metrics", "stella auth metrics --watch", - CommandType.Shell)) + CommandType.Shell) + 
.WithRunbookUrl("docs/doctor/articles/auth/token-service.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } diff --git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.BinaryAnalysis/Checks/BuildinfoCacheCheck.cs b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.BinaryAnalysis/Checks/BuildinfoCacheCheck.cs index 20b73ef49..0c84470b9 100644 --- a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.BinaryAnalysis/Checks/BuildinfoCacheCheck.cs +++ b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.BinaryAnalysis/Checks/BuildinfoCacheCheck.cs @@ -116,7 +116,8 @@ public sealed class BuildinfoCacheCheck : IDoctorCheck .AddShellStep(3, "Check proxy settings if behind a corporate firewall", "export HTTPS_PROXY=http://proxy.example.com:8080") .AddManualStep(4, "For air-gapped environments", - "Pre-populate the buildinfo cache with required files or disable this check")) + "Pre-populate the buildinfo cache with required files or disable this check") + .WithRunbookUrl("docs/doctor/articles/binaryanalysis/buildinfo-cache.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -140,7 +141,8 @@ public sealed class BuildinfoCacheCheck : IDoctorCheck .AddShellStep(1, "Test connectivity", $"curl -I {BuildinfosUrl}") .AddManualStep(2, "If air-gapped intentionally", - "Ensure buildinfo cache is pre-populated with required files")) + "Ensure buildinfo cache is pre-populated with required files") + .WithRunbookUrl("docs/doctor/articles/binaryanalysis/buildinfo-cache.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -161,7 +163,8 @@ public sealed class BuildinfoCacheCheck : IDoctorCheck .WithCauses("Cache directory not created") .WithRemediation(rb => rb .AddShellStep(1, "Create cache directory", - $"sudo mkdir -p {cacheDir} && sudo chmod 755 {cacheDir}")) + $"sudo mkdir -p {cacheDir} && sudo chmod 755 {cacheDir}") + .WithRunbookUrl("docs/doctor/articles/binaryanalysis/buildinfo-cache.md")) .WithVerification($"stella doctor --check 
{CheckId}") .Build(); } @@ -181,7 +184,8 @@ public sealed class BuildinfoCacheCheck : IDoctorCheck .WithCauses("Insufficient permissions on cache directory") .WithRemediation(rb => rb .AddShellStep(1, "Fix cache directory permissions", - $"sudo chown $(whoami) {cacheDir} && chmod 755 {cacheDir}")) + $"sudo chown $(whoami) {cacheDir} && chmod 755 {cacheDir}") + .WithRunbookUrl("docs/doctor/articles/binaryanalysis/buildinfo-cache.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } diff --git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.BinaryAnalysis/Checks/CorpusMirrorFreshnessCheck.cs b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.BinaryAnalysis/Checks/CorpusMirrorFreshnessCheck.cs index 48861371d..466df9f13 100644 --- a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.BinaryAnalysis/Checks/CorpusMirrorFreshnessCheck.cs +++ b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.BinaryAnalysis/Checks/CorpusMirrorFreshnessCheck.cs @@ -92,7 +92,8 @@ public sealed class CorpusMirrorFreshnessCheck : IDoctorCheck .AddStellaStep(2, "Initialize corpus mirrors", "groundtruth mirror sync --all") .AddManualStep(3, "For air-gapped environments", - "Copy pre-populated mirrors from an online system to the mirrors directory")) + "Copy pre-populated mirrors from an online system to the mirrors directory") + .WithRunbookUrl("docs/doctor/articles/binaryanalysis/corpus-mirror-freshness.md")) .WithVerification($"stella doctor --check {CheckId}") .Build()); } @@ -156,7 +157,8 @@ public sealed class CorpusMirrorFreshnessCheck : IDoctorCheck .AddShellStep(2, "Or sync specific mirrors", "stella groundtruth mirror sync --source debian") .AddManualStep(3, "For air-gapped environments", - "Transfer pre-populated mirrors from an online system")) + "Transfer pre-populated mirrors from an online system") + .WithRunbookUrl("docs/doctor/articles/binaryanalysis/corpus-mirror-freshness.md")) .WithVerification($"stella doctor --check {CheckId}") .Build()); } @@ -178,7 +180,8 @@ public 
sealed class CorpusMirrorFreshnessCheck : IDoctorCheck .AddShellStep(2, "Check mirror sync job status", "systemctl status stella-mirror-sync.timer") .AddManualStep(3, "Set up automatic mirror sync", - $"Configure a cron job or systemd timer to run 'stella groundtruth mirror sync' at least every {staleThresholdDays} days")) + $"Configure a cron job or systemd timer to run 'stella groundtruth mirror sync' at least every {staleThresholdDays} days") + .WithRunbookUrl("docs/doctor/articles/binaryanalysis/corpus-mirror-freshness.md")) .WithVerification($"stella doctor --check {CheckId}") .Build()); } @@ -198,7 +201,8 @@ public sealed class CorpusMirrorFreshnessCheck : IDoctorCheck .AddStellaStep(1, "Sync stale mirrors", $"groundtruth mirror sync --sources {string.Join(",", staleMirrors.Select(m => m.Name.Split('/')[0]))}") .AddShellStep(2, "Check sync logs for errors", - "journalctl -u stella-mirror-sync --since '7 days ago' | grep -i error")) + "journalctl -u stella-mirror-sync --since '7 days ago' | grep -i error") + .WithRunbookUrl("docs/doctor/articles/binaryanalysis/corpus-mirror-freshness.md")) .WithVerification($"stella doctor --check {CheckId}") .Build()); } @@ -212,7 +216,8 @@ public sealed class CorpusMirrorFreshnessCheck : IDoctorCheck .WithEvidence("Mirror Status", AddMirrorEvidence) .WithRemediation(rb => rb .AddManualStep(1, "Optionally add missing mirrors", - $"stella groundtruth mirror sync --sources {string.Join(",", missingMirrors.Select(m => m.Name.Split('/')[0]))}")) + $"stella groundtruth mirror sync --sources {string.Join(",", missingMirrors.Select(m => m.Name.Split('/')[0]))}") + .WithRunbookUrl("docs/doctor/articles/binaryanalysis/corpus-mirror-freshness.md")) .WithVerification($"stella doctor --check {CheckId}") .Build()); } diff --git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.BinaryAnalysis/Checks/DdebRepoEnabledCheck.cs b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.BinaryAnalysis/Checks/DdebRepoEnabledCheck.cs index e22ba14ff..cfbe11b1e 
100644 --- a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.BinaryAnalysis/Checks/DdebRepoEnabledCheck.cs +++ b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.BinaryAnalysis/Checks/DdebRepoEnabledCheck.cs @@ -115,7 +115,8 @@ public sealed partial class DdebRepoEnabledCheck : IDoctorCheck .AddShellStep(2, "Import repository signing key", "sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys F2EDC64DC5AEE1F6B9C621F0C8CAB6595FDFF622") .AddShellStep(3, "Update package lists", - "sudo apt update")) + "sudo apt update") + .WithRunbookUrl("docs/doctor/articles/binaryanalysis/ddeb-enabled.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -138,7 +139,8 @@ public sealed partial class DdebRepoEnabledCheck : IDoctorCheck .AddShellStep(2, "Import repository signing key", "sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys F2EDC64DC5AEE1F6B9C621F0C8CAB6595FDFF622") .AddShellStep(3, "Update package lists", - "sudo apt update")) + "sudo apt update") + .WithRunbookUrl("docs/doctor/articles/binaryanalysis/ddeb-enabled.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -167,7 +169,8 @@ public sealed partial class DdebRepoEnabledCheck : IDoctorCheck .AddShellStep(2, "Check proxy settings if behind a corporate firewall", "export HTTP_PROXY=http://proxy.example.com:8080") .AddManualStep(3, "For air-gapped environments", - "Set up a local ddeb mirror or use offline symbol packages")) + "Set up a local ddeb mirror or use offline symbol packages") + .WithRunbookUrl("docs/doctor/articles/binaryanalysis/ddeb-enabled.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } diff --git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.BinaryAnalysis/Checks/DebuginfodAvailabilityCheck.cs b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.BinaryAnalysis/Checks/DebuginfodAvailabilityCheck.cs index 082b32578..c80b26d75 100644 --- 
a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.BinaryAnalysis/Checks/DebuginfodAvailabilityCheck.cs +++ b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.BinaryAnalysis/Checks/DebuginfodAvailabilityCheck.cs @@ -113,7 +113,8 @@ public sealed class DebuginfodAvailabilityCheck : IDoctorCheck .AddShellStep(2, "Verify network connectivity", $"curl -I {DefaultFedoraUrl}") .AddManualStep(3, "For air-gapped environments", - "Set up a local debuginfod mirror or pre-populate the symbol cache. See docs/modules/binary-index/ground-truth-corpus.md for offline setup")) + "Set up a local debuginfod mirror or pre-populate the symbol cache. See docs/modules/binary-index/ground-truth-corpus.md for offline setup") + .WithRunbookUrl("docs/doctor/articles/binaryanalysis/debuginfod-available.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -131,7 +132,8 @@ public sealed class DebuginfodAvailabilityCheck : IDoctorCheck }) .WithRemediation(rb => rb .AddShellStep(1, "Optionally set DEBUGINFOD_URLS for explicit configuration (recommended for production)", - $"export {DebuginfodUrlsEnvVar}=\"{DefaultFedoraUrl}\"")) + $"export {DebuginfodUrlsEnvVar}=\"{DefaultFedoraUrl}\"") + .WithRunbookUrl("docs/doctor/articles/binaryanalysis/debuginfod-available.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -158,7 +160,8 @@ public sealed class DebuginfodAvailabilityCheck : IDoctorCheck .AddShellStep(2, "Check proxy settings if behind a corporate firewall", "export HTTPS_PROXY=http://proxy.example.com:8080") .AddManualStep(3, "For air-gapped environments", - "Deploy a local debuginfod instance or use offline symbol bundles. See docs/modules/binary-index/ground-truth-corpus.md for offline setup")) + "Deploy a local debuginfod instance or use offline symbol bundles. 
See docs/modules/binary-index/ground-truth-corpus.md for offline setup") + .WithRunbookUrl("docs/doctor/articles/binaryanalysis/debuginfod-available.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -182,7 +185,8 @@ public sealed class DebuginfodAvailabilityCheck : IDoctorCheck .AddShellStep(1, "Verify unreachable servers", $"curl -I {unreachableUrls[0]}") .AddManualStep(2, "Update DEBUGINFOD_URLS to remove unavailable servers", - $"Edit DEBUGINFOD_URLS to remove: {string.Join(", ", unreachableUrls)}")) + $"Edit DEBUGINFOD_URLS to remove: {string.Join(", ", unreachableUrls)}") + .WithRunbookUrl("docs/doctor/articles/binaryanalysis/debuginfod-available.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } diff --git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.BinaryAnalysis/Checks/KpiBaselineExistsCheck.cs b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.BinaryAnalysis/Checks/KpiBaselineExistsCheck.cs index 4aeba85b4..805b76a09 100644 --- a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.BinaryAnalysis/Checks/KpiBaselineExistsCheck.cs +++ b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.BinaryAnalysis/Checks/KpiBaselineExistsCheck.cs @@ -95,7 +95,8 @@ public sealed class KpiBaselineExistsCheck : IDoctorCheck .AddStellaStep(2, "Run corpus validation to establish baseline", "groundtruth validate run --corpus datasets/golden-corpus/seed/ --output-baseline") .AddStellaStep(3, "Or manually set the current results as baseline", - $"groundtruth baseline update --from-latest --output {baselinePath}")) + $"groundtruth baseline update --from-latest --output {baselinePath}") + .WithRunbookUrl("docs/doctor/articles/binaryanalysis/corpus-kpi-baseline.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -130,7 +131,8 @@ public sealed class KpiBaselineExistsCheck : IDoctorCheck .AddShellStep(1, "Copy latest baseline to default location", $"cp {Path.Combine(baselineDir, latest.Filename)} {baselinePath}") .AddManualStep(2, 
"Or update configuration to use existing baseline", - $"Set BinaryAnalysis:Corpus:BaselineFilename to '{latest.Filename}'")) + $"Set BinaryAnalysis:Corpus:BaselineFilename to '{latest.Filename}'") + .WithRunbookUrl("docs/doctor/articles/binaryanalysis/corpus-kpi-baseline.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -154,7 +156,8 @@ public sealed class KpiBaselineExistsCheck : IDoctorCheck .AddStellaStep(1, "Run corpus validation to establish baseline", $"groundtruth validate run --corpus datasets/golden-corpus/seed/ --output {baselinePath}") .AddStellaStep(2, "Or update baseline from existing validation results", - $"groundtruth baseline update --from-latest --output {baselinePath}")) + $"groundtruth baseline update --from-latest --output {baselinePath}") + .WithRunbookUrl("docs/doctor/articles/binaryanalysis/corpus-kpi-baseline.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -189,7 +192,8 @@ public sealed class KpiBaselineExistsCheck : IDoctorCheck .AddStellaStep(2, "Regenerate baseline from latest validation", $"groundtruth baseline update --from-latest --output {baselinePath}") .AddShellStep(3, "Or validate JSON manually", - $"cat {baselinePath} | jq .")) + $"cat {baselinePath} | jq .") + .WithRunbookUrl("docs/doctor/articles/binaryanalysis/corpus-kpi-baseline.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -219,7 +223,8 @@ public sealed class KpiBaselineExistsCheck : IDoctorCheck "Partial baseline update") .WithRemediation(rb => rb .AddStellaStep(1, "Regenerate complete baseline", - $"groundtruth baseline update --from-latest --output {baselinePath}")) + $"groundtruth baseline update --from-latest --output {baselinePath}") + .WithRunbookUrl("docs/doctor/articles/binaryanalysis/corpus-kpi-baseline.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } diff --git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.BinaryAnalysis/Checks/SymbolRecoveryFallbackCheck.cs 
b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.BinaryAnalysis/Checks/SymbolRecoveryFallbackCheck.cs index 918f42e6b..67766ee4b 100644 --- a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.BinaryAnalysis/Checks/SymbolRecoveryFallbackCheck.cs +++ b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.BinaryAnalysis/Checks/SymbolRecoveryFallbackCheck.cs @@ -151,7 +151,8 @@ public sealed class SymbolRecoveryFallbackCheck : IDoctorCheck .AddManualStep(3, "For air-gapped environments", "Set up an offline symbol bundle. See docs/modules/binary-index/ground-truth-corpus.md for instructions on creating and importing offline symbol packs") .AddManualStep(4, "Consider setting up a local debuginfod mirror", - "Run a local debuginfod server and point DEBUGINFOD_URLS to it")) + "Run a local debuginfod server and point DEBUGINFOD_URLS to it") + .WithRunbookUrl("docs/doctor/articles/binaryanalysis/symbol-recovery-fallback.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -168,7 +169,8 @@ public sealed class SymbolRecoveryFallbackCheck : IDoctorCheck .WithEvidence("Symbol Recovery Status", AddChildEvidence) .WithRemediation(rb => rb .AddManualStep(1, "Optionally configure additional sources for redundancy", - $"The following sources are unavailable: {string.Join(", ", unavailableNames)}")) + $"The following sources are unavailable: {string.Join(", ", unavailableNames)}") + .WithRunbookUrl("docs/doctor/articles/binaryanalysis/symbol-recovery-fallback.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } diff --git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Compliance/Checks/AttestationSigningHealthCheck.cs b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Compliance/Checks/AttestationSigningHealthCheck.cs index 277ed33f0..0f2c4569d 100644 --- a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Compliance/Checks/AttestationSigningHealthCheck.cs +++ b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Compliance/Checks/AttestationSigningHealthCheck.cs @@ -109,6 
+109,7 @@ public sealed class AttestationSigningHealthCheck : IDoctorCheck rb.AddStep(2, "Verify HSM/KMS connectivity", "stella attestor hsm test", CommandType.Shell); + rb.WithRunbookUrl("docs/doctor/articles/compliance/attestation-signing.md"); }) .WithVerification($"stella doctor --check {CheckId}") .Build(); @@ -130,7 +131,8 @@ public sealed class AttestationSigningHealthCheck : IDoctorCheck }) .WithCauses("Key not rotated before expiry") .WithRemediation(rb => rb - .AddStep(1, "Rotate signing key", "stella attestor key rotate", CommandType.Shell)) + .AddStep(1, "Rotate signing key", "stella attestor key rotate", CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/compliance/attestation-signing.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -147,7 +149,8 @@ public sealed class AttestationSigningHealthCheck : IDoctorCheck }) .WithCauses("Key approaching end of validity") .WithRemediation(rb => rb - .AddStep(1, "Schedule key rotation", "stella attestor key rotate --schedule", CommandType.Shell)) + .AddStep(1, "Schedule key rotation", "stella attestor key rotate --schedule", CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/compliance/attestation-signing.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } diff --git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Compliance/Checks/AuditReadinessCheck.cs b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Compliance/Checks/AuditReadinessCheck.cs index ea72f02af..2b916d31d 100644 --- a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Compliance/Checks/AuditReadinessCheck.cs +++ b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Compliance/Checks/AuditReadinessCheck.cs @@ -117,6 +117,7 @@ public sealed class AuditReadinessCheck : IDoctorCheck rb.AddStep(2, "Enable audit logging", "stella audit enable", CommandType.Shell); + rb.WithRunbookUrl("docs/doctor/articles/compliance/audit-readiness.md"); }) .WithVerification($"stella doctor --check {CheckId}") .Build(); diff 
--git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Compliance/Checks/ComplianceFrameworkCheck.cs b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Compliance/Checks/ComplianceFrameworkCheck.cs index bc619f702..0247d8997 100644 --- a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Compliance/Checks/ComplianceFrameworkCheck.cs +++ b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Compliance/Checks/ComplianceFrameworkCheck.cs @@ -111,6 +111,7 @@ public sealed class ComplianceFrameworkCheck : IDoctorCheck rb.AddStep(2, "Review remediation guidance", "stella compliance remediate --plan", CommandType.Shell); + rb.WithRunbookUrl("docs/doctor/articles/compliance/framework.md"); }) .WithVerification($"stella doctor --check {CheckId}") .Build(); diff --git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Compliance/Checks/EvidenceExportReadinessCheck.cs b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Compliance/Checks/EvidenceExportReadinessCheck.cs index 35dd99ee7..e44a71ff7 100644 --- a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Compliance/Checks/EvidenceExportReadinessCheck.cs +++ b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Compliance/Checks/EvidenceExportReadinessCheck.cs @@ -115,6 +115,7 @@ public sealed class EvidenceExportReadinessCheck : IDoctorCheck rb.AddStep(1, "Check export configuration", "stella evidence export --check", CommandType.Shell); + rb.WithRunbookUrl("docs/doctor/articles/compliance/export-readiness.md"); }) .WithVerification($"stella doctor --check {CheckId}") .Build(); diff --git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Compliance/Checks/EvidenceGenerationRateCheck.cs b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Compliance/Checks/EvidenceGenerationRateCheck.cs index d25e33605..903893462 100644 --- a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Compliance/Checks/EvidenceGenerationRateCheck.cs +++ b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Compliance/Checks/EvidenceGenerationRateCheck.cs @@ -112,6 +112,7 @@ public sealed class 
EvidenceGenerationRateCheck : IDoctorCheck rb.AddStep(2, "Verify signing keys", "stella evidence keys status", CommandType.Shell); + rb.WithRunbookUrl("docs/doctor/articles/compliance/evidence-rate.md"); }) .WithVerification($"stella doctor --check {CheckId}") .Build(); diff --git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Compliance/Checks/EvidenceTamperCheck.cs b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Compliance/Checks/EvidenceTamperCheck.cs index 3b0d972b1..748fc775c 100644 --- a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Compliance/Checks/EvidenceTamperCheck.cs +++ b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Compliance/Checks/EvidenceTamperCheck.cs @@ -108,6 +108,7 @@ public sealed class EvidenceTamperCheck : IDoctorCheck .WithSafetyNote("DO NOT delete tampered evidence - preserve for investigation"); rb.AddStep(2, "Investigate security incident", "Contact security team", CommandType.Manual) .RequiresBackup(); + rb.WithRunbookUrl("docs/doctor/articles/compliance/evidence-integrity.md"); }) .WithVerification($"stella doctor --check {CheckId}") .Build(); diff --git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Compliance/Checks/ProvenanceCompletenessCheck.cs b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Compliance/Checks/ProvenanceCompletenessCheck.cs index fe01a3226..17d5f9f7f 100644 --- a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Compliance/Checks/ProvenanceCompletenessCheck.cs +++ b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Compliance/Checks/ProvenanceCompletenessCheck.cs @@ -109,6 +109,7 @@ public sealed class ProvenanceCompletenessCheck : IDoctorCheck rb.AddStep(2, "Generate backfill provenance", "stella provenance backfill --dry-run", CommandType.Manual); + rb.WithRunbookUrl("docs/doctor/articles/compliance/provenance-completeness.md"); }) .WithVerification($"stella doctor --check {CheckId}") .Build(); diff --git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Crypto/Checks/CertChainValidationCheck.cs 
b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Crypto/Checks/CertChainValidationCheck.cs index fd9962ca1..2ee1082f1 100644 --- a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Crypto/Checks/CertChainValidationCheck.cs +++ b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Crypto/Checks/CertChainValidationCheck.cs @@ -79,7 +79,8 @@ public sealed class CertChainValidationCheck : IDoctorCheck CommandType.Shell) .AddStep(2, "Update certificate path", "stella crypto config set --tls-cert ", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/crypto/certchain.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -109,7 +110,8 @@ public sealed class CertChainValidationCheck : IDoctorCheck CommandType.Shell) .AddStep(3, "Update configuration", "stella crypto config set --tls-cert fullchain.pem", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/crypto/certchain.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -133,7 +135,8 @@ public sealed class CertChainValidationCheck : IDoctorCheck CommandType.Shell) .AddStep(2, "Or configure explicit trust anchor", "stella crypto trust-anchors add --type ca --cert root-ca.crt", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/crypto/certchain.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -160,7 +163,8 @@ public sealed class CertChainValidationCheck : IDoctorCheck CommandType.Shell) .AddStep(2, "Deploy renewed certificate", "stella crypto config set --tls-cert ", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/crypto/certchain.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -179,7 +183,8 @@ public sealed class CertChainValidationCheck : IDoctorCheck .WithRemediation(rb => rb .AddStep(1, "Renew certificate urgently", "stella crypto cert renew --cert ", - CommandType.Shell)) + CommandType.Shell) + 
.WithRunbookUrl("docs/doctor/articles/crypto/certchain.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -203,7 +208,8 @@ public sealed class CertChainValidationCheck : IDoctorCheck CommandType.Shell) .AddStep(2, "Set up automated renewal", "stella notify channels add --type email --event cert.expiring --threshold-days 14", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/crypto/certchain.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } diff --git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Crypto/Checks/EidasComplianceCheck.cs b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Crypto/Checks/EidasComplianceCheck.cs index 4f784303d..e9a24cf8e 100644 --- a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Crypto/Checks/EidasComplianceCheck.cs +++ b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Crypto/Checks/EidasComplianceCheck.cs @@ -115,7 +115,8 @@ public sealed class EidasComplianceCheck : IDoctorCheck CommandType.Shell) .AddStep(3, "Configure eIDAS crypto profile", "stella crypto profile set --profile eu", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/crypto/eidas.md")) .WithVerification($"stella doctor --check {CheckId}") .Build()); } @@ -138,7 +139,8 @@ public sealed class EidasComplianceCheck : IDoctorCheck .WithRemediation(rb => rb .AddStep(1, "Update minimum RSA key size", "stella crypto config set --min-rsa-key-size 3072", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/crypto/eidas.md")) .WithVerification($"stella doctor --check {CheckId}") .Build()); } diff --git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Crypto/Checks/FipsComplianceCheck.cs b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Crypto/Checks/FipsComplianceCheck.cs index 3704f180e..f5f751fa3 100644 --- a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Crypto/Checks/FipsComplianceCheck.cs +++ 
b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Crypto/Checks/FipsComplianceCheck.cs @@ -114,6 +114,7 @@ public sealed class FipsComplianceCheck : IDoctorCheck "Consult your OS documentation for FIPS enablement", CommandType.Manual); } + rb.WithRunbookUrl("docs/doctor/articles/crypto/fips.md"); }) .WithVerification($"stella doctor --check {CheckId}") .Build()); @@ -149,7 +150,8 @@ public sealed class FipsComplianceCheck : IDoctorCheck CommandType.Shell) .AddStep(2, "Verify crypto algorithms", "openssl list -digest-algorithms", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/crypto/fips.md")) .WithVerification($"stella doctor --check {CheckId}") .Build()); } diff --git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Crypto/Checks/GostAvailabilityCheck.cs b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Crypto/Checks/GostAvailabilityCheck.cs index b8edf495e..2de03eb46 100644 --- a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Crypto/Checks/GostAvailabilityCheck.cs +++ b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Crypto/Checks/GostAvailabilityCheck.cs @@ -94,7 +94,8 @@ public sealed class GostAvailabilityCheck : IDoctorCheck CommandType.Shell) .AddStep(4, "Configure StellaOps GOST profile", "stella crypto profile set --profile ru", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/crypto/gost.md")) .WithVerification($"stella doctor --check {CheckId}") .Build()); } @@ -135,7 +136,8 @@ public sealed class GostAvailabilityCheck : IDoctorCheck CommandType.Shell) .AddStep(2, "Verify available algorithms", "openssl engine gost -c", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/crypto/gost.md")) .WithVerification($"stella doctor --check {CheckId}") .Build()); } diff --git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Crypto/Checks/HsmPkcs11AvailabilityCheck.cs b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Crypto/Checks/HsmPkcs11AvailabilityCheck.cs index 
2359953f2..2c0d8e5df 100644 --- a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Crypto/Checks/HsmPkcs11AvailabilityCheck.cs +++ b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Crypto/Checks/HsmPkcs11AvailabilityCheck.cs @@ -68,7 +68,8 @@ public sealed class HsmPkcs11AvailabilityCheck : IDoctorCheck CommandType.Shell) .AddStep(2, "Or for Windows", "stella crypto config set --hsm-module C:\\SoftHSM2\\lib\\softhsm2.dll", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/crypto/hsm.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -91,7 +92,8 @@ public sealed class HsmPkcs11AvailabilityCheck : IDoctorCheck CommandType.Shell) .AddStep(2, "Update module path configuration", "stella crypto config set --hsm-module ", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/crypto/hsm.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -118,7 +120,8 @@ public sealed class HsmPkcs11AvailabilityCheck : IDoctorCheck CommandType.Shell) .AddStep(2, "Initialize slot if needed", "softhsm2-util --init-token --slot 0 --label \"stellaops\"", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/crypto/hsm.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -145,7 +148,8 @@ public sealed class HsmPkcs11AvailabilityCheck : IDoctorCheck CommandType.Shell) .AddStep(2, "Configure token PIN", "stella crypto config set --hsm-pin ", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/crypto/hsm.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } diff --git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Crypto/Checks/SmCryptoAvailabilityCheck.cs b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Crypto/Checks/SmCryptoAvailabilityCheck.cs index d753246b0..2f52c1f38 100644 --- a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Crypto/Checks/SmCryptoAvailabilityCheck.cs +++ 
b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Crypto/Checks/SmCryptoAvailabilityCheck.cs @@ -106,7 +106,8 @@ public sealed class SmCryptoAvailabilityCheck : IDoctorCheck CommandType.Shell) .AddStep(3, "Or use StellaOps bundled crypto", "stella crypto config set --provider bundled-sm", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/crypto/sm.md")) .WithVerification($"stella doctor --check {CheckId}") .Build()); } @@ -135,7 +136,8 @@ public sealed class SmCryptoAvailabilityCheck : IDoctorCheck CommandType.Shell) .AddStep(3, "Use external SM provider if needed", "stella crypto config set --sm-provider gmssl", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/crypto/sm.md")) .WithVerification($"stella doctor --check {CheckId}") .Build()); } @@ -159,7 +161,8 @@ public sealed class SmCryptoAvailabilityCheck : IDoctorCheck .WithRemediation(rb => rb .AddStep(1, "Verify SM2 curve", "openssl ecparam -list_curves | grep -i sm2", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/crypto/sm.md")) .WithVerification($"stella doctor --check {CheckId}") .Build()); } diff --git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Environment/Checks/EnvironmentCapacityCheck.cs b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Environment/Checks/EnvironmentCapacityCheck.cs index ed737bef9..914337f00 100644 --- a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Environment/Checks/EnvironmentCapacityCheck.cs +++ b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Environment/Checks/EnvironmentCapacityCheck.cs @@ -146,6 +146,7 @@ public sealed class EnvironmentCapacityCheck : IDoctorCheck rb.AddStep(3, "Or remove unused deployments", $"stella env cleanup {criticalEnvs[0].Name}", CommandType.Manual); + rb.WithRunbookUrl("docs/doctor/articles/environment/capacity.md"); }) .WithVerification($"stella doctor --check {CheckId}") .Build(); @@ -171,6 +172,7 @@ public sealed class EnvironmentCapacityCheck : 
IDoctorCheck rb.AddStep(1, "Monitor capacity trend", "stella env capacity --trend", CommandType.Shell); + rb.WithRunbookUrl("docs/doctor/articles/environment/capacity.md"); }) .WithVerification($"stella doctor --check {CheckId}") .Build(); diff --git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Environment/Checks/EnvironmentConnectivityCheck.cs b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Environment/Checks/EnvironmentConnectivityCheck.cs index 4a77fdb2b..2b160a9ac 100644 --- a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Environment/Checks/EnvironmentConnectivityCheck.cs +++ b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Environment/Checks/EnvironmentConnectivityCheck.cs @@ -164,6 +164,7 @@ public sealed class EnvironmentConnectivityCheck : IDoctorCheck rb.AddStep(3, "Test network connectivity", "# Check firewall rules and network routes to environment agent", CommandType.Manual); + rb.WithRunbookUrl("docs/doctor/articles/environment/connectivity.md"); }) .WithVerification($"stella doctor --check {CheckId}") .Build(); @@ -204,6 +205,7 @@ public sealed class EnvironmentConnectivityCheck : IDoctorCheck $"stella env diagnose {highLatency[0].Name} --network", CommandType.Shell); } + rb.WithRunbookUrl("docs/doctor/articles/environment/connectivity.md"); }) .WithVerification($"stella doctor --check {CheckId}") .Build(); diff --git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Environment/Checks/EnvironmentDeploymentHealthCheck.cs b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Environment/Checks/EnvironmentDeploymentHealthCheck.cs index 1a611e974..d06c7d179 100644 --- a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Environment/Checks/EnvironmentDeploymentHealthCheck.cs +++ b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Environment/Checks/EnvironmentDeploymentHealthCheck.cs @@ -159,6 +159,7 @@ public sealed class EnvironmentDeploymentHealthCheck : IDoctorCheck rb.AddStep(3, "Rollback if needed", $"stella release rollback --env {prodFailures[0].Env}", 
CommandType.Manual); + rb.WithRunbookUrl("docs/doctor/articles/environment/deployments.md"); }) .WithVerification($"stella doctor --check {CheckId}") .Build(); @@ -182,6 +183,7 @@ public sealed class EnvironmentDeploymentHealthCheck : IDoctorCheck rb.AddStep(1, "View service logs", $"stella env logs {failedServices[0].Env} --service {failedServices[0].Service}", CommandType.Shell); + rb.WithRunbookUrl("docs/doctor/articles/environment/deployments.md"); }) .WithVerification($"stella doctor --check {CheckId}") .Build(); @@ -208,6 +210,7 @@ public sealed class EnvironmentDeploymentHealthCheck : IDoctorCheck rb.AddStep(1, "View service health", $"stella env health {degradedServices[0].Env} --service {degradedServices[0].Service}", CommandType.Shell); + rb.WithRunbookUrl("docs/doctor/articles/environment/deployments.md"); }) .WithVerification($"stella doctor --check {CheckId}") .Build(); diff --git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Environment/Checks/EnvironmentDriftCheck.cs b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Environment/Checks/EnvironmentDriftCheck.cs index f9eec36c9..7d24331fe 100644 --- a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Environment/Checks/EnvironmentDriftCheck.cs +++ b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Environment/Checks/EnvironmentDriftCheck.cs @@ -150,6 +150,7 @@ public sealed class EnvironmentDriftCheck : IDoctorCheck rb.AddStep(3, "Or accept drift as intentional", $"stella env drift accept {criticalDrifts[0].ConfigKey}", CommandType.Manual); + rb.WithRunbookUrl("docs/doctor/articles/environment/drift.md"); }) .WithVerification($"stella doctor --check {CheckId}") .Build(); @@ -175,6 +176,7 @@ public sealed class EnvironmentDriftCheck : IDoctorCheck rb.AddStep(1, "Review drift report", "stella env drift show", CommandType.Shell); + rb.WithRunbookUrl("docs/doctor/articles/environment/drift.md"); }) .WithVerification($"stella doctor --check {CheckId}") .Build(); diff --git 
a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Environment/Checks/EnvironmentNetworkPolicyCheck.cs b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Environment/Checks/EnvironmentNetworkPolicyCheck.cs index 91378e344..4079c3573 100644 --- a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Environment/Checks/EnvironmentNetworkPolicyCheck.cs +++ b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Environment/Checks/EnvironmentNetworkPolicyCheck.cs @@ -99,7 +99,8 @@ public sealed class EnvironmentNetworkPolicyCheck : IDoctorCheck .WithRemediation(rb => rb .AddStep(1, "Configure network isolation", "stella env network-policy create --default-deny", - CommandType.Manual)) + CommandType.Manual) + .WithRunbookUrl("docs/doctor/articles/environment/network-policy.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -192,6 +193,7 @@ public sealed class EnvironmentNetworkPolicyCheck : IDoctorCheck rb.AddStep(2, "Fix production isolation", $"stella env network-policy update {criticalViolations[0].Environment} --default-deny --allow-from staging", CommandType.Manual); + rb.WithRunbookUrl("docs/doctor/articles/environment/network-policy.md"); }) .WithVerification($"stella doctor --check {CheckId}") .Build(); @@ -213,7 +215,8 @@ public sealed class EnvironmentNetworkPolicyCheck : IDoctorCheck .WithRemediation(rb => rb .AddStep(1, "Review policy recommendations", "stella env network-policy audit", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/environment/network-policy.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } diff --git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Environment/Checks/EnvironmentSecretHealthCheck.cs b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Environment/Checks/EnvironmentSecretHealthCheck.cs index 4786bedaf..e2fd641be 100644 --- a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Environment/Checks/EnvironmentSecretHealthCheck.cs +++ 
b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Environment/Checks/EnvironmentSecretHealthCheck.cs @@ -164,6 +164,7 @@ public sealed class EnvironmentSecretHealthCheck : IDoctorCheck rb.AddStep(2, "Check secret provider status", "stella secrets provider status", CommandType.Shell); + rb.WithRunbookUrl("docs/doctor/articles/environment/secrets.md"); }) .WithVerification($"stella doctor --check {CheckId}") .Build(); @@ -196,6 +197,7 @@ public sealed class EnvironmentSecretHealthCheck : IDoctorCheck "stella env secrets rotate-scheduled --days 7", CommandType.Manual); } + rb.WithRunbookUrl("docs/doctor/articles/environment/secrets.md"); }) .WithVerification($"stella doctor --check {CheckId}") .Build(); @@ -223,7 +225,8 @@ public sealed class EnvironmentSecretHealthCheck : IDoctorCheck .WithRemediation(rb => rb .AddStep(1, "View secrets status", "stella env secrets list --expiring", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/environment/secrets.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } diff --git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.EvidenceLocker/Checks/AttestationRetrievalCheck.cs b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.EvidenceLocker/Checks/AttestationRetrievalCheck.cs index 977edbc66..48187fd0f 100644 --- a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.EvidenceLocker/Checks/AttestationRetrievalCheck.cs +++ b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.EvidenceLocker/Checks/AttestationRetrievalCheck.cs @@ -104,7 +104,8 @@ public sealed class AttestationRetrievalCheck : IDoctorCheck CommandType.Shell) .AddStep(2, "Verify authentication", "stella evidence auth-test", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/evidence-locker/retrieval.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -127,7 +128,8 @@ public sealed class AttestationRetrievalCheck : IDoctorCheck .WithRemediation(rb => rb .AddStep(1, "Check evidence locker 
metrics", "stella evidence metrics", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/evidence-locker/retrieval.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -163,7 +165,8 @@ public sealed class AttestationRetrievalCheck : IDoctorCheck .WithRemediation(rb => rb .AddStep(1, "Check evidence locker status", "stella evidence status", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/evidence-locker/retrieval.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -183,7 +186,8 @@ public sealed class AttestationRetrievalCheck : IDoctorCheck .WithRemediation(rb => rb .AddStep(1, "Check service connectivity", "stella evidence ping", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/evidence-locker/retrieval.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -219,7 +223,8 @@ public sealed class AttestationRetrievalCheck : IDoctorCheck .WithRemediation(rb => rb .AddStep(1, "Initialize evidence locker", "stella evidence init", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/evidence-locker/retrieval.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -270,7 +275,8 @@ public sealed class AttestationRetrievalCheck : IDoctorCheck .WithRemediation(rb => rb .AddStep(1, "Check file permissions", $"ls -la {attestationDir}", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/evidence-locker/retrieval.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } diff --git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.EvidenceLocker/Checks/EvidenceIndexCheck.cs b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.EvidenceLocker/Checks/EvidenceIndexCheck.cs index f4f798448..b2b741850 100644 --- a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.EvidenceLocker/Checks/EvidenceIndexCheck.cs +++ 
b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.EvidenceLocker/Checks/EvidenceIndexCheck.cs @@ -77,7 +77,8 @@ public sealed class EvidenceIndexCheck : IDoctorCheck .WithRemediation(rb => rb .AddStep(1, "Rebuild evidence index", "stella evidence index rebuild", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/evidence-locker/index.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -155,7 +156,8 @@ public sealed class EvidenceIndexCheck : IDoctorCheck CommandType.Shell) .AddStep(2, "Verify evidence integrity", "stella evidence verify --all", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/evidence-locker/index.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -182,7 +184,8 @@ public sealed class EvidenceIndexCheck : IDoctorCheck .WithRemediation(rb => rb .AddStep(1, "Refresh evidence index", "stella evidence index refresh", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/evidence-locker/index.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -213,7 +216,8 @@ public sealed class EvidenceIndexCheck : IDoctorCheck .WithRemediation(rb => rb .AddStep(1, "Rebuild evidence index", "stella evidence index rebuild", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/evidence-locker/index.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } diff --git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.EvidenceLocker/Checks/MerkleAnchorCheck.cs b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.EvidenceLocker/Checks/MerkleAnchorCheck.cs index 5c32eb4ec..7149dd9ff 100644 --- a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.EvidenceLocker/Checks/MerkleAnchorCheck.cs +++ b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.EvidenceLocker/Checks/MerkleAnchorCheck.cs @@ -84,7 +84,8 @@ public sealed class MerkleAnchorCheck : IDoctorCheck .WithRemediation(rb => rb .AddStep(1, 
"Trigger anchor creation", "stella evidence anchor create", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/evidence-locker/merkle.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -111,7 +112,8 @@ public sealed class MerkleAnchorCheck : IDoctorCheck .WithRemediation(rb => rb .AddStep(1, "Create initial anchor", "stella evidence anchor create", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/evidence-locker/merkle.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -160,7 +162,8 @@ public sealed class MerkleAnchorCheck : IDoctorCheck CommandType.Shell) .AddStep(2, "Investigate specific anchors", $"stella evidence anchor verify {invalidAnchors.First()}", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/evidence-locker/merkle.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -194,7 +197,8 @@ public sealed class MerkleAnchorCheck : IDoctorCheck CommandType.Shell) .AddStep(2, "Create new anchor", "stella evidence anchor create", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/evidence-locker/merkle.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -222,7 +226,8 @@ public sealed class MerkleAnchorCheck : IDoctorCheck .WithRemediation(rb => rb .AddStep(1, "Check evidence locker status", "stella evidence status", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/evidence-locker/merkle.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } diff --git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.EvidenceLocker/Checks/ProvenanceChainCheck.cs b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.EvidenceLocker/Checks/ProvenanceChainCheck.cs index a8e918e58..9e196c012 100644 --- a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.EvidenceLocker/Checks/ProvenanceChainCheck.cs +++ 
b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.EvidenceLocker/Checks/ProvenanceChainCheck.cs @@ -139,7 +139,8 @@ public sealed class ProvenanceChainCheck : IDoctorCheck CommandType.Shell) .AddStep(3, "Review evidence locker integrity", "stella evidence integrity-check", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/evidence-locker/provenance.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -167,7 +168,8 @@ public sealed class ProvenanceChainCheck : IDoctorCheck .WithRemediation(rb => rb .AddStep(1, "Check evidence locker integrity", "stella evidence integrity-check", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/evidence-locker/provenance.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } diff --git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Notify/Checks/EmailConfiguredCheck.cs b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Notify/Checks/EmailConfiguredCheck.cs index 9bdf77f03..49a89a74e 100644 --- a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Notify/Checks/EmailConfiguredCheck.cs +++ b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Notify/Checks/EmailConfiguredCheck.cs @@ -80,7 +80,8 @@ public sealed class EmailConfiguredCheck : IDoctorCheck "export Notify__Channels__Email__SmtpHost=\"smtp.example.com\"\n" + "export Notify__Channels__Email__SmtpPort=\"587\"\n" + "export Notify__Channels__Email__FromAddress=\"noreply@example.com\"", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/notify/email-configured.md")) .WithVerification($"stella doctor --check {CheckId}") .Build()); } @@ -101,7 +102,8 @@ public sealed class EmailConfiguredCheck : IDoctorCheck .WithRemediation(rb => rb .AddStep(1, "Set SMTP port", "# Common SMTP ports:\n# 25 - Standard SMTP (often blocked)\n# 465 - SMTP over SSL\n# 587 - SMTP with STARTTLS (recommended)", - CommandType.Manual)) + CommandType.Manual) + 
.WithRunbookUrl("docs/doctor/articles/notify/email-configured.md")) .WithVerification($"stella doctor --check {CheckId}") .Build()); } @@ -121,7 +123,8 @@ public sealed class EmailConfiguredCheck : IDoctorCheck .WithRemediation(rb => rb .AddStep(1, "Set from address", "# Add Notify:Channels:Email:FromAddress to configuration", - CommandType.FileEdit)) + CommandType.FileEdit) + .WithRunbookUrl("docs/doctor/articles/notify/email-configured.md")) .WithVerification($"stella doctor --check {CheckId}") .Build()); } @@ -142,7 +145,8 @@ public sealed class EmailConfiguredCheck : IDoctorCheck .WithRemediation(rb => rb .AddStep(1, "Enable email notifications", "# Set Notify:Channels:Email:Enabled to true", - CommandType.FileEdit)) + CommandType.FileEdit) + .WithRunbookUrl("docs/doctor/articles/notify/email-configured.md")) .WithVerification($"stella doctor --check {CheckId}") .Build()); } diff --git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Notify/Checks/EmailConnectivityCheck.cs b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Notify/Checks/EmailConnectivityCheck.cs index 5ec5d3957..d1d7e44aa 100644 --- a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Notify/Checks/EmailConnectivityCheck.cs +++ b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Notify/Checks/EmailConnectivityCheck.cs @@ -122,7 +122,8 @@ public sealed class EmailConnectivityCheck : IDoctorCheck CommandType.Shell) .AddStep(2, "Test with telnet", $"telnet {smtpHost} {smtpPort}", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/notify/email-connectivity.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -152,7 +153,8 @@ public sealed class EmailConnectivityCheck : IDoctorCheck CommandType.Shell) .AddStep(3, "Check firewall rules", "# Ensure outbound connections to SMTP ports are allowed", - CommandType.Manual)) + CommandType.Manual) + .WithRunbookUrl("docs/doctor/articles/notify/email-connectivity.md")) .WithVerification($"stella doctor --check {CheckId}") 
.Build(); } @@ -179,7 +181,8 @@ public sealed class EmailConnectivityCheck : IDoctorCheck CommandType.Shell) .AddStep(3, "Verify SMTP host and port settings", "# Common SMTP ports: 25, 465 (SSL), 587 (STARTTLS)", - CommandType.Manual)) + CommandType.Manual) + .WithRunbookUrl("docs/doctor/articles/notify/email-connectivity.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } diff --git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Notify/Checks/NotifyQueueHealthCheck.cs b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Notify/Checks/NotifyQueueHealthCheck.cs index a7e3756a6..6f49379d9 100644 --- a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Notify/Checks/NotifyQueueHealthCheck.cs +++ b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Notify/Checks/NotifyQueueHealthCheck.cs @@ -198,6 +198,7 @@ public sealed class NotifyQueueHealthCheck : IDoctorCheck "# Check Notify:Queue:Transport setting", CommandType.Manual); } + rb.WithRunbookUrl("docs/doctor/articles/notify/queue-health.md"); }) .WithVerification($"stella doctor --check {CheckId}") .Build(); @@ -225,7 +226,8 @@ public sealed class NotifyQueueHealthCheck : IDoctorCheck .WithRemediation(rb => rb .AddStep(1, "Check queue server health", "# Review queue server metrics and logs", - CommandType.Manual)) + CommandType.Manual) + .WithRunbookUrl("docs/doctor/articles/notify/queue-health.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } diff --git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Notify/Checks/SlackConfiguredCheck.cs b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Notify/Checks/SlackConfiguredCheck.cs index 53c283356..4614aa52e 100644 --- a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Notify/Checks/SlackConfiguredCheck.cs +++ b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Notify/Checks/SlackConfiguredCheck.cs @@ -76,7 +76,8 @@ public sealed class SlackConfiguredCheck : IDoctorCheck .AddStep(2, "Or set via environment variable", "export 
Notify__Channels__Slack__WebhookUrl=\"https://hooks.slack.com/services/YOUR/WEBHOOK/URL\"", CommandType.Shell) - .WithSafetyNote("Slack webhook URLs are secrets - store in a secrets manager")) + .WithSafetyNote("Slack webhook URLs are secrets - store in a secrets manager") + .WithRunbookUrl("docs/doctor/articles/notify/slack-configured.md")) .WithVerification($"stella doctor --check {CheckId}") .Build()); } @@ -94,7 +95,8 @@ public sealed class SlackConfiguredCheck : IDoctorCheck .WithRemediation(rb => rb .AddStep(1, "Enable Slack notifications", "# Set Notify:Channels:Slack:Enabled to true in configuration", - CommandType.FileEdit)) + CommandType.FileEdit) + .WithRunbookUrl("docs/doctor/articles/notify/slack-configured.md")) .WithVerification($"stella doctor --check {CheckId}") .Build()); } diff --git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Notify/Checks/SlackConnectivityCheck.cs b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Notify/Checks/SlackConnectivityCheck.cs index 3a267fa32..853a21539 100644 --- a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Notify/Checks/SlackConnectivityCheck.cs +++ b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Notify/Checks/SlackConnectivityCheck.cs @@ -99,7 +99,8 @@ public sealed class SlackConnectivityCheck : IDoctorCheck CommandType.Shell) .AddStep(3, "Regenerate webhook if needed", "# Create a new webhook URL in Slack and update configuration", - CommandType.Manual)) + CommandType.Manual) + .WithRunbookUrl("docs/doctor/articles/notify/slack-connectivity.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -124,7 +125,8 @@ public sealed class SlackConnectivityCheck : IDoctorCheck CommandType.Manual) .AddStep(3, "Verify proxy settings if applicable", "echo $HTTP_PROXY $HTTPS_PROXY", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/notify/slack-connectivity.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -146,7 +148,8 @@ public sealed class 
SlackConnectivityCheck : IDoctorCheck CommandType.Shell) .AddStep(2, "Test HTTPS connectivity", "curl -v https://hooks.slack.com/", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/notify/slack-connectivity.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } diff --git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Notify/Checks/TeamsConfiguredCheck.cs b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Notify/Checks/TeamsConfiguredCheck.cs index 3822f1661..be8fddc2c 100644 --- a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Notify/Checks/TeamsConfiguredCheck.cs +++ b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Notify/Checks/TeamsConfiguredCheck.cs @@ -73,7 +73,8 @@ public sealed class TeamsConfiguredCheck : IDoctorCheck .AddStep(3, "Or set via environment variable", "export Notify__Channels__Teams__WebhookUrl=\"https://YOUR_WEBHOOK_URL\"", CommandType.Shell) - .WithSafetyNote("Teams webhook URLs are secrets - store securely")) + .WithSafetyNote("Teams webhook URLs are secrets - store securely") + .WithRunbookUrl("docs/doctor/articles/notify/teams-configured.md")) .WithVerification($"stella doctor --check {CheckId}") .Build()); } @@ -93,7 +94,8 @@ public sealed class TeamsConfiguredCheck : IDoctorCheck .WithRemediation(rb => rb .AddStep(1, "Verify webhook URL", "# Teams webhook URLs typically look like:\n# https://YOUR_TENANT.webhook.office.com/webhookb2/...", - CommandType.Manual)) + CommandType.Manual) + .WithRunbookUrl("docs/doctor/articles/notify/teams-configured.md")) .WithVerification($"stella doctor --check {CheckId}") .Build()); } @@ -110,7 +112,8 @@ public sealed class TeamsConfiguredCheck : IDoctorCheck .WithRemediation(rb => rb .AddStep(1, "Enable Teams notifications", "# Set Notify:Channels:Teams:Enabled to true in configuration", - CommandType.FileEdit)) + CommandType.FileEdit) + .WithRunbookUrl("docs/doctor/articles/notify/teams-configured.md")) .WithVerification($"stella doctor --check {CheckId}") 
.Build()); } diff --git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Notify/Checks/TeamsConnectivityCheck.cs b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Notify/Checks/TeamsConnectivityCheck.cs index bad89c026..636ec8927 100644 --- a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Notify/Checks/TeamsConnectivityCheck.cs +++ b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Notify/Checks/TeamsConnectivityCheck.cs @@ -115,7 +115,8 @@ public sealed class TeamsConnectivityCheck : IDoctorCheck CommandType.Shell) .AddStep(3, "Recreate webhook if needed", "# Delete and recreate the Incoming Webhook connector in Teams", - CommandType.Manual)) + CommandType.Manual) + .WithRunbookUrl("docs/doctor/articles/notify/teams-connectivity.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -140,7 +141,8 @@ public sealed class TeamsConnectivityCheck : IDoctorCheck CommandType.Manual) .AddStep(3, "Verify proxy settings if applicable", "echo $HTTP_PROXY $HTTPS_PROXY", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/notify/teams-connectivity.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -162,7 +164,8 @@ public sealed class TeamsConnectivityCheck : IDoctorCheck CommandType.Shell) .AddStep(2, "Test HTTPS connectivity", "curl -v https://webhook.office.com/", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/notify/teams-connectivity.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } diff --git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Notify/Checks/WebhookConfiguredCheck.cs b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Notify/Checks/WebhookConfiguredCheck.cs index 2b5774cac..00df7886c 100644 --- a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Notify/Checks/WebhookConfiguredCheck.cs +++ b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Notify/Checks/WebhookConfiguredCheck.cs @@ -72,7 +72,8 @@ public sealed class WebhookConfiguredCheck : IDoctorCheck 
CommandType.FileEdit) .AddStep(2, "Or set via environment variable", "export Notify__Channels__Webhook__Url=\"https://your-endpoint/webhook\"", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/notify/webhook-configured.md")) .WithVerification($"stella doctor --check {CheckId}") .Build()); } @@ -92,7 +93,8 @@ public sealed class WebhookConfiguredCheck : IDoctorCheck .WithRemediation(rb => rb .AddStep(1, "Fix URL format", "# Ensure URL starts with http:// or https:// and is properly encoded", - CommandType.Manual)) + CommandType.Manual) + .WithRunbookUrl("docs/doctor/articles/notify/webhook-configured.md")) .WithVerification($"stella doctor --check {CheckId}") .Build()); } @@ -111,7 +113,8 @@ public sealed class WebhookConfiguredCheck : IDoctorCheck .WithRemediation(rb => rb .AddStep(1, "Enable webhook notifications", "# Set Notify:Channels:Webhook:Enabled to true in configuration", - CommandType.FileEdit)) + CommandType.FileEdit) + .WithRunbookUrl("docs/doctor/articles/notify/webhook-configured.md")) .WithVerification($"stella doctor --check {CheckId}") .Build()); } diff --git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Notify/Checks/WebhookConnectivityCheck.cs b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Notify/Checks/WebhookConnectivityCheck.cs index fd1a3ec8e..ce83cd044 100644 --- a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Notify/Checks/WebhookConnectivityCheck.cs +++ b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Notify/Checks/WebhookConnectivityCheck.cs @@ -112,7 +112,8 @@ public sealed class WebhookConnectivityCheck : IDoctorCheck CommandType.Manual) .AddStep(3, "Check endpoint logs", "# Review logs on the webhook endpoint server", - CommandType.Manual)) + CommandType.Manual) + .WithRunbookUrl("docs/doctor/articles/notify/webhook-connectivity.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -137,7 +138,8 @@ public sealed class WebhookConnectivityCheck : IDoctorCheck CommandType.Shell) 
.AddStep(3, "Test port connectivity", $"nc -zv {new Uri(url).Host} {(new Uri(url).Port > 0 ? new Uri(url).Port : (new Uri(url).Scheme == "https" ? 443 : 80))}", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/notify/webhook-connectivity.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -159,7 +161,8 @@ public sealed class WebhookConnectivityCheck : IDoctorCheck CommandType.Shell) .AddStep(2, "Test connectivity", $"curl -v {DoctorPluginContext.Redact(url)}", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/notify/webhook-connectivity.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } diff --git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Observability/Checks/LogDirectoryCheck.cs b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Observability/Checks/LogDirectoryCheck.cs index 9427ed78a..794fae393 100644 --- a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Observability/Checks/LogDirectoryCheck.cs +++ b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Observability/Checks/LogDirectoryCheck.cs @@ -63,7 +63,8 @@ public sealed class LogDirectoryCheck : IDoctorCheck RuntimeInformation.IsOSPlatform(OSPlatform.Windows) ? $"icacls \"{logPath}\" /grant Users:F" : $"sudo chown -R stellaops:stellaops {logPath} && sudo chmod 755 {logPath}", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/logs/directory-writable.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -100,7 +101,8 @@ public sealed class LogDirectoryCheck : IDoctorCheck RuntimeInformation.IsOSPlatform(OSPlatform.Windows) ? 
$"icacls \"{logPath}\" /grant Users:F" : $"sudo chown -R stellaops:stellaops {logPath} && sudo chmod 755 {logPath}", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/logs/directory-writable.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } diff --git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Observability/Checks/LogRotationCheck.cs b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Observability/Checks/LogRotationCheck.cs index 6a5eee535..80fce8aab 100644 --- a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Observability/Checks/LogRotationCheck.cs +++ b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Observability/Checks/LogRotationCheck.cs @@ -85,7 +85,8 @@ public sealed class LogRotationCheck : IDoctorCheck CommandType.Shell) .AddStep(2, "Adjust rotation threshold", "Edit Logging:RollingPolicy in configuration", - CommandType.FileEdit)) + CommandType.FileEdit) + .WithRunbookUrl("docs/doctor/articles/logs/rotation-configured.md")) .Build()); } @@ -121,7 +122,8 @@ public sealed class LogRotationCheck : IDoctorCheck RuntimeInformation.IsOSPlatform(OSPlatform.Windows) ? 
"Use Windows Event Log or configure log cleanup task" : "sudo cp /usr/share/stellaops/logrotate.conf /etc/logrotate.d/stellaops", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/logs/rotation-configured.md")) .WithVerification($"stella doctor --check {CheckId}") .Build()); } diff --git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Observability/Checks/OtlpEndpointCheck.cs b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Observability/Checks/OtlpEndpointCheck.cs index f48d123b6..8fa298b94 100644 --- a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Observability/Checks/OtlpEndpointCheck.cs +++ b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Observability/Checks/OtlpEndpointCheck.cs @@ -82,7 +82,8 @@ public sealed class OtlpEndpointCheck : IDoctorCheck CommandType.Shell) .AddStep(3, "Verify configuration", "cat /etc/stellaops/telemetry.yaml | grep otlp", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/telemetry/otlp-endpoint.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -103,7 +104,8 @@ public sealed class OtlpEndpointCheck : IDoctorCheck CommandType.Shell) .AddStep(2, "Check network connectivity", $"nc -zv {new Uri(endpoint).Host} {new Uri(endpoint).Port}", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/telemetry/otlp-endpoint.md")) .Build(); } catch (HttpRequestException ex) diff --git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Observability/Checks/PrometheusScrapeCheck.cs b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Observability/Checks/PrometheusScrapeCheck.cs index 3613bf5b3..16cc8d45c 100644 --- a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Observability/Checks/PrometheusScrapeCheck.cs +++ b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Observability/Checks/PrometheusScrapeCheck.cs @@ -87,7 +87,8 @@ public sealed class PrometheusScrapeCheck : IDoctorCheck CommandType.FileEdit) .AddStep(2, "Verify metrics configuration", "stella 
config get Metrics", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/metrics/prometheus-scrape.md")) .WithVerification($"curl -s {metricsUrl} | head -5") .Build(); } @@ -108,7 +109,8 @@ public sealed class PrometheusScrapeCheck : IDoctorCheck CommandType.Shell) .AddStep(2, "Check port binding", $"netstat -an | grep {metricsPort}", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/metrics/prometheus-scrape.md")) .Build(); } catch (HttpRequestException ex) diff --git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Operations/Checks/DeadLetterQueueCheck.cs b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Operations/Checks/DeadLetterQueueCheck.cs index 81b88a962..2b88423f4 100644 --- a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Operations/Checks/DeadLetterQueueCheck.cs +++ b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Operations/Checks/DeadLetterQueueCheck.cs @@ -75,7 +75,8 @@ public sealed class DeadLetterQueueCheck : IDoctorCheck CommandType.Shell) .AddStep(3, "Investigate common failures", "stella orchestrator deadletter analyze", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/operations/dead-letter.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -99,7 +100,8 @@ public sealed class DeadLetterQueueCheck : IDoctorCheck CommandType.Shell) .AddStep(2, "Retry failed jobs", "stella orchestrator deadletter retry --all", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/operations/dead-letter.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } diff --git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Operations/Checks/JobQueueHealthCheck.cs b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Operations/Checks/JobQueueHealthCheck.cs index 0fede27e9..4d8fea34f 100644 --- a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Operations/Checks/JobQueueHealthCheck.cs +++ 
b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Operations/Checks/JobQueueHealthCheck.cs @@ -78,7 +78,8 @@ public sealed class JobQueueHealthCheck : IDoctorCheck CommandType.Shell) .AddStep(3, "Check orchestrator logs", "stella orchestrator logs --tail 100", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/operations/job-queue.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -110,7 +111,8 @@ public sealed class JobQueueHealthCheck : IDoctorCheck CommandType.Shell) .AddStep(3, "Review job processing metrics", "stella orchestrator metrics --period 1h", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/operations/job-queue.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -141,7 +143,8 @@ public sealed class JobQueueHealthCheck : IDoctorCheck CommandType.Shell) .AddStep(2, "Consider scaling workers", "stella orchestrator workers scale --count 6", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/operations/job-queue.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } diff --git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Operations/Checks/SchedulerHealthCheck.cs b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Operations/Checks/SchedulerHealthCheck.cs index 15669ae60..0e9414a31 100644 --- a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Operations/Checks/SchedulerHealthCheck.cs +++ b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Operations/Checks/SchedulerHealthCheck.cs @@ -67,7 +67,8 @@ public sealed class SchedulerHealthCheck : IDoctorCheck CommandType.Shell) .AddStep(2, "Start scheduler", "stella scheduler start", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/operations/scheduler.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -93,7 +94,8 @@ public sealed class SchedulerHealthCheck : IDoctorCheck CommandType.Shell) .AddStep(2, "Trigger 
catch-up", "stella scheduler catchup --dry-run", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/operations/scheduler.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } diff --git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Policy/Checks/PolicyEngineHealthCheck.cs b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Policy/Checks/PolicyEngineHealthCheck.cs index a29ba9da1..ba6ba5b4b 100644 --- a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Policy/Checks/PolicyEngineHealthCheck.cs +++ b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Policy/Checks/PolicyEngineHealthCheck.cs @@ -124,7 +124,8 @@ public sealed class PolicyEngineHealthCheck : IDoctorCheck CommandType.Shell) .AddStep(3, "Recompile policies", "stella policy compile --all", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/policy/engine.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -158,7 +159,8 @@ public sealed class PolicyEngineHealthCheck : IDoctorCheck CommandType.Shell) .AddStep(2, "Check for complex policies", "stella policy list --complexity high", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/policy/engine.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -201,7 +203,8 @@ public sealed class PolicyEngineHealthCheck : IDoctorCheck CommandType.Shell) .AddStep(2, "Verify network connectivity", $"curl -s {policyEngineUrl}/health", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/policy/engine.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -225,7 +228,8 @@ public sealed class PolicyEngineHealthCheck : IDoctorCheck CommandType.Shell) .AddStep(2, "Restart policy engine if needed", "stella policy restart", - CommandType.Manual)) + CommandType.Manual) + .WithRunbookUrl("docs/doctor/articles/policy/engine.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } 
diff --git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Postgres/Checks/PostgresConnectionPoolCheck.cs b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Postgres/Checks/PostgresConnectionPoolCheck.cs index 4b41fad2e..50bae5488 100644 --- a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Postgres/Checks/PostgresConnectionPoolCheck.cs +++ b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Postgres/Checks/PostgresConnectionPoolCheck.cs @@ -106,7 +106,8 @@ public sealed class PostgresConnectionPoolCheck : IDoctorCheck CommandType.Shell) .AddStep(4, "Terminate idle connections if necessary", "stella db pool reset --idle-only", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/postgres/pool.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -134,7 +135,8 @@ public sealed class PostgresConnectionPoolCheck : IDoctorCheck CommandType.Shell) .AddStep(2, "Review active queries", "stella db queries --active", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/postgres/pool.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -162,7 +164,8 @@ public sealed class PostgresConnectionPoolCheck : IDoctorCheck CommandType.Shell) .AddStep(2, "Consider increasing pool size", "stella db config set --max-pool-size 150", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/postgres/pool.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -195,7 +198,8 @@ public sealed class PostgresConnectionPoolCheck : IDoctorCheck .WithRemediation(rb => rb .AddStep(1, "Check database connectivity", "stella doctor --check check.postgres.connectivity", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/postgres/pool.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } diff --git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Postgres/Checks/PostgresConnectivityCheck.cs 
b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Postgres/Checks/PostgresConnectivityCheck.cs index d154b6ecd..9656885c0 100644 --- a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Postgres/Checks/PostgresConnectivityCheck.cs +++ b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Postgres/Checks/PostgresConnectivityCheck.cs @@ -117,7 +117,8 @@ public sealed class PostgresConnectivityCheck : IDoctorCheck CommandType.Shell) .AddStep(3, "Check network connectivity", "stella db ping --trace", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/postgres/connectivity.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -142,7 +143,8 @@ public sealed class PostgresConnectivityCheck : IDoctorCheck .WithRemediation(rb => rb .AddStep(1, "Monitor database performance", "stella db status --watch", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/postgres/connectivity.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -187,7 +189,8 @@ public sealed class PostgresConnectivityCheck : IDoctorCheck CommandType.Shell) .AddStep(3, "Verify firewall rules", "stella db connectivity-test", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/postgres/connectivity.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -215,7 +218,8 @@ public sealed class PostgresConnectivityCheck : IDoctorCheck CommandType.Shell) .AddStep(3, "Check credentials", "stella db verify-credentials", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/postgres/connectivity.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } diff --git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Postgres/Checks/PostgresMigrationStatusCheck.cs b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Postgres/Checks/PostgresMigrationStatusCheck.cs index 08971c0c6..f71fed6fa 100644 --- 
a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Postgres/Checks/PostgresMigrationStatusCheck.cs +++ b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Postgres/Checks/PostgresMigrationStatusCheck.cs @@ -78,7 +78,8 @@ public sealed class PostgresMigrationStatusCheck : IDoctorCheck .WithRemediation(rb => rb .AddStep(1, "Initialize database with migrations", "stella db migrate --init", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/postgres/migrations.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -118,7 +119,8 @@ public sealed class PostgresMigrationStatusCheck : IDoctorCheck CommandType.Shell) .AddStep(3, "Verify migration status", "stella db migrations status", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/postgres/migrations.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -150,7 +152,8 @@ public sealed class PostgresMigrationStatusCheck : IDoctorCheck .WithRemediation(rb => rb .AddStep(1, "Check database connectivity", "stella doctor --check check.postgres.connectivity", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/postgres/migrations.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } diff --git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Release/Checks/ActiveReleaseHealthCheck.cs b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Release/Checks/ActiveReleaseHealthCheck.cs index 78e63290d..9a268224b 100644 --- a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Release/Checks/ActiveReleaseHealthCheck.cs +++ b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Release/Checks/ActiveReleaseHealthCheck.cs @@ -98,7 +98,8 @@ public sealed class ActiveReleaseHealthCheck : IDoctorCheck CommandType.Shell) .AddStep(2, "Check service status", "stella release status", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/release/active.md")) .WithVerification($"stella doctor 
--check {CheckId}") .Build(); } @@ -189,6 +190,7 @@ public sealed class ActiveReleaseHealthCheck : IDoctorCheck "stella release approvals list", CommandType.Shell); } + rb.WithRunbookUrl("docs/doctor/articles/release/active.md"); }) .WithVerification($"stella doctor --check {CheckId}") .Build(); @@ -225,6 +227,7 @@ public sealed class ActiveReleaseHealthCheck : IDoctorCheck "stella release approvals list", CommandType.Shell); } + rb.WithRunbookUrl("docs/doctor/articles/release/active.md"); }) .WithVerification($"stella doctor --check {CheckId}") .Build(); @@ -265,7 +268,8 @@ public sealed class ActiveReleaseHealthCheck : IDoctorCheck .WithRemediation(rb => rb .AddStep(1, "Check Release Orchestrator health", $"curl -s {orchestratorUrl}/health", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/release/active.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } diff --git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Release/Checks/EnvironmentReadinessCheck.cs b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Release/Checks/EnvironmentReadinessCheck.cs index 0047badd6..dd0826dd3 100644 --- a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Release/Checks/EnvironmentReadinessCheck.cs +++ b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Release/Checks/EnvironmentReadinessCheck.cs @@ -83,7 +83,8 @@ public sealed class EnvironmentReadinessCheck : IDoctorCheck .WithRemediation(rb => rb .AddStep(1, "Check Release Orchestrator health", $"curl -s {orchestratorUrl}/health", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/release/environment-readiness.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -174,6 +175,7 @@ public sealed class EnvironmentReadinessCheck : IDoctorCheck $"stella env health {unhealthy[0].Name}", CommandType.Shell); } + rb.WithRunbookUrl("docs/doctor/articles/release/environment-readiness.md"); }) .WithVerification($"stella doctor --check {CheckId}") 
.Build(); @@ -212,6 +214,7 @@ public sealed class EnvironmentReadinessCheck : IDoctorCheck $"stella env health {unhealthy[0].Name}", CommandType.Shell); } + rb.WithRunbookUrl("docs/doctor/articles/release/environment-readiness.md"); }) .WithVerification($"stella doctor --check {CheckId}") .Build(); @@ -233,7 +236,8 @@ public sealed class EnvironmentReadinessCheck : IDoctorCheck .WithRemediation(rb => rb .AddStep(1, "Trigger health check refresh", "stella env health --refresh-all", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/release/environment-readiness.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } diff --git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Release/Checks/PromotionGateHealthCheck.cs b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Release/Checks/PromotionGateHealthCheck.cs index 7f625e42e..217da8443 100644 --- a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Release/Checks/PromotionGateHealthCheck.cs +++ b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Release/Checks/PromotionGateHealthCheck.cs @@ -83,7 +83,8 @@ public sealed class PromotionGateHealthCheck : IDoctorCheck .WithRemediation(rb => rb .AddStep(1, "Check Release Orchestrator health", $"curl -s {orchestratorUrl}/health", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/release/promotion-gates.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -212,6 +213,7 @@ public sealed class PromotionGateHealthCheck : IDoctorCheck "stella release gates configure --approvers ", CommandType.Manual); } + rb.WithRunbookUrl("docs/doctor/articles/release/promotion-gates.md"); }) .WithVerification($"stella doctor --check {CheckId}") .Build(); diff --git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Release/Checks/ReleaseConfigurationCheck.cs b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Release/Checks/ReleaseConfigurationCheck.cs index 58275e5c3..325d4b156 100644 --- 
a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Release/Checks/ReleaseConfigurationCheck.cs +++ b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Release/Checks/ReleaseConfigurationCheck.cs @@ -98,7 +98,8 @@ public sealed class ReleaseConfigurationCheck : IDoctorCheck .WithRemediation(rb => rb .AddStep(1, "Create a release workflow", "stella release workflow create --name --stages dev,staging,prod", - CommandType.Manual)) + CommandType.Manual) + .WithRunbookUrl("docs/doctor/articles/release/configuration.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -219,6 +220,7 @@ public sealed class ReleaseConfigurationCheck : IDoctorCheck rb.AddStep(2, "Fix workflow configuration", $"stella release workflow edit {validationErrors[0].WorkflowId}", CommandType.Manual); + rb.WithRunbookUrl("docs/doctor/articles/release/configuration.md"); }) .WithVerification($"stella doctor --check {CheckId}") .Build(); diff --git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Release/Checks/ReleaseScheduleHealthCheck.cs b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Release/Checks/ReleaseScheduleHealthCheck.cs index 2a9838b13..1c42850d6 100644 --- a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Release/Checks/ReleaseScheduleHealthCheck.cs +++ b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Release/Checks/ReleaseScheduleHealthCheck.cs @@ -147,6 +147,7 @@ public sealed class ReleaseScheduleHealthCheck : IDoctorCheck rb.AddStep(2, "Reschedule or run immediately", $"stella release schedule run {missedSchedules[0].Id}", CommandType.Manual); + rb.WithRunbookUrl("docs/doctor/articles/release/schedule.md"); }) .WithVerification($"stella doctor --check {CheckId}") .Build(); @@ -173,7 +174,8 @@ public sealed class ReleaseScheduleHealthCheck : IDoctorCheck CommandType.Shell) .AddStep(2, "Reschedule one of the conflicting releases", "stella release schedule update --time ", - CommandType.Manual)) + CommandType.Manual) + .WithRunbookUrl("docs/doctor/articles/release/schedule.md")) 
.WithVerification($"stella doctor --check {CheckId}") .Build(); } diff --git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Release/Checks/RollbackReadinessCheck.cs b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Release/Checks/RollbackReadinessCheck.cs index a49262448..a409a6110 100644 --- a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Release/Checks/RollbackReadinessCheck.cs +++ b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Release/Checks/RollbackReadinessCheck.cs @@ -169,6 +169,7 @@ public sealed class RollbackReadinessCheck : IDoctorCheck rb.AddStep(3, "Configure artifact retention", "stella config set Release:ArtifactRetention:Count 5", CommandType.Manual); + rb.WithRunbookUrl("docs/doctor/articles/release/rollback-readiness.md"); }) .WithVerification($"stella doctor --check {CheckId}") .Build(); @@ -193,7 +194,8 @@ public sealed class RollbackReadinessCheck : IDoctorCheck .WithRemediation(rb => rb .AddStep(1, "This is expected for new environments", "# After the next successful deployment, rollback will be available", - CommandType.Manual)) + CommandType.Manual) + .WithRunbookUrl("docs/doctor/articles/release/rollback-readiness.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -220,6 +222,7 @@ public sealed class RollbackReadinessCheck : IDoctorCheck rb.AddStep(2, "Enable auto-rollback", $"stella env configure {noHealthProbe[0].Name} --auto-rollback-on-failure", CommandType.Manual); + rb.WithRunbookUrl("docs/doctor/articles/release/rollback-readiness.md"); }) .WithVerification($"stella doctor --check {CheckId}") .Build(); diff --git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Scanner/Checks/ReachabilityComputationHealthCheck.cs b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Scanner/Checks/ReachabilityComputationHealthCheck.cs index f264dcc32..7cb136010 100644 --- a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Scanner/Checks/ReachabilityComputationHealthCheck.cs +++ 
b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Scanner/Checks/ReachabilityComputationHealthCheck.cs @@ -119,6 +119,7 @@ public sealed class ReachabilityComputationHealthCheck : IDoctorCheck rb.AddStep(2, "Retry failed computations", "stella scanner reachability retry --failed", CommandType.Manual); + rb.WithRunbookUrl("docs/doctor/articles/scanner/reachability.md"); }) .WithVerification($"stella doctor --check {CheckId}") .Build(); @@ -149,6 +150,7 @@ public sealed class ReachabilityComputationHealthCheck : IDoctorCheck rb.AddStep(2, "Scale workers", "stella scanner workers scale --replicas 4", CommandType.Manual); + rb.WithRunbookUrl("docs/doctor/articles/scanner/reachability.md"); }) .WithVerification($"stella doctor --check {CheckId}") .Build(); diff --git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Scanner/Checks/SbomGenerationHealthCheck.cs b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Scanner/Checks/SbomGenerationHealthCheck.cs index 746a58679..8ffdf89b4 100644 --- a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Scanner/Checks/SbomGenerationHealthCheck.cs +++ b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Scanner/Checks/SbomGenerationHealthCheck.cs @@ -119,6 +119,7 @@ public sealed class SbomGenerationHealthCheck : IDoctorCheck rb.AddStep(2, "Retry failed SBOMs", "stella scanner sbom retry --failed", CommandType.Manual); + rb.WithRunbookUrl("docs/doctor/articles/scanner/sbom.md"); }) .WithVerification($"stella doctor --check {CheckId}") .Build(); diff --git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Scanner/Checks/ScannerQueueHealthCheck.cs b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Scanner/Checks/ScannerQueueHealthCheck.cs index d63a27c7f..bbf8de73b 100644 --- a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Scanner/Checks/ScannerQueueHealthCheck.cs +++ b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Scanner/Checks/ScannerQueueHealthCheck.cs @@ -123,6 +123,7 @@ public sealed class ScannerQueueHealthCheck : IDoctorCheck rb.AddStep(3, "Check worker 
status", "stella scanner workers status", CommandType.Shell); + rb.WithRunbookUrl("docs/doctor/articles/scanner/queue.md"); }) .WithVerification($"stella doctor --check {CheckId}") .Build(); @@ -145,7 +146,8 @@ public sealed class ScannerQueueHealthCheck : IDoctorCheck }) .WithCauses("High volume", "Workers overwhelmed", "High error rate") .WithRemediation(rb => rb - .AddStep(1, "Scale workers", "stella scanner workers scale --replicas 4", CommandType.Manual)) + .AddStep(1, "Scale workers", "stella scanner workers scale --replicas 4", CommandType.Manual) + .WithRunbookUrl("docs/doctor/articles/scanner/queue.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } diff --git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Scanner/Checks/ScannerResourceUtilizationCheck.cs b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Scanner/Checks/ScannerResourceUtilizationCheck.cs index 75798a539..9fbb7288e 100644 --- a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Scanner/Checks/ScannerResourceUtilizationCheck.cs +++ b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Scanner/Checks/ScannerResourceUtilizationCheck.cs @@ -120,6 +120,7 @@ public sealed class ScannerResourceUtilizationCheck : IDoctorCheck rb.AddStep(2, "Reduce concurrent jobs", "stella scanner config set MaxConcurrentJobs 2", CommandType.Manual); + rb.WithRunbookUrl("docs/doctor/articles/scanner/resources.md"); }) .WithVerification($"stella doctor --check {CheckId}") .Build(); @@ -139,7 +140,8 @@ public sealed class ScannerResourceUtilizationCheck : IDoctorCheck }) .WithCauses("High demand", "Consider scaling") .WithRemediation(rb => rb - .AddStep(1, "Scale workers", "stella scanner workers scale --replicas 4", CommandType.Manual)) + .AddStep(1, "Scale workers", "stella scanner workers scale --replicas 4", CommandType.Manual) + .WithRunbookUrl("docs/doctor/articles/scanner/resources.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } diff --git 
a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Scanner/Checks/SliceCacheHealthCheck.cs b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Scanner/Checks/SliceCacheHealthCheck.cs index 4423cf95b..fc4e34fe3 100644 --- a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Scanner/Checks/SliceCacheHealthCheck.cs +++ b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Scanner/Checks/SliceCacheHealthCheck.cs @@ -115,6 +115,7 @@ public sealed class SliceCacheHealthCheck : IDoctorCheck rb.AddStep(2, "Increase cache size", "# Update Scanner:Cache:MaxSizeBytes in configuration", CommandType.Manual); + rb.WithRunbookUrl("docs/doctor/articles/scanner/slice-cache.md"); }) .WithVerification($"stella doctor --check {CheckId}") .Build(); @@ -149,6 +150,7 @@ public sealed class SliceCacheHealthCheck : IDoctorCheck rb.AddStep(2, "Warm cache", "stella scanner cache warm", CommandType.Manual); + rb.WithRunbookUrl("docs/doctor/articles/scanner/slice-cache.md"); }) .WithVerification($"stella doctor --check {CheckId}") .Build(); diff --git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Scanner/Checks/VulnerabilityScanHealthCheck.cs b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Scanner/Checks/VulnerabilityScanHealthCheck.cs index ebf0c5b48..3293fd2c6 100644 --- a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Scanner/Checks/VulnerabilityScanHealthCheck.cs +++ b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Scanner/Checks/VulnerabilityScanHealthCheck.cs @@ -111,6 +111,7 @@ public sealed class VulnerabilityScanHealthCheck : IDoctorCheck rb.AddStep(2, "Check sync status", "stella scanner db status", CommandType.Shell); + rb.WithRunbookUrl("docs/doctor/articles/scanner/vuln.md"); }) .WithVerification($"stella doctor --check {CheckId}") .Build(); @@ -128,7 +129,8 @@ public sealed class VulnerabilityScanHealthCheck : IDoctorCheck }) .WithCauses("Scheduled sync delayed") .WithRemediation(rb => rb - .AddStep(1, "Check sync schedule", "stella scanner db schedule", CommandType.Shell)) + .AddStep(1, "Check sync 
schedule", "stella scanner db schedule", CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/scanner/vuln.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } diff --git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Scanner/Checks/WitnessGraphHealthCheck.cs b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Scanner/Checks/WitnessGraphHealthCheck.cs index 9476f8ece..cec33eb9e 100644 --- a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Scanner/Checks/WitnessGraphHealthCheck.cs +++ b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Scanner/Checks/WitnessGraphHealthCheck.cs @@ -117,6 +117,7 @@ public sealed class WitnessGraphHealthCheck : IDoctorCheck rb.AddStep(2, "Rebuild failed graphs", "stella scanner witness rebuild --failed", CommandType.Manual); + rb.WithRunbookUrl("docs/doctor/articles/scanner/witness-graph.md"); }) .WithVerification($"stella doctor --check {CheckId}") .Build(); diff --git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Storage/Checks/BackupDirectoryCheck.cs b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Storage/Checks/BackupDirectoryCheck.cs index aa5268c30..118281409 100644 --- a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Storage/Checks/BackupDirectoryCheck.cs +++ b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Storage/Checks/BackupDirectoryCheck.cs @@ -81,7 +81,8 @@ public sealed class BackupDirectoryCheck : IDoctorCheck CommandType.Shell) .AddStep(2, "Verify backup configuration", "stella backup config show", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/storage/backup.md")) .WithVerification($"stella doctor --check {CheckId}") .Build()); } @@ -114,7 +115,8 @@ public sealed class BackupDirectoryCheck : IDoctorCheck CommandType.Shell) .AddStep(2, "Check disk space", "stella doctor --check check.storage.diskspace", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/storage/backup.md")) .WithVerification($"stella doctor --check {CheckId}") .Build()); } @@ 
-146,7 +148,8 @@ public sealed class BackupDirectoryCheck : IDoctorCheck CommandType.Shell) .AddStep(2, "Verify backup schedule", "stella backup schedule show", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/storage/backup.md")) .WithVerification($"stella doctor --check {CheckId}") .Build()); } @@ -178,7 +181,8 @@ public sealed class BackupDirectoryCheck : IDoctorCheck CommandType.Shell) .AddStep(3, "Check backup logs", "stella backup logs --tail 50", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/storage/backup.md")) .WithVerification($"stella doctor --check {CheckId}") .Build()); } diff --git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Storage/Checks/DiskSpaceCheck.cs b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Storage/Checks/DiskSpaceCheck.cs index 5ee6b568f..f2fb536c2 100644 --- a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Storage/Checks/DiskSpaceCheck.cs +++ b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Storage/Checks/DiskSpaceCheck.cs @@ -127,6 +127,7 @@ public sealed class DiskSpaceCheck : IDoctorCheck "docker system df", CommandType.Shell); } + rb.WithRunbookUrl("docs/doctor/articles/storage/diskspace.md"); }) .WithVerification($"stella doctor --check {CheckId}") .Build()); @@ -155,7 +156,8 @@ public sealed class DiskSpaceCheck : IDoctorCheck CommandType.Shell) .AddStep(2, "Schedule cleanup if needed", "stella storage cleanup --dry-run", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/storage/diskspace.md")) .WithVerification($"stella doctor --check {CheckId}") .Build()); } diff --git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Storage/Checks/EvidenceLockerWriteCheck.cs b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Storage/Checks/EvidenceLockerWriteCheck.cs index 13a6a8a76..cd3ea7da2 100644 --- a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Storage/Checks/EvidenceLockerWriteCheck.cs +++ 
b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Storage/Checks/EvidenceLockerWriteCheck.cs @@ -89,7 +89,8 @@ public sealed class EvidenceLockerWriteCheck : IDoctorCheck CommandType.Shell) .AddStep(2, "Set permissions", $"chmod 750 {lockerPath}", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/storage/evidencelocker.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -141,7 +142,8 @@ public sealed class EvidenceLockerWriteCheck : IDoctorCheck .WithRemediation(rb => rb .AddStep(1, "Check filesystem integrity", "stella storage verify --path evidence-locker", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/storage/evidencelocker.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -165,7 +167,8 @@ public sealed class EvidenceLockerWriteCheck : IDoctorCheck .WithRemediation(rb => rb .AddStep(1, "Check storage I/O metrics", "stella storage iostat", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/storage/evidencelocker.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -202,7 +205,8 @@ public sealed class EvidenceLockerWriteCheck : IDoctorCheck CommandType.Shell) .AddStep(2, "Fix permissions", $"chown -R stellaops:stellaops {lockerPath}", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/storage/evidencelocker.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -226,7 +230,8 @@ public sealed class EvidenceLockerWriteCheck : IDoctorCheck CommandType.Shell) .AddStep(2, "Check filesystem mount", $"mount | grep {Path.GetPathRoot(lockerPath)}", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/storage/evidencelocker.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } diff --git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Vex/Checks/VexDocumentValidationCheck.cs 
b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Vex/Checks/VexDocumentValidationCheck.cs index 77076d92d..19eb8b85f 100644 --- a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Vex/Checks/VexDocumentValidationCheck.cs +++ b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Vex/Checks/VexDocumentValidationCheck.cs @@ -86,7 +86,8 @@ public sealed class VexDocumentValidationCheck : IDoctorCheck CommandType.Shell) .AddStep(3, "Check issuer key availability", "stella issuer keys list", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/vex/validation.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -116,7 +117,8 @@ public sealed class VexDocumentValidationCheck : IDoctorCheck CommandType.Shell) .AddStep(2, "Review validation warnings", "stella vex list --status warning", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/vex/validation.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } diff --git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Vex/Checks/VexIssuerTrustCheck.cs b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Vex/Checks/VexIssuerTrustCheck.cs index 94c45f50b..a4e272d1b 100644 --- a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Vex/Checks/VexIssuerTrustCheck.cs +++ b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Vex/Checks/VexIssuerTrustCheck.cs @@ -67,7 +67,8 @@ public sealed class VexIssuerTrustCheck : IDoctorCheck CommandType.Shell) .AddStep(2, "Import trust anchors", "stella trust-anchors import --defaults", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/vex/issuer-trust.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -91,7 +92,8 @@ public sealed class VexIssuerTrustCheck : IDoctorCheck CommandType.Shell) .AddStep(2, "Trust a known issuer", "stella issuer trust --url https://example.com/.well-known/vex-issuer", - CommandType.Shell)) + CommandType.Shell) + 
.WithRunbookUrl("docs/doctor/articles/vex/issuer-trust.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } diff --git a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Vex/Checks/VexSchemaComplianceCheck.cs b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Vex/Checks/VexSchemaComplianceCheck.cs index 5761f102c..40c35b114 100644 --- a/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Vex/Checks/VexSchemaComplianceCheck.cs +++ b/src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Vex/Checks/VexSchemaComplianceCheck.cs @@ -65,7 +65,8 @@ public sealed class VexSchemaComplianceCheck : IDoctorCheck .WithRemediation(rb => rb .AddStep(1, "Update VEX schemas", "stella vex schemas update", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/vex/schema.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.AI/Checks/ClaudeProviderCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.AI/Checks/ClaudeProviderCheck.cs index 0812c6777..86db00637 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.AI/Checks/ClaudeProviderCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.AI/Checks/ClaudeProviderCheck.cs @@ -155,7 +155,8 @@ public sealed class ClaudeProviderCheck : IDoctorCheck .WithCauses(issues.ToArray()) .WithRemediation(r => r .AddManualStep(1, "Verify API key", "Check ANTHROPIC_API_KEY is valid") - .AddManualStep(2, "Check quotas", "Verify API usage limits on console.anthropic.com")) + .AddManualStep(2, "Check quotas", "Verify API usage limits on console.anthropic.com") + .WithRunbookUrl("")) .WithVerification("stella doctor --check check.ai.provider.claude") .Build(); } @@ -171,7 +172,8 @@ public sealed class ClaudeProviderCheck : IDoctorCheck .WithCauses("Network connectivity issue or invalid endpoint") .WithRemediation(r => r .AddManualStep(1, "Check network", "Verify network connectivity to api.anthropic.com") - .AddManualStep(2, "Check proxy", "Ensure proxy 
settings are configured if required")) + .AddManualStep(2, "Check proxy", "Ensure proxy settings are configured if required") + .WithRunbookUrl("")) .WithVerification("stella doctor --check check.ai.provider.claude") .Build(); } diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.AI/Checks/GeminiProviderCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.AI/Checks/GeminiProviderCheck.cs index 05beb2618..765020151 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.AI/Checks/GeminiProviderCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.AI/Checks/GeminiProviderCheck.cs @@ -143,7 +143,8 @@ public sealed class GeminiProviderCheck : IDoctorCheck .WithRemediation(r => r .AddManualStep(1, "Verify API key", "Check GEMINI_API_KEY or GOOGLE_API_KEY is valid") .AddManualStep(2, "Enable API", "Ensure Generative Language API is enabled in Google Cloud Console") - .AddManualStep(3, "Check quotas", "Verify API usage limits in Google Cloud Console")) + .AddManualStep(3, "Check quotas", "Verify API usage limits in Google Cloud Console") + .WithRunbookUrl("")) .WithVerification("stella doctor --check check.ai.provider.gemini") .Build(); } @@ -159,7 +160,8 @@ public sealed class GeminiProviderCheck : IDoctorCheck .WithCauses("Network connectivity issue or invalid endpoint") .WithRemediation(r => r .AddManualStep(1, "Check network", "Verify network connectivity to generativelanguage.googleapis.com") - .AddManualStep(2, "Check proxy", "Ensure proxy settings are configured if required")) + .AddManualStep(2, "Check proxy", "Ensure proxy settings are configured if required") + .WithRunbookUrl("")) .WithVerification("stella doctor --check check.ai.provider.gemini") .Build(); } diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.AI/Checks/LlmProviderConfigurationCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.AI/Checks/LlmProviderConfigurationCheck.cs index 15bbd837b..6206c08de 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.AI/Checks/LlmProviderConfigurationCheck.cs 
+++ b/src/__Libraries/StellaOps.Doctor.Plugins.AI/Checks/LlmProviderConfigurationCheck.cs @@ -143,7 +143,8 @@ public sealed class LlmProviderConfigurationCheck : IDoctorCheck .WithCauses(issues.ToArray()) .WithRemediation(r => r .AddManualStep(1, "Set API key", "Configure API key for the default provider") - .AddManualStep(2, "Verify provider", "Ensure default provider matches a configured one")) + .AddManualStep(2, "Verify provider", "Ensure default provider matches a configured one") + .WithRunbookUrl("")) .WithVerification("stella doctor --check check.ai.llm.config") .Build()); } diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.AI/Checks/LocalInferenceCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.AI/Checks/LocalInferenceCheck.cs index 4428ed244..473b1e568 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.AI/Checks/LocalInferenceCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.AI/Checks/LocalInferenceCheck.cs @@ -142,7 +142,8 @@ public sealed class LocalInferenceCheck : IDoctorCheck .WithCauses(issues.ToArray()) .WithRemediation(r => r .AddManualStep(1, "Load model", "Ensure a model is loaded in the server") - .AddManualStep(2, "Check model path", "Verify the model file exists at configured path")) + .AddManualStep(2, "Check model path", "Verify the model file exists at configured path") + .WithRunbookUrl("")) .WithVerification("stella doctor --check check.ai.provider.local") .Build(); } diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.AI/Checks/OllamaProviderCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.AI/Checks/OllamaProviderCheck.cs index 58ec11f70..1f99b5623 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.AI/Checks/OllamaProviderCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.AI/Checks/OllamaProviderCheck.cs @@ -84,7 +84,8 @@ public sealed class OllamaProviderCheck : IDoctorCheck .WithCauses("Ollama server is not running or endpoint is incorrect") .WithRemediation(r => r .AddManualStep(1, "Start Ollama", "Run: 
ollama serve") - .AddManualStep(2, "Check endpoint", $"Verify Ollama is running at {endpoint}")) + .AddManualStep(2, "Check endpoint", $"Verify Ollama is running at {endpoint}") + .WithRunbookUrl("")) .WithVerification("stella doctor --check check.ai.provider.ollama") .Build(); } @@ -160,7 +161,8 @@ public sealed class OllamaProviderCheck : IDoctorCheck .WithCauses(issues.ToArray()) .WithRemediation(r => r .AddManualStep(1, "Pull model", $"Run: ollama pull {model}") - .AddManualStep(2, "List models", "Run: ollama list")) + .AddManualStep(2, "List models", "Run: ollama list") + .WithRunbookUrl("")) .WithVerification("stella doctor --check check.ai.provider.ollama") .Build(); } diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.AI/Checks/OpenAiProviderCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.AI/Checks/OpenAiProviderCheck.cs index 0fd0a3f65..bcef6143e 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.AI/Checks/OpenAiProviderCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.AI/Checks/OpenAiProviderCheck.cs @@ -139,7 +139,8 @@ public sealed class OpenAiProviderCheck : IDoctorCheck .WithCauses(issues.ToArray()) .WithRemediation(r => r .AddManualStep(1, "Verify API key", "Check OPENAI_API_KEY is valid") - .AddManualStep(2, "Check quotas", "Verify API usage limits on platform.openai.com")) + .AddManualStep(2, "Check quotas", "Verify API usage limits on platform.openai.com") + .WithRunbookUrl("")) .WithVerification("stella doctor --check check.ai.provider.openai") .Build(); } @@ -155,7 +156,8 @@ public sealed class OpenAiProviderCheck : IDoctorCheck .WithCauses("Network connectivity issue or invalid endpoint") .WithRemediation(r => r .AddManualStep(1, "Check network", "Verify network connectivity to api.openai.com") - .AddManualStep(2, "Check proxy", "Ensure proxy settings are configured if required")) + .AddManualStep(2, "Check proxy", "Ensure proxy settings are configured if required") + .WithRunbookUrl("")) .WithVerification("stella doctor --check 
check.ai.provider.openai") .Build(); } diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.Attestation/Checks/ClockSkewCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.Attestation/Checks/ClockSkewCheck.cs index 9c73d3499..c9fbe7cae 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.Attestation/Checks/ClockSkewCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.Attestation/Checks/ClockSkewCheck.cs @@ -96,7 +96,8 @@ public sealed class ClockSkewCheck : AttestationCheckBase .Add("Note", "Clock skew verification skipped - no network reference available")) .WithRemediation(r => r .AddShellStep(1, "Check system time", GetTimeCheckCommand()) - .AddManualStep(2, "Configure NTP", "Ensure NTP is configured for time synchronization")) + .AddManualStep(2, "Configure NTP", "Ensure NTP is configured for time synchronization") + .WithRunbookUrl("docs/doctor/articles/attestor/attestation-clock-skew.md")) .Build(); } @@ -122,7 +123,8 @@ public sealed class ClockSkewCheck : AttestationCheckBase .WithRemediation(r => r .AddShellStep(1, "Check current time", GetTimeCheckCommand()) .AddShellStep(2, "Force NTP sync", GetNtpSyncCommand()) - .AddManualStep(3, "Configure NTP", "Ensure NTP is properly configured and the NTP service is running")) + .AddManualStep(3, "Configure NTP", "Ensure NTP is properly configured and the NTP service is running") + .WithRunbookUrl("docs/doctor/articles/attestor/attestation-clock-skew.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -143,7 +145,8 @@ public sealed class ClockSkewCheck : AttestationCheckBase "Infrequent NTP sync interval") .WithRemediation(r => r .AddShellStep(1, "Check NTP status", GetNtpStatusCommand()) - .AddShellStep(2, "Force NTP sync", GetNtpSyncCommand())) + .AddShellStep(2, "Force NTP sync", GetNtpSyncCommand()) + .WithRunbookUrl("docs/doctor/articles/attestor/attestation-clock-skew.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } diff --git 
a/src/__Libraries/StellaOps.Doctor.Plugins.Attestation/Checks/CosignKeyMaterialCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.Attestation/Checks/CosignKeyMaterialCheck.cs index 3c454430a..f92e96fe8 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.Attestation/Checks/CosignKeyMaterialCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.Attestation/Checks/CosignKeyMaterialCheck.cs @@ -93,7 +93,8 @@ public sealed class CosignKeyMaterialCheck : AttestationCheckBase .Add("Note", "Enable Sigstore to use attestation signing")) .WithRemediation(r => r .AddManualStep(1, "Enable Sigstore", "Set Sigstore:Enabled to true in configuration") - .AddManualStep(2, "Configure signing mode", "Set either Sigstore:KeyPath, Sigstore:Keyless:Enabled, or Sigstore:KMS:KeyRef")) + .AddManualStep(2, "Configure signing mode", "Set either Sigstore:KeyPath, Sigstore:Keyless:Enabled, or Sigstore:KMS:KeyRef") + .WithRunbookUrl("docs/doctor/articles/attestor/attestation-cosign-keymaterial.md")) .Build()); } @@ -112,7 +113,8 @@ public sealed class CosignKeyMaterialCheck : AttestationCheckBase .AddShellStep(1, "Generate a signing key pair", "cosign generate-key-pair") .AddManualStep(2, "Configure key path", "Set Sigstore:KeyPath to the path of the private key") .AddManualStep(3, "Or enable keyless", "Set Sigstore:Keyless:Enabled to true for OIDC-based signing") - .AddManualStep(4, "Or use KMS", "Set Sigstore:KMS:KeyRef to your KMS key reference")) + .AddManualStep(4, "Or use KMS", "Set Sigstore:KMS:KeyRef to your KMS key reference") + .WithRunbookUrl("docs/doctor/articles/attestor/attestation-cosign-keymaterial.md")) .WithVerification($"stella doctor --check check.attestation.cosign.keymaterial") .Build()); } @@ -135,7 +137,8 @@ public sealed class CosignKeyMaterialCheck : AttestationCheckBase .WithRemediation(r => r .AddShellStep(1, "Verify file exists", $"ls -la {keyPath}") .AddShellStep(2, "Generate new key pair if needed", "cosign generate-key-pair") - .AddManualStep(3, "Update 
configuration", "Ensure Sigstore:KeyPath points to the correct file")) + .AddManualStep(3, "Update configuration", "Ensure Sigstore:KeyPath points to the correct file") + .WithRunbookUrl("docs/doctor/articles/attestor/attestation-cosign-keymaterial.md")) .WithVerification($"stella doctor --check check.attestation.cosign.keymaterial") .Build()); } @@ -172,7 +175,8 @@ public sealed class CosignKeyMaterialCheck : AttestationCheckBase .WithCauses("File permissions prevent reading the key file") .WithRemediation(r => r .AddShellStep(1, "Check file permissions", $"ls -la {keyPath}") - .AddShellStep(2, "Fix permissions if needed", $"chmod 600 {keyPath}")) + .AddShellStep(2, "Fix permissions if needed", $"chmod 600 {keyPath}") + .WithRunbookUrl("docs/doctor/articles/attestor/attestation-cosign-keymaterial.md")) .WithVerification($"stella doctor --check check.attestation.cosign.keymaterial") .Build()); } @@ -213,7 +217,8 @@ public sealed class CosignKeyMaterialCheck : AttestationCheckBase "Fulcio URL is incorrect") .WithRemediation(r => r .AddShellStep(1, "Test Fulcio endpoint", $"curl -I {fulcioApiUrl}") - .AddManualStep(2, "Check service status", "Visit https://status.sigstore.dev")) + .AddManualStep(2, "Check service status", "Visit https://status.sigstore.dev") + .WithRunbookUrl("docs/doctor/articles/attestor/attestation-cosign-keymaterial.md")) .WithVerification($"stella doctor --check check.attestation.cosign.keymaterial") .Build(); } @@ -242,7 +247,8 @@ public sealed class CosignKeyMaterialCheck : AttestationCheckBase "Firewall blocking HTTPS traffic") .WithRemediation(r => r .AddShellStep(1, "Test connectivity", $"curl -I {fulcioUrl}") - .AddManualStep(2, "Check network configuration", "Ensure HTTPS traffic to Fulcio is allowed")) + .AddManualStep(2, "Check network configuration", "Ensure HTTPS traffic to Fulcio is allowed") + .WithRunbookUrl("docs/doctor/articles/attestor/attestation-cosign-keymaterial.md")) .WithVerification($"stella doctor --check 
check.attestation.cosign.keymaterial") .Build(); } diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.Attestation/Checks/OfflineBundleCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.Attestation/Checks/OfflineBundleCheck.cs index 85da885cb..36d1c6cca 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.Attestation/Checks/OfflineBundleCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.Attestation/Checks/OfflineBundleCheck.cs @@ -69,7 +69,8 @@ public sealed class OfflineBundleCheck : AttestationCheckBase .WithRemediation(r => r .AddShellStep(1, "Export bundle from online system", "stella attestation bundle export --output /path/to/bundle.json") .AddManualStep(2, "Configure bundle path", "Set Doctor:Plugins:Attestation:OfflineBundlePath to the bundle location") - .AddManualStep(3, "Transfer bundle", "Copy the bundle to the target system")) + .AddManualStep(3, "Transfer bundle", "Copy the bundle to the target system") + .WithRunbookUrl("docs/doctor/articles/attestor/attestation-offline-bundle.md")) .WithVerification($"stella doctor --check {CheckId}") .Build()); } @@ -89,7 +90,8 @@ public sealed class OfflineBundleCheck : AttestationCheckBase .WithRemediation(r => r .AddShellStep(1, "Check file existence", $"ls -la {options.OfflineBundlePath}") .AddShellStep(2, "Export new bundle", "stella attestation bundle export --output " + options.OfflineBundlePath) - .AddManualStep(3, "Verify path", "Ensure the configured path is correct")) + .AddManualStep(3, "Verify path", "Ensure the configured path is correct") + .WithRunbookUrl("docs/doctor/articles/attestor/attestation-offline-bundle.md")) .WithVerification($"stella doctor --check {CheckId}") .Build()); } @@ -134,7 +136,8 @@ public sealed class OfflineBundleCheck : AttestationCheckBase .Add("ParseError", parseError)) .WithRemediation(r => r .AddShellStep(1, "Validate bundle", "stella attestation bundle validate " + options.OfflineBundlePath) - .AddShellStep(2, "Export fresh bundle", "stella attestation bundle 
export --output " + options.OfflineBundlePath)) + .AddShellStep(2, "Export fresh bundle", "stella attestation bundle export --output " + options.OfflineBundlePath) + .WithRunbookUrl("docs/doctor/articles/attestor/attestation-offline-bundle.md")) .WithVerification($"stella doctor --check {CheckId}") .Build()); } @@ -167,7 +170,8 @@ public sealed class OfflineBundleCheck : AttestationCheckBase .WithRemediation(r => r .AddShellStep(1, "Export fresh bundle from online system", "stella attestation bundle export --output /path/to/new-bundle.json") .AddManualStep(2, "Transfer to air-gap environment", "Copy the new bundle to the target system") - .AddManualStep(3, "Update bundle path if needed", "Point configuration to the new bundle file")) + .AddManualStep(3, "Update bundle path if needed", "Point configuration to the new bundle file") + .WithRunbookUrl("docs/doctor/articles/attestor/attestation-offline-bundle.md")) .WithVerification($"stella doctor --check {CheckId}") .Build()); } @@ -191,7 +195,8 @@ public sealed class OfflineBundleCheck : AttestationCheckBase }) .WithRemediation(r => r .AddShellStep(1, "Export fresh bundle", "stella attestation bundle export --output /path/to/new-bundle.json") - .AddManualStep(2, "Schedule regular updates", "Consider automating bundle refresh")) + .AddManualStep(2, "Schedule regular updates", "Consider automating bundle refresh") + .WithRunbookUrl("docs/doctor/articles/attestor/attestation-offline-bundle.md")) .WithVerification($"stella doctor --check {CheckId}") .Build()); } diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.Attestation/Checks/RekorConnectivityCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.Attestation/Checks/RekorConnectivityCheck.cs index b4cf07bef..dbd84eecc 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.Attestation/Checks/RekorConnectivityCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.Attestation/Checks/RekorConnectivityCheck.cs @@ -60,7 +60,8 @@ public sealed class RekorConnectivityCheck : 
AttestationCheckBase .Add("ConfigKey", "Doctor:Plugins:Attestation:RekorUrl or Sigstore:RekorUrl")) .WithRemediation(r => r .AddManualStep(1, "Configure Rekor URL", "Set the Rekor URL in configuration: STELLA_REKOR_URL=https://rekor.sigstore.dev") - .AddManualStep(2, "Or use offline mode", "Set Doctor:Plugins:Attestation:Mode to 'offline' and configure OfflineBundlePath")) + .AddManualStep(2, "Or use offline mode", "Set Doctor:Plugins:Attestation:Mode to 'offline' and configure OfflineBundlePath") + .WithRunbookUrl("docs/doctor/articles/attestor/attestation-rekor-connectivity.md")) .Build(); } @@ -86,7 +87,8 @@ public sealed class RekorConnectivityCheck : AttestationCheckBase .WithRemediation(r => r .AddShellStep(1, "Test endpoint manually", $"curl -I {logInfoUrl}") .AddManualStep(2, "Verify Rekor URL", "Ensure the URL is correct (default: https://rekor.sigstore.dev)") - .AddManualStep(3, "Check service status", "Visit https://status.sigstore.dev for public Rekor status")) + .AddManualStep(3, "Check service status", "Visit https://status.sigstore.dev for public Rekor status") + .WithRunbookUrl("docs/doctor/articles/attestor/attestation-rekor-connectivity.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.Authority/Checks/AuthorityPluginConfigurationCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.Authority/Checks/AuthorityPluginConfigurationCheck.cs index 041873ce5..7f1d47923 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.Authority/Checks/AuthorityPluginConfigurationCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.Authority/Checks/AuthorityPluginConfigurationCheck.cs @@ -122,7 +122,8 @@ public sealed class AuthorityPluginConfigurationCheck : IDoctorCheck CommandType.FileEdit) .AddStep(3, "Run setup wizard to configure", "stella setup --step authority", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/auth/authority-plugin-configured.md")) 
.WithVerification($"stella doctor --check {CheckId}") .Build()); } @@ -139,7 +140,8 @@ public sealed class AuthorityPluginConfigurationCheck : IDoctorCheck .WithCauses(issues.ToArray()) .WithRemediation(r => r .AddManualStep(1, "Review configuration", "Check Authority:Plugins section for missing values") - .AddStep(2, "Run setup wizard", "stella setup --step authority", CommandType.Shell)) + .AddStep(2, "Run setup wizard", "stella setup --step authority", CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/auth/authority-plugin-configured.md")) .WithVerification($"stella doctor --check {CheckId}") .Build()); } diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.Authority/Checks/AuthorityPluginConnectivityCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.Authority/Checks/AuthorityPluginConnectivityCheck.cs index 0d4ce3d85..ac31c8cbe 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.Authority/Checks/AuthorityPluginConnectivityCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.Authority/Checks/AuthorityPluginConnectivityCheck.cs @@ -98,6 +98,7 @@ public sealed class AuthorityPluginConnectivityCheck : IDoctorCheck r.AddManualStep(3, "Check LDAP server", "Verify LDAP server is accessible from this network"); r.AddManualStep(4, "Verify LDAP credentials", "Check Authority:Plugins:Ldap:BindDn and BindPassword"); } + r.WithRunbookUrl("docs/doctor/articles/auth/authority-plugin-connectivity.md"); }) .WithVerification($"stella doctor --check {CheckId}") .Build(); diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.Authority/Checks/BootstrapUserExistsCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.Authority/Checks/BootstrapUserExistsCheck.cs index 8c88a376e..a29939138 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.Authority/Checks/BootstrapUserExistsCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.Authority/Checks/BootstrapUserExistsCheck.cs @@ -81,7 +81,8 @@ public sealed class BootstrapUserExistsCheck : IDoctorCheck 
CommandType.FileEdit) .AddStep(2, "Or run setup wizard to create user", "stella setup --step users", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/auth/authority-bootstrap-exists.md")) .WithVerification($"stella doctor --check {CheckId}") .Build()); } @@ -123,7 +124,8 @@ public sealed class BootstrapUserExistsCheck : IDoctorCheck .WithCauses(issues.ToArray()) .WithRemediation(r => r .AddManualStep(1, "Complete configuration", "Set missing bootstrap user fields") - .AddStep(2, "Run setup wizard", "stella setup --step users", CommandType.Shell)) + .AddStep(2, "Run setup wizard", "stella setup --step users", CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/auth/authority-bootstrap-exists.md")) .WithVerification($"stella doctor --check {CheckId}") .Build()); } diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.Authority/Checks/SuperUserExistsCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.Authority/Checks/SuperUserExistsCheck.cs index d4a789e23..580ecf00e 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.Authority/Checks/SuperUserExistsCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.Authority/Checks/SuperUserExistsCheck.cs @@ -106,7 +106,8 @@ public sealed class SuperUserExistsCheck : IDoctorCheck "\"Authority\": {\n" + " \"Bootstrap\": { \"Enabled\": true }\n" + "}", - CommandType.FileEdit)) + CommandType.FileEdit) + .WithRunbookUrl("docs/doctor/articles/auth/users-superuser-exists.md")) .WithVerification($"stella doctor --check {CheckId}") .Build()); } diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.Authority/Checks/UserPasswordPolicyCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.Authority/Checks/UserPasswordPolicyCheck.cs index 816332892..0e31e6766 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.Authority/Checks/UserPasswordPolicyCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.Authority/Checks/UserPasswordPolicyCheck.cs @@ -115,7 +115,8 @@ public sealed class 
UserPasswordPolicyCheck : IDoctorCheck " \"RequireSpecialCharacter\": true\n" + " }\n" + "}", - CommandType.FileEdit)) + CommandType.FileEdit) + .WithRunbookUrl("docs/doctor/articles/auth/users-password-policy.md")) .WithVerification($"stella doctor --check {CheckId}") .Build()); } @@ -134,7 +135,8 @@ public sealed class UserPasswordPolicyCheck : IDoctorCheck .WithCauses(recommendations.ToArray()) .WithRemediation(r => r .AddManualStep(1, "Review recommendations", "Consider strengthening password policy") - .AddStep(2, "Run setup wizard", "stella setup --step authority", CommandType.Shell)) + .AddStep(2, "Run setup wizard", "stella setup --step authority", CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/auth/users-password-policy.md")) .WithVerification($"stella doctor --check {CheckId}") .Build()); } diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.Core/Checks/AuthenticationConfigCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.Core/Checks/AuthenticationConfigCheck.cs index 0fb230d59..49939410f 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.Core/Checks/AuthenticationConfigCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.Core/Checks/AuthenticationConfigCheck.cs @@ -125,7 +125,8 @@ public sealed class AuthenticationConfigCheck : IDoctorCheck .AddManualStep(1, "Review authentication settings", "Check appsettings.json Authentication section for proper configuration") .AddManualStep(2, "Use strong secrets", - "Ensure JWT secrets are at least 32 characters and not default values")) + "Ensure JWT secrets are at least 32 characters and not default values") + .WithRunbookUrl("docs/doctor/articles/core/core-auth-config.md")) .WithVerification("stella doctor --check check.core.auth.config") .Build()); } diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.Core/Checks/ConfigurationLoadedCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.Core/Checks/ConfigurationLoadedCheck.cs index 98fa0fc74..7d422c6ac 100644 --- 
a/src/__Libraries/StellaOps.Doctor.Plugins.Core/Checks/ConfigurationLoadedCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.Core/Checks/ConfigurationLoadedCheck.cs @@ -56,7 +56,8 @@ public sealed class ConfigurationLoadedCheck : IDoctorCheck "Environment variables not set") .WithRemediation(r => r .AddManualStep(1, "Check for configuration files", "Verify appsettings.json or environment-specific config files exist") - .AddShellStep(2, "List environment variables", "printenv | grep -i stella")) + .AddShellStep(2, "List environment variables", "printenv | grep -i stella") + .WithRunbookUrl("docs/doctor/articles/core/core-config-loaded.md")) .WithVerification("stella doctor --check check.core.config.loaded") .Build()); } diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.Core/Checks/CryptoProvidersCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.Core/Checks/CryptoProvidersCheck.cs index 6bc36e4a4..b92c71b9e 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.Core/Checks/CryptoProvidersCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.Core/Checks/CryptoProvidersCheck.cs @@ -91,7 +91,8 @@ public sealed class CryptoProvidersCheck : IDoctorCheck .AddManualStep(1, "Verify OS crypto support", "Ensure operating system has required cryptographic providers installed") .AddManualStep(2, "Check FIPS compliance requirements", - "If FIPS mode is enabled, ensure only FIPS-compliant algorithms are used")) + "If FIPS mode is enabled, ensure only FIPS-compliant algorithms are used") + .WithRunbookUrl("docs/doctor/articles/core/core-crypto-available.md")) .WithVerification("stella doctor --check check.core.crypto.available") .Build()); } diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.Core/Checks/DependencyServicesCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.Core/Checks/DependencyServicesCheck.cs index fda7c7649..54838934a 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.Core/Checks/DependencyServicesCheck.cs +++ 
b/src/__Libraries/StellaOps.Doctor.Plugins.Core/Checks/DependencyServicesCheck.cs @@ -84,7 +84,8 @@ public sealed class DependencyServicesCheck : IDoctorCheck "Incorrect service registration order") .WithRemediation(r => r .AddManualStep(1, "Register missing services", - $"Add registration for: {string.Join(", ", missing)} in Program.cs or Startup.cs")) + $"Add registration for: {string.Join(", ", missing)} in Program.cs or Startup.cs") + .WithRunbookUrl("docs/doctor/articles/core/core-services-dependencies.md")) .WithVerification("stella doctor --check check.core.services.dependencies") .Build()); } diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.Core/Checks/DiskSpaceCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.Core/Checks/DiskSpaceCheck.cs index 05f97aa4c..90fd42f47 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.Core/Checks/DiskSpaceCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.Core/Checks/DiskSpaceCheck.cs @@ -75,7 +75,8 @@ public sealed class DiskSpaceCheck : IDoctorCheck .WithRemediation(r => r .AddShellStep(1, "Check large files", "du -sh /* | sort -hr | head -20") .AddShellStep(2, "Clean temp files", "rm -rf /tmp/* 2>/dev/null") - .AddShellStep(3, "Rotate logs", "logrotate -f /etc/logrotate.conf")) + .AddShellStep(3, "Rotate logs", "logrotate -f /etc/logrotate.conf") + .WithRunbookUrl("docs/doctor/articles/core/core-env-diskspace.md")) .WithVerification("stella doctor --check check.core.env.diskspace") .Build()); } @@ -91,7 +92,8 @@ public sealed class DiskSpaceCheck : IDoctorCheck .Add("UsedPercent", $"{usedPercent:F1}%")) .WithCauses("Disk usage approaching capacity") .WithRemediation(r => r - .AddManualStep(1, "Review disk usage", "Consider archiving or deleting old data")) + .AddManualStep(1, "Review disk usage", "Consider archiving or deleting old data") + .WithRunbookUrl("docs/doctor/articles/core/core-env-diskspace.md")) .Build()); } diff --git 
a/src/__Libraries/StellaOps.Doctor.Plugins.Core/Checks/EnvironmentVariablesCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.Core/Checks/EnvironmentVariablesCheck.cs index 4239adfff..fea18dab7 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.Core/Checks/EnvironmentVariablesCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.Core/Checks/EnvironmentVariablesCheck.cs @@ -1,4 +1,3 @@ - using StellaOps.Doctor.Models; using StellaOps.Doctor.Plugins; using StellaOps.Doctor.Plugins.Builders; @@ -8,6 +7,9 @@ namespace StellaOps.Doctor.Plugins.Core.Checks; /// /// Verifies that expected environment variables are set. +/// In Docker compose and Kubernetes deployments, ASPNETCORE_ENVIRONMENT may +/// not be explicitly set (defaults to Production), but other STELLAOPS_* +/// variables confirm the environment is intentionally configured. /// public sealed class EnvironmentVariablesCheck : IDoctorCheck { @@ -50,39 +52,55 @@ public sealed class EnvironmentVariablesCheck : IDoctorCheck { var value = Environment.GetEnvironmentVariable(varName); if (string.IsNullOrEmpty(value)) - { missing.Add(varName); - } else - { found[varName] = value; - } } - // Count total stella-related env vars + // Count total stella-related env vars (broader than just the two recommended) var stellaVars = Environment.GetEnvironmentVariables() .Keys.Cast() .Where(k => k.StartsWith("STELLA", StringComparison.OrdinalIgnoreCase) || k.StartsWith("ASPNETCORE", StringComparison.OrdinalIgnoreCase) || - k.StartsWith("DOTNET", StringComparison.OrdinalIgnoreCase)) + k.StartsWith("DOTNET", StringComparison.OrdinalIgnoreCase) || + k.StartsWith("CONNECTIONSTRINGS", StringComparison.OrdinalIgnoreCase)) .ToList(); - if (missing.Count > 0 && missing.Count == RecommendedVariables.Length) + // If neither recommended var is set but other platform vars exist, + // the environment IS configured (e.g. Docker compose with ASPNETCORE_URLS, + // STELLAOPS_* vars, ConnectionStrings__*, etc.). Report as pass with note. 
+ if (missing.Count == RecommendedVariables.Length) { + if (stellaVars.Count > 0) + { + // Platform is configured via other env vars — pass with advisory + return Task.FromResult(result + .Pass($"Environment configured ({stellaVars.Count} platform variables detected, using default environment: {context.EnvironmentName})") + .WithEvidence("Environment status", e => + { + e.Add("CurrentEnvironment", context.EnvironmentName); + e.Add("TotalPlatformVars", stellaVars.Count.ToString(CultureInfo.InvariantCulture)); + e.Add("Note", "ASPNETCORE_ENVIRONMENT not explicitly set — using default. Set it to suppress this note."); + }) + .Build()); + } + + // No platform vars at all — genuine warning return Task.FromResult(result .Warn("No environment configuration variables detected") .WithEvidence("Environment status", e => { e.Add("MissingRecommended", string.Join(", ", missing)); - e.Add("TotalStellaVars", stellaVars.Count.ToString(CultureInfo.InvariantCulture)); + e.Add("TotalPlatformVars", "0"); e.Add("CurrentEnvironment", context.EnvironmentName); }) .WithCauses( - "Environment variables not set for deployment", - "Using default environment (Production)") + "No StellaOps, ASP.NET, or .NET environment variables found", + "The service may not be running in a configured deployment") .WithRemediation(r => r .AddShellStep(1, "Set environment", "export ASPNETCORE_ENVIRONMENT=Development") - .AddManualStep(2, "Configure in deployment", "Set ASPNETCORE_ENVIRONMENT in your deployment configuration")) + .AddManualStep(2, "Configure in deployment", "Set ASPNETCORE_ENVIRONMENT in docker-compose.yml or Kubernetes manifest") + .WithRunbookUrl("docs/doctor/articles/core/core-env-variables.md")) .WithVerification("stella doctor --check check.core.env.variables") .Build()); } @@ -92,10 +110,8 @@ public sealed class EnvironmentVariablesCheck : IDoctorCheck .WithEvidence("Environment status", e => { foreach (var kv in found) - { e.Add(kv.Key, kv.Value); - } - e.Add("TotalStellaVars", 
stellaVars.Count.ToString(CultureInfo.InvariantCulture)); + e.Add("TotalPlatformVars", stellaVars.Count.ToString(CultureInfo.InvariantCulture)); }) .Build()); } diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.Core/Checks/MemoryUsageCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.Core/Checks/MemoryUsageCheck.cs index 68ff756ad..791afa06e 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.Core/Checks/MemoryUsageCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.Core/Checks/MemoryUsageCheck.cs @@ -74,7 +74,8 @@ public sealed class MemoryUsageCheck : IDoctorCheck .WithRemediation(r => r .AddManualStep(1, "Analyze memory usage", "Use dotnet-dump or dotnet-gcdump to analyze memory") .AddShellStep(2, "Force garbage collection", "GC.Collect() - only for diagnostics") - .AddManualStep(3, "Review memory allocation patterns", "Look for large object allocations or memory leaks")) + .AddManualStep(3, "Review memory allocation patterns", "Look for large object allocations or memory leaks") + .WithRunbookUrl("docs/doctor/articles/core/core-env-memory.md")) .WithVerification("stella doctor --check check.core.env.memory") .Build()); } diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.Core/Checks/RequiredSettingsCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.Core/Checks/RequiredSettingsCheck.cs index 9acb8398b..0e6d0dd90 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.Core/Checks/RequiredSettingsCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.Core/Checks/RequiredSettingsCheck.cs @@ -7,13 +7,32 @@ namespace StellaOps.Doctor.Plugins.Core.Checks; /// /// Verifies that required configuration settings are present and valid. +/// Checks multiple key variants to support both appsettings.json and +/// environment-variable configuration (Docker compose, Kubernetes, etc.). 
/// public sealed class RequiredSettingsCheck : IDoctorCheck { - private static readonly string[] RequiredSettings = + /// + /// Required settings: at least one key variant must be present. + /// The first entry in each group is the canonical name shown in diagnostics. + /// + private static readonly string[][] RequiredSettingVariants = [ - "ConnectionStrings:DefaultConnection", - "Logging:LogLevel:Default" + // Connection string: multiple key conventions + [ + "ConnectionStrings:DefaultConnection", + "ConnectionStrings:Default", + "CONNECTIONSTRINGS__DEFAULTCONNECTION", + "CONNECTIONSTRINGS__DEFAULT", + ], + ]; + + /// + /// Recommended (but not required) settings — reported as warnings, not failures. + /// + private static readonly string[][] RecommendedSettingVariants = + [ + ["Logging:LogLevel:Default"], ]; /// @@ -41,28 +60,58 @@ public sealed class RequiredSettingsCheck : IDoctorCheck public Task RunAsync(DoctorPluginContext context, CancellationToken ct) { var result = context.CreateResult(CheckId, "stellaops.doctor.core", DoctorCategory.Core.ToString()); - var config = context.Configuration; + var missing = new List(); var present = new List(); + var warnings = new List(); // Check plugin-specific required settings var customRequired = context.PluginConfig.GetSection("RequiredSettings") .Get() ?? 
[]; - var allRequired = RequiredSettings.Concat(customRequired).Distinct(); + // Check required settings (multiple key variants) + foreach (var variants in RequiredSettingVariants) + { + var canonicalName = variants[0]; + var found = false; + foreach (var variant in variants) + { + var value = config[variant]; + if (!string.IsNullOrEmpty(value)) + { + found = true; + break; + } + // Also check environment variables directly (Docker compose uses __ separator) + var envValue = Environment.GetEnvironmentVariable(variant.Replace(":", "__")); + if (!string.IsNullOrEmpty(envValue)) + { + found = true; + break; + } + } - foreach (var setting in allRequired) + if (found) present.Add(canonicalName); + else missing.Add(canonicalName); + } + + // Check custom required settings (single key) + foreach (var setting in customRequired) { var value = config[setting]; if (string.IsNullOrEmpty(value)) - { missing.Add(setting); - } else - { present.Add(setting); - } + } + + // Check recommended settings (warning only) + foreach (var variants in RecommendedSettingVariants) + { + var canonicalName = variants[0]; + var found = variants.Any(v => !string.IsNullOrEmpty(config[v])); + if (!found) warnings.Add(canonicalName); } if (missing.Count > 0) @@ -74,31 +123,42 @@ public sealed class RequiredSettingsCheck : IDoctorCheck e.Add("MissingCount", missing.Count.ToString()); e.Add("PresentCount", present.Count.ToString()); e.Add("MissingSettings", string.Join(", ", missing)); + if (warnings.Count > 0) + e.Add("Warnings", string.Join(", ", warnings)); }) .WithCauses( - "Configuration file missing required values", - "Environment variables not set", - "Secrets not configured") + "Database connection string not configured", + "Environment variables not set (check Docker compose .env or service environment)") .WithRemediation(r => { - r.AddManualStep(1, "Add missing settings to configuration", - $"Add the following settings to appsettings.json or environment: {string.Join(", ", missing)}"); - - 
if (missing.Any(m => m.StartsWith("ConnectionStrings:", StringComparison.Ordinal))) + if (missing.Any(m => m.Contains("ConnectionStrings", StringComparison.Ordinal))) { - r.AddManualStep(2, "Configure database connection", - "Set ConnectionStrings:DefaultConnection in appsettings.json or CONNECTIONSTRINGS__DEFAULTCONNECTION env var"); + r.AddManualStep(1, "Configure database connection", + "Set ConnectionStrings__Default or ConnectionStrings__DefaultConnection as an environment variable, " + + "or add ConnectionStrings:Default to appsettings.json. " + + "In Docker compose, add to the service environment section in docker-compose.yml."); } + else + { + r.AddManualStep(1, "Add missing settings", + $"Configure: {string.Join(", ", missing)}"); + } + r.WithRunbookUrl("docs/doctor/articles/core/core-config-required.md"); }) .WithVerification("stella doctor --check check.core.config.required") .Build()); } + var msg = $"All {present.Count} required settings are configured"; + if (warnings.Count > 0) + msg += $" ({warnings.Count} recommended setting(s) using defaults: {string.Join(", ", warnings)})"; + return Task.FromResult(result - .Pass($"All {present.Count} required settings are configured") + .Pass(msg) .WithEvidence("Settings status", e => e .Add("TotalRequired", present.Count.ToString()) - .Add("AllPresent", "true")) + .Add("AllPresent", "true") + .Add("RecommendedMissing", warnings.Count.ToString())) .Build()); } } diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.Core/Checks/ServiceHealthCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.Core/Checks/ServiceHealthCheck.cs index 4a4ec06b5..2695704e2 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.Core/Checks/ServiceHealthCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.Core/Checks/ServiceHealthCheck.cs @@ -88,7 +88,8 @@ public sealed class ServiceHealthCheck : IDoctorCheck "External API unreachable") .WithRemediation(r => r .AddShellStep(1, "Check health endpoint", "curl -s 
http://localhost:5000/health | jq") - .AddManualStep(2, "Review failing services", $"Investigate: {string.Join(", ", failedChecks)}")) + .AddManualStep(2, "Review failing services", $"Investigate: {string.Join(", ", failedChecks)}") + .WithRunbookUrl("docs/doctor/articles/core/core-services-health.md")) .WithVerification("stella doctor --check check.core.services.health") .Build(); } diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.Cryptography/Checks/CryptoLicenseCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.Cryptography/Checks/CryptoLicenseCheck.cs index 3c0f00e90..65e002e9a 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.Cryptography/Checks/CryptoLicenseCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.Cryptography/Checks/CryptoLicenseCheck.cs @@ -99,7 +99,8 @@ public sealed class CryptoLicenseCheck : IDoctorCheck .WithRemediation(r => r .AddManualStep(1, "Verify license", "Check license file exists and is valid") .AddManualStep(2, "Renew license", "Contact vendor to renew expired licenses") - .AddManualStep(3, "Configure license path", "Set Cryptography::LicensePath in configuration")) + .AddManualStep(3, "Configure license path", "Set Cryptography::LicensePath in configuration") + .WithRunbookUrl("docs/doctor/articles/crypto/crypto-license.md")) .WithVerification("stella doctor --check check.crypto.license") .Build()); } diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.Cryptography/Checks/CryptoProCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.Cryptography/Checks/CryptoProCheck.cs index 7e5ac8297..734b9f004 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.Cryptography/Checks/CryptoProCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.Cryptography/Checks/CryptoProCheck.cs @@ -154,7 +154,8 @@ public sealed class CryptoProCheck : IDoctorCheck .WithRemediation(r => r .AddManualStep(1, "Install CryptoPro", "Download and install CryptoPro CSP from cryptopro.ru") .AddManualStep(2, "Set build flag", "Set 
STELLAOPS_CRYPTO_PRO=1 environment variable") - .AddManualStep(3, "Configure license", "Configure Cryptography:CryptoPro:LicensePath")) + .AddManualStep(3, "Configure license", "Configure Cryptography:CryptoPro:LicensePath") + .WithRunbookUrl("docs/doctor/articles/crypto/crypto-cryptopro.md")) .WithVerification("stella doctor --check check.crypto.cryptopro") .Build()); } diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.Cryptography/Checks/CryptoProviderAvailabilityCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.Cryptography/Checks/CryptoProviderAvailabilityCheck.cs index bbeecdfa7..640f57561 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.Cryptography/Checks/CryptoProviderAvailabilityCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.Cryptography/Checks/CryptoProviderAvailabilityCheck.cs @@ -113,7 +113,8 @@ public sealed class CryptoProviderAvailabilityCheck : IDoctorCheck .WithCauses(issues.ToArray()) .WithRemediation(r => r .AddManualStep(1, "Check runtime", "Ensure .NET runtime supports required algorithms") - .AddManualStep(2, "Install providers", "Install additional crypto libraries if needed")) + .AddManualStep(2, "Install providers", "Install additional crypto libraries if needed") + .WithRunbookUrl("docs/doctor/articles/crypto/crypto-provider.md")) .WithVerification("stella doctor --check check.crypto.provider") .Build()); } diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.Cryptography/Checks/EidasProviderCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.Cryptography/Checks/EidasProviderCheck.cs index d34cb1226..61a037ce1 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.Cryptography/Checks/EidasProviderCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.Cryptography/Checks/EidasProviderCheck.cs @@ -91,7 +91,8 @@ public sealed class EidasProviderCheck : IDoctorCheck .WithCauses(issues.ToArray()) .WithRemediation(r => r .AddManualStep(1, "Configure provider", "Configure PKCS#11 library or certificate store for eIDAS") - 
.AddManualStep(2, "Verify trust list", "Ensure EU Trust List is accessible")) + .AddManualStep(2, "Verify trust list", "Ensure EU Trust List is accessible") + .WithRunbookUrl("docs/doctor/articles/crypto/crypto-eidas.md")) .WithVerification("stella doctor --check check.crypto.eidas") .Build()); } diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.Cryptography/Checks/FipsComplianceCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.Cryptography/Checks/FipsComplianceCheck.cs index c9cdfab60..058c5a230 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.Cryptography/Checks/FipsComplianceCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.Cryptography/Checks/FipsComplianceCheck.cs @@ -55,7 +55,8 @@ public sealed class FipsComplianceCheck : IDoctorCheck .WithCauses("System FIPS mode is not enabled but configuration requires it") .WithRemediation(r => r .AddManualStep(1, "Enable FIPS on Windows", "Set FIPS security policy in Windows Group Policy") - .AddManualStep(2, "Enable FIPS on Linux", "Configure system crypto policy with fips-mode-setup")) + .AddManualStep(2, "Enable FIPS on Linux", "Configure system crypto policy with fips-mode-setup") + .WithRunbookUrl("docs/doctor/articles/crypto/crypto-fips.md")) .WithVerification("stella doctor --check check.crypto.fips") .Build()); } diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.Cryptography/Checks/GostProviderCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.Cryptography/Checks/GostProviderCheck.cs index 8bd523dc8..191fc7bda 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.Cryptography/Checks/GostProviderCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.Cryptography/Checks/GostProviderCheck.cs @@ -97,7 +97,8 @@ public sealed class GostProviderCheck : IDoctorCheck .WithCauses(issues.ToArray()) .WithRemediation(r => r .AddManualStep(1, "Install provider", "Install the configured GOST provider (OpenSSL GOST engine, CryptoPro CSP, or PKCS#11)") - .AddManualStep(2, "Configure endpoint", "For remote 
providers, configure the service endpoint")) + .AddManualStep(2, "Configure endpoint", "For remote providers, configure the service endpoint") + .WithRunbookUrl("docs/doctor/articles/crypto/crypto-gost.md")) .WithVerification("stella doctor --check check.crypto.gost") .Build()); } diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.Cryptography/Checks/HsmConnectivityCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.Cryptography/Checks/HsmConnectivityCheck.cs index 0343e8242..374a64cef 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.Cryptography/Checks/HsmConnectivityCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.Cryptography/Checks/HsmConnectivityCheck.cs @@ -115,7 +115,8 @@ public sealed class HsmConnectivityCheck : IDoctorCheck .WithRemediation(r => r .AddManualStep(1, "Install PKCS#11 library", "Ensure the HSM PKCS#11 library is installed") .AddManualStep(2, "Check connectivity", "Verify network/USB connectivity to HSM") - .AddManualStep(3, "Verify credentials", "Ensure HSM credentials are configured")) + .AddManualStep(3, "Verify credentials", "Ensure HSM credentials are configured") + .WithRunbookUrl("docs/doctor/articles/crypto/crypto-hsm.md")) .WithVerification("stella doctor --check check.crypto.hsm") .Build()); } diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.Cryptography/Checks/SmProviderCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.Cryptography/Checks/SmProviderCheck.cs index 2a3f7592f..67ecf2068 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.Cryptography/Checks/SmProviderCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.Cryptography/Checks/SmProviderCheck.cs @@ -93,7 +93,8 @@ public sealed class SmProviderCheck : IDoctorCheck .WithCauses(issues.ToArray()) .WithRemediation(r => r .AddManualStep(1, "Set environment gate", "Set SM_SOFT_ALLOWED=1 to enable SM software providers") - .AddManualStep(2, "Configure SmRemote", "Configure SmRemote:Endpoint for remote SM crypto service")) + .AddManualStep(2, "Configure 
SmRemote", "Configure SmRemote:Endpoint for remote SM crypto service") + .WithRunbookUrl("docs/doctor/articles/crypto/crypto-sm.md")) .WithVerification("stella doctor --check check.crypto.sm") .Build()); } diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.Database/Checks/ConnectionPoolHealthCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.Database/Checks/ConnectionPoolHealthCheck.cs index acc87b301..bae32052c 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.Database/Checks/ConnectionPoolHealthCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.Database/Checks/ConnectionPoolHealthCheck.cs @@ -86,7 +86,8 @@ public sealed class ConnectionPoolHealthCheck : DatabaseCheckBase "Deadlock or lock contention") .WithRemediation(r => r .AddShellStep(1, "Find idle transactions", "psql -c \"SELECT pid, query FROM pg_stat_activity WHERE state = 'idle in transaction'\"") - .AddManualStep(2, "Review application code", "Ensure transactions are properly committed or rolled back")) + .AddManualStep(2, "Review application code", "Ensure transactions are properly committed or rolled back") + .WithRunbookUrl("docs/doctor/articles/postgres/db-pool-health.md")) .WithVerification("stella doctor --check check.db.pool.health") .Build(); } @@ -107,7 +108,8 @@ public sealed class ConnectionPoolHealthCheck : DatabaseCheckBase "max_connections too low for workload") .WithRemediation(r => r .AddManualStep(1, "Review connection pool settings", "Check Npgsql connection string pool size") - .AddManualStep(2, "Consider increasing max_connections", "Edit postgresql.conf if appropriate")) + .AddManualStep(2, "Consider increasing max_connections", "Edit postgresql.conf if appropriate") + .WithRunbookUrl("docs/doctor/articles/postgres/db-pool-health.md")) .WithVerification("stella doctor --check check.db.pool.health") .Build(); } diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.Database/Checks/ConnectionPoolSizeCheck.cs 
b/src/__Libraries/StellaOps.Doctor.Plugins.Database/Checks/ConnectionPoolSizeCheck.cs index 632b966d8..2976155a4 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.Database/Checks/ConnectionPoolSizeCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.Database/Checks/ConnectionPoolSizeCheck.cs @@ -66,7 +66,8 @@ public sealed class ConnectionPoolSizeCheck : DatabaseCheckBase "Pooling=false in connection string", "Connection string misconfiguration") .WithRemediation(r => r - .AddManualStep(1, "Enable pooling", "Set Pooling=true in connection string")) + .AddManualStep(1, "Enable pooling", "Set Pooling=true in connection string") + .WithRunbookUrl("docs/doctor/articles/postgres/db-pool-size.md")) .WithVerification("stella doctor --check check.db.pool.size") .Build(); } @@ -87,7 +88,8 @@ public sealed class ConnectionPoolSizeCheck : DatabaseCheckBase "Multiple application instances sharing connection limit") .WithRemediation(r => r .AddManualStep(1, "Reduce pool size", $"Set Max Pool Size={availableConnections / 2} in connection string") - .AddManualStep(2, "Or increase server limit", "Increase max_connections in postgresql.conf")) + .AddManualStep(2, "Or increase server limit", "Increase max_connections in postgresql.conf") + .WithRunbookUrl("docs/doctor/articles/postgres/db-pool-size.md")) .WithVerification("stella doctor --check check.db.pool.size") .Build(); } diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.Database/Checks/DatabasePermissionsCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.Database/Checks/DatabasePermissionsCheck.cs index 2c7d00995..687ab7602 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.Database/Checks/DatabasePermissionsCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.Database/Checks/DatabasePermissionsCheck.cs @@ -112,7 +112,8 @@ public sealed class DatabasePermissionsCheck : DatabaseCheckBase .WithRemediation(r => r .AddManualStep(1, "Create dedicated user", "CREATE USER stellaops WITH PASSWORD 'secure_password'") 
.AddManualStep(2, "Grant minimal permissions", "GRANT CONNECT ON DATABASE stellaops TO stellaops") - .AddManualStep(3, "Update connection string", "Change user in connection string to dedicated user")) + .AddManualStep(3, "Update connection string", "Change user in connection string to dedicated user") + .WithRunbookUrl("docs/doctor/articles/postgres/db-permissions.md")) .WithVerification("stella doctor --check check.db.permissions") .Build(); } @@ -134,7 +135,8 @@ public sealed class DatabasePermissionsCheck : DatabaseCheckBase "Restrictive default privileges") .WithRemediation(r => r .AddManualStep(1, "Grant schema access", $"GRANT USAGE ON SCHEMA public TO {currentUser}") - .AddManualStep(2, "Grant table access", $"GRANT SELECT, INSERT, UPDATE, DELETE ON ALL TABLES IN SCHEMA public TO {currentUser}")) + .AddManualStep(2, "Grant table access", $"GRANT SELECT, INSERT, UPDATE, DELETE ON ALL TABLES IN SCHEMA public TO {currentUser}") + .WithRunbookUrl("docs/doctor/articles/postgres/db-permissions.md")) .WithVerification("stella doctor --check check.db.permissions") .Build(); } diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.Database/Checks/FailedMigrationsCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.Database/Checks/FailedMigrationsCheck.cs index af70680d8..ca92e0e19 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.Database/Checks/FailedMigrationsCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.Database/Checks/FailedMigrationsCheck.cs @@ -88,7 +88,8 @@ public sealed class FailedMigrationsCheck : DatabaseCheckBase .WithRemediation(r => r .AddManualStep(1, "Review migration logs", "Check application logs for migration error details") .AddManualStep(2, "Fix migration issues", "Resolve the underlying issue and retry migration") - .AddShellStep(3, "Retry migrations", "dotnet ef database update")) + .AddShellStep(3, "Retry migrations", "dotnet ef database update") + .WithRunbookUrl("docs/doctor/articles/postgres/db-migrations-failed.md")) 
.WithVerification("stella doctor --check check.db.migrations.failed") .Build(); } diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.Database/Checks/QueryLatencyCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.Database/Checks/QueryLatencyCheck.cs index 1a27112fc..65da7ae23 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.Database/Checks/QueryLatencyCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.Database/Checks/QueryLatencyCheck.cs @@ -110,7 +110,8 @@ public sealed class QueryLatencyCheck : DatabaseCheckBase .WithRemediation(r => r .AddShellStep(1, "Check server load", "psql -c \"SELECT * FROM pg_stat_activity WHERE state = 'active'\"") .AddShellStep(2, "Check for locks", "psql -c \"SELECT * FROM pg_locks WHERE NOT granted\"") - .AddManualStep(3, "Review network path", "Check network latency between application and database")) + .AddManualStep(3, "Review network path", "Check network latency between application and database") + .WithRunbookUrl("docs/doctor/articles/postgres/db-latency.md")) .WithVerification("stella doctor --check check.db.latency") .Build(); } @@ -129,7 +130,8 @@ public sealed class QueryLatencyCheck : DatabaseCheckBase "Network latency to database server", "Database server moderately loaded") .WithRemediation(r => r - .AddManualStep(1, "Monitor trends", "Track latency over time to identify patterns")) + .AddManualStep(1, "Monitor trends", "Track latency over time to identify patterns") + .WithRunbookUrl("docs/doctor/articles/postgres/db-latency.md")) .WithVerification("stella doctor --check check.db.latency") .Build(); } diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.Database/Checks/SchemaVersionCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.Database/Checks/SchemaVersionCheck.cs index 7f221bef5..371aeca3e 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.Database/Checks/SchemaVersionCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.Database/Checks/SchemaVersionCheck.cs @@ -94,7 +94,8 @@ public sealed class 
SchemaVersionCheck : DatabaseCheckBase "Manual DDL changes") .WithRemediation(r => r .AddShellStep(1, "List orphaned FKs", "psql -c \"SELECT conname FROM pg_constraint WHERE NOT convalidated\"") - .AddManualStep(2, "Review and clean up", "Drop or fix orphaned constraints")) + .AddManualStep(2, "Review and clean up", "Drop or fix orphaned constraints") + .WithRunbookUrl("docs/doctor/articles/postgres/db-schema-version.md")) .WithVerification("stella doctor --check check.db.schema.version") .Build(); } diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.Docker/Checks/DockerApiVersionCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.Docker/Checks/DockerApiVersionCheck.cs index fb5eba85f..8ce94d025 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.Docker/Checks/DockerApiVersionCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.Docker/Checks/DockerApiVersionCheck.cs @@ -88,7 +88,8 @@ public sealed class DockerApiVersionCheck : IDoctorCheck .WithCauses(issues.ToArray()) .WithRemediation(r => r .AddManualStep(1, "Update Docker", "Install the latest Docker version for your OS") - .AddManualStep(2, "Verify version", "Run: docker version")) + .AddManualStep(2, "Verify version", "Run: docker version") + .WithRunbookUrl("docs/doctor/articles/docker/docker-apiversion.md")) .WithVerification("stella doctor --check check.docker.apiversion") .Build(); } diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.Docker/Checks/DockerDaemonCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.Docker/Checks/DockerDaemonCheck.cs index 25aa2749b..d7ae1d995 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.Docker/Checks/DockerDaemonCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.Docker/Checks/DockerDaemonCheck.cs @@ -80,7 +80,8 @@ public sealed class DockerDaemonCheck : IDoctorCheck .WithCauses("Docker daemon returned an error response") .WithRemediation(r => r .AddManualStep(1, "Check daemon status", "Run: docker info") - .AddManualStep(2, "Restart daemon", "Run: sudo 
systemctl restart docker")) + .AddManualStep(2, "Restart daemon", "Run: sudo systemctl restart docker") + .WithRunbookUrl("docs/doctor/articles/docker/docker-daemon.md")) .WithVerification("stella doctor --check check.docker.daemon") .Build(); } @@ -98,7 +99,8 @@ public sealed class DockerDaemonCheck : IDoctorCheck .WithRemediation(r => r .AddManualStep(1, "Install Docker", "Follow Docker installation guide for your OS") .AddManualStep(2, "Start daemon", "Run: sudo systemctl start docker") - .AddManualStep(3, "Verify installation", "Run: docker version")) + .AddManualStep(3, "Verify installation", "Run: docker version") + .WithRunbookUrl("docs/doctor/articles/docker/docker-daemon.md")) .WithVerification("stella doctor --check check.docker.daemon") .Build(); } diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.Docker/Checks/DockerNetworkCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.Docker/Checks/DockerNetworkCheck.cs index 57672202d..8fcafc57e 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.Docker/Checks/DockerNetworkCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.Docker/Checks/DockerNetworkCheck.cs @@ -97,7 +97,8 @@ public sealed class DockerNetworkCheck : IDoctorCheck .WithCauses(issues.ToArray()) .WithRemediation(r => r .AddManualStep(1, "List networks", "Run: docker network ls") - .AddManualStep(2, "Create network", "Run: docker network create ")) + .AddManualStep(2, "Create network", "Run: docker network create ") + .WithRunbookUrl("docs/doctor/articles/docker/docker-network.md")) .WithVerification("stella doctor --check check.docker.network") .Build(); } diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.Docker/Checks/DockerSocketCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.Docker/Checks/DockerSocketCheck.cs index 953d4d0c2..4bcb205a5 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.Docker/Checks/DockerSocketCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.Docker/Checks/DockerSocketCheck.cs @@ -86,8 +86,28 @@ public 
sealed class DockerSocketCheck : IDoctorCheck } } + // Detect if we're running inside a container (no socket is expected) + var insideContainer = File.Exists("/.dockerenv") || File.Exists("/proc/1/cgroup"); + if (!socketExists) { + if (insideContainer) + { + // Inside a container without socket mount — this is normal for services + // that don't need direct Docker access (like Doctor, Platform, etc.) + return result + .Pass("Running inside container — Docker socket not required") + .WithEvidence("Docker socket", e => + { + e.Add("Path", socketPath); + e.Add("Exists", "false"); + e.Add("InsideContainer", "true"); + e.Add("Note", "Docker socket is not mounted. This is expected for most services. " + + "Only mount the socket if this service needs to manage containers."); + }) + .Build(); + } + issues.Add($"Docker socket not found at {socketPath}"); } else if (!socketReadable || !socketWritable) @@ -98,7 +118,7 @@ public sealed class DockerSocketCheck : IDoctorCheck if (issues.Count > 0) { return result - .Fail($"{issues.Count} Docker socket issue(s)") + .Warn($"{issues.Count} Docker socket issue(s)") .WithEvidence("Docker socket", e => { e.Add("Path", socketPath); @@ -109,8 +129,10 @@ public sealed class DockerSocketCheck : IDoctorCheck .WithCauses(issues.ToArray()) .WithRemediation(r => r .AddManualStep(1, "Check Docker installation", "Ensure Docker is installed and running") - .AddManualStep(2, "Add user to docker group", "Run: sudo usermod -aG docker $USER") - .AddManualStep(3, "Re-login", "Log out and back in for group changes to take effect")) + .AddManualStep(2, "Mount Docker socket", "Add -v /var/run/docker.sock:/var/run/docker.sock to docker run, " + + "or volumes: ['/var/run/docker.sock:/var/run/docker.sock'] in docker-compose.yml") + .AddManualStep(3, "Add user to docker group", "Run: sudo usermod -aG docker $USER && logout") + .WithRunbookUrl("docs/doctor/articles/docker/docker-socket.md")) .WithVerification("stella doctor --check check.docker.socket") 
.Build(); } diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.Docker/Checks/DockerStorageCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.Docker/Checks/DockerStorageCheck.cs index 49060eb78..fe03a3bfc 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.Docker/Checks/DockerStorageCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.Docker/Checks/DockerStorageCheck.cs @@ -123,7 +123,8 @@ public sealed class DockerStorageCheck : IDoctorCheck .WithRemediation(r => r .AddManualStep(1, "Prune unused data", "Run: docker system prune -a") .AddManualStep(2, "Check disk usage", "Run: docker system df") - .AddManualStep(3, "Add storage", "Expand disk or add additional storage")) + .AddManualStep(3, "Add storage", "Expand disk or add additional storage") + .WithRunbookUrl("docs/doctor/articles/docker/docker-storage.md")) .WithVerification("stella doctor --check check.docker.storage") .Build(); } diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.Integration/Checks/CiSystemConnectivityCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.Integration/Checks/CiSystemConnectivityCheck.cs index b261aa9e5..53ce1ffab 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.Integration/Checks/CiSystemConnectivityCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.Integration/Checks/CiSystemConnectivityCheck.cs @@ -120,6 +120,7 @@ public sealed class CiSystemConnectivityCheck : IDoctorCheck rb.AddStep(2, "Refresh credentials", $"stella ci auth refresh {unhealthy[0]}", CommandType.Manual); + rb.WithRunbookUrl("docs/doctor/articles/integration/integration-ci-system.md"); }) .WithVerification($"stella doctor --check {CheckId}") .Build(); @@ -145,6 +146,7 @@ public sealed class CiSystemConnectivityCheck : IDoctorCheck rb.AddStep(1, "Check runner status", $"stella ci runners {noRunners[0]}", CommandType.Shell); + rb.WithRunbookUrl("docs/doctor/articles/integration/integration-ci-system.md"); }) .WithVerification($"stella doctor --check {CheckId}") .Build(); diff --git 
a/src/__Libraries/StellaOps.Doctor.Plugins.Integration/Checks/GitProviderCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.Integration/Checks/GitProviderCheck.cs index 8561325ab..ad0104852 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.Integration/Checks/GitProviderCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.Integration/Checks/GitProviderCheck.cs @@ -126,7 +126,8 @@ public sealed class GitProviderCheck : IDoctorCheck "Git provider service is down") .WithRemediation(r => r .AddManualStep(1, "Verify Git URL", "Check Git:Url configuration") - .AddManualStep(2, "Test connectivity", $"curl -v {gitUrl}")) + .AddManualStep(2, "Test connectivity", $"curl -v {gitUrl}") + .WithRunbookUrl("docs/doctor/articles/integration/integration-git.md")) .WithVerification("stella doctor --check check.integration.git") .Build(); } diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.Integration/Checks/IntegrationWebhookHealthCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.Integration/Checks/IntegrationWebhookHealthCheck.cs index eeafaa7dd..3bbe06235 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.Integration/Checks/IntegrationWebhookHealthCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.Integration/Checks/IntegrationWebhookHealthCheck.cs @@ -174,7 +174,8 @@ public sealed class IntegrationWebhookHealthCheck : IDoctorCheck .WithRemediation(rb => rb .AddStep(1, "Monitor webhook metrics", "stella webhooks stats", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/integration/integration-webhooks.md")) .WithVerification($"stella doctor --check {CheckId}") .Build(); } diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.Integration/Checks/LdapConnectivityCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.Integration/Checks/LdapConnectivityCheck.cs index 7612c5349..f15a6e67f 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.Integration/Checks/LdapConnectivityCheck.cs +++ 
b/src/__Libraries/StellaOps.Doctor.Plugins.Integration/Checks/LdapConnectivityCheck.cs @@ -95,7 +95,8 @@ public sealed class LdapConnectivityCheck : IDoctorCheck "Network connectivity issues") .WithRemediation(r => r .AddManualStep(1, "Check LDAP server", "Verify LDAP server is running and accessible") - .AddManualStep(2, "Test connectivity", $"telnet {host} {port}")) + .AddManualStep(2, "Test connectivity", $"telnet {host} {port}") + .WithRunbookUrl("docs/doctor/articles/integration/integration-ldap.md")) .WithVerification("stella doctor --check check.integration.ldap") .Build(); } @@ -143,7 +144,8 @@ public sealed class LdapConnectivityCheck : IDoctorCheck "Network unreachable") .WithRemediation(r => r .AddManualStep(1, "Check LDAP configuration", "Verify Ldap:Host and Ldap:Port settings") - .AddManualStep(2, "Check DNS", $"nslookup {host}")) + .AddManualStep(2, "Check DNS", $"nslookup {host}") + .WithRunbookUrl("docs/doctor/articles/integration/integration-ldap.md")) .WithVerification("stella doctor --check check.integration.ldap") .Build(); } diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.Integration/Checks/ObjectStorageCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.Integration/Checks/ObjectStorageCheck.cs index b50786d18..64c58d5c0 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.Integration/Checks/ObjectStorageCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.Integration/Checks/ObjectStorageCheck.cs @@ -89,7 +89,8 @@ public sealed class ObjectStorageCheck : IDoctorCheck "Firewall blocking connection") .WithRemediation(r => r .AddManualStep(1, "Check S3 endpoint", "Verify S3:Endpoint configuration") - .AddManualStep(2, "Test connectivity", $"curl -v {endpoint}")) + .AddManualStep(2, "Test connectivity", $"curl -v {endpoint}") + .WithRunbookUrl("docs/doctor/articles/integration/integration-s3-storage.md")) .WithVerification("stella doctor --check check.integration.s3.storage") .Build(); } @@ -148,7 +149,8 @@ public sealed class 
ObjectStorageCheck : IDoctorCheck "Network unreachable") .WithRemediation(r => r .AddManualStep(1, "Check S3 service", "Verify MinIO or S3 service is running") - .AddManualStep(2, "Check DNS", $"nslookup {new Uri(endpoint).Host}")) + .AddManualStep(2, "Check DNS", $"nslookup {new Uri(endpoint).Host}") + .WithRunbookUrl("docs/doctor/articles/integration/integration-s3-storage.md")) .WithVerification("stella doctor --check check.integration.s3.storage") .Build(); } diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.Integration/Checks/OciRegistryCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.Integration/Checks/OciRegistryCheck.cs index cf0442c32..1890d6caa 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.Integration/Checks/OciRegistryCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.Integration/Checks/OciRegistryCheck.cs @@ -114,7 +114,8 @@ public sealed class OciRegistryCheck : IDoctorCheck "Registry service is down") .WithRemediation(r => r .AddManualStep(1, "Verify registry URL", "Check OCI:RegistryUrl configuration") - .AddManualStep(2, "Test connectivity", $"curl -v {registryUrl}/v2/")) + .AddManualStep(2, "Test connectivity", $"curl -v {registryUrl}/v2/") + .WithRunbookUrl("docs/doctor/articles/integration/integration-oci-registry.md")) .WithVerification("stella doctor --check check.integration.oci.registry") .Build(); } diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.Integration/Checks/OidcProviderCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.Integration/Checks/OidcProviderCheck.cs index 6466dcec7..e6055bb74 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.Integration/Checks/OidcProviderCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.Integration/Checks/OidcProviderCheck.cs @@ -129,7 +129,8 @@ public sealed class OidcProviderCheck : IDoctorCheck "OIDC provider does not support discovery") .WithRemediation(r => r .AddManualStep(1, "Verify issuer URL", "Check Oidc:Issuer configuration") - .AddManualStep(2, "Test 
discovery", $"curl -v {discoveryUrl}")) + .AddManualStep(2, "Test discovery", $"curl -v {discoveryUrl}") + .WithRunbookUrl("docs/doctor/articles/integration/integration-oidc.md")) .WithVerification("stella doctor --check check.integration.oidc") .Build(); } @@ -149,7 +150,8 @@ public sealed class OidcProviderCheck : IDoctorCheck "OIDC provider is down") .WithRemediation(r => r .AddManualStep(1, "Verify issuer URL", "Check Oidc:Issuer configuration") - .AddManualStep(2, "Test connectivity", $"curl -v {issuer}/.well-known/openid-configuration")) + .AddManualStep(2, "Test connectivity", $"curl -v {issuer}/.well-known/openid-configuration") + .WithRunbookUrl("docs/doctor/articles/integration/integration-oidc.md")) .WithVerification("stella doctor --check check.integration.oidc") .Build(); } diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.Integration/Checks/SecretsManagerConnectivityCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.Integration/Checks/SecretsManagerConnectivityCheck.cs index 85253d34e..b193716be 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.Integration/Checks/SecretsManagerConnectivityCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.Integration/Checks/SecretsManagerConnectivityCheck.cs @@ -121,6 +121,7 @@ public sealed class SecretsManagerConnectivityCheck : IDoctorCheck rb.AddStep(2, "Refresh authentication", $"stella secrets auth refresh {unhealthy[0]}", CommandType.Manual); + rb.WithRunbookUrl("docs/doctor/articles/integration/integration-secrets-manager.md"); }) .WithVerification($"stella doctor --check {CheckId}") .Build(); @@ -149,6 +150,7 @@ public sealed class SecretsManagerConnectivityCheck : IDoctorCheck rb.AddStep(2, "Check seal status", $"stella secrets status {sealed_[0]}", CommandType.Shell); + rb.WithRunbookUrl("docs/doctor/articles/integration/integration-secrets-manager.md"); }) .WithVerification($"stella doctor --check {CheckId}") .Build(); diff --git 
a/src/__Libraries/StellaOps.Doctor.Plugins.Integration/Checks/SmtpCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.Integration/Checks/SmtpCheck.cs index ee6b71929..0741e9b22 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.Integration/Checks/SmtpCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.Integration/Checks/SmtpCheck.cs @@ -90,7 +90,8 @@ public sealed class SmtpCheck : IDoctorCheck "Network connectivity issues") .WithRemediation(r => r .AddManualStep(1, "Check SMTP server", "Verify SMTP server is running") - .AddManualStep(2, "Test connectivity", $"telnet {host} {port}")) + .AddManualStep(2, "Test connectivity", $"telnet {host} {port}") + .WithRunbookUrl("docs/doctor/articles/integration/integration-smtp.md")) .WithVerification("stella doctor --check check.integration.smtp") .Build(); } @@ -138,7 +139,8 @@ public sealed class SmtpCheck : IDoctorCheck "Network unreachable") .WithRemediation(r => r .AddManualStep(1, "Check SMTP configuration", "Verify Smtp:Host and Smtp:Port settings") - .AddManualStep(2, "Check DNS", $"nslookup {host}")) + .AddManualStep(2, "Check DNS", $"nslookup {host}") + .WithRunbookUrl("docs/doctor/articles/integration/integration-smtp.md")) .WithVerification("stella doctor --check check.integration.smtp") .Build(); } diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.Notify/Checks/NotifyChannelConfigurationCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.Notify/Checks/NotifyChannelConfigurationCheck.cs index 79d3e064d..38cc6e144 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.Notify/Checks/NotifyChannelConfigurationCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.Notify/Checks/NotifyChannelConfigurationCheck.cs @@ -128,7 +128,8 @@ public sealed class NotifyChannelConfigurationCheck : IDoctorCheck CommandType.FileEdit) .AddStep(2, "Or run setup wizard", "stella setup --step notify", - CommandType.Shell)) + CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/notify/notify-channel-configured.md")) 
.WithVerification($"stella doctor --check {CheckId}") .Build()); } @@ -145,7 +146,8 @@ public sealed class NotifyChannelConfigurationCheck : IDoctorCheck .WithCauses(issues.ToArray()) .WithRemediation(r => r .AddManualStep(1, "Review configuration", "Check Notify:Channels section for missing values") - .AddStep(2, "Run setup wizard", "stella setup --step notify", CommandType.Shell)) + .AddStep(2, "Run setup wizard", "stella setup --step notify", CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/notify/notify-channel-configured.md")) .WithVerification($"stella doctor --check {CheckId}") .Build()); } @@ -161,7 +163,8 @@ public sealed class NotifyChannelConfigurationCheck : IDoctorCheck }) .WithCauses(issues.ToArray()) .WithRemediation(r => r - .AddManualStep(1, "Review configuration", "Check Notify:Channels section for missing values")) + .AddManualStep(1, "Review configuration", "Check Notify:Channels section for missing values") + .WithRunbookUrl("docs/doctor/articles/notify/notify-channel-configured.md")) .WithVerification($"stella doctor --check {CheckId}") .Build()); } diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.Notify/Checks/NotifyChannelConnectivityCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.Notify/Checks/NotifyChannelConnectivityCheck.cs index 59fc0eb46..a9dcae7ae 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.Notify/Checks/NotifyChannelConnectivityCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.Notify/Checks/NotifyChannelConnectivityCheck.cs @@ -123,6 +123,7 @@ public sealed class NotifyChannelConnectivityCheck : IDoctorCheck { r.AddManualStep(4, "Check webhook endpoint", "Verify webhook endpoint is accessible from this network"); } + r.WithRunbookUrl("docs/doctor/articles/notify/notify-channel-connectivity.md"); }) .WithVerification($"stella doctor --check {CheckId}") .Build(); diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.Notify/Checks/NotifyDeliveryTestCheck.cs 
b/src/__Libraries/StellaOps.Doctor.Plugins.Notify/Checks/NotifyDeliveryTestCheck.cs index 346396947..dd766e252 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.Notify/Checks/NotifyDeliveryTestCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.Notify/Checks/NotifyDeliveryTestCheck.cs @@ -111,7 +111,8 @@ public sealed class NotifyDeliveryTestCheck : IDoctorCheck .WithCauses(issues.ToArray()) .WithRemediation(r => r .AddManualStep(1, "Review delivery settings", "Check Notify:Delivery section for invalid values") - .AddStep(2, "Run setup wizard", "stella setup --step notify", CommandType.Shell)) + .AddStep(2, "Run setup wizard", "stella setup --step notify", CommandType.Shell) + .WithRunbookUrl("docs/doctor/articles/notify/notify-delivery-test.md")) .WithVerification($"stella doctor --check {CheckId}") .Build()); } @@ -139,7 +140,8 @@ public sealed class NotifyDeliveryTestCheck : IDoctorCheck " \"Redis\": { \"ConnectionString\": \"localhost:6379\" }\n" + " }\n" + "}", - CommandType.FileEdit)) + CommandType.FileEdit) + .WithRunbookUrl("docs/doctor/articles/notify/notify-delivery-test.md")) .WithVerification($"stella doctor --check {CheckId}") .Build()); } diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.Observability/Checks/HealthCheckEndpointsCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.Observability/Checks/HealthCheckEndpointsCheck.cs index 5a1bb44d0..59bd2a9e0 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.Observability/Checks/HealthCheckEndpointsCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.Observability/Checks/HealthCheckEndpointsCheck.cs @@ -115,7 +115,8 @@ public sealed class HealthCheckEndpointsCheck : IDoctorCheck .WithCauses(issues.ToArray()) .WithRemediation(r => r .AddManualStep(1, "Configure endpoints", "Set separate /health/ready and /health/live endpoints") - .AddManualStep(2, "Set timeout", "Configure reasonable timeout (5-30 seconds)")) + .AddManualStep(2, "Set timeout", "Configure reasonable timeout (5-30 
seconds)") + .WithRunbookUrl("docs/doctor/articles/observability/observability-healthchecks.md")) .WithVerification("stella doctor --check check.observability.healthchecks") .Build(); } diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.Observability/Checks/LoggingConfigurationCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.Observability/Checks/LoggingConfigurationCheck.cs index 60fa2776c..6d6363998 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.Observability/Checks/LoggingConfigurationCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.Observability/Checks/LoggingConfigurationCheck.cs @@ -89,7 +89,8 @@ public sealed class LoggingConfigurationCheck : IDoctorCheck .WithCauses(issues.ToArray()) .WithRemediation(r => r .AddManualStep(1, "Set appropriate level", "Use 'Information' or 'Warning' for production") - .AddManualStep(2, "Enable structured logging", "Configure Serilog or JSON console formatter")) + .AddManualStep(2, "Enable structured logging", "Configure Serilog or JSON console formatter") + .WithRunbookUrl("docs/doctor/articles/observability/observability-logging.md")) .WithVerification("stella doctor --check check.observability.logging") .Build()); } diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.Observability/Checks/MetricsCollectionCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.Observability/Checks/MetricsCollectionCheck.cs index 5b37b9925..58aad2afe 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.Observability/Checks/MetricsCollectionCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.Observability/Checks/MetricsCollectionCheck.cs @@ -118,7 +118,8 @@ public sealed class MetricsCollectionCheck : IDoctorCheck .WithCauses(issues.ToArray()) .WithRemediation(r => r .AddManualStep(1, "Enable metrics", "Configure Metrics:Enabled or Prometheus:Enabled") - .AddManualStep(2, "Check endpoint", $"curl http://localhost:{metricsPort ?? 
80}{metricsPath}")) + .AddManualStep(2, "Check endpoint", $"curl http://localhost:{metricsPort ?? 80}{metricsPath}") + .WithRunbookUrl("docs/doctor/articles/observability/observability-metrics.md")) .WithVerification("stella doctor --check check.observability.metrics") .Build(); } diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.Observability/Checks/OpenTelemetryCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.Observability/Checks/OpenTelemetryCheck.cs index e6dd535d8..4437b4cdc 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.Observability/Checks/OpenTelemetryCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.Observability/Checks/OpenTelemetryCheck.cs @@ -125,7 +125,8 @@ public sealed class OpenTelemetryCheck : IDoctorCheck .WithCauses(issues.ToArray()) .WithRemediation(r => r .AddManualStep(1, "Set service name", "Configure OTEL_SERVICE_NAME environment variable") - .AddManualStep(2, "Verify endpoint", "Ensure OpenTelemetry collector is running")) + .AddManualStep(2, "Verify endpoint", "Ensure OpenTelemetry collector is running") + .WithRunbookUrl("docs/doctor/articles/observability/observability-otel.md")) .WithVerification("stella doctor --check check.observability.otel") .Build(); } diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.Observability/Checks/TracingConfigurationCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.Observability/Checks/TracingConfigurationCheck.cs index 9097b2352..be8f8cd1a 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.Observability/Checks/TracingConfigurationCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.Observability/Checks/TracingConfigurationCheck.cs @@ -116,7 +116,8 @@ public sealed class TracingConfigurationCheck : IDoctorCheck .WithCauses(issues.ToArray()) .WithRemediation(r => r .AddManualStep(1, "Set sampling ratio", "Configure Tracing:SamplingRatio between 0.01 and 1.0") - .AddManualStep(2, "Enable instrumentation", "Enable HTTP and database instrumentation")) + .AddManualStep(2, "Enable 
instrumentation", "Enable HTTP and database instrumentation") + .WithRunbookUrl("docs/doctor/articles/observability/observability-tracing.md")) .WithVerification("stella doctor --check check.observability.tracing") .Build()); } diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.Security/Checks/ApiKeySecurityCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.Security/Checks/ApiKeySecurityCheck.cs index 1ceb1cff6..fb66fe7f7 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.Security/Checks/ApiKeySecurityCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.Security/Checks/ApiKeySecurityCheck.cs @@ -127,7 +127,8 @@ public sealed class ApiKeySecurityCheck : IDoctorCheck .WithRemediation(r => r .AddManualStep(1, "Set minimum length", "Configure ApiKey:MinLength to at least 32") .AddManualStep(2, "Disable query string", "Set ApiKey:AllowInQueryString to false") - .AddManualStep(3, "Enable rate limiting", "Set ApiKey:RateLimitPerKey to true")) + .AddManualStep(3, "Enable rate limiting", "Set ApiKey:RateLimitPerKey to true") + .WithRunbookUrl("docs/doctor/articles/security/security-apikey.md")) .WithVerification("stella doctor --check check.security.apikey") .Build()); } diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.Security/Checks/AuditLoggingCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.Security/Checks/AuditLoggingCheck.cs index b9da8d9db..6ea58c367 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.Security/Checks/AuditLoggingCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.Security/Checks/AuditLoggingCheck.cs @@ -67,7 +67,8 @@ public sealed class AuditLoggingCheck : IDoctorCheck }) .WithCauses("Audit logging disabled in configuration") .WithRemediation(r => r - .AddManualStep(1, "Enable audit logging", "Set Audit:Enabled to true")) + .AddManualStep(1, "Enable audit logging", "Set Audit:Enabled to true") + .WithRunbookUrl("docs/doctor/articles/security/security-audit-logging.md")) .WithVerification("stella doctor --check 
check.security.audit.logging") .Build()); } @@ -108,7 +109,8 @@ public sealed class AuditLoggingCheck : IDoctorCheck .WithRemediation(r => r .AddManualStep(1, "Enable audit logging", "Set Audit:Enabled to true") .AddManualStep(2, "Configure events", "Enable logging for auth, access, and admin events") - .AddManualStep(3, "Set destination", "Configure audit log destination")) + .AddManualStep(3, "Set destination", "Configure audit log destination") + .WithRunbookUrl("docs/doctor/articles/security/security-audit-logging.md")) .WithVerification("stella doctor --check check.security.audit.logging") .Build()); } diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.Security/Checks/CorsConfigurationCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.Security/Checks/CorsConfigurationCheck.cs index 22f44ce1c..3d6d22cf8 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.Security/Checks/CorsConfigurationCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.Security/Checks/CorsConfigurationCheck.cs @@ -101,7 +101,8 @@ public sealed class CorsConfigurationCheck : IDoctorCheck .WithCauses(issues.ToArray()) .WithRemediation(r => r .AddManualStep(1, "Specify origins", "Configure explicit allowed origins in Cors:AllowedOrigins") - .AddManualStep(2, "Use HTTPS", "Ensure all allowed origins use HTTPS")) + .AddManualStep(2, "Use HTTPS", "Ensure all allowed origins use HTTPS") + .WithRunbookUrl("docs/doctor/articles/security/security-cors.md")) .WithVerification("stella doctor --check check.security.cors") .Build()); } diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.Security/Checks/EncryptionKeyCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.Security/Checks/EncryptionKeyCheck.cs index 914a86d31..3fea21326 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.Security/Checks/EncryptionKeyCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.Security/Checks/EncryptionKeyCheck.cs @@ -97,7 +97,8 @@ public sealed class EncryptionKeyCheck : IDoctorCheck 
.WithCauses(issues.ToArray()) .WithRemediation(r => r .AddManualStep(1, "Use strong algorithm", "Configure AES-256 or stronger") - .AddManualStep(2, "Set key rotation", "Configure Encryption:KeyRotationDays")) + .AddManualStep(2, "Set key rotation", "Configure Encryption:KeyRotationDays") + .WithRunbookUrl("docs/doctor/articles/security/security-encryption.md")) .WithVerification("stella doctor --check check.security.encryption") .Build()); } diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.Security/Checks/EvidenceIntegrityCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.Security/Checks/EvidenceIntegrityCheck.cs index aba0c80bf..cd377dba2 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.Security/Checks/EvidenceIntegrityCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.Security/Checks/EvidenceIntegrityCheck.cs @@ -77,7 +77,8 @@ public sealed class EvidenceIntegrityCheck : IDoctorCheck .WithCauses("Evidence locker has not been initialized", "Path is incorrect") .WithRemediation(r => r .AddManualStep(1, "Create directory", $"mkdir -p {evidenceLockerPath}") - .AddManualStep(2, "Check configuration", "Verify EvidenceLocker:LocalPath setting")) + .AddManualStep(2, "Check configuration", "Verify EvidenceLocker:LocalPath setting") + .WithRunbookUrl("docs/doctor/articles/security/security-evidence-integrity.md")) .WithVerification("stella doctor --check check.security.evidence.integrity") .Build(); } @@ -162,7 +163,8 @@ public sealed class EvidenceIntegrityCheck : IDoctorCheck .WithRemediation(r => r .AddManualStep(1, "Review issues", "Examine the invalid files listed above") .AddManualStep(2, "Re-generate evidence", "Re-scan and re-sign affected evidence bundles") - .AddManualStep(3, "Check Rekor", "Verify transparency log entries are valid")) + .AddManualStep(3, "Check Rekor", "Verify transparency log entries are valid") + .WithRunbookUrl("docs/doctor/articles/security/security-evidence-integrity.md")) .WithVerification("stella doctor --check 
check.security.evidence.integrity") .Build(); } diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.Security/Checks/JwtConfigurationCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.Security/Checks/JwtConfigurationCheck.cs index 9206978da..d5e77a4d8 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.Security/Checks/JwtConfigurationCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.Security/Checks/JwtConfigurationCheck.cs @@ -115,7 +115,8 @@ public sealed class JwtConfigurationCheck : IDoctorCheck .WithRemediation(r => r .AddManualStep(1, "Configure JWT settings", "Set Jwt:SigningKey, Jwt:Issuer, and Jwt:Audience") .AddManualStep(2, "Use strong key", "Ensure signing key is at least 32 characters") - .AddManualStep(3, "Consider RS256", "Use asymmetric algorithms for production")) + .AddManualStep(3, "Consider RS256", "Use asymmetric algorithms for production") + .WithRunbookUrl("docs/doctor/articles/security/security-jwt-config.md")) .WithVerification("stella doctor --check check.security.jwt.config") .Build()); } diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.Security/Checks/PasswordPolicyCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.Security/Checks/PasswordPolicyCheck.cs index c7b68c7a5..f788974d1 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.Security/Checks/PasswordPolicyCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.Security/Checks/PasswordPolicyCheck.cs @@ -132,7 +132,8 @@ public sealed class PasswordPolicyCheck : IDoctorCheck .WithCauses(issues.ToArray()) .WithRemediation(r => r .AddManualStep(1, "Increase minimum length", "Set Identity:Password:RequiredLength to at least 12") - .AddManualStep(2, "Enable complexity", "Require digits, uppercase, lowercase, and special characters")) + .AddManualStep(2, "Enable complexity", "Require digits, uppercase, lowercase, and special characters") + .WithRunbookUrl("docs/doctor/articles/security/security-password-policy.md")) .WithVerification("stella doctor --check 
check.security.password.policy") .Build()); } diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.Security/Checks/RateLimitingCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.Security/Checks/RateLimitingCheck.cs index 85b02ea69..2e9d6d8b5 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.Security/Checks/RateLimitingCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.Security/Checks/RateLimitingCheck.cs @@ -63,7 +63,8 @@ public sealed class RateLimitingCheck : IDoctorCheck }) .WithCauses("Rate limiting explicitly disabled in configuration") .WithRemediation(r => r - .AddManualStep(1, "Enable rate limiting", "Set RateLimiting:Enabled to true")) + .AddManualStep(1, "Enable rate limiting", "Set RateLimiting:Enabled to true") + .WithRunbookUrl("docs/doctor/articles/security/security-ratelimit.md")) .WithVerification("stella doctor --check check.security.ratelimit") .Build()); } diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.Security/Checks/SecretsConfigurationCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.Security/Checks/SecretsConfigurationCheck.cs index 577653c2c..fabfe8d24 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.Security/Checks/SecretsConfigurationCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.Security/Checks/SecretsConfigurationCheck.cs @@ -37,10 +37,11 @@ public sealed class SecretsConfigurationCheck : IDoctorCheck var issues = new List(); + // Only check actual secret values (keys, passwords, tokens). + // Connection strings are NOT secrets — they are DSNs that contain host/port/db + // and are expected in configuration for all deployment modes. 
var sensitiveKeys = new[] { - "ConnectionStrings:Default", - "Database:ConnectionString", "Jwt:SigningKey", "Jwt:Secret", "ApiKey", @@ -77,7 +78,8 @@ public sealed class SecretsConfigurationCheck : IDoctorCheck if (!useSecretManager && issues.Count > 0) { - issues.Add("No secrets management provider configured"); + // Only flag missing secrets manager if actual plaintext secrets were found + issues.Add("Consider configuring a secrets management provider (Vault, Azure Key Vault, or dotnet user-secrets)"); } if (issues.Count > 0) @@ -94,7 +96,8 @@ public sealed class SecretsConfigurationCheck : IDoctorCheck .WithRemediation(r => r .AddManualStep(1, "Use secrets manager", "Configure a secrets provider like HashiCorp Vault or Azure Key Vault") .AddManualStep(2, "Use environment variables", "Move secrets to environment variables") - .AddManualStep(3, "Use user secrets", "Use dotnet user-secrets for development")) + .AddManualStep(3, "Use user secrets", "Use dotnet user-secrets for development") + .WithRunbookUrl("docs/doctor/articles/security/security-secrets.md")) .WithVerification("stella doctor --check check.security.secrets") .Build()); } diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.Security/Checks/SecurityHeadersCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.Security/Checks/SecurityHeadersCheck.cs index 5d05714da..005d11bb6 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.Security/Checks/SecurityHeadersCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.Security/Checks/SecurityHeadersCheck.cs @@ -97,7 +97,8 @@ public sealed class SecurityHeadersCheck : IDoctorCheck .WithRemediation(r => r .AddManualStep(1, "Enable HSTS", "Set Security:Headers:Hsts:Enabled to true") .AddManualStep(2, "Set X-Frame-Options", "Configure as DENY or SAMEORIGIN") - .AddManualStep(3, "Configure CSP", "Set a Content-Security-Policy appropriate for your app")) + .AddManualStep(3, "Configure CSP", "Set a Content-Security-Policy appropriate for your app") + 
.WithRunbookUrl("docs/doctor/articles/security/security-headers.md")) .WithVerification("stella doctor --check check.security.headers") .Build()); } diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.Security/Checks/TlsCertificateCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.Security/Checks/TlsCertificateCheck.cs index 42928366b..2a1bb675f 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.Security/Checks/TlsCertificateCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.Security/Checks/TlsCertificateCheck.cs @@ -66,7 +66,8 @@ public sealed class TlsCertificateCheck : IDoctorCheck .WithCauses("Certificate file path is incorrect", "Certificate file was deleted") .WithRemediation(r => r .AddManualStep(1, "Verify path", "Check Tls:CertificatePath configuration") - .AddManualStep(2, "Generate certificate", "Generate or obtain a valid TLS certificate")) + .AddManualStep(2, "Generate certificate", "Generate or obtain a valid TLS certificate") + .WithRunbookUrl("docs/doctor/articles/security/security-tls-certificate.md")) .WithVerification("stella doctor --check check.security.tls.certificate") .Build()); } @@ -112,7 +113,8 @@ public sealed class TlsCertificateCheck : IDoctorCheck .WithCauses("Certificate has exceeded its validity period") .WithRemediation(r => r .AddManualStep(1, "Renew certificate", "Obtain a new TLS certificate") - .AddManualStep(2, "Update configuration", "Update Tls:CertificatePath with new certificate")) + .AddManualStep(2, "Update configuration", "Update Tls:CertificatePath with new certificate") + .WithRunbookUrl("docs/doctor/articles/security/security-tls-certificate.md")) .WithVerification("stella doctor --check check.security.tls.certificate") .Build()); } @@ -130,7 +132,8 @@ public sealed class TlsCertificateCheck : IDoctorCheck }) .WithCauses("Certificate is approaching expiration") .WithRemediation(r => r - .AddManualStep(1, "Plan renewal", "Schedule certificate renewal before expiration")) + .AddManualStep(1, "Plan renewal", 
"Schedule certificate renewal before expiration") + .WithRunbookUrl("docs/doctor/articles/security/security-tls-certificate.md")) .Build()); } diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.ServiceGraph/Checks/BackendConnectivityCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.ServiceGraph/Checks/BackendConnectivityCheck.cs index 1db18383d..b2dd9595d 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.ServiceGraph/Checks/BackendConnectivityCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.ServiceGraph/Checks/BackendConnectivityCheck.cs @@ -123,7 +123,8 @@ public sealed class BackendConnectivityCheck : IDoctorCheck "Authentication/authorization failure") .WithRemediation(r => r .AddManualStep(1, "Check backend logs", "kubectl logs -l app=stellaops-backend") - .AddManualStep(2, "Verify backend health", $"curl -v {healthUrl}")) + .AddManualStep(2, "Verify backend health", $"curl -v {healthUrl}") + .WithRunbookUrl("")) .WithVerification("stella doctor --check check.servicegraph.backend") .Build(); } @@ -149,7 +150,8 @@ public sealed class BackendConnectivityCheck : IDoctorCheck "Firewall blocking connection") .WithRemediation(r => r .AddManualStep(1, "Verify URL", "Check STELLAOPS_BACKEND_URL environment variable") - .AddManualStep(2, "Test connectivity", $"curl -v {backendUrl}/health")) + .AddManualStep(2, "Test connectivity", $"curl -v {backendUrl}/health") + .WithRunbookUrl("")) .WithVerification("stella doctor --check check.servicegraph.backend") .Build(); } diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.ServiceGraph/Checks/CircuitBreakerStatusCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.ServiceGraph/Checks/CircuitBreakerStatusCheck.cs index f2c493808..859c1aefd 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.ServiceGraph/Checks/CircuitBreakerStatusCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.ServiceGraph/Checks/CircuitBreakerStatusCheck.cs @@ -74,7 +74,8 @@ public sealed class CircuitBreakerStatusCheck : 
IDoctorCheck .WithEvidence(evidenceBuilder.Build("Circuit breaker configuration")) .WithCauses("Break duration less than 5 seconds may cause excessive retries") .WithRemediation(r => r - .AddManualStep(1, "Increase break duration", "Set Resilience:CircuitBreaker:BreakDurationSeconds to 30")) + .AddManualStep(1, "Increase break duration", "Set Resilience:CircuitBreaker:BreakDurationSeconds to 30") + .WithRunbookUrl("")) .Build()); } @@ -85,7 +86,8 @@ public sealed class CircuitBreakerStatusCheck : IDoctorCheck .WithEvidence(evidenceBuilder.Build("Circuit breaker configuration")) .WithCauses("Threshold of 1 may cause circuit to open on transient failures") .WithRemediation(r => r - .AddManualStep(1, "Increase threshold", "Set Resilience:CircuitBreaker:FailureThreshold to 5")) + .AddManualStep(1, "Increase threshold", "Set Resilience:CircuitBreaker:FailureThreshold to 5") + .WithRunbookUrl("")) .Build()); } diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.ServiceGraph/Checks/MessageQueueCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.ServiceGraph/Checks/MessageQueueCheck.cs index 274fb46fd..411251e0a 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.ServiceGraph/Checks/MessageQueueCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.ServiceGraph/Checks/MessageQueueCheck.cs @@ -83,7 +83,8 @@ public sealed class MessageQueueCheck : IDoctorCheck .WithRemediation(r => r .AddManualStep(1, "Check RabbitMQ status", "docker ps | grep rabbitmq") .AddManualStep(2, "Check RabbitMQ logs", "docker logs rabbitmq") - .AddManualStep(3, "Start RabbitMQ", "docker-compose up -d rabbitmq")) + .AddManualStep(3, "Start RabbitMQ", "docker-compose up -d rabbitmq") + .WithRunbookUrl("")) .WithVerification("stella doctor --check check.servicegraph.mq") .Build(); } @@ -132,7 +133,8 @@ public sealed class MessageQueueCheck : IDoctorCheck "Network unreachable") .WithRemediation(r => r .AddManualStep(1, "Start RabbitMQ", "docker-compose up -d rabbitmq") - .AddManualStep(2, "Verify 
DNS", $"nslookup {rabbitHost}")) + .AddManualStep(2, "Verify DNS", $"nslookup {rabbitHost}") + .WithRunbookUrl("")) .WithVerification("stella doctor --check check.servicegraph.mq") .Build(); } diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.ServiceGraph/Checks/ServiceEndpointsCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.ServiceGraph/Checks/ServiceEndpointsCheck.cs index 79845517d..c777bb989 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.ServiceGraph/Checks/ServiceEndpointsCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.ServiceGraph/Checks/ServiceEndpointsCheck.cs @@ -114,7 +114,8 @@ public sealed class ServiceEndpointsCheck : IDoctorCheck .WithCauses(failedServices.Select(s => $"{s} service is down or unreachable").ToArray()) .WithRemediation(r => r .AddManualStep(1, "Check service status", "kubectl get pods -l app=stellaops") - .AddManualStep(2, "Check service logs", "kubectl logs -l app=stellaops --tail=100")) + .AddManualStep(2, "Check service logs", "kubectl logs -l app=stellaops --tail=100") + .WithRunbookUrl("")) .WithVerification("stella doctor --check check.servicegraph.endpoints") .Build(); } diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.ServiceGraph/Checks/ServiceTimeoutCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.ServiceGraph/Checks/ServiceTimeoutCheck.cs index f55a0d142..8ce2b65d0 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.ServiceGraph/Checks/ServiceTimeoutCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.ServiceGraph/Checks/ServiceTimeoutCheck.cs @@ -90,7 +90,8 @@ public sealed class ServiceTimeoutCheck : IDoctorCheck .WithEvidence(evidenceBuilder.Build("Timeout configuration")) .WithCauses(issues.ToArray()) .WithRemediation(r => r - .AddManualStep(1, "Review timeout values", "Check configuration and adjust timeouts based on expected service latencies")) + .AddManualStep(1, "Review timeout values", "Check configuration and adjust timeouts based on expected service latencies") + 
.WithRunbookUrl("")) .WithVerification("stella doctor --check check.servicegraph.timeouts") .Build()); } diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.ServiceGraph/Checks/ValkeyConnectivityCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.ServiceGraph/Checks/ValkeyConnectivityCheck.cs index 85b7112fe..8e3c525c3 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.ServiceGraph/Checks/ValkeyConnectivityCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.ServiceGraph/Checks/ValkeyConnectivityCheck.cs @@ -68,7 +68,8 @@ public sealed class ValkeyConnectivityCheck : IDoctorCheck .WithEvidence("Configuration", e => e.Add("ConnectionString", RedactConnectionString(connectionString))) .WithCauses("Connection string format is invalid") .WithRemediation(r => r - .AddManualStep(1, "Fix connection string", "Use format: host:port or host:port,password=xxx")) + .AddManualStep(1, "Fix connection string", "Use format: host:port or host:port,password=xxx") + .WithRunbookUrl("")) .Build(); } @@ -96,7 +97,8 @@ public sealed class ValkeyConnectivityCheck : IDoctorCheck "Firewall blocking port " + port) .WithRemediation(r => r .AddManualStep(1, "Check Valkey status", "docker ps | grep valkey") - .AddManualStep(2, "Test port connectivity", $"nc -zv {host} {port}")) + .AddManualStep(2, "Test port connectivity", $"nc -zv {host} {port}") + .WithRunbookUrl("")) .WithVerification("stella doctor --check check.servicegraph.valkey") .Build(); } @@ -148,7 +150,8 @@ public sealed class ValkeyConnectivityCheck : IDoctorCheck "Network unreachable") .WithRemediation(r => r .AddManualStep(1, "Start Valkey", "docker-compose up -d valkey") - .AddManualStep(2, "Check DNS", $"nslookup {host}")) + .AddManualStep(2, "Check DNS", $"nslookup {host}") + .WithRunbookUrl("")) .WithVerification("stella doctor --check check.servicegraph.valkey") .Build(); } diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.Sources/Checks/MirrorServerAuthCheck.cs 
b/src/__Libraries/StellaOps.Doctor.Plugins.Sources/Checks/MirrorServerAuthCheck.cs index 5d1f6ebff..b3713ddce 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.Sources/Checks/MirrorServerAuthCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.Sources/Checks/MirrorServerAuthCheck.cs @@ -94,7 +94,8 @@ public sealed class MirrorServerAuthCheck : IDoctorCheck "Missing required OAuth issuer configuration") .WithRemediation(r => r .AddManualStep(1, "Configure OAuth settings", "Add 'sources:mirrorServer:oauth' section with issuer URL") - .AddShellStep(2, "Run setup wizard", "stella setup --step sources")) + .AddShellStep(2, "Run setup wizard", "stella setup --step sources") + .WithRunbookUrl("")) .WithVerification("stella doctor --check check.sources.mirror.auth") .Build()); } @@ -112,7 +113,8 @@ public sealed class MirrorServerAuthCheck : IDoctorCheck .WithCauses("OAuth Issuer URL not configured") .WithRemediation(r => r .AddManualStep(1, "Configure OAuth issuer", "Set 'sources:mirrorServer:oauth:issuer' to your OIDC provider URL") - .AddShellStep(2, "Verify issuer metadata", "curl -s {issuer}/.well-known/openid-configuration")) + .AddShellStep(2, "Verify issuer metadata", "curl -s {issuer}/.well-known/openid-configuration") + .WithRunbookUrl("")) .WithVerification("stella doctor --check check.sources.mirror.auth") .Build()); } diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.Sources/Checks/MirrorServerRateLimitCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.Sources/Checks/MirrorServerRateLimitCheck.cs index 3c702db08..4a905b278 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.Sources/Checks/MirrorServerRateLimitCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.Sources/Checks/MirrorServerRateLimitCheck.cs @@ -133,7 +133,8 @@ public sealed class MirrorServerRateLimitCheck : IDoctorCheck .WithCauses(warnings.ToArray()) .WithRemediation(r => r .AddManualStep(1, "Review rate limit configuration", "Check sources:mirrorServer:rateLimits in 
configuration") - .AddManualStep(2, "Set appropriate limits", "Configure MaxRequests and PerSeconds for your expected traffic")) + .AddManualStep(2, "Set appropriate limits", "Configure MaxRequests and PerSeconds for your expected traffic") + .WithRunbookUrl("")) .WithVerification("stella doctor --check check.sources.mirror.ratelimit") .Build()); } diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.Sources/Checks/SourceConnectivityCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.Sources/Checks/SourceConnectivityCheck.cs index 3849c50a4..fe7a4b1e0 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.Sources/Checks/SourceConnectivityCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.Sources/Checks/SourceConnectivityCheck.cs @@ -121,7 +121,8 @@ public sealed class SourceConnectivityCheck : IDoctorCheck } }) .WithCauses(checkResult.PossibleReasons.ToArray()) - .WithRemediation(r => BuildRemediation(r, checkResult)) + .WithRemediation(r => { BuildRemediation(r, checkResult); /* add the steps first, then attach the runbook link to the builder (not to checkResult) */ + r.WithRunbookUrl(""); }) .WithVerification($"stella doctor --check {CheckId}") .Build(); } @@ -148,7 +149,8 @@ public sealed class SourceConnectivityCheck : IDoctorCheck } }) .WithCauses(checkResult.PossibleReasons.ToArray()) - .WithRemediation(r => BuildRemediation(r, checkResult)) + .WithRemediation(r => { BuildRemediation(r, checkResult); /* add the steps first, then attach the runbook link to the builder (not to checkResult) */ + r.WithRunbookUrl(""); }) .WithVerification($"stella doctor --check {CheckId}") .Build(); } diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.Sources/Checks/SourceModeConfiguredCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.Sources/Checks/SourceModeConfiguredCheck.cs index 85888ed77..856f9b46e 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.Sources/Checks/SourceModeConfiguredCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.Sources/Checks/SourceModeConfiguredCheck.cs @@ -61,7 +61,8 @@ public sealed class SourceModeConfiguredCheck : IDoctorCheck "Configuration not loaded properly") .WithRemediation(r => r .AddManualStep(1, "Add sources section
to configuration", "Add 'sources:' section to appsettings.json or environment-specific config") - .AddShellStep(2, "Run setup wizard", "stella setup --step sources")) + .AddShellStep(2, "Run setup wizard", "stella setup --step sources") + .WithRunbookUrl("")) .WithVerification("stella doctor --check check.sources.mode.configured") .Build()); } @@ -87,7 +88,8 @@ public sealed class SourceModeConfiguredCheck : IDoctorCheck "Mirror server URL not specified") .WithRemediation(r => r .AddManualStep(1, "Configure mirror server", "Add 'sources:mirrorServer' section with URL and authentication settings") - .AddShellStep(2, "Run setup wizard", "stella setup --step sources")) + .AddShellStep(2, "Run setup wizard", "stella setup --step sources") + .WithRunbookUrl("")) .WithVerification("stella doctor --check check.sources.mode.configured") .Build()); } diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.Verification/Checks/PolicyEngineCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.Verification/Checks/PolicyEngineCheck.cs index 16d45dd2d..1bf146823 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.Verification/Checks/PolicyEngineCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.Verification/Checks/PolicyEngineCheck.cs @@ -74,7 +74,8 @@ public sealed class PolicyEngineCheck : VerificationCheckBase .Add("BundlePath", bundlePath) .Add("FileExists", "false")) .WithRemediation(r => r - .AddShellStep(1, "Export bundle", "stella verification bundle export --include-policy --output " + bundlePath)) + .AddShellStep(1, "Export bundle", "stella verification bundle export --include-policy --output " + bundlePath) + .WithRunbookUrl("")) .WithVerification($"stella doctor --check check.verification.policy.engine") .Build()); } @@ -101,7 +102,8 @@ public sealed class PolicyEngineCheck : VerificationCheckBase "Bundle was exported without policy results", "Policy evaluation not run before export") .WithRemediation(r => r - .AddShellStep(1, "Re-export with policy", "stella 
verification bundle export --include-policy --output " + bundlePath)) + .AddShellStep(1, "Re-export with policy", "stella verification bundle export --include-policy --output " + bundlePath) + .WithRunbookUrl("")) .WithVerification($"stella doctor --check check.verification.policy.engine") .Build()); } @@ -158,7 +160,8 @@ public sealed class PolicyEngineCheck : VerificationCheckBase .WithCauses("Policy engine not configured or disabled") .WithRemediation(r => r .AddManualStep(1, "Enable policy engine", "Set Policy:Engine:Enabled to true") - .AddManualStep(2, "Configure default policy", "Set Policy:DefaultPolicyRef to a policy reference")) + .AddManualStep(2, "Configure default policy", "Set Policy:DefaultPolicyRef to a policy reference") + .WithRunbookUrl("")) .WithVerification($"stella doctor --check check.verification.policy.engine") .Build()); } @@ -176,7 +179,8 @@ public sealed class PolicyEngineCheck : VerificationCheckBase .WithCauses("No test policy reference configured") .WithRemediation(r => r .AddManualStep(1, "Configure test policy", "Set Doctor:Plugins:Verification:PolicyTest:PolicyRef") - .AddManualStep(2, "Or set default", "Set Policy:DefaultPolicyRef for a default policy")) + .AddManualStep(2, "Or set default", "Set Policy:DefaultPolicyRef for a default policy") + .WithRunbookUrl("")) .WithVerification($"stella doctor --check check.verification.policy.engine") .Build()); } @@ -198,7 +202,8 @@ public sealed class PolicyEngineCheck : VerificationCheckBase .WithCauses("Policy may not consider VEX statements when evaluating vulnerabilities") .WithRemediation(r => r .AddManualStep(1, "Enable VEX in policy", "Set Policy:VexAware to true") - .AddManualStep(2, "Update policy rules", "Ensure policy considers VEX justifications for vulnerabilities")) + .AddManualStep(2, "Update policy rules", "Ensure policy considers VEX justifications for vulnerabilities") + .WithRunbookUrl("")) .WithVerification($"stella doctor --check check.verification.policy.engine") 
.Build()); } diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.Verification/Checks/SbomValidationCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.Verification/Checks/SbomValidationCheck.cs index b681a2b37..3ccb15a2c 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.Verification/Checks/SbomValidationCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.Verification/Checks/SbomValidationCheck.cs @@ -76,7 +76,8 @@ public sealed class SbomValidationCheck : VerificationCheckBase .Add("BundlePath", bundlePath) .Add("FileExists", "false")) .WithRemediation(r => r - .AddShellStep(1, "Export bundle", "stella verification bundle export --include-sbom --output " + bundlePath)) + .AddShellStep(1, "Export bundle", "stella verification bundle export --include-sbom --output " + bundlePath) + .WithRunbookUrl("")) .WithVerification($"stella doctor --check check.verification.sbom.validation") .Build()); } @@ -101,7 +102,8 @@ public sealed class SbomValidationCheck : VerificationCheckBase "Test artifact has no SBOM attached") .WithRemediation(r => r .AddShellStep(1, "Re-export with SBOM", "stella verification bundle export --include-sbom --output " + bundlePath) - .AddManualStep(2, "Generate SBOM", "Enable SBOM generation in your build pipeline")) + .AddManualStep(2, "Generate SBOM", "Enable SBOM generation in your build pipeline") + .WithRunbookUrl("")) .WithVerification($"stella doctor --check check.verification.sbom.validation") .Build()); } @@ -157,7 +159,8 @@ public sealed class SbomValidationCheck : VerificationCheckBase "SBOM attestation not configured") .WithRemediation(r => r .AddManualStep(1, "Enable SBOM generation", "Set Scanner:SbomGeneration:Enabled to true") - .AddManualStep(2, "Enable SBOM attestation", "Set Attestor:SbomAttestation:Enabled to true")) + .AddManualStep(2, "Enable SBOM attestation", "Set Attestor:SbomAttestation:Enabled to true") + .WithRunbookUrl("")) .WithVerification($"stella doctor --check check.verification.sbom.validation") .Build()); 
} diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.Verification/Checks/SignatureVerificationCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.Verification/Checks/SignatureVerificationCheck.cs index 4c60c9998..5c361cdd7 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.Verification/Checks/SignatureVerificationCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.Verification/Checks/SignatureVerificationCheck.cs @@ -75,7 +75,8 @@ public sealed class SignatureVerificationCheck : VerificationCheckBase .Add("BundlePath", bundlePath) .Add("FileExists", "false")) .WithRemediation(r => r - .AddShellStep(1, "Export bundle", "stella verification bundle export --output " + bundlePath)) + .AddShellStep(1, "Export bundle", "stella verification bundle export --output " + bundlePath) + .WithRunbookUrl("")) .WithVerification($"stella doctor --check check.verification.signature") .Build()); } @@ -102,7 +103,8 @@ public sealed class SignatureVerificationCheck : VerificationCheckBase .Add("SignatureDataFound", "false") .Add("Note", "Bundle should contain DSSE signatures for verification")) .WithRemediation(r => r - .AddShellStep(1, "Re-export with signatures", "stella verification bundle export --include-signatures --output " + bundlePath)) + .AddShellStep(1, "Re-export with signatures", "stella verification bundle export --include-signatures --output " + bundlePath) + .WithRunbookUrl("")) .WithVerification($"stella doctor --check check.verification.signature") .Build()); } @@ -154,7 +156,8 @@ public sealed class SignatureVerificationCheck : VerificationCheckBase .Add("Note", "Enable Sigstore to verify artifact signatures")) .WithRemediation(r => r .AddManualStep(1, "Enable Sigstore", "Set Sigstore:Enabled to true") - .AddManualStep(2, "Configure signing", "Set up signing keys or keyless mode")) + .AddManualStep(2, "Configure signing", "Set up signing keys or keyless mode") + .WithRunbookUrl("")) .Build(); } @@ -180,7 +183,8 @@ public sealed class 
SignatureVerificationCheck : VerificationCheckBase "Network connectivity issue") .WithRemediation(r => r .AddShellStep(1, "Test Rekor", $"curl -I {rekorHealthUrl}") - .AddManualStep(2, "Or use offline mode", "Configure offline verification bundle")) + .AddManualStep(2, "Or use offline mode", "Configure offline verification bundle") + .WithRunbookUrl("")) .WithVerification($"stella doctor --check check.verification.signature") .Build(); } @@ -208,7 +212,8 @@ public sealed class SignatureVerificationCheck : VerificationCheckBase .WithCauses("Network connectivity issue") .WithRemediation(r => r .AddManualStep(1, "Check network", "Verify connectivity to Rekor") - .AddManualStep(2, "Use offline mode", "Configure offline verification bundle")) + .AddManualStep(2, "Use offline mode", "Configure offline verification bundle") + .WithRunbookUrl("")) .WithVerification($"stella doctor --check check.verification.signature") .Build(); } diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.Verification/Checks/TestArtifactPullCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.Verification/Checks/TestArtifactPullCheck.cs index 8bd64a1dc..90b9370e3 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.Verification/Checks/TestArtifactPullCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.Verification/Checks/TestArtifactPullCheck.cs @@ -78,7 +78,8 @@ public sealed class TestArtifactPullCheck : VerificationCheckBase "Path is incorrect") .WithRemediation(r => r .AddShellStep(1, "Verify file exists", $"ls -la {bundlePath}") - .AddShellStep(2, "Export bundle from online system", "stella verification bundle export --output " + bundlePath)) + .AddShellStep(2, "Export bundle from online system", "stella verification bundle export --output " + bundlePath) + .WithRunbookUrl("")) .WithVerification($"stella doctor --check check.verification.artifact.pull") .Build()); } @@ -113,7 +114,8 @@ public sealed class TestArtifactPullCheck : VerificationCheckBase .Add("Error", "Could not parse 
registry and repository")) .WithCauses("Reference format is incorrect") .WithRemediation(r => r - .AddManualStep(1, "Fix reference format", "Use format: oci://registry/repository@sha256:digest or registry/repository@sha256:digest")) + .AddManualStep(1, "Fix reference format", "Use format: oci://registry/repository@sha256:digest or registry/repository@sha256:digest") + .WithRunbookUrl("")) .WithVerification($"stella doctor --check check.verification.artifact.pull") .Build(); } @@ -151,7 +153,8 @@ public sealed class TestArtifactPullCheck : VerificationCheckBase .WithRemediation(r => r .AddShellStep(1, "Test with crane", $"crane manifest {reference}") .AddManualStep(2, "Check registry credentials", "Ensure registry credentials are configured") - .AddManualStep(3, "Verify artifact exists", "Confirm the test artifact has been pushed to the registry")) + .AddManualStep(3, "Verify artifact exists", "Confirm the test artifact has been pushed to the registry") + .WithRunbookUrl("")) .WithVerification($"stella doctor --check check.verification.artifact.pull") .Build(); } @@ -178,7 +181,8 @@ public sealed class TestArtifactPullCheck : VerificationCheckBase "Wrong artifact tag being pulled") .WithRemediation(r => r .AddManualStep(1, "Update expected digest", $"Set Doctor:Plugins:Verification:TestArtifact:ExpectedDigest to {responseDigest}") - .AddManualStep(2, "Or use digest in reference", "Use @sha256:... in the reference instead of :tag")) + .AddManualStep(2, "Or use digest in reference", "Use @sha256:... 
in the reference instead of :tag") + .WithRunbookUrl("")) .WithVerification($"stella doctor --check check.verification.artifact.pull") .Build(); } @@ -208,7 +212,8 @@ public sealed class TestArtifactPullCheck : VerificationCheckBase "DNS resolution failure") .WithRemediation(r => r .AddShellStep(1, "Test registry connectivity", $"curl -I https://{registry}/v2/") - .AddManualStep(2, "Check network configuration", "Ensure HTTPS traffic to the registry is allowed")) + .AddManualStep(2, "Check network configuration", "Ensure HTTPS traffic to the registry is allowed") + .WithRunbookUrl("")) .WithVerification($"stella doctor --check check.verification.artifact.pull") .Build(); } diff --git a/src/__Libraries/StellaOps.Doctor.Plugins.Verification/Checks/VexValidationCheck.cs b/src/__Libraries/StellaOps.Doctor.Plugins.Verification/Checks/VexValidationCheck.cs index 15c2ecf24..ef1d53d96 100644 --- a/src/__Libraries/StellaOps.Doctor.Plugins.Verification/Checks/VexValidationCheck.cs +++ b/src/__Libraries/StellaOps.Doctor.Plugins.Verification/Checks/VexValidationCheck.cs @@ -76,7 +76,8 @@ public sealed class VexValidationCheck : VerificationCheckBase .Add("BundlePath", bundlePath) .Add("FileExists", "false")) .WithRemediation(r => r - .AddShellStep(1, "Export bundle", "stella verification bundle export --include-vex --output " + bundlePath)) + .AddShellStep(1, "Export bundle", "stella verification bundle export --include-vex --output " + bundlePath) + .WithRunbookUrl("")) .WithVerification($"stella doctor --check check.verification.vex.validation") .Build()); } @@ -103,7 +104,8 @@ public sealed class VexValidationCheck : VerificationCheckBase "Test artifact has no known vulnerabilities") .WithRemediation(r => r .AddShellStep(1, "Re-export with VEX", "stella verification bundle export --include-vex --output " + bundlePath) - .AddManualStep(2, "This may be expected", "VEX documents are only needed when vulnerabilities exist")) + .AddManualStep(2, "This may be expected", "VEX 
documents are only needed when vulnerabilities exist") + .WithRunbookUrl("")) .WithVerification($"stella doctor --check check.verification.vex.validation") .Build()); } @@ -154,7 +156,8 @@ public sealed class VexValidationCheck : VerificationCheckBase .Add("Note", "VEX collection is optional but recommended for vulnerability context")) .WithRemediation(r => r .AddManualStep(1, "Enable VEX collection", "Set VexHub:Collection:Enabled to true") - .AddManualStep(2, "Configure VEX feeds", "Add vendor VEX feeds to VexHub:Feeds")) + .AddManualStep(2, "Configure VEX feeds", "Add vendor VEX feeds to VexHub:Feeds") + .WithRunbookUrl("")) .Build()); } @@ -170,7 +173,8 @@ public sealed class VexValidationCheck : VerificationCheckBase .Add("Note", "VEX feeds provide vendor vulnerability context")) .WithCauses("No VEX feed URLs configured") .WithRemediation(r => r - .AddManualStep(1, "Configure VEX feeds", "Add vendor VEX feeds to VexHub:Feeds array")) + .AddManualStep(1, "Configure VEX feeds", "Add vendor VEX feeds to VexHub:Feeds array") + .WithRunbookUrl("")) .WithVerification($"stella doctor --check check.verification.vex.validation") .Build()); }