add release orchestrator docs and sprints gaps fills
This commit is contained in:
246
docs/modules/release-orchestrator/operations/alerting.md
Normal file
246
docs/modules/release-orchestrator/operations/alerting.md
Normal file
@@ -0,0 +1,246 @@
|
||||
# Alerting Rules
|
||||
|
||||
> Prometheus alerting rules for the Release Orchestrator.
|
||||
|
||||
**Status:** Planned (not yet implemented)
|
||||
**Source:** [Architecture Advisory Section 13.4](../../../product/advisories/09-Jan-2026%20-%20Stella%20Ops%20Orchestrator%20Architecture.md)
|
||||
**Related Modules:** [Metrics](metrics.md), [Observability Overview](overview.md)
|
||||
|
||||
## Overview
|
||||
|
||||
The Release Orchestrator provides Prometheus alerting rules for monitoring promotions, deployments, agents, and integrations.
|
||||
|
||||
---
|
||||
|
||||
## High Priority Alerts
|
||||
|
||||
### Security Gate Block Rate
|
||||
|
||||
```yaml
|
||||
- alert: PromotionGateBlockRate
|
||||
expr: |
|
||||
sum(rate(stella_security_gate_results_total{result="blocked"}[1h])) /
|
||||
sum(rate(stella_security_gate_results_total[1h])) > 0.5
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High rate of security gate blocks"
|
||||
description: "More than 50% of promotions are being blocked by security gates"
|
||||
```
|
||||
|
||||
### Deployment Failure Rate
|
||||
|
||||
```yaml
|
||||
- alert: DeploymentFailureRate
|
||||
expr: |
|
||||
sum(rate(stella_deployments_total{status="failed"}[1h])) /
|
||||
sum(rate(stella_deployments_total[1h])) > 0.1
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "High deployment failure rate"
|
||||
description: "More than 10% of deployments are failing"
|
||||
```
|
||||
|
||||
### Agent Offline
|
||||
|
||||
```yaml
|
||||
- alert: AgentOffline
|
||||
expr: |
|
||||
stella_agents_status{status="offline"} == 1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Agent offline"
|
||||
description: "Agent {{ $labels.agent_id }} has been offline for 5 minutes"
|
||||
```
|
||||
|
||||
### Promotion Stuck
|
||||
|
||||
```yaml
|
||||
- alert: PromotionStuck
|
||||
expr: |
|
||||
time() - stella_promotion_start_time{status="deploying"} > 1800
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Promotion stuck in deploying state"
|
||||
description: "Promotion {{ $labels.promotion_id }} has been deploying for more than 30 minutes"
|
||||
```
|
||||
|
||||
### Integration Unhealthy
|
||||
|
||||
```yaml
|
||||
- alert: IntegrationUnhealthy
|
||||
expr: |
|
||||
stella_integration_health{status="unhealthy"} == 1
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Integration unhealthy"
|
||||
description: "Integration {{ $labels.integration_name }} has been unhealthy for 10 minutes"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Medium Priority Alerts
|
||||
|
||||
### Workflow Step Timeout
|
||||
|
||||
```yaml
|
||||
- alert: WorkflowStepTimeout
|
||||
expr: |
|
||||
stella_workflow_step_duration_seconds > 600
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Workflow step taking too long"
|
||||
description: "Step {{ $labels.step_type }} in workflow {{ $labels.workflow_run_id }} has been running for more than 10 minutes"
|
||||
```
|
||||
|
||||
### Evidence Generation Failure
|
||||
|
||||
```yaml
|
||||
- alert: EvidenceGenerationFailure
|
||||
expr: |
|
||||
rate(stella_evidence_generation_failures_total[1h]) > 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Evidence generation failures"
|
||||
description: "Evidence generation is failing, affecting audit compliance"
|
||||
```
|
||||
|
||||
### Target Health Degraded
|
||||
|
||||
```yaml
|
||||
- alert: TargetHealthDegraded
|
||||
expr: |
|
||||
stella_target_health{status!="healthy"} == 1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Target health degraded"
|
||||
description: "Target {{ $labels.target_name }} is reporting {{ $labels.status }}"
|
||||
```
|
||||
|
||||
### Approval Timeout
|
||||
|
||||
```yaml
|
||||
- alert: ApprovalTimeout
|
||||
expr: |
|
||||
time() - stella_promotion_approval_requested_time > 86400
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Promotion awaiting approval for too long"
|
||||
description: "Promotion {{ $labels.promotion_id }} has been waiting for approval for more than 24 hours"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Low Priority Alerts
|
||||
|
||||
### Database Connection Pool
|
||||
|
||||
```yaml
|
||||
- alert: DatabaseConnectionPoolExhausted
|
||||
expr: |
|
||||
stella_db_connection_pool_available < 5
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Database connection pool running low"
|
||||
description: "Only {{ $value }} database connections available"
|
||||
```
|
||||
|
||||
### Plugin Error Rate
|
||||
|
||||
```yaml
|
||||
- alert: PluginErrorRate
|
||||
expr: |
|
||||
rate(stella_plugin_errors_total[5m]) > 1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Plugin errors detected"
|
||||
description: "Plugin {{ $labels.plugin_id }} is experiencing errors"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Alert Routing
|
||||
|
||||
### Example AlertManager Configuration
|
||||
|
||||
```yaml
|
||||
# alertmanager.yaml
|
||||
route:
|
||||
receiver: default
|
||||
group_by: [alertname, severity]
|
||||
group_wait: 30s
|
||||
group_interval: 5m
|
||||
repeat_interval: 4h
|
||||
|
||||
routes:
|
||||
- match:
|
||||
severity: critical
|
||||
receiver: pagerduty
|
||||
continue: true
|
||||
|
||||
- match:
|
||||
severity: warning
|
||||
receiver: slack
|
||||
|
||||
receivers:
|
||||
- name: default
|
||||
webhook_configs:
|
||||
- url: http://webhook.example.com/alerts
|
||||
|
||||
- name: pagerduty
|
||||
pagerduty_configs:
|
||||
- service_key: ${PAGERDUTY_KEY}
|
||||
severity: critical
|
||||
|
||||
- name: slack
|
||||
slack_configs:
|
||||
- channel: '#alerts'
|
||||
api_url: ${SLACK_WEBHOOK_URL}
|
||||
title: '{{ .CommonAnnotations.summary }}'
|
||||
text: '{{ .CommonAnnotations.description }}'
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Dashboard Integration
|
||||
|
||||
### Grafana Alert Panels
|
||||
|
||||
Recommended dashboard panels for alerts:
|
||||
|
||||
| Panel | Query |
|
||||
|-------|-------|
|
||||
| Active Alerts | `count(ALERTS{alertstate="firing"})` |
|
||||
| Alert History | `count_over_time(ALERTS{alertstate="firing"}[24h])` |
|
||||
| By Severity | `count(ALERTS{alertstate="firing"}) by (severity)` |
|
||||
| By Alert Name | `count(ALERTS{alertstate="firing"}) by (alertname)` |
|
||||
|
||||
---
|
||||
|
||||
## See Also
|
||||
|
||||
- [Metrics](metrics.md)
|
||||
- [Observability Overview](overview.md)
|
||||
- [Logging](logging.md)
|
||||
- [Tracing](tracing.md)
|
||||
220
docs/modules/release-orchestrator/operations/logging.md
Normal file
220
docs/modules/release-orchestrator/operations/logging.md
Normal file
@@ -0,0 +1,220 @@
|
||||
# Logging Specification
|
||||
|
||||
> Structured logging format and categories for the Release Orchestrator.
|
||||
|
||||
**Status:** Planned (not yet implemented)
|
||||
**Source:** [Architecture Advisory Section 13.2](../../../product/advisories/09-Jan-2026%20-%20Stella%20Ops%20Orchestrator%20Architecture.md)
|
||||
**Related Modules:** [Observability Overview](overview.md), [Tracing](tracing.md)
|
||||
|
||||
## Overview
|
||||
|
||||
The Release Orchestrator uses structured JSON logging with consistent format, correlation IDs, and context propagation for all components.
|
||||
|
||||
---
|
||||
|
||||
## Structured Log Format
|
||||
|
||||
### JSON Schema
|
||||
|
||||
```json
|
||||
{
|
||||
"timestamp": "2026-01-09T14:32:15.123Z",
|
||||
"level": "info",
|
||||
"module": "promotion-manager",
|
||||
"message": "Promotion approved",
|
||||
"context": {
|
||||
"tenant_id": "uuid",
|
||||
"promotion_id": "uuid",
|
||||
"release_id": "uuid",
|
||||
"environment": "prod",
|
||||
"user_id": "uuid"
|
||||
},
|
||||
"details": {
|
||||
"approvals_count": 2,
|
||||
"gates_passed": ["security", "approval", "freeze"],
|
||||
"decision": "allow"
|
||||
},
|
||||
"trace_id": "abc123",
|
||||
"span_id": "def456",
|
||||
"duration_ms": 45
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Log Levels
|
||||
|
||||
| Level | Usage |
|
||||
|-------|-------|
|
||||
| `error` | Errors requiring attention; failures that impact functionality |
|
||||
| `warn` | Potential issues; degraded functionality; approaching limits |
|
||||
| `info` | Significant events; state changes; audit-relevant actions |
|
||||
| `debug` | Detailed debugging info; request/response bodies |
|
||||
| `trace` | Very detailed tracing; internal state; performance profiling |
|
||||
|
||||
---
|
||||
|
||||
## Log Categories
|
||||
|
||||
| Category | Examples |
|
||||
|----------|----------|
|
||||
| `api` | Request received, response sent, validation errors |
|
||||
| `promotion` | Promotion requested, approved, rejected, completed |
|
||||
| `deployment` | Deployment started, task assigned, completed, failed |
|
||||
| `security` | Gate evaluation, vulnerability found, policy violation |
|
||||
| `agent` | Agent registered, heartbeat, task execution |
|
||||
| `workflow` | Workflow started, step executed, completed |
|
||||
| `integration` | Integration tested, resource discovered, webhook received |
|
||||
|
||||
---
|
||||
|
||||
## Logging Examples
|
||||
|
||||
### API Request
|
||||
|
||||
```json
|
||||
{
|
||||
"timestamp": "2026-01-09T14:32:15.123Z",
|
||||
"level": "info",
|
||||
"module": "api",
|
||||
"message": "Request completed",
|
||||
"context": {
|
||||
"tenant_id": "uuid",
|
||||
"user_id": "uuid"
|
||||
},
|
||||
"details": {
|
||||
"method": "POST",
|
||||
"path": "/api/v1/promotions",
|
||||
"status": 201,
|
||||
"duration_ms": 125
|
||||
},
|
||||
"trace_id": "abc123",
|
||||
"span_id": "def456"
|
||||
}
|
||||
```
|
||||
|
||||
### Promotion Event
|
||||
|
||||
```json
|
||||
{
|
||||
"timestamp": "2026-01-09T14:32:15.123Z",
|
||||
"level": "info",
|
||||
"module": "promotion-manager",
|
||||
"message": "Promotion approved",
|
||||
"context": {
|
||||
"tenant_id": "uuid",
|
||||
"promotion_id": "uuid",
|
||||
"release_id": "uuid",
|
||||
"environment": "prod",
|
||||
"user_id": "uuid"
|
||||
},
|
||||
"details": {
|
||||
"approvals_count": 2,
|
||||
"gates_passed": ["security", "approval", "freeze"],
|
||||
"decision": "allow"
|
||||
},
|
||||
"trace_id": "abc123",
|
||||
"span_id": "def456",
|
||||
"duration_ms": 45
|
||||
}
|
||||
```
|
||||
|
||||
### Security Gate Failure
|
||||
|
||||
```json
|
||||
{
|
||||
"timestamp": "2026-01-09T14:32:15.123Z",
|
||||
"level": "warn",
|
||||
"module": "security",
|
||||
"message": "Security gate blocked promotion",
|
||||
"context": {
|
||||
"tenant_id": "uuid",
|
||||
"promotion_id": "uuid",
|
||||
"release_id": "uuid",
|
||||
"environment": "prod"
|
||||
},
|
||||
"details": {
|
||||
"gate_name": "security-gate",
|
||||
"reason": "Critical vulnerability found",
|
||||
"vulnerabilities": {
|
||||
"critical": 1,
|
||||
"high": 3
|
||||
}
|
||||
},
|
||||
"trace_id": "abc123",
|
||||
"span_id": "def456"
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Sensitive Data Masking
|
||||
|
||||
The following fields are automatically masked in logs:
|
||||
|
||||
| Field Type | Masking Strategy |
|
||||
|------------|------------------|
|
||||
| Passwords | Not logged |
|
||||
| API Keys | First 4 and last 4 chars only |
|
||||
| Tokens | Hash only |
|
||||
| PII | Redacted |
|
||||
| Credentials | Not logged |
|
||||
|
||||
### Example
|
||||
|
||||
```json
|
||||
{
|
||||
"message": "Authentication succeeded",
|
||||
"details": {
|
||||
"api_key": "sk_l...abcd",
|
||||
"token_hash": "sha256:abc123..."
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Correlation IDs
|
||||
|
||||
All logs include correlation IDs for request tracing:
|
||||
|
||||
| Field | Description |
|
||||
|-------|-------------|
|
||||
| `trace_id` | W3C Trace Context trace ID |
|
||||
| `span_id` | Current operation span ID |
|
||||
| `correlation_id` | Business-level correlation (optional) |
|
||||
|
||||
---
|
||||
|
||||
## Log Aggregation
|
||||
|
||||
Recommended log aggregation setup:
|
||||
|
||||
```ini
|
||||
# Fluent Bit configuration
|
||||
[INPUT]
|
||||
Name tail
|
||||
Path /var/log/stella/*.log
|
||||
Parser json
|
||||
|
||||
[FILTER]
|
||||
Name nest
|
||||
Match *
|
||||
Operation lift
|
||||
Nested_under context
|
||||
|
||||
[OUTPUT]
|
||||
Name opensearch
|
||||
Match *
|
||||
Host opensearch.example.com
|
||||
Index stella-logs
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## See Also
|
||||
|
||||
- [Observability Overview](overview.md)
|
||||
- [Tracing](tracing.md)
|
||||
- [Alerting](alerting.md)
|
||||
- [Security Overview](../security/overview.md)
|
||||
222
docs/modules/release-orchestrator/operations/tracing.md
Normal file
222
docs/modules/release-orchestrator/operations/tracing.md
Normal file
@@ -0,0 +1,222 @@
|
||||
# Distributed Tracing Specification
|
||||
|
||||
> OpenTelemetry-based distributed tracing for the Release Orchestrator.
|
||||
|
||||
**Status:** Planned (not yet implemented)
|
||||
**Source:** [Architecture Advisory Section 13.3](../../../product/advisories/09-Jan-2026%20-%20Stella%20Ops%20Orchestrator%20Architecture.md)
|
||||
**Related Modules:** [Observability Overview](overview.md), [Logging](logging.md)
|
||||
|
||||
## Overview
|
||||
|
||||
The Release Orchestrator uses OpenTelemetry for distributed tracing, enabling end-to-end visibility of promotion workflows, deployments, and agent tasks.
|
||||
|
||||
---
|
||||
|
||||
## Trace Context Propagation
|
||||
|
||||
### W3C Trace Context
|
||||
|
||||
```typescript
|
||||
// Trace context structure
|
||||
interface TraceContext {
|
||||
traceId: string; // 32-char hex
|
||||
spanId: string; // 16-char hex
|
||||
parentSpanId?: string;
|
||||
sampled: boolean;
|
||||
baggage: Record<string, string>;
|
||||
}
|
||||
|
||||
// Propagation headers
|
||||
const TRACE_HEADERS = {
|
||||
W3C_TRACEPARENT: "traceparent",
|
||||
W3C_TRACESTATE: "tracestate",
|
||||
BAGGAGE: "baggage",
|
||||
};
|
||||
|
||||
// Example traceparent: 00-4bf92f3577b34da6a3ce929d0e0e4736-00f067aa0ba902b7-01
|
||||
```
|
||||
|
||||
### Header Format
|
||||
|
||||
```
|
||||
traceparent: 00-4bf92f3577b34da6a3ce929d0e0e4736-00f067aa0ba902b7-01
|
||||
             ^  ^                                ^                ^
|
||||
             |  |                                |                |
|
||||
             |  trace-id (32 hex)                span-id (16 hex) flags
|
||||
             version
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Key Traces
|
||||
|
||||
| Operation | Span Name | Attributes |
|
||||
|-----------|-----------|------------|
|
||||
| Promotion request | `promotion.request` | promotion_id, release_id, environment |
|
||||
| Gate evaluation | `promotion.evaluate_gates` | gate_names, result |
|
||||
| Workflow execution | `workflow.execute` | workflow_run_id, template_name |
|
||||
| Step execution | `workflow.step.{type}` | step_run_id, node_id, inputs |
|
||||
| Deployment job | `deployment.execute` | job_id, environment, strategy |
|
||||
| Agent task | `agent.task.{type}` | task_id, agent_id, target_id |
|
||||
| Plugin call | `plugin.{method}` | plugin_id, method, duration |
|
||||
|
||||
---
|
||||
|
||||
## Trace Hierarchy
|
||||
|
||||
### Promotion Flow
|
||||
|
||||
```
|
||||
promotion.request (root)
|
||||
+-- promotion.evaluate_gates
|
||||
| +-- gate.security
|
||||
| +-- gate.approval
|
||||
| +-- gate.freeze_window
|
||||
|
|
||||
+-- workflow.execute
|
||||
| +-- workflow.step.security-check
|
||||
| +-- workflow.step.approval
|
||||
| +-- workflow.step.deploy
|
||||
| +-- deployment.execute
|
||||
| +-- deployment.assign_tasks
|
||||
| +-- agent.task.pull
|
||||
| +-- agent.task.deploy
|
||||
| +-- agent.task.health_check
|
||||
|
|
||||
+-- evidence.generate
|
||||
+-- evidence.sign
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Span Attributes
|
||||
|
||||
### Common Attributes
|
||||
|
||||
| Attribute | Type | Description |
|
||||
|-----------|------|-------------|
|
||||
| `tenant.id` | string | Tenant UUID |
|
||||
| `user.id` | string | User UUID (if authenticated) |
|
||||
| `release.id` | string | Release UUID |
|
||||
| `environment.name` | string | Environment name |
|
||||
| `error` | boolean | Whether error occurred |
|
||||
| `error.type` | string | Error type/class |
|
||||
|
||||
### Promotion Attributes
|
||||
|
||||
| Attribute | Type | Description |
|
||||
|-----------|------|-------------|
|
||||
| `promotion.id` | string | Promotion UUID |
|
||||
| `promotion.status` | string | Current status |
|
||||
| `promotion.gates` | string[] | Gates evaluated |
|
||||
| `promotion.decision` | string | allow/deny |
|
||||
|
||||
### Deployment Attributes
|
||||
|
||||
| Attribute | Type | Description |
|
||||
|-----------|------|-------------|
|
||||
| `deployment.job_id` | string | Deployment job UUID |
|
||||
| `deployment.strategy` | string | Deployment strategy |
|
||||
| `deployment.target_count` | int | Number of targets |
|
||||
| `deployment.batch_size` | int | Batch size |
|
||||
|
||||
### Agent Task Attributes
|
||||
|
||||
| Attribute | Type | Description |
|
||||
|-----------|------|-------------|
|
||||
| `task.id` | string | Task UUID |
|
||||
| `task.type` | string | Task type |
|
||||
| `agent.id` | string | Agent UUID |
|
||||
| `target.id` | string | Target UUID |
|
||||
|
||||
---
|
||||
|
||||
## OpenTelemetry Configuration
|
||||
|
||||
### SDK Configuration
|
||||
|
||||
```yaml
|
||||
# otel-config.yaml
|
||||
service:
|
||||
name: stella-release-orchestrator
|
||||
version: ${VERSION}
|
||||
|
||||
exporters:
|
||||
otlp:
|
||||
endpoint: otel-collector:4317
|
||||
protocol: grpc
|
||||
|
||||
processors:
|
||||
batch:
|
||||
timeout: 10s
|
||||
send_batch_size: 1024
|
||||
|
||||
resource:
|
||||
attributes:
|
||||
- key: service.namespace
|
||||
value: stella-ops
|
||||
- key: deployment.environment
|
||||
value: ${ENVIRONMENT}
|
||||
```
|
||||
|
||||
### Environment Variables
|
||||
|
||||
```bash
|
||||
OTEL_SERVICE_NAME=stella-release-orchestrator
|
||||
OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4317
|
||||
OTEL_EXPORTER_OTLP_PROTOCOL=grpc
|
||||
OTEL_TRACES_SAMPLER=parentbased_traceidratio
|
||||
OTEL_TRACES_SAMPLER_ARG=0.1
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Sampling Strategy
|
||||
|
||||
| Environment | Sampling Rate | Reason |
|
||||
|-------------|---------------|--------|
|
||||
| Development | 100% | Full visibility |
|
||||
| Staging | 100% | Full visibility |
|
||||
| Production | 10% | Cost/performance |
|
||||
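| Production (errors) | 100% | Always sample errors (requires tail-based sampling in the collector; the head-based `parentbased_traceidratio` sampler decides before errors are known) |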
|
||||
|
||||
---
|
||||
|
||||
## Example Trace
|
||||
|
||||
```json
|
||||
{
|
||||
"traceId": "4bf92f3577b34da6a3ce929d0e0e4736",
|
||||
"spans": [
|
||||
{
|
||||
"spanId": "00f067aa0ba902b7",
|
||||
"name": "promotion.request",
|
||||
"duration_ms": 5234,
|
||||
"attributes": {
|
||||
"promotion.id": "promo-123",
|
||||
"release.id": "rel-456",
|
||||
"environment.name": "production"
|
||||
}
|
||||
},
|
||||
{
|
||||
"spanId": "00f067aa0ba902b8",
|
||||
"parentSpanId": "00f067aa0ba902b7",
|
||||
"name": "gate.security",
|
||||
"duration_ms": 234,
|
||||
"attributes": {
|
||||
"gate.result": "passed",
|
||||
"vulnerabilities.critical": 0
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## See Also
|
||||
|
||||
- [Observability Overview](overview.md)
|
||||
- [Logging](logging.md)
|
||||
- [Metrics](metrics.md)
|
||||
- [Alerting](alerting.md)
|
||||
Reference in New Issue
Block a user