add release orchestrator docs and sprints gaps fills
This commit is contained in:
270
docs/modules/release-orchestrator/progressive-delivery/canary.md
Normal file
270
docs/modules/release-orchestrator/progressive-delivery/canary.md
Normal file
@@ -0,0 +1,270 @@
|
||||
# Canary Controller
|
||||
|
||||
> Automated canary deployment controller with health-based stage advancement and automatic rollback.
|
||||
|
||||
**Status:** Planned (not yet implemented)
|
||||
**Source:** [Architecture Advisory Section 11.3](../../../product/advisories/09-Jan-2026%20-%20Stella%20Ops%20Orchestrator%20Architecture.md)
|
||||
**Related Modules:** [Progressive Delivery Module](../modules/progressive-delivery.md), [Deployment Strategies](../deployment/strategies.md)
|
||||
**Sprint:** [110_003 Canary Controller](../../../../implplan/SPRINT_20260110_110_003_PROGDL_canary_controller.md)
|
||||
|
||||
## Overview
|
||||
|
||||
The Canary Controller automates progressive rollout of new versions by gradually shifting traffic, monitoring health metrics, and automatically rolling back if issues are detected.
|
||||
|
||||
---
|
||||
|
||||
## Canary State Machine
|
||||
|
||||
### States
|
||||
|
||||
```
|
||||
CREATED -> DEPLOYING -> EVALUATING -> PROMOTING/ROLLING_BACK -> COMPLETED
|
||||
```
|
||||
|
||||
| State | Description |
|
||||
|-------|-------------|
|
||||
| `CREATED` | Canary release defined, not started |
|
||||
| `DEPLOYING` | Deploying variation B (the new canary version) to targets |
|
||||
| `EVALUATING` | Monitoring health metrics at current stage |
|
||||
| `PROMOTING` | Advancing to next stage |
|
||||
| `ROLLING_BACK` | Reverting all traffic to variation A (the current baseline version) |
|
||||
| `COMPLETED` | Final state (promoted or rolled back) |
|
||||
|
||||
---
|
||||
|
||||
## Implementation
|
||||
|
||||
### Canary Controller Class
|
||||
|
||||
```typescript
|
||||
/**
 * Drives a canary rollout through the stages of its rollout strategy and
 * rolls back as soon as any stage fails.
 */
class CanaryController {
  /**
   * Runs every stage of `abRelease.rolloutStrategy` in order.
   *
   * For each stage: apply its traffic configuration, wait for the stage's
   * completion criteria, then (when `stage.requireApproval` is set) wait for
   * approval. The first stage that fails its health check, is denied
   * approval, or throws is recorded as "failed" and triggers exactly one
   * rollback; later stages are not started. When every stage succeeds the
   * rollout is completed and `canary.promoted` is emitted.
   *
   * @param abRelease - Release whose rollout strategy is executed.
   * @returns Resolves once the rollout has been promoted or rolled back.
   * @throws Whatever `completeStage("failed")` or `rollback` throws; those
   *   errors are not retried here.
   */
  async executeRollout(abRelease: ABRelease): Promise<void> {
    const { stages } = abRelease.rolloutStrategy;

    for (const [stageNumber, stage] of stages.entries()) {
      const stageRecord = await this.startStage(abRelease, stage, stageNumber);

      // Set by whichever step fails; stays undefined while the stage is healthy.
      let failureReason: string | undefined;

      try {
        // 1. Apply traffic configuration for this stage
        await this.applyStageTraffic(abRelease, stage);
        this.emit("canary.stage_started", { abRelease, stage, stageNumber });

        // 2. Wait for stage completion based on criteria
        const result = await this.waitForStageCompletion(abRelease, stage);

        if (!result.success) {
          // Health check failed - rollback (performed once, after the try block)
          failureReason = result.reason ?? "Stage health check failed";
          this.log(`Stage ${stage.name} failed health check: ${failureReason}`);
        } else if (stage.requireApproval) {
          // 3. Approval gate
          this.log(`Stage ${stage.name} requires approval`);
          await this.pauseForApproval(abRelease, stage);

          const approval = await this.waitForApproval(abRelease, stage);
          if (!approval.approved) {
            failureReason = "Approval denied";
          }
        }

        if (failureReason === undefined) {
          await this.completeStage(stageRecord, "succeeded");
          this.emit("canary.stage_completed", { abRelease, stage, stageNumber });
        }
      } catch (error: unknown) {
        // Anything may be thrown, so only read `.message` from real Errors.
        failureReason = error instanceof Error ? error.message : String(error);
      }

      if (failureReason !== undefined) {
        // Every failure path closes the stage record before rolling back.
        // This runs outside the try block so that a rollback which throws is
        // not caught and then attempted a second time.
        await this.completeStage(stageRecord, "failed", failureReason);
        await this.rollback(abRelease, failureReason);
        return;
      }
    }

    // Rollout complete
    await this.completeRollout(abRelease);
    this.emit("canary.promoted", { abRelease });
  }
}
|
||||
```
|
||||
|
||||
### Stage Completion Logic
|
||||
|
||||
```typescript
|
||||
/**
 * Evaluates a stage until it either fails a check or has stayed healthy for
 * its whole observation window.
 *
 * Each pass runs the health check, then the error-rate and P99-latency checks
 * for whichever thresholds the stage configures; the first check that fails
 * ends the wait with `success: false` and a reason.
 *
 * `stage.duration` is the observation window in seconds. A stage without a
 * duration has no window to wait out, so a single clean pass completes it;
 * this lets an approval-gated stage with no duration reach the approval step
 * that follows this call instead of polling forever.
 *
 * @param abRelease - Release being evaluated.
 * @param stage - Stage whose thresholds and duration apply.
 * @returns `{ success: true }` once the stage has passed, otherwise
 *   `{ success: false, reason }` describing the first failed check.
 */
private async waitForStageCompletion(
  abRelease: ABRelease,
  stage: RolloutStage
): Promise<StageCompletionResult> {
  const startTime = Date.now();
  const checkInterval = 30000; // 30 seconds

  while (true) {
    // Check health metrics
    const health = await this.checkHealth(abRelease, stage);
    if (!health.healthy) {
      return {
        success: false,
        reason: `Health check failed: ${health.reason}`
      };
    }

    // Check error rate (if threshold configured)
    if (stage.errorRateThreshold !== undefined) {
      const errorRate = await this.getErrorRate(abRelease);
      if (errorRate > stage.errorRateThreshold) {
        return {
          success: false,
          reason: `Error rate ${errorRate}% exceeds threshold ${stage.errorRateThreshold}%`
        };
      }
    }

    // Check latency (if threshold configured)
    if (stage.latencyThreshold !== undefined) {
      const latency = await this.getP99Latency(abRelease);
      if (latency > stage.latencyThreshold) {
        return {
          success: false,
          reason: `P99 latency ${latency}ms exceeds threshold ${stage.latencyThreshold}ms`
        };
      }
    }

    // No observation window configured: one clean pass completes the stage.
    if (stage.duration === undefined) {
      return { success: true };
    }

    // Check duration (auto-advance)
    const remainingMs = stage.duration * 1000 - (Date.now() - startTime);
    if (remainingMs <= 0) {
      return { success: true };
    }

    // Wait before next check, but never sleep past the end of the window.
    await sleep(Math.min(checkInterval, remainingMs));
  }
}
|
||||
```
|
||||
|
||||
### Traffic Application
|
||||
|
||||
```typescript
|
||||
/**
 * Applies the traffic configuration of `stage`, using the mechanism selected
 * by the release's config type.
 *
 * - `"router-based"`: resolves the router connector for
 *   `config.routerIntegrationId` and calls `shiftTraffic` with the variation A
 *   service name, the variation B service name and `stage.trafficPercentageB`.
 * - `"target-group"`: scales group A to `stage.groupAPercentage`, then
 *   group B to `stage.groupBPercentage`.
 *
 * Any other config type is left untouched.
 *
 * @param abRelease - Release whose config selects the mechanism.
 * @param stage - Stage providing the percentages to apply.
 */
private async applyStageTraffic(abRelease: ABRelease, stage: RolloutStage): Promise<void> {
  const { config } = abRelease;

  switch (config.type) {
    case "router-based": {
      const connector = await this.getRouterConnector(config.routerIntegrationId);
      await connector.shiftTraffic(
        config.variationA.serviceName,
        config.variationB.serviceName,
        stage.trafficPercentageB
      );
      return;
    }

    case "target-group": {
      // Scale target groups: A first, then B
      await this.scaleTargetGroup(config.groupA, stage.groupAPercentage);
      await this.scaleTargetGroup(config.groupB, stage.groupBPercentage);
      return;
    }

    default:
      return;
  }
}
|
||||
```
|
||||
|
||||
### Rollback
|
||||
|
||||
```typescript
|
||||
/**
 * Reverts the release to variation A and records it as rolled back.
 *
 * Logs the reason and emits `canary.rollback_started`, then restores traffic
 * according to the config type: a router-based release calls `shiftTraffic`
 * from the variation B service to the variation A service with 100, and a
 * target-group release scales group A to 100 and then group B to 0. Finally
 * the release status is set to `"rolled_back"`, the release is saved and
 * `canary.rolled_back` is emitted.
 *
 * @param abRelease - Release to roll back; its `status` is mutated.
 * @param reason - Why the rollback happens; included in the log and events.
 */
async rollback(abRelease: ABRelease, reason: string): Promise<void> {
  this.log(`Rolling back A/B release: ${reason}`);
  this.emit("canary.rollback_started", { abRelease, reason });

  const { config } = abRelease;

  switch (config.type) {
    case "router-based": {
      // Shift all traffic back to A
      const connector = await this.getRouterConnector(config.routerIntegrationId);
      await connector.shiftTraffic(
        config.variationB.serviceName,
        config.variationA.serviceName,
        100
      );
      break;
    }

    case "target-group": {
      // Scale B to 0, A to 100
      await this.scaleTargetGroup(config.groupA, 100);
      await this.scaleTargetGroup(config.groupB, 0);
      break;
    }
  }

  abRelease.status = "rolled_back";
  await this.save(abRelease);

  this.emit("canary.rolled_back", { abRelease, reason });
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Configuration
|
||||
|
||||
### Canary Stages
|
||||
|
||||
```yaml
|
||||
rolloutStrategy:
|
||||
type: health-based
|
||||
stages:
|
||||
- name: canary-5
|
||||
trafficPercentageB: 5
|
||||
duration: 300 # 5 minutes
|
||||
healthThreshold: 99
|
||||
errorRateThreshold: 0.5
|
||||
|
||||
- name: canary-25
|
||||
trafficPercentageB: 25
|
||||
duration: 600 # 10 minutes
|
||||
healthThreshold: 99
|
||||
errorRateThreshold: 1.0
|
||||
|
||||
- name: canary-50
|
||||
trafficPercentageB: 50
|
||||
duration: 900 # 15 minutes
|
||||
healthThreshold: 99
|
||||
errorRateThreshold: 1.0
|
||||
|
||||
- name: promote
|
||||
trafficPercentageB: 100
|
||||
requireApproval: true
|
||||
```
|
||||
|
||||
### Health Metrics
|
||||
|
||||
| Metric | Description | Typical Threshold |
|
||||
|--------|-------------|-------------------|
|
||||
| Success Rate | % of successful requests | > 99% |
|
||||
| Error Rate | % of failed requests | < 1% |
|
||||
| P99 Latency | 99th percentile response time | < 500ms |
|
||||
| Health Check | Container/service health | Healthy |
|
||||
|
||||
---
|
||||
|
||||
## Events
|
||||
|
||||
The canary controller emits events for observability:
|
||||
|
||||
| Event | Description |
|
||||
|-------|-------------|
|
||||
| `canary.stage_started` | Stage execution began |
|
||||
| `canary.stage_completed` | Stage completed successfully |
|
||||
| `canary.rollback_started` | Rollback initiated |
|
||||
| `canary.rolled_back` | Rollback completed |
|
||||
| `canary.promoted` | Full promotion completed |
|
||||
|
||||
---
|
||||
|
||||
## See Also
|
||||
|
||||
- [Progressive Delivery Module](../modules/progressive-delivery.md)
|
||||
- [A/B Release Models](ab-releases.md)
|
||||
- [Router Plugins](routers.md)
|
||||
- [Metrics](../operations/metrics.md)
|
||||
Reference in New Issue
Block a user