Files
git.stella-ops.org/docs/modules/release-orchestrator/progressive-delivery/canary.md

7.5 KiB

Canary Controller

Automated canary deployment controller with health-based stage advancement and automatic rollback.

Status: Planned (not yet implemented)
Source: Architecture Advisory Section 11.3
Related Modules: Progressive Delivery Module, Deployment Strategies
Sprint: 110_003 Canary Controller

Overview

The Canary Controller automates progressive rollout of new versions by gradually shifting traffic, monitoring health metrics, and automatically rolling back if issues are detected.


Canary State Machine

States

CREATED -> DEPLOYING -> EVALUATING -> PROMOTING/ROLLING_BACK -> COMPLETED
| State | Description |
|-------|-------------|
| CREATED | Canary release defined, not started |
| DEPLOYING | Deploying variation B to targets |
| EVALUATING | Monitoring health metrics at current stage |
| PROMOTING | Advancing to next stage |
| ROLLING_BACK | Reverting to variation A |
| COMPLETED | Final state (promoted or rolled back) |

Implementation

Canary Controller Class

/**
 * Drives a canary (A/B) release through the stages of its rollout strategy,
 * rolling back as soon as a stage fails.
 */
class CanaryController {
  /**
   * Executes the stages of `abRelease.rolloutStrategy` in order.
   *
   * For each stage the controller applies the stage's traffic configuration,
   * emits `canary.stage_started`, waits for the stage to complete and, when
   * `stage.requireApproval` is set, pauses and waits for an approval decision.
   * A stage that passes is recorded as "succeeded" and `canary.stage_completed`
   * is emitted. After the last stage the rollout is completed and
   * `canary.promoted` is emitted.
   *
   * Every failure path (failed stage completion result, denied approval, or
   * an error thrown while running the stage) records the stage as "failed",
   * rolls the release back once, and stops processing further stages.
   *
   * @param abRelease Release whose `rolloutStrategy.stages` are executed.
   * @returns A promise that resolves once the release was either promoted or
   *   rolled back.
   * @throws Errors raised while starting a stage, rolling back, or completing
   *   the rollout are not handled here and propagate to the caller.
   */
  async executeRollout(abRelease: ABRelease): Promise<void> {
    const strategy = abRelease.rolloutStrategy;

    for (const [i, stage] of strategy.stages.entries()) {
      const stageRecord = await this.startStage(abRelease, stage, i);

      // Set when the stage must be aborted. The abort itself is handled after
      // the try block so that an error thrown by rollback() cannot land in the
      // catch below and trigger a second rollback attempt.
      let failure: { reason: string } | undefined;

      try {
        // 1. Apply traffic configuration for this stage
        await this.applyStageTraffic(abRelease, stage);
        this.emit("canary.stage_started", { abRelease, stage, stageNumber: i });

        // 2. Wait for stage completion based on criteria
        const result = await this.waitForStageCompletion(abRelease, stage);

        if (!result.success) {
          this.log(`Stage ${stage.name} failed health check: ${result.reason}`);
          failure = { reason: result.reason };
        } else if (stage.requireApproval) {
          // 3. Approval gate
          this.log(`Stage ${stage.name} requires approval`);
          await this.pauseForApproval(abRelease, stage);

          const approval = await this.waitForApproval(abRelease, stage);
          if (!approval.approved) {
            failure = { reason: "Approval denied" };
          }
        }

        if (!failure) {
          await this.completeStage(stageRecord, "succeeded");
          this.emit("canary.stage_completed", { abRelease, stage, stageNumber: i });
        }
      } catch (error: unknown) {
        // Anything can be thrown; only Error instances carry a `message`.
        failure = { reason: error instanceof Error ? error.message : String(error) };
      }

      if (failure) {
        try {
          await this.completeStage(stageRecord, "failed", failure.reason);
        } finally {
          // Restoring traffic matters more than bookkeeping: roll back even
          // when recording the failed stage itself throws.
          await this.rollback(abRelease, failure.reason);
        }
        return;
      }
    }

    // Rollout complete
    await this.completeRollout(abRelease);
    this.emit("canary.promoted", { abRelease });
  }
}

Stage Completion Logic

/**
 * Monitors a stage until it may advance or has to be aborted.
 *
 * Each pass checks, in order: the health check, the error rate (only when
 * `stage.errorRateThreshold` is set) and the P99 latency (only when
 * `stage.latencyThreshold` is set). The first failing check ends the wait
 * with `success: false` and a human-readable `reason`.
 *
 * When every check passes:
 * - a stage without `duration` succeeds immediately, because there is no
 *   observation window to wait out (otherwise the loop could never end for
 *   such a stage);
 * - a stage with `duration` (seconds) succeeds once that much time has
 *   elapsed since this method was called; until then the checks are repeated
 *   every 30 seconds, or sooner if less time than that remains.
 *
 * @param abRelease Release whose metrics are evaluated.
 * @param stage Stage providing the optional thresholds and duration.
 * @returns `{ success: true }` when the stage may advance, otherwise
 *   `{ success: false, reason }`.
 */
private async waitForStageCompletion(
  abRelease: ABRelease,
  stage: RolloutStage
): Promise<StageCompletionResult> {

  const startTime = Date.now();
  const checkInterval = 30000; // 30 seconds

  while (true) {
    // Check health metrics
    const health = await this.checkHealth(abRelease, stage);

    if (!health.healthy) {
      return {
        success: false,
        reason: `Health check failed: ${health.reason}`
      };
    }

    // Check error rate (if threshold configured)
    if (stage.errorRateThreshold !== undefined) {
      const errorRate = await this.getErrorRate(abRelease);
      if (errorRate > stage.errorRateThreshold) {
        return {
          success: false,
          reason: `Error rate ${errorRate}% exceeds threshold ${stage.errorRateThreshold}%`
        };
      }
    }

    // Check latency (if threshold configured)
    if (stage.latencyThreshold !== undefined) {
      const latency = await this.getP99Latency(abRelease);
      if (latency > stage.latencyThreshold) {
        return {
          success: false,
          reason: `P99 latency ${latency}ms exceeds threshold ${stage.latencyThreshold}ms`
        };
      }
    }

    // No observation window configured: one passing evaluation is all that
    // can be required, so do not poll forever.
    if (stage.duration === undefined) {
      return { success: true };
    }

    // Check duration (auto-advance); `duration` is in seconds.
    const remainingMs = stage.duration * 1000 - (Date.now() - startTime);
    if (remainingMs <= 0) {
      return { success: true };
    }

    // Wait before next check, but never past the end of the window.
    await sleep(Math.min(checkInterval, remainingMs));
  }
}

Traffic Application

/**
 * Applies the traffic split that a rollout stage asks for.
 *
 * - "router-based" releases: asks the router connector to shift
 *   `stage.trafficPercentageB` between the services of variation A and B.
 * - "target-group" releases: scales group A to `stage.groupAPercentage`,
 *   then group B to `stage.groupBPercentage`.
 *
 * Any other configuration type is left untouched.
 *
 * @param abRelease Release whose `config` selects the traffic mechanism.
 * @param stage Stage carrying the percentages to apply.
 */
private async applyStageTraffic(abRelease: ABRelease, stage: RolloutStage): Promise<void> {
  switch (abRelease.config.type) {
    case "router-based": {
      const connector = await this.getRouterConnector(abRelease.config.routerIntegrationId);
      const serviceA = abRelease.config.variationA.serviceName;
      const serviceB = abRelease.config.variationB.serviceName;

      await connector.shiftTraffic(serviceA, serviceB, stage.trafficPercentageB);
      return;
    }

    case "target-group": {
      // Scale target groups: A first, then B.
      await this.scaleTargetGroup(abRelease.config.groupA, stage.groupAPercentage);
      await this.scaleTargetGroup(abRelease.config.groupB, stage.groupBPercentage);
      return;
    }

    default:
      return;
  }
}

Rollback

/**
 * Reverts a release to variation A and records the rollback.
 *
 * Emits `canary.rollback_started` before touching traffic and
 * `canary.rolled_back` after the release has been saved with status
 * "rolled_back".
 *
 * - "router-based" releases: shifts 100 percent from variation B's service
 *   to variation A's service through the router connector.
 * - "target-group" releases: scales group A to 100, then group B to 0.
 *
 * @param abRelease Release to revert; its `status` is overwritten.
 * @param reason Text included in the log line and in both events.
 */
async rollback(abRelease: ABRelease, reason: string): Promise<void> {
  this.log(`Rolling back A/B release: ${reason}`);
  this.emit("canary.rollback_started", { abRelease, reason });

  switch (abRelease.config.type) {
    case "router-based": {
      // Send all traffic back to A.
      const connector = await this.getRouterConnector(abRelease.config.routerIntegrationId);
      const serviceB = abRelease.config.variationB.serviceName;
      const serviceA = abRelease.config.variationA.serviceName;

      await connector.shiftTraffic(serviceB, serviceA, 100);
      break;
    }

    case "target-group": {
      // Bring A to full capacity before draining B.
      await this.scaleTargetGroup(abRelease.config.groupA, 100);
      await this.scaleTargetGroup(abRelease.config.groupB, 0);
      break;
    }

    default:
      break;
  }

  abRelease.status = "rolled_back";
  await this.save(abRelease);

  this.emit("canary.rolled_back", { abRelease, reason });
}

Configuration

Canary Stages

# Example rollout strategy. Stages are executed in the order listed; a failed
# stage triggers a rollback and the remaining stages are skipped.
rolloutStrategy:
  type: health-based
  stages:
    # Each stage routes `trafficPercentageB` percent of traffic to variation B.
    - name: canary-5
      trafficPercentageB: 5
      duration: 300        # seconds (5 minutes) of passing checks before advancing
      healthThreshold: 99  # NOTE(review): not read by the controller code shown; presumably a minimum health/success percentage — confirm
      errorRateThreshold: 0.5  # percent; the stage fails when the error rate exceeds this

    - name: canary-25
      trafficPercentageB: 25
      duration: 600        # seconds (10 minutes)
      healthThreshold: 99
      errorRateThreshold: 1.0

    - name: canary-50
      trafficPercentageB: 50
      duration: 900        # seconds (15 minutes)
      healthThreshold: 99
      errorRateThreshold: 1.0

    # Final stage: all traffic to variation B, gated by a manual approval.
    - name: promote
      trafficPercentageB: 100
      requireApproval: true  # a denied approval rolls the release back

Health Metrics

| Metric | Description | Typical Threshold |
|--------|-------------|-------------------|
| Success Rate | % of successful requests | > 99% |
| Error Rate | % of failed requests | < 1% |
| P99 Latency | 99th percentile response time | < 500ms |
| Health Check | Container/service health | Healthy |

Events

The canary controller emits events for observability:

| Event | Description |
|-------|-------------|
| canary.stage_started | Stage execution began |
| canary.stage_completed | Stage completed successfully |
| canary.rollback_started | Rollback initiated |
| canary.rolled_back | Rollback completed |
| canary.promoted | Full promotion completed |

See Also