Files
git.stella-ops.org/docs/modules/release-orchestrator/modules/progressive-delivery.md

18 KiB

PROGDL: Progressive Delivery

Purpose: A/B releases, canary deployments, and traffic management.

Architecture Overview

┌─────────────────────────────────────────────────────────────────────────────┐
│                    PROGRESSIVE DELIVERY ARCHITECTURE                        │
│                                                                             │
│  ┌─────────────────────────────────────────────────────────────────────┐   │
│  │                    A/B RELEASE MANAGER                               │   │
│  │                                                                      │   │
│  │  - Create A/B release with variations                               │   │
│  │  - Manage traffic split configuration                               │   │
│  │  - Coordinate rollout stages                                        │   │
│  │  - Handle promotion/rollback                                        │   │
│  └──────────────────────────────┬──────────────────────────────────────┘   │
│                                 │                                           │
│              ┌──────────────────┴──────────────────┐                       │
│              │                                     │                       │
│              ▼                                     ▼                       │
│  ┌───────────────────────┐           ┌───────────────────────┐            │
│  │  TARGET-GROUP A/B     │           │   ROUTER-BASED A/B    │            │
│  │                       │           │                       │            │
│  │  Deploy to groups     │           │  Configure traffic    │            │
│  │  by labels/membership │           │  via load balancer    │            │
│  │                       │           │                       │            │
│  │  Good for:            │           │  Good for:            │            │
│  │  - Background workers │           │  - Web/API traffic    │            │
│  │  - Batch processors   │           │  - Customer-facing    │            │
│  │  - Internal services  │           │  - L7 routing         │            │
│  └───────────────────────┘           └───────────────────────┘            │
│                                                                             │
│  ┌─────────────────────────────────────────────────────────────────────┐   │
│  │                    CANARY CONTROLLER                                 │   │
│  │                                                                      │   │
│  │  - Execute rollout stages                                           │   │
│  │  - Monitor health metrics                                           │   │
│  │  - Auto-advance or pause                                            │   │
│  │  - Trigger rollback on failure                                      │   │
│  └─────────────────────────────────────────────────────────────────────┘   │
│                                                                             │
│  ┌─────────────────────────────────────────────────────────────────────┐   │
│  │                    TRAFFIC ROUTER INTEGRATION                        │   │
│  │                                                                      │   │
│  │  Plugin-based integration with:                                     │   │
│  │  - Nginx (config generation + reload)                               │   │
│  │  - HAProxy (config generation + reload)                             │   │
│  │  - Traefik (dynamic config API)                                     │   │
│  │  - AWS ALB (target group weights)                                   │   │
│  │  - Custom (webhook)                                                 │   │
│  └─────────────────────────────────────────────────────────────────────┘   │
│                                                                             │
└─────────────────────────────────────────────────────────────────────────────┘

Modules

Module: ab-manager

| Aspect | Specification |
|--------|---------------|
| Responsibility | A/B release lifecycle; variation management |
| Dependencies | release-manager, environment-manager, deploy-orchestrator |
| Data Entities | ABRelease, Variation, TrafficSplit |
| Events Produced | ab.created, ab.started, ab.stage_advanced, ab.promoted, ab.rolled_back |

A/B Release Entity:

/**
 * An A/B release: the variations deployed to a single environment together
 * with the traffic split and rollout strategy that govern them.
 */
interface ABRelease {
  id: UUID;
  tenantId: UUID;
  environmentId: UUID;
  name: string;
  // CanaryController.executeRollout reads variations[0] as the baseline and
  // variations[1] as the canary, so element order is significant there.
  variations: Variation[];
  activeVariation: string;           // "A" or "B"
  trafficSplit: TrafficSplit;
  rolloutStrategy: RolloutStrategy;
  status: ABReleaseStatus;
  createdAt: DateTime;
  // presumably null until the release reaches a terminal status — TODO confirm
  completedAt: DateTime | null;
  // NOTE(review): non-nullable here, but release.ab_releases.created_by has no
  // NOT NULL constraint — confirm which side is authoritative.
  createdBy: UUID;
}

/** One arm of an A/B release, tied to the release it deploys. */
interface Variation {
  name: string;                      // "A", "B"
  releaseId: UUID;
  targetGroupId: UUID | null;        // for target-group based A/B
  // NOTE(review): overlaps with TrafficSplit.percentages; which of the two is
  // the source of truth is not shown here — confirm.
  trafficPercentage: number;
  // presumably null until a deployment job has been created — TODO confirm
  deploymentJobId: UUID | null;
}

/**
 * How traffic is divided between variations. `type` selects the routing mode;
 * `stickyKey` and `headerMatch` are optional and carry the extra settings for
 * the "sticky" and "header" modes respectively (per their inline comments).
 */
interface TrafficSplit {
  type: "percentage" | "sticky" | "header";
  percentages: Record<string, number>;  // {"A": 90, "B": 10}
  stickyKey?: string;                   // cookie or header name
  headerMatch?: {                       // for header-based routing
    header: string;
    values: Record<string, string>;     // value -> variation
  };
}

// Lifecycle states of an A/B release. The same six values are enforced by the
// CHECK constraint on release.ab_releases.status.
type ABReleaseStatus =
  | "created"        // Configured, not started
  | "deploying"      // Deploying variations
  | "running"        // Active with traffic split
  | "promoting"      // Promoting winner to 100%
  | "completed"      // Successfully completed
  | "rolled_back";   // Rolled back to original

A/B Release Models:

| Model | Description | Use Case |
|-------|-------------|----------|
| Target-Group A/B | Deploy different releases to different target groups | Background workers, internal services |
| Router-Based A/B | Use load balancer to split traffic | Web/API traffic, customer-facing |
| Hybrid A/B | Combination of both | Complex deployments |

Module: traffic-router

| Aspect | Specification |
|--------|---------------|
| Responsibility | Router plugin orchestration; traffic shifting |
| Dependencies | integration-manager, connector-runtime |
| Protocol | Plugin-specific (API calls, config generation) |

Router Connector Interface:

/**
 * Contract implemented by each traffic-router plugin. All operations are
 * asynchronous.
 */
interface RouterConnector extends BaseConnector {
  // Traffic management
  /** Applies a route definition (upstream, weighted backends, optional health check). */
  configureRoute(config: RouteConfig): Promise<void>;
  /** Returns the per-backend weights and target counts with a timestamp. */
  getTrafficDistribution(): Promise<TrafficDistribution>;
  /**
   * Moves traffic between two backends. CanaryController calls this with
   * (baseline name, canary name, stage traffic percentage).
   * NOTE(review): presumably `percentage` is the absolute share `to` should
   * receive rather than a delta — confirm against the plugin implementations.
   */
  shiftTraffic(from: string, to: string, percentage: number): Promise<void>;

  // Configuration
  reloadConfig(): Promise<void>;
  /** Validates a router configuration supplied as a string. */
  validateConfig(config: string): Promise<ValidationResult>;
}

/**
 * Route definition handed to RouterConnector.configureRoute: one upstream,
 * its weighted backends, and an optional health check.
 */
interface RouteConfig {
  upstream: string;
  // One entry per backend; each has its own target list and weight.
  backends: {
    name: string;
    targets: string[];
    weight: number;
  }[];
  // NOTE(review): units of interval/timeout are not stated — confirm.
  healthCheck?: {
    path: string;
    interval: number;
    timeout: number;
  };
}

/**
 * Snapshot returned by RouterConnector.getTrafficDistribution: per-backend
 * weight and target counts, stamped with the time it was taken.
 */
interface TrafficDistribution {
  backends: {
    name: string;
    weight: number;
    healthyTargets: number;
    totalTargets: number;
  }[];
  timestamp: DateTime;
}

Router Plugins:

| Plugin | Capabilities |
|--------|--------------|
| router.nginx | Config generation, reload via signal/API |
| router.haproxy | Config generation, reload via socket |
| router.traefik | Dynamic config API |
| router.aws_alb | Target group weights via AWS API |
| router.custom | Webhook-based custom integration |

Module: canary-controller

| Aspect | Specification |
|--------|---------------|
| Responsibility | Canary ramp automation; health monitoring |
| Dependencies | ab-manager, traffic-router |
| Data Entities | CanaryStage, HealthResult |
| Events Produced | canary.stage_started, canary.stage_passed, canary.stage_failed |

Canary Stage Entity:

/**
 * One step of a canary rollout for a given A/B release, with its
 * configuration, execution state, and the health result recorded for it.
 */
interface CanaryStage {
  id: UUID;
  abReleaseId: UUID;
  stageNumber: number;
  trafficPercentage: number;
  status: CanaryStageStatus;
  // Compared against HealthResult.healthPercentage by CanaryController.
  healthThreshold: number;           // Required health % to pass
  durationSeconds: number;           // How long to run stage
  requireApproval: boolean;          // Require manual approval
  startedAt: DateTime | null;
  completedAt: DateTime | null;
  // Set by CanaryController after the stage's health evaluation.
  healthResult: HealthResult | null;
}

// States of a single canary stage. The same five values are enforced by the
// CHECK constraint on release.canary_stages.status. CanaryController assigns
// "running", "failed" and "succeeded"; where "skipped" is set is not shown.
type CanaryStageStatus =
  | "pending"
  | "running"
  | "succeeded"
  | "failed"
  | "skipped";

/**
 * Outcome of a health evaluation, as produced by
 * CanaryController.evaluateHealth.
 */
interface HealthResult {
  healthy: boolean;
  // Share of healthy targets on a 0-100 scale.
  healthPercentage: number;
  metrics: {
    // successRate and errorRate are on a 0-100 scale.
    successRate: number;
    errorRate: number;
    // Derived from the per-target `latencyMs` readings.
    latencyP50: number;
    latencyP99: number;
  };
  // Number of targets that were checked.
  samples: number;
  evaluatedAt: DateTime;
}

Canary Rollout Execution:

/**
 * Drives a canary rollout for an A/B release: shifts traffic stage by stage,
 * evaluates canary health after each stage, and either promotes the canary or
 * fails the rollout.
 *
 * Convention: `variations[0]` is the baseline and `variations[1]` is the
 * canary. Both traffic shifting and health evaluation use this same rule so
 * that the variation receiving traffic is the one whose health is judged.
 *
 * NOTE(review): the loop reads and writes `stageNumber`, `status`, `startedAt`,
 * `completedAt` and `healthResult` on each element of `rolloutStrategy.stages`,
 * but the RolloutStrategy stage shape does not declare those fields — confirm
 * whether persisted CanaryStage records should be iterated instead.
 */
class CanaryController {
  /**
   * Executes every stage of the release's rollout strategy in order and
   * promotes the canary once all stages have passed.
   *
   * @param abRelease Release to roll out; must contain at least two variations.
   * @throws CanaryFailedError when a stage fails its health check. The release
   *         is rolled back first unless `rolloutStrategy.rollbackOnFailure` is
   *         explicitly `false`.
   * @throws Error when the release has fewer than two variations.
   */
  async executeRollout(abRelease: ABRelease): Promise<void> {
    const { baseline, canary } = this.resolveVariations(abRelease);
    const stages = abRelease.rolloutStrategy.stages;
    // Only an explicit `false` opts out, so strategies that omit the flag keep
    // the previous always-roll-back behavior.
    const rollbackOnFailure = abRelease.rolloutStrategy.rollbackOnFailure !== false;

    for (const stage of stages) {
      this.log(`Starting canary stage ${stage.stageNumber}: ${stage.trafficPercentage}%`);

      // 1. Shift traffic to canary percentage
      await this.trafficRouter.shiftTraffic(
        baseline.name,
        canary.name,
        stage.trafficPercentage
      );

      // 2. Update stage status
      stage.status = "running";
      stage.startedAt = new Date();
      await this.save(stage);

      // 3. Wait for stage duration
      await this.waitForDuration(stage.durationSeconds);

      // 4. Evaluate health
      const healthResult = await this.evaluateHealth(abRelease, stage);
      stage.healthResult = healthResult;

      if (!healthResult.healthy || healthResult.healthPercentage < stage.healthThreshold) {
        stage.status = "failed";
        // Record when the stage ended so failed stages are not left open-ended.
        stage.completedAt = new Date();
        await this.save(stage);

        if (rollbackOnFailure) {
          await this.rollback(abRelease);
        }
        throw new CanaryFailedError(`Stage ${stage.stageNumber} failed health check`);
      }

      // 5. Check if approval required
      if (stage.requireApproval) {
        await this.waitForApproval(abRelease, stage);
      }

      stage.status = "succeeded";
      stage.completedAt = new Date();
      await this.save(stage);

      // 6. Check for auto-advance
      if (!abRelease.rolloutStrategy.autoAdvance) {
        await this.waitForManualAdvance(abRelease);
      }
    }

    // All stages passed - promote canary to 100%
    await this.promote(abRelease, canary.name);
  }

  /**
   * Checks every target of the canary variation and aggregates the readings.
   *
   * An empty target list is reported as unhealthy: with no samples there is no
   * evidence the canary works, and the ratios below would otherwise be 0/0.
   *
   * @param abRelease Release whose canary variation is evaluated.
   * @param stage     Stage supplying the health threshold (a 0-100 percentage).
   * @returns Aggregated result; percentages are on a 0-100 scale and latencies
   *          are in the unit of the per-target `latencyMs` reading.
   */
  private async evaluateHealth(abRelease: ABRelease, stage: CanaryStage): Promise<HealthResult> {
    const { canary } = this.resolveVariations(abRelease);
    // NOTE(review): targetGroupId is null for router-based A/B; how getTargets
    // handles null is not visible here — confirm.
    const targets = await this.getTargets(canary.targetGroupId);
    const total = targets.length;

    if (total === 0) {
      return {
        healthy: false,
        healthPercentage: 0,
        metrics: { successRate: 0, errorRate: 0, latencyP50: 0, latencyP99: 0 },
        samples: 0,
        evaluatedAt: new Date(),
      };
    }

    let healthyCount = 0;
    let errorCount = 0;
    const latencies: number[] = [];

    for (const target of targets) {
      const health = await this.checkTargetHealth(target);
      if (health.healthy) healthyCount++;
      if (health.errorRate > 0) errorCount++;
      latencies.push(health.latencyMs);
    }

    // Numeric comparator is required: the default sort compares as strings.
    latencies.sort((a, b) => a - b);

    return {
      healthy: healthyCount >= total * (stage.healthThreshold / 100),
      healthPercentage: (healthyCount / total) * 100,
      metrics: {
        successRate: ((total - errorCount) / total) * 100,
        errorRate: (errorCount / total) * 100,
        latencyP50: CanaryController.percentile(latencies, 50),
        latencyP99: CanaryController.percentile(latencies, 99),
      },
      samples: total,
      evaluatedAt: new Date(),
    };
  }

  /** Returns the baseline and canary variations, failing fast if either is missing. */
  private resolveVariations(abRelease: ABRelease): {
    baseline: ABRelease["variations"][number];
    canary: ABRelease["variations"][number];
  } {
    const [baseline, canary] = abRelease.variations;
    if (!baseline || !canary) {
      throw new Error("Canary rollout requires a baseline and a canary variation");
    }
    return { baseline, canary };
  }

  /** Nearest-rank percentile of a non-empty, ascending-sorted list. */
  private static percentile(sortedAscending: number[], percentile: number): number {
    const rank = Math.ceil((percentile / 100) * sortedAscending.length);
    return sortedAscending[Math.max(rank, 1) - 1];
  }
}

Module: rollout-strategy

| Aspect | Specification |
|--------|---------------|
| Responsibility | Strategy templates; configuration |
| Data Entities | RolloutStrategyTemplate |

Built-in Strategy Templates:

| Template | Stages | Description |
|----------|--------|-------------|
| canary-10-25-50-100 | 4 | Standard canary: 10%, 25%, 50%, 100% |
| canary-1-5-10-50-100 | 5 | Conservative: 1%, 5%, 10%, 50%, 100% |
| blue-green-instant | 2 | Deploy 100% to green, instant switch |
| blue-green-gradual | 4 | Gradual shift: 25%, 50%, 75%, 100% |

Rollout Strategy Definition:

/**
 * A rollout plan: an ordered list of traffic stages plus the flags that
 * control how the rollout moves between them.
 */
interface RolloutStrategy {
  id: UUID;
  name: string;
  // Stages are executed in array order by CanaryController.
  stages: {
    trafficPercentage: number;
    durationSeconds: number;
    healthThreshold: number;
    requireApproval: boolean;
  }[];
  // When false, CanaryController waits for a manual advance after each stage.
  autoAdvance: boolean;
  // presumably whether a failed stage triggers a rollback — TODO confirm
  rollbackOnFailure: boolean;
  // NOTE(review): unit not stated (the example uses 30) — confirm.
  healthCheckInterval: number;
}

// Example: Standard Canary
// Four stages ramping the canary to 10%, 25%, 50% and 100% of traffic; the 50%
// stage requires manual approval before the rollout continues.
const standardCanary: RolloutStrategy = {
  // RolloutStrategy.id is required; this is a placeholder value for the example.
  id: "00000000-0000-0000-0000-000000000001",
  name: "canary-10-25-50-100",
  stages: [
    { trafficPercentage: 10, durationSeconds: 300, healthThreshold: 95, requireApproval: false },
    { trafficPercentage: 25, durationSeconds: 600, healthThreshold: 95, requireApproval: false },
    { trafficPercentage: 50, durationSeconds: 900, healthThreshold: 95, requireApproval: true },
    { trafficPercentage: 100, durationSeconds: 0, healthThreshold: 95, requireApproval: false },
  ],
  autoAdvance: true,
  rollbackOnFailure: true,
  healthCheckInterval: 30,
};

Database Schema

-- A/B Releases
-- One row per A/B release. Variations, traffic split and rollout strategy are
-- stored as JSONB documents rather than in separate tables.
CREATE TABLE release.ab_releases (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    -- Deleting a tenant removes its A/B releases.
    tenant_id UUID NOT NULL REFERENCES tenants(id) ON DELETE CASCADE,
    environment_id UUID NOT NULL REFERENCES release.environments(id),
    name VARCHAR(255) NOT NULL,
    variations JSONB NOT NULL,            -- [{name, releaseId, targetGroupId, trafficPercentage}]
    active_variation VARCHAR(50) NOT NULL DEFAULT 'A',
    traffic_split JSONB NOT NULL,
    rollout_strategy JSONB NOT NULL,
    -- Allowed values match the ABReleaseStatus type.
    status VARCHAR(50) NOT NULL DEFAULT 'created' CHECK (status IN (
        'created', 'deploying', 'running', 'promoting', 'completed', 'rolled_back'
    )),
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    completed_at TIMESTAMPTZ,
    -- NOTE(review): nullable here, while ABRelease.createdBy is declared
    -- non-null — confirm which is intended.
    created_by UUID REFERENCES users(id)
);

-- Composite index on (tenant_id, environment_id); presumably backs the
-- per-environment listing query — verify against the access patterns.
CREATE INDEX idx_ab_releases_tenant_env ON release.ab_releases(tenant_id, environment_id);
CREATE INDEX idx_ab_releases_status ON release.ab_releases(status);

-- Canary Stages
-- One row per rollout stage of an A/B release; rows are removed together with
-- their parent release. Range checks keep percentages within 0-100 and
-- durations non-negative. A CHECK passes when its column is NULL, so the
-- nullable columns remain optional.
CREATE TABLE release.canary_stages (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    ab_release_id UUID NOT NULL REFERENCES release.ab_releases(id) ON DELETE CASCADE,
    stage_number INTEGER NOT NULL CHECK (stage_number >= 0),
    traffic_percentage INTEGER NOT NULL CHECK (traffic_percentage BETWEEN 0 AND 100),
    -- Allowed values match the CanaryStageStatus type.
    status VARCHAR(50) NOT NULL DEFAULT 'pending' CHECK (status IN (
        'pending', 'running', 'succeeded', 'failed', 'skipped'
    )),
    -- DECIMAL(5,2) alone would admit values up to 999.99.
    health_threshold DECIMAL(5,2) CHECK (health_threshold BETWEEN 0 AND 100),
    duration_seconds INTEGER CHECK (duration_seconds >= 0),
    require_approval BOOLEAN NOT NULL DEFAULT FALSE,
    started_at TIMESTAMPTZ,
    completed_at TIMESTAMPTZ,
    health_result JSONB,
    -- A release cannot have two stages with the same number.
    UNIQUE (ab_release_id, stage_number)
);

API Endpoints

# A/B Releases
POST   /api/v1/ab-releases
       Body: {
         environmentId: UUID,
         name: string,
         variations: [
           { name: "A", releaseId: UUID, targetGroupId?: UUID },
           { name: "B", releaseId: UUID, targetGroupId?: UUID }
         ],
         trafficSplit: TrafficSplit,
         rolloutStrategy: RolloutStrategy
       }
       Response: ABRelease

GET    /api/v1/ab-releases
       Query: ?environmentId={uuid}&status={status}
       Response: ABRelease[]

GET    /api/v1/ab-releases/{id}
       Response: ABRelease (with stages)

POST   /api/v1/ab-releases/{id}/start
       Response: ABRelease

POST   /api/v1/ab-releases/{id}/advance
       Body: { stageNumber?: number }  # advance to next or specific stage
       Response: ABRelease

POST   /api/v1/ab-releases/{id}/promote
       Body: { variation: "A" | "B" }  # promote to 100%
       Response: ABRelease

POST   /api/v1/ab-releases/{id}/rollback
       Response: ABRelease

GET    /api/v1/ab-releases/{id}/traffic
       Response: { currentSplit: TrafficDistribution, history: TrafficHistory[] }

GET    /api/v1/ab-releases/{id}/health
       Response: { variations: [{ name, healthStatus, metrics }] }

# Rollout Strategies
GET    /api/v1/rollout-strategies
       Response: RolloutStrategyTemplate[]

GET    /api/v1/rollout-strategies/{id}
       Response: RolloutStrategyTemplate

References