
Deployment Strategies

Overview

Release Orchestrator supports multiple deployment strategies to balance deployment speed, risk, and availability requirements.

Strategy Comparison

  Strategy      Description                              Risk Level   Downtime   Rollback Speed
  All-at-once   Deploy to all targets simultaneously     High         Brief      Fast
  Rolling       Deploy to targets in batches             Medium       None       Medium
  Canary        Deploy to subset, then expand            Low          None       Fast
  Blue-Green    Deploy to parallel environment           Low          None       Instant

All-at-Once Strategy

Description

Deploys to all targets simultaneously. Simple and fast, but carries the highest risk.

           ALL-AT-ONCE DEPLOYMENT

  Time 0                    Time 1
  ┌─────────────────┐       ┌─────────────────┐
  │  Target 1 [v1]  │       │  Target 1 [v2]  │
  ├─────────────────┤       ├─────────────────┤
  │  Target 2 [v1]  │  ───► │  Target 2 [v2]  │
  ├─────────────────┤       ├─────────────────┤
  │  Target 3 [v1]  │       │  Target 3 [v2]  │
  └─────────────────┘       └─────────────────┘

Configuration

interface AllAtOnceConfig {
  strategy: "all-at-once";

  // Concurrency limit (0 = unlimited)
  maxConcurrent: number;

  // Health check after deployment
  healthCheck: HealthCheckConfig;

  // Failure behavior
  failureBehavior: "rollback" | "continue" | "pause";
}

// Example
const config: AllAtOnceConfig = {
  strategy: "all-at-once",
  maxConcurrent: 0,
  healthCheck: {
    type: "http",
    path: "/health",
    timeout: 30,
    retries: 3,
    interval: 10
  },
  failureBehavior: "rollback"
};
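
All strategy configurations on this page share a HealthCheckConfig. Its authoritative definition lives with the module's type reference; the sketch below is inferred only from the fields used in the examples here and should be read as illustrative.

// Sketch of HealthCheckConfig inferred from the examples on this page.
// The module's real definition may allow additional probe types and fields.
interface HealthCheckConfig {
  type: string;        // "http" in every example here; other probe types may exist
  path?: string;       // HTTP path to probe, e.g. "/health"
  timeout: number;     // Seconds before a single probe attempt fails
  retries: number;     // Probe attempts before the health check fails
  interval: number;    // Seconds between probe attempts
}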

Execution

class AllAtOnceExecutor {
  async execute(job: DeploymentJob, config: AllAtOnceConfig): Promise<void> {
    const tasks = job.tasks;
    const concurrency = config.maxConcurrent || tasks.length;

    // Execute all tasks with a concurrency limit (pMap as in the p-map package)
    const results = await pMap(
      tasks,
      async (task) => {
        try {
          await this.executeTask(task);
          return { taskId: task.id, success: true };
        } catch (error) {
          return { taskId: task.id, success: false, error };
        }
      },
      { concurrency }
    );

    // Check for failures
    const failures = results.filter(r => !r.success);

    if (failures.length > 0) {
      if (config.failureBehavior === "rollback") {
        await this.rollbackAll(job);
        throw new DeploymentFailedError(failures);
      } else if (config.failureBehavior === "pause") {
        job.status = "failed";
        throw new DeploymentFailedError(failures);
      }
      // "continue" - proceed despite failures
    }

    // Health check all targets
    await this.verifyAllTargets(job, config.healthCheck);
  }
}
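
The executor sketches in this document assume DeploymentJob and DeploymentTask shapes roughly like the following. Only the fields actually referenced in the code (id, tasks, status, target.address) are grounded in this page; the rest is placeholder.

// Minimal shapes assumed by the executor sketches on this page. Only the
// referenced fields (id, tasks, status, target.address) come from the text;
// everything else is illustrative.
interface DeploymentTarget {
  address: string;                 // Used by the blue-green router calls below
}

interface DeploymentTask {
  id: string;
  status: "pending" | "running" | "succeeded" | "failed";
  target: DeploymentTarget;
}

interface DeploymentJob {
  id: string;
  status: string;                  // e.g. "running", "failed"
  tasks: DeploymentTask[];
}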

Use Cases

  • Development environments
  • Small deployments
  • Time-critical updates
  • Stateless services with fast startup

Rolling Strategy

Description

Deploys to targets in configurable batches, maintaining availability throughout.

           ROLLING DEPLOYMENT (batch size: 1)

  Time 0            Time 1            Time 2            Time 3
  ┌─────────────┐   ┌─────────────┐   ┌─────────────┐   ┌─────────────┐
  │ T1 [v1]     │   │ T1 [v2] ✓   │   │ T1 [v2] ✓   │   │ T1 [v2] ✓   │
  ├─────────────┤   ├─────────────┤   ├─────────────┤   ├─────────────┤
  │ T2 [v1]     │──►│ T2 [v1]     │──►│ T2 [v2] ✓   │──►│ T2 [v2] ✓   │
  ├─────────────┤   ├─────────────┤   ├─────────────┤   ├─────────────┤
  │ T3 [v1]     │   │ T3 [v1]     │   │ T3 [v1]     │   │ T3 [v2] ✓   │
  └─────────────┘   └─────────────┘   └─────────────┘   └─────────────┘

Configuration

interface RollingConfig {
  strategy: "rolling";

  // Batch configuration
  batchSize: number;           // Targets per batch
  batchPercent?: number;       // Alternative: percentage of targets

  // Timing
  batchDelay: number;          // Seconds between batches
  stabilizationTime: number;   // Wait after health check passes

  // Health check
  healthCheck: HealthCheckConfig;

  // Failure handling
  maxFailedBatches: number;    // Failed batches tolerated before stopping
  failureBehavior: "rollback" | "pause" | "skip";

  // Ordering
  targetOrder: "default" | "shuffle" | "priority";
}

// Example
const config: RollingConfig = {
  strategy: "rolling",
  batchSize: 2,
  batchDelay: 30,
  stabilizationTime: 60,
  healthCheck: {
    type: "http",
    path: "/health",
    timeout: 30,
    retries: 5,
    interval: 10
  },
  maxFailedBatches: 1,
  failureBehavior: "rollback",
  targetOrder: "default"
};
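
When batchPercent is set it takes precedence over batchSize (see createBatches below): with 10 targets and batchPercent: 30, each batch holds Math.ceil(10 * 30 / 100) = 3 targets, so the rollout runs as four batches of 3, 3, 3, and 1.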

Execution

class RollingExecutor {
  async execute(job: DeploymentJob, config: RollingConfig): Promise<void> {
    const tasks = this.orderTasks(job.tasks, config.targetOrder);
    const batches = this.createBatches(tasks, config);
    let failedBatches = 0;
    const completedTasks: DeploymentTask[] = [];

    for (const [batchIndex, batch] of batches.entries()) {
      this.emitProgress(job, {
        phase: "deploying",
        currentBatch: batchIndex + 1,
        totalBatches: batches.length,
        completedTargets: completedTasks.length,
        totalTargets: tasks.length
      });

      // Execute batch, capturing per-task results so a single failure does
      // not reject the whole Promise.all
      const results = await Promise.all(
        batch.map(async (task) => {
          try {
            await this.executeTask(task);
            return { taskId: task.id, success: true };
          } catch (error) {
            return { taskId: task.id, success: false, error };
          }
        })
      );

      // Check batch results
      const failures = results.filter(r => !r.success);

      if (failures.length > 0) {
        failedBatches++;

        if (failedBatches > config.maxFailedBatches) {
          if (config.failureBehavior === "rollback") {
            await this.rollbackCompleted(completedTasks);
          }
          throw new DeploymentFailedError(failures);
        }

        if (config.failureBehavior === "pause") {
          job.status = "failed";
          throw new DeploymentFailedError(failures);
        }
        // "skip" - continue to next batch
      }

      // Health check batch targets
      await this.verifyBatch(batch, config.healthCheck);

      // Wait for stabilization
      if (config.stabilizationTime > 0) {
        await sleep(config.stabilizationTime * 1000);
      }

      completedTasks.push(...batch);

      // Wait before next batch
      if (batchIndex < batches.length - 1) {
        await sleep(config.batchDelay * 1000);
      }
    }
  }

  private createBatches(
    tasks: DeploymentTask[],
    config: RollingConfig
  ): DeploymentTask[][] {
    const batchSize = config.batchPercent
      ? Math.ceil(tasks.length * config.batchPercent / 100)
      : config.batchSize;

    const batches: DeploymentTask[][] = [];
    for (let i = 0; i < tasks.length; i += batchSize) {
      batches.push(tasks.slice(i, i + batchSize));
    }

    return batches;
  }
}
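
orderTasks is referenced above but not shown. Below is a plausible sketch for the three documented targetOrder values; the "priority" branch assumes an optional numeric priority on the target, which is not part of the documented types.

// Illustrative orderTasks sketch for targetOrder = "default" | "shuffle" | "priority".
// The priority field on the target is an assumption, not documented here.
type PrioritizedTarget = DeploymentTarget & { priority?: number };

function orderTasks(
  tasks: DeploymentTask[],
  order: "default" | "shuffle" | "priority"
): DeploymentTask[] {
  switch (order) {
    case "shuffle": {
      // Fisher-Yates shuffle on a copy; the input array is left untouched
      const shuffled = [...tasks];
      for (let i = shuffled.length - 1; i > 0; i--) {
        const j = Math.floor(Math.random() * (i + 1));
        [shuffled[i], shuffled[j]] = [shuffled[j], shuffled[i]];
      }
      return shuffled;
    }
    case "priority":
      // Lower priority value deploys first (assumed convention)
      return [...tasks].sort(
        (a, b) =>
          ((a.target as PrioritizedTarget).priority ?? 0) -
          ((b.target as PrioritizedTarget).priority ?? 0)
      );
    default:
      return tasks;
  }
}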

Use Cases

  • Production deployments
  • High-availability requirements
  • Large target counts
  • Services requiring gradual rollout

Canary Strategy

Description

Deploys to a small subset of targets first, validates, then expands to remaining targets.

           CANARY DEPLOYMENT

  Phase 1: Canary (10%)     Phase 2: Expand (50%)    Phase 3: Full (100%)

  ┌─────────────┐           ┌─────────────┐          ┌─────────────┐
  │ T1 [v2] ✓   │ ◄─canary  │ T1 [v2] ✓   │          │ T1 [v2] ✓   │
  ├─────────────┤           ├─────────────┤          ├─────────────┤
  │ T2 [v1]     │           │ T2 [v2] ✓   │          │ T2 [v2] ✓   │
  ├─────────────┤           ├─────────────┤          ├─────────────┤
  │ T3 [v1]     │           │ T3 [v2] ✓   │          │ T3 [v2] ✓   │
  ├─────────────┤           ├─────────────┤          ├─────────────┤
  │ T4 [v1]     │           │ T4 [v2] ✓   │          │ T4 [v2] ✓   │
  ├─────────────┤           ├─────────────┤          ├─────────────┤
  │ T5 [v1]     │           │ T5 [v1]     │          │ T5 [v2] ✓   │
  └─────────────┘           └─────────────┘          └─────────────┘

        │                         │                        │
        ▼                         ▼                        ▼
   Health Check              Health Check             Health Check
   Error Rate Check          Error Rate Check         Error Rate Check

Configuration

interface CanaryConfig {
  strategy: "canary";

  // Canary stages
  stages: CanaryStage[];

  // Canary selection
  canarySelector: "random" | "labeled" | "first";
  canaryLabel?: string;        // Label for canary targets

  // Automatic vs manual progression
  autoProgress: boolean;

  // Health and metrics checks
  healthCheck: HealthCheckConfig;
  metricsCheck?: MetricsCheckConfig;
}

interface CanaryStage {
  name: string;
  percentage: number;          // Target percentage
  duration: number;            // Minimum time at this stage (seconds)
  autoProgress: boolean;       // Auto-advance after duration
}

interface MetricsCheckConfig {
  integrationId: UUID;         // Metrics integration
  queries: MetricQuery[];
  failureThreshold: number;    // Percentage deviation to fail
}

interface MetricQuery {
  name: string;
  query: string;               // PromQL or similar
  operator: "lt" | "gt" | "eq";
  threshold: number;
}

// Example
const config: CanaryConfig = {
  strategy: "canary",
  stages: [
    { name: "canary", percentage: 10, duration: 300, autoProgress: false },
    { name: "expand", percentage: 50, duration: 300, autoProgress: true },
    { name: "full", percentage: 100, duration: 0, autoProgress: true }
  ],
  canarySelector: "labeled",
  canaryLabel: "canary=true",
  autoProgress: false,
  healthCheck: {
    type: "http",
    path: "/health",
    timeout: 30,
    retries: 5,
    interval: 10
  },
  metricsCheck: {
    integrationId: "prometheus-uuid",
    queries: [
      {
        name: "error_rate",
        query: "rate(http_requests_total{status=~\"5..\"}[5m]) / rate(http_requests_total[5m])",
        operator: "lt",
        threshold: 0.01  // Less than 1% error rate
      }
    ],
    failureThreshold: 10
  }
};
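
How canarySelector chooses the canary subset is not spelled out above. One plausible reading for the "labeled" selector used in this example, assuming targets expose a labels record such as { canary: "true" } (an assumption, not the documented schema), is to order canary-eligible targets first so the stage percentages slice them in before the rest:

// Illustrative ordering for canarySelector: "labeled". The labels record on
// the target is an assumed shape, not part of the documented types.
function orderForCanary(
  tasks: DeploymentTask[],
  canaryLabel: string
): DeploymentTask[] {
  const [key, value] = canaryLabel.split("=");
  const isCanary = (t: DeploymentTask) =>
    (t.target as DeploymentTarget & { labels?: Record<string, string> })
      .labels?.[key] === value;

  // Canary-labeled targets go first; stage percentages then take them first
  return [...tasks.filter(isCanary), ...tasks.filter(t => !isCanary(t))];
}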

Execution

class CanaryExecutor {
  async execute(job: DeploymentJob, config: CanaryConfig): Promise<void> {
    // Order tasks so canary-selected targets (per canarySelector/canaryLabel)
    // come first; each stage then slices from the front of this ordering
    const tasks = this.orderTasks(job.tasks, config);

    for (const stage of config.stages) {
      const targetCount = Math.ceil(tasks.length * stage.percentage / 100);
      const stageTasks = tasks.slice(0, targetCount);
      const newTasks = stageTasks.filter(t => t.status === "pending");

      this.emitProgress(job, {
        phase: "canary",
        stage: stage.name,
        percentage: stage.percentage,
        targets: stageTasks.length
      });

      // Deploy to new targets in this stage
      await Promise.all(newTasks.map(task => this.executeTask(task)));

      // Health check stage targets
      await this.verifyTargets(stageTasks, config.healthCheck);

      // Metrics check if configured
      if (config.metricsCheck) {
        await this.checkMetrics(stageTasks, config.metricsCheck);
      }

      // Wait for stage duration
      if (stage.duration > 0) {
        await this.waitWithMonitoring(
          stageTasks,
          stage.duration,
          config.metricsCheck
        );
      }

      // Wait for manual approval if not auto-progress
      if (!stage.autoProgress && stage.percentage < 100) {
        await this.waitForApproval(job, stage.name);
      }
    }
  }

  private async checkMetrics(
    targets: DeploymentTask[],
    config: MetricsCheckConfig
  ): Promise<void> {
    const metricsClient = await this.getMetricsClient(config.integrationId);

    for (const query of config.queries) {
      const result = await metricsClient.query(query.query);

      const passed = this.evaluateMetric(result, query);

      if (!passed) {
        throw new CanaryMetricsFailedError(query.name, result, query.threshold);
      }
    }
  }
}
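
evaluateMetric is referenced but not shown above. A minimal sketch, assuming the metrics client's result has already been reduced to a single number (real backends such as Prometheus return vectors or ranges that need reducing first):

// Minimal evaluateMetric sketch; assumes a single numeric result value.
function evaluateMetric(value: number, query: MetricQuery): boolean {
  switch (query.operator) {
    case "lt": return value < query.threshold;
    case "gt": return value > query.threshold;
    case "eq": return value === query.threshold;
    default:
      throw new Error(`Unknown operator: ${query.operator}`);
  }
}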

Use Cases

  • Risk-sensitive deployments
  • Services with real user traffic
  • Deployments with metrics-based validation
  • Gradual feature rollouts

Blue-Green Strategy

Description

Deploys to a parallel "green" environment while "blue" continues serving traffic, then switches traffic to green once it passes validation.

           BLUE-GREEN DEPLOYMENT

  Phase 1: Deploy Green         Phase 2: Switch Traffic

  ┌─────────────────────────┐   ┌─────────────────────────┐
  │  Load Balancer          │   │  Load Balancer          │
  │         │               │   │         │               │
  │         ▼               │   │         ▼               │
  │  ┌─────────────┐        │   │  ┌─────────────┐        │
  │  │ Blue [v1]   │◄─active│   │  │ Blue [v1]   │        │
  │  │ T1, T2, T3  │        │   │  │ T1, T2, T3  │        │
  │  └─────────────┘        │   │  └─────────────┘        │
  │                         │   │                         │
  │  ┌─────────────┐        │   │  ┌─────────────┐        │
  │  │ Green [v2]  │◄─deploy│   │  │ Green [v2]  │◄─active│
  │  │ T4, T5, T6  │        │   │  │ T4, T5, T6  │        │
  │  └─────────────┘        │   │  └─────────────┘        │
  │                         │   │                         │
  └─────────────────────────┘   └─────────────────────────┘

Configuration

interface BlueGreenConfig {
  strategy: "blue-green";

  // Environment labels
  blueLabel: string;           // Label for blue targets
  greenLabel: string;          // Label for green targets

  // Traffic routing
  routerIntegration: UUID;     // Router/LB integration
  routingConfig: RoutingConfig;

  // Validation
  healthCheck: HealthCheckConfig;
  warmupTime: number;          // Seconds to warm up green
  validationTests?: string[];  // Test suites to run

  // Switchover
  switchoverMode: "instant" | "gradual";
  gradualSteps?: number[];     // Percentage steps for gradual

  // Rollback
  keepBlueActive: number;      // Seconds to keep blue ready
}

// Example
const config: BlueGreenConfig = {
  strategy: "blue-green",
  blueLabel: "deployment=blue",
  greenLabel: "deployment=green",
  routerIntegration: "nginx-lb-uuid",
  routingConfig: {
    upstreamName: "myapp",
    healthEndpoint: "/health"
  },
  healthCheck: {
    type: "http",
    path: "/health",
    timeout: 30,
    retries: 5,
    interval: 10
  },
  warmupTime: 60,
  validationTests: ["smoke-test-suite"],
  switchoverMode: "instant",
  keepBlueActive: 1800  // 30 minutes
};
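
RoutingConfig is referenced above but not defined on this page. Based only on the fields used in the example, a minimal sketch might be:

// Sketch of RoutingConfig inferred from the example above; the module's real
// definition may carry additional router-specific fields.
interface RoutingConfig {
  upstreamName: string;        // Upstream/pool name in the router or load balancer
  healthEndpoint: string;      // Endpoint the router probes on backend servers
}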

Execution

class BlueGreenExecutor {
  async execute(job: DeploymentJob, config: BlueGreenConfig): Promise<void> {
    // Identify blue and green targets
    const { blue, green } = this.categorizeTargets(job.tasks, config);

    // Phase 1: Deploy to green
    this.emitProgress(job, { phase: "deploying-green" });

    await Promise.all(green.map(task => this.executeTask(task)));

    // Health check green targets
    await this.verifyTargets(green, config.healthCheck);

    // Warmup period
    if (config.warmupTime > 0) {
      this.emitProgress(job, { phase: "warming-up" });
      await sleep(config.warmupTime * 1000);
    }

    // Run validation tests
    if (config.validationTests?.length) {
      this.emitProgress(job, { phase: "validating" });
      await this.runValidationTests(green, config.validationTests);
    }

    // Phase 2: Switch traffic
    this.emitProgress(job, { phase: "switching-traffic" });

    if (config.switchoverMode === "instant") {
      await this.instantSwitchover(config, blue, green);
    } else {
      await this.gradualSwitchover(config, blue, green);
    }

    // Verify traffic routing
    await this.verifyRouting(green, config);

    // Schedule blue decommission
    if (config.keepBlueActive > 0) {
      this.scheduleBlueDecommission(blue, config.keepBlueActive);
    }
  }

  private async instantSwitchover(
    config: BlueGreenConfig,
    blue: DeploymentTask[],
    green: DeploymentTask[]
  ): Promise<void> {
    const router = await this.getRouter(config.routerIntegration);

    // Update upstream to green targets
    await router.updateUpstream(config.routingConfig.upstreamName, {
      servers: green.map(t => ({
        address: t.target.address,
        weight: 1
      }))
    });

    // Remove blue from rotation
    await router.removeServers(
      config.routingConfig.upstreamName,
      blue.map(t => t.target.address)
    );
  }

  private async gradualSwitchover(
    config: BlueGreenConfig,
    blue: DeploymentTask[],
    green: DeploymentTask[]
  ): Promise<void> {
    const router = await this.getRouter(config.routerIntegration);
    const steps = config.gradualSteps || [25, 50, 75, 100];

    for (const percentage of steps) {
      await router.setTrafficSplit(config.routingConfig.upstreamName, {
        blue: 100 - percentage,
        green: percentage
      });

      // Monitor for errors
      await this.monitorTraffic(30);
    }
  }
}
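
The "instant rollback" property comes from keeping blue intact for keepBlueActive seconds: rolling back is just pointing the router at blue again. The helper below is not part of the executor above; it is a sketch that reuses only the router calls already shown (updateUpstream, removeServers), with the router interface inferred from those calls.

// Hypothetical rollback helper. RouterClient is inferred from the calls used
// in BlueGreenExecutor above; it is not a documented interface.
interface RouterClient {
  updateUpstream(
    upstream: string,
    config: { servers: { address: string; weight: number }[] }
  ): Promise<void>;
  removeServers(upstream: string, addresses: string[]): Promise<void>;
}

async function rollbackToBlue(
  router: RouterClient,
  config: BlueGreenConfig,
  blue: DeploymentTask[],
  green: DeploymentTask[]
): Promise<void> {
  // Point the upstream back at the still-running blue targets
  await router.updateUpstream(config.routingConfig.upstreamName, {
    servers: blue.map(t => ({ address: t.target.address, weight: 1 }))
  });

  // Take green out of rotation while the failed release is investigated
  await router.removeServers(
    config.routingConfig.upstreamName,
    green.map(t => t.target.address)
  );
}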

Use Cases

  • Zero-downtime deployments
  • Database migration deployments
  • High-stakes production updates
  • Instant rollback requirements

Strategy Selection Guide

                    STRATEGY SELECTION

            START
              │
              ▼
  ┌────────────────────────┐  No
  │ Zero downtime needed?  ├──────►  All-at-once
  └───────────┬────────────┘
              │ Yes
              ▼
  ┌────────────────────────┐  Yes
  │ Metrics-based          ├──────►  Canary
  │ validation needed?     │
  └───────────┬────────────┘
              │ No
              ▼
  ┌────────────────────────┐  Yes
  │ Instant rollback       ├──────►  Blue-Green
  │ required?              │
  └───────────┬────────────┘
              │ No
              ▼
           Rolling

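The same decision tree expressed as a small helper. This is a sketch: the boolean inputs are requirements you derive from your own release policy, not fields of any configuration above.

// Sketch of the selection flowchart above; the inputs are assumptions you
// supply, not part of any strategy config.
type Strategy = "all-at-once" | "rolling" | "canary" | "blue-green";

function selectStrategy(requirements: {
  zeroDowntime: boolean;
  metricsValidation: boolean;
  instantRollback: boolean;
}): Strategy {
  if (!requirements.zeroDowntime) return "all-at-once";
  if (requirements.metricsValidation) return "canary";
  if (requirements.instantRollback) return "blue-green";
  return "rolling";
}

// Example: risk-sensitive production service with metrics-backed validation
// selectStrategy({ zeroDowntime: true, metricsValidation: true, instantRollback: false })
//   => "canary"
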
References