release orchestrator pivot, architecture and planning
This commit is contained in:
@@ -0,0 +1,471 @@
|
||||
# PROGDL: Progressive Delivery
|
||||
|
||||
**Purpose**: A/B releases, canary deployments, and traffic management.
|
||||
|
||||
## Architecture Overview
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────────────────┐
|
||||
│ PROGRESSIVE DELIVERY ARCHITECTURE │
|
||||
│ │
|
||||
│ ┌─────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ A/B RELEASE MANAGER │ │
|
||||
│ │ │ │
|
||||
│ │ - Create A/B release with variations │ │
|
||||
│ │ - Manage traffic split configuration │ │
|
||||
│ │ - Coordinate rollout stages │ │
|
||||
│ │ - Handle promotion/rollback │ │
|
||||
│ └──────────────────────────────┬──────────────────────────────────────┘ │
|
||||
│ │ │
|
||||
│ ┌──────────────────┴──────────────────┐ │
|
||||
│ │ │ │
|
||||
│ ▼ ▼ │
|
||||
│ ┌───────────────────────┐ ┌───────────────────────┐ │
|
||||
│ │ TARGET-GROUP A/B │ │ ROUTER-BASED A/B │ │
|
||||
│ │ │ │ │ │
|
||||
│ │ Deploy to groups │ │ Configure traffic │ │
|
||||
│ │ by labels/membership │ │ via load balancer │ │
|
||||
│ │ │ │ │ │
|
||||
│ │ Good for: │ │ Good for: │ │
|
||||
│ │ - Background workers │ │ - Web/API traffic │ │
|
||||
│ │ - Batch processors │ │ - Customer-facing │ │
|
||||
│ │ - Internal services │ │ - L7 routing │ │
|
||||
│ └───────────────────────┘ └───────────────────────┘ │
|
||||
│ │
|
||||
│ ┌─────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ CANARY CONTROLLER │ │
|
||||
│ │ │ │
|
||||
│ │ - Execute rollout stages │ │
|
||||
│ │ - Monitor health metrics │ │
|
||||
│ │ - Auto-advance or pause │ │
|
||||
│ │ - Trigger rollback on failure │ │
|
||||
│ └─────────────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ ┌─────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ TRAFFIC ROUTER INTEGRATION │ │
|
||||
│ │ │ │
|
||||
│ │ Plugin-based integration with: │ │
|
||||
│ │ - Nginx (config generation + reload) │ │
|
||||
│ │ - HAProxy (config generation + reload) │ │
|
||||
│ │ - Traefik (dynamic config API) │ │
|
||||
│ │ - AWS ALB (target group weights) │ │
|
||||
│ │ - Custom (webhook) │ │
|
||||
│ └─────────────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
└─────────────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
## Modules
|
||||
|
||||
### Module: `ab-manager`
|
||||
|
||||
| Aspect | Specification |
|
||||
|--------|---------------|
|
||||
| **Responsibility** | A/B release lifecycle; variation management |
|
||||
| **Dependencies** | `release-manager`, `environment-manager`, `deploy-orchestrator` |
|
||||
| **Data Entities** | `ABRelease`, `Variation`, `TrafficSplit` |
|
||||
| **Events Produced** | `ab.created`, `ab.started`, `ab.stage_advanced`, `ab.promoted`, `ab.rolled_back` |
|
||||
|
||||
**A/B Release Entity**:
|
||||
```typescript
|
||||
/**
 * A/B release entity managed by the `ab-manager` module.
 */
interface ABRelease {
  id: UUID;
  tenantId: UUID;
  environmentId: UUID;
  name: string;
  variations: Variation[];
  /** Name of the active variation: "A" or "B". */
  activeVariation: string;
  trafficSplit: TrafficSplit;
  rolloutStrategy: RolloutStrategy;
  status: ABReleaseStatus;
  createdAt: DateTime;
  completedAt: DateTime | null;
  createdBy: UUID;
}
|
||||
|
||||
/**
 * One side of an A/B release.
 */
interface Variation {
  /** Variation label, e.g. "A" or "B". */
  name: string;
  releaseId: UUID;
  /** Set for target-group based A/B; `null` otherwise. */
  targetGroupId: UUID | null;
  trafficPercentage: number;
  deploymentJobId: UUID | null;
}
|
||||
|
||||
/**
 * How traffic is divided between the variations of an A/B release.
 */
interface TrafficSplit {
  type: "percentage" | "sticky" | "header";
  /** Share per variation name, e.g. `{"A": 90, "B": 10}`. */
  percentages: Record<string, number>;
  /** Cookie or header name. */
  stickyKey?: string;
  /** Used for header-based routing. */
  headerMatch?: {
    header: string;
    /** Maps a header value to a variation name. */
    values: Record<string, string>;
  };
}
|
||||
|
||||
/** Lifecycle states of an A/B release. */
type ABReleaseStatus =
  // Configured, not started
  | "created"
  // Deploying variations
  | "deploying"
  // Active with traffic split
  | "running"
  // Promoting winner to 100%
  | "promoting"
  // Successfully completed
  | "completed"
  // Rolled back to original
  | "rolled_back";
|
||||
```
|
||||
|
||||
**A/B Release Models**:
|
||||
|
||||
| Model | Description | Use Case |
|
||||
|-------|-------------|----------|
|
||||
| **Target-Group A/B** | Deploy different releases to different target groups | Background workers, internal services |
|
||||
| **Router-Based A/B** | Use load balancer to split traffic | Web/API traffic, customer-facing |
|
||||
| **Hybrid A/B** | Combines target-group and router-based A/B in a single release | Complex deployments |
|
||||
|
||||
---
|
||||
|
||||
### Module: `traffic-router`
|
||||
|
||||
| Aspect | Specification |
|
||||
|--------|---------------|
|
||||
| **Responsibility** | Router plugin orchestration; traffic shifting |
|
||||
| **Dependencies** | `integration-manager`, `connector-runtime` |
|
||||
| **Protocol** | Plugin-specific (API calls, config generation) |
|
||||
|
||||
**Router Connector Interface**:
|
||||
```typescript
|
||||
/**
 * Contract implemented by every router plugin used by `traffic-router`.
 */
interface RouterConnector extends BaseConnector {
  // --- Traffic management ---

  configureRoute(config: RouteConfig): Promise<void>;

  getTrafficDistribution(): Promise<TrafficDistribution>;

  /**
   * `CanaryController` calls this with the baseline variation name as `from`,
   * the canary variation name as `to`, and the stage's traffic percentage.
   */
  shiftTraffic(from: string, to: string, percentage: number): Promise<void>;

  // --- Configuration ---

  reloadConfig(): Promise<void>;

  validateConfig(config: string): Promise<ValidationResult>;
}
|
||||
|
||||
/**
 * Route definition passed to `RouterConnector.configureRoute`.
 */
interface RouteConfig {
  upstream: string;
  /** Weighted backends behind the upstream. */
  backends: {
    name: string;
    targets: string[];
    weight: number;
  }[];
  /** Optional health probe; units of `interval` and `timeout` are not specified here — TODO confirm. */
  healthCheck?: {
    path: string;
    interval: number;
    timeout: number;
  };
}
|
||||
|
||||
/**
 * Snapshot returned by `RouterConnector.getTrafficDistribution`.
 */
interface TrafficDistribution {
  /** Per-backend weight together with healthy and total target counts. */
  backends: {
    name: string;
    weight: number;
    healthyTargets: number;
    totalTargets: number;
  }[];
  timestamp: DateTime;
}
|
||||
```
|
||||
|
||||
**Router Plugins**:
|
||||
|
||||
| Plugin | Capabilities |
|
||||
|--------|-------------|
|
||||
| `router.nginx` | Config generation, reload via signal/API |
|
||||
| `router.haproxy` | Config generation, reload via socket |
|
||||
| `router.traefik` | Dynamic config API |
|
||||
| `router.aws_alb` | Target group weights via AWS API |
|
||||
| `router.custom` | Webhook-based custom integration |
|
||||
|
||||
---
|
||||
|
||||
### Module: `canary-controller`
|
||||
|
||||
| Aspect | Specification |
|
||||
|--------|---------------|
|
||||
| **Responsibility** | Canary ramp automation; health monitoring |
|
||||
| **Dependencies** | `ab-manager`, `traffic-router` |
|
||||
| **Data Entities** | `CanaryStage`, `HealthResult` |
|
||||
| **Events Produced** | `canary.stage_started`, `canary.stage_passed`, `canary.stage_failed` |
|
||||
|
||||
**Canary Stage Entity**:
|
||||
```typescript
|
||||
/**
 * One step of a canary ramp, tracked per A/B release by `canary-controller`.
 */
interface CanaryStage {
  id: UUID;
  abReleaseId: UUID;
  stageNumber: number;
  trafficPercentage: number;
  status: CanaryStageStatus;
  /** Required health % to pass. */
  healthThreshold: number;
  /** How long to run the stage. */
  durationSeconds: number;
  /** Require manual approval. */
  requireApproval: boolean;
  startedAt: DateTime | null;
  completedAt: DateTime | null;
  healthResult: HealthResult | null;
}
|
||||
|
||||
/** States a canary stage can be in. */
type CanaryStageStatus = "pending" | "running" | "succeeded" | "failed" | "skipped";
|
||||
|
||||
/**
 * Outcome of a canary stage health evaluation.
 */
interface HealthResult {
  healthy: boolean;
  /** Share of healthy targets, expressed as 0–100. */
  healthPercentage: number;
  metrics: {
    successRate: number;
    errorRate: number;
    latencyP50: number;
    latencyP99: number;
  };
  /** Number of targets sampled. */
  samples: number;
  evaluatedAt: DateTime;
}
|
||||
```
|
||||
|
||||
**Canary Rollout Execution**:
|
||||
```typescript
|
||||
/**
 * Automates a canary ramp for an A/B release: shifts traffic stage by stage,
 * evaluates health after each stage, and then promotes or rolls back.
 *
 * Collaborators reached through `this` (`trafficRouter`, `log`, `save`,
 * `waitForDuration`, `waitForApproval`, `waitForManualAdvance`, `rollback`,
 * `promote`, `getTargets`, `checkTargetHealth`) are not declared in this
 * excerpt and must be supplied by the full implementation.
 */
class CanaryController {
  /**
   * Runs every stage of `abRelease.rolloutStrategy` in order and promotes the
   * canary variation once all stages have passed.
   *
   * NOTE(review): each stage is read and mutated as a `CanaryStage`
   * (`stageNumber`, `status`, `startedAt`, ...), while `RolloutStrategy.stages`
   * as specified in this document only carries the four configuration fields.
   * Confirm that stages are materialized as `CanaryStage` records before this runs.
   *
   * @param abRelease Release whose first variation is the baseline and whose
   *   second variation is the canary.
   * @throws CanaryFailedError when a stage fails its health check.
   * @throws Error when the release has fewer than two variations.
   */
  async executeRollout(abRelease: ABRelease): Promise<void> {
    const { baseline, canary } = this.resolveVariations(abRelease);
    const strategy = abRelease.rolloutStrategy;

    for (const stage of strategy.stages) {
      this.log(`Starting canary stage ${stage.stageNumber}: ${stage.trafficPercentage}%`);

      // 1. Shift traffic to canary percentage
      await this.trafficRouter.shiftTraffic(baseline.name, canary.name, stage.trafficPercentage);

      // 2. Update stage status
      stage.status = "running";
      stage.startedAt = new Date();
      await this.save(stage);

      // 3. Wait for stage duration
      await this.waitForDuration(stage.durationSeconds);

      // 4. Evaluate health
      const healthResult = await this.evaluateHealth(abRelease, stage);
      stage.healthResult = healthResult;

      if (!healthResult.healthy || healthResult.healthPercentage < stage.healthThreshold) {
        stage.status = "failed";
        // A failed stage is finished too; record when it ended.
        stage.completedAt = new Date();
        await this.save(stage);

        // Honor the strategy flag. Roll back unless it is explicitly disabled,
        // so a strategy without the flag keeps the previous always-rollback behavior.
        if (strategy.rollbackOnFailure !== false) {
          await this.rollback(abRelease);
        }
        throw new CanaryFailedError(`Stage ${stage.stageNumber} failed health check`);
      }

      // 5. Check if approval required
      if (stage.requireApproval) {
        await this.waitForApproval(abRelease, stage);
      }

      stage.status = "succeeded";
      stage.completedAt = new Date();
      await this.save(stage);

      // 6. Check for auto-advance
      if (!strategy.autoAdvance) {
        await this.waitForManualAdvance(abRelease);
      }
    }

    // All stages passed - promote canary to 100%
    await this.promote(abRelease, canary.name);
  }

  /**
   * Probes every target of the canary variation and aggregates the results.
   *
   * An empty target set is reported as unhealthy: with nothing to measure the
   * stage must not pass its health gate.
   *
   * @param abRelease Release being rolled out.
   * @param stage Stage whose `healthThreshold` (0–100) decides `healthy`.
   * @returns Aggregated health. Latency percentiles are nearest-rank values
   *   over the per-target `latencyMs` readings.
   */
  private async evaluateHealth(abRelease: ABRelease, stage: CanaryStage): Promise<HealthResult> {
    // Use the same canary as executeRollout so health is measured on the
    // variation that actually receives the shifted traffic.
    const { canary } = this.resolveVariations(abRelease);

    // NOTE(review): targetGroupId is null for router-based A/B; it is passed
    // through unchanged — confirm how getTargets resolves targets in that case.
    const targets = await this.getTargets(canary.targetGroupId);
    const total = targets.length;
    const evaluatedAt = new Date();

    if (total === 0) {
      return {
        healthy: false,
        healthPercentage: 0,
        metrics: { successRate: 0, errorRate: 0, latencyP50: 0, latencyP99: 0 },
        samples: 0,
        evaluatedAt,
      };
    }

    // Start all probes before awaiting so they run concurrently.
    const pending = [];
    for (const target of targets) {
      pending.push(this.checkTargetHealth(target));
    }
    const checks = await Promise.all(pending);

    let healthyCount = 0;
    let errorCount = 0;
    const latencies: number[] = [];
    for (const check of checks) {
      if (check.healthy) healthyCount++;
      if (check.errorRate > 0) errorCount++;
      latencies.push(check.latencyMs);
    }
    latencies.sort((a, b) => a - b);

    // Multiply before dividing: (57 / 100) * 100 is 56.99999999999999 in
    // floating point, which would fail a stage sitting exactly on its threshold.
    return {
      healthy: healthyCount * 100 >= stage.healthThreshold * total,
      healthPercentage: (healthyCount * 100) / total,
      metrics: {
        successRate: ((total - errorCount) * 100) / total,
        errorRate: (errorCount * 100) / total,
        latencyP50: CanaryController.percentile(latencies, 50),
        latencyP99: CanaryController.percentile(latencies, 99),
      },
      samples: total,
      evaluatedAt,
    };
  }

  /** Returns the baseline (first) and canary (second) variation of the release. */
  private resolveVariations(abRelease: ABRelease): {
    baseline: ABRelease["variations"][number];
    canary: ABRelease["variations"][number];
  } {
    const [baseline, canary] = abRelease.variations;
    if (baseline === undefined || canary === undefined) {
      throw new Error("A/B release needs a baseline and a canary variation");
    }
    return { baseline, canary };
  }

  /** Nearest-rank percentile of an ascending-sorted list; 0 for an empty list. */
  private static percentile(sortedAscending: readonly number[], p: number): number {
    const rank = Math.ceil((p / 100) * sortedAscending.length);
    return sortedAscending[Math.max(rank, 1) - 1] ?? 0;
  }
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Module: `rollout-strategy`
|
||||
|
||||
| Aspect | Specification |
|
||||
|--------|---------------|
|
||||
| **Responsibility** | Strategy templates; configuration |
|
||||
| **Data Entities** | `RolloutStrategyTemplate` |
|
||||
|
||||
**Built-in Strategy Templates**:
|
||||
|
||||
| Template | Stages | Description |
|
||||
|----------|--------|-------------|
|
||||
| `canary-10-25-50-100` | 4 | Standard canary: 10%, 25%, 50%, 100% |
|
||||
| `canary-1-5-10-50-100` | 5 | Conservative: 1%, 5%, 10%, 50%, 100% |
|
||||
| `blue-green-instant` | 2 | Deploy 100% to green, instant switch |
|
||||
| `blue-green-gradual` | 4 | Gradual shift: 25%, 50%, 75%, 100% |
|
||||
|
||||
**Rollout Strategy Definition**:
|
||||
```typescript
|
||||
/**
 * Staged rollout plan attached to an A/B release.
 */
interface RolloutStrategy {
  id: UUID;
  name: string;
  /** Ordered stage configurations. */
  stages: {
    trafficPercentage: number;
    durationSeconds: number;
    healthThreshold: number;
    requireApproval: boolean;
  }[];
  autoAdvance: boolean;
  rollbackOnFailure: boolean;
  /** Unit is not stated here; presumably seconds — TODO confirm. */
  healthCheckInterval: number;
}
|
||||
|
||||
// Example: Standard Canary
|
||||
/**
 * Built-in `canary-10-25-50-100` template: four stages at 10%, 25%, 50% and
 * 100% traffic, with manual approval required at the 50% stage.
 *
 * Typed as `Omit<RolloutStrategy, "id">` because the literal carries no `id`;
 * annotating it as a full `RolloutStrategy` does not type-check.
 */
const standardCanary: Omit<RolloutStrategy, "id"> = {
  name: "canary-10-25-50-100",
  stages: [
    { trafficPercentage: 10, durationSeconds: 300, healthThreshold: 95, requireApproval: false },
    { trafficPercentage: 25, durationSeconds: 600, healthThreshold: 95, requireApproval: false },
    { trafficPercentage: 50, durationSeconds: 900, healthThreshold: 95, requireApproval: true },
    { trafficPercentage: 100, durationSeconds: 0, healthThreshold: 95, requireApproval: false },
  ],
  autoAdvance: true,
  rollbackOnFailure: true,
  healthCheckInterval: 30,
};
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Database Schema
|
||||
|
||||
```sql
|
||||
-- A/B Releases
|
||||
CREATE TABLE release.ab_releases (
    -- Identity and ownership
    id               UUID         PRIMARY KEY DEFAULT gen_random_uuid(),
    tenant_id        UUID         NOT NULL REFERENCES tenants(id) ON DELETE CASCADE,
    environment_id   UUID         NOT NULL REFERENCES release.environments(id),
    name             VARCHAR(255) NOT NULL,

    -- Configuration
    variations       JSONB        NOT NULL,  -- [{name, releaseId, targetGroupId, trafficPercentage}]
    active_variation VARCHAR(50)  NOT NULL DEFAULT 'A',
    traffic_split    JSONB        NOT NULL,
    rollout_strategy JSONB        NOT NULL,

    -- Lifecycle
    status           VARCHAR(50)  NOT NULL DEFAULT 'created' CHECK (status IN (
        'created', 'deploying', 'running', 'promoting', 'completed', 'rolled_back'
    )),
    created_at       TIMESTAMPTZ  NOT NULL DEFAULT NOW(),
    completed_at     TIMESTAMPTZ,
    created_by       UUID         REFERENCES users(id)
);

-- Lookup by tenant and environment, and by lifecycle status
CREATE INDEX idx_ab_releases_tenant_env ON release.ab_releases(tenant_id, environment_id);
CREATE INDEX idx_ab_releases_status ON release.ab_releases(status);
|
||||
|
||||
-- Canary Stages
|
||||
-- One row per canary stage of an A/B release; rows are removed with their release.
CREATE TABLE release.canary_stages (
    id                 UUID          PRIMARY KEY DEFAULT gen_random_uuid(),
    ab_release_id      UUID          NOT NULL REFERENCES release.ab_releases(id) ON DELETE CASCADE,
    stage_number       INTEGER       NOT NULL,
    -- Percentages are bounded so an invalid stage cannot be persisted.
    traffic_percentage INTEGER       NOT NULL CHECK (traffic_percentage BETWEEN 0 AND 100),
    status             VARCHAR(50)   NOT NULL DEFAULT 'pending' CHECK (status IN (
        'pending', 'running', 'succeeded', 'failed', 'skipped'
    )),
    -- NOT NULL to match CanaryStage, where both fields are required numbers.
    health_threshold   DECIMAL(5,2)  NOT NULL CHECK (health_threshold BETWEEN 0 AND 100),
    duration_seconds   INTEGER       NOT NULL CHECK (duration_seconds >= 0),
    require_approval   BOOLEAN       NOT NULL DEFAULT FALSE,
    started_at         TIMESTAMPTZ,
    completed_at       TIMESTAMPTZ,
    health_result      JSONB,
    UNIQUE (ab_release_id, stage_number)
);
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## API Endpoints
|
||||
|
||||
```yaml
|
||||
# A/B Releases
|
||||
POST /api/v1/ab-releases
|
||||
Body: {
|
||||
environmentId: UUID,
|
||||
name: string,
|
||||
variations: [
|
||||
{ name: "A", releaseId: UUID, targetGroupId?: UUID },
|
||||
{ name: "B", releaseId: UUID, targetGroupId?: UUID }
|
||||
],
|
||||
trafficSplit: TrafficSplit,
|
||||
rolloutStrategy: RolloutStrategy
|
||||
}
|
||||
Response: ABRelease
|
||||
|
||||
GET /api/v1/ab-releases
|
||||
Query: ?environmentId={uuid}&status={status}
|
||||
Response: ABRelease[]
|
||||
|
||||
GET /api/v1/ab-releases/{id}
|
||||
Response: ABRelease (with stages)
|
||||
|
||||
POST /api/v1/ab-releases/{id}/start
|
||||
Response: ABRelease
|
||||
|
||||
POST /api/v1/ab-releases/{id}/advance
|
||||
Body: { stageNumber?: number } # advance to next or specific stage
|
||||
Response: ABRelease
|
||||
|
||||
POST /api/v1/ab-releases/{id}/promote
|
||||
Body: { variation: "A" | "B" } # promote to 100%
|
||||
Response: ABRelease
|
||||
|
||||
POST /api/v1/ab-releases/{id}/rollback
|
||||
Response: ABRelease
|
||||
|
||||
GET /api/v1/ab-releases/{id}/traffic
|
||||
Response: { currentSplit: TrafficDistribution, history: TrafficHistory[] }
|
||||
|
||||
GET /api/v1/ab-releases/{id}/health
|
||||
Response: { variations: [{ name, healthStatus, metrics }] }
|
||||
|
||||
# Rollout Strategies
|
||||
GET /api/v1/rollout-strategies
|
||||
Response: RolloutStrategyTemplate[]
|
||||
|
||||
GET /api/v1/rollout-strategies/{id}
|
||||
Response: RolloutStrategyTemplate
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## References
|
||||
|
||||
- [Module Overview](overview.md)
|
||||
- [Deploy Orchestrator](deploy-orchestrator.md)
|
||||
- [A/B Releases](../progressive-delivery/ab-releases.md)
|
||||
- [Canary Controller](../progressive-delivery/canary.md)
|
||||
- [Router Plugins](../progressive-delivery/routers.md)
|
||||
Reference in New Issue
Block a user