release orchestrator pivot, architecture and planning

This commit is contained in:
2026-01-10 22:37:22 +02:00
parent c84f421e2f
commit d509c44411
130 changed files with 70292 additions and 721 deletions

View File

@@ -0,0 +1,597 @@
# AGENTS: Deployment Agents
**Purpose**: Lightweight deployment agents for target execution.
## Agent Types
| Agent Type | Transport | Target Types |
|------------|-----------|--------------|
| `agent-docker` | gRPC | Docker hosts |
| `agent-compose` | gRPC | Docker Compose hosts |
| `agent-ssh` | SSH | Linux remote hosts |
| `agent-winrm` | WinRM | Windows remote hosts |
| `agent-ecs` | AWS API | AWS ECS services |
| `agent-nomad` | Nomad API | HashiCorp Nomad jobs |
## Modules
### Module: `agent-core`
| Aspect | Specification |
|--------|---------------|
| **Responsibility** | Shared agent runtime; task execution framework |
| **Protocol** | gRPC for communication with Stella Core |
| **Security** | mTLS authentication; short-lived JWT for tasks |
**Agent Lifecycle**:
1. Agent starts with registration token
2. Agent registers with capabilities and labels
3. Agent sends heartbeats (default: 30s interval)
4. Agent receives tasks from Stella Core
5. Agent reports task completion/failure
**Agent Task Protocol**:
```typescript
// Task assignment (Core → Agent)
/**
 * A unit of work dispatched by Stella Core to an agent.
 * Credentials arrive encrypted; the idempotency key lets an agent safely
 * ignore a redelivered copy of the same task.
 */
interface AgentTask {
  id: UUID;
  type: TaskType;
  targetId: UUID;
  payload: TaskPayload;
  credentials: EncryptedCredentials;
  timeout: number; // units not stated here — presumably seconds; TODO confirm
  priority: TaskPriority;
  idempotencyKey: string;
  assignedAt: DateTime;
  expiresAt: DateTime; // task must not start after this instant
}

/** Kinds of work an agent can be asked to perform. */
type TaskType =
  | "deploy"
  | "rollback"
  | "health-check"
  | "inspect"
  | "execute-command"
  | "upload-files"
  | "write-sticker"
  | "read-sticker";

/** Payload for a "deploy" task. */
interface DeployTaskPayload {
  image: string;
  digest: string; // content digest the agent must verify after pull
  config: DeployConfig;
  artifacts: ArtifactReference[];
  previousDigest?: string; // digest currently running, if known
  hooks: {
    preDeploy?: HookConfig;
    postDeploy?: HookConfig;
  };
}

// Task result (Agent → Core)
/**
 * Outcome report sent back to Stella Core when a task finishes.
 * Success details and failure details are mutually exclusive in practice.
 */
interface TaskResult {
  taskId: UUID;
  success: boolean;
  startedAt: DateTime;
  completedAt: DateTime;
  // Success details
  outputs?: Record<string, unknown>; // 'unknown' (not 'any') forces consumers to narrow
  artifacts?: ArtifactReference[];
  // Failure details
  error?: string;
  errorType?: string;
  retriable?: boolean; // hint to Core that rescheduling is safe
  // Logs
  logs: string;
  // Metrics
  metrics: {
    pullDurationMs?: number;
    deployDurationMs?: number;
    healthCheckDurationMs?: number;
  };
}
```
---
### Module: `agent-docker`
| Aspect | Specification |
|--------|---------------|
| **Responsibility** | Docker container deployment |
| **Dependencies** | Docker Engine API |
| **Capabilities** | `docker.deploy`, `docker.rollback`, `docker.inspect` |
**Docker Agent Implementation**:
```typescript
/**
 * Deploys and rolls back single containers on a Docker host.
 * Images are always referenced by digest (never tag) and the pulled digest
 * is re-verified so a tampered registry cannot substitute content.
 */
class DockerAgent implements TargetExecutor {
  private docker: Docker;

  /**
   * Deploy `image@digest` as `config.containerName`.
   *
   * Flow: pull + digest verification → pre-deploy hook → stop/rename any
   * existing container (kept for rollback) → create + start the new one →
   * optional health check (rolling back on failure) → post-deploy hook →
   * remove the old container.
   */
  async deploy(task: DeployTaskPayload): Promise<DeployResult> {
    const { image, digest, config, previousDigest } = task;
    const containerName = config.containerName;

    // 1. Pull image and verify digest
    this.log(`Pulling image ${image}@${digest}`);
    await this.docker.pull(image, { digest });
    const pulledDigest = await this.getImageDigest(image);
    if (pulledDigest !== digest) {
      throw new DigestMismatchError(
        `Expected digest ${digest}, got ${pulledDigest}. Possible tampering detected.`
      );
    }

    // 2. Run pre-deploy hook
    if (task.hooks?.preDeploy) {
      await this.runHook(task.hooks.preDeploy, "pre-deploy");
    }

    // 3. Stop and rename existing container (kept around for rollback)
    const existingContainer = await this.findContainer(containerName);
    if (existingContainer) {
      this.log(`Stopping existing container ${containerName}`);
      await existingContainer.stop({ t: 10 });
      await existingContainer.rename(`${containerName}-previous-${Date.now()}`);
    }

    // 4. Create new container
    this.log(`Creating container ${containerName} from ${image}@${digest}`);
    const container = await this.docker.createContainer({
      name: containerName,
      Image: `${image}@${digest}`, // Always use digest, not tag
      Env: this.buildEnvVars(config.environment),
      HostConfig: {
        PortBindings: this.buildPortBindings(config.ports),
        Binds: this.buildBindMounts(config.volumes),
        RestartPolicy: { Name: config.restartPolicy || "unless-stopped" },
        Memory: config.memoryLimit,
        CpuQuota: config.cpuLimit,
      },
      Labels: {
        "stella.release.id": config.releaseId,
        "stella.release.name": config.releaseName,
        "stella.digest": digest,
        "stella.deployed.at": new Date().toISOString(),
      },
    });

    // 5. Start container
    this.log(`Starting container ${containerName}`);
    await container.start();

    // 6. Wait for container to be healthy; restore the previous one on failure
    if (config.healthCheck) {
      this.log(`Waiting for container health check`);
      const healthy = await this.waitForHealthy(container, config.healthCheck.timeout);
      if (!healthy) {
        await this.rollbackContainer(containerName, existingContainer);
        throw new HealthCheckFailedError(`Container ${containerName} failed health check`);
      }
    }

    // 7. Run post-deploy hook
    if (task.hooks?.postDeploy) {
      await this.runHook(task.hooks.postDeploy, "post-deploy");
    }

    // 8. Cleanup previous container (opt out via config.cleanupPrevious === false)
    if (existingContainer && config.cleanupPrevious !== false) {
      this.log(`Removing previous container`);
      await existingContainer.remove({ force: true });
    }

    return {
      success: true,
      containerId: container.id,
      previousDigest: previousDigest,
    };
  }

  /**
   * Roll back `containerName`: either redeploy a specific digest, or
   * restore the `-previous-*` container kept by deploy().
   * @throws RollbackError when no previous container exists to restore.
   */
  async rollback(task: RollbackTaskPayload): Promise<DeployResult> {
    const { containerName, targetDigest } = task;
    if (targetDigest) {
      // Deploy specific digest
      return this.deploy({ ...task, digest: targetDigest });
    }
    // Find and restore previous container
    const previousContainer = await this.findContainer(`${containerName}-previous-*`);
    if (!previousContainer) {
      throw new RollbackError(`No previous container found for ${containerName}`);
    }
    const currentContainer = await this.findContainer(containerName);
    if (currentContainer) {
      await currentContainer.stop({ t: 10 });
      await currentContainer.rename(`${containerName}-failed-${Date.now()}`);
    }
    await previousContainer.rename(containerName);
    await previousContainer.start();
    return { success: true, containerId: previousContainer.id };
  }

  /**
   * Persist the version sticker either into a named volume (via a
   * throwaway alpine container) or onto the agent's local filesystem.
   */
  async writeSticker(sticker: VersionSticker): Promise<void> {
    const stickerPath = this.config.stickerPath || "/var/stella/version.json";
    const stickerContent = JSON.stringify(sticker, null, 2);
    if (this.config.stickerLocation === "volume") {
      // FIX: the sticker JSON was interpolated unescaped into a single-quoted
      // `sh -c` string — any single quote in the payload broke the command
      // and opened a shell-injection hole. POSIX-escape by closing and
      // reopening the single quotes.
      const shellSafe = stickerContent.replace(/'/g, `'\\''`);
      await this.docker.run("alpine", [
        "sh", "-c",
        `printf '%s\\n' '${shellSafe}' > ${stickerPath}`
      ], {
        HostConfig: { Binds: [`${this.config.stickerVolume}:/var/stella`] }
      });
    } else {
      // FIX: was fs.writeFileSync inside an async method, blocking the
      // agent's event loop; use the promise API and await it.
      await fs.promises.writeFile(stickerPath, stickerContent);
    }
  }
}
```
---
### Module: `agent-compose`
| Aspect | Specification |
|--------|---------------|
| **Responsibility** | Docker Compose stack deployment |
| **Dependencies** | Docker Compose CLI |
| **Capabilities** | `compose.deploy`, `compose.rollback`, `compose.inspect` |
**Compose Agent Implementation**:
```typescript
/**
 * Deploys a Docker Compose stack from a generated compose lock file.
 * Every service image must already be pinned to a digest; digests are
 * re-verified on the target before `compose up`.
 */
class ComposeAgent implements TargetExecutor {
  /** Deploy the stack described by the task's compose_lock artifact. */
  async deploy(task: DeployTaskPayload): Promise<DeployResult> {
    const { artifacts, config } = task;
    const deployDir = config.deploymentDirectory;

    // 1. Write compose lock file
    const composeLock = artifacts.find(a => a.type === "compose_lock");
    // FIX: find() returns undefined when the artifact is absent, which
    // previously surfaced as an opaque failure inside fetchArtifact();
    // fail fast with a clear error instead.
    if (!composeLock) {
      throw new Error("Deploy task has no compose_lock artifact");
    }
    const composeContent = await this.fetchArtifact(composeLock);
    const composePath = path.join(deployDir, "compose.stella.lock.yml");
    await fs.writeFile(composePath, composeContent);

    // 2. Run pre-deploy hook
    if (task.hooks?.preDeploy) {
      await this.runHook(task.hooks.preDeploy, deployDir);
    }

    // 3. Pull images
    this.log("Pulling images...");
    await this.runCompose(deployDir, ["pull"]);

    // 4. Verify digests
    await this.verifyDigests(composePath, config.expectedDigests);

    // 5. Deploy
    this.log("Deploying services...");
    await this.runCompose(deployDir, ["up", "-d", "--remove-orphans", "--force-recreate"]);

    // 6. Wait for services to be healthy; restore backup on failure
    if (config.healthCheck) {
      const healthy = await this.waitForServicesHealthy(deployDir, config.healthCheck.timeout);
      if (!healthy) {
        await this.rollbackToBackup(deployDir);
        throw new HealthCheckFailedError("Services failed health check");
      }
    }

    // 7. Run post-deploy hook
    if (task.hooks?.postDeploy) {
      await this.runHook(task.hooks.postDeploy, deployDir);
    }

    // 8. Write version sticker
    await this.writeSticker(config.sticker, deployDir);
    return { success: true };
  }

  /**
   * Verify that every expected service exists in the compose file, is
   * pinned to a digest, and that the digest matches what Core expects.
   * @throws DigestMismatchError on any digest mismatch.
   */
  private async verifyDigests(
    composePath: string,
    expectedDigests: Record<string, string>
  ): Promise<void> {
    const composeContent = yaml.parse(await fs.readFile(composePath, "utf-8"));
    for (const [service, expectedDigest] of Object.entries(expectedDigests)) {
      const serviceConfig = composeContent.services[service];
      if (!serviceConfig) {
        throw new Error(`Service ${service} not found in compose file`);
      }
      const image = serviceConfig.image;
      if (!image.includes("@sha256:")) {
        throw new Error(`Service ${service} image not pinned to digest: ${image}`);
      }
      const actualDigest = image.split("@")[1];
      if (actualDigest !== expectedDigest) {
        throw new DigestMismatchError(
          `Service ${service}: expected ${expectedDigest}, got ${actualDigest}`
        );
      }
    }
  }
}
```
---
### Module: `agent-ssh`
| Aspect | Specification |
|--------|---------------|
| **Responsibility** | SSH remote execution (agentless) |
| **Dependencies** | SSH client library |
| **Capabilities** | `ssh.deploy`, `ssh.execute`, `ssh.upload` |
**SSH Remote Executor**:
```typescript
/**
 * Agentless deployment over SSH: uploads artifacts into a deployment
 * directory, runs hooks and an optional deploy script, and falls back to
 * a backup-based rollback on health-check failure.
 */
class SSHRemoteExecutor implements TargetExecutor {
  /** Open the SSH session using the key referenced by the target config. */
  async connect(config: SSHConnectionConfig): Promise<void> {
    const privateKey = await this.secrets.getSecret(config.privateKeyRef);
    this.ssh = new SSHClient();
    await this.ssh.connect({
      host: config.host,
      port: config.port || 22,
      username: config.username,
      privateKey: privateKey.value,
      readyTimeout: config.connectionTimeout || 30000,
    });
  }

  /**
   * Deploy to the remote host. The SSH connection is always closed when
   * this returns, on success or failure.
   *
   * FIX: remote paths are now double-quoted in every shell command —
   * previously a deployment directory containing spaces or shell
   * metacharacters broke the commands and was a command-injection vector.
   */
  async deploy(task: DeployTaskPayload): Promise<DeployResult> {
    const { artifacts, config } = task;
    const deployDir = config.deploymentDirectory;
    try {
      // 1. Ensure deployment directory exists
      await this.exec(`mkdir -p "${deployDir}"`);
      await this.exec(`mkdir -p "${deployDir}/.stella-backup"`);

      // 2. Backup current deployment (the glob must stay outside the quotes)
      await this.exec(`cp -r "${deployDir}"/* "${deployDir}/.stella-backup/" 2>/dev/null || true`);

      // 3. Upload artifacts
      for (const artifact of artifacts) {
        const content = await this.fetchArtifact(artifact);
        const remotePath = path.join(deployDir, artifact.name);
        await this.uploadFile(content, remotePath);
      }

      // 4. Run pre-deploy hook
      if (task.hooks?.preDeploy) {
        await this.runRemoteHook(task.hooks.preDeploy, deployDir);
      }

      // 5. Execute deployment script, if one was shipped with the artifacts
      const deployScript = artifacts.find(a => a.type === "deploy_script");
      if (deployScript) {
        const scriptPath = path.join(deployDir, deployScript.name);
        await this.exec(`chmod +x "${scriptPath}"`);
        const result = await this.exec(`"${scriptPath}"`, { cwd: deployDir, timeout: config.deploymentTimeout });
        if (result.exitCode !== 0) {
          throw new DeploymentError(`Deploy script failed: ${result.stderr}`);
        }
      }

      // 6. Run post-deploy hook
      if (task.hooks?.postDeploy) {
        await this.runRemoteHook(task.hooks.postDeploy, deployDir);
      }

      // 7. Health check; restore via rollback before surfacing the failure
      if (config.healthCheck) {
        const healthy = await this.runHealthCheck(config.healthCheck);
        if (!healthy) {
          await this.rollback(task);
          throw new HealthCheckFailedError("Health check failed");
        }
      }

      // 8. Write version sticker
      await this.writeSticker(config.sticker, deployDir);

      // 9. Cleanup backup only after everything succeeded
      await this.exec(`rm -rf "${deployDir}/.stella-backup"`);
      return { success: true };
    } finally {
      this.ssh.end();
    }
  }
}
```
---
### Module: `agent-winrm`
| Aspect | Specification |
|--------|---------------|
| **Responsibility** | WinRM remote execution (agentless) |
| **Dependencies** | WinRM client library |
| **Capabilities** | `winrm.deploy`, `winrm.execute`, `winrm.upload` |
| **Authentication** | NTLM, Kerberos, Basic |
---
### Module: `agent-ecs`
| Aspect | Specification |
|--------|---------------|
| **Responsibility** | AWS ECS service deployment |
| **Dependencies** | AWS SDK |
| **Capabilities** | `ecs.deploy`, `ecs.rollback`, `ecs.inspect` |
---
### Module: `agent-nomad`
| Aspect | Specification |
|--------|---------------|
| **Responsibility** | HashiCorp Nomad job deployment |
| **Dependencies** | Nomad API client |
| **Capabilities** | `nomad.deploy`, `nomad.rollback`, `nomad.inspect` |
---
## Agent Security Model
### Registration Flow
```
┌─────────────────────────────────────────────────────────────────────────────┐
│ AGENT REGISTRATION FLOW │
│ │
│ 1. Admin generates registration token (one-time use) │
│ POST /api/v1/admin/agent-tokens │
│ → { token: "reg_xxx", expiresAt: "..." } │
│ │
│ 2. Agent starts with registration token │
│ ./stella-agent --register --token=reg_xxx │
│ │
│ 3. Agent requests mTLS certificate │
│ POST /api/v1/agents/register │
│ Headers: X-Registration-Token: reg_xxx │
│ Body: { name, version, capabilities, csr } │
│ → { agentId, certificate, caCertificate } │
│ │
│ 4. Agent establishes mTLS connection │
│ Uses issued certificate for all subsequent requests │
│ │
│ 5. Agent requests short-lived JWT for task execution │
│ POST /api/v1/agents/token (over mTLS) │
│ → { token, expiresIn: 3600 } // 1 hour │
│ │
│ 6. Agent refreshes token before expiration │
│ Token refresh only over mTLS connection │
│ │
└─────────────────────────────────────────────────────────────────────────────┘
```
### Communication Security
```
┌─────────────────────────────────────────────────────────────────────────────┐
│ AGENT COMMUNICATION SECURITY │
│ │
│ ┌──────────────┐ ┌──────────────┐ │
│ │ AGENT │ │ STELLA CORE │ │
│ └──────┬───────┘ └──────┬───────┘ │
│ │ │ │
│ │ mTLS (mutual TLS) │ │
│ │ - Agent cert signed by Stella CA │ │
│ │ - Server cert verified by Agent │ │
│ │ - TLS 1.3 only │ │
│ │ - Perfect forward secrecy │ │
│ │◄───────────────────────────────────────►│ │
│ │ │ │
│ │ Encrypted payload │ │
│ │ - Task payloads encrypted with │ │
│ │ agent-specific key │ │
│ │ - Logs encrypted in transit │ │
│ │◄───────────────────────────────────────►│ │
│ │ │ │
│ │ Heartbeat + capability refresh │ │
│ │ - Every 30 seconds │ │
│ │ - Signed with agent key │ │
│ │─────────────────────────────────────────►│ │
│ │ │ │
│ │ Task assignment │ │
│ │ - Contains short-lived credentials │ │
│ │ - Scoped to specific target │ │
│ │ - Expires after task timeout │ │
│ │◄─────────────────────────────────────────│ │
│ │ │ │
└─────────────────────────────────────────────────────────────────────────────┘
```
---
## Database Schema
```sql
-- Agents
-- One row per registered deployment agent, scoped to a tenant.
CREATE TABLE release.agents (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
tenant_id UUID NOT NULL REFERENCES tenants(id) ON DELETE CASCADE,
name VARCHAR(255) NOT NULL,
version VARCHAR(50) NOT NULL,
capabilities JSONB NOT NULL DEFAULT '[]', -- advertised capability list
labels JSONB NOT NULL DEFAULT '{}',
status VARCHAR(50) NOT NULL DEFAULT 'offline' CHECK (status IN (
'online', 'offline', 'degraded'
)),
last_heartbeat TIMESTAMPTZ, -- NULL until the first heartbeat arrives
resource_usage JSONB,
certificate_fingerprint VARCHAR(64), -- presumably SHA-256 hex of the agent's mTLS cert (64 chars); confirm
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
UNIQUE (tenant_id, name) -- agent names are unique per tenant
);
-- Indexes: tenant lookups, status filters, capability containment queries (GIN).
CREATE INDEX idx_agents_tenant ON release.agents(tenant_id);
CREATE INDEX idx_agents_status ON release.agents(status);
CREATE INDEX idx_agents_capabilities ON release.agents USING GIN (capabilities);
```
---
## API Endpoints
```yaml
# Agent Registration
POST /api/v1/agents/register
Headers: X-Registration-Token: {token}
Body: { name, version, capabilities, csr }
Response: { agentId, certificate, caCertificate }
# Agent Management
GET /api/v1/agents
Query: ?status={online|offline|degraded}&capability={type}
Response: Agent[]
GET /api/v1/agents/{id}
Response: Agent
PUT /api/v1/agents/{id}
Body: { labels?, capabilities? }
Response: Agent
DELETE /api/v1/agents/{id}
Response: { deleted: true }
# Agent Communication
POST /api/v1/agents/{id}/heartbeat
Body: { status, resourceUsage, capabilities }
Response: { tasks: AgentTask[] }
POST /api/v1/agents/{id}/tasks/{taskId}/complete
Body: { success, result, logs }
Response: { acknowledged: true }
# WebSocket for real-time task stream
WS /api/v1/agents/{id}/task-stream
Messages:
- { type: "task_assigned", task: AgentTask }
- { type: "task_cancelled", taskId }
```
---
## References
- [Module Overview](overview.md)
- [Deploy Orchestrator](deploy-orchestrator.md)
- [Agent Security](../security/agent-security.md)
- [API Documentation](../api/agents.md)

View File

@@ -0,0 +1,477 @@
# DEPLOY: Deployment Execution
**Purpose**: Orchestrate deployment jobs, execute on targets, manage rollbacks, and generate artifacts.
## Modules
### Module: `deploy-orchestrator`
| Aspect | Specification |
|--------|---------------|
| **Responsibility** | Deployment job coordination; strategy execution |
| **Dependencies** | `target-executor`, `artifact-generator`, `agent-manager` |
| **Data Entities** | `DeploymentJob`, `DeploymentTask` |
| **Events Produced** | `deployment.started`, `deployment.task_started`, `deployment.task_completed`, `deployment.completed`, `deployment.failed` |
**Deployment Job Entity**:
```typescript
/**
 * A deployment job: the execution of one promotion against one
 * environment. Jobs fan out into per-target DeploymentTasks.
 */
interface DeploymentJob {
  id: UUID;
  tenantId: UUID;
  promotionId: UUID;
  releaseId: UUID;
  environmentId: UUID;
  status: DeploymentStatus;
  strategy: DeploymentStrategy;
  // FIX: nullable — the DB columns are nullable TIMESTAMPTZ and a
  // 'pending' job has neither started nor completed.
  startedAt: DateTime | null;
  completedAt: DateTime | null;
  artifacts: GeneratedArtifact[];
  rollbackOf: UUID | null; // If this is a rollback job
  tasks: DeploymentTask[];
}

/** Job lifecycle states (mirrors the deployment_jobs CHECK constraint). */
type DeploymentStatus =
  | "pending" // Waiting to start
  | "running" // Deployment in progress
  | "succeeded" // All tasks succeeded
  | "failed" // One or more tasks failed
  | "cancelled" // User cancelled
  | "rolling_back" // Rollback in progress
  | "rolled_back"; // Rollback complete

/** Per-target unit of work within a DeploymentJob. */
interface DeploymentTask {
  id: UUID;
  jobId: UUID;
  targetId: UUID;
  digest: string; // image digest being deployed to this target
  status: TaskStatus;
  agentId: UUID | null;
  // FIX: nullable for the same reason as DeploymentJob.startedAt.
  startedAt: DateTime | null;
  completedAt: DateTime | null;
  exitCode: number | null;
  logs: string;
  previousDigest: string | null;
  stickerWritten: boolean;
}

/** Task lifecycle states (mirrors the deployment_tasks CHECK constraint). */
type TaskStatus =
  | "pending"
  | "running"
  | "succeeded"
  | "failed"
  | "cancelled"
  | "skipped";
```
---
### Module: `target-executor`
| Aspect | Specification |
|--------|---------------|
| **Responsibility** | Target-specific deployment logic |
| **Dependencies** | `agent-manager`, `connector-runtime` |
| **Protocol** | gRPC for agents, SSH/WinRM for agentless |
**Executor Types**:
| Type | Transport | Use Case |
|------|-----------|----------|
| `agent-docker` | gRPC | Docker hosts with agent |
| `agent-compose` | gRPC | Compose hosts with agent |
| `ssh-remote` | SSH | Agentless Linux hosts |
| `winrm-remote` | WinRM | Agentless Windows hosts |
| `ecs-api` | AWS API | AWS ECS services |
| `nomad-api` | Nomad API | HashiCorp Nomad jobs |
---
### Module: `runner-executor`
| Aspect | Specification |
|--------|---------------|
| **Responsibility** | Script/hook execution in sandbox |
| **Dependencies** | `plugin-sandbox` |
| **Supported Scripts** | C# (.csx), Bash, PowerShell |
**Hook Types**:
- `pre-deploy`: Run before deployment starts
- `post-deploy`: Run after deployment succeeds
- `on-failure`: Run when deployment fails
- `on-rollback`: Run during rollback
---
### Module: `artifact-generator`
| Aspect | Specification |
|--------|---------------|
| **Responsibility** | Generate immutable deployment artifacts |
| **Dependencies** | `release-manager`, `environment-manager` |
| **Data Entities** | `GeneratedArtifact`, `ComposeLock`, `VersionSticker` |
**Generated Artifacts**:
| Artifact Type | Description |
|---------------|-------------|
| `compose_lock` | `compose.stella.lock.yml` - Pinned digests |
| `script` | Compiled deployment script |
| `sticker` | `stella.version.json` - Version marker |
| `evidence` | Decision and execution evidence |
| `config` | Environment-specific config files |
**Compose Lock File Generation**:
```typescript
/**
 * Builds the immutable compose lock file (compose.stella.lock.yml) for a
 * release + environment: every service image is pinned by digest, and an
 * "x-stella" section records provenance for later verification.
 */
class ComposeLockGenerator {
  async generate(
    release: Release,
    environment: Environment,
    targets: Target[]
  ): Promise<GeneratedArtifact> {
    // One compose service per release component, keyed by component name.
    const serviceEntries = release.components.map((component) => {
      const serviceDef = {
        // CRITICAL: Always use digest, never tag
        image: `${component.imageRepository}@${component.digest}`,
        // Environment variables
        environment: this.mergeEnvironment(
          environment.config.variables,
          this.buildStellaEnv(release, environment)
        ),
        // Labels for Stella tracking
        labels: {
          "stella.release.id": release.id,
          "stella.release.name": release.name,
          "stella.component.name": component.componentName,
          "stella.component.digest": component.digest,
          "stella.environment": environment.name,
          "stella.deployed.at": new Date().toISOString(),
        },
      };
      return [component.componentName, serviceDef] as const;
    });

    const lockDocument = {
      version: "3.8",
      services: Object.fromEntries(serviceEntries),
      "x-stella": {
        release_id: release.id,
        release_name: release.name,
        environment: environment.name,
        generated_at: new Date().toISOString(),
        inputs_hash: this.computeInputsHash(release, environment),
        components: release.components.map((c) => ({
          name: c.componentName,
          digest: c.digest,
          semver: c.semver,
        })),
      },
    };

    // Serialize once; the artifact hash is taken over the exact bytes written.
    const serialized = yaml.stringify(lockDocument);
    const digestHex = crypto
      .createHash("sha256")
      .update(serialized)
      .digest("hex");

    return {
      type: "compose_lock",
      name: "compose.stella.lock.yml",
      content: Buffer.from(serialized),
      contentHash: `sha256:${digestHex}`,
    };
  }
}
```
**Version Sticker Generation**:
```typescript
/**
 * stella.version.json written onto a target after a successful deploy.
 * Records exactly what was deployed, by whom, and the evidence trail, so
 * a target can be audited (or rolled back) from the sticker alone.
 */
interface VersionSticker {
stella_version: "1.0"; // sticker schema version, not product version
release_id: UUID;
release_name: string;
components: Array<{
name: string;
digest: string; // immutable content digest actually deployed
semver: string;
tag: string; // informational only — the digest is authoritative
image_repository: string;
}>;
environment: string;
environment_id: UUID;
deployed_at: string; // presumably ISO-8601 UTC — TODO confirm convention
deployed_by: UUID;
promotion_id: UUID;
workflow_run_id: UUID;
evidence_packet_id: UUID;
evidence_packet_hash: string; // integrity check for the evidence packet
orchestrator_version: string;
source_ref?: { // VCS provenance, when known
commit_sha: string;
branch: string;
repository: string;
};
}
```
---
### Module: `rollback-manager`
| Aspect | Specification |
|--------|---------------|
| **Responsibility** | Rollback orchestration; previous state recovery |
| **Dependencies** | `deploy-orchestrator`, `target-registry` |
**Rollback Strategies**:
| Strategy | Description |
|----------|-------------|
| `to-previous` | Roll back to last successful deployment |
| `to-release` | Roll back to specific release ID |
| `to-sticker` | Roll back to version in sticker on target |
**Rollback Flow**:
1. Identify rollback target (previous release or specified)
2. Create rollback deployment job
3. Execute deployment with rollback artifacts
4. Update target state and sticker
5. Record rollback evidence
---
## Deployment Strategies
### All-at-Once
Deploy to all targets simultaneously.
```typescript
// Deploy to every target at once, bounded only by `parallelism`.
interface AllAtOnceConfig {
parallelism: number; // Max concurrent deployments (0 = unlimited)
continueOnFailure: boolean; // Continue if some targets fail
failureThreshold: number; // Max failures before abort
}
```
### Rolling
Deploy to targets sequentially with health checks.
```typescript
// Batch-by-batch rollout with optional health checks between batches.
interface RollingConfig {
batchSize: number; // Targets per batch
batchDelay: number; // Seconds between batches
healthCheckBetweenBatches: boolean;
rollbackOnFailure: boolean; // roll back already-deployed targets when a batch fails
maxUnavailable: number; // Max targets unavailable at once
}
```
### Canary
Deploy to subset, verify, then proceed.
```typescript
// Deploy to a small subset first, observe, then promote or abort.
interface CanaryConfig {
canaryTargets: number; // Number or percentage for canary — how the two are distinguished is not specified here; TODO confirm
canaryDuration: number; // Seconds to run canary
healthThreshold: number; // Required health percentage
autoPromote: boolean; // Auto-proceed if healthy
requireApproval: boolean; // Require manual approval
}
```
### Blue-Green
Deploy to B, switch traffic, retire A.
```typescript
// Deploy to group B (green), shift traffic from A (blue), then retire A.
interface BlueGreenConfig {
targetGroupA: UUID; // Current (blue) target group
targetGroupB: UUID; // New (green) target group
trafficShiftType: "instant" | "gradual";
gradualShiftSteps?: number[]; // e.g., [10, 25, 50, 100] — presumably percent of traffic per step; confirm
rollbackOnHealthFailure: boolean;
}
```
---
## Rolling Deployment Algorithm
```python
class RollingDeploymentExecutor:
    """Deploys a job to its targets in sequential batches.

    Each batch is deployed in parallel; optional health checks run between
    batches, and on failure every already-deployed target can be rolled
    back to the job's previous release.
    """

    def execute(self, job: DeploymentJob, config: RollingConfig) -> DeploymentResult:
        """Run the rolling deployment and return an aggregate result.

        On a batch failure (with rollback_on_failure set) or a failed
        inter-batch health check, all successfully deployed targets are
        rolled back and a failed DeploymentResult is returned early.
        Without rollback_on_failure, per-target failures are recorded and
        the rollout continues.
        """
        targets = self.get_targets(job.environment_id)
        batches = self.create_batches(targets, config.batch_size)
        deployed_targets = []
        failed_targets = []
        for batch_index, batch in enumerate(batches):
            self.log(f"Starting batch {batch_index + 1} of {len(batches)}")
            # Deploy batch in parallel
            batch_results = self.deploy_batch(job, batch)
            batch_deployed = []  # successes from THIS batch only
            for target, result in batch_results:
                if result.success:
                    batch_deployed.append(target)
                    deployed_targets.append(target)
                    # Write version sticker
                    self.write_sticker(target, job.release)
                else:
                    failed_targets.append(target)
                    if config.rollback_on_failure:
                        # Rollback all deployed targets
                        self.rollback_targets(deployed_targets, job.previous_release)
                        return DeploymentResult(
                            success=False,
                            error=f"Batch {batch_index + 1} failed, rolled back",
                            deployed=deployed_targets,
                            failed=failed_targets,
                            rolled_back=deployed_targets
                        )
            # Health check between batches
            if config.health_check_between_batches and batch_index < len(batches) - 1:
                # FIX: previously checked deployed_targets[-len(batch):], which
                # includes targets from an EARLIER batch whenever some targets
                # in the current batch failed; check only this batch's successes.
                health_result = self.check_batch_health(batch_deployed)
                if not health_result.healthy:
                    if config.rollback_on_failure:
                        self.rollback_targets(deployed_targets, job.previous_release)
                    return DeploymentResult(
                        success=False,
                        error=f"Health check failed after batch {batch_index + 1}",
                        deployed=deployed_targets,
                        failed=failed_targets,
                        rolled_back=deployed_targets
                    )
            # Delay between batches
            if config.batch_delay > 0 and batch_index < len(batches) - 1:
                time.sleep(config.batch_delay)
        return DeploymentResult(
            success=len(failed_targets) == 0,
            deployed=deployed_targets,
            failed=failed_targets
        )
```
---
## Database Schema
```sql
-- Deployment Jobs
-- One row per deployment job (a promotion executed against an environment).
CREATE TABLE release.deployment_jobs (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
tenant_id UUID NOT NULL REFERENCES tenants(id) ON DELETE CASCADE,
promotion_id UUID NOT NULL REFERENCES release.promotions(id),
release_id UUID NOT NULL REFERENCES release.releases(id),
environment_id UUID NOT NULL REFERENCES release.environments(id),
status VARCHAR(50) NOT NULL DEFAULT 'pending' CHECK (status IN (
'pending', 'running', 'succeeded', 'failed', 'cancelled', 'rolling_back', 'rolled_back'
)),
strategy VARCHAR(50) NOT NULL DEFAULT 'all-at-once',
started_at TIMESTAMPTZ, -- NULL while the job is still pending
completed_at TIMESTAMPTZ,
artifacts JSONB NOT NULL DEFAULT '[]',
rollback_of UUID REFERENCES release.deployment_jobs(id), -- set when this job is a rollback of another
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
CREATE INDEX idx_deployment_jobs_promotion ON release.deployment_jobs(promotion_id);
CREATE INDEX idx_deployment_jobs_status ON release.deployment_jobs(status);
-- Deployment Tasks
-- Per-target fan-out of a job; removed along with its parent job (CASCADE).
CREATE TABLE release.deployment_tasks (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
job_id UUID NOT NULL REFERENCES release.deployment_jobs(id) ON DELETE CASCADE,
target_id UUID NOT NULL REFERENCES release.targets(id),
digest VARCHAR(100) NOT NULL, -- image digest deployed to this target
status VARCHAR(50) NOT NULL DEFAULT 'pending' CHECK (status IN (
'pending', 'running', 'succeeded', 'failed', 'cancelled', 'skipped'
)),
agent_id UUID REFERENCES release.agents(id), -- NULL for agentless execution
started_at TIMESTAMPTZ,
completed_at TIMESTAMPTZ,
exit_code INTEGER,
logs TEXT,
previous_digest VARCHAR(100), -- what was running before, for rollback
sticker_written BOOLEAN NOT NULL DEFAULT FALSE,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
CREATE INDEX idx_deployment_tasks_job ON release.deployment_tasks(job_id);
CREATE INDEX idx_deployment_tasks_target ON release.deployment_tasks(target_id);
CREATE INDEX idx_deployment_tasks_status ON release.deployment_tasks(status);
-- Generated Artifacts
-- Immutable artifacts produced for a job; content lives inline (BYTEA) or
-- in external storage via storage_ref, per the column comments below.
CREATE TABLE release.generated_artifacts (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
tenant_id UUID NOT NULL REFERENCES tenants(id) ON DELETE CASCADE,
deployment_job_id UUID REFERENCES release.deployment_jobs(id) ON DELETE CASCADE,
artifact_type VARCHAR(50) NOT NULL CHECK (artifact_type IN (
'compose_lock', 'script', 'sticker', 'evidence', 'config'
)),
name VARCHAR(255) NOT NULL,
content_hash VARCHAR(100) NOT NULL,
content BYTEA, -- for small artifacts
storage_ref VARCHAR(500), -- for large artifacts (S3, etc.)
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
CREATE INDEX idx_generated_artifacts_job ON release.generated_artifacts(deployment_job_id);
```
---
## API Endpoints
```yaml
# Deployment Jobs (mostly read-only; created by promotions)
GET /api/v1/deployment-jobs
Query: ?promotionId={uuid}&status={status}&environmentId={uuid}
Response: DeploymentJob[]
GET /api/v1/deployment-jobs/{id}
Response: DeploymentJob (with tasks)
GET /api/v1/deployment-jobs/{id}/tasks
Response: DeploymentTask[]
GET /api/v1/deployment-jobs/{id}/tasks/{taskId}
Response: DeploymentTask (with logs)
GET /api/v1/deployment-jobs/{id}/tasks/{taskId}/logs
Query: ?follow=true
Response: string | SSE stream
GET /api/v1/deployment-jobs/{id}/artifacts
Response: GeneratedArtifact[]
GET /api/v1/deployment-jobs/{id}/artifacts/{artifactId}
Response: binary (download)
# Rollback
POST /api/v1/rollbacks
Body: {
environmentId: UUID,
strategy: "to-previous" | "to-release" | "to-sticker",
targetReleaseId?: UUID # for to-release strategy
}
Response: DeploymentJob (rollback job)
GET /api/v1/rollbacks
Query: ?environmentId={uuid}
Response: DeploymentJob[] (rollback jobs only)
```
---
## References
- [Module Overview](overview.md)
- [Agents Specification](agents.md)
- [Deployment Strategies](../deployment/strategies.md)
- [Artifact Generation](../deployment/artifacts.md)
- [API Documentation](../api/deployments.md)

View File

@@ -0,0 +1,418 @@
# ENVMGR: Environment & Inventory Manager
**Purpose**: Model environments, targets, agents, and their relationships.
## Modules
### Module: `environment-manager`
| Aspect | Specification |
|--------|---------------|
| **Responsibility** | Environment CRUD, ordering, configuration, freeze windows |
| **Dependencies** | `authority` |
| **Data Entities** | `Environment`, `EnvironmentConfig`, `FreezeWindow` |
| **Events Produced** | `environment.created`, `environment.updated`, `environment.freeze_started`, `environment.freeze_ended` |
**Key Operations**:
```
CreateEnvironment(name, displayName, orderIndex, config) → Environment
UpdateEnvironment(id, config) → Environment
DeleteEnvironment(id) → void
SetFreezeWindow(environmentId, start, end, reason, exceptions) → FreezeWindow
ClearFreezeWindow(environmentId, windowId) → void
ListEnvironments(tenantId) → Environment[]
GetEnvironmentState(id) → EnvironmentState
```
**Environment Entity**:
```typescript
/** A deployment stage (dev/stage/prod); orderIndex defines promotion order. */
interface Environment {
id: UUID;
tenantId: UUID;
name: string; // "dev", "stage", "prod"
displayName: string; // "Development"
orderIndex: number; // 0, 1, 2 for promotion order
config: EnvironmentConfig;
freezeWindows: FreezeWindow[];
requiredApprovals: number; // 0 for dev, 1+ for prod
requireSeparationOfDuties: boolean; // presumably approver must differ from requester — confirm exact rule
autoPromoteFrom: UUID | null; // auto-promote from this env
promotionPolicy: string; // OPA policy name
createdAt: DateTime;
updatedAt: DateTime;
}
/** Environment-scoped settings applied to every deployment into it. */
interface EnvironmentConfig {
variables: Record<string, string>; // env-specific variables
secrets: SecretReference[]; // vault references
registryOverrides: RegistryOverride[]; // per-env registry
agentLabels: string[]; // required agent labels
deploymentTimeout: number; // seconds
healthCheckConfig: HealthCheckConfig;
}
/** A time window during which promotions into the environment are blocked. */
interface FreezeWindow {
id: UUID;
start: DateTime;
end: DateTime;
reason: string;
createdBy: UUID;
exceptions: UUID[]; // users who can override
}
```
---
### Module: `target-registry`
| Aspect | Specification |
|--------|---------------|
| **Responsibility** | Deployment target inventory; capability tracking |
| **Dependencies** | `environment-manager`, `agent-manager` |
| **Data Entities** | `Target`, `TargetGroup`, `TargetCapability` |
| **Events Produced** | `target.created`, `target.updated`, `target.deleted`, `target.health_changed` |
**Target Types** (plugin-provided):
| Type | Description |
|------|-------------|
| `docker_host` | Single Docker host |
| `compose_host` | Docker Compose host |
| `ssh_remote` | Generic SSH target |
| `winrm_remote` | Windows remote target |
| `ecs_service` | AWS ECS service |
| `nomad_job` | HashiCorp Nomad job |
**Target Entity**:
```typescript
/** A single deployment destination within an environment. */
interface Target {
id: UUID;
tenantId: UUID;
environmentId: UUID;
name: string; // "prod-web-01"
targetType: string; // "docker_host"
connection: TargetConnection; // type-specific
capabilities: TargetCapability[];
labels: Record<string, string>; // for grouping
healthStatus: HealthStatus;
lastHealthCheck: DateTime;
deploymentDirectory: string; // where artifacts are placed
currentDigest: string | null; // what's currently deployed
agentId: UUID | null; // assigned agent
}
// NOTE(review): this is a grab-bag of optional fields keyed by targetType;
// a discriminated union per target type would make invalid combinations
// unrepresentable — consider for a future revision.
interface TargetConnection {
// Common fields
host: string;
port: number;
// Type-specific (examples)
// docker_host:
dockerSocket?: string;
tlsCert?: SecretReference;
// ssh_remote:
username?: string;
privateKey?: SecretReference;
// ecs_service:
cluster?: string;
service?: string;
region?: string;
roleArn?: string;
}
/** A named, label-addressed set of targets within an environment. */
interface TargetGroup {
id: UUID;
tenantId: UUID;
environmentId: UUID;
name: string;
labels: Record<string, string>;
createdAt: DateTime;
}
```
---
### Module: `agent-manager`
| Aspect | Specification |
|--------|---------------|
| **Responsibility** | Agent registration, heartbeat, capability advertisement |
| **Dependencies** | `authority` (for agent tokens) |
| **Data Entities** | `Agent`, `AgentCapability`, `AgentHeartbeat` |
| **Events Produced** | `agent.registered`, `agent.online`, `agent.offline`, `agent.capability_changed` |
**Agent Lifecycle**:
1. Agent starts with an admin-issued, one-time registration token (see Agent Registration Protocol below)
2. Agent registers with capabilities and labels
3. Agent sends heartbeats (default: 30s interval)
4. Agent pulls tasks from task queue
5. Agent reports task completion/failure
**Agent Entity**:
```typescript
// A registered deployment agent; status is derived from heartbeats.
interface Agent {
  id: UUID;
  tenantId: UUID;
  name: string;
  version: string;
  capabilities: AgentCapability[];
  labels: Record<string, string>;
  status: "online" | "offline" | "degraded";
  lastHeartbeat: DateTime;
  assignedTargets: UUID[]; // targets this agent services
  resourceUsage: ResourceUsage;
}
// One advertised capability (transport/runtime the agent can drive).
interface AgentCapability {
  type: string; // "docker", "compose", "ssh", "winrm"
  version: string; // capability version
  config: object; // capability-specific config
}
// Self-reported utilization, sent with heartbeats.
interface ResourceUsage {
  cpuPercent: number;
  memoryPercent: number;
  diskPercent: number;
  activeTasks: number;
}
```
**Agent Registration Protocol**:
```
1. Admin generates registration token (one-time use)
POST /api/v1/admin/agent-tokens
→ { token: "reg_xxx", expiresAt: "..." }
2. Agent starts with registration token
./stella-agent --register --token=reg_xxx
3. Agent requests mTLS certificate
POST /api/v1/agents/register
Headers: X-Registration-Token: reg_xxx
Body: { name, version, capabilities, csr }
→ { agentId, certificate, caCertificate }
4. Agent establishes mTLS connection
Uses issued certificate for all subsequent requests
5. Agent requests short-lived JWT for task execution
POST /api/v1/agents/token (over mTLS)
→ { token, expiresIn: 3600 } // 1 hour
```
---
### Module: `inventory-sync`
| Aspect | Specification |
|--------|---------------|
| **Responsibility** | Drift detection; expected vs actual state reconciliation |
| **Dependencies** | `target-registry`, `agent-manager` |
| **Events Produced** | `inventory.drift_detected`, `inventory.reconciled` |
**Drift Detection Process**:
1. Read `stella.version.json` from target deployment directory
2. Compare with expected state in database
3. Flag discrepancies (digest mismatch, missing sticker, unexpected files)
4. Report on dashboard
**Drift Detection Types**:
| Drift Type | Description | Severity |
|------------|-------------|----------|
| `digest_mismatch` | Running digest differs from expected | Critical |
| `missing_sticker` | No version sticker found on target | Warning |
| `stale_sticker` | Sticker timestamp older than last deployment | Warning |
| `orphan_container` | Container not managed by Stella | Info |
| `extra_files` | Unexpected files in deployment directory | Info |
---
## Cache Eviction Policies
Environment configurations and target states are cached to improve performance. **All caches MUST have bounded size and TTL-based eviction**:
| Cache Type | Purpose | TTL | Max Size | Eviction Strategy |
|-----------|---------|-----|----------|-------------------|
| **Environment Configs** | Environment configuration data | 30 minutes | 500 entries | Sliding expiration |
| **Target Health** | Target health status | 5 minutes | 2,000 entries | Sliding expiration |
| **Agent Capabilities** | Agent capability advertisement | 10 minutes | 1,000 entries | Sliding expiration |
| **Freeze Windows** | Active freeze window checks | 15 minutes | 100 entries | Absolute expiration |
**Implementation**:
```csharp
/// <summary>
/// Bounded in-memory cache for environment configurations:
/// at most 500 entries, each with a 30-minute sliding TTL.
/// </summary>
public class EnvironmentConfigCache
{
    private readonly MemoryCache _cache;

    public EnvironmentConfigCache()
    {
        var cacheOptions = new MemoryCacheOptions
        {
            SizeLimit = 500 // Max 500 environment configs
        };
        _cache = new MemoryCache(cacheOptions);
    }

    /// <summary>Stores (or refreshes) the config for an environment.</summary>
    public void CacheConfig(Guid environmentId, EnvironmentConfig config)
    {
        var entryOptions = new MemoryCacheEntryOptions
        {
            Size = 1,
            SlidingExpiration = TimeSpan.FromMinutes(30) // 30-minute TTL
        };
        _cache.Set(environmentId, config, entryOptions);
    }

    /// <summary>Returns the cached config, or null when absent or expired.</summary>
    public EnvironmentConfig? GetCachedConfig(Guid environmentId)
    {
        return _cache.Get<EnvironmentConfig>(environmentId);
    }

    /// <summary>Drops the cached config (called on environment update).</summary>
    public void InvalidateConfig(Guid environmentId)
    {
        _cache.Remove(environmentId);
    }
}
```
**Cache Invalidation**:
- Environment configs: Invalidate on update
- Target health: Invalidate on health check or deployment
- Agent capabilities: Invalidate on capability change event
- Freeze windows: Invalidate on window creation/deletion
**Reference**: See [Implementation Guide](../implementation-guide.md#caching) for cache implementation patterns.
---
## Database Schema
```sql
-- Environments
CREATE TABLE release.environments (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
tenant_id UUID NOT NULL REFERENCES tenants(id) ON DELETE CASCADE,
name VARCHAR(100) NOT NULL,
display_name VARCHAR(255) NOT NULL,
order_index INTEGER NOT NULL,
config JSONB NOT NULL DEFAULT '{}',
freeze_windows JSONB NOT NULL DEFAULT '[]',
required_approvals INTEGER NOT NULL DEFAULT 0,
require_sod BOOLEAN NOT NULL DEFAULT FALSE,
auto_promote_from UUID REFERENCES release.environments(id),
promotion_policy VARCHAR(255),
deployment_timeout INTEGER NOT NULL DEFAULT 600,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
UNIQUE (tenant_id, name)
);
CREATE INDEX idx_environments_tenant ON release.environments(tenant_id);
CREATE INDEX idx_environments_order ON release.environments(tenant_id, order_index);
-- Target Groups
CREATE TABLE release.target_groups (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
tenant_id UUID NOT NULL REFERENCES tenants(id) ON DELETE CASCADE,
environment_id UUID NOT NULL REFERENCES release.environments(id) ON DELETE CASCADE,
name VARCHAR(255) NOT NULL,
labels JSONB NOT NULL DEFAULT '{}',
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
UNIQUE (tenant_id, environment_id, name)
);
-- Targets
CREATE TABLE release.targets (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
tenant_id UUID NOT NULL REFERENCES tenants(id) ON DELETE CASCADE,
environment_id UUID NOT NULL REFERENCES release.environments(id) ON DELETE CASCADE,
target_group_id UUID REFERENCES release.target_groups(id),
name VARCHAR(255) NOT NULL,
target_type VARCHAR(100) NOT NULL,
connection JSONB NOT NULL,
capabilities JSONB NOT NULL DEFAULT '[]',
labels JSONB NOT NULL DEFAULT '{}',
deployment_directory VARCHAR(500),
health_status VARCHAR(50) NOT NULL DEFAULT 'unknown',
last_health_check TIMESTAMPTZ,
current_digest VARCHAR(100),
agent_id UUID REFERENCES release.agents(id),
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
UNIQUE (tenant_id, environment_id, name)
);
CREATE INDEX idx_targets_tenant_env ON release.targets(tenant_id, environment_id);
CREATE INDEX idx_targets_type ON release.targets(target_type);
CREATE INDEX idx_targets_labels ON release.targets USING GIN (labels);
-- Agents
CREATE TABLE release.agents (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
tenant_id UUID NOT NULL REFERENCES tenants(id) ON DELETE CASCADE,
name VARCHAR(255) NOT NULL,
version VARCHAR(50) NOT NULL,
capabilities JSONB NOT NULL DEFAULT '[]',
labels JSONB NOT NULL DEFAULT '{}',
status VARCHAR(50) NOT NULL DEFAULT 'offline',
last_heartbeat TIMESTAMPTZ,
resource_usage JSONB,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
UNIQUE (tenant_id, name)
);
CREATE INDEX idx_agents_tenant ON release.agents(tenant_id);
CREATE INDEX idx_agents_status ON release.agents(status);
CREATE INDEX idx_agents_capabilities ON release.agents USING GIN (capabilities);
```
---
## API Endpoints
```yaml
# Environments
POST /api/v1/environments
GET /api/v1/environments
GET /api/v1/environments/{id}
PUT /api/v1/environments/{id}
DELETE /api/v1/environments/{id}
# Freeze Windows
POST /api/v1/environments/{envId}/freeze-windows
GET /api/v1/environments/{envId}/freeze-windows
DELETE /api/v1/environments/{envId}/freeze-windows/{windowId}
# Target Groups
POST /api/v1/environments/{envId}/target-groups
GET /api/v1/environments/{envId}/target-groups
GET /api/v1/target-groups/{id}
PUT /api/v1/target-groups/{id}
DELETE /api/v1/target-groups/{id}
# Targets
POST /api/v1/targets
GET /api/v1/targets
GET /api/v1/targets/{id}
PUT /api/v1/targets/{id}
DELETE /api/v1/targets/{id}
POST /api/v1/targets/{id}/health-check
GET /api/v1/targets/{id}/sticker
GET /api/v1/targets/{id}/drift
# Agents
POST /api/v1/agents/register
GET /api/v1/agents
GET /api/v1/agents/{id}
PUT /api/v1/agents/{id}
DELETE /api/v1/agents/{id}
POST /api/v1/agents/{id}/heartbeat
POST /api/v1/agents/{id}/tasks/{taskId}/complete
```
---
## References
- [Module Overview](overview.md)
- [Agent Specification](agents.md)
- [API Documentation](../api/environments.md)
- [Agent Security](../security/agent-security.md)

View File

@@ -0,0 +1,575 @@
# RELEVI: Release Evidence
**Purpose**: Cryptographically sealed evidence packets for audit-grade release governance.
## Modules
### Module: `evidence-collector`
| Aspect | Specification |
|--------|---------------|
| **Responsibility** | Evidence aggregation; packet composition |
| **Dependencies** | `promotion-manager`, `deploy-orchestrator`, `decision-engine` |
| **Data Entities** | `EvidencePacket`, `EvidenceContent` |
| **Events Produced** | `evidence.collected`, `evidence.packet_created` |
**Evidence Packet Structure**:
```typescript
// A sealed, immutable evidence record tied to one promotion.
interface EvidencePacket {
  id: UUID;
  tenantId: UUID;
  promotionId: UUID;
  packetType: EvidencePacketType;
  content: EvidenceContent;
  contentHash: string; // SHA-256 of content
  signature: string; // Cryptographic signature
  signerKeyRef: string; // Reference to signing key
  createdAt: DateTime;
  // Note: No updatedAt - packets are immutable
}
// The lifecycle event a packet documents.
type EvidencePacketType =
  | "release_decision" // Promotion decision evidence
  | "deployment" // Deployment execution evidence
  | "rollback" // Rollback evidence
  | "ab_promotion"; // A/B promotion evidence
// The signed payload of an evidence packet, organized as the
// what/where/who/why/how/when of a release action.
interface EvidenceContent {
  // Metadata
  version: "1.0";
  generatedAt: DateTime;
  generatorVersion: string;
  // What — release identity and component digests
  release: {
    id: UUID;
    name: string;
    components: Array<{
      name: string;
      digest: string;
      semver: string;
      imageRepository: string;
    }>;
    sourceRef: SourceReference | null;
  };
  // Where — environment and concrete targets
  environment: {
    id: UUID;
    name: string;
    targets: Array<{
      id: UUID;
      name: string;
      type: string;
    }>;
  };
  // Who — requester and approvers
  actors: {
    requester: {
      id: UUID;
      name: string;
      email: string;
    };
    approvers: Array<{
      id: UUID;
      name: string;
      action: string;
      at: DateTime;
      comment: string | null;
    }>;
  };
  // Why — policy decision and gate outcomes
  decision: {
    result: "allow" | "deny";
    gates: Array<{
      type: string;
      name: string;
      status: string;
      message: string;
      details: Record<string, any>;
    }>;
    reasons: string[];
  };
  // How — execution artifacts and logs
  execution: {
    workflowRunId: UUID | null;
    deploymentJobId: UUID | null;
    artifacts: Array<{
      type: string;
      name: string;
      contentHash: string;
    }>;
    logs: string | null; // Compressed/truncated
  };
  // When — request/decision/execution timestamps
  timeline: {
    requestedAt: DateTime;
    decidedAt: DateTime | null;
    startedAt: DateTime | null;
    completedAt: DateTime | null;
  };
  // Integrity
  inputsHash: string; // Hash of all inputs for replay
  previousEvidenceId: UUID | null; // Chain to previous evidence
}
```
---
### Module: `evidence-signer`
| Aspect | Specification |
|--------|---------------|
| **Responsibility** | Cryptographic signing of evidence packets |
| **Dependencies** | `authority`, `vault` (for key storage) |
| **Algorithms** | RS256, ES256, Ed25519 |
**Signing Process**:
```typescript
// Signs and verifies evidence packets. Both paths hash the RFC 8785
// canonical JSON of the content, so signing and verification agree on bytes.
class EvidenceSigner {
  /**
   * Canonicalizes and hashes `content`, then signs the hex hash with the
   * currently active vault key. Returns the content plus detached signature
   * metadata; the stored hash carries a `sha256:` prefix, the signed value
   * is the bare hex digest.
   */
  async sign(content: EvidenceContent): Promise<SignedEvidence> {
    // 1. Canonicalize content (RFC 8785)
    const canonicalJson = canonicalize(content);
    // 2. Compute content hash
    const contentHash = crypto
      .createHash("sha256")
      .update(canonicalJson)
      .digest("hex");
    // 3. Get signing key from vault
    const keyRef = await this.getActiveSigningKey();
    const privateKey = await this.vault.getPrivateKey(keyRef);
    // 4. Sign the content hash
    const signature = await this.signWithKey(contentHash, privateKey);
    return {
      content,
      contentHash: `sha256:${contentHash}`,
      signature: base64Encode(signature),
      signerKeyRef: keyRef,
      algorithm: this.config.signatureAlgorithm,
    };
  }
  /**
   * Verifies a stored packet: recomputes the canonical hash, compares it to
   * the stored `contentHash`, then checks the signature against the
   * recomputed digest using the public half of the recorded signer key.
   */
  async verify(packet: EvidencePacket): Promise<VerificationResult> {
    // 1. Canonicalize stored content
    const canonicalJson = canonicalize(packet.content);
    // 2. Verify content hash
    const computedHash = crypto
      .createHash("sha256")
      .update(canonicalJson)
      .digest("hex");
    if (`sha256:${computedHash}` !== packet.contentHash) {
      return { valid: false, error: "Content hash mismatch" };
    }
    // 3. Get public key
    const publicKey = await this.vault.getPublicKey(packet.signerKeyRef);
    // 4. Verify signature
    const signatureValid = await this.verifySignature(
      computedHash,
      base64Decode(packet.signature),
      publicKey
    );
    return {
      valid: signatureValid,
      signerKeyRef: packet.signerKeyRef,
      signedAt: packet.createdAt,
    };
  }
}
```
---
### Module: `sticker-writer`
| Aspect | Specification |
|--------|---------------|
| **Responsibility** | Version sticker generation and placement |
| **Dependencies** | `deploy-orchestrator`, `agent-manager` |
| **Data Entities** | `VersionSticker` |
**Version Sticker Schema**:
```typescript
// On-target JSON record of what is deployed; read back during drift detection.
// Field names are snake_case because this is the on-disk wire format.
interface VersionSticker {
  stella_version: "1.0";
  // Release identity
  release_id: UUID;
  release_name: string;
  // Component details
  components: Array<{
    name: string;
    digest: string;
    semver: string;
    tag: string;
    image_repository: string;
  }>;
  // Deployment context
  environment: string;
  environment_id: UUID;
  deployed_at: string; // ISO 8601
  deployed_by: UUID;
  // Traceability
  promotion_id: UUID;
  workflow_run_id: UUID;
  // Evidence chain
  evidence_packet_id: UUID;
  evidence_packet_hash: string;
  policy_decision_hash: string;
  // Orchestrator info
  orchestrator_version: string;
  // Source reference
  source_ref?: {
    commit_sha: string;
    branch: string;
    repository: string;
  };
}
```
**Sticker Placement**:
- Written as `stella.version.json` in each target's configured deployment directory (e.g. `/var/stella/version.json`), the same file drift detection reads back
- Atomic write (write to temp, rename)
- Read during drift detection
- Verified against expected state
---
### Module: `audit-exporter`
| Aspect | Specification |
|--------|---------------|
| **Responsibility** | Compliance report generation; evidence export |
| **Dependencies** | `evidence-collector` |
| **Export Formats** | JSON, PDF, CSV |
**Audit Report Types**:
| Report Type | Description |
|-------------|-------------|
| `release_audit` | Full audit trail for a release |
| `environment_audit` | All deployments to an environment |
| `compliance_summary` | Summary for compliance review |
| `change_log` | Chronological change log |
**Report Generation**:
```typescript
// Parameters for an asynchronous audit report generation job.
interface AuditReportRequest {
  type: AuditReportType;
  scope: {
    releaseId?: UUID;
    environmentId?: UUID;
    from?: DateTime;
    to?: DateTime;
  };
  format: "json" | "pdf" | "csv";
  options?: {
    includeDecisionDetails: boolean;
    includeApproverDetails: boolean;
    includeLogs: boolean;
    includeArtifacts: boolean;
  };
}
// A generated report: aggregate summary plus per-event entries.
interface AuditReport {
  id: UUID;
  type: AuditReportType;
  scope: ReportScope;
  generatedAt: DateTime;
  generatedBy: UUID;
  summary: {
    totalPromotions: number;
    successfulDeployments: number;
    failedDeployments: number;
    rollbacks: number;
    averageDeploymentTime: number;
  };
  entries: AuditEntry[];
  // For compliance
  signatureChain: {
    valid: boolean;
    verifiedPackets: number;
    invalidPackets: number;
  };
}
```
---
## Immutability Enforcement
Evidence packets are append-only. This is enforced at multiple levels:
### Database Level
```sql
-- Evidence packets table with no UPDATE/DELETE
CREATE TABLE release.evidence_packets (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    tenant_id UUID NOT NULL REFERENCES tenants(id) ON DELETE CASCADE,
    promotion_id UUID NOT NULL REFERENCES release.promotions(id),
    packet_type VARCHAR(50) NOT NULL CHECK (packet_type IN (
        'release_decision', 'deployment', 'rollback', 'ab_promotion'
    )),
    content JSONB NOT NULL,
    content_hash VARCHAR(100) NOT NULL,
    signature TEXT,
    signer_key_ref VARCHAR(255),
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
    -- Note: No updated_at column; immutable by design
);
-- Append-only enforcement via trigger
CREATE OR REPLACE FUNCTION prevent_evidence_modification()
RETURNS TRIGGER AS $$
BEGIN
  RAISE EXCEPTION 'Evidence packets are immutable and cannot be modified or deleted';
END;
$$ LANGUAGE plpgsql;
-- FIX: the table lives in the "release" schema, so the trigger must use the
-- schema-qualified name; an unqualified "evidence_packets" only resolves if
-- search_path happens to include "release", otherwise trigger creation fails
-- and immutability is not enforced.
CREATE TRIGGER evidence_packets_immutable
BEFORE UPDATE OR DELETE ON release.evidence_packets
FOR EACH ROW EXECUTE FUNCTION prevent_evidence_modification();
-- Revoke UPDATE/DELETE from application role (defense in depth with trigger)
REVOKE UPDATE, DELETE ON release.evidence_packets FROM app_role;
-- Version stickers table: records what was written to each target
CREATE TABLE release.version_stickers (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    tenant_id UUID NOT NULL REFERENCES tenants(id) ON DELETE CASCADE,
    target_id UUID NOT NULL REFERENCES release.targets(id),
    release_id UUID NOT NULL REFERENCES release.releases(id),
    promotion_id UUID NOT NULL REFERENCES release.promotions(id),
    sticker_content JSONB NOT NULL,
    content_hash VARCHAR(100) NOT NULL,
    written_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    verified_at TIMESTAMPTZ,
    drift_detected BOOLEAN NOT NULL DEFAULT FALSE
);
CREATE INDEX idx_version_stickers_target ON release.version_stickers(target_id);
CREATE INDEX idx_version_stickers_release ON release.version_stickers(release_id);
CREATE INDEX idx_evidence_packets_promotion ON release.evidence_packets(promotion_id);
CREATE INDEX idx_evidence_packets_created ON release.evidence_packets(created_at DESC);
```
### Application Level
```csharp
// Evidence service enforces immutability
public sealed class EvidenceService
{
// Only Create method - no Update or Delete
public async Task<EvidencePacket> CreateAsync(
EvidenceContent content,
CancellationToken ct)
{
// Sign content
var signed = await _signer.SignAsync(content, ct);
// Store (append-only)
var packet = new EvidencePacket
{
Id = Guid.NewGuid(),
TenantId = content.TenantId,
PromotionId = content.PromotionId,
PacketType = content.PacketType,
Content = content,
ContentHash = signed.ContentHash,
Signature = signed.Signature,
SignerKeyRef = signed.SignerKeyRef,
CreatedAt = DateTime.UtcNow,
};
await _repository.InsertAsync(packet, ct);
return packet;
}
// Read methods only
public async Task<EvidencePacket> GetAsync(Guid id, CancellationToken ct);
public async Task<IReadOnlyList<EvidencePacket>> ListAsync(
EvidenceFilter filter, CancellationToken ct);
public async Task<VerificationResult> VerifyAsync(
Guid id, CancellationToken ct);
// No Update or Delete methods exist
}
```
---
## Evidence Chain
Evidence packets form a verifiable chain:
```
┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐
│ Evidence #1 │ │ Evidence #2 │ │ Evidence #3 │
│ (Dev Deploy) │────►│ (Stage Deploy) │────►│ (Prod Deploy) │
│ │ │ │ │ │
│ prevEvidenceId: │ │ prevEvidenceId: │ │ prevEvidenceId: │
│ null │ │ #1 │ │ #2 │
│ │ │ │ │ │
│ contentHash: │ │ contentHash: │ │ contentHash: │
│ sha256:abc... │ │ sha256:def... │ │ sha256:ghi... │
└─────────────────┘ └─────────────────┘ └─────────────────┘
```
**Chain Verification**:
```typescript
/**
 * Verifies every evidence packet recorded for a release: the packet
 * signature, the stored content hash, and the link to the preceding
 * packet in the chain. The overall result is valid only if every
 * packet passes all three checks.
 */
async function verifyEvidenceChain(releaseId: UUID): Promise<ChainVerificationResult> {
  const packets = await getPacketsForRelease(releaseId);
  const results: PacketVerificationResult[] = [];
  let priorHash: string | null = null;
  for (const packet of packets) {
    const signatureValid = await verifySignature(packet);
    const contentValid = await verifyContentHash(packet);
    let chainValid: boolean;
    if (packet.content.previousEvidenceId === null) {
      // The first packet must genuinely be first: no predecessor seen yet.
      chainValid = priorHash === null;
    } else {
      chainValid = await verifyPreviousLink(packet, priorHash);
    }
    results.push({
      packetId: packet.id,
      signatureValid,
      contentValid,
      chainValid,
      valid: signatureValid && contentValid && chainValid,
    });
    priorHash = packet.contentHash;
  }
  const allValid = results.every((r) => r.valid);
  return { valid: allValid, packets: results };
}
```
---
## API Endpoints
```yaml
# Evidence Packets
GET /api/v1/evidence-packets
Query: ?promotionId={uuid}&type={type}&from={date}&to={date}
Response: EvidencePacket[]
GET /api/v1/evidence-packets/{id}
Response: EvidencePacket (full content)
GET /api/v1/evidence-packets/{id}/verify
Response: VerificationResult
GET /api/v1/evidence-packets/{id}/download
Query: ?format={json|pdf}
Response: binary
# Evidence Chain
GET /api/v1/releases/{id}/evidence-chain
Response: EvidenceChain
GET /api/v1/releases/{id}/evidence-chain/verify
Response: ChainVerificationResult
# Audit Reports
POST /api/v1/audit-reports
Body: {
type: "release" | "environment" | "compliance",
scope: { releaseId?, environmentId?, from?, to? },
format: "json" | "pdf" | "csv"
}
Response: { reportId: UUID, status: "generating" }
GET /api/v1/audit-reports/{id}
Response: { status, downloadUrl? }
GET /api/v1/audit-reports/{id}/download
Response: binary
# Version Stickers
GET /api/v1/version-stickers
Query: ?targetId={uuid}&releaseId={uuid}
Response: VersionSticker[]
GET /api/v1/version-stickers/{id}
Response: VersionSticker
```
---
## Deterministic Replay
Evidence packets enable deterministic replay - given the same inputs and policy version, the same decision is produced:
```typescript
/**
 * Re-runs a recorded release decision: confirms the recorded inputs still
 * hash to the stored inputsHash, re-evaluates with the policy version in
 * effect at the original decision time, and compares outcomes.
 */
async function replayDecision(evidencePacket: EvidencePacket): Promise<ReplayResult> {
  const { content } = evidencePacket;

  // Refuse to replay if the recorded inputs no longer match their hash.
  const recomputedHash = computeInputsHash(
    content.release,
    content.environment,
    content.decision.gates
  );
  if (recomputedHash !== content.inputsHash) {
    return { valid: false, error: "Inputs have changed since original decision" };
  }

  // Evaluate using the policy as of the original decision timestamp.
  const replayedDecision = await evaluateDecision(
    content.release,
    content.environment,
    { asOf: content.timeline.decidedAt }
  );

  const decisionsMatch = replayedDecision.result === content.decision.result;
  const differences = decisionsMatch
    ? []
    : computeDifferences(content.decision, replayedDecision);

  return {
    valid: decisionsMatch,
    originalDecision: content.decision.result,
    replayedDecision: replayedDecision.result,
    differences,
  };
}
```
---
## References
- [Module Overview](overview.md)
- [Design Principles](../design/principles.md)
- [Security Architecture](../security/overview.md)
- [Evidence Schema](../appendices/evidence-schema.md)

View File

@@ -0,0 +1,373 @@
# INTHUB: Integration Hub
**Purpose**: Central management of all external integrations (SCM, CI, registries, vaults, targets).
## Modules
### Module: `integration-manager`
| Aspect | Specification |
|--------|---------------|
| **Responsibility** | CRUD for integration instances; plugin type registry |
| **Dependencies** | `plugin-registry`, `authority` (for credentials) |
| **Data Entities** | `Integration`, `IntegrationType`, `IntegrationCredential` |
| **Events Produced** | `integration.created`, `integration.updated`, `integration.deleted`, `integration.health_changed` |
| **Events Consumed** | `plugin.registered`, `plugin.unregistered` |
**Key Operations**:
```
CreateIntegration(type, name, config, credentials) → Integration
UpdateIntegration(id, config, credentials) → Integration
DeleteIntegration(id) → void
TestConnection(id) → ConnectionTestResult
DiscoverResources(id, resourceType) → Resource[]
GetIntegrationHealth(id) → HealthStatus
ListIntegrations(filter) → Integration[]
```
**Integration Entity**:
```typescript
// A configured instance of an integration type (e.g. one GitHub org).
interface Integration {
  id: UUID;
  tenantId: UUID;
  type: string; // "scm.github", "registry.harbor"
  name: string; // user-defined name
  config: IntegrationConfig; // type-specific config
  credentialId: UUID; // reference to vault
  healthStatus: HealthStatus;
  lastHealthCheck: DateTime;
  createdAt: DateTime;
  updatedAt: DateTime;
}
// Common connection settings; plugins extend this via the index signature.
interface IntegrationConfig {
  endpoint: string;
  authMode: "token" | "oauth" | "mtls" | "iam";
  timeout: number;
  retryPolicy: RetryPolicy;
  customHeaders?: Record<string, string>;
  // Type-specific fields added by plugin
  // NOTE(review): the open `any` index signature disables checking on all
  // members; consider `unknown` in the implementation — confirm with owners.
  [key: string]: any;
}
```
---
### Module: `connection-profiles`
| Aspect | Specification |
|--------|---------------|
| **Responsibility** | Default settings management; "last used" pattern |
| **Dependencies** | `integration-manager` |
| **Data Entities** | `ConnectionProfile`, `ProfileTemplate` |
**Behavior**: When user adds a new integration instance:
1. Wizard defaults to last used endpoint, auth mode, network settings
2. Secrets are **never** auto-reused (explicit confirmation required)
3. User can save as named profile for reuse
**Profile Entity**:
```typescript
// Saved default settings for creating new integration instances.
// Secrets are never stored here; only non-sensitive config defaults.
interface ConnectionProfile {
  id: UUID;
  tenantId: UUID;
  name: string; // "Production GitHub"
  integrationType: string;
  defaultConfig: Partial<IntegrationConfig>;
  isDefault: boolean;
  lastUsedAt: DateTime;
  createdBy: UUID;
}
```
---
### Module: `connector-runtime`
| Aspect | Specification |
|--------|---------------|
| **Responsibility** | Execute plugin connector logic in controlled environment |
| **Dependencies** | `plugin-loader`, `plugin-sandbox` |
| **Protocol** | gRPC (preferred) or HTTP/REST |
**Connector Interface** (implemented by plugins):
```protobuf
// gRPC contract every integration plugin connector implements.
// ExecuteStep streams responses so long-running steps can report progress.
service Connector {
  // Connection management
  rpc TestConnection(TestConnectionRequest) returns (TestConnectionResponse);
  rpc GetHealth(HealthRequest) returns (HealthResponse);
  // Resource discovery
  rpc DiscoverResources(DiscoverRequest) returns (DiscoverResponse);
  rpc ListRepositories(ListReposRequest) returns (ListReposResponse);
  rpc ListBranches(ListBranchesRequest) returns (ListBranchesResponse);
  rpc ListTags(ListTagsRequest) returns (ListTagsResponse);
  // Registry operations
  rpc ResolveTagToDigest(ResolveRequest) returns (ResolveResponse);
  rpc FetchManifest(ManifestRequest) returns (ManifestResponse);
  rpc VerifyDigest(VerifyRequest) returns (VerifyResponse);
  // Secrets operations
  rpc GetSecretsRef(SecretsRequest) returns (SecretsResponse);
  rpc FetchSecret(FetchSecretRequest) returns (FetchSecretResponse);
  // Workflow step execution (server-streaming for progress updates)
  rpc ExecuteStep(StepRequest) returns (stream StepResponse);
  rpc CancelStep(CancelRequest) returns (CancelResponse);
}
```
**Request/Response Types**:
```protobuf
// Probe an integration's connectivity/credentials without side effects.
message TestConnectionRequest {
  string integration_id = 1;
  map<string, string> config = 2;
  string credential_ref = 3; // vault reference; never raw secret material
}
message TestConnectionResponse {
  bool success = 1;
  string error_message = 2; // empty on success
  map<string, string> details = 3;
  int64 latency_ms = 4;
}
// Resolve a mutable tag reference to its immutable content digest.
message ResolveRequest {
  string integration_id = 1;
  string image_ref = 2; // "myapp:v2.3.1"
}
message ResolveResponse {
  string digest = 1; // "sha256:abc123..."
  string manifest_type = 2;
  int64 size_bytes = 3;
  google.protobuf.Timestamp pushed_at = 4;
}
```
---
### Module: `doctor-checks`
| Aspect | Specification |
|--------|---------------|
| **Responsibility** | Integration health diagnostics; troubleshooting |
| **Dependencies** | `integration-manager`, `connector-runtime` |
**Doctor Check Types**:
| Check | Purpose | Pass Criteria |
|-------|---------|---------------|
| **Connectivity** | Can reach endpoint | TCP connect succeeds |
| **TLS** | Certificate valid | Chain validates, not expired |
| **Authentication** | Credentials valid | Auth request succeeds |
| **Authorization** | Permissions sufficient | Required scopes present |
| **Version** | API version supported | Version in supported range |
| **Rate Limit** | Quota available | >10% remaining |
| **Latency** | Response time acceptable | <5s p99 |
**Doctor Check Output**:
```typescript
// Outcome of one diagnostic check against an integration.
interface DoctorCheckResult {
  checkType: string; // e.g. "connectivity", "tls", "authentication"
  status: "pass" | "warn" | "fail";
  message: string;
  details: Record<string, any>;
  suggestions: string[]; // remediation hints for failures/warnings
  runAt: DateTime;
  durationMs: number;
}
// Aggregated result of a full doctor run for one integration.
interface DoctorReport {
  integrationId: UUID;
  overallStatus: "healthy" | "degraded" | "unhealthy";
  checks: DoctorCheckResult[];
  generatedAt: DateTime;
}
```
---
## Cache Eviction Policies
Integration health status and connector results are cached to reduce load on external systems. **All caches MUST have bounded size and TTL-based eviction**:
| Cache Type | Purpose | TTL | Max Size | Eviction Strategy |
|-----------|---------|-----|----------|-------------------|
| **Health Checks** | Integration health status | 5 minutes | 1,000 entries | Sliding expiration |
| **Connection Tests** | Test connection results | 2 minutes | 500 entries | Sliding expiration |
| **Resource Discovery** | Discovered resources (repos, tags) | 10 minutes | 5,000 entries | Sliding expiration |
| **Tag Resolution** | Tag digest mappings | 1 hour | 10,000 entries | Absolute expiration |
**Implementation**:
```csharp
/// <summary>
/// Bounded in-memory cache for integration health status:
/// at most 1,000 entries, each with a 5-minute sliding TTL.
/// </summary>
public class IntegrationHealthCache
{
    private readonly MemoryCache _cache;

    public IntegrationHealthCache()
    {
        _cache = new MemoryCache(new MemoryCacheOptions
        {
            SizeLimit = 1_000 // Max 1,000 integration health entries
        });
    }

    /// <summary>Stores (or refreshes) the health status for an integration.</summary>
    public void CacheHealthStatus(Guid integrationId, HealthStatus status)
    {
        _cache.Set(integrationId, status, new MemoryCacheEntryOptions
        {
            Size = 1,
            SlidingExpiration = TimeSpan.FromMinutes(5) // 5-minute TTL
        });
    }

    /// <summary>
    /// Returns the cached status, or null on a cache miss.
    /// FIX: the previous implementation used MemoryCache.Get, which returns
    /// default(HealthStatus) on a miss — if HealthStatus is a value type
    /// (e.g. an enum) that default is a real status value, not null, so
    /// misses were indistinguishable from cached entries. TryGetValue
    /// reports the miss explicitly and is correct for both value and
    /// reference types.
    /// </summary>
    public HealthStatus? GetCachedHealthStatus(Guid integrationId)
        => _cache.TryGetValue(integrationId, out HealthStatus status) ? status : null;
}
```
**Reference**: See [Implementation Guide](../implementation-guide.md#caching) for cache implementation patterns.
---
## Integration Types
The following integration types are supported (via plugins):
### SCM Integrations
| Type | Plugin | Capabilities |
|------|--------|--------------|
| `scm.github` | Built-in | repos, branches, commits, webhooks, status |
| `scm.gitlab` | Built-in | repos, branches, commits, webhooks, pipelines |
| `scm.bitbucket` | Plugin | repos, branches, commits, webhooks |
| `scm.azure_repos` | Plugin | repos, branches, commits, pipelines |
### Registry Integrations
| Type | Plugin | Capabilities |
|------|--------|--------------|
| `registry.harbor` | Built-in | repos, tags, digests, scanning status |
| `registry.ecr` | Plugin | repos, tags, digests, IAM auth |
| `registry.gcr` | Plugin | repos, tags, digests |
| `registry.dockerhub` | Plugin | repos, tags, digests |
| `registry.ghcr` | Plugin | repos, tags, digests |
| `registry.acr` | Plugin | repos, tags, digests |
### Vault Integrations
| Type | Plugin | Capabilities |
|------|--------|--------------|
| `vault.hashicorp` | Built-in | KV, transit, PKI |
| `vault.aws_secrets` | Plugin | secrets, IAM auth |
| `vault.azure_keyvault` | Plugin | secrets, certificates |
| `vault.gcp_secrets` | Plugin | secrets, IAM auth |
### CI Integrations
| Type | Plugin | Capabilities |
|------|--------|--------------|
| `ci.github_actions` | Built-in | workflows, runs, artifacts, status |
| `ci.gitlab_ci` | Built-in | pipelines, jobs, artifacts |
| `ci.jenkins` | Plugin | jobs, builds, artifacts |
| `ci.azure_pipelines` | Plugin | pipelines, runs, artifacts |
### Router Integrations (for Progressive Delivery)
| Type | Plugin | Capabilities |
|------|--------|--------------|
| `router.nginx` | Plugin | upstream config, reload |
| `router.haproxy` | Plugin | backend config, reload |
| `router.traefik` | Plugin | dynamic config |
| `router.aws_alb` | Plugin | target groups, listener rules |
---
## Database Schema
```sql
-- Integration types (populated by plugins)
CREATE TABLE release.integration_types (
id TEXT PRIMARY KEY, -- "scm.github"
plugin_id UUID REFERENCES release.plugins(id),
display_name TEXT NOT NULL,
description TEXT,
icon_url TEXT,
config_schema JSONB NOT NULL, -- JSON Schema for config
capabilities TEXT[] NOT NULL, -- ["repos", "webhooks", "status"]
created_at TIMESTAMPTZ NOT NULL DEFAULT now()
);
-- Integration instances
CREATE TABLE release.integrations (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
tenant_id UUID NOT NULL REFERENCES tenants(id),
type_id TEXT NOT NULL REFERENCES release.integration_types(id),
name TEXT NOT NULL,
config JSONB NOT NULL,
credential_ref TEXT NOT NULL, -- vault reference
health_status TEXT NOT NULL DEFAULT 'unknown',
last_health_check TIMESTAMPTZ,
created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
updated_at TIMESTAMPTZ NOT NULL DEFAULT now(),
created_by UUID NOT NULL REFERENCES users(id),
UNIQUE(tenant_id, name)
);
-- Connection profiles
CREATE TABLE release.connection_profiles (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
tenant_id UUID NOT NULL REFERENCES tenants(id),
name TEXT NOT NULL,
integration_type TEXT NOT NULL,
default_config JSONB NOT NULL,
is_default BOOLEAN NOT NULL DEFAULT false,
last_used_at TIMESTAMPTZ,
created_by UUID NOT NULL REFERENCES users(id),
created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
UNIQUE(tenant_id, name)
);
-- Doctor check history
CREATE TABLE release.doctor_checks (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
integration_id UUID NOT NULL REFERENCES release.integrations(id),
check_type TEXT NOT NULL,
status TEXT NOT NULL,
message TEXT,
details JSONB,
duration_ms INTEGER NOT NULL,
run_at TIMESTAMPTZ NOT NULL DEFAULT now()
);
CREATE INDEX idx_doctor_checks_integration ON release.doctor_checks(integration_id, run_at DESC);
```
---
## API Endpoints
See [API Documentation](../api/overview.md) for full specification.
```
GET /api/v1/integration-types # List available types
GET /api/v1/integration-types/{type} # Get type details
GET /api/v1/integrations # List integrations
POST /api/v1/integrations # Create integration
GET /api/v1/integrations/{id} # Get integration
PUT /api/v1/integrations/{id} # Update integration
DELETE /api/v1/integrations/{id} # Delete integration
POST /api/v1/integrations/{id}/test # Test connection
GET /api/v1/integrations/{id}/health # Get health status
POST /api/v1/integrations/{id}/doctor # Run doctor checks
GET /api/v1/integrations/{id}/resources # Discover resources
GET /api/v1/connection-profiles # List profiles
POST /api/v1/connection-profiles # Create profile
GET /api/v1/connection-profiles/{id} # Get profile
PUT /api/v1/connection-profiles/{id} # Update profile
DELETE /api/v1/connection-profiles/{id} # Delete profile
```

View File

@@ -0,0 +1,203 @@
# Module Landscape Overview
The Stella Ops Suite comprises existing modules (vulnerability scanning) and new modules (release orchestration). Modules are organized into **themes** (functional areas).
## Architecture Diagram
```
┌─────────────────────────────────────────────────────────────────────────────────┐
│ STELLA OPS SUITE │
│ │
│ ┌───────────────────────────────────────────────────────────────────────────┐ │
│ │ EXISTING THEMES (Vulnerability) │ │
│ │ │ │
│ │ INGEST VEXOPS REASON SCANENG EVIDENCE │ │
│ │ ├─concelier ├─excititor ├─policy ├─scanner ├─locker │ │
│ │ └─advisory-ai └─linksets └─opa-runtime ├─sbom-gen ├─export │ │
│ │ └─reachability └─timeline │ │
│ │ │ │
│ │ RUNTIME JOBCTRL OBSERVE REPLAY DEVEXP │ │
│ │ ├─signals ├─scheduler ├─notifier └─replay-core ├─cli │ │
│ │ ├─graph ├─orchestrator └─telemetry ├─web-ui │ │
│ │ └─zastava └─task-runner └─sdk │ │
│ └───────────────────────────────────────────────────────────────────────────┘ │
│ │
│ ┌───────────────────────────────────────────────────────────────────────────┐ │
│ │ NEW THEMES (Release Orchestration) │ │
│ │ │ │
│ │ INTHUB (Integration Hub) │ │
│ │ ├─integration-manager Central registry of configured integrations │ │
│ │ ├─connection-profiles Default settings + credential management │ │
│ │ ├─connector-runtime Plugin connector execution environment │ │
│ │ └─doctor-checks Integration health diagnostics │ │
│ │ │ │
│ │ ENVMGR (Environment & Inventory) │ │
│ │ ├─environment-manager Environment CRUD, ordering, config │ │
│ │ ├─target-registry Deployment targets (hosts/services) │ │
│ │ ├─agent-manager Agent registration, health, capabilities │ │
│ │ └─inventory-sync Drift detection, state reconciliation │ │
│ │ │ │
│ │ RELMAN (Release Management) │ │
│ │ ├─component-registry Image repos → components mapping │ │
│ │ ├─version-manager Tag/digest → semver mapping │ │
│ │ ├─release-manager Release bundle lifecycle │ │
│ │ └─release-catalog Release history, search, compare │ │
│ │ │ │
│ │ WORKFL (Workflow Engine) │ │
│ │ ├─workflow-designer Template creation, step graph editor │ │
│ │ ├─workflow-engine DAG execution, state machine │ │
│ │ ├─step-executor Step dispatch, retry, timeout │ │
│ │ └─step-registry Built-in + plugin-provided steps │ │
│ │ │ │
│ │ PROMOT (Promotion & Approval) │ │
│ │ ├─promotion-manager Promotion request lifecycle │ │
│ │ ├─approval-gateway Approval collection, SoD enforcement │ │
│ │ ├─decision-engine Gate evaluation, policy integration │ │
│ │ └─gate-registry Built-in + custom gates │ │
│ │ │ │
│ │ DEPLOY (Deployment Execution) │ │
│ │ ├─deploy-orchestrator Deployment job coordination │ │
│ │ ├─target-executor Target-specific deployment logic │ │
│ │ ├─runner-executor Script/hook execution sandbox │ │
│ │ ├─artifact-generator Compose/script artifact generation │ │
│ │ └─rollback-manager Rollback orchestration │ │
│ │ │ │
│ │ AGENTS (Deployment Agents) │ │
│ │ ├─agent-core Shared agent runtime │ │
│ │ ├─agent-docker Docker host agent │ │
│ │ ├─agent-compose Docker Compose agent │ │
│ │ ├─agent-ssh SSH remote executor │ │
│ │ ├─agent-winrm WinRM remote executor │ │
│ │ ├─agent-ecs AWS ECS agent │ │
│ │ └─agent-nomad HashiCorp Nomad agent │ │
│ │ │ │
│ │ PROGDL (Progressive Delivery) │ │
│ │ ├─ab-manager A/B release coordination │ │
│ │ ├─traffic-router Router plugin orchestration │ │
│ │ ├─canary-controller Canary ramp automation │ │
│ │ └─rollout-strategy Strategy templates │ │
│ │ │ │
│ │ RELEVI (Release Evidence) │ │
│ │ ├─evidence-collector Evidence aggregation │ │
│ │ ├─evidence-signer Cryptographic signing │ │
│ │ ├─sticker-writer Version sticker generation │ │
│ │ └─audit-exporter Compliance report generation │ │
│ │ │ │
│ │ PLUGIN (Plugin Infrastructure) │ │
│ │ ├─plugin-registry Plugin discovery, versioning │ │
│ │ ├─plugin-loader Plugin lifecycle management │ │
│ │ ├─plugin-sandbox Isolation, resource limits │ │
│ │ └─plugin-sdk SDK for plugin development │ │
│ └───────────────────────────────────────────────────────────────────────────┘ │
└─────────────────────────────────────────────────────────────────────────────────┘
```
## Theme Summary
### Existing Themes (Vulnerability Scanning)
| Theme | Purpose | Key Modules |
|-------|---------|-------------|
| **INGEST** | Advisory ingestion | concelier, advisory-ai |
| **VEXOPS** | VEX document handling | excititor, linksets |
| **REASON** | Policy and decisioning | policy, opa-runtime |
| **SCANENG** | Scanning and SBOM | scanner, sbom-gen, reachability |
| **EVIDENCE** | Evidence and attestation | locker, export, timeline |
| **RUNTIME** | Runtime signals | signals, graph, zastava |
| **JOBCTRL** | Job orchestration | scheduler, orchestrator, task-runner |
| **OBSERVE** | Observability | notifier, telemetry |
| **REPLAY** | Deterministic replay | replay-core |
| **DEVEXP** | Developer experience | cli, web-ui, sdk |
### New Themes (Release Orchestration)
| Theme | Purpose | Key Modules | Documentation |
|-------|---------|-------------|---------------|
| **INTHUB** | Integration hub | integration-manager, connection-profiles, connector-runtime, doctor-checks | [Details](integration-hub.md) |
| **ENVMGR** | Environment & inventory | environment-manager, target-registry, agent-manager, inventory-sync | [Details](environment-manager.md) |
| **RELMAN** | Release management | component-registry, version-manager, release-manager, release-catalog | [Details](release-manager.md) |
| **WORKFL** | Workflow engine | workflow-designer, workflow-engine, step-executor, step-registry | [Details](workflow-engine.md) |
| **PROMOT** | Promotion & approval | promotion-manager, approval-gateway, decision-engine, gate-registry | [Details](promotion-manager.md) |
| **DEPLOY** | Deployment execution | deploy-orchestrator, target-executor, runner-executor, artifact-generator, rollback-manager | [Details](deploy-orchestrator.md) |
| **AGENTS** | Deployment agents | agent-core, agent-docker, agent-compose, agent-ssh, agent-winrm, agent-ecs, agent-nomad | [Details](agents.md) |
| **PROGDL** | Progressive delivery | ab-manager, traffic-router, canary-controller, rollout-strategy | [Details](progressive-delivery.md) |
| **RELEVI** | Release evidence | evidence-collector, evidence-signer, sticker-writer, audit-exporter | [Details](evidence.md) |
| **PLUGIN** | Plugin infrastructure | plugin-registry, plugin-loader, plugin-sandbox, plugin-sdk | [Details](plugin-system.md) |
## Module Dependencies
```
┌──────────────┐
│ AUTHORITY │
└──────┬───────┘
┌──────────────────┼──────────────────┐
│ │ │
▼ ▼ ▼
┌───────────────┐ ┌───────────────┐ ┌───────────────┐
│ INTHUB │ │ ENVMGR │ │ PLUGIN │
│ (Integrations)│ │ (Environments)│ │ (Plugins) │
└───────┬───────┘ └───────┬───────┘ └───────┬───────┘
│ │ │
└──────────┬───────┴──────────────────┘
┌───────────────┐
│ RELMAN │
│ (Releases) │
└───────┬───────┘
┌───────────────┐
│ WORKFL │
│ (Workflows) │
└───────┬───────┘
┌──────────┴──────────┐
│ │
▼ ▼
┌───────────────┐ ┌───────────────┐
│ PROMOT │ │ DEPLOY │
│ (Promotion) │ │ (Deployment) │
└───────┬───────┘ └───────┬───────┘
│ │
│ ▼
│ ┌───────────────┐
│ │ AGENTS │
│ │ (Agents) │
│ └───────┬───────┘
│ │
└──────────┬──────────┘
┌───────────────┐
│ RELEVI │
│ (Evidence) │
└───────────────┘
```
## Communication Patterns
| Pattern | Usage |
|---------|-------|
| **Synchronous API** | User-initiated operations (CRUD, queries) |
| **Event Bus** | Cross-module notifications (domain events) |
| **Task Queue** | Long-running operations (deployments, syncs) |
| **WebSocket/SSE** | Real-time UI updates |
| **gRPC Streams** | Agent communication |
## Database Schema Organization
Each theme owns a group of tables inside the shared `release` PostgreSQL schema:
| Table Namespace | Owner Theme |
|-----------------|-------------|
| `release.integrations` | INTHUB |
| `release.environments` | ENVMGR |
| `release.components` | RELMAN |
| `release.workflows` | WORKFL |
| `release.promotions` | PROMOT |
| `release.deployments` | DEPLOY |
| `release.agents` | AGENTS |
| `release.evidence` | RELEVI |
| `release.plugins` | PLUGIN |

View File

@@ -0,0 +1,629 @@
# PLUGIN: Plugin Infrastructure
**Purpose**: Extensible plugin system for integrations, steps, and custom functionality.
## Architecture Overview
```
┌─────────────────────────────────────────────────────────────────────────────┐
│ PLUGIN ARCHITECTURE │
│ │
│ ┌─────────────────────────────────────────────────────────────────────┐ │
│ │ PLUGIN REGISTRY │ │
│ │ │ │
│ │ - Plugin discovery and versioning │ │
│ │ - Manifest validation │ │
│ │ - Dependency resolution │ │
│ └──────────────────────────────┬──────────────────────────────────────┘ │
│ │ │
│ ▼ │
│ ┌─────────────────────────────────────────────────────────────────────┐ │
│ │ PLUGIN LOADER │ │
│ │ │ │
│ │ - Lifecycle management (load, start, stop, unload) │ │
│ │ - Health monitoring │ │
│ │ - Hot reload support │ │
│ └──────────────────────────────┬──────────────────────────────────────┘ │
│ │ │
│ ▼ │
│ ┌─────────────────────────────────────────────────────────────────────┐ │
│ │ PLUGIN SANDBOX │ │
│ │ │ │
│ │ - Process isolation │ │
│ │ - Resource limits (CPU, memory, network) │ │
│ │ - Capability enforcement │ │
│ └─────────────────────────────────────────────────────────────────────┘ │
│ │
│ Plugin Types: │
│ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │
│ │ Connector │ │ Step │ │ Gate │ │ Agent │ │
│ │ Plugins │ │ Providers │ │ Providers │ │ Plugins │ │
│ └──────────────┘ └──────────────┘ └──────────────┘ └──────────────┘ │
│ │
└─────────────────────────────────────────────────────────────────────────────┘
```
## Modules
### Module: `plugin-registry`
| Aspect | Specification |
|--------|---------------|
| **Responsibility** | Plugin discovery; versioning; manifest management |
| **Data Entities** | `Plugin`, `PluginManifest`, `PluginVersion` |
| **Events Produced** | `plugin.discovered`, `plugin.registered`, `plugin.unregistered` |
**Plugin Entity**:
```typescript
interface Plugin {
id: UUID;
pluginId: string; // "com.example.my-connector"
version: string; // "1.2.3"
vendor: string;
license: string;
manifest: PluginManifest;
status: PluginStatus;
entrypoint: string; // Path to plugin executable/module
lastHealthCheck: DateTime;
healthMessage: string | null;
installedAt: DateTime;
updatedAt: DateTime;
}
type PluginStatus =
| "discovered" // Found but not loaded
| "loaded" // Loaded but not active
| "active" // Running and healthy
| "stopped" // Manually stopped
| "failed" // Failed to load or crashed
| "degraded"; // Running but with issues
```
---
### Module: `plugin-loader`
| Aspect | Specification |
|--------|---------------|
| **Responsibility** | Plugin lifecycle management |
| **Dependencies** | `plugin-registry`, `plugin-sandbox` |
| **Events Produced** | `plugin.loaded`, `plugin.started`, `plugin.stopped`, `plugin.failed` |
**Plugin Lifecycle**:
```
┌──────────────┐
│ DISCOVERED │ ──── Plugin found in registry
└──────┬───────┘
│ load()
┌──────────────┐
│ LOADED │ ──── Plugin validated and prepared
└──────┬───────┘
│ start()
┌──────────────┐ ┌──────────────┐
│ ACTIVE │ ──── │ DEGRADED │ ◄── Health issues
└──────┬───────┘ └──────────────┘
│ stop() │
▼ │
┌──────────────┐ │
│ STOPPED │ ◄───────────┘ manual stop
└──────────────┘
│ unload()
┌──────────────┐
│ UNLOADED │
└──────────────┘
```
**Lifecycle Operations**:
```typescript
interface PluginLoader {
// Discovery
discover(): Promise<Plugin[]>;
refresh(): Promise<void>;
// Lifecycle
load(pluginId: string): Promise<Plugin>;
start(pluginId: string): Promise<void>;
stop(pluginId: string): Promise<void>;
unload(pluginId: string): Promise<void>;
restart(pluginId: string): Promise<void>;
// Health
checkHealth(pluginId: string): Promise<HealthStatus>;
getStatus(pluginId: string): Promise<PluginStatus>;
// Hot reload
reload(pluginId: string): Promise<void>;
}
```
---
### Module: `plugin-sandbox`
| Aspect | Specification |
|--------|---------------|
| **Responsibility** | Isolation; resource limits; security |
| **Enforcement** | Process isolation, capability-based security |
**Sandbox Configuration**:
```typescript
interface SandboxConfig {
// Process isolation
processIsolation: boolean; // Run in separate process
containerIsolation: boolean; // Run in container
// Resource limits
resourceLimits: {
maxMemoryMb: number; // Memory limit
maxCpuPercent: number; // CPU limit
maxDiskMb: number; // Disk quota
maxNetworkBandwidth: number; // Network bandwidth limit
};
// Network restrictions
networkPolicy: {
allowedHosts: string[]; // Allowed outbound hosts
blockedHosts: string[]; // Blocked hosts
allowOutbound: boolean; // Allow any outbound
};
// Filesystem restrictions
filesystemPolicy: {
readOnlyPaths: string[];
writablePaths: string[];
blockedPaths: string[];
};
// Timeouts
timeouts: {
initializationMs: number;
operationMs: number;
shutdownMs: number;
};
}
```
**Capability Enforcement**:
```typescript
interface PluginCapabilities {
// Integration capabilities
integrations: {
scm: boolean;
ci: boolean;
registry: boolean;
vault: boolean;
router: boolean;
};
// Step capabilities
steps: {
deploy: boolean;
gate: boolean;
notify: boolean;
custom: boolean;
};
// System capabilities
system: {
network: boolean;
filesystem: boolean;
secrets: boolean;
database: boolean;
};
}
```
---
### Module: `plugin-sdk`
| Aspect | Specification |
|--------|---------------|
| **Responsibility** | SDK for plugin development |
| **Languages** | C#, TypeScript, Go |
**Plugin SDK Interface**:
```typescript
// Base plugin interface
interface StellaPlugin {
// Lifecycle
initialize(config: PluginConfig): Promise<void>;
start(): Promise<void>;
stop(): Promise<void>;
dispose(): Promise<void>;
// Health
getHealth(): Promise<HealthStatus>;
// Metadata
getManifest(): PluginManifest;
}
// Connector plugin interface
interface ConnectorPlugin extends StellaPlugin {
createConnector(config: ConnectorConfig): Promise<Connector>;
}
// Step provider plugin interface
interface StepProviderPlugin extends StellaPlugin {
getStepTypes(): StepType[];
executeStep(
stepType: string,
config: StepConfig,
inputs: StepInputs,
context: StepContext
): AsyncGenerator<StepEvent>;
}
// Gate provider plugin interface
interface GateProviderPlugin extends StellaPlugin {
getGateTypes(): GateType[];
evaluateGate(
gateType: string,
config: GateConfig,
context: GateContext
): Promise<GateResult>;
}
```
---
## Three-Surface Plugin Model
Plugins contribute to the system through three distinct surfaces:
### 1. Manifest Surface (Static)
The plugin manifest declares:
- Plugin identity and version
- Required capabilities
- Provided integrations/steps/gates
- Configuration schema
- UI components (optional)
```yaml
# plugin.stella.yaml
plugin:
id: "com.example.jenkins-connector"
version: "1.0.0"
vendor: "Example Corp"
license: "Apache-2.0"
description: "Jenkins CI integration for Stella Ops"
capabilities:
required:
- network
optional:
- secrets
provides:
integrations:
- type: "ci.jenkins"
displayName: "Jenkins"
configSchema: "./schemas/jenkins-config.json"
capabilities:
- "pipelines"
- "builds"
- "artifacts"
steps:
- type: "jenkins-trigger"
displayName: "Trigger Jenkins Build"
category: "integration"
configSchema: "./schemas/jenkins-trigger-config.json"
inputSchema: "./schemas/jenkins-trigger-input.json"
outputSchema: "./schemas/jenkins-trigger-output.json"
ui:
configScreen: "./ui/config.html"
icon: "./assets/jenkins-icon.svg"
dependencies:
stellaCore: ">=1.0.0"
```
### 2. Connector Runtime Surface (Dynamic)
Plugins implement connector interfaces for runtime operations:
```typescript
// Jenkins connector implementation
class JenkinsConnector implements CIConnector {
  // Definite assignment (!): the connector runtime guarantees initialize()
  // runs before any other method, so `client` is set before first use.
  private client!: JenkinsClient;

  /** Resolves the API token secret and builds the Jenkins HTTP client. */
  async initialize(config: ConnectorConfig, secrets: SecretHandle[]): Promise<void> {
    const apiToken = await this.getSecret(secrets, "api_token");
    this.client = new JenkinsClient({
      baseUrl: config.endpoint,
      username: config.username,
      apiToken: apiToken,
    });
  }

  /** Probes connectivity; never throws — failures are returned as a result. */
  async testConnection(): Promise<ConnectionTestResult> {
    try {
      // Fetching a CSRF crumb doubles as an authenticated connectivity probe.
      await this.client.getCrumb();
      return { success: true, message: "Connected to Jenkins" };
    } catch (error) {
      // `error` is `unknown` under strict mode; narrow before reading .message.
      const message = error instanceof Error ? error.message : String(error);
      return { success: false, message };
    }
  }

  /** Lists Jenkins jobs mapped to the generic PipelineInfo shape. */
  async listPipelines(): Promise<PipelineInfo[]> {
    const jobs = await this.client.getJobs();
    return jobs.map(job => ({
      id: job.name,
      name: job.displayName,
      url: job.url,
      lastBuild: job.lastBuild?.number,
    }));
  }

  /** Queues a build; Jenkins returns a queue item, not a build number, at this point. */
  async triggerPipeline(pipelineId: string, params: object): Promise<PipelineRun> {
    const queueItem = await this.client.build(pipelineId, params);
    return {
      id: queueItem.id.toString(),
      pipelineId,
      status: "queued",
      startedAt: new Date(),
    };
  }

  /** Fetches a build and maps its state; a null result means the build is still running. */
  async getPipelineRun(runId: string): Promise<PipelineRun> {
    const build = await this.client.getBuild(runId);
    return {
      id: build.number.toString(),
      pipelineId: build.job,
      status: this.mapStatus(build.result),
      startedAt: new Date(build.timestamp),
      completedAt: build.result ? new Date(build.timestamp + build.duration) : null,
    };
  }

  // Maps Jenkins build results to pipeline run statuses. Was referenced but
  // undefined in the original example.
  // NOTE(review): result strings follow the Jenkins REST API convention
  // (SUCCESS/FAILURE/ABORTED/UNSTABLE) — confirm against JenkinsClient typings.
  private mapStatus(result: string | null): PipelineRun["status"] {
    if (result === null) return "running"; // build still in progress
    switch (result) {
      case "SUCCESS": return "succeeded";
      case "ABORTED": return "cancelled";
      default: return "failed"; // FAILURE, UNSTABLE, etc.
    }
  }
}
```
### 3. Step Provider Surface (Execution)
Plugins implement step execution logic:
```typescript
// Jenkins trigger step implementation
/**
 * Triggers a Jenkins build via the configured connector and, when
 * `config.waitForCompletion` is set, polls until the build finishes.
 * Emits log/output/progress events as an async generator (StepExecutor contract).
 */
class JenkinsTriggerStep implements StepExecutor {
  async *execute(
    config: StepConfig,
    inputs: StepInputs,
    context: StepContext
  ): AsyncGenerator<StepEvent> {
    const connector = await context.getConnector<JenkinsConnector>(config.integrationId);
    yield { type: "log", line: `Triggering Jenkins job: ${config.jobName}` };

    // Trigger build
    const run = await connector.triggerPipeline(config.jobName, inputs.parameters);
    yield { type: "output", name: "buildId", value: run.id };
    yield { type: "log", line: `Build queued: ${run.id}` };

    // Wait for completion if configured
    if (config.waitForCompletion) {
      yield { type: "log", line: "Waiting for build to complete..." };
      // Bounded polling: without a deadline a hung build would poll forever.
      // `maxWaitSeconds` is optional; omitting it preserves the original
      // unlimited-wait behavior.
      const deadline = config.maxWaitSeconds
        ? Date.now() + config.maxWaitSeconds * 1000
        : Number.POSITIVE_INFINITY;
      while (true) {
        const status = await connector.getPipelineRun(run.id);
        if (status.status === "succeeded") {
          yield { type: "output", name: "status", value: "succeeded" };
          yield { type: "result", success: true };
          return;
        }
        if (status.status === "failed") {
          yield { type: "output", name: "status", value: "failed" };
          yield { type: "result", success: false, message: "Build failed" };
          return;
        }
        if (Date.now() >= deadline) {
          yield { type: "output", name: "status", value: "timeout" };
          yield { type: "result", success: false, message: "Timed out waiting for build" };
          return;
        }
        yield { type: "progress", progress: 50, message: `Build running: ${status.status}` };
        await sleep(config.pollIntervalSeconds * 1000);
      }
    }
    yield { type: "result", success: true };
  }
}
```
---
## Database Schema
```sql
-- Plugins
CREATE TABLE release.plugins (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
plugin_id VARCHAR(255) NOT NULL UNIQUE,
version VARCHAR(50) NOT NULL,
vendor VARCHAR(255) NOT NULL,
license VARCHAR(100),
manifest JSONB NOT NULL,
status VARCHAR(50) NOT NULL DEFAULT 'discovered' CHECK (status IN (
'discovered', 'loaded', 'active', 'stopped', 'failed', 'degraded'
)),
entrypoint VARCHAR(500) NOT NULL,
last_health_check TIMESTAMPTZ,
health_message TEXT,
installed_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
CREATE INDEX idx_plugins_status ON release.plugins(status);
-- Plugin Instances (per-tenant configuration)
CREATE TABLE release.plugin_instances (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
plugin_id UUID NOT NULL REFERENCES release.plugins(id) ON DELETE CASCADE,
tenant_id UUID NOT NULL REFERENCES tenants(id) ON DELETE CASCADE,
config JSONB NOT NULL DEFAULT '{}',
enabled BOOLEAN NOT NULL DEFAULT TRUE,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
CREATE INDEX idx_plugin_instances_tenant ON release.plugin_instances(tenant_id);
-- Integration types (populated by plugins)
CREATE TABLE release.integration_types (
id TEXT PRIMARY KEY, -- "scm.github", "ci.jenkins"
plugin_id UUID REFERENCES release.plugins(id),
display_name TEXT NOT NULL,
description TEXT,
icon_url TEXT,
config_schema JSONB NOT NULL, -- JSON Schema for config
capabilities TEXT[] NOT NULL, -- ["repos", "webhooks", "status"]
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
```
---
## API Endpoints
```yaml
# Plugin Registry
GET /api/v1/plugins
Query: ?status={status}&capability={type}
Response: Plugin[]
GET /api/v1/plugins/{id}
Response: Plugin (with manifest)
POST /api/v1/plugins/{id}/enable
Response: Plugin
POST /api/v1/plugins/{id}/disable
Response: Plugin
GET /api/v1/plugins/{id}/health
Response: { status, message, diagnostics[] }
# Plugin Instances (per-tenant config)
POST /api/v1/plugin-instances
Body: { pluginId: UUID, config: object }
Response: PluginInstance
GET /api/v1/plugin-instances
Response: PluginInstance[]
PUT /api/v1/plugin-instances/{id}
Body: { config: object, enabled: boolean }
Response: PluginInstance
DELETE /api/v1/plugin-instances/{id}
Response: { deleted: true }
```
---
## Plugin Security
### Capability Declaration
Plugins must declare all required capabilities in their manifest. The system enforces:
1. **Network Access**: Plugins can only access declared hosts
2. **Secret Access**: Plugins receive secrets through controlled injection
3. **Database Access**: No direct database access; API only
4. **Filesystem Access**: Limited to declared paths
### Sandbox Enforcement
```typescript
// Plugin execution is sandboxed
/**
 * Wraps every plugin operation in the sandbox controls described above:
 * capability verification, resource limits, an isolated execution context,
 * and a hard timeout taken from the plugin's manifest.
 * The isolated context is always disposed, even when the operation throws.
 */
class PluginSandbox {
  /**
   * Runs `operation` on behalf of `plugin` inside the sandbox.
   * After reporting a failure via handlePluginError, the original error is
   * rethrown so callers still observe it.
   */
  async execute<T>(
    plugin: Plugin,
    operation: () => Promise<T>
  ): Promise<T> {
    // 1. Verify capabilities
    this.verifyCapabilities(plugin);
    // 2. Set resource limits
    const limits = this.getResourceLimits(plugin);
    await this.applyLimits(limits);
    // 3. Create isolated context
    const context = await this.createIsolatedContext(plugin);
    try {
      // 4. Execute with timeout (budget comes from the plugin manifest)
      return await this.withTimeout(
        operation(),
        plugin.manifest.timeouts.operationMs
      );
    } catch (error) {
      // 5. Log and handle errors
      await this.handlePluginError(plugin, error);
      throw error;
    } finally {
      // 6. Cleanup — runs on both success and failure paths
      await context.dispose();
    }
  }
}
```
### Plugin Failures Cannot Crash Core
```csharp
// Core orchestration is protected from plugin failures
public sealed class PromotionDecisionEngine
{
    /// <summary>
    /// Evaluates all gates for a promotion. Plugin-provided gates may throw;
    /// their exceptions are captured as failed GateResults so a misbehaving
    /// plugin cannot crash core orchestration.
    /// </summary>
    public async Task<DecisionResult> EvaluateAsync(
        Promotion promotion,
        IReadOnlyList<IGateProvider> gates,
        CancellationToken ct)
    {
        var results = new List<GateResult>();
        foreach (var gate in gates)
        {
            try
            {
                // Plugin provides evaluation logic
                var result = await gate.EvaluateAsync(promotion, ct);
                results.Add(result);
            }
            catch (Exception ex)
            {
                // Plugin failure is logged but doesn't crash core
                _logger.LogError(ex, "Gate {GateType} failed", gate.Type);
                results.Add(new GateResult
                {
                    GateType = gate.Type,
                    Status = GateStatus.Failed,
                    Message = $"Gate evaluation failed: {ex.Message}",
                    IsBlocking = gate.IsBlocking,
                });
            }

            // Fail-fast: stop evaluating further gates only when the gate that
            // just ran actually FAILED and is blocking. (The original checked
            // IsBlocking alone, which aborted even after a PASSING blocking gate.)
            var last = results[^1];
            if (_policy.FailFast && last.IsBlocking && last.Status == GateStatus.Failed)
                break;
        }

        // Core decides how to aggregate (plugins cannot override)
        return _decisionAggregator.Aggregate(results);
    }
}
```
---
## References
- [Module Overview](overview.md)
- [Integration Hub](integration-hub.md)
- [Workflow Engine](workflow-engine.md)
- [Connector Interface](../integrations/connectors.md)

View File

@@ -0,0 +1,471 @@
# PROGDL: Progressive Delivery
**Purpose**: A/B releases, canary deployments, and traffic management.
## Architecture Overview
```
┌─────────────────────────────────────────────────────────────────────────────┐
│ PROGRESSIVE DELIVERY ARCHITECTURE │
│ │
│ ┌─────────────────────────────────────────────────────────────────────┐ │
│ │ A/B RELEASE MANAGER │ │
│ │ │ │
│ │ - Create A/B release with variations │ │
│ │ - Manage traffic split configuration │ │
│ │ - Coordinate rollout stages │ │
│ │ - Handle promotion/rollback │ │
│ └──────────────────────────────┬──────────────────────────────────────┘ │
│ │ │
│ ┌──────────────────┴──────────────────┐ │
│ │ │ │
│ ▼ ▼ │
│ ┌───────────────────────┐ ┌───────────────────────┐ │
│ │ TARGET-GROUP A/B │ │ ROUTER-BASED A/B │ │
│ │ │ │ │ │
│ │ Deploy to groups │ │ Configure traffic │ │
│ │ by labels/membership │ │ via load balancer │ │
│ │ │ │ │ │
│ │ Good for: │ │ Good for: │ │
│ │ - Background workers │ │ - Web/API traffic │ │
│ │ - Batch processors │ │ - Customer-facing │ │
│ │ - Internal services │ │ - L7 routing │ │
│ └───────────────────────┘ └───────────────────────┘ │
│ │
│ ┌─────────────────────────────────────────────────────────────────────┐ │
│ │ CANARY CONTROLLER │ │
│ │ │ │
│ │ - Execute rollout stages │ │
│ │ - Monitor health metrics │ │
│ │ - Auto-advance or pause │ │
│ │ - Trigger rollback on failure │ │
│ └─────────────────────────────────────────────────────────────────────┘ │
│ │
│ ┌─────────────────────────────────────────────────────────────────────┐ │
│ │ TRAFFIC ROUTER INTEGRATION │ │
│ │ │ │
│ │ Plugin-based integration with: │ │
│ │ - Nginx (config generation + reload) │ │
│ │ - HAProxy (config generation + reload) │ │
│ │ - Traefik (dynamic config API) │ │
│ │ - AWS ALB (target group weights) │ │
│ │ - Custom (webhook) │ │
│ └─────────────────────────────────────────────────────────────────────┘ │
│ │
└─────────────────────────────────────────────────────────────────────────────┘
```
## Modules
### Module: `ab-manager`
| Aspect | Specification |
|--------|---------------|
| **Responsibility** | A/B release lifecycle; variation management |
| **Dependencies** | `release-manager`, `environment-manager`, `deploy-orchestrator` |
| **Data Entities** | `ABRelease`, `Variation`, `TrafficSplit` |
| **Events Produced** | `ab.created`, `ab.started`, `ab.stage_advanced`, `ab.promoted`, `ab.rolled_back` |
**A/B Release Entity**:
```typescript
interface ABRelease {
id: UUID;
tenantId: UUID;
environmentId: UUID;
name: string;
variations: Variation[];
activeVariation: string; // "A" or "B"
trafficSplit: TrafficSplit;
rolloutStrategy: RolloutStrategy;
status: ABReleaseStatus;
createdAt: DateTime;
completedAt: DateTime | null;
createdBy: UUID;
}
interface Variation {
name: string; // "A", "B"
releaseId: UUID;
targetGroupId: UUID | null; // for target-group based A/B
trafficPercentage: number;
deploymentJobId: UUID | null;
}
interface TrafficSplit {
type: "percentage" | "sticky" | "header";
percentages: Record<string, number>; // {"A": 90, "B": 10}
stickyKey?: string; // cookie or header name
headerMatch?: { // for header-based routing
header: string;
values: Record<string, string>; // value -> variation
};
}
type ABReleaseStatus =
| "created" // Configured, not started
| "deploying" // Deploying variations
| "running" // Active with traffic split
| "promoting" // Promoting winner to 100%
| "completed" // Successfully completed
| "rolled_back"; // Rolled back to original
```
**A/B Release Models**:
| Model | Description | Use Case |
|-------|-------------|----------|
| **Target-Group A/B** | Deploy different releases to different target groups | Background workers, internal services |
| **Router-Based A/B** | Use load balancer to split traffic | Web/API traffic, customer-facing |
| **Hybrid A/B** | Combination of both | Complex deployments |
---
### Module: `traffic-router`
| Aspect | Specification |
|--------|---------------|
| **Responsibility** | Router plugin orchestration; traffic shifting |
| **Dependencies** | `integration-manager`, `connector-runtime` |
| **Protocol** | Plugin-specific (API calls, config generation) |
**Router Connector Interface**:
```typescript
interface RouterConnector extends BaseConnector {
// Traffic management
configureRoute(config: RouteConfig): Promise<void>;
getTrafficDistribution(): Promise<TrafficDistribution>;
shiftTraffic(from: string, to: string, percentage: number): Promise<void>;
// Configuration
reloadConfig(): Promise<void>;
validateConfig(config: string): Promise<ValidationResult>;
}
interface RouteConfig {
upstream: string;
backends: Array<{
name: string;
targets: string[];
weight: number;
}>;
healthCheck?: {
path: string;
interval: number;
timeout: number;
};
}
interface TrafficDistribution {
backends: Array<{
name: string;
weight: number;
healthyTargets: number;
totalTargets: number;
}>;
timestamp: DateTime;
}
```
**Router Plugins**:
| Plugin | Capabilities |
|--------|-------------|
| `router.nginx` | Config generation, reload via signal/API |
| `router.haproxy` | Config generation, reload via socket |
| `router.traefik` | Dynamic config API |
| `router.aws_alb` | Target group weights via AWS API |
| `router.custom` | Webhook-based custom integration |
---
### Module: `canary-controller`
| Aspect | Specification |
|--------|---------------|
| **Responsibility** | Canary ramp automation; health monitoring |
| **Dependencies** | `ab-manager`, `traffic-router` |
| **Data Entities** | `CanaryStage`, `HealthResult` |
| **Events Produced** | `canary.stage_started`, `canary.stage_passed`, `canary.stage_failed` |
**Canary Stage Entity**:
```typescript
interface CanaryStage {
id: UUID;
abReleaseId: UUID;
stageNumber: number;
trafficPercentage: number;
status: CanaryStageStatus;
healthThreshold: number; // Required health % to pass
durationSeconds: number; // How long to run stage
requireApproval: boolean; // Require manual approval
startedAt: DateTime | null;
completedAt: DateTime | null;
healthResult: HealthResult | null;
}
type CanaryStageStatus =
| "pending"
| "running"
| "succeeded"
| "failed"
| "skipped";
interface HealthResult {
healthy: boolean;
healthPercentage: number;
metrics: {
successRate: number;
errorRate: number;
latencyP50: number;
latencyP99: number;
};
samples: number;
evaluatedAt: DateTime;
}
```
**Canary Rollout Execution**:
```typescript
/**
 * Executes a staged canary rollout for an A/B release: shifts traffic stage
 * by stage, waits, evaluates health, and either promotes the canary or rolls
 * back on failure.
 * Convention: variations[0] is the baseline ("A"), variations[1] the canary ("B").
 */
class CanaryController {
  async executeRollout(abRelease: ABRelease): Promise<void> {
    const stages = abRelease.rolloutStrategy.stages;
    const [baseline, canary] = abRelease.variations;
    if (!baseline || !canary) {
      // Fail early instead of dereferencing undefined mid-rollout.
      throw new Error("A/B release must define a baseline and a canary variation");
    }
    for (const stage of stages) {
      this.log(`Starting canary stage ${stage.stageNumber}: ${stage.trafficPercentage}%`);
      // 1. Shift traffic to canary percentage
      await this.trafficRouter.shiftTraffic(
        baseline.name,
        canary.name,
        stage.trafficPercentage
      );
      // 2. Update stage status
      stage.status = "running";
      stage.startedAt = new Date();
      await this.save(stage);
      // 3. Wait for stage duration
      await this.waitForDuration(stage.durationSeconds);
      // 4. Evaluate health
      const healthResult = await this.evaluateHealth(abRelease, stage);
      stage.healthResult = healthResult;
      if (!healthResult.healthy || healthResult.healthPercentage < stage.healthThreshold) {
        stage.status = "failed";
        await this.save(stage);
        // Roll back before surfacing the failure to the caller.
        await this.rollback(abRelease);
        throw new CanaryFailedError(`Stage ${stage.stageNumber} failed health check`);
      }
      // 5. Check if approval required
      if (stage.requireApproval) {
        await this.waitForApproval(abRelease, stage);
      }
      stage.status = "succeeded";
      stage.completedAt = new Date();
      await this.save(stage);
      // 6. Check for auto-advance
      if (!abRelease.rolloutStrategy.autoAdvance) {
        await this.waitForManualAdvance(abRelease);
      }
    }
    // All stages passed - promote canary to 100%
    await this.promote(abRelease, canary.name);
  }

  /**
   * Samples the canary variation's targets and aggregates a health snapshot.
   * Guards against an empty target set (previously divided by zero) and a
   * missing canary variation (previously dereferenced `undefined`).
   */
  private async evaluateHealth(abRelease: ABRelease, stage: CanaryStage): Promise<HealthResult> {
    // Use the positional convention (variations[1] = canary) so this agrees
    // with executeRollout instead of relying on the literal name "B".
    const canaryVariation = abRelease.variations[1];
    if (!canaryVariation) {
      throw new Error("A/B release has no canary variation");
    }
    const targets = await this.getTargets(canaryVariation.targetGroupId);
    if (targets.length === 0) {
      // Nothing to sample: report unhealthy rather than producing NaN metrics.
      return {
        healthy: false,
        healthPercentage: 0,
        metrics: { successRate: 0, errorRate: 0, latencyP50: 0, latencyP99: 0 },
        samples: 0,
        evaluatedAt: new Date(),
      };
    }
    let healthyCount = 0;
    let totalLatency = 0;
    let errorCount = 0;
    for (const target of targets) {
      const health = await this.checkTargetHealth(target);
      if (health.healthy) healthyCount++;
      totalLatency += health.latencyMs;
      if (health.errorRate > 0) errorCount++;
    }
    return {
      healthy: healthyCount >= targets.length * (stage.healthThreshold / 100),
      healthPercentage: (healthyCount / targets.length) * 100,
      metrics: {
        successRate: ((targets.length - errorCount) / targets.length) * 100,
        errorRate: (errorCount / targets.length) * 100,
        latencyP50: totalLatency / targets.length, // mean, not a true p50
        latencyP99: totalLatency / targets.length * 1.5, // simplified estimate
      },
      samples: targets.length,
      evaluatedAt: new Date(),
    };
  }
}
```
---
### Module: `rollout-strategy`
| Aspect | Specification |
|--------|---------------|
| **Responsibility** | Strategy templates; configuration |
| **Data Entities** | `RolloutStrategyTemplate` |
**Built-in Strategy Templates**:
| Template | Stages | Description |
|----------|--------|-------------|
| `canary-10-25-50-100` | 4 | Standard canary: 10%, 25%, 50%, 100% |
| `canary-1-5-10-50-100` | 5 | Conservative: 1%, 5%, 10%, 50%, 100% |
| `blue-green-instant` | 2 | Deploy 100% to green, instant switch |
| `blue-green-gradual` | 4 | Gradual shift: 25%, 50%, 75%, 100% |
**Rollout Strategy Definition**:
```typescript
/**
 * Declarative rollout plan: an ordered list of traffic stages plus
 * global advance/rollback behavior applied across all stages.
 */
interface RolloutStrategy {
  id: UUID;
  name: string;
  stages: Array<{
    trafficPercentage: number; // share of traffic routed to the new variation at this stage
    durationSeconds: number;   // hold time before the stage may advance (0 = no hold)
    healthThreshold: number;   // minimum healthy-target percentage required to pass
    requireApproval: boolean;  // pause for human approval before advancing past this stage
  }>;
  autoAdvance: boolean;        // advance automatically when a stage passes its checks
  rollbackOnFailure: boolean;  // roll back to the stable variation when a stage fails
  healthCheckInterval: number; // seconds between health evaluations
}
// Example: Standard Canary.
// Typed as Omit<RolloutStrategy, "id"> because `id` is server-assigned at
// creation time and the literal would otherwise fail the type check.
const standardCanary: Omit<RolloutStrategy, "id"> = {
  name: "canary-10-25-50-100",
  stages: [
    { trafficPercentage: 10, durationSeconds: 300, healthThreshold: 95, requireApproval: false },
    { trafficPercentage: 25, durationSeconds: 600, healthThreshold: 95, requireApproval: false },
    { trafficPercentage: 50, durationSeconds: 900, healthThreshold: 95, requireApproval: true },
    { trafficPercentage: 100, durationSeconds: 0, healthThreshold: 95, requireApproval: false },
  ],
  autoAdvance: true,
  rollbackOnFailure: true,
  healthCheckInterval: 30,
};
```
---
## Database Schema
```sql
-- A/B Releases
CREATE TABLE release.ab_releases (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
tenant_id UUID NOT NULL REFERENCES tenants(id) ON DELETE CASCADE,
environment_id UUID NOT NULL REFERENCES release.environments(id),
name VARCHAR(255) NOT NULL,
variations JSONB NOT NULL, -- [{name, releaseId, targetGroupId, trafficPercentage}]
active_variation VARCHAR(50) NOT NULL DEFAULT 'A',
traffic_split JSONB NOT NULL,
rollout_strategy JSONB NOT NULL,
status VARCHAR(50) NOT NULL DEFAULT 'created' CHECK (status IN (
'created', 'deploying', 'running', 'promoting', 'completed', 'rolled_back'
)),
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
completed_at TIMESTAMPTZ,
created_by UUID REFERENCES users(id)
);
CREATE INDEX idx_ab_releases_tenant_env ON release.ab_releases(tenant_id, environment_id);
CREATE INDEX idx_ab_releases_status ON release.ab_releases(status);
-- Canary Stages
CREATE TABLE release.canary_stages (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
ab_release_id UUID NOT NULL REFERENCES release.ab_releases(id) ON DELETE CASCADE,
stage_number INTEGER NOT NULL,
traffic_percentage INTEGER NOT NULL,
status VARCHAR(50) NOT NULL DEFAULT 'pending' CHECK (status IN (
'pending', 'running', 'succeeded', 'failed', 'skipped'
)),
health_threshold DECIMAL(5,2),
duration_seconds INTEGER,
require_approval BOOLEAN NOT NULL DEFAULT FALSE,
started_at TIMESTAMPTZ,
completed_at TIMESTAMPTZ,
health_result JSONB,
UNIQUE (ab_release_id, stage_number)
);
```
---
## API Endpoints
```yaml
# A/B Releases
POST /api/v1/ab-releases
Body: {
environmentId: UUID,
name: string,
variations: [
{ name: "A", releaseId: UUID, targetGroupId?: UUID },
{ name: "B", releaseId: UUID, targetGroupId?: UUID }
],
trafficSplit: TrafficSplit,
rolloutStrategy: RolloutStrategy
}
Response: ABRelease
GET /api/v1/ab-releases
Query: ?environmentId={uuid}&status={status}
Response: ABRelease[]
GET /api/v1/ab-releases/{id}
Response: ABRelease (with stages)
POST /api/v1/ab-releases/{id}/start
Response: ABRelease
POST /api/v1/ab-releases/{id}/advance
Body: { stageNumber?: number } # advance to next or specific stage
Response: ABRelease
POST /api/v1/ab-releases/{id}/promote
Body: { variation: "A" | "B" } # promote to 100%
Response: ABRelease
POST /api/v1/ab-releases/{id}/rollback
Response: ABRelease
GET /api/v1/ab-releases/{id}/traffic
Response: { currentSplit: TrafficDistribution, history: TrafficHistory[] }
GET /api/v1/ab-releases/{id}/health
Response: { variations: [{ name, healthStatus, metrics }] }
# Rollout Strategies
GET /api/v1/rollout-strategies
Response: RolloutStrategyTemplate[]
GET /api/v1/rollout-strategies/{id}
Response: RolloutStrategyTemplate
```
---
## References
- [Module Overview](overview.md)
- [Deploy Orchestrator](deploy-orchestrator.md)
- [A/B Releases](../progressive-delivery/ab-releases.md)
- [Canary Controller](../progressive-delivery/canary.md)
- [Router Plugins](../progressive-delivery/routers.md)

View File

@@ -0,0 +1,433 @@
# PROMOT: Promotion & Approval Manager
**Purpose**: Manage promotion requests, approvals, gates, and decision records.
## Modules
### Module: `promotion-manager`
| Aspect | Specification |
|--------|---------------|
| **Responsibility** | Promotion request lifecycle; state management |
| **Dependencies** | `release-manager`, `environment-manager`, `workflow-engine` |
| **Data Entities** | `Promotion`, `PromotionState` |
| **Events Produced** | `promotion.requested`, `promotion.approved`, `promotion.rejected`, `promotion.started`, `promotion.completed`, `promotion.failed`, `promotion.rolled_back` |
**Key Operations**:
```
RequestPromotion(releaseId, targetEnvironmentId, reason) → Promotion
ApprovePromotion(promotionId, comment) → Promotion
RejectPromotion(promotionId, reason) → Promotion
CancelPromotion(promotionId) → Promotion
GetPromotionStatus(promotionId) → PromotionState
GetDecisionRecord(promotionId) → DecisionRecord
```
**Promotion Entity**:
```typescript
interface Promotion {
id: UUID;
tenantId: UUID;
releaseId: UUID;
sourceEnvironmentId: UUID | null; // null for first deployment
targetEnvironmentId: UUID;
status: PromotionStatus;
decisionRecord: DecisionRecord;
workflowRunId: UUID | null;
requestedAt: DateTime;
requestedBy: UUID;
requestReason: string;
decidedAt: DateTime | null;
startedAt: DateTime | null;
completedAt: DateTime | null;
evidencePacketId: UUID | null;
}
type PromotionStatus =
| "pending_approval" // Waiting for human approval
| "pending_gate" // Waiting for gate evaluation
| "approved" // Ready for deployment
| "rejected" // Blocked by approval or gate
| "deploying" // Deployment in progress
| "deployed" // Successfully deployed
| "failed" // Deployment failed
| "cancelled" // User cancelled
| "rolled_back"; // Rolled back after failure
```
---
### Module: `approval-gateway`
| Aspect | Specification |
|--------|---------------|
| **Responsibility** | Approval collection; separation of duties enforcement |
| **Dependencies** | `authority` (for user/group lookup) |
| **Data Entities** | `Approval`, `ApprovalPolicy` |
| **Events Produced** | `approval.granted`, `approval.denied` |
**Approval Policy Entity**:
```typescript
interface ApprovalPolicy {
id: UUID;
tenantId: UUID;
environmentId: UUID;
requiredCount: number; // Minimum approvals required
requiredRoles: string[]; // At least one approver must have role
requiredGroups: string[]; // At least one approver must be in group
requireSeparationOfDuties: boolean; // Requester cannot approve
allowSelfApproval: boolean; // Override SoD for specific users
expirationMinutes: number; // Approval expires after N minutes
}
interface Approval {
id: UUID;
tenantId: UUID;
promotionId: UUID;
approverId: UUID;
action: "approved" | "rejected";
comment: string;
approvedAt: DateTime;
approverRole: string;
approverGroups: string[];
}
```
**Separation of Duties (SoD) Rules**:
1. Requester cannot approve their own promotion (if `requireSeparationOfDuties` is true)
2. Same user cannot approve twice
3. At least N different users must approve (based on `requiredCount`)
4. At least one approver must match `requiredRoles` if specified
5. At least one approver must be in `requiredGroups` if specified
---
### Module: `decision-engine`
| Aspect | Specification |
|--------|---------------|
| **Responsibility** | Gate evaluation; policy integration; decision record generation |
| **Dependencies** | `gate-registry`, `policy` (OPA integration), `scanner` (security data) |
| **Data Entities** | `DecisionRecord`, `GateResult` |
| **Events Produced** | `decision.evaluated`, `decision.recorded` |
**Decision Record Structure**:
```typescript
interface DecisionRecord {
promotionId: UUID;
evaluatedAt: DateTime;
decision: "allow" | "deny" | "pending";
// What was evaluated
release: {
id: UUID;
name: string;
components: Array<{
name: string;
digest: string;
semver: string;
}>;
};
environment: {
id: UUID;
name: string;
requiredApprovals: number;
freezeWindow: boolean;
};
// Gate evaluation results
gates: GateResult[];
// Approval status
approvalStatus: {
required: number;
received: number;
approvers: Array<{
userId: UUID;
action: string;
at: DateTime;
}>;
sodViolation: boolean;
};
// Reason for decision
reasons: string[];
// Hash of all inputs for replay verification
inputsHash: string;
}
interface GateResult {
gateType: string;
gateName: string;
status: "passed" | "failed" | "warning" | "skipped";
message: string;
details: Record<string, any>;
evaluatedAt: DateTime;
durationMs: number;
}
```
**Gate Evaluation Order**:
1. **Freeze Window Check**: Is environment in freeze?
2. **Approval Check**: All required approvals received?
3. **Security Gate**: No blocking vulnerabilities?
4. **Custom Policy Gates**: All OPA policies pass?
5. **Integration Gates**: External system checks pass?
---
### Module: `gate-registry`
| Aspect | Specification |
|--------|---------------|
| **Responsibility** | Built-in + custom gate registration |
| **Dependencies** | `plugin-registry` |
| **Data Entities** | `GateDefinition`, `GateConfig` |
**Built-in Gates**:
| Gate Type | Description |
|-----------|-------------|
| `freeze-window` | Check if environment is in freeze |
| `approval` | Check if required approvals received |
| `security-scan` | Check for blocking vulnerabilities |
| `scan-freshness` | Check if scan is recent enough |
| `digest-verification` | Verify digests haven't changed |
| `environment-sequence` | Enforce promotion order |
| `custom-opa` | Custom OPA/Rego policy |
| `webhook` | External webhook gate |
**Gate Definition**:
```typescript
interface GateDefinition {
type: string;
displayName: string;
description: string;
configSchema: JSONSchema;
evaluator: "builtin" | UUID; // builtin or plugin ID
blocking: boolean; // Can block promotion
cacheable: boolean; // Can cache result
cacheTtlSeconds: number;
}
```
---
## Promotion State Machine
```
┌─────────────────────────────────────────────────────────────────────────────┐
│ PROMOTION STATE MACHINE │
│ │
│ ┌───────────────┐ │
│ │ REQUESTED │ ◄──── User requests promotion │
│ └───────┬───────┘ │
│ │ │
│ ▼ │
│ ┌───────────────┐ ┌───────────────┐ │
│ │ PENDING │─────►│ REJECTED │ ◄──── Approver rejects │
│ │ APPROVAL │ └───────────────┘ │
│ └───────┬───────┘ │
│ │ approval received │
│ ▼ │
│ ┌───────────────┐ ┌───────────────┐ │
│ │ PENDING │─────►│ REJECTED │ ◄──── Gate fails │
│ │ GATE │ └───────────────┘ │
│ └───────┬───────┘ │
│ │ all gates pass │
│ ▼ │
│ ┌───────────────┐ │
│ │ APPROVED │ ◄──── Ready for deployment │
│ └───────┬───────┘ │
│ │ workflow starts │
│ ▼ │
│ ┌───────────────┐ ┌───────────────┐ ┌───────────────┐ │
│ │ DEPLOYING │─────►│ FAILED │─────►│ ROLLED_BACK │ │
│ └───────┬───────┘ └───────────────┘ └───────────────┘ │
│ │ │
│ │ deployment complete │
│ ▼ │
│ ┌───────────────┐ │
│ │ DEPLOYED │ ◄──── Success! │
│ └───────────────┘ │
│ │
│ Additional transitions: │
│ - Any non-terminal → CANCELLED: user cancels │
│ │
└─────────────────────────────────────────────────────────────────────────────┘
```
---
## Database Schema
```sql
-- Promotions
CREATE TABLE release.promotions (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
tenant_id UUID NOT NULL REFERENCES tenants(id) ON DELETE CASCADE,
release_id UUID NOT NULL REFERENCES release.releases(id),
source_environment_id UUID REFERENCES release.environments(id),
target_environment_id UUID NOT NULL REFERENCES release.environments(id),
status VARCHAR(50) NOT NULL DEFAULT 'pending_approval' CHECK (status IN (
'pending_approval', 'pending_gate', 'approved', 'rejected',
'deploying', 'deployed', 'failed', 'cancelled', 'rolled_back'
)),
decision_record JSONB,
workflow_run_id UUID REFERENCES release.workflow_runs(id),
requested_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
requested_by UUID NOT NULL REFERENCES users(id),
request_reason TEXT,
decided_at TIMESTAMPTZ,
started_at TIMESTAMPTZ,
completed_at TIMESTAMPTZ,
evidence_packet_id UUID
);
CREATE INDEX idx_promotions_tenant ON release.promotions(tenant_id);
CREATE INDEX idx_promotions_release ON release.promotions(release_id);
CREATE INDEX idx_promotions_status ON release.promotions(status);
CREATE INDEX idx_promotions_target_env ON release.promotions(target_environment_id);
-- Approvals
CREATE TABLE release.approvals (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
tenant_id UUID NOT NULL REFERENCES tenants(id) ON DELETE CASCADE,
promotion_id UUID NOT NULL REFERENCES release.promotions(id) ON DELETE CASCADE,
approver_id UUID NOT NULL REFERENCES users(id),
action VARCHAR(50) NOT NULL CHECK (action IN ('approved', 'rejected')),
comment TEXT,
approved_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
approver_role VARCHAR(255),
approver_groups JSONB NOT NULL DEFAULT '[]'
);
CREATE INDEX idx_approvals_promotion ON release.approvals(promotion_id);
CREATE INDEX idx_approvals_approver ON release.approvals(approver_id);
-- Approval Policies
CREATE TABLE release.approval_policies (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
tenant_id UUID NOT NULL REFERENCES tenants(id) ON DELETE CASCADE,
environment_id UUID NOT NULL REFERENCES release.environments(id) ON DELETE CASCADE,
required_count INTEGER NOT NULL DEFAULT 1,
required_roles JSONB NOT NULL DEFAULT '[]',
required_groups JSONB NOT NULL DEFAULT '[]',
require_sod BOOLEAN NOT NULL DEFAULT FALSE,
allow_self_approval BOOLEAN NOT NULL DEFAULT FALSE,
expiration_minutes INTEGER NOT NULL DEFAULT 1440,
UNIQUE (tenant_id, environment_id)
);
```
---
## API Endpoints
```yaml
# Promotions
POST /api/v1/promotions
Body: { releaseId, targetEnvironmentId, reason? }
Response: Promotion
GET /api/v1/promotions
Query: ?status={status}&releaseId={uuid}&environmentId={uuid}&page={n}
Response: { data: Promotion[], meta: PaginationMeta }
GET /api/v1/promotions/{id}
Response: Promotion (with decision record, approvals)
POST /api/v1/promotions/{id}/approve
Body: { comment? }
Response: Promotion
POST /api/v1/promotions/{id}/reject
Body: { reason }
Response: Promotion
POST /api/v1/promotions/{id}/cancel
Response: Promotion
GET /api/v1/promotions/{id}/decision
Response: DecisionRecord
GET /api/v1/promotions/{id}/approvals
Response: Approval[]
GET /api/v1/promotions/{id}/evidence
Response: EvidencePacket
# Gate Evaluation Preview
POST /api/v1/promotions/preview-gates
Body: { releaseId, targetEnvironmentId }
Response: { wouldPass: boolean, gates: GateResult[] }
# Approval Policies
POST /api/v1/approval-policies
GET /api/v1/approval-policies
GET /api/v1/approval-policies/{id}
PUT /api/v1/approval-policies/{id}
DELETE /api/v1/approval-policies/{id}
# Pending Approvals (for current user)
GET /api/v1/my/pending-approvals
Response: Promotion[]
```
---
## Security Gate Integration
The security gate evaluates the release against vulnerability data from the Scanner module:
```typescript
interface SecurityGateConfig {
blockOnCritical: boolean; // Block if any critical severity
blockOnHigh: boolean; // Block if any high severity
maxCritical: number; // Max allowed critical (0 for strict)
maxHigh: number; // Max allowed high
requireFreshScan: boolean; // Require scan within N hours
scanFreshnessHours: number; // How recent scan must be
allowExceptions: boolean; // Allow VEX exceptions
requireVexJustification: boolean; // Require VEX for exceptions
}
interface SecurityGateResult {
passed: boolean;
summary: {
critical: number;
high: number;
medium: number;
low: number;
};
blocking: Array<{
cve: string;
severity: string;
component: string;
digest: string;
fixAvailable: boolean;
}>;
exceptions: Array<{
cve: string;
vexStatus: string;
justification: string;
}>;
scanAge: {
component: string;
scannedAt: DateTime;
ageHours: number;
fresh: boolean;
}[];
}
```
---
## References
- [Module Overview](overview.md)
- [Workflow Engine](workflow-engine.md)
- [Security Architecture](../security/overview.md)
- [API Documentation](../api/promotions.md)

View File

@@ -0,0 +1,406 @@
# RELMAN: Release Management
**Purpose**: Manage components, versions, and release bundles.
## Modules
### Module: `component-registry`
| Aspect | Specification |
|--------|---------------|
| **Responsibility** | Map image repositories to logical components |
| **Dependencies** | `integration-manager` (for registry access) |
| **Data Entities** | `Component`, `ComponentVersion` |
| **Events Produced** | `component.created`, `component.updated`, `component.deleted` |
**Key Operations**:
```
CreateComponent(name, displayName, imageRepository, registryId) → Component
UpdateComponent(id, config) → Component
DeleteComponent(id) → void
SyncVersions(componentId, forceRefresh) → VersionMap[]
ListComponents(tenantId) → Component[]
```
**Component Entity**:
```typescript
interface Component {
id: UUID;
tenantId: UUID;
name: string; // "api", "worker", "frontend"
displayName: string; // "API Service"
imageRepository: string; // "registry.example.com/myapp/api"
registryIntegrationId: UUID; // which registry integration
versioningStrategy: VersionStrategy;
deploymentTemplate: string; // which workflow template to use
defaultChannel: string; // "stable", "beta"
metadata: Record<string, string>;
}
interface VersionStrategy {
type: "semver" | "date" | "sequential" | "manual";
tagPattern?: string; // regex for tag extraction
semverExtract?: string; // regex capture group
}
```
---
### Module: `version-manager`
| Aspect | Specification |
|--------|---------------|
| **Responsibility** | Tag/digest mapping; version rules |
| **Dependencies** | `component-registry`, `connector-runtime` |
| **Data Entities** | `VersionMap`, `VersionRule`, `Channel` |
| **Events Produced** | `version.resolved`, `version.updated` |
**Version Resolution**:
```typescript
interface VersionMap {
id: UUID;
componentId: UUID;
tag: string; // "v2.3.1"
digest: string; // "sha256:abc123..."
semver: string; // "2.3.1"
channel: string; // "stable"
prerelease: boolean;
buildMetadata: string;
resolvedAt: DateTime;
source: "auto" | "manual";
}
interface VersionRule {
id: UUID;
componentId: UUID;
pattern: string; // "^v(\\d+\\.\\d+\\.\\d+)$"
channel: string; // "stable"
prereleasePattern: string;// ".*-(alpha|beta|rc).*"
}
```
**Version Resolution Algorithm**:
1. Fetch tags from registry (via connector)
2. Apply version rules to extract semver
3. Resolve each tag to digest
4. Store in version map
5. Update channels ("latest stable", "latest beta")
---
### Module: `release-manager`
| Aspect | Specification |
|--------|---------------|
| **Responsibility** | Release bundle lifecycle; composition |
| **Dependencies** | `component-registry`, `version-manager` |
| **Data Entities** | `Release`, `ReleaseComponent` |
| **Events Produced** | `release.created`, `release.promoted`, `release.deprecated` |
**Release Entity**:
```typescript
interface Release {
id: UUID;
tenantId: UUID;
name: string; // "myapp-v2.3.1"
displayName: string; // "MyApp 2.3.1"
components: ReleaseComponent[];
sourceRef: SourceReference;
status: ReleaseStatus;
createdAt: DateTime;
createdBy: UUID;
deployedEnvironments: UUID[]; // where currently deployed
metadata: Record<string, string>;
}
interface ReleaseComponent {
componentId: UUID;
componentName: string;
digest: string; // sha256:...
semver: string; // resolved semver
tag: string; // original tag (for display)
role: "primary" | "sidecar" | "init" | "migration";
}
interface SourceReference {
scmIntegrationId?: UUID;
commitSha?: string;
branch?: string;
ciIntegrationId?: UUID;
buildId?: string;
pipelineUrl?: string;
}
type ReleaseStatus =
| "draft" // being composed
| "ready" // ready for promotion
| "promoting" // promotion in progress
| "deployed" // deployed to at least one env
| "deprecated" // marked as deprecated
| "archived"; // no longer active
```
**Release Creation Modes**:
| Mode | Description |
|------|-------------|
| **Full Release** | All components, latest versions |
| **Partial Release** | Subset of components updated; others pinned from last deployment |
| **Pinned Release** | All versions explicitly specified |
| **Channel Release** | All components from specific channel ("beta") |
---
### Module: `release-catalog`
| Aspect | Specification |
|--------|---------------|
| **Responsibility** | Release history, search, comparison |
| **Dependencies** | `release-manager` |
**Key Operations**:
```
SearchReleases(filter, pagination) → Release[]
CompareReleases(releaseA, releaseB) → ReleaseDiff
GetReleaseHistory(componentId) → Release[]
GetReleaseLineage(releaseId) → ReleaseLineage // promotion path
```
**Release Comparison**:
```typescript
interface ReleaseDiff {
releaseA: UUID;
releaseB: UUID;
added: ComponentDiff[]; // Components in B not in A
removed: ComponentDiff[]; // Components in A not in B
changed: ComponentChange[]; // Components with different versions
unchanged: ComponentDiff[]; // Components with same version
}
interface ComponentChange {
componentId: UUID;
componentName: string;
fromVersion: string;
toVersion: string;
fromDigest: string;
toDigest: string;
}
```
---
## Database Schema
```sql
-- Components
CREATE TABLE release.components (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
tenant_id UUID NOT NULL REFERENCES tenants(id) ON DELETE CASCADE,
name VARCHAR(255) NOT NULL,
display_name VARCHAR(255) NOT NULL,
image_repository VARCHAR(500) NOT NULL,
registry_integration_id UUID REFERENCES release.integrations(id),
versioning_strategy JSONB NOT NULL DEFAULT '{"type": "semver"}',
deployment_template VARCHAR(255),
default_channel VARCHAR(50) NOT NULL DEFAULT 'stable',
metadata JSONB NOT NULL DEFAULT '{}',
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
UNIQUE (tenant_id, name)
);
CREATE INDEX idx_components_tenant ON release.components(tenant_id);
-- Version Maps
CREATE TABLE release.version_maps (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
tenant_id UUID NOT NULL REFERENCES tenants(id) ON DELETE CASCADE,
component_id UUID NOT NULL REFERENCES release.components(id) ON DELETE CASCADE,
tag VARCHAR(255) NOT NULL,
digest VARCHAR(100) NOT NULL,
semver VARCHAR(50),
channel VARCHAR(50) NOT NULL DEFAULT 'stable',
prerelease BOOLEAN NOT NULL DEFAULT FALSE,
build_metadata VARCHAR(255),
resolved_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
source VARCHAR(50) NOT NULL DEFAULT 'auto',
UNIQUE (tenant_id, component_id, digest)
);
CREATE INDEX idx_version_maps_component ON release.version_maps(component_id);
CREATE INDEX idx_version_maps_digest ON release.version_maps(digest);
CREATE INDEX idx_version_maps_semver ON release.version_maps(semver);
-- Releases
CREATE TABLE release.releases (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
tenant_id UUID NOT NULL REFERENCES tenants(id) ON DELETE CASCADE,
name VARCHAR(255) NOT NULL,
display_name VARCHAR(255) NOT NULL,
components JSONB NOT NULL, -- [{componentId, digest, semver, tag, role}]
source_ref JSONB, -- {scmIntegrationId, commitSha, ciIntegrationId, buildId}
status VARCHAR(50) NOT NULL DEFAULT 'draft',
metadata JSONB NOT NULL DEFAULT '{}',
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
created_by UUID REFERENCES users(id),
UNIQUE (tenant_id, name)
);
CREATE INDEX idx_releases_tenant ON release.releases(tenant_id);
CREATE INDEX idx_releases_status ON release.releases(status);
CREATE INDEX idx_releases_created ON release.releases(created_at DESC);
-- Release Environment State
CREATE TABLE release.release_environment_state (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
tenant_id UUID NOT NULL REFERENCES tenants(id) ON DELETE CASCADE,
environment_id UUID NOT NULL REFERENCES release.environments(id) ON DELETE CASCADE,
release_id UUID NOT NULL REFERENCES release.releases(id),
status VARCHAR(50) NOT NULL,
deployed_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
deployed_by UUID REFERENCES users(id),
promotion_id UUID,
evidence_ref VARCHAR(255),
UNIQUE (tenant_id, environment_id)
);
CREATE INDEX idx_release_env_state_env ON release.release_environment_state(environment_id);
CREATE INDEX idx_release_env_state_release ON release.release_environment_state(release_id);
```
---
## API Endpoints
```yaml
# Components
POST /api/v1/components
Body: { name, displayName, imageRepository, registryIntegrationId, versioningStrategy?, defaultChannel? }
Response: Component
GET /api/v1/components
Response: Component[]
GET /api/v1/components/{id}
Response: Component
PUT /api/v1/components/{id}
Response: Component
DELETE /api/v1/components/{id}
Response: { deleted: true }
POST /api/v1/components/{id}/sync-versions
Body: { forceRefresh?: boolean }
Response: { synced: number, versions: VersionMap[] }
GET /api/v1/components/{id}/versions
Query: ?channel={stable|beta}&limit={n}
Response: VersionMap[]
# Version Maps
POST /api/v1/version-maps
Body: { componentId, tag, semver, channel } # manual version assignment
Response: VersionMap
GET /api/v1/version-maps
Query: ?componentId={uuid}&channel={channel}
Response: VersionMap[]
# Releases
POST /api/v1/releases
Body: {
name: string,
displayName?: string,
components: [
{ componentId: UUID, version?: string, digest?: string, channel?: string }
],
sourceRef?: SourceReference
}
Response: Release
GET /api/v1/releases
Query: ?status={status}&componentId={uuid}&page={n}&pageSize={n}
Response: { data: Release[], meta: PaginationMeta }
GET /api/v1/releases/{id}
Response: Release (with full component details)
PUT /api/v1/releases/{id}
Body: { displayName?, metadata?, status? }
Response: Release
DELETE /api/v1/releases/{id}
Response: { deleted: true }
GET /api/v1/releases/{id}/state
Response: { environments: [{ environmentId, status, deployedAt }] }
POST /api/v1/releases/{id}/deprecate
Response: Release
GET /api/v1/releases/{id}/compare/{otherId}
Response: ReleaseDiff
# Quick release creation
POST /api/v1/releases/from-latest
Body: {
name: string,
channel?: string, # default: stable
componentIds?: UUID[], # default: all
pinFrom?: { environmentId: UUID } # for partial release
}
Response: Release
```
---
## Release Identity: Digest-First Principle
A core design invariant of the Release Orchestrator:
```
INVARIANT: A release is a set of OCI image digests (component -> digest mapping), never tags.
```
**Implementation Requirements**:
- Tags are convenience inputs for resolution
- Tags are resolved to digests at release creation time
- All downstream operations (promotion, deployment, rollback) use digests
- Digest mismatch at pull time = deployment failure (tamper detection)
**Example**:
```json
{
"id": "release-uuid",
"name": "myapp-v2.3.1",
"components": [
{
"componentId": "api-component-uuid",
"componentName": "api",
"tag": "v2.3.1",
"digest": "sha256:abc123def456...",
"semver": "2.3.1",
"role": "primary"
},
{
"componentId": "worker-component-uuid",
"componentName": "worker",
"tag": "v2.3.1",
"digest": "sha256:789xyz123abc...",
"semver": "2.3.1",
"role": "primary"
}
]
}
```
---
## References
- [Module Overview](overview.md)
- [Design Principles](../design/principles.md)
- [API Documentation](../api/releases.md)
- [Promotion Manager](promotion-manager.md)

View File

@@ -0,0 +1,590 @@
# WORKFL: Workflow Engine
**Purpose**: DAG-based workflow execution for deployments, approvals, and custom automation.
## Modules
### Module: `workflow-designer`
| Aspect | Specification |
|--------|---------------|
| **Responsibility** | Template creation; DAG graph editor; validation |
| **Dependencies** | `step-registry` |
| **Data Entities** | `WorkflowTemplate`, `StepNode`, `StepEdge` |
**Workflow Template Structure**:
```typescript
interface WorkflowTemplate {
id: UUID;
tenantId: UUID;
name: string;
displayName: string;
description: string;
version: number;
// DAG structure
nodes: StepNode[];
edges: StepEdge[];
// I/O
inputs: InputDefinition[];
outputs: OutputDefinition[];
// Metadata
tags: string[];
isBuiltin: boolean;
createdAt: DateTime;
createdBy: UUID;
}
```
---
### Module: `workflow-engine`
| Aspect | Specification |
|--------|---------------|
| **Responsibility** | DAG execution; state machine; pause/resume |
| **Dependencies** | `step-executor`, `step-registry` |
| **Data Entities** | `WorkflowRun`, `WorkflowState` |
| **Events Produced** | `workflow.started`, `workflow.paused`, `workflow.resumed`, `workflow.completed`, `workflow.failed` |
**Workflow Execution Algorithm**:
```python
class WorkflowEngine:
    """Drives a workflow run's DAG to a terminal state.

    Repeatedly dispatches every node whose dependencies are satisfied,
    handling pause/approval waits, per-node failure actions, and
    terminal-state bookkeeping ("succeeded", "failed", "cancelled").
    """

    def execute(self, workflow_run: WorkflowRun) -> None:
        """Main workflow execution loop."""
        # Initialize: mark the run as started and persist before doing any work.
        workflow_run.status = "running"
        workflow_run.started_at = now()
        self.save(workflow_run)
        try:
            while not self.is_terminal(workflow_run):
                # Handle pause state: block until an external resume, then
                # re-enter the loop from the top.
                if workflow_run.status == "paused":
                    self.wait_for_resume(workflow_run)
                    continue
                # Get nodes whose upstream dependencies are all satisfied.
                ready_nodes = self.get_ready_nodes(workflow_run)
                if not ready_nodes:
                    # No runnable node. First check if we're waiting on approvals;
                    # if so, park the run as "paused" until approval arrives.
                    if self.has_pending_approvals(workflow_run):
                        workflow_run.status = "paused"
                        self.save(workflow_run)
                        continue
                    # Check if all nodes are complete — normal loop exit.
                    if self.all_nodes_complete(workflow_run):
                        break
                    # Nothing runnable, nothing pending, not all done:
                    # the DAG can make no progress — deadlock detection.
                    raise WorkflowDeadlockError(workflow_run.id)
                # Execute all ready nodes in parallel via the executor pool.
                futures = []
                for node in ready_nodes:
                    future = self.executor.submit(
                        self.execute_node,
                        workflow_run,
                        node
                    )
                    futures.append((node, future))
                # Wait for at least one to complete, then process each finished
                # node so newly-unblocked successors become ready next iteration.
                completed = self.wait_any(futures)
                for node, result in completed:
                    step_run = self.get_step_run(workflow_run, node.id)
                    if result.success:
                        step_run.status = "succeeded"
                        step_run.outputs = result.outputs
                        # Make this node's outputs visible to downstream nodes.
                        self.propagate_outputs(workflow_run, node, result.outputs)
                    else:
                        step_run.status = "failed"
                        step_run.error_message = result.error
                        # Handle failure action configured on the node.
                        if node.on_failure == "fail":
                            # Hard failure: mark the run failed, cancel what's queued.
                            # NOTE(review): this returns before step_run.completed_at
                            # is set and step_run is saved below — confirm the failed
                            # step's final state is persisted elsewhere.
                            workflow_run.status = "failed"
                            workflow_run.error_message = f"Step {node.name} failed: {result.error}"
                            self.cancel_pending_steps(workflow_run)
                            return
                        elif node.on_failure == "rollback":
                            self.trigger_rollback(workflow_run, node)
                        elif node.on_failure.startswith("goto:"):
                            # "goto:<nodeId>" jumps execution to an explicit node.
                            target = node.on_failure.split(":")[1]
                            self.add_ready_node(workflow_run, target)
                        # "continue" just continues to next nodes — the run can
                        # still end "succeeded" with this step marked failed.
                    step_run.completed_at = now()
                    self.save(step_run)
            # Workflow completed successfully (the "fail" path returned above).
            workflow_run.status = "succeeded"
            workflow_run.completed_at = now()
            self.save(workflow_run)
        except WorkflowCancelledError:
            # User-initiated cancellation surfaces as a dedicated exception.
            workflow_run.status = "cancelled"
            workflow_run.completed_at = now()
            self.save(workflow_run)
        except Exception as e:
            # Any unexpected error fails the run; the message is persisted
            # for operator inspection.
            workflow_run.status = "failed"
            workflow_run.error_message = str(e)
            workflow_run.completed_at = now()
            self.save(workflow_run)
```
---
### Module: `step-executor`
| Aspect | Specification |
|--------|---------------|
| **Responsibility** | Step dispatch; retry logic; timeout handling |
| **Dependencies** | `step-registry`, `plugin-sandbox` |
| **Data Entities** | `StepRun`, `StepResult` |
| **Events Produced** | `step.started`, `step.progress`, `step.completed`, `step.failed`, `step.retrying` |
**Step Node Structure**:
```typescript
interface StepNode {
id: string; // Unique within template (e.g., "deploy-api")
type: string; // Step type from registry
name: string; // Display name
config: Record<string, any>; // Step-specific configuration
inputs: InputBinding[]; // Input value bindings
outputs: OutputBinding[]; // Output declarations
position: { x: number; y: number }; // UI position
// Execution settings
timeout: number; // Seconds (default from step type)
retryPolicy: RetryPolicy;
onFailure: FailureAction;
condition?: string; // JS expression for conditional execution
// Documentation
description?: string;
documentation?: string;
}
type FailureAction = "fail" | "continue" | "rollback" | "goto:{nodeId}";
interface InputBinding {
name: string; // Input parameter name
source: InputSource;
}
type InputSource =
| { type: "literal"; value: any }
| { type: "context"; path: string } // e.g., "release.name"
| { type: "output"; nodeId: string; outputName: string }
| { type: "secret"; secretName: string }
| { type: "expression"; expression: string }; // JS expression
interface StepEdge {
id: string;
from: string; // Source node ID
to: string; // Target node ID
condition?: string; // Optional condition expression
label?: string; // Display label for conditional edges
}
/** Retry behavior applied when a step fails with a retryable error. */
interface RetryPolicy {
  maxRetries: number;               // Maximum retry attempts (e.g. 2 in the standard-deploy template)
  backoffType: "fixed" | "exponential";
  backoffSeconds: number;           // Base delay between attempts, in seconds
  retryableErrors: string[];        // Errors eligible for automatic retry — matching semantics TODO confirm
}
```
---
### Module: `step-registry`
| Aspect | Specification |
|--------|---------------|
| **Responsibility** | Built-in + plugin-provided step types |
| **Dependencies** | `plugin-registry` |
| **Data Entities** | `StepType`, `StepSchema` |
**Built-in Step Types**:
| Step Type | Category | Description |
|-----------|----------|-------------|
| `approval` | Control | Wait for human approval |
| `security-gate` | Gate | Evaluate security policy |
| `custom-gate` | Gate | Custom OPA policy evaluation |
| `deploy-docker` | Deploy | Deploy single container |
| `deploy-compose` | Deploy | Deploy Docker Compose stack |
| `deploy-ecs` | Deploy | Deploy to AWS ECS |
| `deploy-nomad` | Deploy | Deploy to HashiCorp Nomad |
| `health-check` | Verify | HTTP/TCP health check |
| `smoke-test` | Verify | Run smoke test suite |
| `execute-script` | Custom | Run C#/Bash script |
| `webhook` | Integration | Call external webhook |
| `trigger-ci` | Integration | Trigger CI pipeline |
| `wait-ci` | Integration | Wait for CI pipeline |
| `notify` | Notification | Send notification |
| `rollback` | Recovery | Rollback deployment |
| `traffic-shift` | Progressive | Shift traffic percentage |
**Step Type Definition**:
```typescript
/**
 * Definition of a step type in the step registry. Step types are either
 * built-in or contributed by a plugin (executor holds the plugin ID).
 */
interface StepType {
  type: string;                     // "deploy-compose"
  displayName: string;              // "Deploy Compose Stack"
  description: string;
  category: StepCategory;
  icon: string;
  // Schema
  configSchema: JSONSchema;         // Step configuration schema (validates StepNode.config)
  inputSchema: JSONSchema;          // Required inputs schema
  outputSchema: JSONSchema;         // Produced outputs schema
  // Execution
  executor: "builtin" | UUID;       // builtin or plugin ID
  defaultTimeout: number;           // Seconds (used when StepNode.timeout is not set)
  safeToRetry: boolean;             // Whether retrying is safe — presumably means idempotent execution; confirm
  retryableErrors: string[];        // Default retryable error set for this step type
  // Documentation
  documentation: string;
  examples: StepExample[];
}
```
---
## Workflow Run State Machine
```
┌─────────────────────────────────────────────────────────────────────────────┐
│ WORKFLOW RUN STATE MACHINE │
│ │
│ ┌──────────┐ │
│ │ CREATED │ │
│ └────┬─────┘ │
│ │ start() │
│ ▼ │
│ ┌─────────────────────────────┐ │
│ │ │ │
│ pause() ┌──┴──────────┐ │ │
│ ┌────────►│ PAUSED │◄─────────┐ │ │
│ │ └──────┬──────┘ │ │ │
│ │ │ resume() │ │ │
│ │ ▼ │ │ │
│ │ ┌─────────────┐ │ │ │
│ └─────────│ RUNNING │──────────┘ │ │
│ └──────┬──────┘ (waiting for │ │
│ │ approval) │ │
│ ┌────────────┼────────────┐ │ │
│ │ │ │ │ │
│ ▼ ▼ ▼ │ │
│ ┌───────────┐ ┌───────────┐ ┌───────────┐ │ │
│ │ SUCCEEDED │ │ FAILED │ │ CANCELLED │ │ │
│ └───────────┘ └───────────┘ └───────────┘ │ │
│ │
│ Transitions: │
│ - CREATED → RUNNING: start() │
│ - RUNNING → PAUSED: pause(), waiting approval │
│ - PAUSED → RUNNING: resume(), approval granted │
│ - RUNNING → SUCCEEDED: all nodes complete │
│ - RUNNING → FAILED: node fails with fail action │
│ - RUNNING → CANCELLED: cancel() │
│ - PAUSED → CANCELLED: cancel() │
│ │
└─────────────────────────────────────────────────────────────────────────────┘
```
## Step Run State Machine
```
┌─────────────────────────────────────────────────────────────────────────────┐
│ STEP RUN STATE MACHINE │
│ │
│ ┌──────────┐ │
│ │ PENDING │ ◄──── Initial state; dependencies not met │
│ └────┬─────┘ │
│ │ dependencies met + condition true │
│ ▼ │
│ ┌──────────┐ │
│ │ RUNNING │ ◄──── Step is executing │
│ └────┬─────┘ │
│ │ │
│ ┌────┴────────────────┬─────────────────┐ │
│ │ │ │ │
│ ▼ ▼ ▼ │
│ ┌───────────┐ ┌───────────┐ ┌───────────┐ │
│ │ SUCCEEDED │ │ FAILED │ │ SKIPPED │ │
│ └───────────┘ └─────┬─────┘ └───────────┘ │
│ │ ▲ │
│ │ │ condition false │
│ ▼ │ │
│ ┌───────────┐ │ │
│ │ RETRYING │──────┘ (max retries exceeded) │
│ └─────┬─────┘ │
│ │ │
│ │ retry attempt │
│ └──────────────────┐ │
│ │ │
│ ▼ │
│ ┌──────────┐ │
│ │ RUNNING │ (retry) │
│ └──────────┘ │
│ │
│ Additional transitions: │
│ - Any state → CANCELLED: workflow cancelled │
│ │
└─────────────────────────────────────────────────────────────────────────────┘
```
---
## Database Schema
```sql
-- Workflow Templates: versioned workflow definitions (graph of nodes + edges).
CREATE TABLE release.workflow_templates (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    -- Nullable, presumably so built-in (is_builtin) templates can be global
    -- rather than tenant-scoped — confirm.
    tenant_id UUID REFERENCES tenants(id) ON DELETE CASCADE,
    name VARCHAR(255) NOT NULL,
    display_name VARCHAR(255) NOT NULL,
    description TEXT,
    version INTEGER NOT NULL DEFAULT 1,
    nodes JSONB NOT NULL,                   -- StepNode[] (see Step Node Structure)
    edges JSONB NOT NULL,                   -- StepEdge[]
    inputs JSONB NOT NULL DEFAULT '[]',     -- Declared template inputs
    outputs JSONB NOT NULL DEFAULT '[]',    -- Declared template outputs
    tags JSONB NOT NULL DEFAULT '[]',
    is_builtin BOOLEAN NOT NULL DEFAULT FALSE,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    created_by UUID REFERENCES users(id)
);
CREATE INDEX idx_workflow_templates_tenant ON release.workflow_templates(tenant_id);
CREATE INDEX idx_workflow_templates_name ON release.workflow_templates(name);
-- NOTE(review): no UNIQUE(tenant_id, name, version) constraint, so duplicate
-- template versions are possible at the schema level — confirm whether
-- uniqueness is enforced in application code.
-- Workflow Runs: one row per execution of a template.
-- status follows the workflow run state machine:
-- created → running ↔ paused → succeeded | failed | cancelled.
CREATE TABLE release.workflow_runs (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    tenant_id UUID NOT NULL REFERENCES tenants(id) ON DELETE CASCADE,
    template_id UUID NOT NULL REFERENCES release.workflow_templates(id),
    template_version INTEGER NOT NULL,      -- Snapshot of the template version used for this run
    status VARCHAR(50) NOT NULL DEFAULT 'created',
    context JSONB NOT NULL,                 -- Run context values resolved by InputSource "context" bindings
    inputs JSONB NOT NULL DEFAULT '{}',
    outputs JSONB NOT NULL DEFAULT '{}',
    error_message TEXT,                     -- Set when the run fails (see orchestrator failure handling)
    started_at TIMESTAMPTZ,
    completed_at TIMESTAMPTZ,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    created_by UUID REFERENCES users(id)
);
CREATE INDEX idx_workflow_runs_tenant ON release.workflow_runs(tenant_id);
CREATE INDEX idx_workflow_runs_template ON release.workflow_runs(template_id);
CREATE INDEX idx_workflow_runs_status ON release.workflow_runs(status);
-- Step Runs: one row per node execution inside a workflow run.
CREATE TABLE release.step_runs (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    workflow_run_id UUID NOT NULL REFERENCES release.workflow_runs(id) ON DELETE CASCADE,
    node_id VARCHAR(255) NOT NULL,          -- StepNode.id within the template
    status VARCHAR(50) NOT NULL DEFAULT 'pending',
    inputs JSONB NOT NULL DEFAULT '{}',     -- Resolved input values
    outputs JSONB NOT NULL DEFAULT '{}',    -- Produced outputs (consumed via InputSource "output")
    error_message TEXT,
    logs TEXT,
    attempt_number INTEGER NOT NULL DEFAULT 1,  -- Incremented by retries — presumably the row is reused per attempt; confirm
    started_at TIMESTAMPTZ,
    completed_at TIMESTAMPTZ,
    -- At most one step-run row per node per workflow run.
    UNIQUE (workflow_run_id, node_id)
);
CREATE INDEX idx_step_runs_workflow ON release.step_runs(workflow_run_id);
CREATE INDEX idx_step_runs_status ON release.step_runs(status);
-- Step Registry: catalog of available step types (built-in and plugin-provided).
-- Mirrors the StepType interface in the step-registry module.
CREATE TABLE release.step_types (
    type VARCHAR(255) PRIMARY KEY,          -- e.g. 'deploy-compose'
    display_name VARCHAR(255) NOT NULL,
    description TEXT,
    category VARCHAR(100) NOT NULL,
    icon VARCHAR(255),
    config_schema JSONB NOT NULL,           -- JSON Schema validating StepNode.config
    input_schema JSONB NOT NULL,
    output_schema JSONB NOT NULL,
    executor VARCHAR(255) NOT NULL DEFAULT 'builtin',  -- 'builtin' or a plugin identifier
    default_timeout INTEGER NOT NULL DEFAULT 300,      -- Seconds
    safe_to_retry BOOLEAN NOT NULL DEFAULT FALSE,
    retryable_errors JSONB NOT NULL DEFAULT '[]',
    documentation TEXT,
    examples JSONB NOT NULL DEFAULT '[]',
    plugin_id UUID REFERENCES release.plugins(id),     -- Presumably NULL for built-in step types — confirm
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
CREATE INDEX idx_step_types_category ON release.step_types(category);
CREATE INDEX idx_step_types_plugin ON release.step_types(plugin_id);
```
---
## Workflow Template Example: Standard Deployment
```json
{
"id": "template-standard-deploy",
"name": "standard-deploy",
"displayName": "Standard Deployment",
"version": 1,
"inputs": [
{ "name": "releaseId", "type": "uuid", "required": true },
{ "name": "environmentId", "type": "uuid", "required": true },
{ "name": "promotionId", "type": "uuid", "required": true }
],
"nodes": [
{
"id": "approval",
"type": "approval",
"name": "Approval Gate",
"config": {},
"inputs": [
{ "name": "promotionId", "source": { "type": "context", "path": "promotionId" } }
],
"position": { "x": 100, "y": 100 }
},
{
"id": "security-gate",
"type": "security-gate",
"name": "Security Verification",
"config": {
"blockOnCritical": true,
"blockOnHigh": true
},
"inputs": [
{ "name": "releaseId", "source": { "type": "context", "path": "releaseId" } }
],
"position": { "x": 100, "y": 200 }
},
{
"id": "deploy-targets",
"type": "deploy-compose",
"name": "Deploy to Targets",
"config": {
"strategy": "rolling",
"parallelism": 2
},
"inputs": [
{ "name": "releaseId", "source": { "type": "context", "path": "releaseId" } },
{ "name": "environmentId", "source": { "type": "context", "path": "environmentId" } }
],
"timeout": 600,
"retryPolicy": {
"maxRetries": 2,
"backoffType": "exponential",
"backoffSeconds": 30
},
"onFailure": "rollback",
"position": { "x": 100, "y": 400 }
},
{
"id": "health-check",
"type": "health-check",
"name": "Health Verification",
"config": {
"type": "http",
"path": "/health",
"expectedStatus": 200,
"timeout": 30,
"retries": 5
},
"inputs": [
{ "name": "targets", "source": { "type": "output", "nodeId": "deploy-targets", "outputName": "deployedTargets" } }
],
"onFailure": "rollback",
"position": { "x": 100, "y": 500 }
},
{
"id": "notify-success",
"type": "notify",
"name": "Success Notification",
"config": {
"channel": "slack",
"template": "deployment-success"
},
"onFailure": "continue",
"position": { "x": 100, "y": 700 }
},
{
"id": "rollback-handler",
"type": "rollback",
"name": "Rollback Handler",
"config": {
"strategy": "to-previous"
},
"inputs": [
{ "name": "deploymentJobId", "source": { "type": "output", "nodeId": "deploy-targets", "outputName": "jobId" } }
],
"position": { "x": 300, "y": 450 }
}
],
"edges": [
{ "id": "e1", "from": "approval", "to": "security-gate" },
{ "id": "e2", "from": "security-gate", "to": "deploy-targets" },
{ "id": "e3", "from": "deploy-targets", "to": "health-check" },
{ "id": "e4", "from": "health-check", "to": "notify-success" },
{ "id": "e5", "from": "deploy-targets", "to": "rollback-handler", "condition": "status === 'failed'" },
{ "id": "e6", "from": "health-check", "to": "rollback-handler", "condition": "status === 'failed'" }
]
}
```
---
## API Endpoints
See [API Documentation](../api/workflows.md) for full specification.
```yaml
# Workflow Templates
POST /api/v1/workflow-templates
GET /api/v1/workflow-templates
GET /api/v1/workflow-templates/{id}
PUT /api/v1/workflow-templates/{id}
DELETE /api/v1/workflow-templates/{id}
POST /api/v1/workflow-templates/{id}/validate
# Step Registry
GET /api/v1/step-types
GET /api/v1/step-types/{type}
# Workflow Runs
POST /api/v1/workflow-runs
GET /api/v1/workflow-runs
GET /api/v1/workflow-runs/{id}
POST /api/v1/workflow-runs/{id}/pause
POST /api/v1/workflow-runs/{id}/resume
POST /api/v1/workflow-runs/{id}/cancel
GET /api/v1/workflow-runs/{id}/steps
GET /api/v1/workflow-runs/{id}/steps/{nodeId}
GET /api/v1/workflow-runs/{id}/steps/{nodeId}/logs
GET /api/v1/workflow-runs/{id}/steps/{nodeId}/artifacts
```
---
## References
- [Module Overview](overview.md)
- [Workflow Templates](../workflow/templates.md)
- [Execution State Machine](../workflow/execution.md)
- [API Documentation](../api/workflows.md)