release orchestrator pivot, architecture and planning
This commit is contained in:
597
docs/modules/release-orchestrator/modules/agents.md
Normal file
597
docs/modules/release-orchestrator/modules/agents.md
Normal file
@@ -0,0 +1,597 @@
|
||||
# AGENTS: Deployment Agents
|
||||
|
||||
**Purpose**: Lightweight deployment agents for target execution.
|
||||
|
||||
## Agent Types
|
||||
|
||||
| Agent Type | Transport | Target Types |
|
||||
|------------|-----------|--------------|
|
||||
| `agent-docker` | gRPC | Docker hosts |
|
||||
| `agent-compose` | gRPC | Docker Compose hosts |
|
||||
| `agent-ssh` | SSH | Linux remote hosts |
|
||||
| `agent-winrm` | WinRM | Windows remote hosts |
|
||||
| `agent-ecs` | AWS API | AWS ECS services |
|
||||
| `agent-nomad` | Nomad API | HashiCorp Nomad jobs |
|
||||
|
||||
## Modules
|
||||
|
||||
### Module: `agent-core`
|
||||
|
||||
| Aspect | Specification |
|
||||
|--------|---------------|
|
||||
| **Responsibility** | Shared agent runtime; task execution framework |
|
||||
| **Protocol** | gRPC for communication with Stella Core |
|
||||
| **Security** | mTLS authentication; short-lived JWT for tasks |
|
||||
|
||||
**Agent Lifecycle**:
|
||||
1. Agent starts with registration token
|
||||
2. Agent registers with capabilities and labels
|
||||
3. Agent sends heartbeats (default: 30s interval)
|
||||
4. Agent receives tasks from Stella Core
|
||||
5. Agent reports task completion/failure
|
||||
|
||||
**Agent Task Protocol**:
|
||||
```typescript
|
||||
// Task assignment (Core → Agent)
|
||||
interface AgentTask {
|
||||
id: UUID;
|
||||
type: TaskType;
|
||||
targetId: UUID;
|
||||
payload: TaskPayload;
|
||||
credentials: EncryptedCredentials;
|
||||
timeout: number;
|
||||
priority: TaskPriority;
|
||||
idempotencyKey: string;
|
||||
assignedAt: DateTime;
|
||||
expiresAt: DateTime;
|
||||
}
|
||||
|
||||
type TaskType =
|
||||
| "deploy"
|
||||
| "rollback"
|
||||
| "health-check"
|
||||
| "inspect"
|
||||
| "execute-command"
|
||||
| "upload-files"
|
||||
| "write-sticker"
|
||||
| "read-sticker";
|
||||
|
||||
interface DeployTaskPayload {
|
||||
image: string;
|
||||
digest: string;
|
||||
config: DeployConfig;
|
||||
artifacts: ArtifactReference[];
|
||||
previousDigest?: string;
|
||||
hooks: {
|
||||
preDeploy?: HookConfig;
|
||||
postDeploy?: HookConfig;
|
||||
};
|
||||
}
|
||||
|
||||
// Task result (Agent → Core)
|
||||
interface TaskResult {
|
||||
taskId: UUID;
|
||||
success: boolean;
|
||||
startedAt: DateTime;
|
||||
completedAt: DateTime;
|
||||
|
||||
// Success details
|
||||
outputs?: Record<string, any>;
|
||||
artifacts?: ArtifactReference[];
|
||||
|
||||
// Failure details
|
||||
error?: string;
|
||||
errorType?: string;
|
||||
retriable?: boolean;
|
||||
|
||||
// Logs
|
||||
logs: string;
|
||||
|
||||
// Metrics
|
||||
metrics: {
|
||||
pullDurationMs?: number;
|
||||
deployDurationMs?: number;
|
||||
healthCheckDurationMs?: number;
|
||||
};
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Module: `agent-docker`
|
||||
|
||||
| Aspect | Specification |
|
||||
|--------|---------------|
|
||||
| **Responsibility** | Docker container deployment |
|
||||
| **Dependencies** | Docker Engine API |
|
||||
| **Capabilities** | `docker.deploy`, `docker.rollback`, `docker.inspect` |
|
||||
|
||||
**Docker Agent Implementation**:
|
||||
```typescript
|
||||
class DockerAgent implements TargetExecutor {
|
||||
private docker: Docker;
|
||||
|
||||
async deploy(task: DeployTaskPayload): Promise<DeployResult> {
|
||||
const { image, digest, config, previousDigest } = task;
|
||||
const containerName = config.containerName;
|
||||
|
||||
// 1. Pull image and verify digest
|
||||
this.log(`Pulling image ${image}@${digest}`);
|
||||
await this.docker.pull(image, { digest });
|
||||
|
||||
const pulledDigest = await this.getImageDigest(image);
|
||||
if (pulledDigest !== digest) {
|
||||
throw new DigestMismatchError(
|
||||
`Expected digest ${digest}, got ${pulledDigest}. Possible tampering detected.`
|
||||
);
|
||||
}
|
||||
|
||||
// 2. Run pre-deploy hook
|
||||
if (task.hooks?.preDeploy) {
|
||||
await this.runHook(task.hooks.preDeploy, "pre-deploy");
|
||||
}
|
||||
|
||||
// 3. Stop and rename existing container
|
||||
const existingContainer = await this.findContainer(containerName);
|
||||
if (existingContainer) {
|
||||
this.log(`Stopping existing container ${containerName}`);
|
||||
await existingContainer.stop({ t: 10 });
|
||||
await existingContainer.rename(`${containerName}-previous-${Date.now()}`);
|
||||
}
|
||||
|
||||
// 4. Create new container
|
||||
this.log(`Creating container ${containerName} from ${image}@${digest}`);
|
||||
const container = await this.docker.createContainer({
|
||||
name: containerName,
|
||||
Image: `${image}@${digest}`, // Always use digest, not tag
|
||||
Env: this.buildEnvVars(config.environment),
|
||||
HostConfig: {
|
||||
PortBindings: this.buildPortBindings(config.ports),
|
||||
Binds: this.buildBindMounts(config.volumes),
|
||||
RestartPolicy: { Name: config.restartPolicy || "unless-stopped" },
|
||||
Memory: config.memoryLimit,
|
||||
CpuQuota: config.cpuLimit,
|
||||
},
|
||||
Labels: {
|
||||
"stella.release.id": config.releaseId,
|
||||
"stella.release.name": config.releaseName,
|
||||
"stella.digest": digest,
|
||||
"stella.deployed.at": new Date().toISOString(),
|
||||
},
|
||||
});
|
||||
|
||||
// 5. Start container
|
||||
this.log(`Starting container ${containerName}`);
|
||||
await container.start();
|
||||
|
||||
// 6. Wait for container to be healthy
|
||||
if (config.healthCheck) {
|
||||
this.log(`Waiting for container health check`);
|
||||
const healthy = await this.waitForHealthy(container, config.healthCheck.timeout);
|
||||
if (!healthy) {
|
||||
await this.rollbackContainer(containerName, existingContainer);
|
||||
throw new HealthCheckFailedError(`Container ${containerName} failed health check`);
|
||||
}
|
||||
}
|
||||
|
||||
// 7. Run post-deploy hook
|
||||
if (task.hooks?.postDeploy) {
|
||||
await this.runHook(task.hooks.postDeploy, "post-deploy");
|
||||
}
|
||||
|
||||
// 8. Cleanup previous container
|
||||
if (existingContainer && config.cleanupPrevious !== false) {
|
||||
this.log(`Removing previous container`);
|
||||
await existingContainer.remove({ force: true });
|
||||
}
|
||||
|
||||
return {
|
||||
success: true,
|
||||
containerId: container.id,
|
||||
previousDigest: previousDigest,
|
||||
};
|
||||
}
|
||||
|
||||
async rollback(task: RollbackTaskPayload): Promise<DeployResult> {
|
||||
const { containerName, targetDigest } = task;
|
||||
|
||||
if (targetDigest) {
|
||||
// Deploy specific digest
|
||||
return this.deploy({ ...task, digest: targetDigest });
|
||||
}
|
||||
|
||||
// Find and restore previous container
|
||||
const previousContainer = await this.findContainer(`${containerName}-previous-*`);
|
||||
if (!previousContainer) {
|
||||
throw new RollbackError(`No previous container found for ${containerName}`);
|
||||
}
|
||||
|
||||
const currentContainer = await this.findContainer(containerName);
|
||||
if (currentContainer) {
|
||||
await currentContainer.stop({ t: 10 });
|
||||
await currentContainer.rename(`${containerName}-failed-${Date.now()}`);
|
||||
}
|
||||
|
||||
await previousContainer.rename(containerName);
|
||||
await previousContainer.start();
|
||||
|
||||
return { success: true, containerId: previousContainer.id };
|
||||
}
|
||||
|
||||
async writeSticker(sticker: VersionSticker): Promise<void> {
|
||||
const stickerPath = this.config.stickerPath || "/var/stella/version.json";
|
||||
const stickerContent = JSON.stringify(sticker, null, 2);
|
||||
|
||||
if (this.config.stickerLocation === "volume") {
|
||||
await this.docker.run("alpine", [
|
||||
"sh", "-c",
|
||||
`echo '${stickerContent}' > ${stickerPath}`
|
||||
], {
|
||||
HostConfig: { Binds: [`${this.config.stickerVolume}:/var/stella`] }
|
||||
});
|
||||
} else {
|
||||
fs.writeFileSync(stickerPath, stickerContent);
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Module: `agent-compose`
|
||||
|
||||
| Aspect | Specification |
|
||||
|--------|---------------|
|
||||
| **Responsibility** | Docker Compose stack deployment |
|
||||
| **Dependencies** | Docker Compose CLI |
|
||||
| **Capabilities** | `compose.deploy`, `compose.rollback`, `compose.inspect` |
|
||||
|
||||
**Compose Agent Implementation**:
|
||||
```typescript
|
||||
class ComposeAgent implements TargetExecutor {
|
||||
async deploy(task: DeployTaskPayload): Promise<DeployResult> {
|
||||
const { artifacts, config } = task;
|
||||
const deployDir = config.deploymentDirectory;
|
||||
|
||||
// 1. Write compose lock file
|
||||
const composeLock = artifacts.find(a => a.type === "compose_lock");
|
||||
const composeContent = await this.fetchArtifact(composeLock);
|
||||
const composePath = path.join(deployDir, "compose.stella.lock.yml");
|
||||
await fs.writeFile(composePath, composeContent);
|
||||
|
||||
// 2. Run pre-deploy hook
|
||||
if (task.hooks?.preDeploy) {
|
||||
await this.runHook(task.hooks.preDeploy, deployDir);
|
||||
}
|
||||
|
||||
// 3. Pull images
|
||||
this.log("Pulling images...");
|
||||
await this.runCompose(deployDir, ["pull"]);
|
||||
|
||||
// 4. Verify digests
|
||||
await this.verifyDigests(composePath, config.expectedDigests);
|
||||
|
||||
// 5. Deploy
|
||||
this.log("Deploying services...");
|
||||
await this.runCompose(deployDir, ["up", "-d", "--remove-orphans", "--force-recreate"]);
|
||||
|
||||
// 6. Wait for services to be healthy
|
||||
if (config.healthCheck) {
|
||||
const healthy = await this.waitForServicesHealthy(deployDir, config.healthCheck.timeout);
|
||||
if (!healthy) {
|
||||
await this.rollbackToBackup(deployDir);
|
||||
throw new HealthCheckFailedError("Services failed health check");
|
||||
}
|
||||
}
|
||||
|
||||
// 7. Run post-deploy hook
|
||||
if (task.hooks?.postDeploy) {
|
||||
await this.runHook(task.hooks.postDeploy, deployDir);
|
||||
}
|
||||
|
||||
// 8. Write version sticker
|
||||
await this.writeSticker(config.sticker, deployDir);
|
||||
|
||||
return { success: true };
|
||||
}
|
||||
|
||||
private async verifyDigests(
|
||||
composePath: string,
|
||||
expectedDigests: Record<string, string>
|
||||
): Promise<void> {
|
||||
const composeContent = yaml.parse(await fs.readFile(composePath, "utf-8"));
|
||||
|
||||
for (const [service, expectedDigest] of Object.entries(expectedDigests)) {
|
||||
const serviceConfig = composeContent.services[service];
|
||||
if (!serviceConfig) {
|
||||
throw new Error(`Service ${service} not found in compose file`);
|
||||
}
|
||||
|
||||
const image = serviceConfig.image;
|
||||
if (!image.includes("@sha256:")) {
|
||||
throw new Error(`Service ${service} image not pinned to digest: ${image}`);
|
||||
}
|
||||
|
||||
const actualDigest = image.split("@")[1];
|
||||
if (actualDigest !== expectedDigest) {
|
||||
throw new DigestMismatchError(
|
||||
`Service ${service}: expected ${expectedDigest}, got ${actualDigest}`
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Module: `agent-ssh`
|
||||
|
||||
| Aspect | Specification |
|
||||
|--------|---------------|
|
||||
| **Responsibility** | SSH remote execution (agentless) |
|
||||
| **Dependencies** | SSH client library |
|
||||
| **Capabilities** | `ssh.deploy`, `ssh.execute`, `ssh.upload` |
|
||||
|
||||
**SSH Remote Executor**:
|
||||
```typescript
|
||||
class SSHRemoteExecutor implements TargetExecutor {
|
||||
async connect(config: SSHConnectionConfig): Promise<void> {
|
||||
const privateKey = await this.secrets.getSecret(config.privateKeyRef);
|
||||
|
||||
this.ssh = new SSHClient();
|
||||
await this.ssh.connect({
|
||||
host: config.host,
|
||||
port: config.port || 22,
|
||||
username: config.username,
|
||||
privateKey: privateKey.value,
|
||||
readyTimeout: config.connectionTimeout || 30000,
|
||||
});
|
||||
}
|
||||
|
||||
async deploy(task: DeployTaskPayload): Promise<DeployResult> {
|
||||
const { artifacts, config } = task;
|
||||
const deployDir = config.deploymentDirectory;
|
||||
|
||||
try {
|
||||
// 1. Ensure deployment directory exists
|
||||
await this.exec(`mkdir -p ${deployDir}`);
|
||||
await this.exec(`mkdir -p ${deployDir}/.stella-backup`);
|
||||
|
||||
// 2. Backup current deployment
|
||||
await this.exec(`cp -r ${deployDir}/* ${deployDir}/.stella-backup/ 2>/dev/null || true`);
|
||||
|
||||
// 3. Upload artifacts
|
||||
for (const artifact of artifacts) {
|
||||
const content = await this.fetchArtifact(artifact);
|
||||
const remotePath = path.join(deployDir, artifact.name);
|
||||
await this.uploadFile(content, remotePath);
|
||||
}
|
||||
|
||||
// 4. Run pre-deploy hook
|
||||
if (task.hooks?.preDeploy) {
|
||||
await this.runRemoteHook(task.hooks.preDeploy, deployDir);
|
||||
}
|
||||
|
||||
// 5. Execute deployment script
|
||||
const deployScript = artifacts.find(a => a.type === "deploy_script");
|
||||
if (deployScript) {
|
||||
const scriptPath = path.join(deployDir, deployScript.name);
|
||||
await this.exec(`chmod +x ${scriptPath}`);
|
||||
const result = await this.exec(scriptPath, { cwd: deployDir, timeout: config.deploymentTimeout });
|
||||
if (result.exitCode !== 0) {
|
||||
throw new DeploymentError(`Deploy script failed: ${result.stderr}`);
|
||||
}
|
||||
}
|
||||
|
||||
// 6. Run post-deploy hook
|
||||
if (task.hooks?.postDeploy) {
|
||||
await this.runRemoteHook(task.hooks.postDeploy, deployDir);
|
||||
}
|
||||
|
||||
// 7. Health check
|
||||
if (config.healthCheck) {
|
||||
const healthy = await this.runHealthCheck(config.healthCheck);
|
||||
if (!healthy) {
|
||||
await this.rollback(task);
|
||||
throw new HealthCheckFailedError("Health check failed");
|
||||
}
|
||||
}
|
||||
|
||||
// 8. Write version sticker
|
||||
await this.writeSticker(config.sticker, deployDir);
|
||||
|
||||
// 9. Cleanup backup
|
||||
await this.exec(`rm -rf ${deployDir}/.stella-backup`);
|
||||
|
||||
return { success: true };
|
||||
} finally {
|
||||
this.ssh.end();
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Module: `agent-winrm`
|
||||
|
||||
| Aspect | Specification |
|
||||
|--------|---------------|
|
||||
| **Responsibility** | WinRM remote execution (agentless) |
|
||||
| **Dependencies** | WinRM client library |
|
||||
| **Capabilities** | `winrm.deploy`, `winrm.execute`, `winrm.upload` |
|
||||
| **Authentication** | NTLM, Kerberos, Basic |
|
||||
|
||||
---
|
||||
|
||||
### Module: `agent-ecs`
|
||||
|
||||
| Aspect | Specification |
|
||||
|--------|---------------|
|
||||
| **Responsibility** | AWS ECS service deployment |
|
||||
| **Dependencies** | AWS SDK |
|
||||
| **Capabilities** | `ecs.deploy`, `ecs.rollback`, `ecs.inspect` |
|
||||
|
||||
---
|
||||
|
||||
### Module: `agent-nomad`
|
||||
|
||||
| Aspect | Specification |
|
||||
|--------|---------------|
|
||||
| **Responsibility** | HashiCorp Nomad job deployment |
|
||||
| **Dependencies** | Nomad API client |
|
||||
| **Capabilities** | `nomad.deploy`, `nomad.rollback`, `nomad.inspect` |
|
||||
|
||||
---
|
||||
|
||||
## Agent Security Model
|
||||
|
||||
### Registration Flow
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────────────────┐
|
||||
│ AGENT REGISTRATION FLOW │
|
||||
│ │
|
||||
│ 1. Admin generates registration token (one-time use) │
|
||||
│ POST /api/v1/admin/agent-tokens │
|
||||
│ → { token: "reg_xxx", expiresAt: "..." } │
|
||||
│ │
|
||||
│ 2. Agent starts with registration token │
|
||||
│ ./stella-agent --register --token=reg_xxx │
|
||||
│ │
|
||||
│ 3. Agent requests mTLS certificate │
|
||||
│ POST /api/v1/agents/register │
|
||||
│ Headers: X-Registration-Token: reg_xxx │
|
||||
│ Body: { name, version, capabilities, csr } │
|
||||
│ → { agentId, certificate, caCertificate } │
|
||||
│ │
|
||||
│ 4. Agent establishes mTLS connection │
|
||||
│ Uses issued certificate for all subsequent requests │
|
||||
│ │
|
||||
│ 5. Agent requests short-lived JWT for task execution │
|
||||
│ POST /api/v1/agents/token (over mTLS) │
|
||||
│ → { token, expiresIn: 3600 } // 1 hour │
|
||||
│ │
|
||||
│ 6. Agent refreshes token before expiration │
|
||||
│ Token refresh only over mTLS connection │
|
||||
│ │
|
||||
└─────────────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
### Communication Security
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────────────────┐
|
||||
│ AGENT COMMUNICATION SECURITY │
|
||||
│ │
|
||||
│ ┌──────────────┐ ┌──────────────┐ │
|
||||
│ │ AGENT │ │ STELLA CORE │ │
|
||||
│ └──────┬───────┘ └──────┬───────┘ │
|
||||
│ │ │ │
|
||||
│ │ mTLS (mutual TLS) │ │
|
||||
│ │ - Agent cert signed by Stella CA │ │
|
||||
│ │ - Server cert verified by Agent │ │
|
||||
│ │ - TLS 1.3 only │ │
|
||||
│ │ - Perfect forward secrecy │ │
|
||||
│ │◄───────────────────────────────────────►│ │
|
||||
│ │ │ │
|
||||
│ │ Encrypted payload │ │
|
||||
│ │ - Task payloads encrypted with │ │
|
||||
│ │ agent-specific key │ │
|
||||
│ │ - Logs encrypted in transit │ │
|
||||
│ │◄───────────────────────────────────────►│ │
|
||||
│ │ │ │
|
||||
│ │ Heartbeat + capability refresh │ │
|
||||
│ │ - Every 30 seconds │ │
|
||||
│ │ - Signed with agent key │ │
|
||||
│ │─────────────────────────────────────────►│ │
|
||||
│ │ │ │
|
||||
│ │ Task assignment │ │
|
||||
│ │ - Contains short-lived credentials │ │
|
||||
│ │ - Scoped to specific target │ │
|
||||
│ │ - Expires after task timeout │ │
|
||||
│ │◄─────────────────────────────────────────│ │
|
||||
│ │ │ │
|
||||
└─────────────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Database Schema
|
||||
|
||||
```sql
|
||||
-- Agents
-- One row per registered deployment agent, scoped to a tenant.
-- Status is driven by heartbeats; capabilities/labels drive task scheduling.
CREATE TABLE release.agents (
  id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
  tenant_id UUID NOT NULL REFERENCES tenants(id) ON DELETE CASCADE,
  name VARCHAR(255) NOT NULL,
  version VARCHAR(50) NOT NULL,                   -- agent binary version
  capabilities JSONB NOT NULL DEFAULT '[]',       -- e.g. ["docker.deploy", ...]
  labels JSONB NOT NULL DEFAULT '{}',             -- free-form scheduling labels
  status VARCHAR(50) NOT NULL DEFAULT 'offline' CHECK (status IN (
    'online', 'offline', 'degraded'
  )),
  last_heartbeat TIMESTAMPTZ,                     -- NULL until first heartbeat
  resource_usage JSONB,                           -- last reported usage snapshot
  certificate_fingerprint VARCHAR(64),            -- mTLS client cert fingerprint
  created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
  updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
  UNIQUE (tenant_id, name)                        -- agent names unique per tenant
);

CREATE INDEX idx_agents_tenant ON release.agents(tenant_id);
CREATE INDEX idx_agents_status ON release.agents(status);
-- GIN index supports capability containment queries (capabilities @> '[...]')
CREATE INDEX idx_agents_capabilities ON release.agents USING GIN (capabilities);
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## API Endpoints
|
||||
|
||||
```yaml
|
||||
# Agent Registration
|
||||
POST /api/v1/agents/register
|
||||
Headers: X-Registration-Token: {token}
|
||||
Body: { name, version, capabilities, csr }
|
||||
Response: { agentId, certificate, caCertificate }
|
||||
|
||||
# Agent Management
|
||||
GET /api/v1/agents
|
||||
Query: ?status={online|offline|degraded}&capability={type}
|
||||
Response: Agent[]
|
||||
|
||||
GET /api/v1/agents/{id}
|
||||
Response: Agent
|
||||
|
||||
PUT /api/v1/agents/{id}
|
||||
Body: { labels?, capabilities? }
|
||||
Response: Agent
|
||||
|
||||
DELETE /api/v1/agents/{id}
|
||||
Response: { deleted: true }
|
||||
|
||||
# Agent Communication
|
||||
POST /api/v1/agents/{id}/heartbeat
|
||||
Body: { status, resourceUsage, capabilities }
|
||||
Response: { tasks: AgentTask[] }
|
||||
|
||||
POST /api/v1/agents/{id}/tasks/{taskId}/complete
|
||||
Body: { success, result, logs }
|
||||
Response: { acknowledged: true }
|
||||
|
||||
# WebSocket for real-time task stream
|
||||
WS /api/v1/agents/{id}/task-stream
|
||||
Messages:
|
||||
- { type: "task_assigned", task: AgentTask }
|
||||
- { type: "task_cancelled", taskId }
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## References
|
||||
|
||||
- [Module Overview](overview.md)
|
||||
- [Deploy Orchestrator](deploy-orchestrator.md)
|
||||
- [Agent Security](../security/agent-security.md)
|
||||
- [API Documentation](../api/agents.md)
|
||||
477
docs/modules/release-orchestrator/modules/deploy-orchestrator.md
Normal file
477
docs/modules/release-orchestrator/modules/deploy-orchestrator.md
Normal file
@@ -0,0 +1,477 @@
|
||||
# DEPLOY: Deployment Execution
|
||||
|
||||
**Purpose**: Orchestrate deployment jobs, execute on targets, manage rollbacks, and generate artifacts.
|
||||
|
||||
## Modules
|
||||
|
||||
### Module: `deploy-orchestrator`
|
||||
|
||||
| Aspect | Specification |
|
||||
|--------|---------------|
|
||||
| **Responsibility** | Deployment job coordination; strategy execution |
|
||||
| **Dependencies** | `target-executor`, `artifact-generator`, `agent-manager` |
|
||||
| **Data Entities** | `DeploymentJob`, `DeploymentTask` |
|
||||
| **Events Produced** | `deployment.started`, `deployment.task_started`, `deployment.task_completed`, `deployment.completed`, `deployment.failed` |
|
||||
|
||||
**Deployment Job Entity**:
|
||||
```typescript
|
||||
// A deployment job applies one release to one environment; it owns one
// DeploymentTask per target.
interface DeploymentJob {
  id: UUID;
  tenantId: UUID;
  promotionId: UUID;              // Promotion that produced this job
  releaseId: UUID;
  environmentId: UUID;
  status: DeploymentStatus;
  strategy: DeploymentStrategy;   // all-at-once | rolling | canary | blue-green
  startedAt: DateTime;
  completedAt: DateTime;
  artifacts: GeneratedArtifact[]; // Immutable artifacts generated for this job
  rollbackOf: UUID | null; // If this is a rollback job
  tasks: DeploymentTask[];
}

// Lifecycle of a deployment job.
type DeploymentStatus =
  | "pending" // Waiting to start
  | "running" // Deployment in progress
  | "succeeded" // All tasks succeeded
  | "failed" // One or more tasks failed
  | "cancelled" // User cancelled
  | "rolling_back" // Rollback in progress
  | "rolled_back"; // Rollback complete

// One unit of work: deploy a single digest to a single target.
interface DeploymentTask {
  id: UUID;
  jobId: UUID;                    // Owning DeploymentJob
  targetId: UUID;
  digest: string;                 // Image digest deployed to this target
  status: TaskStatus;
  agentId: UUID | null;           // Assigned agent, if any
  startedAt: DateTime;
  completedAt: DateTime;
  exitCode: number | null;
  logs: string;
  previousDigest: string | null;  // What ran before — rollback anchor
  stickerWritten: boolean;        // Whether the version sticker was written
}

// Lifecycle of a single task within a job.
type TaskStatus =
  | "pending"
  | "running"
  | "succeeded"
  | "failed"
  | "cancelled"
  | "skipped";
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Module: `target-executor`
|
||||
|
||||
| Aspect | Specification |
|
||||
|--------|---------------|
|
||||
| **Responsibility** | Target-specific deployment logic |
|
||||
| **Dependencies** | `agent-manager`, `connector-runtime` |
|
||||
| **Protocol** | gRPC for agents, SSH/WinRM for agentless |
|
||||
|
||||
**Executor Types**:
|
||||
|
||||
| Type | Transport | Use Case |
|
||||
|------|-----------|----------|
|
||||
| `agent-docker` | gRPC | Docker hosts with agent |
|
||||
| `agent-compose` | gRPC | Compose hosts with agent |
|
||||
| `ssh-remote` | SSH | Agentless Linux hosts |
|
||||
| `winrm-remote` | WinRM | Agentless Windows hosts |
|
||||
| `ecs-api` | AWS API | AWS ECS services |
|
||||
| `nomad-api` | Nomad API | HashiCorp Nomad jobs |
|
||||
|
||||
---
|
||||
|
||||
### Module: `runner-executor`
|
||||
|
||||
| Aspect | Specification |
|
||||
|--------|---------------|
|
||||
| **Responsibility** | Script/hook execution in sandbox |
|
||||
| **Dependencies** | `plugin-sandbox` |
|
||||
| **Supported Scripts** | C# (.csx), Bash, PowerShell |
|
||||
|
||||
**Hook Types**:
|
||||
- `pre-deploy`: Run before deployment starts
|
||||
- `post-deploy`: Run after deployment succeeds
|
||||
- `on-failure`: Run when deployment fails
|
||||
- `on-rollback`: Run during rollback
|
||||
|
||||
---
|
||||
|
||||
### Module: `artifact-generator`
|
||||
|
||||
| Aspect | Specification |
|
||||
|--------|---------------|
|
||||
| **Responsibility** | Generate immutable deployment artifacts |
|
||||
| **Dependencies** | `release-manager`, `environment-manager` |
|
||||
| **Data Entities** | `GeneratedArtifact`, `ComposeLock`, `VersionSticker` |
|
||||
|
||||
**Generated Artifacts**:
|
||||
|
||||
| Artifact Type | Description |
|
||||
|---------------|-------------|
|
||||
| `compose_lock` | `compose.stella.lock.yml` - Pinned digests |
|
||||
| `script` | Compiled deployment script |
|
||||
| `sticker` | `stella.version.json` - Version marker |
|
||||
| `evidence` | Decision and execution evidence |
|
||||
| `config` | Environment-specific config files |
|
||||
|
||||
**Compose Lock File Generation**:
|
||||
```typescript
|
||||
class ComposeLockGenerator {
|
||||
async generate(
|
||||
release: Release,
|
||||
environment: Environment,
|
||||
targets: Target[]
|
||||
): Promise<GeneratedArtifact> {
|
||||
|
||||
const services: Record<string, any> = {};
|
||||
|
||||
for (const component of release.components) {
|
||||
services[component.componentName] = {
|
||||
// CRITICAL: Always use digest, never tag
|
||||
image: `${component.imageRepository}@${component.digest}`,
|
||||
|
||||
// Environment variables
|
||||
environment: this.mergeEnvironment(
|
||||
environment.config.variables,
|
||||
this.buildStellaEnv(release, environment)
|
||||
),
|
||||
|
||||
// Labels for Stella tracking
|
||||
labels: {
|
||||
"stella.release.id": release.id,
|
||||
"stella.release.name": release.name,
|
||||
"stella.component.name": component.componentName,
|
||||
"stella.component.digest": component.digest,
|
||||
"stella.environment": environment.name,
|
||||
"stella.deployed.at": new Date().toISOString(),
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
const composeLock = {
|
||||
version: "3.8",
|
||||
services,
|
||||
"x-stella": {
|
||||
release_id: release.id,
|
||||
release_name: release.name,
|
||||
environment: environment.name,
|
||||
generated_at: new Date().toISOString(),
|
||||
inputs_hash: this.computeInputsHash(release, environment),
|
||||
components: release.components.map(c => ({
|
||||
name: c.componentName,
|
||||
digest: c.digest,
|
||||
semver: c.semver,
|
||||
})),
|
||||
},
|
||||
};
|
||||
|
||||
const content = yaml.stringify(composeLock);
|
||||
const hash = crypto.createHash("sha256").update(content).digest("hex");
|
||||
|
||||
return {
|
||||
type: "compose_lock",
|
||||
name: "compose.stella.lock.yml",
|
||||
content: Buffer.from(content),
|
||||
contentHash: `sha256:${hash}`,
|
||||
};
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**Version Sticker Generation**:
|
||||
```typescript
|
||||
// Shape of stella.version.json, written to each target after a successful
// deployment so the target can self-describe exactly what is running on it.
interface VersionSticker {
  stella_version: "1.0";          // Sticker schema version
  release_id: UUID;
  release_name: string;
  components: Array<{
    name: string;
    digest: string;               // Content digest actually deployed
    semver: string;
    tag: string;                  // Informational only — digest is authoritative
    image_repository: string;
  }>;
  environment: string;
  environment_id: UUID;
  deployed_at: string;            // Deployment timestamp (string-encoded)
  deployed_by: UUID;
  promotion_id: UUID;             // Promotion that produced this deployment
  workflow_run_id: UUID;
  evidence_packet_id: UUID;
  evidence_packet_hash: string;   // Integrity anchor for the evidence packet
  orchestrator_version: string;
  source_ref?: {                  // Present when source provenance is known
    commit_sha: string;
    branch: string;
    repository: string;
  };
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Module: `rollback-manager`
|
||||
|
||||
| Aspect | Specification |
|
||||
|--------|---------------|
|
||||
| **Responsibility** | Rollback orchestration; previous state recovery |
|
||||
| **Dependencies** | `deploy-orchestrator`, `target-registry` |
|
||||
|
||||
**Rollback Strategies**:
|
||||
|
||||
| Strategy | Description |
|
||||
|----------|-------------|
|
||||
| `to-previous` | Roll back to last successful deployment |
|
||||
| `to-release` | Roll back to specific release ID |
|
||||
| `to-sticker` | Roll back to version in sticker on target |
|
||||
|
||||
**Rollback Flow**:
|
||||
1. Identify rollback target (previous release or specified)
|
||||
2. Create rollback deployment job
|
||||
3. Execute deployment with rollback artifacts
|
||||
4. Update target state and sticker
|
||||
5. Record rollback evidence
|
||||
|
||||
---
|
||||
|
||||
## Deployment Strategies
|
||||
|
||||
### All-at-Once
|
||||
Deploy to all targets simultaneously.
|
||||
|
||||
```typescript
|
||||
// Strategy config: deploy to every target simultaneously,
// bounded only by `parallelism`.
interface AllAtOnceConfig {
  parallelism: number; // Max concurrent deployments (0 = unlimited)
  continueOnFailure: boolean; // Continue if some targets fail
  failureThreshold: number; // Max failures before abort
}
|
||||
```
|
||||
|
||||
### Rolling
|
||||
Deploy to targets sequentially with health checks.
|
||||
|
||||
```typescript
|
||||
// Strategy config: deploy in sequential batches with optional
// health gates between batches.
interface RollingConfig {
  batchSize: number; // Targets per batch
  batchDelay: number; // Seconds between batches
  healthCheckBetweenBatches: boolean;
  rollbackOnFailure: boolean; // Roll back deployed targets on batch failure
  maxUnavailable: number; // Max targets unavailable at once
}
|
||||
```
|
||||
|
||||
### Canary
|
||||
Deploy to subset, verify, then proceed.
|
||||
|
||||
```typescript
|
||||
// Strategy config: deploy to a canary subset first, verify health,
// then promote to the remaining targets.
interface CanaryConfig {
  canaryTargets: number; // Number or percentage for canary
  canaryDuration: number; // Seconds to run canary
  healthThreshold: number; // Required health percentage
  autoPromote: boolean; // Auto-proceed if healthy
  requireApproval: boolean; // Require manual approval
}
|
||||
```
|
||||
|
||||
### Blue-Green
|
||||
Deploy to B, switch traffic, retire A.
|
||||
|
||||
```typescript
|
||||
// Strategy config: stand up the green group alongside blue,
// shift traffic, then retire blue.
interface BlueGreenConfig {
  targetGroupA: UUID; // Current (blue) target group
  targetGroupB: UUID; // New (green) target group
  trafficShiftType: "instant" | "gradual";
  gradualShiftSteps?: number[]; // e.g., [10, 25, 50, 100]
  rollbackOnHealthFailure: boolean;
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Rolling Deployment Algorithm
|
||||
|
||||
```python
|
||||
class RollingDeploymentExecutor:
    """Executes a deployment batch-by-batch with optional health gates.

    Targets are partitioned into batches; each batch is deployed in
    parallel. On a target failure (with rollback enabled) or a failed
    inter-batch health check, every target deployed so far is rolled back
    to the previous release.
    """

    def execute(self, job: DeploymentJob, config: RollingConfig) -> DeploymentResult:
        """Run the rolling deployment for ``job`` per ``config``.

        Returns a DeploymentResult summarizing deployed/failed/rolled-back
        targets; success requires zero failed targets.
        """
        targets = self.get_targets(job.environment_id)
        batches = self.create_batches(targets, config.batch_size)

        deployed_targets = []
        failed_targets = []

        for batch_index, batch in enumerate(batches):
            self.log(f"Starting batch {batch_index + 1} of {len(batches)}")

            # Deploy batch in parallel
            batch_results = self.deploy_batch(job, batch)

            # FIX: track this batch's successes explicitly. The previous code
            # health-checked deployed_targets[-len(batch):], which bleeds into
            # earlier batches whenever some targets in this batch failed (and
            # selects the ENTIRE list when len(batch) == 0, since [-0:] == [0:]).
            batch_deployed = []

            for target, result in batch_results:
                if result.success:
                    deployed_targets.append(target)
                    batch_deployed.append(target)
                    # Write version sticker so the target self-describes its version
                    self.write_sticker(target, job.release)
                else:
                    failed_targets.append(target)

                    if config.rollback_on_failure:
                        # Rollback all deployed targets
                        self.rollback_targets(deployed_targets, job.previous_release)
                        return DeploymentResult(
                            success=False,
                            error=f"Batch {batch_index + 1} failed, rolled back",
                            deployed=deployed_targets,
                            failed=failed_targets,
                            rolled_back=deployed_targets
                        )

            # Health check between batches (skipped after the final batch)
            if config.health_check_between_batches and batch_index < len(batches) - 1:
                health_result = self.check_batch_health(batch_deployed)

                if not health_result.healthy:
                    if config.rollback_on_failure:
                        self.rollback_targets(deployed_targets, job.previous_release)
                    return DeploymentResult(
                        success=False,
                        error=f"Health check failed after batch {batch_index + 1}",
                        deployed=deployed_targets,
                        failed=failed_targets,
                        rolled_back=deployed_targets
                    )

            # Delay between batches (skipped after the final batch)
            if config.batch_delay > 0 and batch_index < len(batches) - 1:
                time.sleep(config.batch_delay)

        return DeploymentResult(
            success=len(failed_targets) == 0,
            deployed=deployed_targets,
            failed=failed_targets
        )
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Database Schema
|
||||
|
||||
```sql
|
||||
-- Deployment Jobs: one row per deployment of a release into an environment.
CREATE TABLE release.deployment_jobs (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    tenant_id UUID NOT NULL REFERENCES tenants(id) ON DELETE CASCADE,
    promotion_id UUID NOT NULL REFERENCES release.promotions(id),
    release_id UUID NOT NULL REFERENCES release.releases(id),
    environment_id UUID NOT NULL REFERENCES release.environments(id),
    status VARCHAR(50) NOT NULL DEFAULT 'pending' CHECK (status IN (
        'pending', 'running', 'succeeded', 'failed', 'cancelled', 'rolling_back', 'rolled_back'
    )),
    strategy VARCHAR(50) NOT NULL DEFAULT 'all-at-once',
    started_at TIMESTAMPTZ,
    completed_at TIMESTAMPTZ,
    artifacts JSONB NOT NULL DEFAULT '[]',
    -- Set when this job is a rollback of an earlier job.
    rollback_of UUID REFERENCES release.deployment_jobs(id),
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

CREATE INDEX idx_deployment_jobs_promotion ON release.deployment_jobs(promotion_id);
CREATE INDEX idx_deployment_jobs_status ON release.deployment_jobs(status);

-- Deployment Tasks: per-target execution records for a job.
CREATE TABLE release.deployment_tasks (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    job_id UUID NOT NULL REFERENCES release.deployment_jobs(id) ON DELETE CASCADE,
    target_id UUID NOT NULL REFERENCES release.targets(id),
    digest VARCHAR(100) NOT NULL,
    status VARCHAR(50) NOT NULL DEFAULT 'pending' CHECK (status IN (
        'pending', 'running', 'succeeded', 'failed', 'cancelled', 'skipped'
    )),
    agent_id UUID REFERENCES release.agents(id),
    started_at TIMESTAMPTZ,
    completed_at TIMESTAMPTZ,
    exit_code INTEGER,
    logs TEXT,
    -- Digest running on the target before this task (used for rollback).
    previous_digest VARCHAR(100),
    -- Whether the version sticker was written to the target.
    sticker_written BOOLEAN NOT NULL DEFAULT FALSE,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

CREATE INDEX idx_deployment_tasks_job ON release.deployment_tasks(job_id);
CREATE INDEX idx_deployment_tasks_target ON release.deployment_tasks(target_id);
CREATE INDEX idx_deployment_tasks_status ON release.deployment_tasks(status);

-- Generated Artifacts: files produced for/by a deployment job.
CREATE TABLE release.generated_artifacts (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    tenant_id UUID NOT NULL REFERENCES tenants(id) ON DELETE CASCADE,
    deployment_job_id UUID REFERENCES release.deployment_jobs(id) ON DELETE CASCADE,
    artifact_type VARCHAR(50) NOT NULL CHECK (artifact_type IN (
        'compose_lock', 'script', 'sticker', 'evidence', 'config'
    )),
    name VARCHAR(255) NOT NULL,
    content_hash VARCHAR(100) NOT NULL,
    content BYTEA, -- for small artifacts
    storage_ref VARCHAR(500), -- for large artifacts (S3, etc.)
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

CREATE INDEX idx_generated_artifacts_job ON release.generated_artifacts(deployment_job_id);
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## API Endpoints
|
||||
|
||||
```yaml
|
||||
# Deployment Jobs (mostly read-only; created by promotions)
|
||||
GET /api/v1/deployment-jobs
|
||||
Query: ?promotionId={uuid}&status={status}&environmentId={uuid}
|
||||
Response: DeploymentJob[]
|
||||
|
||||
GET /api/v1/deployment-jobs/{id}
|
||||
Response: DeploymentJob (with tasks)
|
||||
|
||||
GET /api/v1/deployment-jobs/{id}/tasks
|
||||
Response: DeploymentTask[]
|
||||
|
||||
GET /api/v1/deployment-jobs/{id}/tasks/{taskId}
|
||||
Response: DeploymentTask (with logs)
|
||||
|
||||
GET /api/v1/deployment-jobs/{id}/tasks/{taskId}/logs
|
||||
Query: ?follow=true
|
||||
Response: string | SSE stream
|
||||
|
||||
GET /api/v1/deployment-jobs/{id}/artifacts
|
||||
Response: GeneratedArtifact[]
|
||||
|
||||
GET /api/v1/deployment-jobs/{id}/artifacts/{artifactId}
|
||||
Response: binary (download)
|
||||
|
||||
# Rollback
|
||||
POST /api/v1/rollbacks
|
||||
Body: {
|
||||
environmentId: UUID,
|
||||
strategy: "to-previous" | "to-release" | "to-sticker",
|
||||
targetReleaseId?: UUID # for to-release strategy
|
||||
}
|
||||
Response: DeploymentJob (rollback job)
|
||||
|
||||
GET /api/v1/rollbacks
|
||||
Query: ?environmentId={uuid}
|
||||
Response: DeploymentJob[] (rollback jobs only)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## References
|
||||
|
||||
- [Module Overview](overview.md)
|
||||
- [Agents Specification](agents.md)
|
||||
- [Deployment Strategies](../deployment/strategies.md)
|
||||
- [Artifact Generation](../deployment/artifacts.md)
|
||||
- [API Documentation](../api/deployments.md)
|
||||
418
docs/modules/release-orchestrator/modules/environment-manager.md
Normal file
418
docs/modules/release-orchestrator/modules/environment-manager.md
Normal file
@@ -0,0 +1,418 @@
|
||||
# ENVMGR: Environment & Inventory Manager
|
||||
|
||||
**Purpose**: Model environments, targets, agents, and their relationships.
|
||||
|
||||
## Modules
|
||||
|
||||
### Module: `environment-manager`
|
||||
|
||||
| Aspect | Specification |
|
||||
|--------|---------------|
|
||||
| **Responsibility** | Environment CRUD, ordering, configuration, freeze windows |
|
||||
| **Dependencies** | `authority` |
|
||||
| **Data Entities** | `Environment`, `EnvironmentConfig`, `FreezeWindow` |
|
||||
| **Events Produced** | `environment.created`, `environment.updated`, `environment.freeze_started`, `environment.freeze_ended` |
|
||||
|
||||
**Key Operations**:
|
||||
```
|
||||
CreateEnvironment(name, displayName, orderIndex, config) → Environment
|
||||
UpdateEnvironment(id, config) → Environment
|
||||
DeleteEnvironment(id) → void
|
||||
SetFreezeWindow(environmentId, start, end, reason, exceptions) → FreezeWindow
|
||||
ClearFreezeWindow(environmentId, windowId) → void
|
||||
ListEnvironments(tenantId) → Environment[]
|
||||
GetEnvironmentState(id) → EnvironmentState
|
||||
```
|
||||
|
||||
**Environment Entity**:
|
||||
```typescript
|
||||
// An ordered deployment environment in a tenant's promotion chain.
interface Environment {
  id: UUID;
  tenantId: UUID;
  name: string;        // "dev", "stage", "prod"
  displayName: string; // "Development"
  orderIndex: number;  // 0, 1, 2 for promotion order
  config: EnvironmentConfig;
  freezeWindows: FreezeWindow[];
  requiredApprovals: number; // 0 for dev, 1+ for prod
  requireSeparationOfDuties: boolean;
  autoPromoteFrom: UUID | null; // auto-promote from this env
  promotionPolicy: string;      // OPA policy name
  createdAt: DateTime;
  updatedAt: DateTime;
}

// Per-environment deployment settings.
interface EnvironmentConfig {
  variables: Record<string, string>;     // env-specific variables
  secrets: SecretReference[];            // vault references
  registryOverrides: RegistryOverride[]; // per-env registry
  agentLabels: string[];                 // required agent labels
  deploymentTimeout: number;             // seconds
  healthCheckConfig: HealthCheckConfig;
}

// A window during which promotions into the environment are blocked.
interface FreezeWindow {
  id: UUID;
  start: DateTime;
  end: DateTime;
  reason: string;
  createdBy: UUID;
  exceptions: UUID[]; // users who can override
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Module: `target-registry`
|
||||
|
||||
| Aspect | Specification |
|
||||
|--------|---------------|
|
||||
| **Responsibility** | Deployment target inventory; capability tracking |
|
||||
| **Dependencies** | `environment-manager`, `agent-manager` |
|
||||
| **Data Entities** | `Target`, `TargetGroup`, `TargetCapability` |
|
||||
| **Events Produced** | `target.created`, `target.updated`, `target.deleted`, `target.health_changed` |
|
||||
|
||||
**Target Types** (plugin-provided):
|
||||
|
||||
| Type | Description |
|
||||
|------|-------------|
|
||||
| `docker_host` | Single Docker host |
|
||||
| `compose_host` | Docker Compose host |
|
||||
| `ssh_remote` | Generic SSH target |
|
||||
| `winrm_remote` | Windows remote target |
|
||||
| `ecs_service` | AWS ECS service |
|
||||
| `nomad_job` | HashiCorp Nomad job |
|
||||
|
||||
**Target Entity**:
|
||||
```typescript
|
||||
// A single deployment target within an environment.
interface Target {
  id: UUID;
  tenantId: UUID;
  environmentId: UUID;
  name: string;                  // "prod-web-01"
  targetType: string;            // "docker_host"
  connection: TargetConnection;  // type-specific
  capabilities: TargetCapability[];
  labels: Record<string, string>; // for grouping
  healthStatus: HealthStatus;
  lastHealthCheck: DateTime;
  deploymentDirectory: string;   // where artifacts are placed
  currentDigest: string | null;  // what's currently deployed
  agentId: UUID | null;          // assigned agent
}

// Connection details; which optional fields apply depends on targetType.
interface TargetConnection {
  // Common fields
  host: string;
  port: number;

  // Type-specific (examples)
  // docker_host:
  dockerSocket?: string;
  tlsCert?: SecretReference;

  // ssh_remote:
  username?: string;
  privateKey?: SecretReference;

  // ecs_service:
  cluster?: string;
  service?: string;
  region?: string;
  roleArn?: string;
}

// A named grouping of targets within one environment.
interface TargetGroup {
  id: UUID;
  tenantId: UUID;
  environmentId: UUID;
  name: string;
  labels: Record<string, string>;
  createdAt: DateTime;
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Module: `agent-manager`
|
||||
|
||||
| Aspect | Specification |
|
||||
|--------|---------------|
|
||||
| **Responsibility** | Agent registration, heartbeat, capability advertisement |
|
||||
| **Dependencies** | `authority` (for agent tokens) |
|
||||
| **Data Entities** | `Agent`, `AgentCapability`, `AgentHeartbeat` |
|
||||
| **Events Produced** | `agent.registered`, `agent.online`, `agent.offline`, `agent.capability_changed` |
|
||||
|
||||
**Agent Lifecycle**:
|
||||
1. Agent starts, requests registration token from Authority
|
||||
2. Agent registers with capabilities and labels
|
||||
3. Agent sends heartbeats (default: 30s interval)
|
||||
4. Agent pulls tasks from task queue
|
||||
5. Agent reports task completion/failure
|
||||
|
||||
**Agent Entity**:
|
||||
```typescript
|
||||
// A registered deployment agent and its advertised state.
interface Agent {
  id: UUID;
  tenantId: UUID;
  name: string;
  version: string;
  capabilities: AgentCapability[];
  labels: Record<string, string>;
  status: "online" | "offline" | "degraded";
  lastHeartbeat: DateTime;
  assignedTargets: UUID[];
  resourceUsage: ResourceUsage;
}

// One capability an agent advertises at registration time.
interface AgentCapability {
  type: string;    // "docker", "compose", "ssh", "winrm"
  version: string; // capability version
  config: object;  // capability-specific config
}

// Resource snapshot reported with heartbeats.
interface ResourceUsage {
  cpuPercent: number;
  memoryPercent: number;
  diskPercent: number;
  activeTasks: number;
}
|
||||
```
|
||||
|
||||
**Agent Registration Protocol**:
|
||||
```
|
||||
1. Admin generates registration token (one-time use)
|
||||
POST /api/v1/admin/agent-tokens
|
||||
→ { token: "reg_xxx", expiresAt: "..." }
|
||||
|
||||
2. Agent starts with registration token
|
||||
./stella-agent --register --token=reg_xxx
|
||||
|
||||
3. Agent requests mTLS certificate
|
||||
POST /api/v1/agents/register
|
||||
Headers: X-Registration-Token: reg_xxx
|
||||
Body: { name, version, capabilities, csr }
|
||||
→ { agentId, certificate, caCertificate }
|
||||
|
||||
4. Agent establishes mTLS connection
|
||||
Uses issued certificate for all subsequent requests
|
||||
|
||||
5. Agent requests short-lived JWT for task execution
|
||||
POST /api/v1/agents/token (over mTLS)
|
||||
→ { token, expiresIn: 3600 } // 1 hour
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Module: `inventory-sync`
|
||||
|
||||
| Aspect | Specification |
|
||||
|--------|---------------|
|
||||
| **Responsibility** | Drift detection; expected vs actual state reconciliation |
|
||||
| **Dependencies** | `target-registry`, `agent-manager` |
|
||||
| **Events Produced** | `inventory.drift_detected`, `inventory.reconciled` |
|
||||
|
||||
**Drift Detection Process**:
|
||||
1. Read `stella.version.json` from target deployment directory
|
||||
2. Compare with expected state in database
|
||||
3. Flag discrepancies (digest mismatch, missing sticker, unexpected files)
|
||||
4. Report on dashboard
|
||||
|
||||
**Drift Detection Types**:
|
||||
|
||||
| Drift Type | Description | Severity |
|
||||
|------------|-------------|----------|
|
||||
| `digest_mismatch` | Running digest differs from expected | Critical |
|
||||
| `missing_sticker` | No version sticker found on target | Warning |
|
||||
| `stale_sticker` | Sticker timestamp older than last deployment | Warning |
|
||||
| `orphan_container` | Container not managed by Stella | Info |
|
||||
| `extra_files` | Unexpected files in deployment directory | Info |
|
||||
|
||||
---
|
||||
|
||||
## Cache Eviction Policies
|
||||
|
||||
Environment configurations and target states are cached to improve performance. **All caches MUST have bounded size and TTL-based eviction**:
|
||||
|
||||
| Cache Type | Purpose | TTL | Max Size | Eviction Strategy |
|
||||
|-----------|---------|-----|----------|-------------------|
|
||||
| **Environment Configs** | Environment configuration data | 30 minutes | 500 entries | Sliding expiration |
|
||||
| **Target Health** | Target health status | 5 minutes | 2,000 entries | Sliding expiration |
|
||||
| **Agent Capabilities** | Agent capability advertisement | 10 minutes | 1,000 entries | Sliding expiration |
|
||||
| **Freeze Windows** | Active freeze window checks | 15 minutes | 100 entries | Absolute expiration |
|
||||
|
||||
**Implementation**:
|
||||
```csharp
|
||||
/// <summary>
/// Bounded, TTL-based cache for environment configuration data.
/// Capacity is capped at 500 entries with a 30-minute sliding expiration,
/// per the module's cache eviction policy table.
/// </summary>
public class EnvironmentConfigCache
{
    private readonly MemoryCache _cache;

    public EnvironmentConfigCache()
    {
        var cacheOptions = new MemoryCacheOptions
        {
            SizeLimit = 500 // Max 500 environment configs
        };
        _cache = new MemoryCache(cacheOptions);
    }

    /// <summary>Stores (or refreshes) the config for an environment.</summary>
    public void CacheConfig(Guid environmentId, EnvironmentConfig config)
    {
        var entryOptions = new MemoryCacheEntryOptions
        {
            Size = 1,
            SlidingExpiration = TimeSpan.FromMinutes(30) // 30-minute TTL
        };
        _cache.Set(environmentId, config, entryOptions);
    }

    /// <summary>Returns the cached config, or null when absent or expired.</summary>
    public EnvironmentConfig? GetCachedConfig(Guid environmentId)
    {
        return _cache.Get<EnvironmentConfig>(environmentId);
    }

    /// <summary>Drops the entry; called when an environment is updated.</summary>
    public void InvalidateConfig(Guid environmentId)
    {
        _cache.Remove(environmentId);
    }
}
|
||||
```
|
||||
|
||||
**Cache Invalidation**:
|
||||
- Environment configs: Invalidate on update
|
||||
- Target health: Invalidate on health check or deployment
|
||||
- Agent capabilities: Invalidate on capability change event
|
||||
- Freeze windows: Invalidate on window creation/deletion
|
||||
|
||||
**Reference**: See [Implementation Guide](../implementation-guide.md#caching) for cache implementation patterns.
|
||||
|
||||
---
|
||||
|
||||
## Database Schema
|
||||
|
||||
```sql
|
||||
-- Environments: ordered promotion stages per tenant.
CREATE TABLE release.environments (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    tenant_id UUID NOT NULL REFERENCES tenants(id) ON DELETE CASCADE,
    name VARCHAR(100) NOT NULL,
    display_name VARCHAR(255) NOT NULL,
    order_index INTEGER NOT NULL, -- promotion order within the tenant
    config JSONB NOT NULL DEFAULT '{}',
    freeze_windows JSONB NOT NULL DEFAULT '[]',
    required_approvals INTEGER NOT NULL DEFAULT 0,
    require_sod BOOLEAN NOT NULL DEFAULT FALSE, -- separation of duties
    auto_promote_from UUID REFERENCES release.environments(id),
    promotion_policy VARCHAR(255),
    deployment_timeout INTEGER NOT NULL DEFAULT 600, -- seconds
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    UNIQUE (tenant_id, name)
);

CREATE INDEX idx_environments_tenant ON release.environments(tenant_id);
CREATE INDEX idx_environments_order ON release.environments(tenant_id, order_index);

-- Target Groups: named groupings of targets within an environment.
CREATE TABLE release.target_groups (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    tenant_id UUID NOT NULL REFERENCES tenants(id) ON DELETE CASCADE,
    environment_id UUID NOT NULL REFERENCES release.environments(id) ON DELETE CASCADE,
    name VARCHAR(255) NOT NULL,
    labels JSONB NOT NULL DEFAULT '{}',
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    UNIQUE (tenant_id, environment_id, name)
);

-- Targets: individual deployment endpoints.
CREATE TABLE release.targets (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    tenant_id UUID NOT NULL REFERENCES tenants(id) ON DELETE CASCADE,
    environment_id UUID NOT NULL REFERENCES release.environments(id) ON DELETE CASCADE,
    target_group_id UUID REFERENCES release.target_groups(id),
    name VARCHAR(255) NOT NULL,
    target_type VARCHAR(100) NOT NULL,
    connection JSONB NOT NULL, -- type-specific connection details
    capabilities JSONB NOT NULL DEFAULT '[]',
    labels JSONB NOT NULL DEFAULT '{}',
    deployment_directory VARCHAR(500),
    health_status VARCHAR(50) NOT NULL DEFAULT 'unknown',
    last_health_check TIMESTAMPTZ,
    current_digest VARCHAR(100), -- what is currently deployed
    agent_id UUID REFERENCES release.agents(id),
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    UNIQUE (tenant_id, environment_id, name)
);

CREATE INDEX idx_targets_tenant_env ON release.targets(tenant_id, environment_id);
CREATE INDEX idx_targets_type ON release.targets(target_type);
-- GIN index supports containment queries on the labels JSONB.
CREATE INDEX idx_targets_labels ON release.targets USING GIN (labels);

-- Agents: registered deployment agents and their heartbeat state.
CREATE TABLE release.agents (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    tenant_id UUID NOT NULL REFERENCES tenants(id) ON DELETE CASCADE,
    name VARCHAR(255) NOT NULL,
    version VARCHAR(50) NOT NULL,
    capabilities JSONB NOT NULL DEFAULT '[]',
    labels JSONB NOT NULL DEFAULT '{}',
    status VARCHAR(50) NOT NULL DEFAULT 'offline',
    last_heartbeat TIMESTAMPTZ,
    resource_usage JSONB,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    UNIQUE (tenant_id, name)
);

CREATE INDEX idx_agents_tenant ON release.agents(tenant_id);
CREATE INDEX idx_agents_status ON release.agents(status);
CREATE INDEX idx_agents_capabilities ON release.agents USING GIN (capabilities);
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## API Endpoints
|
||||
|
||||
```yaml
|
||||
# Environments
|
||||
POST /api/v1/environments
|
||||
GET /api/v1/environments
|
||||
GET /api/v1/environments/{id}
|
||||
PUT /api/v1/environments/{id}
|
||||
DELETE /api/v1/environments/{id}
|
||||
|
||||
# Freeze Windows
|
||||
POST /api/v1/environments/{envId}/freeze-windows
|
||||
GET /api/v1/environments/{envId}/freeze-windows
|
||||
DELETE /api/v1/environments/{envId}/freeze-windows/{windowId}
|
||||
|
||||
# Target Groups
|
||||
POST /api/v1/environments/{envId}/target-groups
|
||||
GET /api/v1/environments/{envId}/target-groups
|
||||
GET /api/v1/target-groups/{id}
|
||||
PUT /api/v1/target-groups/{id}
|
||||
DELETE /api/v1/target-groups/{id}
|
||||
|
||||
# Targets
|
||||
POST /api/v1/targets
|
||||
GET /api/v1/targets
|
||||
GET /api/v1/targets/{id}
|
||||
PUT /api/v1/targets/{id}
|
||||
DELETE /api/v1/targets/{id}
|
||||
POST /api/v1/targets/{id}/health-check
|
||||
GET /api/v1/targets/{id}/sticker
|
||||
GET /api/v1/targets/{id}/drift
|
||||
|
||||
# Agents
|
||||
POST /api/v1/agents/register
|
||||
GET /api/v1/agents
|
||||
GET /api/v1/agents/{id}
|
||||
PUT /api/v1/agents/{id}
|
||||
DELETE /api/v1/agents/{id}
|
||||
POST /api/v1/agents/{id}/heartbeat
|
||||
POST /api/v1/agents/{id}/tasks/{taskId}/complete
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## References
|
||||
|
||||
- [Module Overview](overview.md)
|
||||
- [Agent Specification](agents.md)
|
||||
- [API Documentation](../api/environments.md)
|
||||
- [Agent Security](../security/agent-security.md)
|
||||
575
docs/modules/release-orchestrator/modules/evidence.md
Normal file
575
docs/modules/release-orchestrator/modules/evidence.md
Normal file
@@ -0,0 +1,575 @@
|
||||
# RELEVI: Release Evidence
|
||||
|
||||
**Purpose**: Cryptographically sealed evidence packets for audit-grade release governance.
|
||||
|
||||
## Modules
|
||||
|
||||
### Module: `evidence-collector`
|
||||
|
||||
| Aspect | Specification |
|
||||
|--------|---------------|
|
||||
| **Responsibility** | Evidence aggregation; packet composition |
|
||||
| **Dependencies** | `promotion-manager`, `deploy-orchestrator`, `decision-engine` |
|
||||
| **Data Entities** | `EvidencePacket`, `EvidenceContent` |
|
||||
| **Events Produced** | `evidence.collected`, `evidence.packet_created` |
|
||||
|
||||
**Evidence Packet Structure**:
|
||||
```typescript
|
||||
// Immutable, cryptographically signed evidence record for a promotion.
interface EvidencePacket {
  id: UUID;
  tenantId: UUID;
  promotionId: UUID;
  packetType: EvidencePacketType;
  content: EvidenceContent;
  contentHash: string;  // SHA-256 of content
  signature: string;    // Cryptographic signature
  signerKeyRef: string; // Reference to signing key
  createdAt: DateTime;
  // Note: No updatedAt - packets are immutable
}

type EvidencePacketType =
  | "release_decision" // Promotion decision evidence
  | "deployment"       // Deployment execution evidence
  | "rollback"         // Rollback evidence
  | "ab_promotion";    // A/B promotion evidence

// The signed payload: what / where / who / why / how / when of a promotion.
interface EvidenceContent {
  // Metadata
  version: "1.0";
  generatedAt: DateTime;
  generatorVersion: string;

  // What
  release: {
    id: UUID;
    name: string;
    components: Array<{
      name: string;
      digest: string;
      semver: string;
      imageRepository: string;
    }>;
    sourceRef: SourceReference | null;
  };

  // Where
  environment: {
    id: UUID;
    name: string;
    targets: Array<{
      id: UUID;
      name: string;
      type: string;
    }>;
  };

  // Who
  actors: {
    requester: {
      id: UUID;
      name: string;
      email: string;
    };
    approvers: Array<{
      id: UUID;
      name: string;
      action: string;
      at: DateTime;
      comment: string | null;
    }>;
  };

  // Why
  decision: {
    result: "allow" | "deny";
    gates: Array<{
      type: string;
      name: string;
      status: string;
      message: string;
      details: Record<string, any>;
    }>;
    reasons: string[];
  };

  // How
  execution: {
    workflowRunId: UUID | null;
    deploymentJobId: UUID | null;
    artifacts: Array<{
      type: string;
      name: string;
      contentHash: string;
    }>;
    logs: string | null; // Compressed/truncated
  };

  // When
  timeline: {
    requestedAt: DateTime;
    decidedAt: DateTime | null;
    startedAt: DateTime | null;
    completedAt: DateTime | null;
  };

  // Integrity
  inputsHash: string;              // Hash of all inputs for replay
  previousEvidenceId: UUID | null; // Chain to previous evidence
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Module: `evidence-signer`
|
||||
|
||||
| Aspect | Specification |
|
||||
|--------|---------------|
|
||||
| **Responsibility** | Cryptographic signing of evidence packets |
|
||||
| **Dependencies** | `authority`, `vault` (for key storage) |
|
||||
| **Algorithms** | RS256, ES256, Ed25519 |
|
||||
|
||||
**Signing Process**:
|
||||
```typescript
|
||||
/**
 * Signs evidence content and verifies stored packets. The signature
 * covers the SHA-256 hash of the RFC 8785 canonical JSON form, so
 * byte-level formatting differences never break verification.
 */
class EvidenceSigner {
  /** Canonicalize, hash, and sign evidence content with the active key. */
  async sign(content: EvidenceContent): Promise<SignedEvidence> {
    // 1. Canonicalize content (RFC 8785)
    const canonicalJson = canonicalize(content);

    // 2. Compute content hash
    const contentHash = crypto
      .createHash("sha256")
      .update(canonicalJson)
      .digest("hex");

    // 3. Get signing key from vault
    const keyRef = await this.getActiveSigningKey();
    const privateKey = await this.vault.getPrivateKey(keyRef);

    // 4. Sign the content hash
    const signature = await this.signWithKey(contentHash, privateKey);

    return {
      content,
      contentHash: `sha256:${contentHash}`,
      signature: base64Encode(signature),
      signerKeyRef: keyRef,
      algorithm: this.config.signatureAlgorithm,
    };
  }

  /**
   * Re-derive the canonical hash of a stored packet, then check both the
   * hash and the signature. Returns a result object rather than throwing.
   */
  async verify(packet: EvidencePacket): Promise<VerificationResult> {
    // 1. Canonicalize stored content
    const canonicalJson = canonicalize(packet.content);

    // 2. Verify content hash
    const computedHash = crypto
      .createHash("sha256")
      .update(canonicalJson)
      .digest("hex");

    if (`sha256:${computedHash}` !== packet.contentHash) {
      return { valid: false, error: "Content hash mismatch" };
    }

    // 3. Get public key
    const publicKey = await this.vault.getPublicKey(packet.signerKeyRef);

    // 4. Verify signature (over the raw hex hash, matching sign())
    const signatureValid = await this.verifySignature(
      computedHash,
      base64Decode(packet.signature),
      publicKey
    );

    return {
      valid: signatureValid,
      signerKeyRef: packet.signerKeyRef,
      signedAt: packet.createdAt,
    };
  }
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Module: `sticker-writer`
|
||||
|
||||
| Aspect | Specification |
|
||||
|--------|---------------|
|
||||
| **Responsibility** | Version sticker generation and placement |
|
||||
| **Dependencies** | `deploy-orchestrator`, `agent-manager` |
|
||||
| **Data Entities** | `VersionSticker` |
|
||||
|
||||
**Version Sticker Schema**:
|
||||
```typescript
|
||||
// JSON document written to each target after deployment; links the
// deployed bits back to the promotion and its evidence chain.
interface VersionSticker {
  stella_version: "1.0";

  // Release identity
  release_id: UUID;
  release_name: string;

  // Component details
  components: Array<{
    name: string;
    digest: string;
    semver: string;
    tag: string;
    image_repository: string;
  }>;

  // Deployment context
  environment: string;
  environment_id: UUID;
  deployed_at: string; // ISO 8601
  deployed_by: UUID;

  // Traceability
  promotion_id: UUID;
  workflow_run_id: UUID;

  // Evidence chain
  evidence_packet_id: UUID;
  evidence_packet_hash: string;
  policy_decision_hash: string;

  // Orchestrator info
  orchestrator_version: string;

  // Source reference
  source_ref?: {
    commit_sha: string;
    branch: string;
    repository: string;
  };
}
|
||||
```
|
||||
|
||||
**Sticker Placement**:
|
||||
- Written to `/var/stella/version.json` on each target
|
||||
- Atomic write (write to temp, rename)
|
||||
- Read during drift detection
|
||||
- Verified against expected state
|
||||
|
||||
---
|
||||
|
||||
### Module: `audit-exporter`
|
||||
|
||||
| Aspect | Specification |
|
||||
|--------|---------------|
|
||||
| **Responsibility** | Compliance report generation; evidence export |
|
||||
| **Dependencies** | `evidence-collector` |
|
||||
| **Export Formats** | JSON, PDF, CSV |
|
||||
|
||||
**Audit Report Types**:
|
||||
|
||||
| Report Type | Description |
|
||||
|-------------|-------------|
|
||||
| `release_audit` | Full audit trail for a release |
|
||||
| `environment_audit` | All deployments to an environment |
|
||||
| `compliance_summary` | Summary for compliance review |
|
||||
| `change_log` | Chronological change log |
|
||||
|
||||
**Report Generation**:
|
||||
```typescript
|
||||
// Request for a compliance/audit report over a release or environment scope.
interface AuditReportRequest {
  type: AuditReportType;
  scope: {
    releaseId?: UUID;
    environmentId?: UUID;
    from?: DateTime;
    to?: DateTime;
  };
  format: "json" | "pdf" | "csv";
  options?: {
    includeDecisionDetails: boolean;
    includeApproverDetails: boolean;
    includeLogs: boolean;
    includeArtifacts: boolean;
  };
}

// Generated audit report, including evidence-chain verification results.
interface AuditReport {
  id: UUID;
  type: AuditReportType;
  scope: ReportScope;
  generatedAt: DateTime;
  generatedBy: UUID;

  summary: {
    totalPromotions: number;
    successfulDeployments: number;
    failedDeployments: number;
    rollbacks: number;
    averageDeploymentTime: number;
  };

  entries: AuditEntry[];

  // For compliance
  signatureChain: {
    valid: boolean;
    verifiedPackets: number;
    invalidPackets: number;
  };
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Immutability Enforcement
|
||||
|
||||
Evidence packets are append-only. This is enforced at multiple levels:
|
||||
|
||||
### Database Level
|
||||
```sql
|
||||
-- Evidence packets table with no UPDATE/DELETE
CREATE TABLE release.evidence_packets (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    tenant_id UUID NOT NULL REFERENCES tenants(id) ON DELETE CASCADE,
    promotion_id UUID NOT NULL REFERENCES release.promotions(id),
    packet_type VARCHAR(50) NOT NULL CHECK (packet_type IN (
        'release_decision', 'deployment', 'rollback', 'ab_promotion'
    )),
    content JSONB NOT NULL,
    content_hash VARCHAR(100) NOT NULL,
    signature TEXT,
    signer_key_ref VARCHAR(255),
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
    -- Note: No updated_at column; immutable by design
);

-- Append-only enforcement via trigger
CREATE OR REPLACE FUNCTION prevent_evidence_modification()
RETURNS TRIGGER AS $$
BEGIN
    RAISE EXCEPTION 'Evidence packets are immutable and cannot be modified or deleted';
END;
$$ LANGUAGE plpgsql;

-- The trigger must name the schema-qualified table: the previous
-- unqualified "evidence_packets" would fail (or bind to a different
-- table) unless "release" happened to be on the search_path.
CREATE TRIGGER evidence_packets_immutable
BEFORE UPDATE OR DELETE ON release.evidence_packets
FOR EACH ROW EXECUTE FUNCTION prevent_evidence_modification();

-- Revoke UPDATE/DELETE from application role
REVOKE UPDATE, DELETE ON release.evidence_packets FROM app_role;

-- Version stickers: record of what was written to each target.
CREATE TABLE release.version_stickers (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    tenant_id UUID NOT NULL REFERENCES tenants(id) ON DELETE CASCADE,
    target_id UUID NOT NULL REFERENCES release.targets(id),
    release_id UUID NOT NULL REFERENCES release.releases(id),
    promotion_id UUID NOT NULL REFERENCES release.promotions(id),
    sticker_content JSONB NOT NULL,
    content_hash VARCHAR(100) NOT NULL,
    written_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    verified_at TIMESTAMPTZ,
    drift_detected BOOLEAN NOT NULL DEFAULT FALSE
);

CREATE INDEX idx_version_stickers_target ON release.version_stickers(target_id);
CREATE INDEX idx_version_stickers_release ON release.version_stickers(release_id);
CREATE INDEX idx_evidence_packets_promotion ON release.evidence_packets(promotion_id);
CREATE INDEX idx_evidence_packets_created ON release.evidence_packets(created_at DESC);
|
||||
```
|
||||
|
||||
### Application Level
|
||||
```csharp
|
||||
// Evidence service enforces immutability
|
||||
public sealed class EvidenceService
|
||||
{
|
||||
// Only Create method - no Update or Delete
|
||||
public async Task<EvidencePacket> CreateAsync(
|
||||
EvidenceContent content,
|
||||
CancellationToken ct)
|
||||
{
|
||||
// Sign content
|
||||
var signed = await _signer.SignAsync(content, ct);
|
||||
|
||||
// Store (append-only)
|
||||
var packet = new EvidencePacket
|
||||
{
|
||||
Id = Guid.NewGuid(),
|
||||
TenantId = content.TenantId,
|
||||
PromotionId = content.PromotionId,
|
||||
PacketType = content.PacketType,
|
||||
Content = content,
|
||||
ContentHash = signed.ContentHash,
|
||||
Signature = signed.Signature,
|
||||
SignerKeyRef = signed.SignerKeyRef,
|
||||
CreatedAt = DateTime.UtcNow,
|
||||
};
|
||||
|
||||
await _repository.InsertAsync(packet, ct);
|
||||
return packet;
|
||||
}
|
||||
|
||||
// Read methods only
|
||||
public async Task<EvidencePacket> GetAsync(Guid id, CancellationToken ct);
|
||||
public async Task<IReadOnlyList<EvidencePacket>> ListAsync(
|
||||
EvidenceFilter filter, CancellationToken ct);
|
||||
public async Task<VerificationResult> VerifyAsync(
|
||||
Guid id, CancellationToken ct);
|
||||
|
||||
// No Update or Delete methods exist
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Evidence Chain
|
||||
|
||||
Evidence packets form a verifiable chain:
|
||||
|
||||
```
|
||||
┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐
|
||||
│ Evidence #1 │ │ Evidence #2 │ │ Evidence #3 │
|
||||
│ (Dev Deploy) │────►│ (Stage Deploy) │────►│ (Prod Deploy) │
|
||||
│ │ │ │ │ │
|
||||
│ prevEvidenceId: │ │ prevEvidenceId: │ │ prevEvidenceId: │
|
||||
│ null │ │ #1 │ │ #2 │
|
||||
│ │ │ │ │ │
|
||||
│ contentHash: │ │ contentHash: │ │ contentHash: │
|
||||
│ sha256:abc... │ │ sha256:def... │ │ sha256:ghi... │
|
||||
└─────────────────┘ └─────────────────┘ └─────────────────┘
|
||||
```
|
||||
|
||||
**Chain Verification**:
|
||||
```typescript
|
||||
async function verifyEvidenceChain(releaseId: UUID): Promise<ChainVerificationResult> {
|
||||
const packets = await getPacketsForRelease(releaseId);
|
||||
const results: PacketVerificationResult[] = [];
|
||||
|
||||
let previousHash: string | null = null;
|
||||
|
||||
for (const packet of packets) {
|
||||
// 1. Verify packet signature
|
||||
const signatureValid = await verifySignature(packet);
|
||||
|
||||
// 2. Verify content hash
|
||||
const contentValid = await verifyContentHash(packet);
|
||||
|
||||
// 3. Verify chain link
|
||||
const chainValid = packet.content.previousEvidenceId === null
|
||||
? previousHash === null
|
||||
: await verifyPreviousLink(packet, previousHash);
|
||||
|
||||
results.push({
|
||||
packetId: packet.id,
|
||||
signatureValid,
|
||||
contentValid,
|
||||
chainValid,
|
||||
valid: signatureValid && contentValid && chainValid,
|
||||
});
|
||||
|
||||
previousHash = packet.contentHash;
|
||||
}
|
||||
|
||||
return {
|
||||
valid: results.every(r => r.valid),
|
||||
packets: results,
|
||||
};
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## API Endpoints
|
||||
|
||||
```yaml
|
||||
# Evidence Packets
|
||||
GET /api/v1/evidence-packets
|
||||
Query: ?promotionId={uuid}&type={type}&from={date}&to={date}
|
||||
Response: EvidencePacket[]
|
||||
|
||||
GET /api/v1/evidence-packets/{id}
|
||||
Response: EvidencePacket (full content)
|
||||
|
||||
GET /api/v1/evidence-packets/{id}/verify
|
||||
Response: VerificationResult
|
||||
|
||||
GET /api/v1/evidence-packets/{id}/download
|
||||
Query: ?format={json|pdf}
|
||||
Response: binary
|
||||
|
||||
# Evidence Chain
|
||||
GET /api/v1/releases/{id}/evidence-chain
|
||||
Response: EvidenceChain
|
||||
|
||||
GET /api/v1/releases/{id}/evidence-chain/verify
|
||||
Response: ChainVerificationResult
|
||||
|
||||
# Audit Reports
|
||||
POST /api/v1/audit-reports
|
||||
Body: {
|
||||
type: "release" | "environment" | "compliance",
|
||||
scope: { releaseId?, environmentId?, from?, to? },
|
||||
format: "json" | "pdf" | "csv"
|
||||
}
|
||||
Response: { reportId: UUID, status: "generating" }
|
||||
|
||||
GET /api/v1/audit-reports/{id}
|
||||
Response: { status, downloadUrl? }
|
||||
|
||||
GET /api/v1/audit-reports/{id}/download
|
||||
Response: binary
|
||||
|
||||
# Version Stickers
|
||||
GET /api/v1/version-stickers
|
||||
Query: ?targetId={uuid}&releaseId={uuid}
|
||||
Response: VersionSticker[]
|
||||
|
||||
GET /api/v1/version-stickers/{id}
|
||||
Response: VersionSticker
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Deterministic Replay
|
||||
|
||||
Evidence packets enable deterministic replay — given the same inputs and policy version, the same decision is produced:
|
||||
|
||||
```typescript
|
||||
async function replayDecision(evidencePacket: EvidencePacket): Promise<ReplayResult> {
|
||||
const content = evidencePacket.content;
|
||||
|
||||
// 1. Verify inputs hash
|
||||
const currentInputsHash = computeInputsHash(
|
||||
content.release,
|
||||
content.environment,
|
||||
content.decision.gates
|
||||
);
|
||||
|
||||
if (currentInputsHash !== content.inputsHash) {
|
||||
return { valid: false, error: "Inputs have changed since original decision" };
|
||||
}
|
||||
|
||||
// 2. Re-evaluate decision with same inputs
|
||||
const replayedDecision = await evaluateDecision(
|
||||
content.release,
|
||||
content.environment,
|
||||
{ asOf: content.timeline.decidedAt } // Use policy version from that time
|
||||
);
|
||||
|
||||
// 3. Compare decisions
|
||||
const decisionsMatch = replayedDecision.result === content.decision.result;
|
||||
|
||||
return {
|
||||
valid: decisionsMatch,
|
||||
originalDecision: content.decision.result,
|
||||
replayedDecision: replayedDecision.result,
|
||||
differences: decisionsMatch ? [] : computeDifferences(content.decision, replayedDecision),
|
||||
};
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## References
|
||||
|
||||
- [Module Overview](overview.md)
|
||||
- [Design Principles](../design/principles.md)
|
||||
- [Security Architecture](../security/overview.md)
|
||||
- [Evidence Schema](../appendices/evidence-schema.md)
|
||||
373
docs/modules/release-orchestrator/modules/integration-hub.md
Normal file
373
docs/modules/release-orchestrator/modules/integration-hub.md
Normal file
@@ -0,0 +1,373 @@
|
||||
# INTHUB: Integration Hub
|
||||
|
||||
**Purpose**: Central management of all external integrations (SCM, CI, registries, vaults, targets).
|
||||
|
||||
## Modules
|
||||
|
||||
### Module: `integration-manager`
|
||||
|
||||
| Aspect | Specification |
|
||||
|--------|---------------|
|
||||
| **Responsibility** | CRUD for integration instances; plugin type registry |
|
||||
| **Dependencies** | `plugin-registry`, `authority` (for credentials) |
|
||||
| **Data Entities** | `Integration`, `IntegrationType`, `IntegrationCredential` |
|
||||
| **Events Produced** | `integration.created`, `integration.updated`, `integration.deleted`, `integration.health_changed` |
|
||||
| **Events Consumed** | `plugin.registered`, `plugin.unregistered` |
|
||||
|
||||
**Key Operations**:
|
||||
```
|
||||
CreateIntegration(type, name, config, credentials) → Integration
|
||||
UpdateIntegration(id, config, credentials) → Integration
|
||||
DeleteIntegration(id) → void
|
||||
TestConnection(id) → ConnectionTestResult
|
||||
DiscoverResources(id, resourceType) → Resource[]
|
||||
GetIntegrationHealth(id) → HealthStatus
|
||||
ListIntegrations(filter) → Integration[]
|
||||
```
|
||||
|
||||
**Integration Entity**:
|
||||
```typescript
|
||||
interface Integration {
|
||||
id: UUID;
|
||||
tenantId: UUID;
|
||||
type: string; // "scm.github", "registry.harbor"
|
||||
name: string; // user-defined name
|
||||
config: IntegrationConfig; // type-specific config
|
||||
credentialId: UUID; // reference to vault
|
||||
healthStatus: HealthStatus;
|
||||
lastHealthCheck: DateTime;
|
||||
createdAt: DateTime;
|
||||
updatedAt: DateTime;
|
||||
}
|
||||
|
||||
interface IntegrationConfig {
|
||||
endpoint: string;
|
||||
authMode: "token" | "oauth" | "mtls" | "iam";
|
||||
timeout: number;
|
||||
retryPolicy: RetryPolicy;
|
||||
customHeaders?: Record<string, string>;
|
||||
// Type-specific fields added by plugin
|
||||
[key: string]: any;
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Module: `connection-profiles`
|
||||
|
||||
| Aspect | Specification |
|
||||
|--------|---------------|
|
||||
| **Responsibility** | Default settings management; "last used" pattern |
|
||||
| **Dependencies** | `integration-manager` |
|
||||
| **Data Entities** | `ConnectionProfile`, `ProfileTemplate` |
|
||||
|
||||
**Behavior**: When a user adds a new integration instance:
|
||||
1. The wizard defaults to the last-used endpoint, auth mode, and network settings
|
||||
2. Secrets are **never** auto-reused (explicit confirmation required)
|
||||
3. User can save as named profile for reuse
|
||||
|
||||
**Profile Entity**:
|
||||
```typescript
|
||||
interface ConnectionProfile {
|
||||
id: UUID;
|
||||
tenantId: UUID;
|
||||
name: string; // "Production GitHub"
|
||||
integrationType: string;
|
||||
defaultConfig: Partial<IntegrationConfig>;
|
||||
isDefault: boolean;
|
||||
lastUsedAt: DateTime;
|
||||
createdBy: UUID;
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Module: `connector-runtime`
|
||||
|
||||
| Aspect | Specification |
|
||||
|--------|---------------|
|
||||
| **Responsibility** | Execute plugin connector logic in controlled environment |
|
||||
| **Dependencies** | `plugin-loader`, `plugin-sandbox` |
|
||||
| **Protocol** | gRPC (preferred) or HTTP/REST |
|
||||
|
||||
**Connector Interface** (implemented by plugins):
|
||||
```protobuf
|
||||
service Connector {
|
||||
// Connection management
|
||||
rpc TestConnection(TestConnectionRequest) returns (TestConnectionResponse);
|
||||
rpc GetHealth(HealthRequest) returns (HealthResponse);
|
||||
|
||||
// Resource discovery
|
||||
rpc DiscoverResources(DiscoverRequest) returns (DiscoverResponse);
|
||||
rpc ListRepositories(ListReposRequest) returns (ListReposResponse);
|
||||
rpc ListBranches(ListBranchesRequest) returns (ListBranchesResponse);
|
||||
rpc ListTags(ListTagsRequest) returns (ListTagsResponse);
|
||||
|
||||
// Registry operations
|
||||
rpc ResolveTagToDigest(ResolveRequest) returns (ResolveResponse);
|
||||
rpc FetchManifest(ManifestRequest) returns (ManifestResponse);
|
||||
rpc VerifyDigest(VerifyRequest) returns (VerifyResponse);
|
||||
|
||||
// Secrets operations
|
||||
rpc GetSecretsRef(SecretsRequest) returns (SecretsResponse);
|
||||
rpc FetchSecret(FetchSecretRequest) returns (FetchSecretResponse);
|
||||
|
||||
// Workflow step execution
|
||||
rpc ExecuteStep(StepRequest) returns (stream StepResponse);
|
||||
rpc CancelStep(CancelRequest) returns (CancelResponse);
|
||||
}
|
||||
```
|
||||
|
||||
**Request/Response Types**:
|
||||
```protobuf
|
||||
message TestConnectionRequest {
|
||||
string integration_id = 1;
|
||||
map<string, string> config = 2;
|
||||
string credential_ref = 3;
|
||||
}
|
||||
|
||||
message TestConnectionResponse {
|
||||
bool success = 1;
|
||||
string error_message = 2;
|
||||
map<string, string> details = 3;
|
||||
int64 latency_ms = 4;
|
||||
}
|
||||
|
||||
message ResolveRequest {
|
||||
string integration_id = 1;
|
||||
string image_ref = 2; // "myapp:v2.3.1"
|
||||
}
|
||||
|
||||
message ResolveResponse {
|
||||
string digest = 1; // "sha256:abc123..."
|
||||
string manifest_type = 2;
|
||||
int64 size_bytes = 3;
|
||||
google.protobuf.Timestamp pushed_at = 4;
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Module: `doctor-checks`
|
||||
|
||||
| Aspect | Specification |
|
||||
|--------|---------------|
|
||||
| **Responsibility** | Integration health diagnostics; troubleshooting |
|
||||
| **Dependencies** | `integration-manager`, `connector-runtime` |
|
||||
|
||||
**Doctor Check Types**:
|
||||
|
||||
| Check | Purpose | Pass Criteria |
|
||||
|-------|---------|---------------|
|
||||
| **Connectivity** | Can reach endpoint | TCP connect succeeds |
|
||||
| **TLS** | Certificate valid | Chain validates, not expired |
|
||||
| **Authentication** | Credentials valid | Auth request succeeds |
|
||||
| **Authorization** | Permissions sufficient | Required scopes present |
|
||||
| **Version** | API version supported | Version in supported range |
|
||||
| **Rate Limit** | Quota available | >10% remaining |
|
||||
| **Latency** | Response time acceptable | <5s p99 |
|
||||
|
||||
**Doctor Check Output**:
|
||||
```typescript
|
||||
interface DoctorCheckResult {
|
||||
checkType: string;
|
||||
status: "pass" | "warn" | "fail";
|
||||
message: string;
|
||||
details: Record<string, any>;
|
||||
suggestions: string[];
|
||||
runAt: DateTime;
|
||||
durationMs: number;
|
||||
}
|
||||
|
||||
interface DoctorReport {
|
||||
integrationId: UUID;
|
||||
overallStatus: "healthy" | "degraded" | "unhealthy";
|
||||
checks: DoctorCheckResult[];
|
||||
generatedAt: DateTime;
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Cache Eviction Policies
|
||||
|
||||
Integration health status and connector results are cached to reduce load on external systems. **All caches MUST have bounded size and TTL-based eviction**:
|
||||
|
||||
| Cache Type | Purpose | TTL | Max Size | Eviction Strategy |
|
||||
|-----------|---------|-----|----------|-------------------|
|
||||
| **Health Checks** | Integration health status | 5 minutes | 1,000 entries | Sliding expiration |
|
||||
| **Connection Tests** | Test connection results | 2 minutes | 500 entries | Sliding expiration |
|
||||
| **Resource Discovery** | Discovered resources (repos, tags) | 10 minutes | 5,000 entries | Sliding expiration |
|
||||
| **Tag Resolution** | Tag → digest mappings | 1 hour | 10,000 entries | Absolute expiration |
|
||||
|
||||
**Implementation**:
|
||||
```csharp
|
||||
public class IntegrationHealthCache
|
||||
{
|
||||
private readonly MemoryCache _cache;
|
||||
|
||||
public IntegrationHealthCache()
|
||||
{
|
||||
_cache = new MemoryCache(new MemoryCacheOptions
|
||||
{
|
||||
SizeLimit = 1_000 // Max 1,000 integration health entries
|
||||
});
|
||||
}
|
||||
|
||||
public void CacheHealthStatus(Guid integrationId, HealthStatus status)
|
||||
{
|
||||
_cache.Set(integrationId, status, new MemoryCacheEntryOptions
|
||||
{
|
||||
Size = 1,
|
||||
SlidingExpiration = TimeSpan.FromMinutes(5) // 5-minute TTL
|
||||
});
|
||||
}
|
||||
|
||||
public HealthStatus? GetCachedHealthStatus(Guid integrationId)
|
||||
=> _cache.Get<HealthStatus>(integrationId);
|
||||
}
|
||||
```
|
||||
|
||||
**Reference**: See [Implementation Guide](../implementation-guide.md#caching) for cache implementation patterns.
|
||||
|
||||
---
|
||||
|
||||
## Integration Types
|
||||
|
||||
The following integration types are supported (via plugins):
|
||||
|
||||
### SCM Integrations
|
||||
|
||||
| Type | Plugin | Capabilities |
|
||||
|------|--------|--------------|
|
||||
| `scm.github` | Built-in | repos, branches, commits, webhooks, status |
|
||||
| `scm.gitlab` | Built-in | repos, branches, commits, webhooks, pipelines |
|
||||
| `scm.bitbucket` | Plugin | repos, branches, commits, webhooks |
|
||||
| `scm.azure_repos` | Plugin | repos, branches, commits, pipelines |
|
||||
|
||||
### Registry Integrations
|
||||
|
||||
| Type | Plugin | Capabilities |
|
||||
|------|--------|--------------|
|
||||
| `registry.harbor` | Built-in | repos, tags, digests, scanning status |
|
||||
| `registry.ecr` | Plugin | repos, tags, digests, IAM auth |
|
||||
| `registry.gcr` | Plugin | repos, tags, digests |
|
||||
| `registry.dockerhub` | Plugin | repos, tags, digests |
|
||||
| `registry.ghcr` | Plugin | repos, tags, digests |
|
||||
| `registry.acr` | Plugin | repos, tags, digests |
|
||||
|
||||
### Vault Integrations
|
||||
|
||||
| Type | Plugin | Capabilities |
|
||||
|------|--------|--------------|
|
||||
| `vault.hashicorp` | Built-in | KV, transit, PKI |
|
||||
| `vault.aws_secrets` | Plugin | secrets, IAM auth |
|
||||
| `vault.azure_keyvault` | Plugin | secrets, certificates |
|
||||
| `vault.gcp_secrets` | Plugin | secrets, IAM auth |
|
||||
|
||||
### CI Integrations
|
||||
|
||||
| Type | Plugin | Capabilities |
|
||||
|------|--------|--------------|
|
||||
| `ci.github_actions` | Built-in | workflows, runs, artifacts, status |
|
||||
| `ci.gitlab_ci` | Built-in | pipelines, jobs, artifacts |
|
||||
| `ci.jenkins` | Plugin | jobs, builds, artifacts |
|
||||
| `ci.azure_pipelines` | Plugin | pipelines, runs, artifacts |
|
||||
|
||||
### Router Integrations (for Progressive Delivery)
|
||||
|
||||
| Type | Plugin | Capabilities |
|
||||
|------|--------|--------------|
|
||||
| `router.nginx` | Plugin | upstream config, reload |
|
||||
| `router.haproxy` | Plugin | backend config, reload |
|
||||
| `router.traefik` | Plugin | dynamic config |
|
||||
| `router.aws_alb` | Plugin | target groups, listener rules |
|
||||
|
||||
---
|
||||
|
||||
## Database Schema
|
||||
|
||||
```sql
|
||||
-- Integration types (populated by plugins)
|
||||
CREATE TABLE release.integration_types (
|
||||
id TEXT PRIMARY KEY, -- "scm.github"
|
||||
plugin_id UUID REFERENCES release.plugins(id),
|
||||
display_name TEXT NOT NULL,
|
||||
description TEXT,
|
||||
icon_url TEXT,
|
||||
config_schema JSONB NOT NULL, -- JSON Schema for config
|
||||
capabilities TEXT[] NOT NULL, -- ["repos", "webhooks", "status"]
|
||||
created_at TIMESTAMPTZ NOT NULL DEFAULT now()
|
||||
);
|
||||
|
||||
-- Integration instances
|
||||
CREATE TABLE release.integrations (
|
||||
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
||||
tenant_id UUID NOT NULL REFERENCES tenants(id),
|
||||
type_id TEXT NOT NULL REFERENCES release.integration_types(id),
|
||||
name TEXT NOT NULL,
|
||||
config JSONB NOT NULL,
|
||||
credential_ref TEXT NOT NULL, -- vault reference
|
||||
health_status TEXT NOT NULL DEFAULT 'unknown',
|
||||
last_health_check TIMESTAMPTZ,
|
||||
created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
|
||||
updated_at TIMESTAMPTZ NOT NULL DEFAULT now(),
|
||||
created_by UUID NOT NULL REFERENCES users(id),
|
||||
UNIQUE(tenant_id, name)
|
||||
);
|
||||
|
||||
-- Connection profiles
|
||||
CREATE TABLE release.connection_profiles (
|
||||
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
||||
tenant_id UUID NOT NULL REFERENCES tenants(id),
|
||||
name TEXT NOT NULL,
|
||||
integration_type TEXT NOT NULL,
|
||||
default_config JSONB NOT NULL,
|
||||
is_default BOOLEAN NOT NULL DEFAULT false,
|
||||
last_used_at TIMESTAMPTZ,
|
||||
created_by UUID NOT NULL REFERENCES users(id),
|
||||
created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
|
||||
UNIQUE(tenant_id, name)
|
||||
);
|
||||
|
||||
-- Doctor check history
|
||||
CREATE TABLE release.doctor_checks (
|
||||
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
||||
integration_id UUID NOT NULL REFERENCES release.integrations(id),
|
||||
check_type TEXT NOT NULL,
|
||||
status TEXT NOT NULL,
|
||||
message TEXT,
|
||||
details JSONB,
|
||||
duration_ms INTEGER NOT NULL,
|
||||
run_at TIMESTAMPTZ NOT NULL DEFAULT now()
|
||||
);
|
||||
|
||||
CREATE INDEX idx_doctor_checks_integration ON release.doctor_checks(integration_id, run_at DESC);
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## API Endpoints
|
||||
|
||||
See [API Documentation](../api/overview.md) for full specification.
|
||||
|
||||
```
|
||||
GET /api/v1/integration-types # List available types
|
||||
GET /api/v1/integration-types/{type} # Get type details
|
||||
|
||||
GET /api/v1/integrations # List integrations
|
||||
POST /api/v1/integrations # Create integration
|
||||
GET /api/v1/integrations/{id} # Get integration
|
||||
PUT /api/v1/integrations/{id} # Update integration
|
||||
DELETE /api/v1/integrations/{id} # Delete integration
|
||||
POST /api/v1/integrations/{id}/test # Test connection
|
||||
GET /api/v1/integrations/{id}/health # Get health status
|
||||
POST /api/v1/integrations/{id}/doctor # Run doctor checks
|
||||
GET /api/v1/integrations/{id}/resources # Discover resources
|
||||
|
||||
GET /api/v1/connection-profiles # List profiles
|
||||
POST /api/v1/connection-profiles # Create profile
|
||||
GET /api/v1/connection-profiles/{id} # Get profile
|
||||
PUT /api/v1/connection-profiles/{id} # Update profile
|
||||
DELETE /api/v1/connection-profiles/{id} # Delete profile
|
||||
```
|
||||
203
docs/modules/release-orchestrator/modules/overview.md
Normal file
203
docs/modules/release-orchestrator/modules/overview.md
Normal file
@@ -0,0 +1,203 @@
|
||||
# Module Landscape Overview
|
||||
|
||||
The Stella Ops Suite comprises existing modules (vulnerability scanning) and new modules (release orchestration). Modules are organized into **themes** (functional areas).
|
||||
|
||||
## Architecture Diagram
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────────────────────┐
|
||||
│ STELLA OPS SUITE │
|
||||
│ │
|
||||
│ ┌───────────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ EXISTING THEMES (Vulnerability) │ │
|
||||
│ │ │ │
|
||||
│ │ INGEST VEXOPS REASON SCANENG EVIDENCE │ │
|
||||
│ │ ├─concelier ├─excititor ├─policy ├─scanner ├─locker │ │
|
||||
│ │ └─advisory-ai └─linksets └─opa-runtime ├─sbom-gen ├─export │ │
|
||||
│ │ └─reachability └─timeline │ │
|
||||
│ │ │ │
|
||||
│ │ RUNTIME JOBCTRL OBSERVE REPLAY DEVEXP │ │
|
||||
│ │ ├─signals ├─scheduler ├─notifier └─replay-core ├─cli │ │
|
||||
│ │ ├─graph ├─orchestrator └─telemetry ├─web-ui │ │
|
||||
│ │ └─zastava └─task-runner └─sdk │ │
|
||||
│ └───────────────────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ ┌───────────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ NEW THEMES (Release Orchestration) │ │
|
||||
│ │ │ │
|
||||
│ │ INTHUB (Integration Hub) │ │
|
||||
│ │ ├─integration-manager Central registry of configured integrations │ │
|
||||
│ │ ├─connection-profiles Default settings + credential management │ │
|
||||
│ │ ├─connector-runtime Plugin connector execution environment │ │
|
||||
│ │ └─doctor-checks Integration health diagnostics │ │
|
||||
│ │ │ │
|
||||
│ │ ENVMGR (Environment & Inventory) │ │
|
||||
│ │ ├─environment-manager Environment CRUD, ordering, config │ │
|
||||
│ │ ├─target-registry Deployment targets (hosts/services) │ │
|
||||
│ │ ├─agent-manager Agent registration, health, capabilities │ │
|
||||
│ │ └─inventory-sync Drift detection, state reconciliation │ │
|
||||
│ │ │ │
|
||||
│ │ RELMAN (Release Management) │ │
|
||||
│ │ ├─component-registry Image repos → components mapping │ │
|
||||
│ │ ├─version-manager Tag/digest → semver mapping │ │
|
||||
│ │ ├─release-manager Release bundle lifecycle │ │
|
||||
│ │ └─release-catalog Release history, search, compare │ │
|
||||
│ │ │ │
|
||||
│ │ WORKFL (Workflow Engine) │ │
|
||||
│ │ ├─workflow-designer Template creation, step graph editor │ │
|
||||
│ │ ├─workflow-engine DAG execution, state machine │ │
|
||||
│ │ ├─step-executor Step dispatch, retry, timeout │ │
|
||||
│ │ └─step-registry Built-in + plugin-provided steps │ │
|
||||
│ │ │ │
|
||||
│ │ PROMOT (Promotion & Approval) │ │
|
||||
│ │ ├─promotion-manager Promotion request lifecycle │ │
|
||||
│ │ ├─approval-gateway Approval collection, SoD enforcement │ │
|
||||
│ │ ├─decision-engine Gate evaluation, policy integration │ │
|
||||
│ │ └─gate-registry Built-in + custom gates │ │
|
||||
│ │ │ │
|
||||
│ │ DEPLOY (Deployment Execution) │ │
|
||||
│ │ ├─deploy-orchestrator Deployment job coordination │ │
|
||||
│ │ ├─target-executor Target-specific deployment logic │ │
|
||||
│ │ ├─runner-executor Script/hook execution sandbox │ │
|
||||
│ │ ├─artifact-generator Compose/script artifact generation │ │
|
||||
│ │ └─rollback-manager Rollback orchestration │ │
|
||||
│ │ │ │
|
||||
│ │ AGENTS (Deployment Agents) │ │
|
||||
│ │ ├─agent-core Shared agent runtime │ │
|
||||
│ │ ├─agent-docker Docker host agent │ │
|
||||
│ │ ├─agent-compose Docker Compose agent │ │
|
||||
│ │ ├─agent-ssh SSH remote executor │ │
|
||||
│ │ ├─agent-winrm WinRM remote executor │ │
|
||||
│ │ ├─agent-ecs AWS ECS agent │ │
|
||||
│ │ └─agent-nomad HashiCorp Nomad agent │ │
|
||||
│ │ │ │
|
||||
│ │ PROGDL (Progressive Delivery) │ │
|
||||
│ │ ├─ab-manager A/B release coordination │ │
|
||||
│ │ ├─traffic-router Router plugin orchestration │ │
|
||||
│ │ ├─canary-controller Canary ramp automation │ │
|
||||
│ │ └─rollout-strategy Strategy templates │ │
|
||||
│ │ │ │
|
||||
│ │ RELEVI (Release Evidence) │ │
|
||||
│ │ ├─evidence-collector Evidence aggregation │ │
|
||||
│ │ ├─evidence-signer Cryptographic signing │ │
|
||||
│ │ ├─sticker-writer Version sticker generation │ │
|
||||
│ │ └─audit-exporter Compliance report generation │ │
|
||||
│ │ │ │
|
||||
│ │ PLUGIN (Plugin Infrastructure) │ │
|
||||
│ │ ├─plugin-registry Plugin discovery, versioning │ │
|
||||
│ │ ├─plugin-loader Plugin lifecycle management │ │
|
||||
│ │ ├─plugin-sandbox Isolation, resource limits │ │
|
||||
│ │ └─plugin-sdk SDK for plugin development │ │
|
||||
│ └───────────────────────────────────────────────────────────────────────────┘ │
|
||||
└─────────────────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
## Theme Summary
|
||||
|
||||
### Existing Themes (Vulnerability Scanning)
|
||||
|
||||
| Theme | Purpose | Key Modules |
|
||||
|-------|---------|-------------|
|
||||
| **INGEST** | Advisory ingestion | concelier, advisory-ai |
|
||||
| **VEXOPS** | VEX document handling | excititor, linksets |
|
||||
| **REASON** | Policy and decisioning | policy, opa-runtime |
|
||||
| **SCANENG** | Scanning and SBOM | scanner, sbom-gen, reachability |
|
||||
| **EVIDENCE** | Evidence and attestation | locker, export, timeline |
|
||||
| **RUNTIME** | Runtime signals | signals, graph, zastava |
|
||||
| **JOBCTRL** | Job orchestration | scheduler, orchestrator, task-runner |
|
||||
| **OBSERVE** | Observability | notifier, telemetry |
|
||||
| **REPLAY** | Deterministic replay | replay-core |
|
||||
| **DEVEXP** | Developer experience | cli, web-ui, sdk |
|
||||
|
||||
### New Themes (Release Orchestration)
|
||||
|
||||
| Theme | Purpose | Key Modules | Documentation |
|
||||
|-------|---------|-------------|---------------|
|
||||
| **INTHUB** | Integration hub | integration-manager, connection-profiles, connector-runtime, doctor-checks | [Details](integration-hub.md) |
|
||||
| **ENVMGR** | Environment & inventory | environment-manager, target-registry, agent-manager, inventory-sync | [Details](environment-manager.md) |
|
||||
| **RELMAN** | Release management | component-registry, version-manager, release-manager, release-catalog | [Details](release-manager.md) |
|
||||
| **WORKFL** | Workflow engine | workflow-designer, workflow-engine, step-executor, step-registry | [Details](workflow-engine.md) |
|
||||
| **PROMOT** | Promotion & approval | promotion-manager, approval-gateway, decision-engine, gate-registry | [Details](promotion-manager.md) |
|
||||
| **DEPLOY** | Deployment execution | deploy-orchestrator, target-executor, runner-executor, artifact-generator, rollback-manager | [Details](deploy-orchestrator.md) |
|
||||
| **AGENTS** | Deployment agents | agent-core, agent-docker, agent-compose, agent-ssh, agent-winrm, agent-ecs, agent-nomad | [Details](agents.md) |
|
||||
| **PROGDL** | Progressive delivery | ab-manager, traffic-router, canary-controller, rollout-strategy | [Details](progressive-delivery.md) |
|
||||
| **RELEVI** | Release evidence | evidence-collector, evidence-signer, sticker-writer, audit-exporter | [Details](evidence.md) |
|
||||
| **PLUGIN** | Plugin infrastructure | plugin-registry, plugin-loader, plugin-sandbox, plugin-sdk | [Details](plugin-system.md) |
|
||||
|
||||
## Module Dependencies
|
||||
|
||||
```
|
||||
┌──────────────┐
|
||||
│ AUTHORITY │
|
||||
└──────┬───────┘
|
||||
│
|
||||
┌──────────────────┼──────────────────┐
|
||||
│ │ │
|
||||
▼ ▼ ▼
|
||||
┌───────────────┐ ┌───────────────┐ ┌───────────────┐
|
||||
│ INTHUB │ │ ENVMGR │ │ PLUGIN │
|
||||
│ (Integrations)│ │ (Environments)│ │ (Plugins) │
|
||||
└───────┬───────┘ └───────┬───────┘ └───────┬───────┘
|
||||
│ │ │
|
||||
└──────────┬───────┴──────────────────┘
|
||||
│
|
||||
▼
|
||||
┌───────────────┐
|
||||
│ RELMAN │
|
||||
│ (Releases) │
|
||||
└───────┬───────┘
|
||||
│
|
||||
▼
|
||||
┌───────────────┐
|
||||
│ WORKFL │
|
||||
│ (Workflows) │
|
||||
└───────┬───────┘
|
||||
│
|
||||
┌──────────┴──────────┐
|
||||
│ │
|
||||
▼ ▼
|
||||
┌───────────────┐ ┌───────────────┐
|
||||
│ PROMOT │ │ DEPLOY │
|
||||
│ (Promotion) │ │ (Deployment) │
|
||||
└───────┬───────┘ └───────┬───────┘
|
||||
│ │
|
||||
│ ▼
|
||||
│ ┌───────────────┐
|
||||
│ │ AGENTS │
|
||||
│ │ (Agents) │
|
||||
│ └───────┬───────┘
|
||||
│ │
|
||||
└──────────┬──────────┘
|
||||
│
|
||||
▼
|
||||
┌───────────────┐
|
||||
│ RELEVI │
|
||||
│ (Evidence) │
|
||||
└───────────────┘
|
||||
```
|
||||
|
||||
## Communication Patterns
|
||||
|
||||
| Pattern | Usage |
|
||||
|---------|-------|
|
||||
| **Synchronous API** | User-initiated operations (CRUD, queries) |
|
||||
| **Event Bus** | Cross-module notifications (domain events) |
|
||||
| **Task Queue** | Long-running operations (deployments, syncs) |
|
||||
| **WebSocket/SSE** | Real-time UI updates |
|
||||
| **gRPC Streams** | Agent communication |
|
||||
|
||||
## Database Schema Organization
|
||||
|
||||
Each theme owns a group of tables within the shared `release` PostgreSQL schema (listed below by table prefix):
|
||||
|
||||
| Schema | Owner Theme |
|
||||
|--------|-------------|
|
||||
| `release.integrations` | INTHUB |
|
||||
| `release.environments` | ENVMGR |
|
||||
| `release.components` | RELMAN |
|
||||
| `release.workflows` | WORKFL |
|
||||
| `release.promotions` | PROMOT |
|
||||
| `release.deployments` | DEPLOY |
|
||||
| `release.agents` | AGENTS |
|
||||
| `release.evidence` | RELEVI |
|
||||
| `release.plugins` | PLUGIN |
|
||||
629
docs/modules/release-orchestrator/modules/plugin-system.md
Normal file
629
docs/modules/release-orchestrator/modules/plugin-system.md
Normal file
@@ -0,0 +1,629 @@
|
||||
# PLUGIN: Plugin Infrastructure
|
||||
|
||||
**Purpose**: Extensible plugin system for integrations, steps, and custom functionality.
|
||||
|
||||
## Architecture Overview
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────────────────┐
|
||||
│ PLUGIN ARCHITECTURE │
|
||||
│ │
|
||||
│ ┌─────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ PLUGIN REGISTRY │ │
|
||||
│ │ │ │
|
||||
│ │ - Plugin discovery and versioning │ │
|
||||
│ │ - Manifest validation │ │
|
||||
│ │ - Dependency resolution │ │
|
||||
│ └──────────────────────────────┬──────────────────────────────────────┘ │
|
||||
│ │ │
|
||||
│ ▼ │
|
||||
│ ┌─────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ PLUGIN LOADER │ │
|
||||
│ │ │ │
|
||||
│ │ - Lifecycle management (load, start, stop, unload) │ │
|
||||
│ │ - Health monitoring │ │
|
||||
│ │ - Hot reload support │ │
|
||||
│ └──────────────────────────────┬──────────────────────────────────────┘ │
|
||||
│ │ │
|
||||
│ ▼ │
|
||||
│ ┌─────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ PLUGIN SANDBOX │ │
|
||||
│ │ │ │
|
||||
│ │ - Process isolation │ │
|
||||
│ │ - Resource limits (CPU, memory, network) │ │
|
||||
│ │ - Capability enforcement │ │
|
||||
│ └─────────────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ Plugin Types: │
|
||||
│ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │
|
||||
│ │ Connector │ │ Step │ │ Gate │ │ Agent │ │
|
||||
│ │ Plugins │ │ Providers │ │ Providers │ │ Plugins │ │
|
||||
│ └──────────────┘ └──────────────┘ └──────────────┘ └──────────────┘ │
|
||||
│ │
|
||||
└─────────────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
## Modules
|
||||
|
||||
### Module: `plugin-registry`
|
||||
|
||||
| Aspect | Specification |
|
||||
|--------|---------------|
|
||||
| **Responsibility** | Plugin discovery; versioning; manifest management |
|
||||
| **Data Entities** | `Plugin`, `PluginManifest`, `PluginVersion` |
|
||||
| **Events Produced** | `plugin.discovered`, `plugin.registered`, `plugin.unregistered` |
|
||||
|
||||
**Plugin Entity**:
|
||||
```typescript
|
||||
interface Plugin {
|
||||
id: UUID;
|
||||
pluginId: string; // "com.example.my-connector"
|
||||
version: string; // "1.2.3"
|
||||
vendor: string;
|
||||
license: string;
|
||||
manifest: PluginManifest;
|
||||
status: PluginStatus;
|
||||
entrypoint: string; // Path to plugin executable/module
|
||||
lastHealthCheck: DateTime;
|
||||
healthMessage: string | null;
|
||||
installedAt: DateTime;
|
||||
updatedAt: DateTime;
|
||||
}
|
||||
|
||||
type PluginStatus =
|
||||
| "discovered" // Found but not loaded
|
||||
| "loaded" // Loaded but not active
|
||||
| "active" // Running and healthy
|
||||
| "stopped" // Manually stopped
|
||||
| "failed" // Failed to load or crashed
|
||||
| "degraded"; // Running but with issues
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Module: `plugin-loader`
|
||||
|
||||
| Aspect | Specification |
|
||||
|--------|---------------|
|
||||
| **Responsibility** | Plugin lifecycle management |
|
||||
| **Dependencies** | `plugin-registry`, `plugin-sandbox` |
|
||||
| **Events Produced** | `plugin.loaded`, `plugin.started`, `plugin.stopped`, `plugin.failed` |
|
||||
|
||||
**Plugin Lifecycle**:
|
||||
```
|
||||
┌──────────────┐
|
||||
│ DISCOVERED │ ──── Plugin found in registry
|
||||
└──────┬───────┘
|
||||
│ load()
|
||||
▼
|
||||
┌──────────────┐
|
||||
│ LOADED │ ──── Plugin validated and prepared
|
||||
└──────┬───────┘
|
||||
│ start()
|
||||
▼
|
||||
┌──────────────┐ ┌──────────────┐
|
||||
│ ACTIVE │ ──── │ DEGRADED │ ◄── Health issues
|
||||
└──────┬───────┘ └──────────────┘
|
||||
│ stop() │
|
||||
▼ │
|
||||
┌──────────────┐ │
|
||||
│ STOPPED │ ◄───────────┘ manual stop
|
||||
└──────────────┘
|
||||
|
||||
│ unload()
|
||||
▼
|
||||
┌──────────────┐
|
||||
│ UNLOADED │
|
||||
└──────────────┘
|
||||
```
|
||||
|
||||
**Lifecycle Operations**:
|
||||
```typescript
|
||||
interface PluginLoader {
|
||||
// Discovery
|
||||
discover(): Promise<Plugin[]>;
|
||||
refresh(): Promise<void>;
|
||||
|
||||
// Lifecycle
|
||||
load(pluginId: string): Promise<Plugin>;
|
||||
start(pluginId: string): Promise<void>;
|
||||
stop(pluginId: string): Promise<void>;
|
||||
unload(pluginId: string): Promise<void>;
|
||||
restart(pluginId: string): Promise<void>;
|
||||
|
||||
// Health
|
||||
checkHealth(pluginId: string): Promise<HealthStatus>;
|
||||
getStatus(pluginId: string): Promise<PluginStatus>;
|
||||
|
||||
// Hot reload
|
||||
reload(pluginId: string): Promise<void>;
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Module: `plugin-sandbox`
|
||||
|
||||
| Aspect | Specification |
|
||||
|--------|---------------|
|
||||
| **Responsibility** | Isolation; resource limits; security |
|
||||
| **Enforcement** | Process isolation, capability-based security |
|
||||
|
||||
**Sandbox Configuration**:
|
||||
```typescript
|
||||
interface SandboxConfig {
|
||||
// Process isolation
|
||||
processIsolation: boolean; // Run in separate process
|
||||
containerIsolation: boolean; // Run in container
|
||||
|
||||
// Resource limits
|
||||
resourceLimits: {
|
||||
maxMemoryMb: number; // Memory limit
|
||||
maxCpuPercent: number; // CPU limit
|
||||
maxDiskMb: number; // Disk quota
|
||||
maxNetworkBandwidth: number; // Network bandwidth limit
|
||||
};
|
||||
|
||||
// Network restrictions
|
||||
networkPolicy: {
|
||||
allowedHosts: string[]; // Allowed outbound hosts
|
||||
blockedHosts: string[]; // Blocked hosts
|
||||
allowOutbound: boolean; // Allow any outbound
|
||||
};
|
||||
|
||||
// Filesystem restrictions
|
||||
filesystemPolicy: {
|
||||
readOnlyPaths: string[];
|
||||
writablePaths: string[];
|
||||
blockedPaths: string[];
|
||||
};
|
||||
|
||||
// Timeouts
|
||||
timeouts: {
|
||||
initializationMs: number;
|
||||
operationMs: number;
|
||||
shutdownMs: number;
|
||||
};
|
||||
}
|
||||
```
|
||||
|
||||
**Capability Enforcement**:
|
||||
```typescript
|
||||
interface PluginCapabilities {
|
||||
// Integration capabilities
|
||||
integrations: {
|
||||
scm: boolean;
|
||||
ci: boolean;
|
||||
registry: boolean;
|
||||
vault: boolean;
|
||||
router: boolean;
|
||||
};
|
||||
|
||||
// Step capabilities
|
||||
steps: {
|
||||
deploy: boolean;
|
||||
gate: boolean;
|
||||
notify: boolean;
|
||||
custom: boolean;
|
||||
};
|
||||
|
||||
// System capabilities
|
||||
system: {
|
||||
network: boolean;
|
||||
filesystem: boolean;
|
||||
secrets: boolean;
|
||||
database: boolean;
|
||||
};
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Module: `plugin-sdk`
|
||||
|
||||
| Aspect | Specification |
|
||||
|--------|---------------|
|
||||
| **Responsibility** | SDK for plugin development |
|
||||
| **Languages** | C#, TypeScript, Go |
|
||||
|
||||
**Plugin SDK Interface**:
|
||||
```typescript
|
||||
// Base plugin interface
|
||||
interface StellaPlugin {
|
||||
// Lifecycle
|
||||
initialize(config: PluginConfig): Promise<void>;
|
||||
start(): Promise<void>;
|
||||
stop(): Promise<void>;
|
||||
dispose(): Promise<void>;
|
||||
|
||||
// Health
|
||||
getHealth(): Promise<HealthStatus>;
|
||||
|
||||
// Metadata
|
||||
getManifest(): PluginManifest;
|
||||
}
|
||||
|
||||
// Connector plugin interface
|
||||
interface ConnectorPlugin extends StellaPlugin {
|
||||
createConnector(config: ConnectorConfig): Promise<Connector>;
|
||||
}
|
||||
|
||||
// Step provider plugin interface
|
||||
interface StepProviderPlugin extends StellaPlugin {
|
||||
getStepTypes(): StepType[];
|
||||
executeStep(
|
||||
stepType: string,
|
||||
config: StepConfig,
|
||||
inputs: StepInputs,
|
||||
context: StepContext
|
||||
): AsyncGenerator<StepEvent>;
|
||||
}
|
||||
|
||||
// Gate provider plugin interface
|
||||
interface GateProviderPlugin extends StellaPlugin {
|
||||
getGateTypes(): GateType[];
|
||||
evaluateGate(
|
||||
gateType: string,
|
||||
config: GateConfig,
|
||||
context: GateContext
|
||||
): Promise<GateResult>;
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Three-Surface Plugin Model
|
||||
|
||||
Plugins contribute to the system through three distinct surfaces:
|
||||
|
||||
### 1. Manifest Surface (Static)
|
||||
|
||||
The plugin manifest declares:
|
||||
- Plugin identity and version
|
||||
- Required capabilities
|
||||
- Provided integrations/steps/gates
|
||||
- Configuration schema
|
||||
- UI components (optional)
|
||||
|
||||
```yaml
|
||||
# plugin.stella.yaml
|
||||
plugin:
|
||||
id: "com.example.jenkins-connector"
|
||||
version: "1.0.0"
|
||||
vendor: "Example Corp"
|
||||
license: "Apache-2.0"
|
||||
description: "Jenkins CI integration for Stella Ops"
|
||||
|
||||
capabilities:
|
||||
required:
|
||||
- network
|
||||
optional:
|
||||
- secrets
|
||||
|
||||
provides:
|
||||
integrations:
|
||||
- type: "ci.jenkins"
|
||||
displayName: "Jenkins"
|
||||
configSchema: "./schemas/jenkins-config.json"
|
||||
capabilities:
|
||||
- "pipelines"
|
||||
- "builds"
|
||||
- "artifacts"
|
||||
|
||||
steps:
|
||||
- type: "jenkins-trigger"
|
||||
displayName: "Trigger Jenkins Build"
|
||||
category: "integration"
|
||||
configSchema: "./schemas/jenkins-trigger-config.json"
|
||||
inputSchema: "./schemas/jenkins-trigger-input.json"
|
||||
outputSchema: "./schemas/jenkins-trigger-output.json"
|
||||
|
||||
ui:
|
||||
configScreen: "./ui/config.html"
|
||||
icon: "./assets/jenkins-icon.svg"
|
||||
|
||||
dependencies:
|
||||
stellaCore: ">=1.0.0"
|
||||
```
|
||||
|
||||
### 2. Connector Runtime Surface (Dynamic)
|
||||
|
||||
Plugins implement connector interfaces for runtime operations:
|
||||
|
||||
```typescript
|
||||
// Jenkins connector implementation
|
||||
class JenkinsConnector implements CIConnector {
|
||||
private client: JenkinsClient;
|
||||
|
||||
async initialize(config: ConnectorConfig, secrets: SecretHandle[]): Promise<void> {
|
||||
const apiToken = await this.getSecret(secrets, "api_token");
|
||||
this.client = new JenkinsClient({
|
||||
baseUrl: config.endpoint,
|
||||
username: config.username,
|
||||
apiToken: apiToken,
|
||||
});
|
||||
}
|
||||
|
||||
async testConnection(): Promise<ConnectionTestResult> {
|
||||
try {
|
||||
const crumb = await this.client.getCrumb();
|
||||
return { success: true, message: "Connected to Jenkins" };
|
||||
} catch (error) {
|
||||
return { success: false, message: error.message };
|
||||
}
|
||||
}
|
||||
|
||||
async listPipelines(): Promise<PipelineInfo[]> {
|
||||
const jobs = await this.client.getJobs();
|
||||
return jobs.map(job => ({
|
||||
id: job.name,
|
||||
name: job.displayName,
|
||||
url: job.url,
|
||||
lastBuild: job.lastBuild?.number,
|
||||
}));
|
||||
}
|
||||
|
||||
async triggerPipeline(pipelineId: string, params: object): Promise<PipelineRun> {
|
||||
const queueItem = await this.client.build(pipelineId, params);
|
||||
return {
|
||||
id: queueItem.id.toString(),
|
||||
pipelineId,
|
||||
status: "queued",
|
||||
startedAt: new Date(),
|
||||
};
|
||||
}
|
||||
|
||||
async getPipelineRun(runId: string): Promise<PipelineRun> {
|
||||
const build = await this.client.getBuild(runId);
|
||||
return {
|
||||
id: build.number.toString(),
|
||||
pipelineId: build.job,
|
||||
status: this.mapStatus(build.result),
|
||||
startedAt: new Date(build.timestamp),
|
||||
completedAt: build.result ? new Date(build.timestamp + build.duration) : null,
|
||||
};
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### 3. Step Provider Surface (Execution)
|
||||
|
||||
Plugins implement step execution logic:
|
||||
|
||||
```typescript
|
||||
// Jenkins trigger step implementation
|
||||
class JenkinsTriggerStep implements StepExecutor {
|
||||
async *execute(
|
||||
config: StepConfig,
|
||||
inputs: StepInputs,
|
||||
context: StepContext
|
||||
): AsyncGenerator<StepEvent> {
|
||||
const connector = await context.getConnector<JenkinsConnector>(config.integrationId);
|
||||
|
||||
yield { type: "log", line: `Triggering Jenkins job: ${config.jobName}` };
|
||||
|
||||
// Trigger build
|
||||
const run = await connector.triggerPipeline(config.jobName, inputs.parameters);
|
||||
yield { type: "output", name: "buildId", value: run.id };
|
||||
yield { type: "log", line: `Build queued: ${run.id}` };
|
||||
|
||||
// Wait for completion if configured
|
||||
if (config.waitForCompletion) {
|
||||
yield { type: "log", line: "Waiting for build to complete..." };
|
||||
|
||||
while (true) {
|
||||
const status = await connector.getPipelineRun(run.id);
|
||||
|
||||
if (status.status === "succeeded") {
|
||||
yield { type: "output", name: "status", value: "succeeded" };
|
||||
yield { type: "result", success: true };
|
||||
return;
|
||||
}
|
||||
|
||||
if (status.status === "failed") {
|
||||
yield { type: "output", name: "status", value: "failed" };
|
||||
yield { type: "result", success: false, message: "Build failed" };
|
||||
return;
|
||||
}
|
||||
|
||||
yield { type: "progress", progress: 50, message: `Build running: ${status.status}` };
|
||||
await sleep(config.pollIntervalSeconds * 1000);
|
||||
}
|
||||
}
|
||||
|
||||
yield { type: "result", success: true };
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Database Schema
|
||||
|
||||
```sql
|
||||
-- Plugins
|
||||
CREATE TABLE release.plugins (
|
||||
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
||||
plugin_id VARCHAR(255) NOT NULL UNIQUE,
|
||||
version VARCHAR(50) NOT NULL,
|
||||
vendor VARCHAR(255) NOT NULL,
|
||||
license VARCHAR(100),
|
||||
manifest JSONB NOT NULL,
|
||||
status VARCHAR(50) NOT NULL DEFAULT 'discovered' CHECK (status IN (
|
||||
'discovered', 'loaded', 'active', 'stopped', 'failed', 'degraded'
|
||||
)),
|
||||
entrypoint VARCHAR(500) NOT NULL,
|
||||
last_health_check TIMESTAMPTZ,
|
||||
health_message TEXT,
|
||||
installed_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
|
||||
);
|
||||
|
||||
CREATE INDEX idx_plugins_status ON release.plugins(status);
|
||||
|
||||
-- Plugin Instances (per-tenant configuration)
|
||||
CREATE TABLE release.plugin_instances (
|
||||
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
||||
plugin_id UUID NOT NULL REFERENCES release.plugins(id) ON DELETE CASCADE,
|
||||
tenant_id UUID NOT NULL REFERENCES tenants(id) ON DELETE CASCADE,
|
||||
config JSONB NOT NULL DEFAULT '{}',
|
||||
enabled BOOLEAN NOT NULL DEFAULT TRUE,
|
||||
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
|
||||
);
|
||||
|
||||
CREATE INDEX idx_plugin_instances_tenant ON release.plugin_instances(tenant_id);
|
||||
|
||||
-- Integration types (populated by plugins)
|
||||
CREATE TABLE release.integration_types (
|
||||
id TEXT PRIMARY KEY, -- "scm.github", "ci.jenkins"
|
||||
plugin_id UUID REFERENCES release.plugins(id),
|
||||
display_name TEXT NOT NULL,
|
||||
description TEXT,
|
||||
icon_url TEXT,
|
||||
config_schema JSONB NOT NULL, -- JSON Schema for config
|
||||
capabilities TEXT[] NOT NULL, -- ["repos", "webhooks", "status"]
|
||||
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
|
||||
);
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## API Endpoints
|
||||
|
||||
```yaml
|
||||
# Plugin Registry
|
||||
GET /api/v1/plugins
|
||||
Query: ?status={status}&capability={type}
|
||||
Response: Plugin[]
|
||||
|
||||
GET /api/v1/plugins/{id}
|
||||
Response: Plugin (with manifest)
|
||||
|
||||
POST /api/v1/plugins/{id}/enable
|
||||
Response: Plugin
|
||||
|
||||
POST /api/v1/plugins/{id}/disable
|
||||
Response: Plugin
|
||||
|
||||
GET /api/v1/plugins/{id}/health
|
||||
Response: { status, message, diagnostics[] }
|
||||
|
||||
# Plugin Instances (per-tenant config)
|
||||
POST /api/v1/plugin-instances
|
||||
Body: { pluginId: UUID, config: object }
|
||||
Response: PluginInstance
|
||||
|
||||
GET /api/v1/plugin-instances
|
||||
Response: PluginInstance[]
|
||||
|
||||
PUT /api/v1/plugin-instances/{id}
|
||||
Body: { config: object, enabled: boolean }
|
||||
Response: PluginInstance
|
||||
|
||||
DELETE /api/v1/plugin-instances/{id}
|
||||
Response: { deleted: true }
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Plugin Security
|
||||
|
||||
### Capability Declaration
|
||||
|
||||
Plugins must declare all required capabilities in their manifest. The system enforces:
|
||||
|
||||
1. **Network Access**: Plugins can only access declared hosts
|
||||
2. **Secret Access**: Plugins receive secrets through controlled injection
|
||||
3. **Database Access**: No direct database access; API only
|
||||
4. **Filesystem Access**: Limited to declared paths
|
||||
|
||||
### Sandbox Enforcement
|
||||
|
||||
```typescript
|
||||
// Plugin execution is sandboxed
|
||||
class PluginSandbox {
|
||||
async execute<T>(
|
||||
plugin: Plugin,
|
||||
operation: () => Promise<T>
|
||||
): Promise<T> {
|
||||
// 1. Verify capabilities
|
||||
this.verifyCapabilities(plugin);
|
||||
|
||||
// 2. Set resource limits
|
||||
const limits = this.getResourceLimits(plugin);
|
||||
await this.applyLimits(limits);
|
||||
|
||||
// 3. Create isolated context
|
||||
const context = await this.createIsolatedContext(plugin);
|
||||
|
||||
try {
|
||||
// 4. Execute with timeout
|
||||
return await this.withTimeout(
|
||||
operation(),
|
||||
plugin.manifest.timeouts.operationMs
|
||||
);
|
||||
} catch (error) {
|
||||
// 5. Log and handle errors
|
||||
await this.handlePluginError(plugin, error);
|
||||
throw error;
|
||||
} finally {
|
||||
// 6. Cleanup
|
||||
await context.dispose();
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Plugin Failures Cannot Crash Core
|
||||
|
||||
```csharp
|
||||
// Core orchestration is protected from plugin failures
|
||||
// Core orchestration is protected from plugin failures
public sealed class PromotionDecisionEngine
{
    /// <summary>
    /// Evaluates all gates for a promotion and aggregates their results into
    /// a single decision. Gate providers are plugin-supplied; any exception a
    /// provider throws is converted into a failed <see cref="GateResult"/> so
    /// a misbehaving plugin can never crash core orchestration.
    /// </summary>
    public async Task<DecisionResult> EvaluateAsync(
        Promotion promotion,
        IReadOnlyList<IGateProvider> gates,
        CancellationToken ct)
    {
        var results = new List<GateResult>();

        foreach (var gate in gates)
        {
            GateResult result;
            try
            {
                // Plugin provides evaluation logic
                result = await gate.EvaluateAsync(promotion, ct);
            }
            catch (Exception ex)
            {
                // Plugin failure is logged but doesn't crash core
                _logger.LogError(ex, "Gate {GateType} failed", gate.Type);
                result = new GateResult
                {
                    GateType = gate.Type,
                    Status = GateStatus.Failed,
                    Message = $"Gate evaluation failed: {ex.Message}",
                    IsBlocking = gate.IsBlocking,
                };
            }

            results.Add(result);

            // Fail fast only when a *blocking* gate actually failed. The
            // previous check broke out for any blocking gate, which skipped
            // the remaining gates even when the blocking gate had passed.
            if (_policy.FailFast && result.IsBlocking && result.Status == GateStatus.Failed)
                break;
        }

        // Core decides how to aggregate (plugins cannot override)
        return _decisionAggregator.Aggregate(results);
    }
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## References
|
||||
|
||||
- [Module Overview](overview.md)
|
||||
- [Integration Hub](integration-hub.md)
|
||||
- [Workflow Engine](workflow-engine.md)
|
||||
- [Connector Interface](../integrations/connectors.md)
|
||||
@@ -0,0 +1,471 @@
|
||||
# PROGDL: Progressive Delivery
|
||||
|
||||
**Purpose**: A/B releases, canary deployments, and traffic management.
|
||||
|
||||
## Architecture Overview
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────────────────┐
|
||||
│ PROGRESSIVE DELIVERY ARCHITECTURE │
|
||||
│ │
|
||||
│ ┌─────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ A/B RELEASE MANAGER │ │
|
||||
│ │ │ │
|
||||
│ │ - Create A/B release with variations │ │
|
||||
│ │ - Manage traffic split configuration │ │
|
||||
│ │ - Coordinate rollout stages │ │
|
||||
│ │ - Handle promotion/rollback │ │
|
||||
│ └──────────────────────────────┬──────────────────────────────────────┘ │
|
||||
│ │ │
|
||||
│ ┌──────────────────┴──────────────────┐ │
|
||||
│ │ │ │
|
||||
│ ▼ ▼ │
|
||||
│ ┌───────────────────────┐ ┌───────────────────────┐ │
|
||||
│ │ TARGET-GROUP A/B │ │ ROUTER-BASED A/B │ │
|
||||
│ │ │ │ │ │
|
||||
│ │ Deploy to groups │ │ Configure traffic │ │
|
||||
│ │ by labels/membership │ │ via load balancer │ │
|
||||
│ │ │ │ │ │
|
||||
│ │ Good for: │ │ Good for: │ │
|
||||
│ │ - Background workers │ │ - Web/API traffic │ │
|
||||
│ │ - Batch processors │ │ - Customer-facing │ │
|
||||
│ │ - Internal services │ │ - L7 routing │ │
|
||||
│ └───────────────────────┘ └───────────────────────┘ │
|
||||
│ │
|
||||
│ ┌─────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ CANARY CONTROLLER │ │
|
||||
│ │ │ │
|
||||
│ │ - Execute rollout stages │ │
|
||||
│ │ - Monitor health metrics │ │
|
||||
│ │ - Auto-advance or pause │ │
|
||||
│ │ - Trigger rollback on failure │ │
|
||||
│ └─────────────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ ┌─────────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ TRAFFIC ROUTER INTEGRATION │ │
|
||||
│ │ │ │
|
||||
│ │ Plugin-based integration with: │ │
|
||||
│ │ - Nginx (config generation + reload) │ │
|
||||
│ │ - HAProxy (config generation + reload) │ │
|
||||
│ │ - Traefik (dynamic config API) │ │
|
||||
│ │ - AWS ALB (target group weights) │ │
|
||||
│ │ - Custom (webhook) │ │
|
||||
│ └─────────────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
└─────────────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
## Modules
|
||||
|
||||
### Module: `ab-manager`
|
||||
|
||||
| Aspect | Specification |
|
||||
|--------|---------------|
|
||||
| **Responsibility** | A/B release lifecycle; variation management |
|
||||
| **Dependencies** | `release-manager`, `environment-manager`, `deploy-orchestrator` |
|
||||
| **Data Entities** | `ABRelease`, `Variation`, `TrafficSplit` |
|
||||
| **Events Produced** | `ab.created`, `ab.started`, `ab.stage_advanced`, `ab.promoted`, `ab.rolled_back` |
|
||||
|
||||
**A/B Release Entity**:
|
||||
```typescript
|
||||
interface ABRelease {
|
||||
id: UUID;
|
||||
tenantId: UUID;
|
||||
environmentId: UUID;
|
||||
name: string;
|
||||
variations: Variation[];
|
||||
activeVariation: string; // "A" or "B"
|
||||
trafficSplit: TrafficSplit;
|
||||
rolloutStrategy: RolloutStrategy;
|
||||
status: ABReleaseStatus;
|
||||
createdAt: DateTime;
|
||||
completedAt: DateTime | null;
|
||||
createdBy: UUID;
|
||||
}
|
||||
|
||||
interface Variation {
|
||||
name: string; // "A", "B"
|
||||
releaseId: UUID;
|
||||
targetGroupId: UUID | null; // for target-group based A/B
|
||||
trafficPercentage: number;
|
||||
deploymentJobId: UUID | null;
|
||||
}
|
||||
|
||||
interface TrafficSplit {
|
||||
type: "percentage" | "sticky" | "header";
|
||||
percentages: Record<string, number>; // {"A": 90, "B": 10}
|
||||
stickyKey?: string; // cookie or header name
|
||||
headerMatch?: { // for header-based routing
|
||||
header: string;
|
||||
values: Record<string, string>; // value -> variation
|
||||
};
|
||||
}
|
||||
|
||||
type ABReleaseStatus =
|
||||
| "created" // Configured, not started
|
||||
| "deploying" // Deploying variations
|
||||
| "running" // Active with traffic split
|
||||
| "promoting" // Promoting winner to 100%
|
||||
| "completed" // Successfully completed
|
||||
| "rolled_back"; // Rolled back to original
|
||||
```
|
||||
|
||||
**A/B Release Models**:
|
||||
|
||||
| Model | Description | Use Case |
|
||||
|-------|-------------|----------|
|
||||
| **Target-Group A/B** | Deploy different releases to different target groups | Background workers, internal services |
|
||||
| **Router-Based A/B** | Use load balancer to split traffic | Web/API traffic, customer-facing |
|
||||
| **Hybrid A/B** | Combination of both | Complex deployments |
|
||||
|
||||
---
|
||||
|
||||
### Module: `traffic-router`
|
||||
|
||||
| Aspect | Specification |
|
||||
|--------|---------------|
|
||||
| **Responsibility** | Router plugin orchestration; traffic shifting |
|
||||
| **Dependencies** | `integration-manager`, `connector-runtime` |
|
||||
| **Protocol** | Plugin-specific (API calls, config generation) |
|
||||
|
||||
**Router Connector Interface**:
|
||||
```typescript
|
||||
interface RouterConnector extends BaseConnector {
|
||||
// Traffic management
|
||||
configureRoute(config: RouteConfig): Promise<void>;
|
||||
getTrafficDistribution(): Promise<TrafficDistribution>;
|
||||
shiftTraffic(from: string, to: string, percentage: number): Promise<void>;
|
||||
|
||||
// Configuration
|
||||
reloadConfig(): Promise<void>;
|
||||
validateConfig(config: string): Promise<ValidationResult>;
|
||||
}
|
||||
|
||||
interface RouteConfig {
|
||||
upstream: string;
|
||||
backends: Array<{
|
||||
name: string;
|
||||
targets: string[];
|
||||
weight: number;
|
||||
}>;
|
||||
healthCheck?: {
|
||||
path: string;
|
||||
interval: number;
|
||||
timeout: number;
|
||||
};
|
||||
}
|
||||
|
||||
interface TrafficDistribution {
|
||||
backends: Array<{
|
||||
name: string;
|
||||
weight: number;
|
||||
healthyTargets: number;
|
||||
totalTargets: number;
|
||||
}>;
|
||||
timestamp: DateTime;
|
||||
}
|
||||
```
|
||||
|
||||
**Router Plugins**:
|
||||
|
||||
| Plugin | Capabilities |
|
||||
|--------|-------------|
|
||||
| `router.nginx` | Config generation, reload via signal/API |
|
||||
| `router.haproxy` | Config generation, reload via socket |
|
||||
| `router.traefik` | Dynamic config API |
|
||||
| `router.aws_alb` | Target group weights via AWS API |
|
||||
| `router.custom` | Webhook-based custom integration |
|
||||
|
||||
---
|
||||
|
||||
### Module: `canary-controller`
|
||||
|
||||
| Aspect | Specification |
|
||||
|--------|---------------|
|
||||
| **Responsibility** | Canary ramp automation; health monitoring |
|
||||
| **Dependencies** | `ab-manager`, `traffic-router` |
|
||||
| **Data Entities** | `CanaryStage`, `HealthResult` |
|
||||
| **Events Produced** | `canary.stage_started`, `canary.stage_passed`, `canary.stage_failed` |
|
||||
|
||||
**Canary Stage Entity**:
|
||||
```typescript
|
||||
interface CanaryStage {
|
||||
id: UUID;
|
||||
abReleaseId: UUID;
|
||||
stageNumber: number;
|
||||
trafficPercentage: number;
|
||||
status: CanaryStageStatus;
|
||||
healthThreshold: number; // Required health % to pass
|
||||
durationSeconds: number; // How long to run stage
|
||||
requireApproval: boolean; // Require manual approval
|
||||
startedAt: DateTime | null;
|
||||
completedAt: DateTime | null;
|
||||
healthResult: HealthResult | null;
|
||||
}
|
||||
|
||||
type CanaryStageStatus =
|
||||
| "pending"
|
||||
| "running"
|
||||
| "succeeded"
|
||||
| "failed"
|
||||
| "skipped";
|
||||
|
||||
interface HealthResult {
|
||||
healthy: boolean;
|
||||
healthPercentage: number;
|
||||
metrics: {
|
||||
successRate: number;
|
||||
errorRate: number;
|
||||
latencyP50: number;
|
||||
latencyP99: number;
|
||||
};
|
||||
samples: number;
|
||||
evaluatedAt: DateTime;
|
||||
}
|
||||
```
|
||||
|
||||
**Canary Rollout Execution**:
|
||||
```typescript
|
||||
class CanaryController {
|
||||
async executeRollout(abRelease: ABRelease): Promise<void> {
|
||||
const stages = abRelease.rolloutStrategy.stages;
|
||||
|
||||
for (const stage of stages) {
|
||||
this.log(`Starting canary stage ${stage.stageNumber}: ${stage.trafficPercentage}%`);
|
||||
|
||||
// 1. Shift traffic to canary percentage
|
||||
await this.trafficRouter.shiftTraffic(
|
||||
abRelease.variations[0].name, // baseline
|
||||
abRelease.variations[1].name, // canary
|
||||
stage.trafficPercentage
|
||||
);
|
||||
|
||||
// 2. Update stage status
|
||||
stage.status = "running";
|
||||
stage.startedAt = new Date();
|
||||
await this.save(stage);
|
||||
|
||||
// 3. Wait for stage duration
|
||||
await this.waitForDuration(stage.durationSeconds);
|
||||
|
||||
// 4. Evaluate health
|
||||
const healthResult = await this.evaluateHealth(abRelease, stage);
|
||||
stage.healthResult = healthResult;
|
||||
|
||||
if (!healthResult.healthy || healthResult.healthPercentage < stage.healthThreshold) {
|
||||
stage.status = "failed";
|
||||
await this.save(stage);
|
||||
|
||||
// Rollback
|
||||
await this.rollback(abRelease);
|
||||
throw new CanaryFailedError(`Stage ${stage.stageNumber} failed health check`);
|
||||
}
|
||||
|
||||
// 5. Check if approval required
|
||||
if (stage.requireApproval) {
|
||||
await this.waitForApproval(abRelease, stage);
|
||||
}
|
||||
|
||||
stage.status = "succeeded";
|
||||
stage.completedAt = new Date();
|
||||
await this.save(stage);
|
||||
|
||||
// 6. Check for auto-advance
|
||||
if (!abRelease.rolloutStrategy.autoAdvance) {
|
||||
await this.waitForManualAdvance(abRelease);
|
||||
}
|
||||
}
|
||||
|
||||
// All stages passed - promote canary to 100%
|
||||
await this.promote(abRelease, abRelease.variations[1].name);
|
||||
}
|
||||
|
||||
private async evaluateHealth(abRelease: ABRelease, stage: CanaryStage): Promise<HealthResult> {
|
||||
// Collect metrics from targets
|
||||
const canaryVariation = abRelease.variations.find(v => v.name === "B");
|
||||
const targets = await this.getTargets(canaryVariation.targetGroupId);
|
||||
|
||||
let healthyCount = 0;
|
||||
let totalLatency = 0;
|
||||
let errorCount = 0;
|
||||
|
||||
for (const target of targets) {
|
||||
const health = await this.checkTargetHealth(target);
|
||||
if (health.healthy) healthyCount++;
|
||||
totalLatency += health.latencyMs;
|
||||
if (health.errorRate > 0) errorCount++;
|
||||
}
|
||||
|
||||
return {
|
||||
healthy: healthyCount >= targets.length * (stage.healthThreshold / 100),
|
||||
healthPercentage: (healthyCount / targets.length) * 100,
|
||||
metrics: {
|
||||
successRate: ((targets.length - errorCount) / targets.length) * 100,
|
||||
errorRate: (errorCount / targets.length) * 100,
|
||||
latencyP50: totalLatency / targets.length,
|
||||
latencyP99: totalLatency / targets.length * 1.5, // simplified
|
||||
},
|
||||
samples: targets.length,
|
||||
evaluatedAt: new Date(),
|
||||
};
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Module: `rollout-strategy`
|
||||
|
||||
| Aspect | Specification |
|
||||
|--------|---------------|
|
||||
| **Responsibility** | Strategy templates; configuration |
|
||||
| **Data Entities** | `RolloutStrategyTemplate` |
|
||||
|
||||
**Built-in Strategy Templates**:
|
||||
|
||||
| Template | Stages | Description |
|
||||
|----------|--------|-------------|
|
||||
| `canary-10-25-50-100` | 4 | Standard canary: 10%, 25%, 50%, 100% |
|
||||
| `canary-1-5-10-50-100` | 5 | Conservative: 1%, 5%, 10%, 50%, 100% |
|
||||
| `blue-green-instant` | 2 | Deploy 100% to green, instant switch |
|
||||
| `blue-green-gradual` | 4 | Gradual shift: 25%, 50%, 75%, 100% |
|
||||
|
||||
**Rollout Strategy Definition**:
|
||||
```typescript
|
||||
interface RolloutStrategy {
|
||||
id: UUID;
|
||||
name: string;
|
||||
stages: Array<{
|
||||
trafficPercentage: number;
|
||||
durationSeconds: number;
|
||||
healthThreshold: number;
|
||||
requireApproval: boolean;
|
||||
}>;
|
||||
autoAdvance: boolean;
|
||||
rollbackOnFailure: boolean;
|
||||
healthCheckInterval: number;
|
||||
}
|
||||
|
||||
// Example: Standard Canary
|
||||
const standardCanary: RolloutStrategy = {
|
||||
name: "canary-10-25-50-100",
|
||||
stages: [
|
||||
{ trafficPercentage: 10, durationSeconds: 300, healthThreshold: 95, requireApproval: false },
|
||||
{ trafficPercentage: 25, durationSeconds: 600, healthThreshold: 95, requireApproval: false },
|
||||
{ trafficPercentage: 50, durationSeconds: 900, healthThreshold: 95, requireApproval: true },
|
||||
{ trafficPercentage: 100, durationSeconds: 0, healthThreshold: 95, requireApproval: false },
|
||||
],
|
||||
autoAdvance: true,
|
||||
rollbackOnFailure: true,
|
||||
healthCheckInterval: 30,
|
||||
};
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Database Schema
|
||||
|
||||
```sql
|
||||
-- A/B Releases
|
||||
CREATE TABLE release.ab_releases (
|
||||
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
||||
tenant_id UUID NOT NULL REFERENCES tenants(id) ON DELETE CASCADE,
|
||||
environment_id UUID NOT NULL REFERENCES release.environments(id),
|
||||
name VARCHAR(255) NOT NULL,
|
||||
variations JSONB NOT NULL, -- [{name, releaseId, targetGroupId, trafficPercentage}]
|
||||
active_variation VARCHAR(50) NOT NULL DEFAULT 'A',
|
||||
traffic_split JSONB NOT NULL,
|
||||
rollout_strategy JSONB NOT NULL,
|
||||
status VARCHAR(50) NOT NULL DEFAULT 'created' CHECK (status IN (
|
||||
'created', 'deploying', 'running', 'promoting', 'completed', 'rolled_back'
|
||||
)),
|
||||
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||
completed_at TIMESTAMPTZ,
|
||||
created_by UUID REFERENCES users(id)
|
||||
);
|
||||
|
||||
CREATE INDEX idx_ab_releases_tenant_env ON release.ab_releases(tenant_id, environment_id);
|
||||
CREATE INDEX idx_ab_releases_status ON release.ab_releases(status);
|
||||
|
||||
-- Canary Stages
|
||||
CREATE TABLE release.canary_stages (
|
||||
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
||||
ab_release_id UUID NOT NULL REFERENCES release.ab_releases(id) ON DELETE CASCADE,
|
||||
stage_number INTEGER NOT NULL,
|
||||
traffic_percentage INTEGER NOT NULL,
|
||||
status VARCHAR(50) NOT NULL DEFAULT 'pending' CHECK (status IN (
|
||||
'pending', 'running', 'succeeded', 'failed', 'skipped'
|
||||
)),
|
||||
health_threshold DECIMAL(5,2),
|
||||
duration_seconds INTEGER,
|
||||
require_approval BOOLEAN NOT NULL DEFAULT FALSE,
|
||||
started_at TIMESTAMPTZ,
|
||||
completed_at TIMESTAMPTZ,
|
||||
health_result JSONB,
|
||||
UNIQUE (ab_release_id, stage_number)
|
||||
);
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## API Endpoints
|
||||
|
||||
```yaml
|
||||
# A/B Releases
|
||||
POST /api/v1/ab-releases
|
||||
Body: {
|
||||
environmentId: UUID,
|
||||
name: string,
|
||||
variations: [
|
||||
{ name: "A", releaseId: UUID, targetGroupId?: UUID },
|
||||
{ name: "B", releaseId: UUID, targetGroupId?: UUID }
|
||||
],
|
||||
trafficSplit: TrafficSplit,
|
||||
rolloutStrategy: RolloutStrategy
|
||||
}
|
||||
Response: ABRelease
|
||||
|
||||
GET /api/v1/ab-releases
|
||||
Query: ?environmentId={uuid}&status={status}
|
||||
Response: ABRelease[]
|
||||
|
||||
GET /api/v1/ab-releases/{id}
|
||||
Response: ABRelease (with stages)
|
||||
|
||||
POST /api/v1/ab-releases/{id}/start
|
||||
Response: ABRelease
|
||||
|
||||
POST /api/v1/ab-releases/{id}/advance
|
||||
Body: { stageNumber?: number } # advance to next or specific stage
|
||||
Response: ABRelease
|
||||
|
||||
POST /api/v1/ab-releases/{id}/promote
|
||||
Body: { variation: "A" | "B" } # promote to 100%
|
||||
Response: ABRelease
|
||||
|
||||
POST /api/v1/ab-releases/{id}/rollback
|
||||
Response: ABRelease
|
||||
|
||||
GET /api/v1/ab-releases/{id}/traffic
|
||||
Response: { currentSplit: TrafficDistribution, history: TrafficHistory[] }
|
||||
|
||||
GET /api/v1/ab-releases/{id}/health
|
||||
Response: { variations: [{ name, healthStatus, metrics }] }
|
||||
|
||||
# Rollout Strategies
|
||||
GET /api/v1/rollout-strategies
|
||||
Response: RolloutStrategyTemplate[]
|
||||
|
||||
GET /api/v1/rollout-strategies/{id}
|
||||
Response: RolloutStrategyTemplate
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## References
|
||||
|
||||
- [Module Overview](overview.md)
|
||||
- [Deploy Orchestrator](deploy-orchestrator.md)
|
||||
- [A/B Releases](../progressive-delivery/ab-releases.md)
|
||||
- [Canary Controller](../progressive-delivery/canary.md)
|
||||
- [Router Plugins](../progressive-delivery/routers.md)
|
||||
433
docs/modules/release-orchestrator/modules/promotion-manager.md
Normal file
433
docs/modules/release-orchestrator/modules/promotion-manager.md
Normal file
@@ -0,0 +1,433 @@
|
||||
# PROMOT: Promotion & Approval Manager
|
||||
|
||||
**Purpose**: Manage promotion requests, approvals, gates, and decision records.
|
||||
|
||||
## Modules
|
||||
|
||||
### Module: `promotion-manager`
|
||||
|
||||
| Aspect | Specification |
|
||||
|--------|---------------|
|
||||
| **Responsibility** | Promotion request lifecycle; state management |
|
||||
| **Dependencies** | `release-manager`, `environment-manager`, `workflow-engine` |
|
||||
| **Data Entities** | `Promotion`, `PromotionState` |
|
||||
| **Events Produced** | `promotion.requested`, `promotion.approved`, `promotion.rejected`, `promotion.started`, `promotion.completed`, `promotion.failed`, `promotion.rolled_back` |
|
||||
|
||||
**Key Operations**:
|
||||
```
|
||||
RequestPromotion(releaseId, targetEnvironmentId, reason) → Promotion
|
||||
ApprovePromotion(promotionId, comment) → Promotion
|
||||
RejectPromotion(promotionId, reason) → Promotion
|
||||
CancelPromotion(promotionId) → Promotion
|
||||
GetPromotionStatus(promotionId) → PromotionState
|
||||
GetDecisionRecord(promotionId) → DecisionRecord
|
||||
```
|
||||
|
||||
**Promotion Entity**:
|
||||
```typescript
|
||||
interface Promotion {
|
||||
id: UUID;
|
||||
tenantId: UUID;
|
||||
releaseId: UUID;
|
||||
sourceEnvironmentId: UUID | null; // null for first deployment
|
||||
targetEnvironmentId: UUID;
|
||||
status: PromotionStatus;
|
||||
decisionRecord: DecisionRecord;
|
||||
workflowRunId: UUID | null;
|
||||
requestedAt: DateTime;
|
||||
requestedBy: UUID;
|
||||
requestReason: string;
|
||||
decidedAt: DateTime | null;
|
||||
startedAt: DateTime | null;
|
||||
completedAt: DateTime | null;
|
||||
evidencePacketId: UUID | null;
|
||||
}
|
||||
|
||||
type PromotionStatus =
|
||||
| "pending_approval" // Waiting for human approval
|
||||
| "pending_gate" // Waiting for gate evaluation
|
||||
| "approved" // Ready for deployment
|
||||
| "rejected" // Blocked by approval or gate
|
||||
| "deploying" // Deployment in progress
|
||||
| "deployed" // Successfully deployed
|
||||
| "failed" // Deployment failed
|
||||
| "cancelled" // User cancelled
|
||||
| "rolled_back"; // Rolled back after failure
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Module: `approval-gateway`
|
||||
|
||||
| Aspect | Specification |
|
||||
|--------|---------------|
|
||||
| **Responsibility** | Approval collection; separation of duties enforcement |
|
||||
| **Dependencies** | `authority` (for user/group lookup) |
|
||||
| **Data Entities** | `Approval`, `ApprovalPolicy` |
|
||||
| **Events Produced** | `approval.granted`, `approval.denied` |
|
||||
|
||||
**Approval Policy Entity**:
|
||||
```typescript
|
||||
interface ApprovalPolicy {
|
||||
id: UUID;
|
||||
tenantId: UUID;
|
||||
environmentId: UUID;
|
||||
requiredCount: number; // Minimum approvals required
|
||||
requiredRoles: string[]; // At least one approver must have role
|
||||
requiredGroups: string[]; // At least one approver must be in group
|
||||
requireSeparationOfDuties: boolean; // Requester cannot approve
|
||||
allowSelfApproval: boolean; // Override SoD for specific users
|
||||
expirationMinutes: number; // Approval expires after N minutes
|
||||
}
|
||||
|
||||
interface Approval {
|
||||
id: UUID;
|
||||
tenantId: UUID;
|
||||
promotionId: UUID;
|
||||
approverId: UUID;
|
||||
action: "approved" | "rejected";
|
||||
comment: string;
|
||||
approvedAt: DateTime;
|
||||
approverRole: string;
|
||||
approverGroups: string[];
|
||||
}
|
||||
```
|
||||
|
||||
**Separation of Duties (SoD) Rules**:
|
||||
1. Requester cannot approve their own promotion (if `requireSeparationOfDuties` is true)
|
||||
2. Same user cannot approve twice
|
||||
3. At least N different users must approve (based on `requiredCount`)
|
||||
4. At least one approver must match `requiredRoles` if specified
|
||||
5. At least one approver must be in `requiredGroups` if specified
|
||||
|
||||
---
|
||||
|
||||
### Module: `decision-engine`
|
||||
|
||||
| Aspect | Specification |
|
||||
|--------|---------------|
|
||||
| **Responsibility** | Gate evaluation; policy integration; decision record generation |
|
||||
| **Dependencies** | `gate-registry`, `policy` (OPA integration), `scanner` (security data) |
|
||||
| **Data Entities** | `DecisionRecord`, `GateResult` |
|
||||
| **Events Produced** | `decision.evaluated`, `decision.recorded` |
|
||||
|
||||
**Decision Record Structure**:
|
||||
```typescript
|
||||
interface DecisionRecord {
|
||||
promotionId: UUID;
|
||||
evaluatedAt: DateTime;
|
||||
decision: "allow" | "deny" | "pending";
|
||||
|
||||
// What was evaluated
|
||||
release: {
|
||||
id: UUID;
|
||||
name: string;
|
||||
components: Array<{
|
||||
name: string;
|
||||
digest: string;
|
||||
semver: string;
|
||||
}>;
|
||||
};
|
||||
|
||||
environment: {
|
||||
id: UUID;
|
||||
name: string;
|
||||
requiredApprovals: number;
|
||||
freezeWindow: boolean;
|
||||
};
|
||||
|
||||
// Gate evaluation results
|
||||
gates: GateResult[];
|
||||
|
||||
// Approval status
|
||||
approvalStatus: {
|
||||
required: number;
|
||||
received: number;
|
||||
approvers: Array<{
|
||||
userId: UUID;
|
||||
action: string;
|
||||
at: DateTime;
|
||||
}>;
|
||||
sodViolation: boolean;
|
||||
};
|
||||
|
||||
// Reason for decision
|
||||
reasons: string[];
|
||||
|
||||
// Hash of all inputs for replay verification
|
||||
inputsHash: string;
|
||||
}
|
||||
|
||||
interface GateResult {
|
||||
gateType: string;
|
||||
gateName: string;
|
||||
status: "passed" | "failed" | "warning" | "skipped";
|
||||
message: string;
|
||||
details: Record<string, any>;
|
||||
evaluatedAt: DateTime;
|
||||
durationMs: number;
|
||||
}
|
||||
```
|
||||
|
||||
**Gate Evaluation Order**:
|
||||
1. **Freeze Window Check**: Is environment in freeze?
|
||||
2. **Approval Check**: All required approvals received?
|
||||
3. **Security Gate**: No blocking vulnerabilities?
|
||||
4. **Custom Policy Gates**: All OPA policies pass?
|
||||
5. **Integration Gates**: External system checks pass?
|
||||
|
||||
---
|
||||
|
||||
### Module: `gate-registry`
|
||||
|
||||
| Aspect | Specification |
|
||||
|--------|---------------|
|
||||
| **Responsibility** | Built-in + custom gate registration |
|
||||
| **Dependencies** | `plugin-registry` |
|
||||
| **Data Entities** | `GateDefinition`, `GateConfig` |
|
||||
|
||||
**Built-in Gates**:
|
||||
|
||||
| Gate Type | Description |
|
||||
|-----------|-------------|
|
||||
| `freeze-window` | Check if environment is in freeze |
|
||||
| `approval` | Check if required approvals received |
|
||||
| `security-scan` | Check for blocking vulnerabilities |
|
||||
| `scan-freshness` | Check if scan is recent enough |
|
||||
| `digest-verification` | Verify digests haven't changed |
|
||||
| `environment-sequence` | Enforce promotion order |
|
||||
| `custom-opa` | Custom OPA/Rego policy |
|
||||
| `webhook` | External webhook gate |
|
||||
|
||||
**Gate Definition**:
|
||||
```typescript
|
||||
interface GateDefinition {
|
||||
type: string;
|
||||
displayName: string;
|
||||
description: string;
|
||||
configSchema: JSONSchema;
|
||||
evaluator: "builtin" | UUID; // builtin or plugin ID
|
||||
blocking: boolean; // Can block promotion
|
||||
cacheable: boolean; // Can cache result
|
||||
cacheTtlSeconds: number;
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Promotion State Machine
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────────────────┐
|
||||
│ PROMOTION STATE MACHINE │
|
||||
│ │
|
||||
│ ┌───────────────┐ │
|
||||
│ │ REQUESTED │ ◄──── User requests promotion │
|
||||
│ └───────┬───────┘ │
|
||||
│ │ │
|
||||
│ ▼ │
|
||||
│ ┌───────────────┐ ┌───────────────┐ │
|
||||
│ │ PENDING │─────►│ REJECTED │ ◄──── Approver rejects │
|
||||
│ │ APPROVAL │ └───────────────┘ │
|
||||
│ └───────┬───────┘ │
|
||||
│ │ approval received │
|
||||
│ ▼ │
|
||||
│ ┌───────────────┐ ┌───────────────┐ │
|
||||
│ │ PENDING │─────►│ REJECTED │ ◄──── Gate fails │
|
||||
│ │ GATE │ └───────────────┘ │
|
||||
│ └───────┬───────┘ │
|
||||
│ │ all gates pass │
|
||||
│ ▼ │
|
||||
│ ┌───────────────┐ │
|
||||
│ │ APPROVED │ ◄──── Ready for deployment │
|
||||
│ └───────┬───────┘ │
|
||||
│ │ workflow starts │
|
||||
│ ▼ │
|
||||
│ ┌───────────────┐ ┌───────────────┐ ┌───────────────┐ │
|
||||
│ │ DEPLOYING │─────►│ FAILED │─────►│ ROLLED_BACK │ │
|
||||
│ └───────┬───────┘ └───────────────┘ └───────────────┘ │
|
||||
│ │ │
|
||||
│ │ deployment complete │
|
||||
│ ▼ │
|
||||
│ ┌───────────────┐ │
|
||||
│ │ DEPLOYED │ ◄──── Success! │
|
||||
│ └───────────────┘ │
|
||||
│ │
|
||||
│ Additional transitions: │
|
||||
│ - Any non-terminal → CANCELLED: user cancels │
|
||||
│ │
|
||||
└─────────────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Database Schema
|
||||
|
||||
```sql
|
||||
-- Promotions
|
||||
CREATE TABLE release.promotions (
|
||||
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
||||
tenant_id UUID NOT NULL REFERENCES tenants(id) ON DELETE CASCADE,
|
||||
release_id UUID NOT NULL REFERENCES release.releases(id),
|
||||
source_environment_id UUID REFERENCES release.environments(id),
|
||||
target_environment_id UUID NOT NULL REFERENCES release.environments(id),
|
||||
status VARCHAR(50) NOT NULL DEFAULT 'pending_approval' CHECK (status IN (
|
||||
'pending_approval', 'pending_gate', 'approved', 'rejected',
|
||||
'deploying', 'deployed', 'failed', 'cancelled', 'rolled_back'
|
||||
)),
|
||||
decision_record JSONB,
|
||||
workflow_run_id UUID REFERENCES release.workflow_runs(id),
|
||||
requested_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||
requested_by UUID NOT NULL REFERENCES users(id),
|
||||
request_reason TEXT,
|
||||
decided_at TIMESTAMPTZ,
|
||||
started_at TIMESTAMPTZ,
|
||||
completed_at TIMESTAMPTZ,
|
||||
evidence_packet_id UUID
|
||||
);
|
||||
|
||||
CREATE INDEX idx_promotions_tenant ON release.promotions(tenant_id);
|
||||
CREATE INDEX idx_promotions_release ON release.promotions(release_id);
|
||||
CREATE INDEX idx_promotions_status ON release.promotions(status);
|
||||
CREATE INDEX idx_promotions_target_env ON release.promotions(target_environment_id);
|
||||
|
||||
-- Approvals
|
||||
CREATE TABLE release.approvals (
|
||||
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
||||
tenant_id UUID NOT NULL REFERENCES tenants(id) ON DELETE CASCADE,
|
||||
promotion_id UUID NOT NULL REFERENCES release.promotions(id) ON DELETE CASCADE,
|
||||
approver_id UUID NOT NULL REFERENCES users(id),
|
||||
action VARCHAR(50) NOT NULL CHECK (action IN ('approved', 'rejected')),
|
||||
comment TEXT,
|
||||
approved_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||
approver_role VARCHAR(255),
|
||||
approver_groups JSONB NOT NULL DEFAULT '[]'
|
||||
);
|
||||
|
||||
CREATE INDEX idx_approvals_promotion ON release.approvals(promotion_id);
|
||||
CREATE INDEX idx_approvals_approver ON release.approvals(approver_id);
|
||||
|
||||
-- Approval Policies
|
||||
CREATE TABLE release.approval_policies (
|
||||
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
||||
tenant_id UUID NOT NULL REFERENCES tenants(id) ON DELETE CASCADE,
|
||||
environment_id UUID NOT NULL REFERENCES release.environments(id) ON DELETE CASCADE,
|
||||
required_count INTEGER NOT NULL DEFAULT 1,
|
||||
required_roles JSONB NOT NULL DEFAULT '[]',
|
||||
required_groups JSONB NOT NULL DEFAULT '[]',
|
||||
require_sod BOOLEAN NOT NULL DEFAULT FALSE,
|
||||
allow_self_approval BOOLEAN NOT NULL DEFAULT FALSE,
|
||||
expiration_minutes INTEGER NOT NULL DEFAULT 1440,
|
||||
UNIQUE (tenant_id, environment_id)
|
||||
);
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## API Endpoints
|
||||
|
||||
```yaml
|
||||
# Promotions
|
||||
POST /api/v1/promotions
|
||||
Body: { releaseId, targetEnvironmentId, reason? }
|
||||
Response: Promotion
|
||||
|
||||
GET /api/v1/promotions
|
||||
Query: ?status={status}&releaseId={uuid}&environmentId={uuid}&page={n}
|
||||
Response: { data: Promotion[], meta: PaginationMeta }
|
||||
|
||||
GET /api/v1/promotions/{id}
|
||||
Response: Promotion (with decision record, approvals)
|
||||
|
||||
POST /api/v1/promotions/{id}/approve
|
||||
Body: { comment? }
|
||||
Response: Promotion
|
||||
|
||||
POST /api/v1/promotions/{id}/reject
|
||||
Body: { reason }
|
||||
Response: Promotion
|
||||
|
||||
POST /api/v1/promotions/{id}/cancel
|
||||
Response: Promotion
|
||||
|
||||
GET /api/v1/promotions/{id}/decision
|
||||
Response: DecisionRecord
|
||||
|
||||
GET /api/v1/promotions/{id}/approvals
|
||||
Response: Approval[]
|
||||
|
||||
GET /api/v1/promotions/{id}/evidence
|
||||
Response: EvidencePacket
|
||||
|
||||
# Gate Evaluation Preview
|
||||
POST /api/v1/promotions/preview-gates
|
||||
Body: { releaseId, targetEnvironmentId }
|
||||
Response: { wouldPass: boolean, gates: GateResult[] }
|
||||
|
||||
# Approval Policies
|
||||
POST /api/v1/approval-policies
|
||||
GET /api/v1/approval-policies
|
||||
GET /api/v1/approval-policies/{id}
|
||||
PUT /api/v1/approval-policies/{id}
|
||||
DELETE /api/v1/approval-policies/{id}
|
||||
|
||||
# Pending Approvals (for current user)
|
||||
GET /api/v1/my/pending-approvals
|
||||
Response: Promotion[]
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Security Gate Integration
|
||||
|
||||
The security gate evaluates the release against vulnerability data from the Scanner module:
|
||||
|
||||
```typescript
|
||||
interface SecurityGateConfig {
|
||||
blockOnCritical: boolean; // Block if any critical severity
|
||||
blockOnHigh: boolean; // Block if any high severity
|
||||
maxCritical: number; // Max allowed critical (0 for strict)
|
||||
maxHigh: number; // Max allowed high
|
||||
requireFreshScan: boolean; // Require scan within N hours
|
||||
scanFreshnessHours: number; // How recent scan must be
|
||||
allowExceptions: boolean; // Allow VEX exceptions
|
||||
requireVexJustification: boolean; // Require VEX for exceptions
|
||||
}
|
||||
|
||||
interface SecurityGateResult {
|
||||
passed: boolean;
|
||||
summary: {
|
||||
critical: number;
|
||||
high: number;
|
||||
medium: number;
|
||||
low: number;
|
||||
};
|
||||
blocking: Array<{
|
||||
cve: string;
|
||||
severity: string;
|
||||
component: string;
|
||||
digest: string;
|
||||
fixAvailable: boolean;
|
||||
}>;
|
||||
exceptions: Array<{
|
||||
cve: string;
|
||||
vexStatus: string;
|
||||
justification: string;
|
||||
}>;
|
||||
scanAge: {
|
||||
component: string;
|
||||
scannedAt: DateTime;
|
||||
ageHours: number;
|
||||
fresh: boolean;
|
||||
}[];
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## References
|
||||
|
||||
- [Module Overview](overview.md)
|
||||
- [Workflow Engine](workflow-engine.md)
|
||||
- [Security Architecture](../security/overview.md)
|
||||
- [API Documentation](../api/promotions.md)
|
||||
406
docs/modules/release-orchestrator/modules/release-manager.md
Normal file
406
docs/modules/release-orchestrator/modules/release-manager.md
Normal file
@@ -0,0 +1,406 @@
|
||||
# RELMAN: Release Management
|
||||
|
||||
**Purpose**: Manage components, versions, and release bundles.
|
||||
|
||||
## Modules
|
||||
|
||||
### Module: `component-registry`
|
||||
|
||||
| Aspect | Specification |
|
||||
|--------|---------------|
|
||||
| **Responsibility** | Map image repositories to logical components |
|
||||
| **Dependencies** | `integration-manager` (for registry access) |
|
||||
| **Data Entities** | `Component`, `ComponentVersion` |
|
||||
| **Events Produced** | `component.created`, `component.updated`, `component.deleted` |
|
||||
|
||||
**Key Operations**:
|
||||
```
|
||||
CreateComponent(name, displayName, imageRepository, registryId) → Component
|
||||
UpdateComponent(id, config) → Component
|
||||
DeleteComponent(id) → void
|
||||
SyncVersions(componentId, forceRefresh) → VersionMap[]
|
||||
ListComponents(tenantId) → Component[]
|
||||
```
|
||||
|
||||
**Component Entity**:
|
||||
```typescript
|
||||
interface Component {
|
||||
id: UUID;
|
||||
tenantId: UUID;
|
||||
name: string; // "api", "worker", "frontend"
|
||||
displayName: string; // "API Service"
|
||||
imageRepository: string; // "registry.example.com/myapp/api"
|
||||
registryIntegrationId: UUID; // which registry integration
|
||||
versioningStrategy: VersionStrategy;
|
||||
deploymentTemplate: string; // which workflow template to use
|
||||
defaultChannel: string; // "stable", "beta"
|
||||
metadata: Record<string, string>;
|
||||
}
|
||||
|
||||
interface VersionStrategy {
|
||||
type: "semver" | "date" | "sequential" | "manual";
|
||||
tagPattern?: string; // regex for tag extraction
|
||||
semverExtract?: string; // regex capture group
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Module: `version-manager`
|
||||
|
||||
| Aspect | Specification |
|
||||
|--------|---------------|
|
||||
| **Responsibility** | Tag/digest mapping; version rules |
|
||||
| **Dependencies** | `component-registry`, `connector-runtime` |
|
||||
| **Data Entities** | `VersionMap`, `VersionRule`, `Channel` |
|
||||
| **Events Produced** | `version.resolved`, `version.updated` |
|
||||
|
||||
**Version Resolution**:
|
||||
```typescript
|
||||
interface VersionMap {
|
||||
id: UUID;
|
||||
componentId: UUID;
|
||||
tag: string; // "v2.3.1"
|
||||
digest: string; // "sha256:abc123..."
|
||||
semver: string; // "2.3.1"
|
||||
channel: string; // "stable"
|
||||
prerelease: boolean;
|
||||
buildMetadata: string;
|
||||
resolvedAt: DateTime;
|
||||
source: "auto" | "manual";
|
||||
}
|
||||
|
||||
interface VersionRule {
|
||||
id: UUID;
|
||||
componentId: UUID;
|
||||
pattern: string; // "^v(\\d+\\.\\d+\\.\\d+)$"
|
||||
channel: string; // "stable"
|
||||
  prereleasePattern: string; // ".*-(alpha|beta|rc).*"
|
||||
}
|
||||
```
|
||||
|
||||
**Version Resolution Algorithm**:
|
||||
1. Fetch tags from registry (via connector)
|
||||
2. Apply version rules to extract semver
|
||||
3. Resolve each tag to digest
|
||||
4. Store in version map
|
||||
5. Update channels ("latest stable", "latest beta")
|
||||
|
||||
---
|
||||
|
||||
### Module: `release-manager`
|
||||
|
||||
| Aspect | Specification |
|
||||
|--------|---------------|
|
||||
| **Responsibility** | Release bundle lifecycle; composition |
|
||||
| **Dependencies** | `component-registry`, `version-manager` |
|
||||
| **Data Entities** | `Release`, `ReleaseComponent` |
|
||||
| **Events Produced** | `release.created`, `release.promoted`, `release.deprecated` |
|
||||
|
||||
**Release Entity**:
|
||||
```typescript
|
||||
interface Release {
|
||||
id: UUID;
|
||||
tenantId: UUID;
|
||||
name: string; // "myapp-v2.3.1"
|
||||
displayName: string; // "MyApp 2.3.1"
|
||||
components: ReleaseComponent[];
|
||||
sourceRef: SourceReference;
|
||||
status: ReleaseStatus;
|
||||
createdAt: DateTime;
|
||||
createdBy: UUID;
|
||||
deployedEnvironments: UUID[]; // where currently deployed
|
||||
metadata: Record<string, string>;
|
||||
}
|
||||
|
||||
interface ReleaseComponent {
|
||||
componentId: UUID;
|
||||
componentName: string;
|
||||
digest: string; // sha256:...
|
||||
semver: string; // resolved semver
|
||||
tag: string; // original tag (for display)
|
||||
role: "primary" | "sidecar" | "init" | "migration";
|
||||
}
|
||||
|
||||
interface SourceReference {
|
||||
scmIntegrationId?: UUID;
|
||||
commitSha?: string;
|
||||
branch?: string;
|
||||
ciIntegrationId?: UUID;
|
||||
buildId?: string;
|
||||
pipelineUrl?: string;
|
||||
}
|
||||
|
||||
type ReleaseStatus =
|
||||
| "draft" // being composed
|
||||
| "ready" // ready for promotion
|
||||
| "promoting" // promotion in progress
|
||||
| "deployed" // deployed to at least one env
|
||||
| "deprecated" // marked as deprecated
|
||||
| "archived"; // no longer active
|
||||
```
|
||||
|
||||
**Release Creation Modes**:
|
||||
|
||||
| Mode | Description |
|
||||
|------|-------------|
|
||||
| **Full Release** | All components, latest versions |
|
||||
| **Partial Release** | Subset of components updated; others pinned from last deployment |
|
||||
| **Pinned Release** | All versions explicitly specified |
|
||||
| **Channel Release** | All components from specific channel ("beta") |
|
||||
|
||||
---
|
||||
|
||||
### Module: `release-catalog`
|
||||
|
||||
| Aspect | Specification |
|
||||
|--------|---------------|
|
||||
| **Responsibility** | Release history, search, comparison |
|
||||
| **Dependencies** | `release-manager` |
|
||||
|
||||
**Key Operations**:
|
||||
```
|
||||
SearchReleases(filter, pagination) → Release[]
|
||||
CompareReleases(releaseA, releaseB) → ReleaseDiff
|
||||
GetReleaseHistory(componentId) → Release[]
|
||||
GetReleaseLineage(releaseId) → ReleaseLineage // promotion path
|
||||
```
|
||||
|
||||
**Release Comparison**:
|
||||
```typescript
|
||||
interface ReleaseDiff {
|
||||
releaseA: UUID;
|
||||
releaseB: UUID;
|
||||
added: ComponentDiff[]; // Components in B not in A
|
||||
removed: ComponentDiff[]; // Components in A not in B
|
||||
changed: ComponentChange[]; // Components with different versions
|
||||
unchanged: ComponentDiff[]; // Components with same version
|
||||
}
|
||||
|
||||
interface ComponentChange {
|
||||
componentId: UUID;
|
||||
componentName: string;
|
||||
fromVersion: string;
|
||||
toVersion: string;
|
||||
fromDigest: string;
|
||||
toDigest: string;
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Database Schema
|
||||
|
||||
```sql
|
||||
-- Components
|
||||
CREATE TABLE release.components (
|
||||
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
||||
tenant_id UUID NOT NULL REFERENCES tenants(id) ON DELETE CASCADE,
|
||||
name VARCHAR(255) NOT NULL,
|
||||
display_name VARCHAR(255) NOT NULL,
|
||||
image_repository VARCHAR(500) NOT NULL,
|
||||
registry_integration_id UUID REFERENCES release.integrations(id),
|
||||
versioning_strategy JSONB NOT NULL DEFAULT '{"type": "semver"}',
|
||||
deployment_template VARCHAR(255),
|
||||
default_channel VARCHAR(50) NOT NULL DEFAULT 'stable',
|
||||
metadata JSONB NOT NULL DEFAULT '{}',
|
||||
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||
UNIQUE (tenant_id, name)
|
||||
);
|
||||
|
||||
CREATE INDEX idx_components_tenant ON release.components(tenant_id);
|
||||
|
||||
-- Version Maps
|
||||
CREATE TABLE release.version_maps (
|
||||
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
||||
tenant_id UUID NOT NULL REFERENCES tenants(id) ON DELETE CASCADE,
|
||||
component_id UUID NOT NULL REFERENCES release.components(id) ON DELETE CASCADE,
|
||||
tag VARCHAR(255) NOT NULL,
|
||||
digest VARCHAR(100) NOT NULL,
|
||||
semver VARCHAR(50),
|
||||
channel VARCHAR(50) NOT NULL DEFAULT 'stable',
|
||||
prerelease BOOLEAN NOT NULL DEFAULT FALSE,
|
||||
build_metadata VARCHAR(255),
|
||||
resolved_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||
source VARCHAR(50) NOT NULL DEFAULT 'auto',
|
||||
UNIQUE (tenant_id, component_id, digest)
|
||||
);
|
||||
|
||||
CREATE INDEX idx_version_maps_component ON release.version_maps(component_id);
|
||||
CREATE INDEX idx_version_maps_digest ON release.version_maps(digest);
|
||||
CREATE INDEX idx_version_maps_semver ON release.version_maps(semver);
|
||||
|
||||
-- Releases
|
||||
CREATE TABLE release.releases (
|
||||
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
||||
tenant_id UUID NOT NULL REFERENCES tenants(id) ON DELETE CASCADE,
|
||||
name VARCHAR(255) NOT NULL,
|
||||
display_name VARCHAR(255) NOT NULL,
|
||||
components JSONB NOT NULL, -- [{componentId, digest, semver, tag, role}]
|
||||
source_ref JSONB, -- {scmIntegrationId, commitSha, ciIntegrationId, buildId}
|
||||
status VARCHAR(50) NOT NULL DEFAULT 'draft',
|
||||
metadata JSONB NOT NULL DEFAULT '{}',
|
||||
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||
created_by UUID REFERENCES users(id),
|
||||
UNIQUE (tenant_id, name)
|
||||
);
|
||||
|
||||
CREATE INDEX idx_releases_tenant ON release.releases(tenant_id);
|
||||
CREATE INDEX idx_releases_status ON release.releases(status);
|
||||
CREATE INDEX idx_releases_created ON release.releases(created_at DESC);
|
||||
|
||||
-- Release Environment State
|
||||
CREATE TABLE release.release_environment_state (
|
||||
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
||||
tenant_id UUID NOT NULL REFERENCES tenants(id) ON DELETE CASCADE,
|
||||
environment_id UUID NOT NULL REFERENCES release.environments(id) ON DELETE CASCADE,
|
||||
release_id UUID NOT NULL REFERENCES release.releases(id),
|
||||
status VARCHAR(50) NOT NULL,
|
||||
deployed_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||
deployed_by UUID REFERENCES users(id),
|
||||
promotion_id UUID,
|
||||
evidence_ref VARCHAR(255),
|
||||
UNIQUE (tenant_id, environment_id)
|
||||
);
|
||||
|
||||
CREATE INDEX idx_release_env_state_env ON release.release_environment_state(environment_id);
|
||||
CREATE INDEX idx_release_env_state_release ON release.release_environment_state(release_id);
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## API Endpoints
|
||||
|
||||
```yaml
|
||||
# Components
|
||||
POST /api/v1/components
|
||||
Body: { name, displayName, imageRepository, registryIntegrationId, versioningStrategy?, defaultChannel? }
|
||||
Response: Component
|
||||
|
||||
GET /api/v1/components
|
||||
Response: Component[]
|
||||
|
||||
GET /api/v1/components/{id}
|
||||
Response: Component
|
||||
|
||||
PUT /api/v1/components/{id}
|
||||
Response: Component
|
||||
|
||||
DELETE /api/v1/components/{id}
|
||||
Response: { deleted: true }
|
||||
|
||||
POST /api/v1/components/{id}/sync-versions
|
||||
Body: { forceRefresh?: boolean }
|
||||
Response: { synced: number, versions: VersionMap[] }
|
||||
|
||||
GET /api/v1/components/{id}/versions
|
||||
Query: ?channel={stable|beta}&limit={n}
|
||||
Response: VersionMap[]
|
||||
|
||||
# Version Maps
|
||||
POST /api/v1/version-maps
|
||||
Body: { componentId, tag, semver, channel } # manual version assignment
|
||||
Response: VersionMap
|
||||
|
||||
GET /api/v1/version-maps
|
||||
Query: ?componentId={uuid}&channel={channel}
|
||||
Response: VersionMap[]
|
||||
|
||||
# Releases
|
||||
POST /api/v1/releases
|
||||
Body: {
|
||||
name: string,
|
||||
displayName?: string,
|
||||
components: [
|
||||
{ componentId: UUID, version?: string, digest?: string, channel?: string }
|
||||
],
|
||||
sourceRef?: SourceReference
|
||||
}
|
||||
Response: Release
|
||||
|
||||
GET /api/v1/releases
|
||||
Query: ?status={status}&componentId={uuid}&page={n}&pageSize={n}
|
||||
Response: { data: Release[], meta: PaginationMeta }
|
||||
|
||||
GET /api/v1/releases/{id}
|
||||
Response: Release (with full component details)
|
||||
|
||||
PUT /api/v1/releases/{id}
|
||||
Body: { displayName?, metadata?, status? }
|
||||
Response: Release
|
||||
|
||||
DELETE /api/v1/releases/{id}
|
||||
Response: { deleted: true }
|
||||
|
||||
GET /api/v1/releases/{id}/state
|
||||
Response: { environments: [{ environmentId, status, deployedAt }] }
|
||||
|
||||
POST /api/v1/releases/{id}/deprecate
|
||||
Response: Release
|
||||
|
||||
GET /api/v1/releases/{id}/compare/{otherId}
|
||||
Response: ReleaseDiff
|
||||
|
||||
# Quick release creation
|
||||
POST /api/v1/releases/from-latest
|
||||
Body: {
|
||||
name: string,
|
||||
channel?: string, # default: stable
|
||||
componentIds?: UUID[], # default: all
|
||||
pinFrom?: { environmentId: UUID } # for partial release
|
||||
}
|
||||
Response: Release
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Release Identity: Digest-First Principle
|
||||
|
||||
A core design invariant of the Release Orchestrator:
|
||||
|
||||
```
|
||||
INVARIANT: A release is a set of OCI image digests (component -> digest mapping), never tags.
|
||||
```
|
||||
|
||||
**Implementation Requirements**:
|
||||
- Tags are convenience inputs for resolution
|
||||
- Tags are resolved to digests at release creation time
|
||||
- All downstream operations (promotion, deployment, rollback) use digests
|
||||
- Digest mismatch at pull time = deployment failure (tamper detection)
|
||||
|
||||
**Example**:
|
||||
```json
|
||||
{
|
||||
"id": "release-uuid",
|
||||
"name": "myapp-v2.3.1",
|
||||
"components": [
|
||||
{
|
||||
"componentId": "api-component-uuid",
|
||||
"componentName": "api",
|
||||
"tag": "v2.3.1",
|
||||
"digest": "sha256:abc123def456...",
|
||||
"semver": "2.3.1",
|
||||
"role": "primary"
|
||||
},
|
||||
{
|
||||
"componentId": "worker-component-uuid",
|
||||
"componentName": "worker",
|
||||
"tag": "v2.3.1",
|
||||
"digest": "sha256:789xyz123abc...",
|
||||
"semver": "2.3.1",
|
||||
"role": "primary"
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## References
|
||||
|
||||
- [Module Overview](overview.md)
|
||||
- [Design Principles](../design/principles.md)
|
||||
- [API Documentation](../api/releases.md)
|
||||
- [Promotion Manager](promotion-manager.md)
|
||||
590
docs/modules/release-orchestrator/modules/workflow-engine.md
Normal file
590
docs/modules/release-orchestrator/modules/workflow-engine.md
Normal file
@@ -0,0 +1,590 @@
|
||||
# WORKFL: Workflow Engine
|
||||
|
||||
**Purpose**: DAG-based workflow execution for deployments, approvals, and custom automation.
|
||||
|
||||
## Modules
|
||||
|
||||
### Module: `workflow-designer`
|
||||
|
||||
| Aspect | Specification |
|
||||
|--------|---------------|
|
||||
| **Responsibility** | Template creation; DAG graph editor; validation |
|
||||
| **Dependencies** | `step-registry` |
|
||||
| **Data Entities** | `WorkflowTemplate`, `StepNode`, `StepEdge` |
|
||||
|
||||
**Workflow Template Structure**:
|
||||
```typescript
|
||||
interface WorkflowTemplate {
|
||||
id: UUID;
|
||||
tenantId: UUID;
|
||||
name: string;
|
||||
displayName: string;
|
||||
description: string;
|
||||
version: number;
|
||||
|
||||
// DAG structure
|
||||
nodes: StepNode[];
|
||||
edges: StepEdge[];
|
||||
|
||||
// I/O
|
||||
inputs: InputDefinition[];
|
||||
outputs: OutputDefinition[];
|
||||
|
||||
// Metadata
|
||||
tags: string[];
|
||||
isBuiltin: boolean;
|
||||
createdAt: DateTime;
|
||||
createdBy: UUID;
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Module: `workflow-engine`
|
||||
|
||||
| Aspect | Specification |
|
||||
|--------|---------------|
|
||||
| **Responsibility** | DAG execution; state machine; pause/resume |
|
||||
| **Dependencies** | `step-executor`, `step-registry` |
|
||||
| **Data Entities** | `WorkflowRun`, `WorkflowState` |
|
||||
| **Events Produced** | `workflow.started`, `workflow.paused`, `workflow.resumed`, `workflow.completed`, `workflow.failed` |
|
||||
|
||||
**Workflow Execution Algorithm**:
|
||||
```python
|
||||
class WorkflowEngine:
|
||||
def execute(self, workflow_run: WorkflowRun) -> None:
|
||||
"""Main workflow execution loop."""
|
||||
|
||||
# Initialize
|
||||
workflow_run.status = "running"
|
||||
workflow_run.started_at = now()
|
||||
self.save(workflow_run)
|
||||
|
||||
try:
|
||||
while not self.is_terminal(workflow_run):
|
||||
# Handle pause state
|
||||
if workflow_run.status == "paused":
|
||||
self.wait_for_resume(workflow_run)
|
||||
continue
|
||||
|
||||
# Get nodes ready for execution
|
||||
ready_nodes = self.get_ready_nodes(workflow_run)
|
||||
|
||||
if not ready_nodes:
|
||||
# Check if we're waiting on approvals
|
||||
if self.has_pending_approvals(workflow_run):
|
||||
workflow_run.status = "paused"
|
||||
self.save(workflow_run)
|
||||
continue
|
||||
|
||||
# Check if all nodes are complete
|
||||
if self.all_nodes_complete(workflow_run):
|
||||
break
|
||||
|
||||
# Deadlock detection
|
||||
raise WorkflowDeadlockError(workflow_run.id)
|
||||
|
||||
# Execute ready nodes in parallel
|
||||
futures = []
|
||||
for node in ready_nodes:
|
||||
future = self.executor.submit(
|
||||
self.execute_node,
|
||||
workflow_run,
|
||||
node
|
||||
)
|
||||
futures.append((node, future))
|
||||
|
||||
# Wait for at least one to complete
|
||||
completed = self.wait_any(futures)
|
||||
|
||||
for node, result in completed:
|
||||
step_run = self.get_step_run(workflow_run, node.id)
|
||||
|
||||
if result.success:
|
||||
step_run.status = "succeeded"
|
||||
step_run.outputs = result.outputs
|
||||
self.propagate_outputs(workflow_run, node, result.outputs)
|
||||
else:
|
||||
step_run.status = "failed"
|
||||
step_run.error_message = result.error
|
||||
|
||||
# Handle failure action
|
||||
if node.on_failure == "fail":
|
||||
workflow_run.status = "failed"
|
||||
workflow_run.error_message = f"Step {node.name} failed: {result.error}"
|
||||
self.cancel_pending_steps(workflow_run)
|
||||
self.save(workflow_run)
return
|
||||
elif node.on_failure == "rollback":
|
||||
self.trigger_rollback(workflow_run, node)
|
||||
elif node.on_failure.startswith("goto:"):
|
||||
target = node.on_failure.split(":")[1]
|
||||
self.add_ready_node(workflow_run, target)
|
||||
# "continue" just continues to next nodes
|
||||
|
||||
step_run.completed_at = now()
|
||||
self.save(step_run)
|
||||
|
||||
# Workflow completed successfully
|
||||
workflow_run.status = "succeeded"
|
||||
workflow_run.completed_at = now()
|
||||
self.save(workflow_run)
|
||||
|
||||
except WorkflowCancelledError:
|
||||
workflow_run.status = "cancelled"
|
||||
workflow_run.completed_at = now()
|
||||
self.save(workflow_run)
|
||||
except Exception as e:
|
||||
workflow_run.status = "failed"
|
||||
workflow_run.error_message = str(e)
|
||||
workflow_run.completed_at = now()
|
||||
self.save(workflow_run)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Module: `step-executor`
|
||||
|
||||
| Aspect | Specification |
|
||||
|--------|---------------|
|
||||
| **Responsibility** | Step dispatch; retry logic; timeout handling |
|
||||
| **Dependencies** | `step-registry`, `plugin-sandbox` |
|
||||
| **Data Entities** | `StepRun`, `StepResult` |
|
||||
| **Events Produced** | `step.started`, `step.progress`, `step.completed`, `step.failed`, `step.retrying` |
|
||||
|
||||
**Step Node Structure**:
|
||||
```typescript
|
||||
interface StepNode {
|
||||
id: string; // Unique within template (e.g., "deploy-api")
|
||||
type: string; // Step type from registry
|
||||
name: string; // Display name
|
||||
config: Record<string, any>; // Step-specific configuration
|
||||
inputs: InputBinding[]; // Input value bindings
|
||||
outputs: OutputBinding[]; // Output declarations
|
||||
position: { x: number; y: number }; // UI position
|
||||
|
||||
// Execution settings
|
||||
timeout: number; // Seconds (default from step type)
|
||||
retryPolicy: RetryPolicy;
|
||||
onFailure: FailureAction;
|
||||
condition?: string; // JS expression for conditional execution
|
||||
|
||||
// Documentation
|
||||
description?: string;
|
||||
documentation?: string;
|
||||
}
|
||||
|
||||
type FailureAction = "fail" | "continue" | "rollback" | `goto:${string}`; // e.g. "goto:deploy-targets"
|
||||
|
||||
interface InputBinding {
|
||||
name: string; // Input parameter name
|
||||
source: InputSource;
|
||||
}
|
||||
|
||||
type InputSource =
|
||||
| { type: "literal"; value: any }
|
||||
| { type: "context"; path: string } // e.g., "release.name"
|
||||
| { type: "output"; nodeId: string; outputName: string }
|
||||
| { type: "secret"; secretName: string }
|
||||
| { type: "expression"; expression: string }; // JS expression
|
||||
|
||||
interface StepEdge {
|
||||
id: string;
|
||||
from: string; // Source node ID
|
||||
to: string; // Target node ID
|
||||
condition?: string; // Optional condition expression
|
||||
label?: string; // Display label for conditional edges
|
||||
}
|
||||
|
||||
interface RetryPolicy {
|
||||
maxRetries: number;
|
||||
backoffType: "fixed" | "exponential";
|
||||
backoffSeconds: number;
|
||||
retryableErrors: string[];
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Module: `step-registry`
|
||||
|
||||
| Aspect | Specification |
|
||||
|--------|---------------|
|
||||
| **Responsibility** | Built-in + plugin-provided step types |
|
||||
| **Dependencies** | `plugin-registry` |
|
||||
| **Data Entities** | `StepType`, `StepSchema` |
|
||||
|
||||
**Built-in Step Types**:
|
||||
|
||||
| Step Type | Category | Description |
|
||||
|-----------|----------|-------------|
|
||||
| `approval` | Control | Wait for human approval |
|
||||
| `security-gate` | Gate | Evaluate security policy |
|
||||
| `custom-gate` | Gate | Custom OPA policy evaluation |
|
||||
| `deploy-docker` | Deploy | Deploy single container |
|
||||
| `deploy-compose` | Deploy | Deploy Docker Compose stack |
|
||||
| `deploy-ecs` | Deploy | Deploy to AWS ECS |
|
||||
| `deploy-nomad` | Deploy | Deploy to HashiCorp Nomad |
|
||||
| `health-check` | Verify | HTTP/TCP health check |
|
||||
| `smoke-test` | Verify | Run smoke test suite |
|
||||
| `execute-script` | Custom | Run C#/Bash script |
|
||||
| `webhook` | Integration | Call external webhook |
|
||||
| `trigger-ci` | Integration | Trigger CI pipeline |
|
||||
| `wait-ci` | Integration | Wait for CI pipeline |
|
||||
| `notify` | Notification | Send notification |
|
||||
| `rollback` | Recovery | Rollback deployment |
|
||||
| `traffic-shift` | Progressive | Shift traffic percentage |
|
||||
|
||||
**Step Type Definition**:
|
||||
```typescript
|
||||
interface StepType {
|
||||
type: string; // "deploy-compose"
|
||||
displayName: string; // "Deploy Compose Stack"
|
||||
description: string;
|
||||
category: StepCategory;
|
||||
icon: string;
|
||||
|
||||
// Schema
|
||||
configSchema: JSONSchema; // Step configuration schema
|
||||
inputSchema: JSONSchema; // Required inputs schema
|
||||
outputSchema: JSONSchema; // Produced outputs schema
|
||||
|
||||
// Execution
|
||||
executor: "builtin" | UUID; // builtin or plugin ID
|
||||
defaultTimeout: number;
|
||||
safeToRetry: boolean;
|
||||
retryableErrors: string[];
|
||||
|
||||
// Documentation
|
||||
documentation: string;
|
||||
examples: StepExample[];
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Workflow Run State Machine
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────────────────┐
|
||||
│ WORKFLOW RUN STATE MACHINE │
|
||||
│ │
|
||||
│ ┌──────────┐ │
|
||||
│ │ CREATED │ │
|
||||
│ └────┬─────┘ │
|
||||
│ │ start() │
|
||||
│ ▼ │
|
||||
│ ┌─────────────────────────────┐ │
|
||||
│ │ │ │
|
||||
│ pause() ┌──┴──────────┐ │ │
|
||||
│ ┌────────►│ PAUSED │◄─────────┐ │ │
|
||||
│ │ └──────┬──────┘ │ │ │
|
||||
│ │ │ resume() │ │ │
|
||||
│ │ ▼ │ │ │
|
||||
│ │ ┌─────────────┐ │ │ │
|
||||
│ └─────────│ RUNNING │──────────┘ │ │
|
||||
│ └──────┬──────┘ (waiting for │ │
|
||||
│ │ approval) │ │
|
||||
│ ┌────────────┼────────────┐ │ │
|
||||
│ │ │ │ │ │
|
||||
│ ▼ ▼ ▼ │ │
|
||||
│ ┌───────────┐ ┌───────────┐ ┌───────────┐ │ │
|
||||
│ │ SUCCEEDED │ │ FAILED │ │ CANCELLED │ │ │
|
||||
│ └───────────┘ └───────────┘ └───────────┘ │ │
|
||||
│ │
|
||||
│ Transitions: │
|
||||
│ - CREATED → RUNNING: start() │
|
||||
│ - RUNNING → PAUSED: pause(), waiting approval │
|
||||
│ - PAUSED → RUNNING: resume(), approval granted │
|
||||
│ - RUNNING → SUCCEEDED: all nodes complete │
|
||||
│ - RUNNING → FAILED: node fails with fail action │
|
||||
│ - RUNNING → CANCELLED: cancel() │
|
||||
│ - PAUSED → CANCELLED: cancel() │
|
||||
│ │
|
||||
└─────────────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
## Step Run State Machine
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────────────────┐
|
||||
│ STEP RUN STATE MACHINE │
|
||||
│ │
|
||||
│ ┌──────────┐ │
|
||||
│ │ PENDING │ ◄──── Initial state; dependencies not met │
|
||||
│ └────┬─────┘ │
|
||||
│ │ dependencies met + condition true │
|
||||
│ ▼ │
|
||||
│ ┌──────────┐ │
|
||||
│ │ RUNNING │ ◄──── Step is executing │
|
||||
│ └────┬─────┘ │
|
||||
│ │ │
|
||||
│ ┌────┴────────────────┬─────────────────┐ │
|
||||
│ │ │ │ │
|
||||
│ ▼ ▼ ▼ │
|
||||
│ ┌───────────┐ ┌───────────┐ ┌───────────┐ │
|
||||
│ │ SUCCEEDED │ │ FAILED │ │ SKIPPED │ │
|
||||
│ └───────────┘ └─────┬─────┘ └───────────┘ │
|
||||
│ │ ▲ │
|
||||
│ │ │ condition false │
|
||||
│ ▼ │ │
|
||||
│ ┌───────────┐ │ │
|
||||
│ │ RETRYING │──────┘ (max retries exceeded) │
|
||||
│ └─────┬─────┘ │
|
||||
│ │ │
|
||||
│ │ retry attempt │
|
||||
│ └──────────────────┐ │
|
||||
│ │ │
|
||||
│ ▼ │
|
||||
│ ┌──────────┐ │
|
||||
│ │ RUNNING │ (retry) │
|
||||
│ └──────────┘ │
|
||||
│ │
|
||||
│ Additional transitions: │
|
||||
│ - Any state → CANCELLED: workflow cancelled │
|
||||
│ │
|
||||
└─────────────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Database Schema
|
||||
|
||||
```sql
|
||||
-- Workflow Templates
|
||||
CREATE TABLE release.workflow_templates (
|
||||
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
||||
tenant_id UUID REFERENCES tenants(id) ON DELETE CASCADE,
|
||||
name VARCHAR(255) NOT NULL,
|
||||
display_name VARCHAR(255) NOT NULL,
|
||||
description TEXT,
|
||||
version INTEGER NOT NULL DEFAULT 1,
|
||||
nodes JSONB NOT NULL,
|
||||
edges JSONB NOT NULL,
|
||||
inputs JSONB NOT NULL DEFAULT '[]',
|
||||
outputs JSONB NOT NULL DEFAULT '[]',
|
||||
tags JSONB NOT NULL DEFAULT '[]',
|
||||
is_builtin BOOLEAN NOT NULL DEFAULT FALSE,
|
||||
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||
created_by UUID REFERENCES users(id)
|
||||
);
|
||||
|
||||
CREATE INDEX idx_workflow_templates_tenant ON release.workflow_templates(tenant_id);
|
||||
CREATE INDEX idx_workflow_templates_name ON release.workflow_templates(name);
|
||||
|
||||
-- Workflow Runs
|
||||
CREATE TABLE release.workflow_runs (
|
||||
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
||||
tenant_id UUID NOT NULL REFERENCES tenants(id) ON DELETE CASCADE,
|
||||
template_id UUID NOT NULL REFERENCES release.workflow_templates(id),
|
||||
template_version INTEGER NOT NULL,
|
||||
status VARCHAR(50) NOT NULL DEFAULT 'created',
|
||||
context JSONB NOT NULL,
|
||||
inputs JSONB NOT NULL DEFAULT '{}',
|
||||
outputs JSONB NOT NULL DEFAULT '{}',
|
||||
error_message TEXT,
|
||||
started_at TIMESTAMPTZ,
|
||||
completed_at TIMESTAMPTZ,
|
||||
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||
created_by UUID REFERENCES users(id)
|
||||
);
|
||||
|
||||
CREATE INDEX idx_workflow_runs_tenant ON release.workflow_runs(tenant_id);
|
||||
CREATE INDEX idx_workflow_runs_template ON release.workflow_runs(template_id);
|
||||
CREATE INDEX idx_workflow_runs_status ON release.workflow_runs(status);
|
||||
|
||||
-- Step Runs
|
||||
CREATE TABLE release.step_runs (
|
||||
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
||||
workflow_run_id UUID NOT NULL REFERENCES release.workflow_runs(id) ON DELETE CASCADE,
|
||||
node_id VARCHAR(255) NOT NULL,
|
||||
status VARCHAR(50) NOT NULL DEFAULT 'pending',
|
||||
inputs JSONB NOT NULL DEFAULT '{}',
|
||||
outputs JSONB NOT NULL DEFAULT '{}',
|
||||
error_message TEXT,
|
||||
logs TEXT,
|
||||
attempt_number INTEGER NOT NULL DEFAULT 1,
|
||||
started_at TIMESTAMPTZ,
|
||||
completed_at TIMESTAMPTZ,
|
||||
UNIQUE (workflow_run_id, node_id)
|
||||
);
|
||||
|
||||
CREATE INDEX idx_step_runs_workflow ON release.step_runs(workflow_run_id);
|
||||
CREATE INDEX idx_step_runs_status ON release.step_runs(status);
|
||||
|
||||
-- Step Registry
|
||||
CREATE TABLE release.step_types (
|
||||
type VARCHAR(255) PRIMARY KEY,
|
||||
display_name VARCHAR(255) NOT NULL,
|
||||
description TEXT,
|
||||
category VARCHAR(100) NOT NULL,
|
||||
icon VARCHAR(255),
|
||||
config_schema JSONB NOT NULL,
|
||||
input_schema JSONB NOT NULL,
|
||||
output_schema JSONB NOT NULL,
|
||||
executor VARCHAR(255) NOT NULL DEFAULT 'builtin',
|
||||
default_timeout INTEGER NOT NULL DEFAULT 300,
|
||||
safe_to_retry BOOLEAN NOT NULL DEFAULT FALSE,
|
||||
retryable_errors JSONB NOT NULL DEFAULT '[]',
|
||||
documentation TEXT,
|
||||
examples JSONB NOT NULL DEFAULT '[]',
|
||||
plugin_id UUID REFERENCES release.plugins(id),
|
||||
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
|
||||
);
|
||||
|
||||
CREATE INDEX idx_step_types_category ON release.step_types(category);
|
||||
CREATE INDEX idx_step_types_plugin ON release.step_types(plugin_id);
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Workflow Template Example: Standard Deployment
|
||||
|
||||
```json
|
||||
{
|
||||
"id": "template-standard-deploy",
|
||||
"name": "standard-deploy",
|
||||
"displayName": "Standard Deployment",
|
||||
"version": 1,
|
||||
"inputs": [
|
||||
{ "name": "releaseId", "type": "uuid", "required": true },
|
||||
{ "name": "environmentId", "type": "uuid", "required": true },
|
||||
{ "name": "promotionId", "type": "uuid", "required": true }
|
||||
],
|
||||
"nodes": [
|
||||
{
|
||||
"id": "approval",
|
||||
"type": "approval",
|
||||
"name": "Approval Gate",
|
||||
"config": {},
|
||||
"inputs": [
|
||||
{ "name": "promotionId", "source": { "type": "context", "path": "promotionId" } }
|
||||
],
|
||||
"position": { "x": 100, "y": 100 }
|
||||
},
|
||||
{
|
||||
"id": "security-gate",
|
||||
"type": "security-gate",
|
||||
"name": "Security Verification",
|
||||
"config": {
|
||||
"blockOnCritical": true,
|
||||
"blockOnHigh": true
|
||||
},
|
||||
"inputs": [
|
||||
{ "name": "releaseId", "source": { "type": "context", "path": "releaseId" } }
|
||||
],
|
||||
"position": { "x": 100, "y": 200 }
|
||||
},
|
||||
{
|
||||
"id": "deploy-targets",
|
||||
"type": "deploy-compose",
|
||||
"name": "Deploy to Targets",
|
||||
"config": {
|
||||
"strategy": "rolling",
|
||||
"parallelism": 2
|
||||
},
|
||||
"inputs": [
|
||||
{ "name": "releaseId", "source": { "type": "context", "path": "releaseId" } },
|
||||
{ "name": "environmentId", "source": { "type": "context", "path": "environmentId" } }
|
||||
],
|
||||
"timeout": 600,
|
||||
"retryPolicy": {
|
||||
"maxRetries": 2,
|
||||
"backoffType": "exponential",
|
||||
"backoffSeconds": 30
|
||||
},
|
||||
"onFailure": "rollback",
|
||||
"position": { "x": 100, "y": 400 }
|
||||
},
|
||||
{
|
||||
"id": "health-check",
|
||||
"type": "health-check",
|
||||
"name": "Health Verification",
|
||||
"config": {
|
||||
"type": "http",
|
||||
"path": "/health",
|
||||
"expectedStatus": 200,
|
||||
"timeout": 30,
|
||||
"retries": 5
|
||||
},
|
||||
"inputs": [
|
||||
{ "name": "targets", "source": { "type": "output", "nodeId": "deploy-targets", "outputName": "deployedTargets" } }
|
||||
],
|
||||
"onFailure": "rollback",
|
||||
"position": { "x": 100, "y": 500 }
|
||||
},
|
||||
{
|
||||
"id": "notify-success",
|
||||
"type": "notify",
|
||||
"name": "Success Notification",
|
||||
"config": {
|
||||
"channel": "slack",
|
||||
"template": "deployment-success"
|
||||
},
|
||||
"onFailure": "continue",
|
||||
"position": { "x": 100, "y": 700 }
|
||||
},
|
||||
{
|
||||
"id": "rollback-handler",
|
||||
"type": "rollback",
|
||||
"name": "Rollback Handler",
|
||||
"config": {
|
||||
"strategy": "to-previous"
|
||||
},
|
||||
"inputs": [
|
||||
{ "name": "deploymentJobId", "source": { "type": "output", "nodeId": "deploy-targets", "outputName": "jobId" } }
|
||||
],
|
||||
"position": { "x": 300, "y": 450 }
|
||||
}
|
||||
],
|
||||
"edges": [
|
||||
{ "id": "e1", "from": "approval", "to": "security-gate" },
|
||||
{ "id": "e2", "from": "security-gate", "to": "deploy-targets" },
|
||||
{ "id": "e3", "from": "deploy-targets", "to": "health-check" },
|
||||
{ "id": "e4", "from": "health-check", "to": "notify-success" },
|
||||
{ "id": "e5", "from": "deploy-targets", "to": "rollback-handler", "condition": "status === 'failed'" },
|
||||
{ "id": "e6", "from": "health-check", "to": "rollback-handler", "condition": "status === 'failed'" }
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## API Endpoints
|
||||
|
||||
See [API Documentation](../api/workflows.md) for full specification.
|
||||
|
||||
```yaml
|
||||
# Workflow Templates
|
||||
POST /api/v1/workflow-templates
|
||||
GET /api/v1/workflow-templates
|
||||
GET /api/v1/workflow-templates/{id}
|
||||
PUT /api/v1/workflow-templates/{id}
|
||||
DELETE /api/v1/workflow-templates/{id}
|
||||
POST /api/v1/workflow-templates/{id}/validate
|
||||
|
||||
# Step Registry
|
||||
GET /api/v1/step-types
|
||||
GET /api/v1/step-types/{type}
|
||||
|
||||
# Workflow Runs
|
||||
POST /api/v1/workflow-runs
|
||||
GET /api/v1/workflow-runs
|
||||
GET /api/v1/workflow-runs/{id}
|
||||
POST /api/v1/workflow-runs/{id}/pause
|
||||
POST /api/v1/workflow-runs/{id}/resume
|
||||
POST /api/v1/workflow-runs/{id}/cancel
|
||||
GET /api/v1/workflow-runs/{id}/steps
|
||||
GET /api/v1/workflow-runs/{id}/steps/{nodeId}
|
||||
GET /api/v1/workflow-runs/{id}/steps/{nodeId}/logs
|
||||
GET /api/v1/workflow-runs/{id}/steps/{nodeId}/artifacts
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## References
|
||||
|
||||
- [Module Overview](overview.md)
|
||||
- [Workflow Templates](../workflow/templates.md)
|
||||
- [Execution State Machine](../workflow/execution.md)
|
||||
- [API Documentation](../api/workflows.md)
|
||||
Reference in New Issue
Block a user