# AGENTS: Deployment Agents

**Purpose**: Lightweight deployment agents for target execution.

## Agent Types

| Agent Type | Transport | Target Types |
|------------|-----------|--------------|
| `agent-docker` | gRPC | Docker hosts |
| `agent-compose` | gRPC | Docker Compose hosts |
| `agent-ssh` | SSH | Linux remote hosts |
| `agent-winrm` | WinRM | Windows remote hosts |
| `agent-ecs` | AWS API | AWS ECS services |
| `agent-nomad` | Nomad API | HashiCorp Nomad jobs |

## Modules

### Module: `agent-core`

| Aspect | Specification |
|--------|---------------|
| **Responsibility** | Shared agent runtime; task execution framework |
| **Protocol** | gRPC for communication with Stella Core |
| **Security** | mTLS authentication; short-lived JWT for tasks |

**Agent Lifecycle**:

1. Agent starts with registration token
2. Agent registers with capabilities and labels
3. Agent sends heartbeats (default: 30s interval)
4. Agent receives tasks from Stella Core
5. Agent reports task completion/failure

**Agent Task Protocol**:

```typescript
// Task assignment (Core → Agent)
interface AgentTask {
  id: UUID;
  type: TaskType;
  targetId: UUID;
  payload: TaskPayload;
  credentials: EncryptedCredentials;
  timeout: number;
  priority: TaskPriority;
  idempotencyKey: string;
  assignedAt: DateTime;
  expiresAt: DateTime;
}

type TaskType =
  | "deploy"
  | "rollback"
  | "health-check"
  | "inspect"
  | "execute-command"
  | "upload-files"
  | "write-sticker"
  | "read-sticker";

interface DeployTaskPayload {
  image: string;
  digest: string;
  config: DeployConfig;
  artifacts: ArtifactReference[];
  previousDigest?: string;
  hooks: {
    preDeploy?: HookConfig;
    postDeploy?: HookConfig;
  };
}

// Task result (Agent → Core)
interface TaskResult {
  taskId: UUID;
  success: boolean;
  startedAt: DateTime;
  completedAt: DateTime;

  // Success details
  outputs?: Record<string, any>;
  artifacts?: ArtifactReference[];

  // Failure details
  error?: string;
  errorType?: string;
  retriable?: boolean;

  // Logs
  logs: string;

  // Metrics
  metrics: {
    pullDurationMs?: number;
    deployDurationMs?: number;
    healthCheckDurationMs?: number;
  };
}
```
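
The lifecycle steps and the task protocol above come together in a simple poll–execute–report loop inside `agent-core`. The sketch below is illustrative rather than normative: `CoreClient` and `Executor` are assumed abstractions over the gRPC channel and the concrete agents, and `UUID`/`DateTime` are treated as string aliases.

```typescript
type UUID = string;
type DateTime = string;

// Assumed abstraction over the gRPC channel to Stella Core.
interface CoreClient {
  heartbeat(status: "online" | "degraded"): Promise<AgentTask[]>; // Core piggybacks new tasks on the reply
  complete(taskId: UUID, result: TaskResult): Promise<void>;
}

// Assumed abstraction over the concrete agents (DockerAgent, ComposeAgent, ...).
interface Executor {
  execute(task: AgentTask): Promise<Pick<TaskResult, "success" | "outputs" | "error" | "retriable" | "logs" | "metrics">>;
}

async function runTaskLoop(core: CoreClient, executors: Map<TaskType, Executor>): Promise<void> {
  for (;;) {
    // Lifecycle steps 3–4: heartbeat and receive any newly assigned tasks.
    const tasks = await core.heartbeat("online");

    for (const task of tasks) {
      const startedAt: DateTime = new Date().toISOString();
      const executor = executors.get(task.type);
      try {
        if (!executor) throw new Error(`No executor registered for task type "${task.type}"`);
        const outcome = await executor.execute(task);
        // Lifecycle step 5: report completion with timing metadata.
        await core.complete(task.id, {
          taskId: task.id,
          startedAt,
          completedAt: new Date().toISOString(),
          ...outcome,
        });
      } catch (err) {
        await core.complete(task.id, {
          taskId: task.id,
          success: false,
          startedAt,
          completedAt: new Date().toISOString(),
          error: err instanceof Error ? err.message : String(err),
          retriable: false,
          logs: "",
          metrics: {},
        });
      }
    }

    await new Promise((resolve) => setTimeout(resolve, 30_000)); // default 30s heartbeat interval
  }
}
```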

---

### Module: `agent-docker`

| Aspect | Specification |
|--------|---------------|
| **Responsibility** | Docker container deployment |
| **Dependencies** | Docker Engine API |
| **Capabilities** | `docker.deploy`, `docker.rollback`, `docker.inspect` |

**Docker Agent Implementation**:

```typescript
class DockerAgent implements TargetExecutor {
  private docker: Docker;

  async deploy(task: DeployTaskPayload): Promise<DeployResult> {
    const { image, digest, config, previousDigest } = task;
    const containerName = config.containerName;

    // 1. Pull image and verify digest
    this.log(`Pulling image ${image}@${digest}`);
    await this.docker.pull(image, { digest });

    const pulledDigest = await this.getImageDigest(image);
    if (pulledDigest !== digest) {
      throw new DigestMismatchError(
        `Expected digest ${digest}, got ${pulledDigest}. Possible tampering detected.`
      );
    }

    // 2. Run pre-deploy hook
    if (task.hooks?.preDeploy) {
      await this.runHook(task.hooks.preDeploy, "pre-deploy");
    }

    // 3. Stop and rename existing container
    const existingContainer = await this.findContainer(containerName);
    if (existingContainer) {
      this.log(`Stopping existing container ${containerName}`);
      await existingContainer.stop({ t: 10 });
      await existingContainer.rename(`${containerName}-previous-${Date.now()}`);
    }

    // 4. Create new container
    this.log(`Creating container ${containerName} from ${image}@${digest}`);
    const container = await this.docker.createContainer({
      name: containerName,
      Image: `${image}@${digest}`, // Always use digest, not tag
      Env: this.buildEnvVars(config.environment),
      HostConfig: {
        PortBindings: this.buildPortBindings(config.ports),
        Binds: this.buildBindMounts(config.volumes),
        RestartPolicy: { Name: config.restartPolicy || "unless-stopped" },
        Memory: config.memoryLimit,
        CpuQuota: config.cpuLimit,
      },
      Labels: {
        "stella.release.id": config.releaseId,
        "stella.release.name": config.releaseName,
        "stella.digest": digest,
        "stella.deployed.at": new Date().toISOString(),
      },
    });

    // 5. Start container
    this.log(`Starting container ${containerName}`);
    await container.start();

    // 6. Wait for container to be healthy
    if (config.healthCheck) {
      this.log(`Waiting for container health check`);
      const healthy = await this.waitForHealthy(container, config.healthCheck.timeout);
      if (!healthy) {
        await this.rollbackContainer(containerName, existingContainer);
        throw new HealthCheckFailedError(`Container ${containerName} failed health check`);
      }
    }

    // 7. Run post-deploy hook
    if (task.hooks?.postDeploy) {
      await this.runHook(task.hooks.postDeploy, "post-deploy");
    }

    // 8. Cleanup previous container
    if (existingContainer && config.cleanupPrevious !== false) {
      this.log(`Removing previous container`);
      await existingContainer.remove({ force: true });
    }

    return {
      success: true,
      containerId: container.id,
      previousDigest: previousDigest,
    };
  }

  async rollback(task: RollbackTaskPayload): Promise<DeployResult> {
    const { containerName, targetDigest } = task;

    if (targetDigest) {
      // Deploy specific digest
      return this.deploy({ ...task, digest: targetDigest });
    }

    // Find and restore previous container
    const previousContainer = await this.findContainer(`${containerName}-previous-*`);
    if (!previousContainer) {
      throw new RollbackError(`No previous container found for ${containerName}`);
    }

    const currentContainer = await this.findContainer(containerName);
    if (currentContainer) {
      await currentContainer.stop({ t: 10 });
      await currentContainer.rename(`${containerName}-failed-${Date.now()}`);
    }

    await previousContainer.rename(containerName);
    await previousContainer.start();

    return { success: true, containerId: previousContainer.id };
  }

  async writeSticker(sticker: VersionSticker): Promise<void> {
    const stickerPath = this.config.stickerPath || "/var/stella/version.json";
    const stickerContent = JSON.stringify(sticker, null, 2);

    if (this.config.stickerLocation === "volume") {
      await this.docker.run("alpine", [
        "sh", "-c",
        `echo '${stickerContent}' > ${stickerPath}`
      ], {
        HostConfig: { Binds: [`${this.config.stickerVolume}:/var/stella`] }
      });
    } else {
      fs.writeFileSync(stickerPath, stickerContent);
    }
  }
}
```
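
The deploy path above delegates to a `waitForHealthy` helper that is not spelled out. A minimal sketch follows, assuming dockerode-style containers whose `inspect()` result exposes `State.Health.Status` for images that define a HEALTHCHECK; the 2-second poll interval is an arbitrary illustrative choice.

```typescript
import Docker from "dockerode";

// Poll the container until it reports healthy, fails, or the timeout elapses.
async function waitForHealthy(container: Docker.Container, timeoutMs: number): Promise<boolean> {
  const deadline = Date.now() + timeoutMs;

  while (Date.now() < deadline) {
    const info = await container.inspect();
    const health = info.State.Health?.Status; // undefined if the image defines no HEALTHCHECK

    if (health === "healthy") return true;
    if (health === "unhealthy" || !info.State.Running) return false;
    if (health === undefined && info.State.Running) return true; // no HEALTHCHECK: treat running as healthy

    await new Promise((resolve) => setTimeout(resolve, 2_000)); // poll every 2s (illustrative)
  }

  return false; // timed out while still "starting"
}
```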

---

### Module: `agent-compose`

| Aspect | Specification |
|--------|---------------|
| **Responsibility** | Docker Compose stack deployment |
| **Dependencies** | Docker Compose CLI |
| **Capabilities** | `compose.deploy`, `compose.rollback`, `compose.inspect` |

**Compose Agent Implementation**:

```typescript
class ComposeAgent implements TargetExecutor {
  async deploy(task: DeployTaskPayload): Promise<DeployResult> {
    const { artifacts, config } = task;
    const deployDir = config.deploymentDirectory;

    // 1. Write compose lock file
    const composeLock = artifacts.find(a => a.type === "compose_lock");
    const composeContent = await this.fetchArtifact(composeLock);
    const composePath = path.join(deployDir, "compose.stella.lock.yml");
    await fs.writeFile(composePath, composeContent);

    // 2. Run pre-deploy hook
    if (task.hooks?.preDeploy) {
      await this.runHook(task.hooks.preDeploy, deployDir);
    }

    // 3. Pull images
    this.log("Pulling images...");
    await this.runCompose(deployDir, ["pull"]);

    // 4. Verify digests
    await this.verifyDigests(composePath, config.expectedDigests);

    // 5. Deploy
    this.log("Deploying services...");
    await this.runCompose(deployDir, ["up", "-d", "--remove-orphans", "--force-recreate"]);

    // 6. Wait for services to be healthy
    if (config.healthCheck) {
      const healthy = await this.waitForServicesHealthy(deployDir, config.healthCheck.timeout);
      if (!healthy) {
        await this.rollbackToBackup(deployDir);
        throw new HealthCheckFailedError("Services failed health check");
      }
    }

    // 7. Run post-deploy hook
    if (task.hooks?.postDeploy) {
      await this.runHook(task.hooks.postDeploy, deployDir);
    }

    // 8. Write version sticker
    await this.writeSticker(config.sticker, deployDir);

    return { success: true };
  }

  private async verifyDigests(
    composePath: string,
    expectedDigests: Record<string, string>
  ): Promise<void> {
    const composeContent = yaml.parse(await fs.readFile(composePath, "utf-8"));

    for (const [service, expectedDigest] of Object.entries(expectedDigests)) {
      const serviceConfig = composeContent.services[service];
      if (!serviceConfig) {
        throw new Error(`Service ${service} not found in compose file`);
      }

      const image = serviceConfig.image;
      if (!image.includes("@sha256:")) {
        throw new Error(`Service ${service} image not pinned to digest: ${image}`);
      }

      const actualDigest = image.split("@")[1];
      if (actualDigest !== expectedDigest) {
        throw new DigestMismatchError(
          `Service ${service}: expected ${expectedDigest}, got ${actualDigest}`
        );
      }
    }
  }
}
```
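
The `runCompose` helper used above can be a thin wrapper around the Docker Compose CLI. A minimal sketch, assuming the `compose.stella.lock.yml` lock-file name from step 1 and no log streaming:

```typescript
import { execFile } from "node:child_process";
import { promisify } from "node:util";

const execFileAsync = promisify(execFile);

// Run `docker compose` against the pinned lock file in the deployment directory.
async function runCompose(deployDir: string, args: string[]): Promise<{ stdout: string; stderr: string }> {
  return execFileAsync(
    "docker",
    ["compose", "-f", "compose.stella.lock.yml", ...args],
    { cwd: deployDir, maxBuffer: 10 * 1024 * 1024 },
  );
}

// e.g. await runCompose(deployDir, ["up", "-d", "--remove-orphans", "--force-recreate"]);
```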

---

### Module: `agent-ssh`

| Aspect | Specification |
|--------|---------------|
| **Responsibility** | SSH remote execution (agentless) |
| **Dependencies** | SSH client library |
| **Capabilities** | `ssh.deploy`, `ssh.execute`, `ssh.upload` |

**SSH Remote Executor**:

```typescript
class SSHRemoteExecutor implements TargetExecutor {
  async connect(config: SSHConnectionConfig): Promise<void> {
    const privateKey = await this.secrets.getSecret(config.privateKeyRef);

    this.ssh = new SSHClient();
    await this.ssh.connect({
      host: config.host,
      port: config.port || 22,
      username: config.username,
      privateKey: privateKey.value,
      readyTimeout: config.connectionTimeout || 30000,
    });
  }

  async deploy(task: DeployTaskPayload): Promise<DeployResult> {
    const { artifacts, config } = task;
    const deployDir = config.deploymentDirectory;

    try {
      // 1. Ensure deployment directory exists
      await this.exec(`mkdir -p ${deployDir}`);
      await this.exec(`mkdir -p ${deployDir}/.stella-backup`);

      // 2. Backup current deployment
      await this.exec(`cp -r ${deployDir}/* ${deployDir}/.stella-backup/ 2>/dev/null || true`);

      // 3. Upload artifacts
      for (const artifact of artifacts) {
        const content = await this.fetchArtifact(artifact);
        const remotePath = path.join(deployDir, artifact.name);
        await this.uploadFile(content, remotePath);
      }

      // 4. Run pre-deploy hook
      if (task.hooks?.preDeploy) {
        await this.runRemoteHook(task.hooks.preDeploy, deployDir);
      }

      // 5. Execute deployment script
      const deployScript = artifacts.find(a => a.type === "deploy_script");
      if (deployScript) {
        const scriptPath = path.join(deployDir, deployScript.name);
        await this.exec(`chmod +x ${scriptPath}`);
        const result = await this.exec(scriptPath, { cwd: deployDir, timeout: config.deploymentTimeout });
        if (result.exitCode !== 0) {
          throw new DeploymentError(`Deploy script failed: ${result.stderr}`);
        }
      }

      // 6. Run post-deploy hook
      if (task.hooks?.postDeploy) {
        await this.runRemoteHook(task.hooks.postDeploy, deployDir);
      }

      // 7. Health check
      if (config.healthCheck) {
        const healthy = await this.runHealthCheck(config.healthCheck);
        if (!healthy) {
          await this.rollback(task);
          throw new HealthCheckFailedError("Health check failed");
        }
      }

      // 8. Write version sticker
      await this.writeSticker(config.sticker, deployDir);

      // 9. Cleanup backup
      await this.exec(`rm -rf ${deployDir}/.stella-backup`);

      return { success: true };
    } finally {
      this.ssh.end();
    }
  }
}
```
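
The `uploadFile` step above maps naturally onto SFTP. A minimal sketch using the ssh2 library's `sftp()` channel; the standalone function form and the `0o640` file mode are illustrative choices, not part of the spec.

```typescript
import type { Client, SFTPWrapper } from "ssh2";

// Stream an in-memory artifact to the remote path over SFTP on an already-connected ssh2 Client.
function uploadFile(ssh: Client, content: Buffer, remotePath: string): Promise<void> {
  return new Promise((resolve, reject) => {
    ssh.sftp((err: Error | undefined, sftp: SFTPWrapper) => {
      if (err) return reject(err);
      const stream = sftp.createWriteStream(remotePath, { mode: 0o640 });
      stream.on("error", reject);
      stream.on("close", () => resolve());
      stream.end(content);
    });
  });
}
```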

---

### Module: `agent-winrm`

| Aspect | Specification |
|--------|---------------|
| **Responsibility** | WinRM remote execution (agentless) |
| **Dependencies** | WinRM client library |
| **Capabilities** | `winrm.deploy`, `winrm.execute`, `winrm.upload` |
| **Authentication** | NTLM, Kerberos, Basic |

---

### Module: `agent-ecs`

| Aspect | Specification |
|--------|---------------|
| **Responsibility** | AWS ECS service deployment |
| **Dependencies** | AWS SDK |
| **Capabilities** | `ecs.deploy`, `ecs.rollback`, `ecs.inspect` |
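
No reference implementation is specified for this agent yet. As a rough sketch of what `ecs.deploy` could look like with the AWS SDK v3: register a new task-definition revision with the image pinned to the release digest, point the service at it, and wait for the rollout to stabilize. Function and parameter names below are illustrative assumptions.

```typescript
import {
  ECSClient,
  DescribeServicesCommand,
  DescribeTaskDefinitionCommand,
  RegisterTaskDefinitionCommand,
  UpdateServiceCommand,
  waitUntilServicesStable,
} from "@aws-sdk/client-ecs";

async function deployEcsService(cluster: string, service: string, image: string, digest: string): Promise<void> {
  const ecs = new ECSClient({});

  // 1. Look up the task definition currently used by the service.
  const described = await ecs.send(new DescribeServicesCommand({ cluster, services: [service] }));
  const currentTaskDefArn = described.services?.[0]?.taskDefinition;
  if (!currentTaskDefArn) throw new Error(`Service ${service} not found in cluster ${cluster}`);
  const current = await ecs.send(new DescribeTaskDefinitionCommand({ taskDefinition: currentTaskDefArn }));
  const def = current.taskDefinition!;

  // 2. Register a new revision with every container image pinned to the digest.
  const registered = await ecs.send(new RegisterTaskDefinitionCommand({
    family: def.family!,
    taskRoleArn: def.taskRoleArn,
    executionRoleArn: def.executionRoleArn,
    networkMode: def.networkMode,
    requiresCompatibilities: def.requiresCompatibilities,
    cpu: def.cpu,
    memory: def.memory,
    volumes: def.volumes,
    containerDefinitions: def.containerDefinitions!.map((c) => ({ ...c, image: `${image}@${digest}` })),
  }));

  // 3. Point the service at the new revision; ECS performs a rolling deployment.
  await ecs.send(new UpdateServiceCommand({
    cluster,
    service,
    taskDefinition: registered.taskDefinition!.taskDefinitionArn,
  }));

  // 4. Wait until the rollout settles (the health-check step for this target type).
  await waitUntilServicesStable({ client: ecs, maxWaitTime: 600 }, { cluster, services: [service] });
}
```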

---

### Module: `agent-nomad`

| Aspect | Specification |
|--------|---------------|
| **Responsibility** | HashiCorp Nomad job deployment |
| **Dependencies** | Nomad API client |
| **Capabilities** | `nomad.deploy`, `nomad.rollback`, `nomad.inspect` |
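
Likewise, no reference implementation exists for this agent. A rough sketch of `nomad.deploy` against the Nomad HTTP API follows; the job-registration and deployment endpoints are standard Nomad API paths, while the job mutation, token handling, and polling interval are simplified assumptions.

```typescript
// Pin the job's Docker task images to the release digest, register the job,
// then poll the resulting deployment until Nomad reports a terminal state.
async function deployNomadJob(
  nomadAddr: string,   // e.g. http://127.0.0.1:4646
  nomadToken: string,  // ACL token (assumed)
  job: any,            // full Nomad job spec in JSON form (assumed to be provided)
  image: string,
  digest: string,
): Promise<void> {
  for (const group of job.TaskGroups ?? []) {
    for (const task of group.Tasks ?? []) {
      if (task.Driver === "docker") task.Config.image = `${image}@${digest}`;
    }
  }

  // Register (create or update) the job.
  const res = await fetch(`${nomadAddr}/v1/jobs`, {
    method: "POST",
    headers: { "Content-Type": "application/json", "X-Nomad-Token": nomadToken },
    body: JSON.stringify({ Job: job }),
  });
  if (!res.ok) throw new Error(`Job registration failed: ${res.status}`);

  // Poll the most recent deployment for the job until it finishes.
  for (;;) {
    const dep = await fetch(`${nomadAddr}/v1/job/${job.ID}/deployment`, {
      headers: { "X-Nomad-Token": nomadToken },
    }).then((r) => r.json());

    if (dep?.Status === "successful") return;
    if (dep?.Status === "failed" || dep?.Status === "cancelled") {
      throw new Error(`Nomad deployment ${dep.ID} ended with status ${dep.Status}`);
    }
    await new Promise((resolve) => setTimeout(resolve, 5_000));
  }
}
```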

---

## Agent Security Model

### Registration Flow

```
AGENT REGISTRATION FLOW

1. Admin generates registration token (one-time use)
     POST /api/v1/admin/agent-tokens
     → { token: "reg_xxx", expiresAt: "..." }

2. Agent starts with registration token
     ./stella-agent --register --token=reg_xxx

3. Agent requests mTLS certificate
     POST /api/v1/agents/register
     Headers: X-Registration-Token: reg_xxx
     Body: { name, version, capabilities, csr }
     → { agentId, certificate, caCertificate }

4. Agent establishes mTLS connection
     Uses issued certificate for all subsequent requests

5. Agent requests short-lived JWT for task execution
     POST /api/v1/agents/token (over mTLS)
     → { token, expiresIn: 3600 }  // 1 hour

6. Agent refreshes token before expiration
     Token refresh only over mTLS connection
```
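
In code, step 3 of this flow is a single request authenticated by the one-time token. The sketch below assumes the CSR was generated out of band (for example with openssl) and that the Core base URL, agent name, and file paths are agent configuration; those details are illustrative, not part of the spec.

```typescript
import { readFile, writeFile } from "node:fs/promises";

interface RegistrationResponse {
  agentId: string;
  certificate: string;    // PEM, signed by the Stella CA
  caCertificate: string;  // PEM, used to verify the Core's server certificate
}

// Step 3: exchange the one-time registration token + CSR for an mTLS identity.
async function register(coreUrl: string, registrationToken: string): Promise<RegistrationResponse> {
  const csr = await readFile("/etc/stella-agent/agent.csr", "utf-8"); // path is illustrative

  const res = await fetch(`${coreUrl}/api/v1/agents/register`, {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      "X-Registration-Token": registrationToken,
    },
    body: JSON.stringify({
      name: "agent-docker-01", // illustrative
      version: "1.0.0",
      capabilities: ["docker.deploy", "docker.rollback", "docker.inspect"],
      csr,
    }),
  });
  if (!res.ok) throw new Error(`Registration failed: ${res.status}`);

  const identity = (await res.json()) as RegistrationResponse;

  // Persist the issued certificate and CA for the mTLS connection in step 4.
  await writeFile("/etc/stella-agent/agent.crt", identity.certificate);
  await writeFile("/etc/stella-agent/ca.crt", identity.caCertificate);
  return identity;
}
```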

### Communication Security

```
AGENT COMMUNICATION SECURITY  (Agent ◄──► Stella Core)

mTLS (mutual TLS)                        Agent ◄──► Core
  - Agent cert signed by Stella CA
  - Server cert verified by Agent
  - TLS 1.3 only
  - Perfect forward secrecy

Encrypted payload                        Agent ◄──► Core
  - Task payloads encrypted with agent-specific key
  - Logs encrypted in transit

Heartbeat + capability refresh           Agent ───► Core
  - Every 30 seconds
  - Signed with agent key

Task assignment                          Core ───► Agent
  - Contains short-lived credentials
  - Scoped to specific target
  - Expires after task timeout
```

---

## Database Schema

```sql
-- Agents
CREATE TABLE release.agents (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    tenant_id UUID NOT NULL REFERENCES tenants(id) ON DELETE CASCADE,
    name VARCHAR(255) NOT NULL,
    version VARCHAR(50) NOT NULL,
    capabilities JSONB NOT NULL DEFAULT '[]',
    labels JSONB NOT NULL DEFAULT '{}',
    status VARCHAR(50) NOT NULL DEFAULT 'offline' CHECK (status IN (
        'online', 'offline', 'degraded'
    )),
    last_heartbeat TIMESTAMPTZ,
    resource_usage JSONB,
    certificate_fingerprint VARCHAR(64),
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    UNIQUE (tenant_id, name)
);

CREATE INDEX idx_agents_tenant ON release.agents(tenant_id);
CREATE INDEX idx_agents_status ON release.agents(status);
CREATE INDEX idx_agents_capabilities ON release.agents USING GIN (capabilities);
```
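
The GIN index on `capabilities` exists to make capability-based agent selection cheap. A minimal sketch of such a query using node-postgres and the JSONB containment operator (`@>`), which that index accelerates; the function name is illustrative.

```typescript
import { Pool } from "pg";

const pool = new Pool(); // connection settings taken from PG* environment variables

// Find online agents for a tenant that advertise a given capability.
async function findAgentsWithCapability(tenantId: string, capability: string) {
  const { rows } = await pool.query(
    `SELECT id, name, labels, last_heartbeat
       FROM release.agents
      WHERE tenant_id = $1
        AND status = 'online'
        AND capabilities @> $2::jsonb`,
    [tenantId, JSON.stringify([capability])],
  );
  return rows;
}

// e.g. await findAgentsWithCapability(tenantId, "docker.deploy")
```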

---

## API Endpoints

```yaml
# Agent Registration
POST /api/v1/agents/register
  Headers: X-Registration-Token: {token}
  Body: { name, version, capabilities, csr }
  Response: { agentId, certificate, caCertificate }

# Agent Management
GET /api/v1/agents
  Query: ?status={online|offline|degraded}&capability={type}
  Response: Agent[]

GET /api/v1/agents/{id}
  Response: Agent

PUT /api/v1/agents/{id}
  Body: { labels?, capabilities? }
  Response: Agent

DELETE /api/v1/agents/{id}
  Response: { deleted: true }

# Agent Communication
POST /api/v1/agents/{id}/heartbeat
  Body: { status, resourceUsage, capabilities }
  Response: { tasks: AgentTask[] }

POST /api/v1/agents/{id}/tasks/{taskId}/complete
  Body: { success, result, logs }
  Response: { acknowledged: true }

# WebSocket for real-time task stream
WS /api/v1/agents/{id}/task-stream
  Messages:
    - { type: "task_assigned", task: AgentTask }
    - { type: "task_cancelled", taskId }
```
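
From the agent's side, the heartbeat and completion endpoints form the polling contract (heartbeat responses double as task delivery). A minimal client sketch; the base URL, bearer JWT, capability list, and resource-usage shape are assumptions, and the underlying connection is expected to be the mTLS channel described above.

```typescript
// Heartbeat: report status and receive any newly assigned tasks.
async function sendHeartbeat(coreUrl: string, agentId: string, jwt: string): Promise<AgentTask[]> {
  const res = await fetch(`${coreUrl}/api/v1/agents/${agentId}/heartbeat`, {
    method: "POST",
    headers: { "Content-Type": "application/json", Authorization: `Bearer ${jwt}` },
    body: JSON.stringify({
      status: "online",
      resourceUsage: { cpuPercent: 12, memoryMb: 256 }, // illustrative shape
      capabilities: ["docker.deploy", "docker.rollback", "docker.inspect"],
    }),
  });
  const body = (await res.json()) as { tasks: AgentTask[] };
  return body.tasks;
}

// Completion: report a finished task back to the Core.
async function completeTask(coreUrl: string, agentId: string, jwt: string, result: TaskResult): Promise<void> {
  await fetch(`${coreUrl}/api/v1/agents/${agentId}/tasks/${result.taskId}/complete`, {
    method: "POST",
    headers: { "Content-Type": "application/json", Authorization: `Bearer ${jwt}` },
    body: JSON.stringify({ success: result.success, result, logs: result.logs }),
  });
}
```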

---

## References

- [Module Overview](overview.md)
- [Deploy Orchestrator](deploy-orchestrator.md)
- [Agent Security](../security/agent-security.md)
- [API Documentation](../api/agents.md)