Files
git.stella-ops.org/docs/modules/release-orchestrator/modules/agents.md

21 KiB

AGENTS: Deployment Agents

Purpose: Lightweight deployment agents for target execution.

Agent Types

| Agent Type | Transport | Target Types |
|------------|-----------|--------------|
| agent-docker | gRPC | Docker hosts |
| agent-compose | gRPC | Docker Compose hosts |
| agent-ssh | SSH | Linux remote hosts |
| agent-winrm | WinRM | Windows remote hosts |
| agent-ecs | AWS API | AWS ECS services |
| agent-nomad | Nomad API | HashiCorp Nomad jobs |

Modules

Module: agent-core

| Aspect | Specification |
|--------|---------------|
| Responsibility | Shared agent runtime; task execution framework |
| Protocol | gRPC for communication with Stella Core |
| Security | mTLS authentication; short-lived JWT for tasks |

Agent Lifecycle:

  1. Agent starts with registration token
  2. Agent registers with capabilities and labels
  3. Agent sends heartbeats (default: 30s interval)
  4. Agent receives tasks from Stella Core
  5. Agent reports task completion/failure

Agent Task Protocol:

// Task assignment (Core → Agent)
/**
 * A unit of work that Stella Core hands to an agent.
 */
interface AgentTask {
  /** Unique identifier of this task. */
  id: UUID;
  /** Operation the agent is asked to perform (one of the TaskType literals). */
  type: TaskType;
  /** Identifier of the deployment target the task applies to. */
  targetId: UUID;
  /** Operation-specific data; for "deploy" tasks presumably a DeployTaskPayload — confirm. */
  payload: TaskPayload;
  /** Credentials for the target, delivered in encrypted form. */
  credentials: EncryptedCredentials;
  /** Execution time limit. NOTE(review): unit (seconds vs. milliseconds) is not stated — confirm. */
  timeout: number;
  /** Relative scheduling priority of the task. */
  priority: TaskPriority;
  /** Key identifying this logical operation; presumably used to de-duplicate retried assignments — confirm. */
  idempotencyKey: string;
  /** When Core assigned the task to the agent. */
  assignedAt: DateTime;
  /** Point in time after which the assignment is no longer valid. */
  expiresAt: DateTime;
}

/**
 * Closed set of operations that can appear in AgentTask.type.
 *
 * "write-sticker" / "read-sticker" refer to the version sticker file that
 * executors write after a deployment (see the writeSticker methods).
 */
type TaskType =
  | "deploy"
  | "rollback"
  | "health-check"
  | "inspect"
  | "execute-command"
  | "upload-files"
  | "write-sticker"
  | "read-sticker";

/**
 * Payload of a "deploy" task: what to deploy and how.
 */
interface DeployTaskPayload {
  /** Image reference to pull. */
  image: string;
  /** Content digest the pulled image must match; containers are created from `image@digest`. */
  digest: string;
  /** Target-specific deployment settings (container name, directories, health check, ...). */
  config: DeployConfig;
  /** Additional artifacts shipped with the deployment (e.g. compose lock file, deploy script). */
  artifacts: ArtifactReference[];
  /** Digest that was deployed before this task, when known. */
  previousDigest?: string;
  /**
   * Optional lifecycle hooks. The property itself is optional because every
   * executor reads it defensively as `task.hooks?.preDeploy` /
   * `task.hooks?.postDeploy`; a payload without hooks is valid.
   */
  hooks?: {
    /** Runs before the existing deployment is touched. */
    preDeploy?: HookConfig;
    /** Runs after the new deployment has been started. */
    postDeploy?: HookConfig;
  };
}

// Task result (Agent → Core)
/**
 * Outcome an agent reports back to Stella Core once a task has finished.
 */
interface TaskResult {
  /** Identifier of the AgentTask this result belongs to. */
  taskId: UUID;
  /** True when the task completed successfully; the failure fields below apply otherwise. */
  success: boolean;
  /** When the agent started executing the task. */
  startedAt: DateTime;
  /** When the agent finished executing the task. */
  completedAt: DateTime;

  // Success details (optional)
  /**
   * Free-form output values produced by the task.
   * NOTE(review): typed as `any`; consider `unknown` once consumers narrow explicitly.
   */
  outputs?: Record<string, any>;
  /** Artifacts produced by the task. */
  artifacts?: ArtifactReference[];

  // Failure details (optional)
  /** Human-readable error message. */
  error?: string;
  /** Machine-readable error category. */
  errorType?: string;
  /** Whether the task may be retried. */
  retriable?: boolean;

  // Logs
  /** Log output collected while the task ran. */
  logs: string;

  // Metrics
  /** Optional phase timings; the `Ms` suffix indicates milliseconds. */
  metrics: {
    pullDurationMs?: number;
    deployDurationMs?: number;
    healthCheckDurationMs?: number;
  };
}

Module: agent-docker

| Aspect | Specification |
|--------|---------------|
| Responsibility | Docker container deployment |
| Dependencies | Docker Engine API |
| Capabilities | docker.deploy, docker.rollback, docker.inspect |

Docker Agent Implementation:

/**
 * Target executor that deploys a single container through the Docker Engine API.
 *
 * Containers are always created from `image@digest`, never from a mutable tag,
 * and carry `stella.*` labels describing the release they belong to.
 */
class DockerAgent implements TargetExecutor {
  private docker: Docker;

  /**
   * Replaces the container named `config.containerName` with one created from
   * `image@digest`.
   *
   * Flow: pull and verify digest → pre-deploy hook → stop/rename the existing
   * container → create, start and health-check the replacement → post-deploy
   * hook → remove the previous container (unless `config.cleanupPrevious` is
   * `false`).
   *
   * If creating, starting or health-checking the replacement fails, the
   * previous container is restored via `rollbackContainer` before the error is
   * rethrown, so a failed deployment does not leave the service stopped.
   *
   * @param task Deploy payload (image, digest, config, optional hooks).
   * @returns Result carrying the new container id and the task's `previousDigest`.
   * @throws DigestMismatchError when the pulled image digest differs from `digest`.
   * @throws HealthCheckFailedError when the replacement never becomes healthy.
   */
  async deploy(task: DeployTaskPayload): Promise<DeployResult> {
    const { image, digest, config, previousDigest } = task;
    const containerName = config.containerName;

    // 1. Pull image and verify digest
    this.log(`Pulling image ${image}@${digest}`);
    await this.docker.pull(image, { digest });

    const pulledDigest = await this.getImageDigest(image);
    if (pulledDigest !== digest) {
      throw new DigestMismatchError(
        `Expected digest ${digest}, got ${pulledDigest}. Possible tampering detected.`
      );
    }

    // 2. Run pre-deploy hook (before the running container is touched)
    if (task.hooks?.preDeploy) {
      await this.runHook(task.hooks.preDeploy, "pre-deploy");
    }

    // 3. Stop and rename existing container so its name is free for the replacement
    const existingContainer = await this.findContainer(containerName);
    if (existingContainer) {
      this.log(`Stopping existing container ${containerName}`);
      await existingContainer.stop({ t: 10 });
      await existingContainer.rename(`${containerName}-previous-${Date.now()}`);
    }

    // 4-6. Create, start and health-check the replacement. From here on the old
    // container is stopped, so any failure must restore it before propagating.
    let container;
    try {
      // 4. Create new container
      this.log(`Creating container ${containerName} from ${image}@${digest}`);
      container = await this.docker.createContainer({
        name: containerName,
        Image: `${image}@${digest}`,  // Always use digest, not tag
        Env: this.buildEnvVars(config.environment),
        HostConfig: {
          PortBindings: this.buildPortBindings(config.ports),
          Binds: this.buildBindMounts(config.volumes),
          RestartPolicy: { Name: config.restartPolicy || "unless-stopped" },
          Memory: config.memoryLimit,
          CpuQuota: config.cpuLimit,
        },
        Labels: {
          "stella.release.id": config.releaseId,
          "stella.release.name": config.releaseName,
          "stella.digest": digest,
          "stella.deployed.at": new Date().toISOString(),
        },
      });

      // 5. Start container
      this.log(`Starting container ${containerName}`);
      await container.start();

      // 6. Wait for container to be healthy
      if (config.healthCheck) {
        this.log(`Waiting for container health check`);
        const healthy = await this.waitForHealthy(container, config.healthCheck.timeout);
        if (!healthy) {
          throw new HealthCheckFailedError(`Container ${containerName} failed health check`);
        }
      }
    } catch (error: unknown) {
      // NOTE(review): assumes rollbackContainer tolerates a replacement that was
      // never created (createContainer failure) — confirm against its implementation.
      this.log(`Deployment of ${containerName} failed; restoring previous container`);
      try {
        await this.rollbackContainer(containerName, existingContainer);
      } catch (rollbackError: unknown) {
        // Keep the original failure as the reported error; a rollback failure is
        // logged rather than allowed to mask the root cause.
        this.log(`Rollback of ${containerName} failed: ${String(rollbackError)}`);
      }
      throw error;
    }

    // 7. Run post-deploy hook
    if (task.hooks?.postDeploy) {
      await this.runHook(task.hooks.postDeploy, "post-deploy");
    }

    // 8. Cleanup previous container
    if (existingContainer && config.cleanupPrevious !== false) {
      this.log(`Removing previous container`);
      await existingContainer.remove({ force: true });
    }

    return {
      success: true,
      containerId: container.id,
      previousDigest: previousDigest,
    };
  }

  /**
   * Rolls the target back.
   *
   * When `targetDigest` is given, this delegates to `deploy` with that digest.
   * Otherwise the container matching `<containerName>-previous-*` is renamed
   * back to `containerName` and started, after the current container (if any)
   * has been stopped and renamed to `<containerName>-failed-<timestamp>`.
   *
   * @throws RollbackError when no previous container exists.
   */
  async rollback(task: RollbackTaskPayload): Promise<DeployResult> {
    const { containerName, targetDigest } = task;

    if (targetDigest) {
      // Deploy specific digest
      return this.deploy({ ...task, digest: targetDigest });
    }

    // Find and restore previous container
    const previousContainer = await this.findContainer(`${containerName}-previous-*`);
    if (!previousContainer) {
      throw new RollbackError(`No previous container found for ${containerName}`);
    }

    const currentContainer = await this.findContainer(containerName);
    if (currentContainer) {
      await currentContainer.stop({ t: 10 });
      await currentContainer.rename(`${containerName}-failed-${Date.now()}`);
    }

    await previousContainer.rename(containerName);
    await previousContainer.start();

    return { success: true, containerId: previousContainer.id };
  }

  /**
   * Persists the version sticker as pretty-printed JSON.
   *
   * With `stickerLocation === "volume"` the file is written from a throw-away
   * `alpine` container that mounts `stickerVolume` at `/var/stella`; otherwise
   * it is written to the agent's local filesystem.
   *
   * Both the JSON content and the path are shell-quoted before being passed to
   * `sh -c`, so quotes or shell metacharacters in sticker values cannot break
   * or extend the command.
   */
  async writeSticker(sticker: VersionSticker): Promise<void> {
    const stickerPath = this.config.stickerPath || "/var/stella/version.json";
    const stickerContent = JSON.stringify(sticker, null, 2);

    if (this.config.stickerLocation === "volume") {
      // printf '%s\n' writes the content verbatim; unlike echo it never
      // interprets backslash sequences that appear inside JSON strings.
      const command =
        `printf '%s\\n' ${this.shellQuote(stickerContent)} > ${this.shellQuote(stickerPath)}`;
      await this.docker.run("alpine", ["sh", "-c", command], {
        HostConfig: { Binds: [`${this.config.stickerVolume}:/var/stella`] }
      });
    } else {
      // Non-blocking write: this runs inside the agent's event loop.
      await fs.promises.writeFile(stickerPath, stickerContent);
    }
  }

  /** Wraps a value in single quotes for POSIX sh, escaping embedded single quotes. */
  private shellQuote(value: string): string {
    return `'${value.replace(/'/g, `'\\''`)}'`;
  }
}

Module: agent-compose

| Aspect | Specification |
|--------|---------------|
| Responsibility | Docker Compose stack deployment |
| Dependencies | Docker Compose CLI |
| Capabilities | compose.deploy, compose.rollback, compose.inspect |

Compose Agent Implementation:

/**
 * Target executor that deploys a Docker Compose stack from a digest-pinned
 * compose lock file.
 */
class ComposeAgent implements TargetExecutor {
  /**
   * Deploys the stack described by the task's `compose_lock` artifact into
   * `config.deploymentDirectory`.
   *
   * Flow: write lock file → pre-deploy hook → verify pinned digests → pull →
   * `up -d` → health check → post-deploy hook → version sticker.
   *
   * Digests are verified before anything is pulled, so an unpinned or
   * mismatching image reference is rejected without fetching it.
   *
   * @throws Error when the task carries no `compose_lock` artifact.
   * @throws DigestMismatchError when a pinned digest differs from the expected one.
   * @throws HealthCheckFailedError when services do not become healthy in time.
   */
  async deploy(task: DeployTaskPayload): Promise<DeployResult> {
    const { artifacts, config } = task;
    const deployDir = config.deploymentDirectory;

    // 1. Write compose lock file
    const composeLock = artifacts.find(a => a.type === "compose_lock");
    if (!composeLock) {
      // Without this guard `undefined` would be handed to fetchArtifact.
      throw new Error("Deploy task does not contain a compose_lock artifact");
    }
    const composeContent = await this.fetchArtifact(composeLock);
    const composePath = path.join(deployDir, "compose.stella.lock.yml");
    await fs.writeFile(composePath, composeContent);

    // 2. Run pre-deploy hook
    if (task.hooks?.preDeploy) {
      await this.runHook(task.hooks.preDeploy, deployDir);
    }

    // 3. Verify digests (reads only the lock file, so it runs before the pull)
    await this.verifyDigests(composePath, config.expectedDigests);

    // 4. Pull images
    this.log("Pulling images...");
    await this.runCompose(deployDir, ["pull"]);

    // 5. Deploy
    this.log("Deploying services...");
    await this.runCompose(deployDir, ["up", "-d", "--remove-orphans", "--force-recreate"]);

    // 6. Wait for services to be healthy
    if (config.healthCheck) {
      const healthy = await this.waitForServicesHealthy(deployDir, config.healthCheck.timeout);
      if (!healthy) {
        // NOTE(review): no backup is created in this method; rollbackToBackup
        // presumably relies on state prepared elsewhere — confirm.
        await this.rollbackToBackup(deployDir);
        throw new HealthCheckFailedError("Services failed health check");
      }
    }

    // 7. Run post-deploy hook
    if (task.hooks?.postDeploy) {
      await this.runHook(task.hooks.postDeploy, deployDir);
    }

    // 8. Write version sticker
    await this.writeSticker(config.sticker, deployDir);

    return { success: true };
  }

  /**
   * Checks that every service listed in `expectedDigests` exists in the compose
   * file, is pinned with `@sha256:`, and is pinned to exactly the expected digest.
   *
   * Only the services named in `expectedDigests` are checked; other services in
   * the file are not inspected.
   *
   * @param composePath Path of the compose lock file to parse.
   * @param expectedDigests Map of service name → expected `sha256:…` digest.
   * @throws Error when the map or the `services` section is missing, a service
   *   is absent, or its image is not digest-pinned.
   * @throws DigestMismatchError when the pinned digest differs from the expected one.
   */
  private async verifyDigests(
    composePath: string,
    expectedDigests: Record<string, string>
  ): Promise<void> {
    if (!expectedDigests) {
      // Fail loudly: silently skipping verification would defeat its purpose.
      throw new Error("No expected digests supplied for compose verification");
    }

    const composeContent = yaml.parse(await fs.readFile(composePath, "utf-8"));
    const services = composeContent?.services;
    if (!services || typeof services !== "object") {
      throw new Error("Compose file does not define any services");
    }

    for (const [service, expectedDigest] of Object.entries(expectedDigests)) {
      const serviceConfig = services[service];
      if (!serviceConfig) {
        throw new Error(`Service ${service} not found in compose file`);
      }

      // A service may have no `image` key at all (e.g. build-only services).
      const image = serviceConfig.image;
      if (typeof image !== "string" || !image.includes("@sha256:")) {
        throw new Error(`Service ${service} image not pinned to digest: ${image}`);
      }

      const actualDigest = image.split("@")[1];
      if (actualDigest !== expectedDigest) {
        throw new DigestMismatchError(
          `Service ${service}: expected ${expectedDigest}, got ${actualDigest}`
        );
      }
    }
  }
}

Module: agent-ssh

| Aspect | Specification |
|--------|---------------|
| Responsibility | SSH remote execution (agentless) |
| Dependencies | SSH client library |
| Capabilities | ssh.deploy, ssh.execute, ssh.upload |

SSH Remote Executor:

/**
 * Target executor that deploys to a remote host over SSH (no agent installed
 * on the target).
 */
class SSHRemoteExecutor implements TargetExecutor {
  /**
   * Opens the SSH session used by subsequent operations.
   *
   * The private key is resolved from the secret store via `config.privateKeyRef`.
   * Port defaults to 22 and the ready timeout to 30000 when not configured.
   */
  async connect(config: SSHConnectionConfig): Promise<void> {
    const privateKey = await this.secrets.getSecret(config.privateKeyRef);

    this.ssh = new SSHClient();
    await this.ssh.connect({
      host: config.host,
      port: config.port || 22,
      username: config.username,
      privateKey: privateKey.value,
      readyTimeout: config.connectionTimeout || 30000,
    });
  }

  /**
   * Uploads the task's artifacts into `config.deploymentDirectory` on the remote
   * host and runs the optional `deploy_script` artifact.
   *
   * Flow: create directories → back up current contents to `.stella-backup` →
   * upload artifacts → pre-deploy hook → deploy script → post-deploy hook →
   * health check → version sticker → remove backup. The SSH session is closed
   * when the method exits, whether it succeeds or throws.
   *
   * Remote paths are always joined with POSIX separators (independent of the
   * OS the executor runs on) and are shell-quoted in every command, so the
   * deployment directory is treated as a literal path (spaces are safe; `~`
   * and variables are not expanded).
   *
   * @throws DeploymentError when the deploy script exits with a non-zero code.
   * @throws HealthCheckFailedError when the health check fails (after `rollback`).
   */
  async deploy(task: DeployTaskPayload): Promise<DeployResult> {
    const { artifacts, config } = task;
    const deployDir = config.deploymentDirectory;
    const quotedDir = this.shellQuote(deployDir);
    const quotedBackupDir = this.shellQuote(`${deployDir}/.stella-backup`);

    try {
      // 1. Ensure deployment directory exists
      await this.exec(`mkdir -p ${quotedDir}`);
      await this.exec(`mkdir -p ${quotedBackupDir}`);

      // 2. Backup current deployment (best effort: an empty directory is fine).
      // The glob stays outside the quotes so the remote shell still expands it.
      await this.exec(`cp -r ${quotedDir}/* ${quotedBackupDir}/ 2>/dev/null || true`);

      // 3. Upload artifacts
      for (const artifact of artifacts) {
        const content = await this.fetchArtifact(artifact);
        const remotePath = path.posix.join(deployDir, artifact.name);
        await this.uploadFile(content, remotePath);
      }

      // 4. Run pre-deploy hook
      if (task.hooks?.preDeploy) {
        await this.runRemoteHook(task.hooks.preDeploy, deployDir);
      }

      // 5. Execute deployment script
      const deployScript = artifacts.find(a => a.type === "deploy_script");
      if (deployScript) {
        const quotedScript = this.shellQuote(path.posix.join(deployDir, deployScript.name));
        await this.exec(`chmod +x ${quotedScript}`);
        const result = await this.exec(quotedScript, { cwd: deployDir, timeout: config.deploymentTimeout });
        if (result.exitCode !== 0) {
          throw new DeploymentError(`Deploy script failed: ${result.stderr}`);
        }
      }

      // 6. Run post-deploy hook
      if (task.hooks?.postDeploy) {
        await this.runRemoteHook(task.hooks.postDeploy, deployDir);
      }

      // 7. Health check
      if (config.healthCheck) {
        const healthy = await this.runHealthCheck(config.healthCheck);
        if (!healthy) {
          await this.rollback(task);
          throw new HealthCheckFailedError("Health check failed");
        }
      }

      // 8. Write version sticker
      await this.writeSticker(config.sticker, deployDir);

      // 9. Cleanup backup
      await this.exec(`rm -rf ${quotedBackupDir}`);

      return { success: true };
    } finally {
      // The session may not exist if connect() was never called or failed;
      // guarding here keeps the original error from being masked by a TypeError.
      this.ssh?.end();
    }
  }

  /** Wraps a value in single quotes for POSIX sh, escaping embedded single quotes. */
  private shellQuote(value: string): string {
    return `'${value.replace(/'/g, `'\\''`)}'`;
  }
}

Module: agent-winrm

| Aspect | Specification |
|--------|---------------|
| Responsibility | WinRM remote execution (agentless) |
| Dependencies | WinRM client library |
| Capabilities | winrm.deploy, winrm.execute, winrm.upload |
| Authentication | NTLM, Kerberos, Basic |

Module: agent-ecs

| Aspect | Specification |
|--------|---------------|
| Responsibility | AWS ECS service deployment |
| Dependencies | AWS SDK |
| Capabilities | ecs.deploy, ecs.rollback, ecs.inspect |

Module: agent-nomad

| Aspect | Specification |
|--------|---------------|
| Responsibility | HashiCorp Nomad job deployment |
| Dependencies | Nomad API client |
| Capabilities | nomad.deploy, nomad.rollback, nomad.inspect |

Agent Security Model

Registration Flow

┌─────────────────────────────────────────────────────────────────────────────┐
│                    AGENT REGISTRATION FLOW                                  │
│                                                                             │
│  1. Admin generates registration token (one-time use)                       │
│     POST /api/v1/admin/agent-tokens                                        │
│     → { token: "reg_xxx", expiresAt: "..." }                              │
│                                                                             │
│  2. Agent starts with registration token                                    │
│     ./stella-agent --register --token=reg_xxx                              │
│                                                                             │
│  3. Agent requests mTLS certificate                                         │
│     POST /api/v1/agents/register                                           │
│     Headers: X-Registration-Token: reg_xxx                                 │
│     Body: { name, version, capabilities, csr }                             │
│     → { agentId, certificate, caCertificate }                              │
│                                                                             │
│  4. Agent establishes mTLS connection                                       │
│     Uses issued certificate for all subsequent requests                    │
│                                                                             │
│  5. Agent requests short-lived JWT for task execution                       │
│     POST /api/v1/agents/token (over mTLS)                                  │
│     → { token, expiresIn: 3600 }  // 1 hour                               │
│                                                                             │
│  6. Agent refreshes token before expiration                                 │
│     Token refresh only over mTLS connection                                │
│                                                                             │
└─────────────────────────────────────────────────────────────────────────────┘

Communication Security

┌─────────────────────────────────────────────────────────────────────────────┐
│                    AGENT COMMUNICATION SECURITY                             │
│                                                                             │
│  ┌──────────────┐                          ┌──────────────┐                │
│  │    AGENT     │                          │  STELLA CORE │                │
│  └──────┬───────┘                          └──────┬───────┘                │
│         │                                         │                         │
│         │  mTLS (mutual TLS)                      │                         │
│         │  - Agent cert signed by Stella CA       │                         │
│         │  - Server cert verified by Agent        │                         │
│         │  - TLS 1.3 only                         │                         │
│         │  - Perfect forward secrecy              │                         │
│         │◄───────────────────────────────────────►│                         │
│         │                                         │                         │
│         │  Encrypted payload                      │                         │
│         │  - Task payloads encrypted with         │                         │
│         │    agent-specific key                   │                         │
│         │  - Logs encrypted in transit            │                         │
│         │◄───────────────────────────────────────►│                         │
│         │                                         │                         │
│         │  Heartbeat + capability refresh         │                         │
│         │  - Every 30 seconds                     │                         │
│         │  - Signed with agent key                │                         │
│         │─────────────────────────────────────────►│                         │
│         │                                         │                         │
│         │  Task assignment                        │                         │
│         │  - Contains short-lived credentials     │                         │
│         │  - Scoped to specific target            │                         │
│         │  - Expires after task timeout           │                         │
│         │◄─────────────────────────────────────────│                         │
│         │                                         │                         │
└─────────────────────────────────────────────────────────────────────────────┘

Database Schema

-- Agents
-- One row per registered deployment agent. Rows are tenant-scoped and are
-- removed together with their tenant (ON DELETE CASCADE).
CREATE TABLE release.agents (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    tenant_id UUID NOT NULL REFERENCES tenants(id) ON DELETE CASCADE,
    name VARCHAR(255) NOT NULL,
    version VARCHAR(50) NOT NULL,
    -- JSON array; defaults to an empty list
    capabilities JSONB NOT NULL DEFAULT '[]',
    -- JSON object; defaults to an empty map
    labels JSONB NOT NULL DEFAULT '{}',
    -- Restricted to the three values below; new rows start as 'offline'
    status VARCHAR(50) NOT NULL DEFAULT 'offline' CHECK (status IN (
        'online', 'offline', 'degraded'
    )),
    -- Nullable: no value until a heartbeat has been recorded
    last_heartbeat TIMESTAMPTZ,
    resource_usage JSONB,
    -- NOTE(review): 64 characters fits a hex-encoded SHA-256 of the agent certificate — confirm encoding
    certificate_fingerprint VARCHAR(64),
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    -- NOTE(review): only defaulted on insert here; keeping it current on UPDATE must happen elsewhere — confirm
    updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    -- Agent names are unique within a tenant
    UNIQUE (tenant_id, name)
);

-- Lookup indexes: by tenant, by status, and a GIN index over the capabilities JSON
CREATE INDEX idx_agents_tenant ON release.agents(tenant_id);
CREATE INDEX idx_agents_status ON release.agents(status);
CREATE INDEX idx_agents_capabilities ON release.agents USING GIN (capabilities);

API Endpoints

# Agent Registration
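POST   /api/v1/agents/register
       Headers: X-Registration-Token: {token}
       Body: { name, version, capabilities, csr }
       Response: { agentId, certificate, caCertificate }

# Agent Task Token (mTLS only; see Registration Flow, steps 5-6)
POST   /api/v1/agents/token
       Response: { token, expiresIn }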

# Agent Management
GET    /api/v1/agents
       Query: ?status={online|offline|degraded}&capability={type}
       Response: Agent[]

GET    /api/v1/agents/{id}
       Response: Agent

PUT    /api/v1/agents/{id}
       Body: { labels?, capabilities? }
       Response: Agent

DELETE /api/v1/agents/{id}
       Response: { deleted: true }

# Agent Communication
POST   /api/v1/agents/{id}/heartbeat
       Body: { status, resourceUsage, capabilities }
       Response: { tasks: AgentTask[] }

POST   /api/v1/agents/{id}/tasks/{taskId}/complete
       Body: { success, result, logs }
       Response: { acknowledged: true }

# WebSocket for real-time task stream
WS     /api/v1/agents/{id}/task-stream
       Messages:
         - { type: "task_assigned", task: AgentTask }
         - { type: "task_cancelled", taskId }

References