release orchestrator pivot, architecture and planning
This commit is contained in:
418
docs/modules/release-orchestrator/modules/environment-manager.md
Normal file
418
docs/modules/release-orchestrator/modules/environment-manager.md
Normal file
@@ -0,0 +1,418 @@
|
||||
# ENVMGR: Environment & Inventory Manager
|
||||
|
||||
**Purpose**: Model environments, targets, agents, and their relationships.
|
||||
|
||||
## Modules
|
||||
|
||||
### Module: `environment-manager`
|
||||
|
||||
| Aspect | Specification |
|
||||
|--------|---------------|
|
||||
| **Responsibility** | Environment CRUD, ordering, configuration, freeze windows |
|
||||
| **Dependencies** | `authority` |
|
||||
| **Data Entities** | `Environment`, `EnvironmentConfig`, `FreezeWindow` |
|
||||
| **Events Produced** | `environment.created`, `environment.updated`, `environment.freeze_started`, `environment.freeze_ended` |
|
||||
|
||||
**Key Operations**:
|
||||
```
|
||||
CreateEnvironment(name, displayName, orderIndex, config) → Environment
|
||||
UpdateEnvironment(id, config) → Environment
|
||||
DeleteEnvironment(id) → void
|
||||
SetFreezeWindow(environmentId, start, end, reason, exceptions) → FreezeWindow
|
||||
ClearFreezeWindow(environmentId, windowId) → void
|
||||
ListEnvironments(tenantId) → Environment[]
|
||||
GetEnvironmentState(id) → EnvironmentState
|
||||
```
|
||||
|
||||
**Environment Entity**:
|
||||
```typescript
|
||||
/**
 * A deployment stage (for example dev, stage, prod) owned by one tenant.
 * Fields mirror the columns of the `release.environments` table.
 */
interface Environment {
  id: UUID;
  tenantId: UUID;
  /** Short identifier such as "dev", "stage", "prod"; unique per tenant in the schema. */
  name: string;
  /** Human-readable label, e.g. "Development". */
  displayName: string;
  /** Position in the promotion order: 0, 1, 2, ... */
  orderIndex: number;
  config: EnvironmentConfig;
  freezeWindows: FreezeWindow[];
  /** Number of approvals required: 0 for dev, 1+ for prod. */
  requiredApprovals: number;
  requireSeparationOfDuties: boolean;
  /** Environment to auto-promote from, or null when auto-promotion is not configured. */
  autoPromoteFrom: UUID | null;
  /**
   * OPA policy name. Nullable because the `promotion_policy` column has no
   * NOT NULL constraint, so an environment may exist without a policy.
   */
  promotionPolicy: string | null;
  createdAt: DateTime;
  updatedAt: DateTime;
}
|
||||
|
||||
/** Per-environment settings carried in `Environment.config`. */
interface EnvironmentConfig {
  /** Environment-specific variables. */
  variables: Record<string, string>;
  /** References to secrets held in a vault. */
  secrets: SecretReference[];
  /** Registry overrides that apply only to this environment. */
  registryOverrides: RegistryOverride[];
  /** Labels an agent is required to carry. */
  agentLabels: string[];
  /**
   * Deployment timeout in seconds.
   * NOTE(review): the schema also defines a top-level `deployment_timeout`
   * column on `release.environments` — confirm which one is authoritative.
   */
  deploymentTimeout: number;
  healthCheckConfig: HealthCheckConfig;
}
|
||||
|
||||
/** A freeze window attached to an environment, bounded by `start` and `end`. */
interface FreezeWindow {
  id: UUID;
  start: DateTime;
  end: DateTime;
  /** Free-text reason for the freeze. */
  reason: string;
  createdBy: UUID;
  /** Users who can override the freeze. */
  exceptions: UUID[];
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Module: `target-registry`
|
||||
|
||||
| Aspect | Specification |
|
||||
|--------|---------------|
|
||||
| **Responsibility** | Deployment target inventory; capability tracking |
|
||||
| **Dependencies** | `environment-manager`, `agent-manager` |
|
||||
| **Data Entities** | `Target`, `TargetGroup`, `TargetCapability` |
|
||||
| **Events Produced** | `target.created`, `target.updated`, `target.deleted`, `target.health_changed` |
|
||||
|
||||
**Target Types** (plugin-provided):
|
||||
|
||||
| Type | Description |
|
||||
|------|-------------|
|
||||
| `docker_host` | Single Docker host |
|
||||
| `compose_host` | Docker Compose host |
|
||||
| `ssh_remote` | Generic SSH target |
|
||||
| `winrm_remote` | Windows remote target |
|
||||
| `ecs_service` | AWS ECS service |
|
||||
| `nomad_job` | HashiCorp Nomad job |
|
||||
|
||||
**Target Entity**:
|
||||
```typescript
|
||||
/**
 * A deployment target registered in an environment.
 * Fields mirror the columns of the `release.targets` table, including nullability.
 */
interface Target {
  id: UUID;
  tenantId: UUID;
  environmentId: UUID;
  /** Owning target group (`target_group_id`), or null when the target is ungrouped. */
  targetGroupId: UUID | null;
  /** Target name, e.g. "prod-web-01". */
  name: string;
  /** Plugin-provided type identifier, e.g. "docker_host". */
  targetType: string;
  /** Connection details; the populated fields depend on `targetType`. */
  connection: TargetConnection;
  capabilities: TargetCapability[];
  /** Key/value labels used for grouping. */
  labels: Record<string, string>;
  healthStatus: HealthStatus;
  /** Time of the most recent health check; null until one has run (column is nullable). */
  lastHealthCheck: DateTime | null;
  /** Where artifacts are placed on the target; null when not set (column is nullable). */
  deploymentDirectory: string | null;
  /** Digest currently deployed, or null when nothing is recorded. */
  currentDigest: string | null;
  /** Assigned agent, or null when unassigned. */
  agentId: UUID | null;
}
|
||||
|
||||
/**
 * Connection settings for a target. `host` and `port` are common to all
 * types; the optional fields below are examples of type-specific settings.
 */
interface TargetConnection {
  /* ---- Common fields ---- */
  host: string;
  port: number;

  /* ---- Type-specific (examples) ---- */

  /* docker_host */
  dockerSocket?: string;
  tlsCert?: SecretReference;

  /* ssh_remote */
  username?: string;
  privateKey?: SecretReference;

  /* ecs_service */
  cluster?: string;
  service?: string;
  region?: string;
  roleArn?: string;
}
|
||||
|
||||
/** A named group of targets inside one environment of a tenant. */
interface TargetGroup {
  id: UUID;
  tenantId: UUID;
  environmentId: UUID;
  name: string;
  /** Key/value labels attached to the group. */
  labels: Record<string, string>;
  createdAt: DateTime;
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Module: `agent-manager`
|
||||
|
||||
| Aspect | Specification |
|
||||
|--------|---------------|
|
||||
| **Responsibility** | Agent registration, heartbeat, capability advertisement |
|
||||
| **Dependencies** | `authority` (for agent tokens) |
|
||||
| **Data Entities** | `Agent`, `AgentCapability`, `AgentHeartbeat` |
|
||||
| **Events Produced** | `agent.registered`, `agent.online`, `agent.offline`, `agent.capability_changed` |
|
||||
|
||||
**Agent Lifecycle**:
|
||||
1. Agent starts with a one-time registration token that an admin generated via Authority (see **Agent Registration Protocol** below)
|
||||
2. Agent registers with capabilities and labels
|
||||
3. Agent sends heartbeats (default: 30s interval)
|
||||
4. Agent pulls tasks from task queue
|
||||
5. Agent reports task completion/failure
|
||||
|
||||
**Agent Entity**:
|
||||
```typescript
|
||||
/**
 * A registered deployment agent.
 * Fields mirror the columns of the `release.agents` table, including nullability.
 */
interface Agent {
  id: UUID;
  tenantId: UUID;
  name: string;
  version: string;
  capabilities: AgentCapability[];
  labels: Record<string, string>;
  status: "online" | "offline" | "degraded";
  /**
   * Time of the last heartbeat received. Null for an agent that has
   * registered but not yet sent one (the `last_heartbeat` column is nullable).
   */
  lastHeartbeat: DateTime | null;
  /** IDs of the targets assigned to this agent. */
  assignedTargets: UUID[];
  /** Latest reported resource usage; null until reported (`resource_usage` is nullable). */
  resourceUsage: ResourceUsage | null;
}
|
||||
|
||||
/** One capability advertised by an agent. */
interface AgentCapability {
  /** Capability kind: "docker", "compose", "ssh", "winrm". */
  type: string;
  /** Version of the capability. */
  version: string;
  /** Capability-specific configuration. */
  config: object;
}
|
||||
|
||||
/** Resource usage figures reported for an agent. */
interface ResourceUsage {
  cpuPercent: number;
  memoryPercent: number;
  diskPercent: number;
  /** Count of active tasks. */
  activeTasks: number;
}
|
||||
```
|
||||
|
||||
**Agent Registration Protocol**:
|
||||
```
|
||||
1. Admin generates registration token (one-time use)
|
||||
POST /api/v1/admin/agent-tokens
|
||||
→ { token: "reg_xxx", expiresAt: "..." }
|
||||
|
||||
2. Agent starts with registration token
|
||||
./stella-agent --register --token=reg_xxx
|
||||
|
||||
3. Agent requests mTLS certificate
|
||||
POST /api/v1/agents/register
|
||||
Headers: X-Registration-Token: reg_xxx
|
||||
Body: { name, version, capabilities, csr }
|
||||
→ { agentId, certificate, caCertificate }
|
||||
|
||||
4. Agent establishes mTLS connection
|
||||
Uses issued certificate for all subsequent requests
|
||||
|
||||
5. Agent requests short-lived JWT for task execution
|
||||
POST /api/v1/agents/token (over mTLS)
|
||||
→ { token, expiresIn: 3600 } // 1 hour
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Module: `inventory-sync`
|
||||
|
||||
| Aspect | Specification |
|
||||
|--------|---------------|
|
||||
| **Responsibility** | Drift detection; expected vs actual state reconciliation |
|
||||
| **Dependencies** | `target-registry`, `agent-manager` |
|
||||
| **Events Produced** | `inventory.drift_detected`, `inventory.reconciled` |
|
||||
|
||||
**Drift Detection Process**:
|
||||
1. Read the version sticker (`stella.version.json`) from the target's deployment directory
|
||||
2. Compare with expected state in database
|
||||
3. Flag discrepancies (digest mismatch, missing sticker, unexpected files)
|
||||
4. Report on dashboard
|
||||
|
||||
**Drift Detection Types**:
|
||||
|
||||
| Drift Type | Description | Severity |
|
||||
|------------|-------------|----------|
|
||||
| `digest_mismatch` | Running digest differs from expected | Critical |
|
||||
| `missing_sticker` | No version sticker found on target | Warning |
|
||||
| `stale_sticker` | Sticker timestamp older than last deployment | Warning |
|
||||
| `orphan_container` | Container not managed by Stella | Info |
|
||||
| `extra_files` | Unexpected files in deployment directory | Info |
|
||||
|
||||
---
|
||||
|
||||
## Cache Eviction Policies
|
||||
|
||||
Environment configurations, target health, agent capabilities, and freeze-window lookups are cached to improve performance. **All caches MUST have bounded size and TTL-based eviction**:
|
||||
|
||||
| Cache Type | Purpose | TTL | Max Size | Eviction Strategy |
|
||||
|-----------|---------|-----|----------|-------------------|
|
||||
| **Environment Configs** | Environment configuration data | 30 minutes | 500 entries | Sliding expiration |
|
||||
| **Target Health** | Target health status | 5 minutes | 2,000 entries | Sliding expiration |
|
||||
| **Agent Capabilities** | Agent capability advertisement | 10 minutes | 1,000 entries | Sliding expiration |
|
||||
| **Freeze Windows** | Active freeze window checks | 15 minutes | 100 entries | Absolute expiration |
|
||||
|
||||
**Implementation**:
|
||||
```csharp
|
||||
/// <summary>
/// Bounded in-memory cache of <see cref="EnvironmentConfig"/> keyed by environment id.
/// Owns its <see cref="MemoryCache"/> instance and disposes it in <see cref="Dispose"/>.
/// </summary>
public class EnvironmentConfigCache : IDisposable
{
    /// <summary>Maximum number of cached environment configs.</summary>
    private const int MaxEntries = 500;

    /// <summary>An entry is evicted after this long without being read.</summary>
    private static readonly TimeSpan IdleLifetime = TimeSpan.FromMinutes(30);

    private readonly MemoryCache _cache;

    public EnvironmentConfigCache()
    {
        _cache = new MemoryCache(new MemoryCacheOptions
        {
            // With SizeLimit set, every entry must declare a Size (see CacheConfig).
            SizeLimit = MaxEntries
        });
    }

    /// <summary>Stores (or replaces) the config for <paramref name="environmentId"/>.</summary>
    public void CacheConfig(Guid environmentId, EnvironmentConfig config)
    {
        _cache.Set(environmentId, config, new MemoryCacheEntryOptions
        {
            // Each config counts as one unit against SizeLimit.
            Size = 1,
            // Sliding window: each read resets the 30-minute timer.
            // NOTE(review): with sliding expiration alone, an entry that is read
            // continuously lives until it is explicitly invalidated; add
            // AbsoluteExpirationRelativeToNow if a hard upper bound is required.
            SlidingExpiration = IdleLifetime
        });
    }

    /// <summary>Returns the cached config, or null when there is no entry.</summary>
    public EnvironmentConfig? GetCachedConfig(Guid environmentId)
        => _cache.Get<EnvironmentConfig>(environmentId);

    /// <summary>Removes the cached config for <paramref name="environmentId"/>, if any.</summary>
    public void InvalidateConfig(Guid environmentId)
        => _cache.Remove(environmentId);

    /// <summary>Releases the underlying <see cref="MemoryCache"/>.</summary>
    public void Dispose()
    {
        _cache.Dispose();
        GC.SuppressFinalize(this);
    }
}
|
||||
```
|
||||
|
||||
**Cache Invalidation**:
|
||||
- Environment configs: Invalidate on update
|
||||
- Target health: Invalidate on health check or deployment
|
||||
- Agent capabilities: Invalidate on capability change event
|
||||
- Freeze windows: Invalidate on window creation/deletion
|
||||
|
||||
**Reference**: See [Implementation Guide](../implementation-guide.md#caching) for cache implementation patterns.
|
||||
|
||||
---
|
||||
|
||||
## Database Schema
|
||||
|
||||
```sql
|
||||
-- Creation order matters: a table must exist before another table can
-- reference it. release.targets references both release.target_groups and
-- release.agents, so those two are created first.

-- Environments
CREATE TABLE release.environments (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    tenant_id UUID NOT NULL REFERENCES tenants(id) ON DELETE CASCADE,
    name VARCHAR(100) NOT NULL,
    display_name VARCHAR(255) NOT NULL,
    order_index INTEGER NOT NULL,
    config JSONB NOT NULL DEFAULT '{}',
    freeze_windows JSONB NOT NULL DEFAULT '[]',
    required_approvals INTEGER NOT NULL DEFAULT 0,
    require_sod BOOLEAN NOT NULL DEFAULT FALSE,
    auto_promote_from UUID REFERENCES release.environments(id),
    promotion_policy VARCHAR(255),
    deployment_timeout INTEGER NOT NULL DEFAULT 600,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    UNIQUE (tenant_id, name)
);

CREATE INDEX idx_environments_tenant ON release.environments(tenant_id);
CREATE INDEX idx_environments_order ON release.environments(tenant_id, order_index);

-- Target Groups
CREATE TABLE release.target_groups (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    tenant_id UUID NOT NULL REFERENCES tenants(id) ON DELETE CASCADE,
    environment_id UUID NOT NULL REFERENCES release.environments(id) ON DELETE CASCADE,
    name VARCHAR(255) NOT NULL,
    labels JSONB NOT NULL DEFAULT '{}',
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    UNIQUE (tenant_id, environment_id, name)
);

-- Agents (created before targets because targets.agent_id references this table)
CREATE TABLE release.agents (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    tenant_id UUID NOT NULL REFERENCES tenants(id) ON DELETE CASCADE,
    name VARCHAR(255) NOT NULL,
    version VARCHAR(50) NOT NULL,
    capabilities JSONB NOT NULL DEFAULT '[]',
    labels JSONB NOT NULL DEFAULT '{}',
    status VARCHAR(50) NOT NULL DEFAULT 'offline',
    last_heartbeat TIMESTAMPTZ,
    resource_usage JSONB,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    UNIQUE (tenant_id, name)
);

CREATE INDEX idx_agents_tenant ON release.agents(tenant_id);
CREATE INDEX idx_agents_status ON release.agents(status);
CREATE INDEX idx_agents_capabilities ON release.agents USING GIN (capabilities);

-- Targets
CREATE TABLE release.targets (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    tenant_id UUID NOT NULL REFERENCES tenants(id) ON DELETE CASCADE,
    environment_id UUID NOT NULL REFERENCES release.environments(id) ON DELETE CASCADE,
    target_group_id UUID REFERENCES release.target_groups(id),
    name VARCHAR(255) NOT NULL,
    target_type VARCHAR(100) NOT NULL,
    connection JSONB NOT NULL,
    capabilities JSONB NOT NULL DEFAULT '[]',
    labels JSONB NOT NULL DEFAULT '{}',
    deployment_directory VARCHAR(500),
    health_status VARCHAR(50) NOT NULL DEFAULT 'unknown',
    last_health_check TIMESTAMPTZ,
    current_digest VARCHAR(100),
    agent_id UUID REFERENCES release.agents(id),
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    UNIQUE (tenant_id, environment_id, name)
);

CREATE INDEX idx_targets_tenant_env ON release.targets(tenant_id, environment_id);
CREATE INDEX idx_targets_type ON release.targets(target_type);
CREATE INDEX idx_targets_labels ON release.targets USING GIN (labels);
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## API Endpoints
|
||||
|
||||
```yaml
|
||||
# Environments
|
||||
POST /api/v1/environments
|
||||
GET /api/v1/environments
|
||||
GET /api/v1/environments/{id}
|
||||
PUT /api/v1/environments/{id}
|
||||
DELETE /api/v1/environments/{id}
|
||||
|
||||
# Freeze Windows
|
||||
POST /api/v1/environments/{envId}/freeze-windows
|
||||
GET /api/v1/environments/{envId}/freeze-windows
|
||||
DELETE /api/v1/environments/{envId}/freeze-windows/{windowId}
|
||||
|
||||
# Target Groups
|
||||
POST /api/v1/environments/{envId}/target-groups
|
||||
GET /api/v1/environments/{envId}/target-groups
|
||||
GET /api/v1/target-groups/{id}
|
||||
PUT /api/v1/target-groups/{id}
|
||||
DELETE /api/v1/target-groups/{id}
|
||||
|
||||
# Targets
|
||||
POST /api/v1/targets
|
||||
GET /api/v1/targets
|
||||
GET /api/v1/targets/{id}
|
||||
PUT /api/v1/targets/{id}
|
||||
DELETE /api/v1/targets/{id}
|
||||
POST /api/v1/targets/{id}/health-check
|
||||
GET /api/v1/targets/{id}/sticker
|
||||
GET /api/v1/targets/{id}/drift
|
||||
|
||||
# Agents
|
||||
POST /api/v1/admin/agent-tokens
POST /api/v1/agents/register
POST /api/v1/agents/token
|
||||
GET /api/v1/agents
|
||||
GET /api/v1/agents/{id}
|
||||
PUT /api/v1/agents/{id}
|
||||
DELETE /api/v1/agents/{id}
|
||||
POST /api/v1/agents/{id}/heartbeat
|
||||
POST /api/v1/agents/{id}/tasks/{taskId}/complete
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## References
|
||||
|
||||
- [Module Overview](overview.md)
|
||||
- [Agent Specification](agents.md)
|
||||
- [API Documentation](../api/environments.md)
|
||||
- [Agent Security](../security/agent-security.md)
|
||||
Reference in New Issue
Block a user