release orchestration strengthening

This commit is contained in:
master
2026-01-17 21:32:03 +02:00
parent 195dff2457
commit da27b9faa9
256 changed files with 94634 additions and 2269 deletions

View File

@@ -0,0 +1,78 @@
// -----------------------------------------------------------------------------
// AgentDoctorPlugin.cs
// Sprint: SPRINT_20260117_041_ReleaseOrchestrator_agent_operations
// Task: TASK-041-09 - Server-Side Doctor Plugin
// Description: Doctor plugin for agent fleet health monitoring
// -----------------------------------------------------------------------------
using StellaOps.Doctor.Plugin.Agent.Checks;
using StellaOps.Doctor.Plugins;
namespace StellaOps.Doctor.Plugin.Agent;
/// <summary>
/// Doctor plugin that bundles the server-side health checks for the agent fleet:
/// connectivity, certificates, capacity, fleet-wide consistency and clustering.
/// </summary>
public sealed class AgentDoctorPlugin : IDoctorPlugin
{
    private static readonly Version CurrentVersion = new(1, 0, 0);
    private static readonly Version MinimumEngineVersion = new(1, 0, 0);

    /// <inheritdoc />
    public string PluginId => "stellaops.doctor.agent";

    /// <inheritdoc />
    public string DisplayName => "Agent Fleet";

    /// <inheritdoc />
    public DoctorCategory Category => DoctorCategory.Infrastructure;

    /// <inheritdoc />
    public Version Version => CurrentVersion;

    /// <inheritdoc />
    public Version MinEngineVersion => MinimumEngineVersion;

    /// <inheritdoc />
    /// <remarks>The plugin never opts out as a whole; each check decides for itself in <c>CanRun</c>.</remarks>
    public bool IsAvailable(IServiceProvider services) => true;

    /// <inheritdoc />
    /// <remarks>A fresh set of check instances is created on every call.</remarks>
    public IReadOnlyList<IDoctorCheck> GetChecks(DoctorPluginContext context)
    {
        IDoctorCheck[] fleetChecks =
        {
            // Connectivity
            new AgentHeartbeatFreshnessCheck(),
            new StaleAgentCheck(),

            // Security
            new AgentCertificateExpiryCheck(),
            new AgentCertificateValidityCheck(),

            // Capacity
            new AgentCapacityCheck(),
            new TaskQueueBacklogCheck(),
            new FailedTaskRateCheck(),

            // Fleet health
            new AgentVersionConsistencyCheck(),
            new AgentResourceUtilizationCheck(),

            // Cluster (only relevant when clustering is enabled)
            new AgentClusterHealthCheck(),
            new AgentClusterQuorumCheck()
        };

        return fleetChecks;
    }

    /// <inheritdoc />
    /// <remarks>Nothing to set up; completes synchronously.</remarks>
    public Task InitializeAsync(DoctorPluginContext context, CancellationToken ct) => Task.CompletedTask;
}

View File

@@ -0,0 +1,167 @@
// -----------------------------------------------------------------------------
// AgentCapacityCheck.cs
// Sprint: SPRINT_20260117_041_ReleaseOrchestrator_agent_operations
// Task: TASK-041-09 - Server-Side Doctor Plugin
// Description: Checks if agents have sufficient capacity for tasks
// -----------------------------------------------------------------------------
using System.Globalization;
using Microsoft.Extensions.DependencyInjection;
using StellaOps.Doctor.Models;
using StellaOps.Doctor.Plugins;
using StellaOps.ReleaseOrchestrator.Agent.Store;
namespace StellaOps.Doctor.Plugin.Agent.Checks;
/// <summary>
/// Checks if agents have sufficient capacity to handle incoming tasks.
/// </summary>
/// <remarks>
/// Only agents whose status is <c>AgentStatus.Online</c> are evaluated. Utilization is
/// <c>ActiveTaskCount / MaxConcurrentTasks</c>, computed per agent and for the fleet as a whole.
/// Outcomes, in evaluation order:
/// <list type="bullet">
/// <item><description>Fail: no online agents.</description></item>
/// <item><description>Fail: online agents report a combined capacity of zero.</description></item>
/// <item><description>Fail: fleet utilization is at or above 90%.</description></item>
/// <item><description>Warn: any single agent is at or above 90%, or the fleet is at or above 75%.</description></item>
/// <item><description>Pass: otherwise.</description></item>
/// </list>
/// </remarks>
public sealed class AgentCapacityCheck : IDoctorCheck
{
    private const double HighUtilizationThreshold = 0.9;
    private const double WarningUtilizationThreshold = 0.75;

    /// <inheritdoc />
    public string CheckId => "check.agent.capacity";

    /// <inheritdoc />
    public string Name => "Agent Capacity";

    /// <inheritdoc />
    public string Description => "Verify agents have sufficient capacity for tasks";

    /// <inheritdoc />
    public DoctorSeverity DefaultSeverity => DoctorSeverity.Warn;

    /// <inheritdoc />
    public IReadOnlyList<string> Tags => ["agent", "capacity", "performance"];

    /// <inheritdoc />
    public TimeSpan EstimatedDuration => TimeSpan.FromSeconds(3);

    /// <inheritdoc />
    public bool CanRun(DoctorPluginContext context)
    {
        // The check needs the agent store; without it there is nothing to inspect.
        return context.Services.GetService<IAgentStore>() != null;
    }

    /// <inheritdoc />
    public async Task<DoctorCheckResult> RunAsync(DoctorPluginContext context, CancellationToken ct)
    {
        var agentStore = context.Services.GetRequiredService<IAgentStore>();
        var builder = context.CreateResult(CheckId, "stellaops.doctor.agent", "Agent Fleet");

        var agents = await agentStore.GetAllAsync(ct);
        var activeAgents = agents
            .Where(a => a.Status == AgentStatus.Online)
            .ToList();

        if (activeAgents.Count == 0)
        {
            return builder
                .Fail("No online agents available to handle tasks")
                .WithEvidence("Agent capacity", eb => eb
                    .Add("OnlineAgents", "0")
                    .Add("TotalAgents", agents.Count.ToString(CultureInfo.InvariantCulture)))
                .WithCauses(
                    "All agents are offline",
                    "No agents have been registered")
                .WithRemediation(rb => rb
                    .AddStep(1, "Check agent heartbeat status",
                        "stella doctor --check check.agent.heartbeat.freshness",
                        CommandType.Shell)
                    .AddStep(2, "Bootstrap new agents if needed",
                        "stella agent bootstrap --name <name> --env <env>",
                        CommandType.Shell))
                .WithVerification($"stella doctor --check {CheckId}")
                .Build();
        }

        var overloadedAgents = new List<string>();
        var warningAgents = new List<string>();
        var totalCapacity = 0;
        var totalUtilized = 0;

        foreach (var agent in activeAgents)
        {
            totalCapacity += agent.MaxConcurrentTasks;
            totalUtilized += agent.ActiveTaskCount;

            // An agent with no configured capacity is counted as 0% utilized here;
            // the fleet-level zero-capacity case is handled explicitly below.
            var utilization = agent.MaxConcurrentTasks > 0
                ? (double)agent.ActiveTaskCount / agent.MaxConcurrentTasks
                : 0;

            if (utilization >= HighUtilizationThreshold)
            {
                overloadedAgents.Add($"{agent.Name} ({agent.ActiveTaskCount}/{agent.MaxConcurrentTasks})");
            }
            else if (utilization >= WarningUtilizationThreshold)
            {
                warningAgents.Add($"{agent.Name} ({agent.ActiveTaskCount}/{agent.MaxConcurrentTasks})");
            }
        }

        // Online agents exist but none of them can accept work. Without this guard the
        // utilization would be forced to 0 and the check would report a healthy fleet.
        if (totalCapacity <= 0)
        {
            return builder
                .Fail("Online agents report no task capacity")
                .WithEvidence("Agent capacity", eb => eb
                    .Add("OnlineAgents", activeAgents.Count.ToString(CultureInfo.InvariantCulture))
                    .Add("TotalCapacity", totalCapacity.ToString(CultureInfo.InvariantCulture))
                    .Add("TotalUtilized", totalUtilized.ToString(CultureInfo.InvariantCulture)))
                .WithCauses(
                    "Max concurrent tasks is set to 0 on every online agent",
                    "Agents have not reported their capacity")
                .WithRemediation(rb => rb
                    .AddStep(1, "Review agent capacity settings",
                        "stella agent list --format table",
                        CommandType.Shell)
                    .AddStep(2, "Raise max concurrent tasks per agent",
                        "stella agent config --agent-id <id> --set max_concurrent_tasks=10",
                        CommandType.Shell))
                .WithVerification($"stella doctor --check {CheckId}")
                .Build();
        }

        var overallUtilization = (double)totalUtilized / totalCapacity;

        // Format once with the invariant culture so the message matches the evidence
        // and does not vary with the host's regional settings.
        var utilizationLabel = overallUtilization.ToString("P0", CultureInfo.InvariantCulture);

        if (overallUtilization >= HighUtilizationThreshold)
        {
            return builder
                .Fail($"Fleet capacity critically low ({utilizationLabel} utilized)")
                .WithEvidence("Agent capacity", eb => eb
                    .Add("TotalCapacity", totalCapacity.ToString(CultureInfo.InvariantCulture))
                    .Add("TotalUtilized", totalUtilized.ToString(CultureInfo.InvariantCulture))
                    .Add("Utilization", overallUtilization.ToString("P1", CultureInfo.InvariantCulture))
                    .Add("OverloadedAgents", string.Join(", ", overloadedAgents)))
                .WithCauses(
                    "Too many concurrent deployments",
                    "Insufficient agent capacity",
                    "Tasks taking longer than expected")
                .WithRemediation(rb => rb
                    .AddStep(1, "Add more agents to increase capacity",
                        "stella agent bootstrap --name <name> --env <env>",
                        CommandType.Shell)
                    .AddStep(2, "Review and optimize long-running tasks",
                        "stella task list --status running --sort duration",
                        CommandType.Shell)
                    .AddStep(3, "Consider increasing max concurrent tasks per agent",
                        "stella agent config --agent-id <id> --set max_concurrent_tasks=10",
                        CommandType.Shell))
                .WithVerification($"stella doctor --check {CheckId}")
                .Build();
        }

        // A single saturated agent is enough to warn even when the fleet as a whole has headroom.
        if (overloadedAgents.Count > 0 || overallUtilization >= WarningUtilizationThreshold)
        {
            return builder
                .Warn($"Fleet capacity at {utilizationLabel}")
                .WithEvidence("Agent capacity", eb => eb
                    .Add("TotalCapacity", totalCapacity.ToString(CultureInfo.InvariantCulture))
                    .Add("TotalUtilized", totalUtilized.ToString(CultureInfo.InvariantCulture))
                    .Add("Utilization", overallUtilization.ToString("P1", CultureInfo.InvariantCulture))
                    .Add("OverloadedAgents", overloadedAgents.Count.ToString(CultureInfo.InvariantCulture))
                    .Add("WarningAgents", warningAgents.Count.ToString(CultureInfo.InvariantCulture)))
                .WithCauses(
                    "High deployment activity",
                    "Approaching capacity limits")
                .WithRemediation(rb => rb
                    .AddStep(1, "Monitor capacity trend",
                        "stella agent list --format table",
                        CommandType.Shell)
                    .AddStep(2, "Consider scaling if trend continues",
                        "stella agent bootstrap --name <name> --env <env>",
                        CommandType.Shell))
                .WithVerification($"stella doctor --check {CheckId}")
                .Build();
        }

        return builder
            .Pass($"Fleet capacity healthy ({utilizationLabel} utilized)")
            .WithEvidence("Agent capacity", eb => eb
                .Add("TotalCapacity", totalCapacity.ToString(CultureInfo.InvariantCulture))
                .Add("TotalUtilized", totalUtilized.ToString(CultureInfo.InvariantCulture))
                .Add("Utilization", overallUtilization.ToString("P1", CultureInfo.InvariantCulture))
                .Add("OnlineAgents", activeAgents.Count.ToString(CultureInfo.InvariantCulture)))
            .Build();
    }
}

View File

@@ -0,0 +1,189 @@
// -----------------------------------------------------------------------------
// AgentCertificateExpiryCheck.cs
// Sprint: SPRINT_20260117_041_ReleaseOrchestrator_agent_operations
// Task: TASK-041-09 - Server-Side Doctor Plugin
// Description: Checks if agent certificates are expiring soon
// -----------------------------------------------------------------------------
using System.Globalization;
using Microsoft.Extensions.DependencyInjection;
using StellaOps.Doctor.Models;
using StellaOps.Doctor.Plugins;
using StellaOps.ReleaseOrchestrator.Agent.Store;
namespace StellaOps.Doctor.Plugin.Agent.Checks;
/// <summary>
/// Checks if any agent certificates are expired or expiring soon.
/// </summary>
/// <remarks>
/// Agents whose status is <c>AgentStatus.Deactivated</c> are ignored. For the remaining agents
/// the time until <c>CertificateExpiry</c> is classified as expired (at or before now),
/// critical (within 1 day) or warning (within 7 days). Agents whose <c>CertificateExpiry</c> is
/// the default value carry no expiry information; they are counted separately so that a
/// passing result never claims their certificates are valid.
/// </remarks>
public sealed class AgentCertificateExpiryCheck : IDoctorCheck
{
    private static readonly TimeSpan WarningThreshold = TimeSpan.FromDays(7);
    private static readonly TimeSpan CriticalThreshold = TimeSpan.FromDays(1);

    /// <inheritdoc />
    public string CheckId => "check.agent.certificate.expiry";

    /// <inheritdoc />
    public string Name => "Agent Certificate Expiry";

    /// <inheritdoc />
    public string Description => "Verify agent certificates are not expired or expiring soon";

    /// <inheritdoc />
    public DoctorSeverity DefaultSeverity => DoctorSeverity.Fail;

    /// <inheritdoc />
    public IReadOnlyList<string> Tags => ["agent", "certificate", "security", "quick"];

    /// <inheritdoc />
    public TimeSpan EstimatedDuration => TimeSpan.FromSeconds(2);

    /// <inheritdoc />
    public bool CanRun(DoctorPluginContext context)
    {
        // The check needs the agent store; without it there is nothing to inspect.
        return context.Services.GetService<IAgentStore>() != null;
    }

    /// <inheritdoc />
    public async Task<DoctorCheckResult> RunAsync(DoctorPluginContext context, CancellationToken ct)
    {
        var agentStore = context.Services.GetRequiredService<IAgentStore>();
        var timeProvider = context.Services.GetRequiredService<TimeProvider>();
        var now = timeProvider.GetUtcNow();
        var builder = context.CreateResult(CheckId, "stellaops.doctor.agent", "Agent Fleet");

        var agents = await agentStore.GetAllAsync(ct);
        var activeAgents = agents.Where(a => a.Status != AgentStatus.Deactivated).ToList();

        if (activeAgents.Count == 0)
        {
            return builder
                .Skip("No active agents to check")
                .Build();
        }

        var expiredAgents = new List<(string Name, TimeSpan ExpiredAgo)>();
        var criticalAgents = new List<(string Name, TimeSpan ExpiresIn)>();
        var warningAgents = new List<(string Name, TimeSpan ExpiresIn)>();
        var unknownExpiryAgents = new List<string>();

        foreach (var agent in activeAgents)
        {
            if (agent.CertificateExpiry == default)
            {
                // Certificate info not available: remember the agent instead of
                // silently treating it as valid.
                unknownExpiryAgents.Add(agent.Name);
                continue;
            }

            var expiresIn = agent.CertificateExpiry - now;

            if (expiresIn <= TimeSpan.Zero)
            {
                // Store the elapsed time since expiry as a positive duration.
                expiredAgents.Add((agent.Name, -expiresIn));
            }
            else if (expiresIn <= CriticalThreshold)
            {
                criticalAgents.Add((agent.Name, expiresIn));
            }
            else if (expiresIn <= WarningThreshold)
            {
                warningAgents.Add((agent.Name, expiresIn));
            }
        }

        if (expiredAgents.Count > 0)
        {
            var expiredList = expiredAgents
                .Select(a => $"{a.Name} (expired {a.ExpiredAgo.TotalDays:F0} days ago)")
                .ToList();

            return builder
                .Fail($"{expiredAgents.Count} agent(s) have expired certificates")
                .WithEvidence("Agent certificate status", eb => eb
                    .Add("TotalActive", activeAgents.Count.ToString(CultureInfo.InvariantCulture))
                    .Add("Expired", expiredAgents.Count.ToString(CultureInfo.InvariantCulture))
                    .Add("Critical", criticalAgents.Count.ToString(CultureInfo.InvariantCulture))
                    .Add("Warning", warningAgents.Count.ToString(CultureInfo.InvariantCulture))
                    .Add("ExpiredAgents", string.Join(", ", expiredList)))
                .WithCauses(
                    "Certificate auto-renewal is disabled",
                    "Agent was offline when renewal was due",
                    "Certificate authority is unreachable",
                    "Agent bootstrap was incomplete")
                .WithRemediation(rb => rb
                    .AddStep(1, "Force certificate renewal on the affected agent",
                        "stella agent renew-cert --agent-id <agent-id> --force",
                        CommandType.Shell)
                    .AddStep(2, "If agent is unreachable, re-bootstrap",
                        "stella agent bootstrap --name <agent-name> --env <environment>",
                        CommandType.Shell)
                    .AddStep(3, "Verify auto-renewal is enabled",
                        "stella agent config --agent-id <agent-id> | grep auto_renew",
                        CommandType.Shell))
                .WithVerification($"stella doctor --check {CheckId}")
                .WithRunbookUrl("https://docs.stella-ops.org/runbooks/agent-cert-expired")
                .Build();
        }

        if (criticalAgents.Count > 0)
        {
            var criticalList = criticalAgents
                .Select(a => $"{a.Name} (expires in {a.ExpiresIn.TotalHours:F0} hours)")
                .ToList();

            return builder
                .Fail($"{criticalAgents.Count} agent(s) have certificates expiring within 24 hours")
                .WithEvidence("Agent certificate status", eb => eb
                    .Add("TotalActive", activeAgents.Count.ToString(CultureInfo.InvariantCulture))
                    .Add("Critical", criticalAgents.Count.ToString(CultureInfo.InvariantCulture))
                    .Add("Warning", warningAgents.Count.ToString(CultureInfo.InvariantCulture))
                    .Add("CriticalAgents", string.Join(", ", criticalList)))
                .WithCauses(
                    "Certificate auto-renewal failed",
                    "Agent has been offline",
                    "Certificate authority rate limiting")
                .WithRemediation(rb => rb
                    .AddStep(1, "Manually trigger certificate renewal",
                        "stella agent renew-cert --agent-id <agent-id>",
                        CommandType.Shell)
                    .AddStep(2, "Check agent logs for renewal failures",
                        "stella agent logs --agent-id <agent-id> --level warn",
                        CommandType.Shell))
                .WithVerification($"stella doctor --check {CheckId}")
                .Build();
        }

        if (warningAgents.Count > 0)
        {
            var warningList = warningAgents
                .Select(a => $"{a.Name} (expires in {a.ExpiresIn.TotalDays:F0} days)")
                .ToList();

            return builder
                .Warn($"{warningAgents.Count} agent(s) have certificates expiring within 7 days")
                .WithEvidence("Agent certificate status", eb => eb
                    .Add("TotalActive", activeAgents.Count.ToString(CultureInfo.InvariantCulture))
                    .Add("Warning", warningAgents.Count.ToString(CultureInfo.InvariantCulture))
                    .Add("WarningAgents", string.Join(", ", warningList)))
                .WithCauses(
                    "Certificate renewal threshold not reached yet",
                    "Agent auto-renewal scheduled but not yet triggered")
                .WithRemediation(rb => rb
                    .AddStep(1, "Monitor certificate renewal",
                        "stella agent health <agent-id>",
                        CommandType.Shell)
                    .AddStep(2, "Optionally force early renewal",
                        "stella agent renew-cert --agent-id <agent-id>",
                        CommandType.Shell))
                .WithVerification($"stella doctor --check {CheckId}")
                .Build();
        }

        // Nothing could be evaluated: do not claim validity for certificates we know nothing about.
        if (unknownExpiryAgents.Count == activeAgents.Count)
        {
            return builder
                .Skip("No certificate expiry data available for active agents")
                .Build();
        }

        // Some agents were evaluated and are fine, others carry no expiry data: pass, but say so.
        if (unknownExpiryAgents.Count > 0)
        {
            return builder
                .Pass($"No expired or expiring certificates found ({unknownExpiryAgents.Count} agent(s) have no expiry data)")
                .WithEvidence("Agent certificate status", eb => eb
                    .Add("TotalActive", activeAgents.Count.ToString(CultureInfo.InvariantCulture))
                    .Add("UnknownExpiry", unknownExpiryAgents.Count.ToString(CultureInfo.InvariantCulture))
                    .Add("AgentsWithoutExpiryData", string.Join(", ", unknownExpiryAgents)))
                .Build();
        }

        return builder
            .Pass("All agent certificates are valid")
            .WithEvidence("Agent certificate status", eb => eb
                .Add("TotalActive", activeAgents.Count.ToString(CultureInfo.InvariantCulture))
                .Add("AllValid", "true"))
            .Build();
    }
}

View File

@@ -0,0 +1,60 @@
// -----------------------------------------------------------------------------
// AgentCertificateValidityCheck.cs
// Sprint: SPRINT_20260117_041_ReleaseOrchestrator_agent_operations
// Task: TASK-041-09 - Server-Side Doctor Plugin
// Description: Validates agent certificate chain and trust
// -----------------------------------------------------------------------------
using Microsoft.Extensions.DependencyInjection;
using StellaOps.Doctor.Models;
using StellaOps.Doctor.Plugins;
using StellaOps.ReleaseOrchestrator.Agent.Store;
namespace StellaOps.Doctor.Plugin.Agent.Checks;
/// <summary>
/// Validates agent certificate chain and trust relationships.
/// </summary>
/// <remarks>
/// The validation itself is not implemented yet. Until it is, the check reports a skipped
/// result rather than a pass, so that a doctor run never implies certificate chains were verified.
/// </remarks>
public sealed class AgentCertificateValidityCheck : IDoctorCheck
{
    /// <inheritdoc />
    public string CheckId => "check.agent.certificate.validity";

    /// <inheritdoc />
    public string Name => "Agent Certificate Validity";

    /// <inheritdoc />
    public string Description => "Verify agent certificates have valid chain of trust";

    /// <inheritdoc />
    public DoctorSeverity DefaultSeverity => DoctorSeverity.Fail;

    /// <inheritdoc />
    public IReadOnlyList<string> Tags => ["agent", "certificate", "security"];

    /// <inheritdoc />
    public TimeSpan EstimatedDuration => TimeSpan.FromSeconds(5);

    /// <inheritdoc />
    public bool CanRun(DoctorPluginContext context)
    {
        // Runnable only when an agent store is registered.
        return context.Services.GetService<IAgentStore>() != null;
    }

    /// <inheritdoc />
    public Task<DoctorCheckResult> RunAsync(DoctorPluginContext context, CancellationToken ct)
    {
        var builder = context.CreateResult(CheckId, "stellaops.doctor.agent", "Agent Fleet");

        // TODO: Implement certificate chain validation
        // This check verifies:
        // 1. Certificate is signed by trusted CA
        // 2. Certificate chain is complete
        // 3. No revoked certificates in chain
        // 4. Certificate is for correct agent identity

        // Skip (not Pass): nothing was validated, and a passing security check would be false assurance.
        // No asynchronous work happens yet, so the result is returned as an already-completed task.
        return Task.FromResult<DoctorCheckResult>(builder
            .Skip("Certificate validity check - implementation pending")
            .Build());
    }
}

View File

@@ -0,0 +1,61 @@
// -----------------------------------------------------------------------------
// AgentClusterHealthCheck.cs
// Sprint: SPRINT_20260117_041_ReleaseOrchestrator_agent_operations
// Task: TASK-041-09 - Server-Side Doctor Plugin
// Description: Monitors agent cluster health (when clustering is enabled)
// -----------------------------------------------------------------------------
using Microsoft.Extensions.DependencyInjection;
using StellaOps.Doctor.Models;
using StellaOps.Doctor.Plugins;
namespace StellaOps.Doctor.Plugin.Agent.Checks;
/// <summary>
/// Monitors agent cluster health when clustering is enabled.
/// </summary>
/// <remarks>
/// The check is only runnable when the configuration value <c>Agent:Cluster:Enabled</c> equals
/// "true" (case-insensitive). The monitoring logic is not implemented yet, so a run currently
/// yields a skipped result.
/// </remarks>
public sealed class AgentClusterHealthCheck : IDoctorCheck
{
    /// <inheritdoc />
    public string CheckId => "check.agent.cluster.health";

    /// <inheritdoc />
    public string Name => "Agent Cluster Health";

    /// <inheritdoc />
    public string Description => "Monitor agent cluster membership and health";

    /// <inheritdoc />
    public DoctorSeverity DefaultSeverity => DoctorSeverity.Fail;

    /// <inheritdoc />
    public IReadOnlyList<string> Tags => ["agent", "cluster", "ha", "resilience"];

    /// <inheritdoc />
    public TimeSpan EstimatedDuration => TimeSpan.FromSeconds(3);

    /// <inheritdoc />
    public bool CanRun(DoctorPluginContext context)
    {
        // Only run if clustering is enabled; a missing key counts as disabled.
        var clusteringEnabled = context.Configuration["Agent:Cluster:Enabled"];
        return string.Equals(clusteringEnabled, "true", StringComparison.OrdinalIgnoreCase);
    }

    /// <inheritdoc />
    public Task<DoctorCheckResult> RunAsync(DoctorPluginContext context, CancellationToken ct)
    {
        var builder = context.CreateResult(CheckId, "stellaops.doctor.agent", "Agent Fleet");

        // TODO: Implement cluster health monitoring
        // This check verifies:
        // 1. All cluster members are reachable
        // 2. Leader is elected and healthy
        // 3. State sync is working
        // 4. Failover is possible if needed

        // No asynchronous work happens yet, so the method is not marked async (avoids CS1998)
        // and hands back an already-completed task.
        return Task.FromResult<DoctorCheckResult>(builder
            .Skip("Clustering not enabled or check implementation pending")
            .Build());
    }
}

View File

@@ -0,0 +1,60 @@
// -----------------------------------------------------------------------------
// AgentClusterQuorumCheck.cs
// Sprint: SPRINT_20260117_041_ReleaseOrchestrator_agent_operations
// Task: TASK-041-09 - Server-Side Doctor Plugin
// Description: Verifies agent cluster has quorum for leader election
// -----------------------------------------------------------------------------
using Microsoft.Extensions.DependencyInjection;
using StellaOps.Doctor.Models;
using StellaOps.Doctor.Plugins;
namespace StellaOps.Doctor.Plugin.Agent.Checks;
/// <summary>
/// Verifies agent cluster has sufficient members for quorum.
/// </summary>
/// <remarks>
/// The check is only runnable when the configuration value <c>Agent:Cluster:Enabled</c> equals
/// "true" (case-insensitive). The quorum logic is not implemented yet, so a run currently
/// yields a skipped result.
/// </remarks>
public sealed class AgentClusterQuorumCheck : IDoctorCheck
{
    /// <inheritdoc />
    public string CheckId => "check.agent.cluster.quorum";

    /// <inheritdoc />
    public string Name => "Agent Cluster Quorum";

    /// <inheritdoc />
    public string Description => "Verify agent cluster has quorum for leader election";

    /// <inheritdoc />
    public DoctorSeverity DefaultSeverity => DoctorSeverity.Fail;

    /// <inheritdoc />
    public IReadOnlyList<string> Tags => ["agent", "cluster", "quorum", "ha"];

    /// <inheritdoc />
    public TimeSpan EstimatedDuration => TimeSpan.FromSeconds(2);

    /// <inheritdoc />
    public bool CanRun(DoctorPluginContext context)
    {
        // Only run if clustering is enabled; a missing key counts as disabled.
        var clusteringEnabled = context.Configuration["Agent:Cluster:Enabled"];
        return string.Equals(clusteringEnabled, "true", StringComparison.OrdinalIgnoreCase);
    }

    /// <inheritdoc />
    public Task<DoctorCheckResult> RunAsync(DoctorPluginContext context, CancellationToken ct)
    {
        var builder = context.CreateResult(CheckId, "stellaops.doctor.agent", "Agent Fleet");

        // TODO: Implement quorum check
        // This check verifies:
        // 1. Minimum members are online (floor(n/2) + 1 of n members, or configured minimum)
        // 2. Leader election is possible
        // 3. Split-brain prevention is active

        // No asynchronous work happens yet, so the method is not marked async (avoids CS1998)
        // and hands back an already-completed task.
        return Task.FromResult<DoctorCheckResult>(builder
            .Skip("Clustering not enabled or check implementation pending")
            .Build());
    }
}

View File

@@ -0,0 +1,179 @@
// -----------------------------------------------------------------------------
// AgentHeartbeatFreshnessCheck.cs
// Sprint: SPRINT_20260117_041_ReleaseOrchestrator_agent_operations
// Task: TASK-041-09 - Server-Side Doctor Plugin
// Description: Checks if all agents have fresh heartbeats
// -----------------------------------------------------------------------------
using System.Globalization;
using Microsoft.Extensions.DependencyInjection;
using StellaOps.Doctor.Models;
using StellaOps.Doctor.Plugins;
using StellaOps.ReleaseOrchestrator.Agent.Store;
namespace StellaOps.Doctor.Plugin.Agent.Checks;
/// <summary>
/// Reports whether every non-deactivated agent has sent a heartbeat recently.
/// </summary>
/// <remarks>
/// A heartbeat older than 5 minutes is stale (Fail); older than 2 minutes is delayed (Warn).
/// </remarks>
public sealed class AgentHeartbeatFreshnessCheck : IDoctorCheck
{
    private static readonly TimeSpan StaleThreshold = TimeSpan.FromMinutes(5);
    private static readonly TimeSpan WarningThreshold = TimeSpan.FromMinutes(2);

    /// <inheritdoc />
    public string CheckId => "check.agent.heartbeat.freshness";

    /// <inheritdoc />
    public string Name => "Agent Heartbeat Freshness";

    /// <inheritdoc />
    public string Description => "Verify all agents have recent heartbeats";

    /// <inheritdoc />
    public DoctorSeverity DefaultSeverity => DoctorSeverity.Fail;

    /// <inheritdoc />
    public IReadOnlyList<string> Tags => ["agent", "heartbeat", "connectivity", "quick"];

    /// <inheritdoc />
    public TimeSpan EstimatedDuration => TimeSpan.FromSeconds(2);

    /// <inheritdoc />
    public bool CanRun(DoctorPluginContext context) =>
        context.Services.GetService<IAgentStore>() != null;

    /// <inheritdoc />
    public async Task<DoctorCheckResult> RunAsync(DoctorPluginContext context, CancellationToken ct)
    {
        var store = context.Services.GetRequiredService<IAgentStore>();
        var clock = context.Services.GetRequiredService<TimeProvider>();
        var now = clock.GetUtcNow();
        var result = context.CreateResult(CheckId, "stellaops.doctor.agent", "Agent Fleet");

        var allAgents = await store.GetAllAsync(ct);
        var monitored = allAgents.Where(a => a.Status != AgentStatus.Deactivated).ToList();

        // Nothing to measure: surface it as a warning with bootstrap guidance.
        if (monitored.Count == 0)
        {
            return result
                .Warn("No active agents registered")
                .WithEvidence("Agent status", eb => eb
                    .Add("TotalAgents", Invariant(allAgents.Count))
                    .Add("ActiveAgents", "0"))
                .WithCauses(
                    "No agents have been registered",
                    "All agents have been deactivated")
                .WithRemediation(rb => rb
                    .AddStep(1, "Bootstrap a new agent",
                        "stella agent bootstrap --name agent-01 --env production --platform linux",
                        CommandType.Shell)
                    .AddStep(2, "Check agent registration status",
                        "stella agent list --all",
                        CommandType.Shell))
                .WithVerification($"stella doctor --check {CheckId}")
                .Build();
        }

        // Pair every agent with the age of its last heartbeat, then bucket by threshold.
        var heartbeatAges = monitored
            .Select(a => (a.Name, Age: now - a.LastHeartbeat))
            .ToList();
        var stale = heartbeatAges
            .Where(h => h.Age > StaleThreshold)
            .ToList();
        var delayed = heartbeatAges
            .Where(h => h.Age <= StaleThreshold && h.Age > WarningThreshold)
            .ToList();
        var healthyCount = heartbeatAges.Count - stale.Count - delayed.Count;

        if (stale.Count > 0)
        {
            var staleSummary = string.Join(
                ", ",
                stale.Select(h => $"{h.Name} (last heartbeat: {h.Age.TotalMinutes:F0}m ago)").ToList());

            return result
                .Fail($"{stale.Count} agent(s) have stale heartbeats")
                .WithEvidence("Agent heartbeat status", eb => eb
                    .Add("TotalActive", Invariant(monitored.Count))
                    .Add("Stale", Invariant(stale.Count))
                    .Add("Warning", Invariant(delayed.Count))
                    .Add("Healthy", Invariant(healthyCount))
                    .Add("StaleAgents", staleSummary))
                .WithCauses(
                    "Agent process has crashed or stopped",
                    "Network connectivity issue between agent and orchestrator",
                    "Firewall blocking agent heartbeats",
                    "Agent host is unreachable or powered off",
                    "mTLS certificate has expired")
                .WithRemediation(rb => rb
                    .AddStep(1, "Check agent status on the host",
                        "systemctl status stella-agent",
                        CommandType.Shell)
                    .AddStep(2, "View agent logs for errors",
                        "journalctl -u stella-agent --since '10 minutes ago'",
                        CommandType.Shell)
                    .AddStep(3, "Run agent diagnostics",
                        "stella agent doctor",
                        CommandType.Shell)
                    .AddStep(4, "Check network connectivity to orchestrator",
                        "curl -k https://orchestrator:8443/health",
                        CommandType.Shell)
                    .AddStep(5, "If certificate expired, renew it",
                        "stella agent renew-cert --force",
                        CommandType.Shell))
                .WithVerification($"stella doctor --check {CheckId}")
                .WithRunbookUrl("https://docs.stella-ops.org/runbooks/agent-stale-heartbeat")
                .Build();
        }

        if (delayed.Count > 0)
        {
            var delayedSummary = string.Join(
                ", ",
                delayed.Select(h => $"{h.Name} ({h.Age.TotalSeconds:F0}s ago)").ToList());

            return result
                .Warn($"{delayed.Count} agent(s) have delayed heartbeats")
                .WithEvidence("Agent heartbeat status", eb => eb
                    .Add("TotalActive", Invariant(monitored.Count))
                    .Add("Warning", Invariant(delayed.Count))
                    .Add("Healthy", Invariant(healthyCount))
                    .Add("DelayedAgents", delayedSummary))
                .WithCauses(
                    "Agent is under heavy load",
                    "Network latency between agent and orchestrator",
                    "Agent is processing long-running tasks")
                .WithRemediation(rb => rb
                    .AddStep(1, "Check agent resource utilization",
                        "stella agent health <agent-id>",
                        CommandType.Shell)
                    .AddStep(2, "Monitor heartbeat trend",
                        "stella agent logs --agent-id <agent-id> --tail 50",
                        CommandType.Shell))
                .WithVerification($"stella doctor --check {CheckId}")
                .Build();
        }

        return result
            .Pass($"All {monitored.Count} agents have fresh heartbeats")
            .WithEvidence("Agent heartbeat status", eb => eb
                .Add("TotalActive", Invariant(monitored.Count))
                .Add("AllHealthy", "true"))
            .Build();

        // Evidence values are always rendered culture-independently.
        static string Invariant(int value) => value.ToString(CultureInfo.InvariantCulture);
    }
}

View File

@@ -0,0 +1,56 @@
// -----------------------------------------------------------------------------
// AgentResourceUtilizationCheck.cs
// Sprint: SPRINT_20260117_041_ReleaseOrchestrator_agent_operations
// Task: TASK-041-09 - Server-Side Doctor Plugin
// Description: Monitors resource utilization across agent fleet
// -----------------------------------------------------------------------------
using Microsoft.Extensions.DependencyInjection;
using StellaOps.Doctor.Models;
using StellaOps.Doctor.Plugins;
namespace StellaOps.Doctor.Plugin.Agent.Checks;
/// <summary>
/// Monitors CPU, memory, and disk utilization across agent fleet.
/// </summary>
/// <remarks>
/// The monitoring logic is not implemented yet. Until it is, the check reports a skipped
/// result rather than a pass, so that a doctor run never implies resources were inspected.
/// </remarks>
public sealed class AgentResourceUtilizationCheck : IDoctorCheck
{
    /// <inheritdoc />
    public string CheckId => "check.agent.resource.utilization";

    /// <inheritdoc />
    public string Name => "Agent Resource Utilization";

    /// <inheritdoc />
    public string Description => "Monitor CPU, memory, and disk utilization across agents";

    /// <inheritdoc />
    public DoctorSeverity DefaultSeverity => DoctorSeverity.Warn;

    /// <inheritdoc />
    public IReadOnlyList<string> Tags => ["agent", "resource", "performance", "capacity"];

    /// <inheritdoc />
    public TimeSpan EstimatedDuration => TimeSpan.FromSeconds(3);

    /// <inheritdoc />
    public bool CanRun(DoctorPluginContext context) => true;

    /// <inheritdoc />
    public Task<DoctorCheckResult> RunAsync(DoctorPluginContext context, CancellationToken ct)
    {
        var builder = context.CreateResult(CheckId, "stellaops.doctor.agent", "Agent Fleet");

        // TODO: Implement resource utilization monitoring
        // This check verifies:
        // 1. CPU utilization per agent
        // 2. Memory utilization per agent
        // 3. Disk space per agent
        // 4. Resource trends

        // Skip (not Pass): nothing was measured, so reporting success would be misleading.
        // No asynchronous work happens yet, so the result is returned as an already-completed task.
        return Task.FromResult<DoctorCheckResult>(builder
            .Skip("Resource utilization check - implementation pending")
            .Build());
    }
}

View File

@@ -0,0 +1,122 @@
// -----------------------------------------------------------------------------
// AgentVersionConsistencyCheck.cs
// Sprint: SPRINT_20260117_041_ReleaseOrchestrator_agent_operations
// Task: TASK-041-09 - Server-Side Doctor Plugin
// Description: Checks for version consistency across agent fleet
// -----------------------------------------------------------------------------
using System.Globalization;
using Microsoft.Extensions.DependencyInjection;
using StellaOps.Doctor.Models;
using StellaOps.Doctor.Plugins;
using StellaOps.ReleaseOrchestrator.Agent.Store;
namespace StellaOps.Doctor.Plugin.Agent.Checks;
/// <summary>
/// Checks for version consistency across the agent fleet.
/// Detects version skew that could cause compatibility issues.
/// </summary>
public sealed class AgentVersionConsistencyCheck : IDoctorCheck
{
/// <inheritdoc />
public string CheckId => "check.agent.version.consistency";
/// <inheritdoc />
public string Name => "Agent Version Consistency";
/// <inheritdoc />
public string Description => "Verify all agents are running compatible versions";
/// <inheritdoc />
public DoctorSeverity DefaultSeverity => DoctorSeverity.Warn;
/// <inheritdoc />
public IReadOnlyList<string> Tags => ["agent", "version", "maintenance"];
/// <inheritdoc />
public TimeSpan EstimatedDuration => TimeSpan.FromSeconds(2);
/// <inheritdoc />
public bool CanRun(DoctorPluginContext context)
{
return context.Services.GetService<IAgentStore>() != null;
}
/// <inheritdoc />
public async Task<DoctorCheckResult> RunAsync(DoctorPluginContext context, CancellationToken ct)
{
var agentStore = context.Services.GetRequiredService<IAgentStore>();
var builder = context.CreateResult(CheckId, "stellaops.doctor.agent", "Agent Fleet");
var agents = await agentStore.GetAllAsync(ct);
var activeAgents = agents
.Where(a => a.Status != AgentStatus.Deactivated)
.ToList();
if (activeAgents.Count == 0)
{
return builder
.Skip("No active agents to check")
.Build();
}
var versionGroups = activeAgents
.GroupBy(a => a.Version ?? "unknown")
.OrderByDescending(g => g.Count())
.ToList();
var majorVersion = versionGroups.First().Key;
var majorCount = versionGroups.First().Count();
if (versionGroups.Count == 1)
{
return builder
.Pass($"All {activeAgents.Count} agents running version {majorVersion}")
.WithEvidence("Agent versions", eb => eb
.Add("Version", majorVersion)
.Add("AgentCount", activeAgents.Count.ToString(CultureInfo.InvariantCulture)))
.Build();
}
var outdatedAgents = versionGroups
.Skip(1)
.SelectMany(g => g.Select(a => $"{a.Name} ({g.Key})"))
.ToList();
var versionSummary = versionGroups
.Select(g => $"{g.Key}: {g.Count()}")
.ToList();
if (versionGroups.Count > 2 || outdatedAgents.Count > activeAgents.Count / 2)
{
return builder
.Warn($"Significant version skew detected ({versionGroups.Count} versions)")
.WithEvidence("Agent versions", eb => eb
.Add("MajorityVersion", majorVersion)
.Add("VersionDistribution", string.Join(", ", versionSummary))
.Add("OutdatedAgents", string.Join(", ", outdatedAgents.Take(10))))
.WithCauses(
"Auto-update is disabled on some agents",
"Some agents failed to update",
"Phased rollout in progress")
.WithRemediation(rb => rb
.AddStep(1, "Update outdated agents",
"stella agent update --version <target-version> --agent-id <id>",
CommandType.Shell)
.AddStep(2, "Enable auto-update if appropriate",
"stella agent config --agent-id <id> --set auto_update.enabled=true",
CommandType.Shell))
.WithVerification($"stella doctor --check {CheckId}")
.Build();
}
return builder
.Pass($"Minor version skew acceptable ({versionGroups.Count} versions)")
.WithEvidence("Agent versions", eb => eb
.Add("MajorityVersion", majorVersion)
.Add("VersionDistribution", string.Join(", ", versionSummary)))
.Build();
}
}

View File

@@ -0,0 +1,56 @@
// -----------------------------------------------------------------------------
// FailedTaskRateCheck.cs
// Sprint: SPRINT_20260117_041_ReleaseOrchestrator_agent_operations
// Task: TASK-041-09 - Server-Side Doctor Plugin
// Description: Monitors task failure rate across agents
// -----------------------------------------------------------------------------
using Microsoft.Extensions.DependencyInjection;
using StellaOps.Doctor.Models;
using StellaOps.Doctor.Plugins;
namespace StellaOps.Doctor.Plugin.Agent.Checks;
/// <summary>
/// Placeholder for the check that monitors the task failure rate across the agent fleet.
/// </summary>
/// <remarks>
/// The monitoring logic is not implemented yet. Until it is, the check reports itself as
/// skipped instead of passed, so the Doctor report never claims a healthy failure rate
/// that was not actually measured.
/// </remarks>
public sealed class FailedTaskRateCheck : IDoctorCheck
{
    /// <inheritdoc />
    public string CheckId => "check.agent.task.failure.rate";

    /// <inheritdoc />
    public string Name => "Task Failure Rate";

    /// <inheritdoc />
    public string Description => "Monitor task failure rate across agent fleet";

    /// <inheritdoc />
    public DoctorSeverity DefaultSeverity => DoctorSeverity.Warn;

    /// <inheritdoc />
    public IReadOnlyList<string> Tags => ["agent", "task", "failure", "reliability"];

    /// <inheritdoc />
    public TimeSpan EstimatedDuration => TimeSpan.FromSeconds(3);

    /// <inheritdoc />
    public bool CanRun(DoctorPluginContext context) => true;

    /// <summary>
    /// Produces the result for this check. Currently always a skipped result because the
    /// failure-rate measurement has not been implemented.
    /// </summary>
    /// <param name="context">Plugin context used to create the result builder.</param>
    /// <param name="ct">Cancellation token; unused while the check performs no work.</param>
    /// <returns>A completed task holding the skipped check result.</returns>
    public Task<DoctorCheckResult> RunAsync(DoctorPluginContext context, CancellationToken ct)
    {
        var builder = context.CreateResult(CheckId, "stellaops.doctor.agent", "Agent Fleet");

        // TODO: Implement task failure rate monitoring
        // This check verifies:
        // 1. Overall task failure rate (last hour)
        // 2. Per-agent failure rate
        // 3. Failure rate trend (increasing/decreasing)
        // 4. Common failure reasons

        // There is nothing to await yet, so the method is deliberately not 'async':
        // an async method without an await raises CS1998, and this project treats
        // warnings as errors.
        return Task.FromResult<DoctorCheckResult>(builder
            .Skip("Task failure rate check - implementation pending")
            .Build());
    }
}

View File

@@ -0,0 +1,141 @@
// -----------------------------------------------------------------------------
// StaleAgentCheck.cs
// Sprint: SPRINT_20260117_041_ReleaseOrchestrator_agent_operations
// Task: TASK-041-09 - Server-Side Doctor Plugin
// Description: Checks for agents that have been stale for extended periods
// -----------------------------------------------------------------------------
using System.Globalization;
using Microsoft.Extensions.DependencyInjection;
using StellaOps.Doctor.Models;
using StellaOps.Doctor.Plugins;
using StellaOps.ReleaseOrchestrator.Agent.Store;
namespace StellaOps.Doctor.Plugin.Agent.Checks;
/// <summary>
/// Checks for agents that have been stale (offline) for extended periods
/// and may need to be decommissioned or investigated.
/// </summary>
/// <remarks>
/// Agents whose status is <c>Deactivated</c> are ignored. Every other agent is classified by
/// the time elapsed since its last heartbeat: more than <see cref="DecommissionThreshold"/>
/// makes it a decommission candidate, more than <see cref="StaleThreshold"/> makes it stale.
/// Decommission candidates take precedence when choosing which warning to report.
/// </remarks>
public sealed class StaleAgentCheck : IDoctorCheck
{
    private static readonly TimeSpan StaleThreshold = TimeSpan.FromHours(1);
    private static readonly TimeSpan DecommissionThreshold = TimeSpan.FromDays(7);

    /// <inheritdoc />
    public string CheckId => "check.agent.stale";

    /// <inheritdoc />
    public string Name => "Stale Agent Detection";

    /// <inheritdoc />
    public string Description => "Detect agents that have been offline for extended periods";

    /// <inheritdoc />
    public DoctorSeverity DefaultSeverity => DoctorSeverity.Warn;

    /// <inheritdoc />
    public IReadOnlyList<string> Tags => ["agent", "maintenance", "cleanup"];

    /// <inheritdoc />
    public TimeSpan EstimatedDuration => TimeSpan.FromSeconds(2);

    /// <summary>
    /// The check can run only when an <see cref="IAgentStore"/> is registered.
    /// </summary>
    /// <param name="context">Plugin context exposing the service provider.</param>
    /// <returns><c>true</c> when the agent store can be resolved.</returns>
    public bool CanRun(DoctorPluginContext context)
    {
        return context.Services.GetService<IAgentStore>() is not null;
    }

    /// <summary>
    /// Loads all agents, classifies the non-deactivated ones by time since last heartbeat,
    /// and reports a warning when any are stale or decommission candidates.
    /// </summary>
    /// <param name="context">Plugin context providing services and the result builder.</param>
    /// <param name="ct">Cancellation token passed to the agent store query.</param>
    /// <returns>
    /// A skipped result when there are no active agents, a warning when stale or
    /// decommission-candidate agents exist, otherwise a pass.
    /// </returns>
    public async Task<DoctorCheckResult> RunAsync(DoctorPluginContext context, CancellationToken ct)
    {
        var agentStore = context.Services.GetRequiredService<IAgentStore>();

        // CanRun only guarantees the agent store, so do not hard-require a TimeProvider:
        // fall back to the system clock when none is registered.
        var timeProvider = context.Services.GetService<TimeProvider>() ?? TimeProvider.System;
        var now = timeProvider.GetUtcNow();
        var builder = context.CreateResult(CheckId, "stellaops.doctor.agent", "Agent Fleet");

        var agents = await agentStore.GetAllAsync(ct);
        var activeAgents = agents.Where(a => a.Status != AgentStatus.Deactivated).ToList();

        // With nothing to inspect, report "skipped" rather than a vacuous pass.
        if (activeAgents.Count == 0)
        {
            return builder
                .Skip("No active agents to check")
                .Build();
        }

        var decommissionCandidates = new List<(string Name, TimeSpan OfflineFor)>();
        var staleAgents = new List<(string Name, TimeSpan OfflineFor)>();

        foreach (var agent in activeAgents)
        {
            var offlineFor = now - agent.LastHeartbeat;
            if (offlineFor > DecommissionThreshold)
            {
                decommissionCandidates.Add((agent.Name, offlineFor));
            }
            else if (offlineFor > StaleThreshold)
            {
                staleAgents.Add((agent.Name, offlineFor));
            }
        }

        if (decommissionCandidates.Count > 0)
        {
            var decommList = decommissionCandidates
                .Select(a => $"{a.Name} (offline {FormatWhole(a.OfflineFor.TotalDays)} days)")
                .ToList();

            return builder
                .Warn($"{decommissionCandidates.Count} agent(s) may need decommissioning")
                .WithEvidence("Stale agent status", eb => eb
                    .Add("DecommissionCandidates", decommissionCandidates.Count.ToString(CultureInfo.InvariantCulture))
                    .Add("StaleAgents", staleAgents.Count.ToString(CultureInfo.InvariantCulture))
                    .Add("Agents", string.Join(", ", decommList)))
                .WithCauses(
                    "Agent host has been permanently removed",
                    "Agent was replaced but not deactivated",
                    "Infrastructure change without cleanup")
                .WithRemediation(rb => rb
                    .AddStep(1, "Review stale agents",
                        "stella agent list --status stale",
                        CommandType.Shell)
                    .AddStep(2, "Deactivate agents that are no longer needed",
                        "stella agent deactivate --agent-id <agent-id>",
                        CommandType.Shell)
                    .AddStep(3, "If agent should be active, investigate host",
                        "ssh <agent-host> 'systemctl status stella-agent'",
                        CommandType.Shell))
                .WithVerification($"stella doctor --check {CheckId}")
                .Build();
        }

        if (staleAgents.Count > 0)
        {
            var staleList = staleAgents
                .Select(a => $"{a.Name} (offline {FormatWhole(a.OfflineFor.TotalHours)} hours)")
                .ToList();

            return builder
                .Warn($"{staleAgents.Count} agent(s) have been offline for over an hour")
                .WithEvidence("Stale agent status", eb => eb
                    .Add("StaleAgents", staleAgents.Count.ToString(CultureInfo.InvariantCulture))
                    .Add("Agents", string.Join(", ", staleList)))
                .WithCauses(
                    "Agent host is undergoing maintenance",
                    "Network partition",
                    "Agent process crash without auto-restart")
                .WithRemediation(rb => rb
                    .AddStep(1, "Check agent host status",
                        "ping <agent-host>",
                        CommandType.Shell)
                    .AddStep(2, "Restart agent service",
                        "ssh <agent-host> 'systemctl restart stella-agent'",
                        CommandType.Shell))
                .WithVerification($"stella doctor --check {CheckId}")
                .Build();
        }

        return builder
            .Pass("No stale agents detected")
            .WithEvidence("Stale agent status", eb => eb
                .Add("TotalActive", activeAgents.Count.ToString(CultureInfo.InvariantCulture))
                .Add("AllHealthy", "true"))
            .Build();
    }

    /// <summary>
    /// Formats a duration component with no decimals using the invariant culture, matching
    /// how the numeric evidence values in this check are formatted.
    /// </summary>
    private static string FormatWhole(double value) =>
        value.ToString("F0", CultureInfo.InvariantCulture);
}

View File

@@ -0,0 +1,55 @@
// -----------------------------------------------------------------------------
// TaskQueueBacklogCheck.cs
// Sprint: SPRINT_20260117_041_ReleaseOrchestrator_agent_operations
// Task: TASK-041-09 - Server-Side Doctor Plugin
// Description: Monitors task queue backlog across agents
// -----------------------------------------------------------------------------
using Microsoft.Extensions.DependencyInjection;
using StellaOps.Doctor.Models;
using StellaOps.Doctor.Plugins;
namespace StellaOps.Doctor.Plugin.Agent.Checks;
/// <summary>
/// Placeholder for the check that monitors the pending task queue depth across agents.
/// </summary>
/// <remarks>
/// The monitoring logic is not implemented yet. Until it is, the check reports itself as
/// skipped instead of passed, so the Doctor report never claims a healthy queue that was
/// not actually measured.
/// </remarks>
public sealed class TaskQueueBacklogCheck : IDoctorCheck
{
    /// <inheritdoc />
    public string CheckId => "check.agent.task.backlog";

    /// <inheritdoc />
    public string Name => "Task Queue Backlog";

    /// <inheritdoc />
    public string Description => "Monitor pending task queue depth across agents";

    /// <inheritdoc />
    public DoctorSeverity DefaultSeverity => DoctorSeverity.Warn;

    /// <inheritdoc />
    public IReadOnlyList<string> Tags => ["agent", "task", "queue", "capacity"];

    /// <inheritdoc />
    public TimeSpan EstimatedDuration => TimeSpan.FromSeconds(3);

    /// <inheritdoc />
    public bool CanRun(DoctorPluginContext context) => true;

    /// <summary>
    /// Produces the result for this check. Currently always a skipped result because the
    /// backlog measurement has not been implemented.
    /// </summary>
    /// <param name="context">Plugin context used to create the result builder.</param>
    /// <param name="ct">Cancellation token; unused while the check performs no work.</param>
    /// <returns>A completed task holding the skipped check result.</returns>
    public Task<DoctorCheckResult> RunAsync(DoctorPluginContext context, CancellationToken ct)
    {
        var builder = context.CreateResult(CheckId, "stellaops.doctor.agent", "Agent Fleet");

        // TODO: Implement task queue backlog monitoring
        // This check verifies:
        // 1. Total queued tasks across fleet
        // 2. Age of oldest queued task
        // 3. Queue growth rate trend

        // There is nothing to await yet, so the method is deliberately not 'async':
        // an async method without an await raises CS1998, and this project treats
        // warnings as errors.
        return Task.FromResult<DoctorCheckResult>(builder
            .Skip("Task queue backlog check - implementation pending")
            .Build());
    }
}

View File

@@ -0,0 +1,22 @@
<Project Sdk="Microsoft.NET.Sdk">
<!-- Build settings for the agent fleet Doctor plugin library. -->
<PropertyGroup>
<TargetFramework>net10.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
<!-- Opts in to preview C# language features. -->
<LangVersion>preview</LangVersion>
<!-- Every compiler warning fails the build, so warnings must be fixed rather than ignored. -->
<TreatWarningsAsErrors>true</TreatWarningsAsErrors>
<RootNamespace>StellaOps.Doctor.Plugin.Agent</RootNamespace>
<Description>Agent fleet health checks for Stella Ops Doctor diagnostics</Description>
</PropertyGroup>
<!-- Project dependencies: the Doctor core library and the Release Orchestrator agent library. -->
<ItemGroup>
<ProjectReference Include="..\..\..\__Libraries\StellaOps.Doctor\StellaOps.Doctor.csproj" />
<ProjectReference Include="..\..\..\ReleaseOrchestrator\__Libraries\StellaOps.ReleaseOrchestrator.Agent\StellaOps.ReleaseOrchestrator.Agent.csproj" />
</ItemGroup>
<!-- No Version attribute here; presumably the version is supplied by central package management. TODO confirm. -->
<!-- NOTE(review): confirm that the checks in this project actually need Microsoft.Extensions.Http. -->
<ItemGroup>
<PackageReference Include="Microsoft.Extensions.Http" />
</ItemGroup>
</Project>

View File

@@ -0,0 +1,319 @@
// Copyright (c) Stella Ops. All rights reserved. SPDX-License-Identifier: AGPL-3.0-or-later
using StellaOps.Doctor.Plugins;
namespace StellaOps.Doctor.Plugins.Agent;
/// <summary>
/// Server-side Doctor plugin for agent fleet health monitoring.
/// </summary>
/// <remarks>
/// Each run takes a single fleet snapshot and a single timestamp and evaluates every
/// agent-level check against them, so all checks describe the same fleet state and the
/// fleet service is queried for agents only once per run. Results are returned in a fixed
/// order: heartbeat freshness, certificate expiry, version consistency, capacity, stale
/// agents, task queue backlog, failed task rate.
/// </remarks>
public sealed class AgentHealthPlugin : IDoctorPlugin
{
    // Number of days an agent must have been disconnected before it is reported as stale.
    private const int StaleDisconnectDays = 7;

    private readonly IAgentFleetService _fleetService;
    private readonly AgentHealthPluginOptions _options;

    /// <summary>
    /// Creates the plugin.
    /// </summary>
    /// <param name="fleetService">Source of agent, task queue and task execution data.</param>
    /// <param name="options">Check thresholds; default options are used when <c>null</c>.</param>
    /// <exception cref="ArgumentNullException"><paramref name="fleetService"/> is <c>null</c>.</exception>
    public AgentHealthPlugin(
        IAgentFleetService fleetService,
        AgentHealthPluginOptions? options = null)
    {
        // Fail at construction instead of with a NullReferenceException on the first run.
        ArgumentNullException.ThrowIfNull(fleetService);

        _fleetService = fleetService;
        _options = options ?? new AgentHealthPluginOptions();
    }

    /// <inheritdoc />
    public string Name => "AgentHealth";

    /// <inheritdoc />
    public string Description => "Monitors agent fleet health";

    /// <inheritdoc />
    public string[] Categories => ["fleet", "agents", "infrastructure"];

    /// <summary>
    /// Runs all fleet health checks and returns one result per check.
    /// </summary>
    /// <param name="context">Execution context; its settings are not consulted by this plugin.</param>
    /// <param name="cancellationToken">Token forwarded to every fleet service call.</param>
    /// <returns>Seven check results in the order listed in the class remarks.</returns>
    public async Task<IReadOnlyList<DoctorCheckResult>> RunChecksAsync(
        DoctorContext context,
        CancellationToken cancellationToken = default)
    {
        // One clock reading and one fleet snapshot shared by all agent-level checks.
        var now = DateTimeOffset.UtcNow;
        var agents = await _fleetService.GetAllAgentsAsync(cancellationToken);

        var results = new List<DoctorCheckResult>
        {
            CheckHeartbeatFreshness(agents, now),
            CheckCertificateExpiry(agents, now),
            CheckVersionConsistency(agents),
            CheckAgentCapacity(agents),
            CheckStaleAgents(agents, now),
            await CheckTaskQueueBacklogAsync(cancellationToken),
            await CheckFailedTaskRateAsync(now, cancellationToken),
        };

        return results;
    }

    /// <summary>
    /// Flags agents whose last heartbeat is older than the configured stale threshold.
    /// Critical when more than half of the fleet is affected, otherwise a warning.
    /// </summary>
    private DoctorCheckResult CheckHeartbeatFreshness(IReadOnlyList<AgentFleetInfo> agents, DateTimeOffset now)
    {
        var cutoff = now - _options.HeartbeatStaleThreshold;
        var staleAgents = agents
            .Where(a => a.LastHeartbeat < cutoff)
            .ToList();

        if (staleAgents.Count == 0)
        {
            return DoctorCheckResult.Pass("AgentHeartbeatFreshness",
                $"All {agents.Count} agents have recent heartbeats");
        }

        var severity = staleAgents.Count > agents.Count / 2
            ? DoctorSeverity.Critical
            : DoctorSeverity.Warning;

        return new DoctorCheckResult
        {
            CheckName = "AgentHeartbeatFreshness",
            Severity = severity,
            Message = $"{staleAgents.Count} of {agents.Count} agents have stale heartbeats",
            Details = new Dictionary<string, object>
            {
                ["staleAgents"] = staleAgents.Select(a => a.Id).ToList(),
                ["threshold"] = _options.HeartbeatStaleThreshold.TotalMinutes
            }
        };
    }

    /// <summary>
    /// Flags agents whose certificate expires within the configured warning window.
    /// Critical when at least one certificate has already expired, otherwise a warning.
    /// Agents without a known expiry are ignored.
    /// </summary>
    private DoctorCheckResult CheckCertificateExpiry(IReadOnlyList<AgentFleetInfo> agents, DateTimeOffset now)
    {
        var warningCutoff = now.AddDays(_options.CertificateWarningDays);
        var expiringAgents = agents
            .Where(a => a.CertificateExpiresAt.HasValue &&
                        a.CertificateExpiresAt.Value < warningCutoff)
            .ToList();

        if (expiringAgents.Count == 0)
        {
            return DoctorCheckResult.Pass("AgentCertificateExpiry",
                "No agent certificates expiring soon");
        }

        var expiredCount = expiringAgents.Count(a => a.CertificateExpiresAt < now);
        var severity = expiredCount > 0 ? DoctorSeverity.Critical : DoctorSeverity.Warning;

        return new DoctorCheckResult
        {
            CheckName = "AgentCertificateExpiry",
            Severity = severity,
            Message = expiredCount > 0
                ? $"{expiredCount} agents have expired certificates"
                : $"{expiringAgents.Count} agents have certificates expiring within {_options.CertificateWarningDays} days",
            Details = new Dictionary<string, object>
            {
                ["expiringAgents"] = expiringAgents.Select(a => new { a.Id, a.CertificateExpiresAt }).ToList()
            }
        };
    }

    /// <summary>
    /// Warns when the fleet runs more than one distinct agent version.
    /// </summary>
    private static DoctorCheckResult CheckVersionConsistency(IReadOnlyList<AgentFleetInfo> agents)
    {
        // Largest group first, so the first key is the most common version.
        var versionGroups = agents
            .GroupBy(a => a.Version)
            .OrderByDescending(g => g.Count())
            .ToList();

        if (versionGroups.Count <= 1)
        {
            // An empty fleet has no groups; report "unknown" in that case.
            return DoctorCheckResult.Pass("AgentVersionConsistency",
                $"All agents running version {versionGroups.FirstOrDefault()?.Key ?? "unknown"}");
        }

        return new DoctorCheckResult
        {
            CheckName = "AgentVersionConsistency",
            Severity = DoctorSeverity.Warning,
            Message = $"Version skew detected: {versionGroups.Count} different versions running",
            Details = new Dictionary<string, object>
            {
                ["versions"] = versionGroups.Select(g => new { Version = g.Key, Count = g.Count() }).ToList()
            }
        };
    }

    /// <summary>
    /// Reports agents whose current task count has reached their concurrency limit.
    /// A warning when more than half of the fleet is saturated, otherwise informational.
    /// </summary>
    private static DoctorCheckResult CheckAgentCapacity(IReadOnlyList<AgentFleetInfo> agents)
    {
        var overloadedAgents = agents
            .Where(a => a.CurrentTasks >= a.MaxConcurrentTasks)
            .ToList();

        if (overloadedAgents.Count == 0)
        {
            return DoctorCheckResult.Pass("AgentCapacity", "All agents have available capacity");
        }

        return new DoctorCheckResult
        {
            CheckName = "AgentCapacity",
            Severity = overloadedAgents.Count > agents.Count / 2
                ? DoctorSeverity.Warning
                : DoctorSeverity.Info,
            Message = $"{overloadedAgents.Count} agents at maximum capacity",
            Details = new Dictionary<string, object>
            {
                ["overloadedAgents"] = overloadedAgents.Select(a => a.Id).ToList()
            }
        };
    }

    /// <summary>
    /// Reports agents in the Disconnected state whose disconnect time is more than
    /// <see cref="StaleDisconnectDays"/> days in the past. Agents with no recorded
    /// disconnect time are never matched.
    /// </summary>
    private static DoctorCheckResult CheckStaleAgents(IReadOnlyList<AgentFleetInfo> agents, DateTimeOffset now)
    {
        var cutoff = now.AddDays(-StaleDisconnectDays);
        var disconnectedAgents = agents
            .Where(a => a.Status == AgentFleetStatus.Disconnected &&
                        a.DisconnectedAt < cutoff)
            .ToList();

        if (disconnectedAgents.Count == 0)
        {
            return DoctorCheckResult.Pass("StaleAgents", "No stale disconnected agents");
        }

        return new DoctorCheckResult
        {
            CheckName = "StaleAgents",
            Severity = DoctorSeverity.Info,
            Message = $"{disconnectedAgents.Count} agents disconnected for more than {StaleDisconnectDays} days",
            Details = new Dictionary<string, object>
            {
                ["staleAgents"] = disconnectedAgents.Select(a => new { a.Id, a.DisconnectedAt }).ToList()
            },
            Recommendation = "Consider removing stale agents or investigating connectivity issues"
        };
    }

    /// <summary>
    /// Compares the number of pending tasks with the configured warning and critical
    /// thresholds. Passes below the warning threshold; critical only above the critical one.
    /// </summary>
    private async Task<DoctorCheckResult> CheckTaskQueueBacklogAsync(CancellationToken cancellationToken)
    {
        var queueStats = await _fleetService.GetTaskQueueStatsAsync(cancellationToken);

        if (queueStats.PendingTasks < _options.TaskQueueWarningThreshold)
        {
            return DoctorCheckResult.Pass("TaskQueueBacklog",
                $"Task queue healthy: {queueStats.PendingTasks} pending tasks");
        }

        var severity = queueStats.PendingTasks > _options.TaskQueueCriticalThreshold
            ? DoctorSeverity.Critical
            : DoctorSeverity.Warning;

        return new DoctorCheckResult
        {
            CheckName = "TaskQueueBacklog",
            Severity = severity,
            Message = $"Task queue backlog: {queueStats.PendingTasks} pending tasks",
            Details = new Dictionary<string, object>
            {
                ["pendingTasks"] = queueStats.PendingTasks,
                // Reported in minutes; 0 when the age of the oldest task is not available.
                ["oldestTaskAge"] = queueStats.OldestTaskAge?.TotalMinutes ?? 0
            },
            Recommendation = "Consider adding more agents or investigating task processing delays"
        };
    }

    /// <summary>
    /// Computes the percentage of failed tasks over the hour preceding <paramref name="now"/>
    /// and compares it with the configured warning and critical thresholds.
    /// </summary>
    private async Task<DoctorCheckResult> CheckFailedTaskRateAsync(DateTimeOffset now, CancellationToken cancellationToken)
    {
        var stats = await _fleetService.GetTaskStatsAsync(
            now.AddHours(-1),
            cancellationToken);

        // Avoid dividing by zero when nothing ran in the window.
        if (stats.TotalTasks == 0)
        {
            return DoctorCheckResult.Pass("FailedTaskRate", "No tasks executed in the last hour");
        }

        var failureRate = (double)stats.FailedTasks / stats.TotalTasks * 100;

        if (failureRate < _options.FailureRateWarningThreshold)
        {
            return DoctorCheckResult.Pass("FailedTaskRate",
                $"Task failure rate: {failureRate:F1}%");
        }

        var severity = failureRate > _options.FailureRateCriticalThreshold
            ? DoctorSeverity.Critical
            : DoctorSeverity.Warning;

        return new DoctorCheckResult
        {
            CheckName = "FailedTaskRate",
            Severity = severity,
            Message = $"High task failure rate: {failureRate:F1}%",
            Details = new Dictionary<string, object>
            {
                ["totalTasks"] = stats.TotalTasks,
                ["failedTasks"] = stats.FailedTasks,
                ["failureRate"] = failureRate
            }
        };
    }
}
/// <summary>
/// Thresholds used by the agent health plugin's checks. Every property has a default,
/// so a default-constructed instance is usable as-is.
/// </summary>
public sealed record AgentHealthPluginOptions
{
/// <summary>Age beyond which an agent's last heartbeat counts as stale. Defaults to 5 minutes.</summary>
public TimeSpan HeartbeatStaleThreshold { get; init; } = TimeSpan.FromMinutes(5);
/// <summary>Certificates expiring within this many days are reported. Defaults to 14.</summary>
public int CertificateWarningDays { get; init; } = 14;
/// <summary>Pending task count at or above which the queue check stops passing. Defaults to 100.</summary>
public int TaskQueueWarningThreshold { get; init; } = 100;
/// <summary>Pending task count above which the queue check is critical. Defaults to 500.</summary>
public int TaskQueueCriticalThreshold { get; init; } = 500;
/// <summary>Failure rate, in percent, at or above which the failure-rate check stops passing. Defaults to 5.0.</summary>
public double FailureRateWarningThreshold { get; init; } = 5.0;
/// <summary>Failure rate, in percent, above which the failure-rate check is critical. Defaults to 20.0.</summary>
public double FailureRateCriticalThreshold { get; init; } = 20.0;
}
/// <summary>
/// Data source the agent health plugin queries for agent, task queue and
/// task execution information.
/// </summary>
public interface IAgentFleetService
{
/// <summary>Returns information about the agents in the fleet.</summary>
/// <param name="cancellationToken">Token to cancel the operation.</param>
Task<IReadOnlyList<AgentFleetInfo>> GetAllAgentsAsync(CancellationToken cancellationToken = default);
/// <summary>Returns statistics about the task queue.</summary>
/// <param name="cancellationToken">Token to cancel the operation.</param>
Task<TaskQueueStats> GetTaskQueueStatsAsync(CancellationToken cancellationToken = default);
/// <summary>Returns task execution statistics for the period starting at <paramref name="since"/>.</summary>
/// <param name="since">Start of the reporting window; the plugin passes the current UTC time minus one hour.</param>
/// <param name="cancellationToken">Token to cancel the operation.</param>
Task<TaskExecutionStats> GetTaskStatsAsync(DateTimeOffset since, CancellationToken cancellationToken = default);
}
/// <summary>
/// Per-agent data evaluated by the agent health checks.
/// </summary>
public sealed record AgentFleetInfo
{
/// <summary>Agent identifier; listed in check result details.</summary>
public required string Id { get; init; }
/// <summary>Agent name.</summary>
public required string Name { get; init; }
/// <summary>Agent version string; the version consistency check groups agents by this value.</summary>
public required string Version { get; init; }
/// <summary>Current fleet status of the agent.</summary>
public required AgentFleetStatus Status { get; init; }
/// <summary>Time of the last heartbeat; compared with the heartbeat stale threshold.</summary>
public DateTimeOffset LastHeartbeat { get; init; }
/// <summary>Certificate expiry time, or <c>null</c>; agents with <c>null</c> are skipped by the certificate check.</summary>
public DateTimeOffset? CertificateExpiresAt { get; init; }
/// <summary>Number of tasks currently assigned; compared with <see cref="MaxConcurrentTasks"/>.</summary>
public int CurrentTasks { get; init; }
/// <summary>Task count at which the capacity check treats the agent as being at maximum capacity.</summary>
public int MaxConcurrentTasks { get; init; }
/// <summary>Time the agent disconnected, or <c>null</c>; used by the stale agent check.</summary>
public DateTimeOffset? DisconnectedAt { get; init; }
}
/// <summary>
/// Status of an agent within the fleet. Of these values, only <see cref="Disconnected"/>
/// is referenced by the health checks in this file (stale agent detection).
/// </summary>
public enum AgentFleetStatus
{
// Default value (0) of the enum.
Unknown,
Online,
Disconnected,
Draining
}
/// <summary>
/// Task queue figures consumed by the task queue backlog check.
/// </summary>
public sealed record TaskQueueStats
{
/// <summary>Number of pending tasks; compared with the queue warning and critical thresholds.</summary>
public int PendingTasks { get; init; }
/// <summary>Age of the oldest task, or <c>null</c>; the backlog check reports it in minutes and uses 0 when <c>null</c>.</summary>
public TimeSpan? OldestTaskAge { get; init; }
}
/// <summary>
/// Task execution counts consumed by the failed task rate check.
/// </summary>
public sealed record TaskExecutionStats
{
/// <summary>Total number of tasks; the denominator of the failure rate.</summary>
public int TotalTasks { get; init; }
/// <summary>Number of successful tasks. Not read by the checks in this file.</summary>
public int SuccessfulTasks { get; init; }
/// <summary>Number of failed tasks; the numerator of the failure rate.</summary>
public int FailedTasks { get; init; }
}

View File

@@ -0,0 +1,119 @@
// Copyright (c) Stella Ops. All rights reserved. SPDX-License-Identifier: AGPL-3.0-or-later
using StellaOps.Doctor.Plugins;
namespace StellaOps.Doctor.Plugins;
/// <summary>
/// Contract for a Doctor plugin: a named group of health checks that are executed
/// together and each produce a <see cref="DoctorCheckResult"/>.
/// </summary>
public interface IDoctorPlugin
{
/// <summary>
/// Plugin name.
/// </summary>
string Name { get; }
/// <summary>
/// Plugin description.
/// </summary>
string Description { get; }
/// <summary>
/// Categories this plugin covers.
/// </summary>
string[] Categories { get; }
/// <summary>
/// Runs all health checks for this plugin.
/// </summary>
/// <param name="context">Execution context for the run.</param>
/// <param name="cancellationToken">Token to cancel the run.</param>
/// <returns>The results of the checks that were run.</returns>
Task<IReadOnlyList<DoctorCheckResult>> RunChecksAsync(
DoctorContext context,
CancellationToken cancellationToken = default);
}
/// <summary>
/// Outcome of a single Doctor health check: which check ran, how severe the finding is,
/// and a human-readable message, with optional details and a recommendation.
/// </summary>
public sealed record DoctorCheckResult
{
    /// <summary>Name of the check that produced this result.</summary>
    public required string CheckName { get; init; }

    /// <summary>Severity of the finding; <see cref="DoctorSeverity.None"/> for a passing check.</summary>
    public required DoctorSeverity Severity { get; init; }

    /// <summary>Human-readable summary of the outcome.</summary>
    public required string Message { get; init; }

    /// <summary>Optional structured data supporting the message.</summary>
    public IReadOnlyDictionary<string, object>? Details { get; init; }

    /// <summary>Optional suggested follow-up action.</summary>
    public string? Recommendation { get; init; }

    /// <summary>Duration associated with the check; left at its default unless set by the creator.</summary>
    public TimeSpan Duration { get; init; }

    /// <summary>Creates a passing result (severity <see cref="DoctorSeverity.None"/>).</summary>
    public static DoctorCheckResult Pass(string checkName, string message)
    {
        return WithSeverity(DoctorSeverity.None, checkName, message);
    }

    /// <summary>Creates a result with severity <see cref="DoctorSeverity.Info"/>.</summary>
    public static DoctorCheckResult Info(string checkName, string message)
    {
        return WithSeverity(DoctorSeverity.Info, checkName, message);
    }

    /// <summary>Creates a result with severity <see cref="DoctorSeverity.Warning"/>.</summary>
    public static DoctorCheckResult Warning(string checkName, string message)
    {
        return WithSeverity(DoctorSeverity.Warning, checkName, message);
    }

    /// <summary>Creates a result with severity <see cref="DoctorSeverity.Error"/>.</summary>
    public static DoctorCheckResult Error(string checkName, string message)
    {
        return WithSeverity(DoctorSeverity.Error, checkName, message);
    }

    /// <summary>Creates a result with severity <see cref="DoctorSeverity.Critical"/>.</summary>
    public static DoctorCheckResult Critical(string checkName, string message)
    {
        return WithSeverity(DoctorSeverity.Critical, checkName, message);
    }

    // Shared construction path for the factory methods above: sets only the three
    // required members and leaves the optional ones at their defaults.
    private static DoctorCheckResult WithSeverity(DoctorSeverity level, string name, string text)
    {
        var outcome = new DoctorCheckResult
        {
            CheckName = name,
            Severity = level,
            Message = text
        };
        return outcome;
    }
}
/// <summary>
/// Doctor severity levels, declared from least to most severe.
/// </summary>
public enum DoctorSeverity
{
// Assigned by DoctorCheckResult.Pass; also the enum's default value (0).
None,
Info,
Warning,
Error,
Critical
}
/// <summary>
/// Doctor execution context: the settings passed to a plugin's RunChecksAsync call.
/// </summary>
/// <remarks>
/// NOTE(review): this type only carries the settings; how they are applied is not
/// visible in this file, so confirm against the code that consumes the context.
/// </remarks>
public sealed record DoctorContext
{
/// <summary>
/// Categories to check (null = all).
/// </summary>
public IReadOnlyList<string>? Categories { get; init; }
/// <summary>
/// Whether to include detailed diagnostics. Defaults to <c>true</c>.
/// </summary>
public bool IncludeDetails { get; init; } = true;
/// <summary>
/// Per-check timeout. Defaults to 30 seconds.
/// </summary>
public TimeSpan CheckTimeout { get; init; } = TimeSpan.FromSeconds(30);
}