release orchestration strengthening
This commit is contained in:
@@ -0,0 +1,78 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// AgentDoctorPlugin.cs
|
||||
// Sprint: SPRINT_20260117_041_ReleaseOrchestrator_agent_operations
|
||||
// Task: TASK-041-09 - Server-Side Doctor Plugin
|
||||
// Description: Doctor plugin for agent fleet health monitoring
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using StellaOps.Doctor.Plugin.Agent.Checks;
|
||||
using StellaOps.Doctor.Plugins;
|
||||
|
||||
namespace StellaOps.Doctor.Plugin.Agent;
|
||||
|
||||
/// <summary>
/// Doctor plugin that bundles the health checks for the agent fleet:
/// connectivity, certificates, capacity, fleet-wide consistency, and clustering.
/// </summary>
public sealed class AgentDoctorPlugin : IDoctorPlugin
{
    // Allocated once; the Version / MinEngineVersion properties hand out these shared instances.
    private static readonly Version CurrentPluginVersion = new(1, 0, 0);
    private static readonly Version RequiredEngineVersion = new(1, 0, 0);

    /// <inheritdoc />
    public string PluginId => "stellaops.doctor.agent";

    /// <inheritdoc />
    public string DisplayName => "Agent Fleet";

    /// <inheritdoc />
    public DoctorCategory Category => DoctorCategory.Infrastructure;

    /// <inheritdoc />
    public Version Version => CurrentPluginVersion;

    /// <inheritdoc />
    public Version MinEngineVersion => RequiredEngineVersion;

    /// <inheritdoc />
    /// <remarks>
    /// Unconditionally <c>true</c>: the plugin does no gating of its own and leaves it to each
    /// check's <c>CanRun</c> to decide whether that check applies.
    /// </remarks>
    public bool IsAvailable(IServiceProvider services) => true;

    /// <inheritdoc />
    /// <remarks>Creates a fresh set of check instances on every call.</remarks>
    public IReadOnlyList<IDoctorCheck> GetChecks(DoctorPluginContext context)
    {
        IDoctorCheck[] fleetChecks =
        [
            // Connectivity
            new AgentHeartbeatFreshnessCheck(),
            new StaleAgentCheck(),

            // Security
            new AgentCertificateExpiryCheck(),
            new AgentCertificateValidityCheck(),

            // Capacity
            new AgentCapacityCheck(),
            new TaskQueueBacklogCheck(),
            new FailedTaskRateCheck(),

            // Fleet health
            new AgentVersionConsistencyCheck(),
            new AgentResourceUtilizationCheck(),

            // Cluster (only meaningful when clustering is enabled)
            new AgentClusterHealthCheck(),
            new AgentClusterQuorumCheck(),
        ];

        return fleetChecks;
    }

    /// <inheritdoc />
    /// <remarks>The plugin has nothing to set up, so an already-completed task is returned.</remarks>
    public Task InitializeAsync(DoctorPluginContext context, CancellationToken ct) => Task.CompletedTask;
}
|
||||
@@ -0,0 +1,167 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// AgentCapacityCheck.cs
|
||||
// Sprint: SPRINT_20260117_041_ReleaseOrchestrator_agent_operations
|
||||
// Task: TASK-041-09 - Server-Side Doctor Plugin
|
||||
// Description: Checks if agents have sufficient capacity for tasks
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using System.Globalization;
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using StellaOps.Doctor.Models;
|
||||
using StellaOps.Doctor.Plugins;
|
||||
using StellaOps.ReleaseOrchestrator.Agent.Store;
|
||||
|
||||
namespace StellaOps.Doctor.Plugin.Agent.Checks;
|
||||
|
||||
/// <summary>
/// Checks if agents have sufficient capacity to handle incoming tasks.
/// </summary>
/// <remarks>
/// Only agents whose status is <c>AgentStatus.Online</c> are considered. Utilization is
/// <c>ActiveTaskCount / MaxConcurrentTasks</c>, computed per agent and for the fleet as a whole.
/// <list type="bullet">
/// <item><description>Fail: no online agents, or fleet utilization at or above 90%.</description></item>
/// <item><description>Warn: at least one agent at or above 90%, or fleet utilization at or above 75%.</description></item>
/// <item><description>Pass: everything else.</description></item>
/// </list>
/// </remarks>
public sealed class AgentCapacityCheck : IDoctorCheck
{
    /// <summary>Utilization ratio at or above which an agent (or the fleet) counts as overloaded.</summary>
    private const double HighUtilizationThreshold = 0.9;

    /// <summary>Utilization ratio at or above which an agent (or the fleet) is approaching its limit.</summary>
    private const double WarningUtilizationThreshold = 0.75;

    /// <inheritdoc />
    public string CheckId => "check.agent.capacity";

    /// <inheritdoc />
    public string Name => "Agent Capacity";

    /// <inheritdoc />
    public string Description => "Verify agents have sufficient capacity for tasks";

    /// <inheritdoc />
    public DoctorSeverity DefaultSeverity => DoctorSeverity.Warn;

    /// <inheritdoc />
    public IReadOnlyList<string> Tags => ["agent", "capacity", "performance"];

    /// <inheritdoc />
    public TimeSpan EstimatedDuration => TimeSpan.FromSeconds(3);

    /// <inheritdoc />
    /// <remarks>Runnable only when an <see cref="IAgentStore"/> is registered in the service provider.</remarks>
    public bool CanRun(DoctorPluginContext context)
    {
        return context.Services.GetService<IAgentStore>() != null;
    }

    /// <inheritdoc />
    public async Task<DoctorCheckResult> RunAsync(DoctorPluginContext context, CancellationToken ct)
    {
        var agentStore = context.Services.GetRequiredService<IAgentStore>();

        var builder = context.CreateResult(CheckId, "stellaops.doctor.agent", "Agent Fleet");

        var agents = await agentStore.GetAllAsync(ct);
        var activeAgents = agents
            .Where(a => a.Status == AgentStatus.Online)
            .ToList();

        if (activeAgents.Count == 0)
        {
            return builder
                .Fail("No online agents available to handle tasks")
                .WithEvidence("Agent capacity", eb => eb
                    .Add("OnlineAgents", "0")
                    .Add("TotalAgents", agents.Count.ToString(CultureInfo.InvariantCulture)))
                .WithCauses(
                    "All agents are offline",
                    "No agents have been registered")
                .WithRemediation(rb => rb
                    .AddStep(1, "Check agent heartbeat status",
                        "stella doctor --check check.agent.heartbeat.freshness",
                        CommandType.Shell)
                    .AddStep(2, "Bootstrap new agents if needed",
                        "stella agent bootstrap --name <name> --env <env>",
                        CommandType.Shell))
                .WithVerification($"stella doctor --check {CheckId}")
                .Build();
        }

        var overloadedAgents = new List<string>();
        var warningAgents = new List<string>();
        var totalCapacity = 0;
        var totalUtilized = 0;

        foreach (var agent in activeAgents)
        {
            totalCapacity += agent.MaxConcurrentTasks;
            totalUtilized += agent.ActiveTaskCount;

            // An agent advertising no task slots is treated as 0% utilized rather than dividing by zero.
            // NOTE(review): if every online agent reports MaxConcurrentTasks <= 0 the fleet is shown as
            // healthy at 0% - confirm whether zero capacity should be reported as a failure instead.
            var utilization = agent.MaxConcurrentTasks > 0
                ? (double)agent.ActiveTaskCount / agent.MaxConcurrentTasks
                : 0;

            if (utilization < WarningUtilizationThreshold)
            {
                continue;
            }

            // Invariant culture keeps the label identical on every host locale.
            var label = string.Create(
                CultureInfo.InvariantCulture,
                $"{agent.Name} ({agent.ActiveTaskCount}/{agent.MaxConcurrentTasks})");

            if (utilization >= HighUtilizationThreshold)
            {
                overloadedAgents.Add(label);
            }
            else
            {
                warningAgents.Add(label);
            }
        }

        var overallUtilization = totalCapacity > 0 ? (double)totalUtilized / totalCapacity : 0;

        // Format everything once with the invariant culture so the headline message and the
        // evidence values always agree, whatever the host's current culture is.
        var utilizationHeadline = overallUtilization.ToString("P0", CultureInfo.InvariantCulture);
        var utilizationDetail = overallUtilization.ToString("P1", CultureInfo.InvariantCulture);
        var totalCapacityText = totalCapacity.ToString(CultureInfo.InvariantCulture);
        var totalUtilizedText = totalUtilized.ToString(CultureInfo.InvariantCulture);

        if (overallUtilization >= HighUtilizationThreshold)
        {
            return builder
                .Fail($"Fleet capacity critically low ({utilizationHeadline} utilized)")
                .WithEvidence("Agent capacity", eb => eb
                    .Add("TotalCapacity", totalCapacityText)
                    .Add("TotalUtilized", totalUtilizedText)
                    .Add("Utilization", utilizationDetail)
                    .Add("OverloadedAgents", string.Join(", ", overloadedAgents)))
                .WithCauses(
                    "Too many concurrent deployments",
                    "Insufficient agent capacity",
                    "Tasks taking longer than expected")
                .WithRemediation(rb => rb
                    .AddStep(1, "Add more agents to increase capacity",
                        "stella agent bootstrap --name <name> --env <env>",
                        CommandType.Shell)
                    .AddStep(2, "Review and optimize long-running tasks",
                        "stella task list --status running --sort duration",
                        CommandType.Shell)
                    .AddStep(3, "Consider increasing max concurrent tasks per agent",
                        "stella agent config --agent-id <id> --set max_concurrent_tasks=10",
                        CommandType.Shell))
                .WithVerification($"stella doctor --check {CheckId}")
                .Build();
        }

        // A single overloaded agent is enough to warn, even when the fleet as a whole has headroom.
        if (overloadedAgents.Count > 0 || overallUtilization >= WarningUtilizationThreshold)
        {
            return builder
                .Warn($"Fleet capacity at {utilizationHeadline}")
                .WithEvidence("Agent capacity", eb => eb
                    .Add("TotalCapacity", totalCapacityText)
                    .Add("TotalUtilized", totalUtilizedText)
                    .Add("Utilization", utilizationDetail)
                    .Add("OverloadedAgents", overloadedAgents.Count.ToString(CultureInfo.InvariantCulture))
                    .Add("WarningAgents", warningAgents.Count.ToString(CultureInfo.InvariantCulture))
                    // Name the agents as well: the counts alone do not tell an operator where to look.
                    .Add("OverloadedAgentNames", string.Join(", ", overloadedAgents))
                    .Add("WarningAgentNames", string.Join(", ", warningAgents)))
                .WithCauses(
                    "High deployment activity",
                    "Approaching capacity limits")
                .WithRemediation(rb => rb
                    .AddStep(1, "Monitor capacity trend",
                        "stella agent list --format table",
                        CommandType.Shell)
                    .AddStep(2, "Consider scaling if trend continues",
                        "stella agent bootstrap --name <name> --env <env>",
                        CommandType.Shell))
                .WithVerification($"stella doctor --check {CheckId}")
                .Build();
        }

        return builder
            .Pass($"Fleet capacity healthy ({utilizationHeadline} utilized)")
            .WithEvidence("Agent capacity", eb => eb
                .Add("TotalCapacity", totalCapacityText)
                .Add("TotalUtilized", totalUtilizedText)
                .Add("Utilization", utilizationDetail)
                .Add("OnlineAgents", activeAgents.Count.ToString(CultureInfo.InvariantCulture)))
            .Build();
    }
}
|
||||
@@ -0,0 +1,189 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// AgentCertificateExpiryCheck.cs
|
||||
// Sprint: SPRINT_20260117_041_ReleaseOrchestrator_agent_operations
|
||||
// Task: TASK-041-09 - Server-Side Doctor Plugin
|
||||
// Description: Checks if agent certificates are expiring soon
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using System.Globalization;
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using StellaOps.Doctor.Models;
|
||||
using StellaOps.Doctor.Plugins;
|
||||
using StellaOps.ReleaseOrchestrator.Agent.Store;
|
||||
|
||||
namespace StellaOps.Doctor.Plugin.Agent.Checks;
|
||||
|
||||
/// <summary>
/// Checks if any agent certificates are expired or expiring soon.
/// </summary>
/// <remarks>
/// Every agent that is not <c>AgentStatus.Deactivated</c> is inspected. Results, in priority order:
/// <list type="bullet">
/// <item><description>Skip: no active agents, or no active agent exposes a certificate expiry.</description></item>
/// <item><description>Fail: at least one certificate has already expired.</description></item>
/// <item><description>Fail: at least one certificate expires within 1 day.</description></item>
/// <item><description>Warn: at least one certificate expires within 7 days.</description></item>
/// <item><description>Pass: every inspected certificate is valid for longer than 7 days.</description></item>
/// </list>
/// Agents whose <c>CertificateExpiry</c> is the default value are counted separately and surfaced
/// in the evidence instead of being silently treated as valid.
/// </remarks>
public sealed class AgentCertificateExpiryCheck : IDoctorCheck
{
    /// <summary>Remaining lifetime at or below which a certificate triggers a warning.</summary>
    private static readonly TimeSpan WarningThreshold = TimeSpan.FromDays(7);

    /// <summary>Remaining lifetime at or below which a certificate is treated as a failure.</summary>
    private static readonly TimeSpan CriticalThreshold = TimeSpan.FromDays(1);

    /// <inheritdoc />
    public string CheckId => "check.agent.certificate.expiry";

    /// <inheritdoc />
    public string Name => "Agent Certificate Expiry";

    /// <inheritdoc />
    public string Description => "Verify agent certificates are not expired or expiring soon";

    /// <inheritdoc />
    public DoctorSeverity DefaultSeverity => DoctorSeverity.Fail;

    /// <inheritdoc />
    public IReadOnlyList<string> Tags => ["agent", "certificate", "security", "quick"];

    /// <inheritdoc />
    public TimeSpan EstimatedDuration => TimeSpan.FromSeconds(2);

    /// <inheritdoc />
    /// <remarks>Runnable only when an <see cref="IAgentStore"/> is registered in the service provider.</remarks>
    public bool CanRun(DoctorPluginContext context)
    {
        return context.Services.GetService<IAgentStore>() != null;
    }

    /// <inheritdoc />
    public async Task<DoctorCheckResult> RunAsync(DoctorPluginContext context, CancellationToken ct)
    {
        var agentStore = context.Services.GetRequiredService<IAgentStore>();
        var timeProvider = context.Services.GetRequiredService<TimeProvider>();
        var now = timeProvider.GetUtcNow();

        var builder = context.CreateResult(CheckId, "stellaops.doctor.agent", "Agent Fleet");

        var agents = await agentStore.GetAllAsync(ct);
        var activeAgents = agents.Where(a => a.Status != AgentStatus.Deactivated).ToList();

        if (activeAgents.Count == 0)
        {
            return builder
                .Skip("No active agents to check")
                .Build();
        }

        var expiredAgents = new List<(string Name, TimeSpan ExpiredAgo)>();
        var criticalAgents = new List<(string Name, TimeSpan ExpiresIn)>();
        var warningAgents = new List<(string Name, TimeSpan ExpiresIn)>();
        var agentsWithoutCertificateInfo = 0;

        foreach (var agent in activeAgents)
        {
            if (agent.CertificateExpiry == default)
            {
                // Certificate info not available: remember it so the result can say so.
                agentsWithoutCertificateInfo++;
                continue;
            }

            var expiresIn = agent.CertificateExpiry - now;

            if (expiresIn <= TimeSpan.Zero)
            {
                // Negate so the stored value is a positive "time since expiry".
                expiredAgents.Add((agent.Name, -expiresIn));
            }
            else if (expiresIn <= CriticalThreshold)
            {
                criticalAgents.Add((agent.Name, expiresIn));
            }
            else if (expiresIn <= WarningThreshold)
            {
                warningAgents.Add((agent.Name, expiresIn));
            }
        }

        var inspectedCount = activeAgents.Count - agentsWithoutCertificateInfo;

        // All numbers below are rendered with the invariant culture so output does not vary by host locale.
        var totalActiveText = activeAgents.Count.ToString(CultureInfo.InvariantCulture);
        var criticalCountText = criticalAgents.Count.ToString(CultureInfo.InvariantCulture);
        var warningCountText = warningAgents.Count.ToString(CultureInfo.InvariantCulture);
        var missingInfoText = agentsWithoutCertificateInfo.ToString(CultureInfo.InvariantCulture);

        if (expiredAgents.Count > 0)
        {
            var expiredList = expiredAgents
                .Select(a => $"{a.Name} (expired {a.ExpiredAgo.TotalDays.ToString("F0", CultureInfo.InvariantCulture)} days ago)")
                .ToList();

            return builder
                .Fail($"{expiredAgents.Count.ToString(CultureInfo.InvariantCulture)} agent(s) have expired certificates")
                .WithEvidence("Agent certificate status", eb => eb
                    .Add("TotalActive", totalActiveText)
                    .Add("Expired", expiredAgents.Count.ToString(CultureInfo.InvariantCulture))
                    .Add("Critical", criticalCountText)
                    .Add("Warning", warningCountText)
                    .Add("WithoutCertificateInfo", missingInfoText)
                    .Add("ExpiredAgents", string.Join(", ", expiredList)))
                .WithCauses(
                    "Certificate auto-renewal is disabled",
                    "Agent was offline when renewal was due",
                    "Certificate authority is unreachable",
                    "Agent bootstrap was incomplete")
                .WithRemediation(rb => rb
                    .AddStep(1, "Force certificate renewal on the affected agent",
                        "stella agent renew-cert --agent-id <agent-id> --force",
                        CommandType.Shell)
                    .AddStep(2, "If agent is unreachable, re-bootstrap",
                        "stella agent bootstrap --name <agent-name> --env <environment>",
                        CommandType.Shell)
                    .AddStep(3, "Verify auto-renewal is enabled",
                        "stella agent config --agent-id <agent-id> | grep auto_renew",
                        CommandType.Shell))
                .WithVerification($"stella doctor --check {CheckId}")
                .WithRunbookUrl("https://docs.stella-ops.org/runbooks/agent-cert-expired")
                .Build();
        }

        if (criticalAgents.Count > 0)
        {
            var criticalList = criticalAgents
                .Select(a => $"{a.Name} (expires in {a.ExpiresIn.TotalHours.ToString("F0", CultureInfo.InvariantCulture)} hours)")
                .ToList();

            return builder
                .Fail($"{criticalCountText} agent(s) have certificates expiring within 24 hours")
                .WithEvidence("Agent certificate status", eb => eb
                    .Add("TotalActive", totalActiveText)
                    .Add("Critical", criticalCountText)
                    .Add("Warning", warningCountText)
                    .Add("WithoutCertificateInfo", missingInfoText)
                    .Add("CriticalAgents", string.Join(", ", criticalList)))
                .WithCauses(
                    "Certificate auto-renewal failed",
                    "Agent has been offline",
                    "Certificate authority rate limiting")
                .WithRemediation(rb => rb
                    .AddStep(1, "Manually trigger certificate renewal",
                        "stella agent renew-cert --agent-id <agent-id>",
                        CommandType.Shell)
                    .AddStep(2, "Check agent logs for renewal failures",
                        "stella agent logs --agent-id <agent-id> --level warn",
                        CommandType.Shell))
                .WithVerification($"stella doctor --check {CheckId}")
                .Build();
        }

        if (warningAgents.Count > 0)
        {
            var warningList = warningAgents
                .Select(a => $"{a.Name} (expires in {a.ExpiresIn.TotalDays.ToString("F0", CultureInfo.InvariantCulture)} days)")
                .ToList();

            return builder
                .Warn($"{warningCountText} agent(s) have certificates expiring within 7 days")
                .WithEvidence("Agent certificate status", eb => eb
                    .Add("TotalActive", totalActiveText)
                    .Add("Warning", warningCountText)
                    .Add("WithoutCertificateInfo", missingInfoText)
                    .Add("WarningAgents", string.Join(", ", warningList)))
                .WithCauses(
                    "Certificate renewal threshold not reached yet",
                    "Agent auto-renewal scheduled but not yet triggered")
                .WithRemediation(rb => rb
                    .AddStep(1, "Monitor certificate renewal",
                        "stella agent health <agent-id>",
                        CommandType.Shell)
                    .AddStep(2, "Optionally force early renewal",
                        "stella agent renew-cert --agent-id <agent-id>",
                        CommandType.Shell))
                .WithVerification($"stella doctor --check {CheckId}")
                .Build();
        }

        if (inspectedCount == 0)
        {
            // Nothing was actually inspected, so claiming "all valid" would be false assurance.
            return builder
                .Skip("No certificate expiry information available for active agents")
                .Build();
        }

        // Keep the original headline when every agent was inspected; otherwise say what was skipped.
        var passMessage = agentsWithoutCertificateInfo == 0
            ? "All agent certificates are valid"
            : $"All {inspectedCount.ToString(CultureInfo.InvariantCulture)} inspected agent certificates are valid ({missingInfoText} agent(s) without certificate info)";

        return builder
            .Pass(passMessage)
            .WithEvidence("Agent certificate status", eb => eb
                .Add("TotalActive", totalActiveText)
                .Add("Inspected", inspectedCount.ToString(CultureInfo.InvariantCulture))
                .Add("WithoutCertificateInfo", missingInfoText)
                .Add("AllValid", "true"))
            .Build();
    }
}
|
||||
@@ -0,0 +1,60 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// AgentCertificateValidityCheck.cs
|
||||
// Sprint: SPRINT_20260117_041_ReleaseOrchestrator_agent_operations
|
||||
// Task: TASK-041-09 - Server-Side Doctor Plugin
|
||||
// Description: Validates agent certificate chain and trust
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using StellaOps.Doctor.Models;
|
||||
using StellaOps.Doctor.Plugins;
|
||||
using StellaOps.ReleaseOrchestrator.Agent.Store;
|
||||
|
||||
namespace StellaOps.Doctor.Plugin.Agent.Checks;
|
||||
|
||||
/// <summary>
/// Validates agent certificate chain and trust relationships.
/// </summary>
/// <remarks>
/// The validation itself is not implemented yet. Until it is, the check reports <c>Skip</c>
/// rather than <c>Pass</c>, so an unimplemented security check is never read as a clean result.
/// </remarks>
public sealed class AgentCertificateValidityCheck : IDoctorCheck
{
    /// <inheritdoc />
    public string CheckId => "check.agent.certificate.validity";

    /// <inheritdoc />
    public string Name => "Agent Certificate Validity";

    /// <inheritdoc />
    public string Description => "Verify agent certificates have valid chain of trust";

    /// <inheritdoc />
    public DoctorSeverity DefaultSeverity => DoctorSeverity.Fail;

    /// <inheritdoc />
    public IReadOnlyList<string> Tags => ["agent", "certificate", "security"];

    /// <inheritdoc />
    public TimeSpan EstimatedDuration => TimeSpan.FromSeconds(5);

    /// <inheritdoc />
    /// <remarks>Runnable only when an <see cref="IAgentStore"/> is registered in the service provider.</remarks>
    public bool CanRun(DoctorPluginContext context)
    {
        return context.Services.GetService<IAgentStore>() != null;
    }

    /// <inheritdoc />
    /// <remarks>
    /// Completes synchronously; the method is deliberately not <c>async</c> because there is
    /// nothing to await yet.
    /// </remarks>
    public Task<DoctorCheckResult> RunAsync(DoctorPluginContext context, CancellationToken ct)
    {
        var builder = context.CreateResult(CheckId, "stellaops.doctor.agent", "Agent Fleet");

        // TODO: Implement certificate chain validation.
        // The finished check is meant to verify that:
        // 1. Certificate is signed by trusted CA
        // 2. Certificate chain is complete
        // 3. No revoked certificates in chain
        // 4. Certificate is for correct agent identity

        return Task.FromResult<DoctorCheckResult>(
            builder
                .Skip("Certificate validity check - implementation pending")
                .Build());
    }
}
|
||||
@@ -0,0 +1,61 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// AgentClusterHealthCheck.cs
|
||||
// Sprint: SPRINT_20260117_041_ReleaseOrchestrator_agent_operations
|
||||
// Task: TASK-041-09 - Server-Side Doctor Plugin
|
||||
// Description: Monitors agent cluster health (when clustering is enabled)
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using StellaOps.Doctor.Models;
|
||||
using StellaOps.Doctor.Plugins;
|
||||
|
||||
namespace StellaOps.Doctor.Plugin.Agent.Checks;
|
||||
|
||||
/// <summary>
/// Monitors agent cluster health when clustering is enabled.
/// </summary>
/// <remarks>
/// The monitoring logic is not implemented yet; <see cref="RunAsync"/> currently always reports <c>Skip</c>.
/// </remarks>
public sealed class AgentClusterHealthCheck : IDoctorCheck
{
    /// <inheritdoc />
    public string CheckId => "check.agent.cluster.health";

    /// <inheritdoc />
    public string Name => "Agent Cluster Health";

    /// <inheritdoc />
    public string Description => "Monitor agent cluster membership and health";

    /// <inheritdoc />
    public DoctorSeverity DefaultSeverity => DoctorSeverity.Fail;

    /// <inheritdoc />
    public IReadOnlyList<string> Tags => ["agent", "cluster", "ha", "resilience"];

    /// <inheritdoc />
    public TimeSpan EstimatedDuration => TimeSpan.FromSeconds(3);

    /// <inheritdoc />
    /// <remarks>
    /// Runnable only when the <c>Agent:Cluster:Enabled</c> configuration value equals "true"
    /// (case-insensitive). A missing value means the check does not run.
    /// </remarks>
    public bool CanRun(DoctorPluginContext context)
    {
        var clusteringEnabled = context.Configuration["Agent:Cluster:Enabled"];

        // The static overload is null-safe, so a missing key simply yields false.
        return string.Equals(clusteringEnabled, "true", StringComparison.OrdinalIgnoreCase);
    }

    /// <inheritdoc />
    /// <remarks>
    /// Completes synchronously; the method is deliberately not <c>async</c> because there is
    /// nothing to await yet.
    /// </remarks>
    public Task<DoctorCheckResult> RunAsync(DoctorPluginContext context, CancellationToken ct)
    {
        var builder = context.CreateResult(CheckId, "stellaops.doctor.agent", "Agent Fleet");

        // TODO: Implement cluster health monitoring.
        // The finished check is meant to verify that:
        // 1. All cluster members are reachable
        // 2. Leader is elected and healthy
        // 3. State sync is working
        // 4. Failover is possible if needed

        return Task.FromResult<DoctorCheckResult>(
            builder
                .Skip("Clustering not enabled or check implementation pending")
                .Build());
    }
}
|
||||
@@ -0,0 +1,60 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// AgentClusterQuorumCheck.cs
|
||||
// Sprint: SPRINT_20260117_041_ReleaseOrchestrator_agent_operations
|
||||
// Task: TASK-041-09 - Server-Side Doctor Plugin
|
||||
// Description: Verifies agent cluster has quorum for leader election
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using StellaOps.Doctor.Models;
|
||||
using StellaOps.Doctor.Plugins;
|
||||
|
||||
namespace StellaOps.Doctor.Plugin.Agent.Checks;
|
||||
|
||||
/// <summary>
/// Verifies agent cluster has sufficient members for quorum.
/// </summary>
/// <remarks>
/// The quorum logic is not implemented yet; <see cref="RunAsync"/> currently always reports <c>Skip</c>.
/// </remarks>
public sealed class AgentClusterQuorumCheck : IDoctorCheck
{
    /// <inheritdoc />
    public string CheckId => "check.agent.cluster.quorum";

    /// <inheritdoc />
    public string Name => "Agent Cluster Quorum";

    /// <inheritdoc />
    public string Description => "Verify agent cluster has quorum for leader election";

    /// <inheritdoc />
    public DoctorSeverity DefaultSeverity => DoctorSeverity.Fail;

    /// <inheritdoc />
    public IReadOnlyList<string> Tags => ["agent", "cluster", "quorum", "ha"];

    /// <inheritdoc />
    public TimeSpan EstimatedDuration => TimeSpan.FromSeconds(2);

    /// <inheritdoc />
    /// <remarks>
    /// Runnable only when the <c>Agent:Cluster:Enabled</c> configuration value equals "true"
    /// (case-insensitive). A missing value means the check does not run.
    /// </remarks>
    public bool CanRun(DoctorPluginContext context)
    {
        var clusteringEnabled = context.Configuration["Agent:Cluster:Enabled"];

        // The static overload is null-safe, so a missing key simply yields false.
        return string.Equals(clusteringEnabled, "true", StringComparison.OrdinalIgnoreCase);
    }

    /// <inheritdoc />
    /// <remarks>
    /// Completes synchronously; the method is deliberately not <c>async</c> because there is
    /// nothing to await yet.
    /// </remarks>
    public Task<DoctorCheckResult> RunAsync(DoctorPluginContext context, CancellationToken ct)
    {
        var builder = context.CreateResult(CheckId, "stellaops.doctor.agent", "Agent Fleet");

        // TODO: Implement quorum check.
        // The finished check is meant to verify that:
        // 1. Minimum members are online (n/2 + 1 for odd, or configured minimum)
        // 2. Leader election is possible
        // 3. Split-brain prevention is active

        return Task.FromResult<DoctorCheckResult>(
            builder
                .Skip("Clustering not enabled or check implementation pending")
                .Build());
    }
}
|
||||
@@ -0,0 +1,179 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// AgentHeartbeatFreshnessCheck.cs
|
||||
// Sprint: SPRINT_20260117_041_ReleaseOrchestrator_agent_operations
|
||||
// Task: TASK-041-09 - Server-Side Doctor Plugin
|
||||
// Description: Checks if all agents have fresh heartbeats
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using System.Globalization;
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using StellaOps.Doctor.Models;
|
||||
using StellaOps.Doctor.Plugins;
|
||||
using StellaOps.ReleaseOrchestrator.Agent.Store;
|
||||
|
||||
namespace StellaOps.Doctor.Plugin.Agent.Checks;
|
||||
|
||||
/// <summary>
/// Verifies that every agent that has not been deactivated reported a heartbeat recently.
/// </summary>
/// <remarks>
/// Heartbeat age is measured against the injected <see cref="TimeProvider"/>. An age above
/// 5 minutes is stale (Fail); above 2 minutes is delayed (Warn); anything else is healthy.
/// When no active agents exist the check warns instead of passing.
/// </remarks>
public sealed class AgentHeartbeatFreshnessCheck : IDoctorCheck
{
    /// <summary>Heartbeat age above which an agent is considered stale.</summary>
    private static readonly TimeSpan StaleThreshold = TimeSpan.FromMinutes(5);

    /// <summary>Heartbeat age above which an agent is considered delayed.</summary>
    private static readonly TimeSpan WarningThreshold = TimeSpan.FromMinutes(2);

    /// <inheritdoc />
    public string CheckId => "check.agent.heartbeat.freshness";

    /// <inheritdoc />
    public string Name => "Agent Heartbeat Freshness";

    /// <inheritdoc />
    public string Description => "Verify all agents have recent heartbeats";

    /// <inheritdoc />
    public DoctorSeverity DefaultSeverity => DoctorSeverity.Fail;

    /// <inheritdoc />
    public IReadOnlyList<string> Tags => ["agent", "heartbeat", "connectivity", "quick"];

    /// <inheritdoc />
    public TimeSpan EstimatedDuration => TimeSpan.FromSeconds(2);

    /// <inheritdoc />
    /// <remarks>Runnable only when an <see cref="IAgentStore"/> is registered in the service provider.</remarks>
    public bool CanRun(DoctorPluginContext context) =>
        context.Services.GetService<IAgentStore>() is not null;

    /// <inheritdoc />
    public async Task<DoctorCheckResult> RunAsync(DoctorPluginContext context, CancellationToken ct)
    {
        var store = context.Services.GetRequiredService<IAgentStore>();
        var clock = context.Services.GetRequiredService<TimeProvider>();
        var now = clock.GetUtcNow();

        var result = context.CreateResult(CheckId, "stellaops.doctor.agent", "Agent Fleet");

        var agents = await store.GetAllAsync(ct);
        var activeAgents = agents.Where(a => a.Status != AgentStatus.Deactivated).ToList();

        if (activeAgents.Count == 0)
        {
            return result
                .Warn("No active agents registered")
                .WithEvidence("Agent status", eb => eb
                    .Add("TotalAgents", agents.Count.ToString(CultureInfo.InvariantCulture))
                    .Add("ActiveAgents", "0"))
                .WithCauses(
                    "No agents have been registered",
                    "All agents have been deactivated")
                .WithRemediation(rb => rb
                    .AddStep(1, "Bootstrap a new agent",
                        "stella agent bootstrap --name agent-01 --env production --platform linux",
                        CommandType.Shell)
                    .AddStep(2, "Check agent registration status",
                        "stella agent list --all",
                        CommandType.Shell))
                .WithVerification($"stella doctor --check {CheckId}")
                .Build();
        }

        // Measure each heartbeat once, then partition by threshold; fleet order is preserved.
        var heartbeats = activeAgents
            .Select(agent => (agent.Name, Age: now - agent.LastHeartbeat))
            .ToList();

        var stale = heartbeats
            .Where(h => h.Age > StaleThreshold)
            .ToList();
        var delayed = heartbeats
            .Where(h => h.Age <= StaleThreshold && h.Age > WarningThreshold)
            .ToList();

        // Whatever is neither stale nor delayed is healthy; only the count is ever reported.
        var healthyCount = heartbeats.Count - stale.Count - delayed.Count;

        var totalActiveText = activeAgents.Count.ToString(CultureInfo.InvariantCulture);
        var delayedCountText = delayed.Count.ToString(CultureInfo.InvariantCulture);
        var healthyCountText = healthyCount.ToString(CultureInfo.InvariantCulture);

        if (stale.Count > 0)
        {
            var staleSummary = string.Join(
                ", ",
                stale.Select(h => $"{h.Name} (last heartbeat: {h.Age.TotalMinutes:F0}m ago)"));

            return result
                .Fail($"{stale.Count} agent(s) have stale heartbeats")
                .WithEvidence("Agent heartbeat status", eb => eb
                    .Add("TotalActive", totalActiveText)
                    .Add("Stale", stale.Count.ToString(CultureInfo.InvariantCulture))
                    .Add("Warning", delayedCountText)
                    .Add("Healthy", healthyCountText)
                    .Add("StaleAgents", staleSummary))
                .WithCauses(
                    "Agent process has crashed or stopped",
                    "Network connectivity issue between agent and orchestrator",
                    "Firewall blocking agent heartbeats",
                    "Agent host is unreachable or powered off",
                    "mTLS certificate has expired")
                .WithRemediation(rb => rb
                    .AddStep(1, "Check agent status on the host",
                        "systemctl status stella-agent",
                        CommandType.Shell)
                    .AddStep(2, "View agent logs for errors",
                        "journalctl -u stella-agent --since '10 minutes ago'",
                        CommandType.Shell)
                    .AddStep(3, "Run agent diagnostics",
                        "stella agent doctor",
                        CommandType.Shell)
                    .AddStep(4, "Check network connectivity to orchestrator",
                        "curl -k https://orchestrator:8443/health",
                        CommandType.Shell)
                    .AddStep(5, "If certificate expired, renew it",
                        "stella agent renew-cert --force",
                        CommandType.Shell))
                .WithVerification($"stella doctor --check {CheckId}")
                .WithRunbookUrl("https://docs.stella-ops.org/runbooks/agent-stale-heartbeat")
                .Build();
        }

        if (delayed.Count > 0)
        {
            var delayedSummary = string.Join(
                ", ",
                delayed.Select(h => $"{h.Name} ({h.Age.TotalSeconds:F0}s ago)"));

            return result
                .Warn($"{delayed.Count} agent(s) have delayed heartbeats")
                .WithEvidence("Agent heartbeat status", eb => eb
                    .Add("TotalActive", totalActiveText)
                    .Add("Warning", delayedCountText)
                    .Add("Healthy", healthyCountText)
                    .Add("DelayedAgents", delayedSummary))
                .WithCauses(
                    "Agent is under heavy load",
                    "Network latency between agent and orchestrator",
                    "Agent is processing long-running tasks")
                .WithRemediation(rb => rb
                    .AddStep(1, "Check agent resource utilization",
                        "stella agent health <agent-id>",
                        CommandType.Shell)
                    .AddStep(2, "Monitor heartbeat trend",
                        "stella agent logs --agent-id <agent-id> --tail 50",
                        CommandType.Shell))
                .WithVerification($"stella doctor --check {CheckId}")
                .Build();
        }

        return result
            .Pass($"All {activeAgents.Count} agents have fresh heartbeats")
            .WithEvidence("Agent heartbeat status", eb => eb
                .Add("TotalActive", totalActiveText)
                .Add("AllHealthy", "true"))
            .Build();
    }
}
|
||||
@@ -0,0 +1,56 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// AgentResourceUtilizationCheck.cs
|
||||
// Sprint: SPRINT_20260117_041_ReleaseOrchestrator_agent_operations
|
||||
// Task: TASK-041-09 - Server-Side Doctor Plugin
|
||||
// Description: Monitors resource utilization across agent fleet
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using StellaOps.Doctor.Models;
|
||||
using StellaOps.Doctor.Plugins;
|
||||
|
||||
namespace StellaOps.Doctor.Plugin.Agent.Checks;
|
||||
|
||||
/// <summary>
/// Placeholder check for CPU, memory, and disk utilization across the agent fleet.
/// The monitoring logic is not implemented yet, so the check reports itself as skipped.
/// </summary>
public sealed class AgentResourceUtilizationCheck : IDoctorCheck
{
    /// <inheritdoc />
    public string CheckId => "check.agent.resource.utilization";

    /// <inheritdoc />
    public string Name => "Agent Resource Utilization";

    /// <inheritdoc />
    public string Description => "Monitor CPU, memory, and disk utilization across agents";

    /// <inheritdoc />
    public DoctorSeverity DefaultSeverity => DoctorSeverity.Warn;

    /// <inheritdoc />
    public IReadOnlyList<string> Tags => ["agent", "resource", "performance", "capacity"];

    /// <inheritdoc />
    public TimeSpan EstimatedDuration => TimeSpan.FromSeconds(3);

    /// <summary>
    /// Always runnable: the check has no service dependencies.
    /// </summary>
    public bool CanRun(DoctorPluginContext context) => true;

    /// <summary>
    /// Produces the result for this check. Until the monitoring logic exists,
    /// the result is a skip carrying an "implementation pending" message.
    /// </summary>
    /// <param name="context">Plugin context used to create the result builder.</param>
    /// <param name="ct">Cancellation token (unused: no asynchronous work is performed).</param>
    /// <returns>A completed task holding the skipped result.</returns>
    public Task<DoctorCheckResult> RunAsync(DoctorPluginContext context, CancellationToken ct)
    {
        // Deliberately not 'async': the method has nothing to await, and an async
        // method without awaits raises CS1998, which breaks builds that treat
        // warnings as errors.
        var builder = context.CreateResult(CheckId, "stellaops.doctor.agent", "Agent Fleet");

        // TODO: Implement resource utilization monitoring
        // This check verifies:
        // 1. CPU utilization per agent
        // 2. Memory utilization per agent
        // 3. Disk space per agent
        // 4. Resource trends

        // Report Skip rather than Pass: nothing has been measured, so a passing
        // result would signal health that was never verified.
        return Task.FromResult<DoctorCheckResult>(
            builder
                .Skip("Resource utilization check - implementation pending")
                .Build());
    }
}
|
||||
@@ -0,0 +1,122 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// AgentVersionConsistencyCheck.cs
|
||||
// Sprint: SPRINT_20260117_041_ReleaseOrchestrator_agent_operations
|
||||
// Task: TASK-041-09 - Server-Side Doctor Plugin
|
||||
// Description: Checks for version consistency across agent fleet
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using System.Globalization;
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using StellaOps.Doctor.Models;
|
||||
using StellaOps.Doctor.Plugins;
|
||||
using StellaOps.ReleaseOrchestrator.Agent.Store;
|
||||
|
||||
namespace StellaOps.Doctor.Plugin.Agent.Checks;
|
||||
|
||||
/// <summary>
/// Checks for version consistency across the agent fleet.
/// Groups every non-deactivated agent by its reported version and warns when
/// the fleet is spread over more than two distinct versions.
/// </summary>
public sealed class AgentVersionConsistencyCheck : IDoctorCheck
{
    /// <inheritdoc />
    public string CheckId => "check.agent.version.consistency";

    /// <inheritdoc />
    public string Name => "Agent Version Consistency";

    /// <inheritdoc />
    public string Description => "Verify all agents are running compatible versions";

    /// <inheritdoc />
    public DoctorSeverity DefaultSeverity => DoctorSeverity.Warn;

    /// <inheritdoc />
    public IReadOnlyList<string> Tags => ["agent", "version", "maintenance"];

    /// <inheritdoc />
    public TimeSpan EstimatedDuration => TimeSpan.FromSeconds(2);

    /// <summary>
    /// The check can run only when an <c>IAgentStore</c> is registered.
    /// </summary>
    public bool CanRun(DoctorPluginContext context)
    {
        return context.Services.GetService<IAgentStore>() != null;
    }

    /// <summary>
    /// Loads all agents, ignores deactivated ones, and evaluates how many
    /// distinct versions the remaining agents report.
    /// </summary>
    /// <param name="context">Plugin context providing services and the result builder.</param>
    /// <param name="ct">Cancellation token passed to the agent store.</param>
    /// <returns>
    /// Skip when there are no active agents; Pass for one version or exactly two
    /// versions; Warn for three or more versions.
    /// </returns>
    public async Task<DoctorCheckResult> RunAsync(DoctorPluginContext context, CancellationToken ct)
    {
        var agentStore = context.Services.GetRequiredService<IAgentStore>();

        var builder = context.CreateResult(CheckId, "stellaops.doctor.agent", "Agent Fleet");

        var agents = await agentStore.GetAllAsync(ct);
        var activeAgents = agents
            .Where(a => a.Status != AgentStatus.Deactivated)
            .ToList();

        if (activeAgents.Count == 0)
        {
            return builder
                .Skip("No active agents to check")
                .Build();
        }

        // Largest group first. The ordinal tie-break on the version string keeps the
        // reported majority version stable when two versions have the same number of
        // agents, instead of depending on the order the store returned them in.
        var versionGroups = activeAgents
            .GroupBy(a => a.Version ?? "unknown")
            .OrderByDescending(g => g.Count())
            .ThenBy(g => g.Key, StringComparer.Ordinal)
            .ToList();

        var majorityVersion = versionGroups[0].Key;

        if (versionGroups.Count == 1)
        {
            return builder
                .Pass($"All {activeAgents.Count} agents running version {majorityVersion}")
                .WithEvidence("Agent versions", eb => eb
                    .Add("Version", majorityVersion)
                    .Add("AgentCount", activeAgents.Count.ToString(CultureInfo.InvariantCulture)))
                .Build();
        }

        var versionSummary = versionGroups
            .Select(g => $"{g.Key}: {g.Count()}")
            .ToList();

        // The previous condition also tested "off-majority agents > half of the fleet".
        // With exactly two groups sorted by size the minority can never exceed half,
        // and with three or more groups this test already fires, so that clause was
        // unreachable and has been dropped.
        // NOTE(review): if an even 50/50 split should warn, that needs an explicit rule.
        if (versionGroups.Count > 2)
        {
            // Agents that are not on the majority version, labelled with their version.
            var offMajorityAgents = versionGroups
                .Skip(1)
                .SelectMany(g => g.Select(a => $"{a.Name} ({g.Key})"))
                .ToList();

            return builder
                .Warn($"Significant version skew detected ({versionGroups.Count} versions)")
                .WithEvidence("Agent versions", eb => eb
                    .Add("MajorityVersion", majorityVersion)
                    .Add("VersionDistribution", string.Join(", ", versionSummary))
                    // Only the first ten agents are listed to keep the evidence short.
                    .Add("OutdatedAgents", string.Join(", ", offMajorityAgents.Take(10))))
                .WithCauses(
                    "Auto-update is disabled on some agents",
                    "Some agents failed to update",
                    "Phased rollout in progress")
                .WithRemediation(rb => rb
                    .AddStep(1, "Update outdated agents",
                        "stella agent update --version <target-version> --agent-id <id>",
                        CommandType.Shell)
                    .AddStep(2, "Enable auto-update if appropriate",
                        "stella agent config --agent-id <id> --set auto_update.enabled=true",
                        CommandType.Shell))
                .WithVerification($"stella doctor --check {CheckId}")
                .Build();
        }

        return builder
            .Pass($"Minor version skew acceptable ({versionGroups.Count} versions)")
            .WithEvidence("Agent versions", eb => eb
                .Add("MajorityVersion", majorityVersion)
                .Add("VersionDistribution", string.Join(", ", versionSummary)))
            .Build();
    }
}
|
||||
@@ -0,0 +1,56 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// FailedTaskRateCheck.cs
|
||||
// Sprint: SPRINT_20260117_041_ReleaseOrchestrator_agent_operations
|
||||
// Task: TASK-041-09 - Server-Side Doctor Plugin
|
||||
// Description: Monitors task failure rate across agents
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using StellaOps.Doctor.Models;
|
||||
using StellaOps.Doctor.Plugins;
|
||||
|
||||
namespace StellaOps.Doctor.Plugin.Agent.Checks;
|
||||
|
||||
/// <summary>
/// Placeholder check for the task failure rate across the agent fleet.
/// The monitoring logic is not implemented yet, so the check reports itself as skipped.
/// </summary>
public sealed class FailedTaskRateCheck : IDoctorCheck
{
    /// <inheritdoc />
    public string CheckId => "check.agent.task.failure.rate";

    /// <inheritdoc />
    public string Name => "Task Failure Rate";

    /// <inheritdoc />
    public string Description => "Monitor task failure rate across agent fleet";

    /// <inheritdoc />
    public DoctorSeverity DefaultSeverity => DoctorSeverity.Warn;

    /// <inheritdoc />
    public IReadOnlyList<string> Tags => ["agent", "task", "failure", "reliability"];

    /// <inheritdoc />
    public TimeSpan EstimatedDuration => TimeSpan.FromSeconds(3);

    /// <summary>
    /// Always runnable: the check has no service dependencies.
    /// </summary>
    public bool CanRun(DoctorPluginContext context) => true;

    /// <summary>
    /// Produces the result for this check. Until the monitoring logic exists,
    /// the result is a skip carrying an "implementation pending" message.
    /// </summary>
    /// <param name="context">Plugin context used to create the result builder.</param>
    /// <param name="ct">Cancellation token (unused: no asynchronous work is performed).</param>
    /// <returns>A completed task holding the skipped result.</returns>
    public Task<DoctorCheckResult> RunAsync(DoctorPluginContext context, CancellationToken ct)
    {
        // Deliberately not 'async': the method has nothing to await, and an async
        // method without awaits raises CS1998, which breaks builds that treat
        // warnings as errors.
        var builder = context.CreateResult(CheckId, "stellaops.doctor.agent", "Agent Fleet");

        // TODO: Implement task failure rate monitoring
        // This check verifies:
        // 1. Overall task failure rate (last hour)
        // 2. Per-agent failure rate
        // 3. Failure rate trend (increasing/decreasing)
        // 4. Common failure reasons

        // Report Skip rather than Pass: no failure rate has been computed, so a
        // passing result would signal health that was never verified.
        return Task.FromResult<DoctorCheckResult>(
            builder
                .Skip("Task failure rate check - implementation pending")
                .Build());
    }
}
|
||||
@@ -0,0 +1,141 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// StaleAgentCheck.cs
|
||||
// Sprint: SPRINT_20260117_041_ReleaseOrchestrator_agent_operations
|
||||
// Task: TASK-041-09 - Server-Side Doctor Plugin
|
||||
// Description: Checks for agents that have been stale for extended periods
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using System.Globalization;
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using StellaOps.Doctor.Models;
|
||||
using StellaOps.Doctor.Plugins;
|
||||
using StellaOps.ReleaseOrchestrator.Agent.Store;
|
||||
|
||||
namespace StellaOps.Doctor.Plugin.Agent.Checks;
|
||||
|
||||
/// <summary>
/// Flags non-deactivated agents whose last heartbeat is old: more than one hour
/// counts as stale, more than seven days makes the agent a decommission candidate.
/// </summary>
public sealed class StaleAgentCheck : IDoctorCheck
{
    private static readonly TimeSpan StaleThreshold = TimeSpan.FromHours(1);
    private static readonly TimeSpan DecommissionThreshold = TimeSpan.FromDays(7);

    /// <inheritdoc />
    public string CheckId => "check.agent.stale";

    /// <inheritdoc />
    public string Name => "Stale Agent Detection";

    /// <inheritdoc />
    public string Description => "Detect agents that have been offline for extended periods";

    /// <inheritdoc />
    public DoctorSeverity DefaultSeverity => DoctorSeverity.Warn;

    /// <inheritdoc />
    public IReadOnlyList<string> Tags => ["agent", "maintenance", "cleanup"];

    /// <inheritdoc />
    public TimeSpan EstimatedDuration => TimeSpan.FromSeconds(2);

    /// <summary>
    /// Runnable only when an <c>IAgentStore</c> can be resolved from the context.
    /// </summary>
    public bool CanRun(DoctorPluginContext context) =>
        context.Services.GetService<IAgentStore>() != null;

    /// <summary>
    /// Measures how long each non-deactivated agent has gone without a heartbeat
    /// and reports the worst category found.
    /// </summary>
    /// <param name="context">Plugin context providing services and the result builder.</param>
    /// <param name="ct">Cancellation token passed to the agent store.</param>
    /// <returns>
    /// Warn when any agent exceeds the decommission threshold, otherwise Warn when
    /// any agent exceeds the stale threshold, otherwise Pass.
    /// </returns>
    public async Task<DoctorCheckResult> RunAsync(DoctorPluginContext context, CancellationToken ct)
    {
        var store = context.Services.GetRequiredService<IAgentStore>();
        var clock = context.Services.GetRequiredService<TimeProvider>();
        var now = clock.GetUtcNow();

        var builder = context.CreateResult(CheckId, "stellaops.doctor.agent", "Agent Fleet");

        var allAgents = await store.GetAllAsync(ct);

        // One entry per non-deactivated agent: its name and the time since its last heartbeat.
        var silence = allAgents
            .Where(a => a.Status != AgentStatus.Deactivated)
            .Select(a => (a.Name, OfflineFor: now - a.LastHeartbeat))
            .ToList();

        // The two buckets are mutually exclusive; the longer threshold wins.
        var longGone = silence
            .Where(s => s.OfflineFor > DecommissionThreshold)
            .ToList();
        var recentlyStale = silence
            .Where(s => s.OfflineFor <= DecommissionThreshold && s.OfflineFor > StaleThreshold)
            .ToList();

        if (longGone.Count > 0)
        {
            return DecommissionWarning();
        }

        return recentlyStale.Count > 0 ? StaleWarning() : AllClear();

        // Warning for agents silent for longer than the decommission threshold.
        DoctorCheckResult DecommissionWarning()
        {
            var listing = string.Join(
                ", ",
                longGone.Select(a => $"{a.Name} (offline {a.OfflineFor.TotalDays:F0} days)").ToList());

            return builder
                .Warn($"{longGone.Count} agent(s) may need decommissioning")
                .WithEvidence("Stale agent status", eb => eb
                    .Add("DecommissionCandidates", longGone.Count.ToString(CultureInfo.InvariantCulture))
                    .Add("StaleAgents", recentlyStale.Count.ToString(CultureInfo.InvariantCulture))
                    .Add("Agents", listing))
                .WithCauses(
                    "Agent host has been permanently removed",
                    "Agent was replaced but not deactivated",
                    "Infrastructure change without cleanup")
                .WithRemediation(rb => rb
                    .AddStep(1, "Review stale agents",
                        "stella agent list --status stale",
                        CommandType.Shell)
                    .AddStep(2, "Deactivate agents that are no longer needed",
                        "stella agent deactivate --agent-id <agent-id>",
                        CommandType.Shell)
                    .AddStep(3, "If agent should be active, investigate host",
                        "ssh <agent-host> 'systemctl status stella-agent'",
                        CommandType.Shell))
                .WithVerification($"stella doctor --check {CheckId}")
                .Build();
        }

        // Warning for agents silent for more than an hour but within the decommission threshold.
        DoctorCheckResult StaleWarning()
        {
            var listing = string.Join(
                ", ",
                recentlyStale.Select(a => $"{a.Name} (offline {a.OfflineFor.TotalHours:F0} hours)").ToList());

            return builder
                .Warn($"{recentlyStale.Count} agent(s) have been offline for over an hour")
                .WithEvidence("Stale agent status", eb => eb
                    .Add("StaleAgents", recentlyStale.Count.ToString(CultureInfo.InvariantCulture))
                    .Add("Agents", listing))
                .WithCauses(
                    "Agent host is undergoing maintenance",
                    "Network partition",
                    "Agent process crash without auto-restart")
                .WithRemediation(rb => rb
                    .AddStep(1, "Check agent host status",
                        "ping <agent-host>",
                        CommandType.Shell)
                    .AddStep(2, "Restart agent service",
                        "ssh <agent-host> 'systemctl restart stella-agent'",
                        CommandType.Shell))
                .WithVerification($"stella doctor --check {CheckId}")
                .Build();
        }

        // Passing result when no agent crosses either threshold.
        DoctorCheckResult AllClear()
        {
            return builder
                .Pass("No stale agents detected")
                .WithEvidence("Stale agent status", eb => eb
                    .Add("TotalActive", silence.Count.ToString(CultureInfo.InvariantCulture))
                    .Add("AllHealthy", "true"))
                .Build();
        }
    }
}
|
||||
@@ -0,0 +1,55 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// TaskQueueBacklogCheck.cs
|
||||
// Sprint: SPRINT_20260117_041_ReleaseOrchestrator_agent_operations
|
||||
// Task: TASK-041-09 - Server-Side Doctor Plugin
|
||||
// Description: Monitors task queue backlog across agents
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using StellaOps.Doctor.Models;
|
||||
using StellaOps.Doctor.Plugins;
|
||||
|
||||
namespace StellaOps.Doctor.Plugin.Agent.Checks;
|
||||
|
||||
/// <summary>
/// Placeholder check for the pending task queue depth across the agent fleet.
/// The monitoring logic is not implemented yet, so the check reports itself as skipped.
/// </summary>
public sealed class TaskQueueBacklogCheck : IDoctorCheck
{
    /// <inheritdoc />
    public string CheckId => "check.agent.task.backlog";

    /// <inheritdoc />
    public string Name => "Task Queue Backlog";

    /// <inheritdoc />
    public string Description => "Monitor pending task queue depth across agents";

    /// <inheritdoc />
    public DoctorSeverity DefaultSeverity => DoctorSeverity.Warn;

    /// <inheritdoc />
    public IReadOnlyList<string> Tags => ["agent", "task", "queue", "capacity"];

    /// <inheritdoc />
    public TimeSpan EstimatedDuration => TimeSpan.FromSeconds(3);

    /// <summary>
    /// Always runnable: the check has no service dependencies.
    /// </summary>
    public bool CanRun(DoctorPluginContext context) => true;

    /// <summary>
    /// Produces the result for this check. Until the monitoring logic exists,
    /// the result is a skip carrying an "implementation pending" message.
    /// </summary>
    /// <param name="context">Plugin context used to create the result builder.</param>
    /// <param name="ct">Cancellation token (unused: no asynchronous work is performed).</param>
    /// <returns>A completed task holding the skipped result.</returns>
    public Task<DoctorCheckResult> RunAsync(DoctorPluginContext context, CancellationToken ct)
    {
        // Deliberately not 'async': the method has nothing to await, and an async
        // method without awaits raises CS1998, which breaks builds that treat
        // warnings as errors.
        var builder = context.CreateResult(CheckId, "stellaops.doctor.agent", "Agent Fleet");

        // TODO: Implement task queue backlog monitoring
        // This check verifies:
        // 1. Total queued tasks across fleet
        // 2. Age of oldest queued task
        // 3. Queue growth rate trend

        // Report Skip rather than Pass: no queue has been inspected, so a passing
        // result would signal health that was never verified.
        return Task.FromResult<DoctorCheckResult>(
            builder
                .Skip("Task queue backlog check - implementation pending")
                .Build());
    }
}
|
||||
@@ -0,0 +1,22 @@
|
||||
<Project Sdk="Microsoft.NET.Sdk">

  <!-- Build settings for the agent fleet Doctor plugin. -->
  <PropertyGroup>
    <TargetFramework>net10.0</TargetFramework>
    <ImplicitUsings>enable</ImplicitUsings>
    <Nullable>enable</Nullable>
    <LangVersion>preview</LangVersion>
    <!-- Every compiler warning fails the build. -->
    <TreatWarningsAsErrors>true</TreatWarningsAsErrors>
    <RootNamespace>StellaOps.Doctor.Plugin.Agent</RootNamespace>
    <Description>Agent fleet health checks for Stella Ops Doctor diagnostics</Description>
  </PropertyGroup>

  <!-- Doctor engine library and the Release Orchestrator agent library. -->
  <ItemGroup>
    <ProjectReference Include="..\..\..\__Libraries\StellaOps.Doctor\StellaOps.Doctor.csproj" />
    <ProjectReference Include="..\..\..\ReleaseOrchestrator\__Libraries\StellaOps.ReleaseOrchestrator.Agent\StellaOps.ReleaseOrchestrator.Agent.csproj" />
  </ItemGroup>

  <!-- No Version attribute here; presumably supplied by central package management (confirm). -->
  <ItemGroup>
    <PackageReference Include="Microsoft.Extensions.Http" />
  </ItemGroup>

</Project>
|
||||
@@ -0,0 +1,319 @@
|
||||
// Copyright (c) Stella Ops. All rights reserved. SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
|
||||
using StellaOps.Doctor.Plugins;
|
||||
|
||||
namespace StellaOps.Doctor.Plugins.Agent;
|
||||
|
||||
/// <summary>
/// Server-side Doctor plugin for agent fleet health monitoring.
/// Each run evaluates heartbeat freshness, certificate expiry, version consistency,
/// capacity, long-disconnected agents, task queue backlog, and task failure rate.
/// </summary>
public sealed class AgentHealthPlugin : IDoctorPlugin
{
    // Agents disconnected for longer than this are reported as stale.
    private static readonly TimeSpan StaleDisconnectAge = TimeSpan.FromDays(7);

    // Look-back window used when computing the task failure rate.
    private static readonly TimeSpan FailureRateWindow = TimeSpan.FromHours(1);

    private readonly IAgentFleetService _fleetService;
    private readonly AgentHealthPluginOptions _options;
    private readonly TimeProvider _timeProvider;

    /// <summary>
    /// Creates the plugin.
    /// </summary>
    /// <param name="fleetService">Source of agent, queue, and task statistics.</param>
    /// <param name="options">Thresholds; defaults are used when null.</param>
    /// <param name="timeProvider">Clock; the system clock is used when null.</param>
    /// <exception cref="ArgumentNullException"><paramref name="fleetService"/> is null.</exception>
    public AgentHealthPlugin(
        IAgentFleetService fleetService,
        AgentHealthPluginOptions? options = null,
        TimeProvider? timeProvider = null)
    {
        ArgumentNullException.ThrowIfNull(fleetService);

        _fleetService = fleetService;
        _options = options ?? new AgentHealthPluginOptions();
        _timeProvider = timeProvider ?? TimeProvider.System;
    }

    /// <inheritdoc />
    public string Name => "AgentHealth";

    /// <inheritdoc />
    public string Description => "Monitors agent fleet health";

    /// <inheritdoc />
    public string[] Categories => ["fleet", "agents", "infrastructure"];

    /// <summary>
    /// Runs all fleet health checks and returns their results in a fixed order.
    /// </summary>
    /// <param name="context">Doctor execution context (currently not consulted).</param>
    /// <param name="cancellationToken">Token passed to every fleet service call.</param>
    /// <returns>One result per check.</returns>
    public async Task<IReadOnlyList<DoctorCheckResult>> RunChecksAsync(
        DoctorContext context,
        CancellationToken cancellationToken = default)
    {
        // A single timestamp and a single agent snapshot are shared by all checks,
        // so their results describe the same fleet state and the service is not
        // queried once per check.
        var now = _timeProvider.GetUtcNow();
        var agents = await _fleetService.GetAllAgentsAsync(cancellationToken);

        var results = new List<DoctorCheckResult>
        {
            CheckHeartbeatFreshness(agents, now),
            CheckCertificateExpiry(agents, now),
            CheckVersionConsistency(agents),
            CheckAgentCapacity(agents),
            CheckStaleAgents(agents, now),
            await CheckTaskQueueBacklogAsync(cancellationToken),
            await CheckFailedTaskRateAsync(now, cancellationToken),
        };

        return results;
    }

    // Flags agents whose last heartbeat is older than the configured stale threshold.
    private DoctorCheckResult CheckHeartbeatFreshness(IReadOnlyList<AgentFleetInfo> agents, DateTimeOffset now)
    {
        var cutoff = now - _options.HeartbeatStaleThreshold;
        var staleAgents = agents
            .Where(a => a.LastHeartbeat < cutoff)
            .ToList();

        if (staleAgents.Count == 0)
        {
            return DoctorCheckResult.Pass("AgentHeartbeatFreshness",
                $"All {agents.Count} agents have recent heartbeats");
        }

        // Critical once more than half of the fleet is stale.
        var severity = staleAgents.Count > agents.Count / 2
            ? DoctorSeverity.Critical
            : DoctorSeverity.Warning;

        return new DoctorCheckResult
        {
            CheckName = "AgentHeartbeatFreshness",
            Severity = severity,
            Message = $"{staleAgents.Count} of {agents.Count} agents have stale heartbeats",
            Details = new Dictionary<string, object>
            {
                ["staleAgents"] = staleAgents.Select(a => a.Id).ToList(),
                ["threshold"] = _options.HeartbeatStaleThreshold.TotalMinutes
            }
        };
    }

    // Flags agents whose certificate expires within the warning window; agents
    // without a known expiry are ignored.
    private DoctorCheckResult CheckCertificateExpiry(IReadOnlyList<AgentFleetInfo> agents, DateTimeOffset now)
    {
        var warningHorizon = now.AddDays(_options.CertificateWarningDays);
        var expiringAgents = agents
            .Where(a => a.CertificateExpiresAt.HasValue &&
                        a.CertificateExpiresAt.Value < warningHorizon)
            .ToList();

        if (expiringAgents.Count == 0)
        {
            return DoctorCheckResult.Pass("AgentCertificateExpiry",
                "No agent certificates expiring soon");
        }

        var expiredCount = expiringAgents.Count(a => a.CertificateExpiresAt < now);

        // Any already-expired certificate escalates the result to Critical.
        var severity = expiredCount > 0 ? DoctorSeverity.Critical : DoctorSeverity.Warning;

        return new DoctorCheckResult
        {
            CheckName = "AgentCertificateExpiry",
            Severity = severity,
            Message = expiredCount > 0
                ? $"{expiredCount} agents have expired certificates"
                : $"{expiringAgents.Count} agents have certificates expiring within {_options.CertificateWarningDays} days",
            Details = new Dictionary<string, object>
            {
                ["expiringAgents"] = expiringAgents.Select(a => new { a.Id, a.CertificateExpiresAt }).ToList()
            }
        };
    }

    // Warns when agents report more than one distinct version.
    private static DoctorCheckResult CheckVersionConsistency(IReadOnlyList<AgentFleetInfo> agents)
    {
        var versionGroups = agents
            .GroupBy(a => a.Version)
            .OrderByDescending(g => g.Count())
            .ToList();

        if (versionGroups.Count <= 1)
        {
            return DoctorCheckResult.Pass("AgentVersionConsistency",
                $"All agents running version {versionGroups.FirstOrDefault()?.Key ?? "unknown"}");
        }

        return new DoctorCheckResult
        {
            CheckName = "AgentVersionConsistency",
            Severity = DoctorSeverity.Warning,
            Message = $"Version skew detected: {versionGroups.Count} different versions running",
            Details = new Dictionary<string, object>
            {
                ["versions"] = versionGroups.Select(g => new { Version = g.Key, Count = g.Count() }).ToList()
            }
        };
    }

    // Reports agents whose current task count has reached their concurrency limit.
    private static DoctorCheckResult CheckAgentCapacity(IReadOnlyList<AgentFleetInfo> agents)
    {
        var overloadedAgents = agents
            .Where(a => a.CurrentTasks >= a.MaxConcurrentTasks)
            .ToList();

        if (overloadedAgents.Count == 0)
        {
            return DoctorCheckResult.Pass("AgentCapacity", "All agents have available capacity");
        }

        return new DoctorCheckResult
        {
            CheckName = "AgentCapacity",
            // Warning only when more than half of the fleet is saturated.
            Severity = overloadedAgents.Count > agents.Count / 2
                ? DoctorSeverity.Warning
                : DoctorSeverity.Info,
            Message = $"{overloadedAgents.Count} agents at maximum capacity",
            Details = new Dictionary<string, object>
            {
                ["overloadedAgents"] = overloadedAgents.Select(a => a.Id).ToList()
            }
        };
    }

    // Reports agents that have been in the Disconnected state for more than seven days.
    // Agents without a DisconnectedAt value never match.
    private static DoctorCheckResult CheckStaleAgents(IReadOnlyList<AgentFleetInfo> agents, DateTimeOffset now)
    {
        var cutoff = now - StaleDisconnectAge;
        var disconnectedAgents = agents
            .Where(a => a.Status == AgentFleetStatus.Disconnected &&
                        a.DisconnectedAt < cutoff)
            .ToList();

        if (disconnectedAgents.Count == 0)
        {
            return DoctorCheckResult.Pass("StaleAgents", "No stale disconnected agents");
        }

        return new DoctorCheckResult
        {
            CheckName = "StaleAgents",
            Severity = DoctorSeverity.Info,
            Message = $"{disconnectedAgents.Count} agents disconnected for more than 7 days",
            Details = new Dictionary<string, object>
            {
                ["staleAgents"] = disconnectedAgents.Select(a => new { a.Id, a.DisconnectedAt }).ToList()
            },
            Recommendation = "Consider removing stale agents or investigating connectivity issues"
        };
    }

    // Compares the number of pending tasks against the warning and critical thresholds.
    private async Task<DoctorCheckResult> CheckTaskQueueBacklogAsync(CancellationToken cancellationToken)
    {
        var queueStats = await _fleetService.GetTaskQueueStatsAsync(cancellationToken);

        if (queueStats.PendingTasks < _options.TaskQueueWarningThreshold)
        {
            return DoctorCheckResult.Pass("TaskQueueBacklog",
                $"Task queue healthy: {queueStats.PendingTasks} pending tasks");
        }

        var severity = queueStats.PendingTasks > _options.TaskQueueCriticalThreshold
            ? DoctorSeverity.Critical
            : DoctorSeverity.Warning;

        return new DoctorCheckResult
        {
            CheckName = "TaskQueueBacklog",
            Severity = severity,
            Message = $"Task queue backlog: {queueStats.PendingTasks} pending tasks",
            Details = new Dictionary<string, object>
            {
                ["pendingTasks"] = queueStats.PendingTasks,
                // Reported in minutes; 0 when the age is not available.
                ["oldestTaskAge"] = queueStats.OldestTaskAge?.TotalMinutes ?? 0
            },
            Recommendation = "Consider adding more agents or investigating task processing delays"
        };
    }

    // Computes the percentage of failed tasks over the last hour and compares it
    // against the warning and critical thresholds.
    private async Task<DoctorCheckResult> CheckFailedTaskRateAsync(DateTimeOffset now, CancellationToken cancellationToken)
    {
        var stats = await _fleetService.GetTaskStatsAsync(
            now - FailureRateWindow,
            cancellationToken);

        // Guard against division by zero when nothing ran in the window.
        if (stats.TotalTasks == 0)
        {
            return DoctorCheckResult.Pass("FailedTaskRate", "No tasks executed in the last hour");
        }

        var failureRate = (double)stats.FailedTasks / stats.TotalTasks * 100;

        if (failureRate < _options.FailureRateWarningThreshold)
        {
            return DoctorCheckResult.Pass("FailedTaskRate",
                $"Task failure rate: {failureRate:F1}%");
        }

        var severity = failureRate > _options.FailureRateCriticalThreshold
            ? DoctorSeverity.Critical
            : DoctorSeverity.Warning;

        return new DoctorCheckResult
        {
            CheckName = "FailedTaskRate",
            Severity = severity,
            Message = $"High task failure rate: {failureRate:F1}%",
            Details = new Dictionary<string, object>
            {
                ["totalTasks"] = stats.TotalTasks,
                ["failedTasks"] = stats.FailedTasks,
                ["failureRate"] = failureRate
            }
        };
    }
}
|
||||
|
||||
/// <summary>
/// Thresholds used by the agent health plugin's checks.
/// </summary>
public sealed record AgentHealthPluginOptions
{
    /// <summary>Heartbeats older than this are considered stale. Default: 5 minutes.</summary>
    public TimeSpan HeartbeatStaleThreshold { get; init; } = TimeSpan.FromMinutes(5);

    /// <summary>Certificates expiring within this many days are reported. Default: 14.</summary>
    public int CertificateWarningDays { get; init; } = 14;

    /// <summary>Pending task count at which the queue stops passing. Default: 100.</summary>
    public int TaskQueueWarningThreshold { get; init; } = 100;

    /// <summary>Pending task count above which the backlog is critical. Default: 500.</summary>
    public int TaskQueueCriticalThreshold { get; init; } = 500;

    /// <summary>Failure rate, in percent, at which the check stops passing. Default: 5.</summary>
    public double FailureRateWarningThreshold { get; init; } = 5d;

    /// <summary>Failure rate, in percent, above which the result is critical. Default: 20.</summary>
    public double FailureRateCriticalThreshold { get; init; } = 20d;
}
|
||||
|
||||
/// <summary>
/// Read-only view of the agent fleet consumed by the agent health plugin.
/// </summary>
public interface IAgentFleetService
{
    /// <summary>Returns information about every agent in the fleet.</summary>
    /// <param name="cancellationToken">Token used to cancel the request.</param>
    Task<IReadOnlyList<AgentFleetInfo>> GetAllAgentsAsync(
        CancellationToken cancellationToken = default);

    /// <summary>Returns statistics about the pending task queue.</summary>
    /// <param name="cancellationToken">Token used to cancel the request.</param>
    Task<TaskQueueStats> GetTaskQueueStatsAsync(
        CancellationToken cancellationToken = default);

    /// <summary>Returns task execution statistics for the period starting at <paramref name="since"/>.</summary>
    /// <param name="since">Start of the reporting window.</param>
    /// <param name="cancellationToken">Token used to cancel the request.</param>
    Task<TaskExecutionStats> GetTaskStatsAsync(
        DateTimeOffset since,
        CancellationToken cancellationToken = default);
}
|
||||
|
||||
/// <summary>
/// Snapshot of a single agent as reported by the fleet service.
/// </summary>
public sealed record AgentFleetInfo
{
    /// <summary>Agent identifier.</summary>
    public required string Id { get; init; }

    /// <summary>Agent name.</summary>
    public required string Name { get; init; }

    /// <summary>Version string reported for the agent.</summary>
    public required string Version { get; init; }

    /// <summary>Current fleet status of the agent.</summary>
    public required AgentFleetStatus Status { get; init; }

    /// <summary>Timestamp of the agent's last heartbeat.</summary>
    public DateTimeOffset LastHeartbeat { get; init; }

    /// <summary>Certificate expiry timestamp, or null when none is recorded.</summary>
    public DateTimeOffset? CertificateExpiresAt { get; init; }

    /// <summary>Number of tasks the agent is currently running.</summary>
    public int CurrentTasks { get; init; }

    /// <summary>Maximum number of tasks the agent runs concurrently.</summary>
    public int MaxConcurrentTasks { get; init; }

    /// <summary>Timestamp at which the agent disconnected, or null when none is recorded.</summary>
    public DateTimeOffset? DisconnectedAt { get; init; }
}
|
||||
|
||||
/// <summary>
/// Status of an agent within the fleet.
/// </summary>
public enum AgentFleetStatus
{
    /// <summary>Status is not known; this is the default value.</summary>
    Unknown = 0,

    /// <summary>Agent is online.</summary>
    Online = 1,

    /// <summary>Agent is disconnected.</summary>
    Disconnected = 2,

    /// <summary>Agent is draining.</summary>
    Draining = 3
}
|
||||
|
||||
/// <summary>
/// Statistics about the pending task queue.
/// </summary>
public sealed record TaskQueueStats
{
    /// <summary>Number of tasks waiting in the queue.</summary>
    public int PendingTasks { get; init; }

    /// <summary>Age of the oldest queued task, or null when not available.</summary>
    public TimeSpan? OldestTaskAge { get; init; }
}
|
||||
|
||||
/// <summary>
/// Task execution counts for a reporting window.
/// </summary>
public sealed record TaskExecutionStats
{
    /// <summary>Total number of tasks in the window.</summary>
    public int TotalTasks { get; init; }

    /// <summary>Number of tasks that succeeded.</summary>
    public int SuccessfulTasks { get; init; }

    /// <summary>Number of tasks that failed.</summary>
    public int FailedTasks { get; init; }
}
|
||||
@@ -0,0 +1,119 @@
|
||||
// Copyright (c) Stella Ops. All rights reserved. SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
|
||||
using StellaOps.Doctor.Plugins;
|
||||
|
||||
namespace StellaOps.Doctor.Plugins;
|
||||
|
||||
/// <summary>
/// Contract implemented by Doctor plugins: identifying metadata plus a single
/// entry point that runs the plugin's health checks.
/// </summary>
public interface IDoctorPlugin
{
    /// <summary>Name of the plugin.</summary>
    string Name { get; }

    /// <summary>Short description of the plugin.</summary>
    string Description { get; }

    /// <summary>Categories covered by the plugin.</summary>
    string[] Categories { get; }

    /// <summary>
    /// Executes every health check this plugin provides.
    /// </summary>
    /// <param name="context">Execution context for the run.</param>
    /// <param name="cancellationToken">Token used to cancel the run.</param>
    /// <returns>The results of the executed checks.</returns>
    Task<IReadOnlyList<DoctorCheckResult>> RunChecksAsync(
        DoctorContext context,
        CancellationToken cancellationToken = default);
}
|
||||
|
||||
/// <summary>
/// Outcome of a single Doctor check: which check ran, how severe the finding is,
/// and a human-readable message, with optional details and recommendation.
/// </summary>
public sealed record DoctorCheckResult
{
    /// <summary>Name of the check that produced this result.</summary>
    public required string CheckName { get; init; }

    /// <summary>Severity of the finding.</summary>
    public required DoctorSeverity Severity { get; init; }

    /// <summary>Human-readable summary of the finding.</summary>
    public required string Message { get; init; }

    /// <summary>Optional structured details keyed by name.</summary>
    public IReadOnlyDictionary<string, object>? Details { get; init; }

    /// <summary>Optional suggested follow-up action.</summary>
    public string? Recommendation { get; init; }

    /// <summary>Duration associated with the check; defaults to zero.</summary>
    public TimeSpan Duration { get; init; }

    /// <summary>Creates a passing result (severity <c>None</c>).</summary>
    public static DoctorCheckResult Pass(string checkName, string message)
    {
        return Create(checkName, DoctorSeverity.None, message);
    }

    /// <summary>Creates an informational result.</summary>
    public static DoctorCheckResult Info(string checkName, string message)
    {
        return Create(checkName, DoctorSeverity.Info, message);
    }

    /// <summary>Creates a warning result.</summary>
    public static DoctorCheckResult Warning(string checkName, string message)
    {
        return Create(checkName, DoctorSeverity.Warning, message);
    }

    /// <summary>Creates an error result.</summary>
    public static DoctorCheckResult Error(string checkName, string message)
    {
        return Create(checkName, DoctorSeverity.Error, message);
    }

    /// <summary>Creates a critical result.</summary>
    public static DoctorCheckResult Critical(string checkName, string message)
    {
        return Create(checkName, DoctorSeverity.Critical, message);
    }

    // Shared construction path for the severity-specific factories above.
    private static DoctorCheckResult Create(string checkName, DoctorSeverity severity, string message)
    {
        return new DoctorCheckResult
        {
            CheckName = checkName,
            Severity = severity,
            Message = message
        };
    }
}
|
||||
|
||||
/// <summary>
/// Severity levels for Doctor check results, in ascending order.
/// </summary>
public enum DoctorSeverity
{
    /// <summary>No finding; this is the default value.</summary>
    None = 0,

    /// <summary>Informational finding.</summary>
    Info = 1,

    /// <summary>Warning-level finding.</summary>
    Warning = 2,

    /// <summary>Error-level finding.</summary>
    Error = 3,

    /// <summary>Critical finding.</summary>
    Critical = 4
}
|
||||
|
||||
/// <summary>
/// Settings passed to a plugin for one Doctor run.
/// </summary>
public sealed record DoctorContext
{
    /// <summary>
    /// Categories to check; null means all categories.
    /// </summary>
    public IReadOnlyList<string>? Categories { get; init; }

    /// <summary>
    /// Whether detailed diagnostics should be included. Defaults to true.
    /// </summary>
    public bool IncludeDetails { get; init; } = true;

    /// <summary>
    /// Timeout applied to each check. Defaults to 30 seconds.
    /// </summary>
    public TimeSpan CheckTimeout { get; init; } = TimeSpan.FromSeconds(30);
}
|
||||
Reference in New Issue
Block a user