sln build fix (again), tests fixes, audit work and doctors work
This commit is contained in:
@@ -0,0 +1,156 @@
|
||||
using System.Diagnostics;
|
||||
using System.Globalization;
|
||||
using System.Net.Http;
|
||||
using Microsoft.Extensions.Configuration;
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using StellaOps.Doctor.Models;
|
||||
using StellaOps.Doctor.Plugins;
|
||||
using StellaOps.Doctor.Plugins.Builders;
|
||||
|
||||
namespace StellaOps.Doctor.Plugins.ServiceGraph.Checks;
|
||||
|
||||
/// <summary>
/// Verifies connectivity to the StellaOps backend API by requesting its <c>/health</c> endpoint.
/// </summary>
/// <remarks>
/// The backend URL is read from <c>StellaOps:BackendUrl</c>, falling back to <c>BackendUrl</c>.
/// A successful response that takes longer than two seconds is reported as a warning;
/// a non-success status code or any transport error is reported as a failure.
/// </remarks>
public sealed class BackendConnectivityCheck : IDoctorCheck
{
    private const string PluginId = "stellaops.doctor.servicegraph";

    /// <summary>Latency above which a successful response is downgraded to a warning.</summary>
    private const long SlowResponseThresholdMs = 2000;

    /// <summary>Upper bound for the health request.</summary>
    private static readonly TimeSpan RequestTimeout = TimeSpan.FromSeconds(10);

    /// <inheritdoc />
    public string CheckId => "check.servicegraph.backend";

    /// <inheritdoc />
    public string Name => "Backend API Connectivity";

    /// <inheritdoc />
    public string Description => "Verifies the application can connect to the StellaOps backend API";

    /// <inheritdoc />
    public DoctorSeverity DefaultSeverity => DoctorSeverity.Fail;

    /// <inheritdoc />
    public IReadOnlyList<string> Tags => ["connectivity", "api", "quick"];

    /// <inheritdoc />
    public TimeSpan EstimatedDuration => TimeSpan.FromSeconds(5);

    /// <inheritdoc />
    public bool CanRun(DoctorPluginContext context)
        => !string.IsNullOrWhiteSpace(ResolveBackendUrl(context));

    /// <inheritdoc />
    public async Task<DoctorCheckResult> RunAsync(DoctorPluginContext context, CancellationToken ct)
    {
        var result = context.CreateResult(CheckId, PluginId, DoctorCategory.ServiceGraph.ToString());

        var backendUrl = ResolveBackendUrl(context);
        if (string.IsNullOrWhiteSpace(backendUrl))
        {
            return result
                .Skip("Backend URL not configured")
                .WithEvidence("Configuration", e => e.Add("BackendUrl", "(not set)"))
                .Build();
        }

        var httpClientFactory = context.Services.GetService<IHttpClientFactory>();
        if (httpClientFactory == null)
        {
            return result
                .Skip("IHttpClientFactory not available")
                .WithEvidence("Services", e => e.Add("IHttpClientFactory", "not registered"))
                .Build();
        }

        // Computed once, outside the try block, so the success path, the failure
        // evidence and the suggested curl command all refer to the exact URL probed
        // (and a trailing slash in configuration never yields "//health").
        var healthUrl = backendUrl.TrimEnd('/') + "/health";

        try
        {
            using var client = httpClientFactory.CreateClient();
            client.Timeout = RequestTimeout;

            var sw = Stopwatch.StartNew();
            using var response = await client.GetAsync(healthUrl, ct);
            sw.Stop();

            var latencyMs = sw.ElapsedMilliseconds;
            var statusCode = ((int)response.StatusCode).ToString(CultureInfo.InvariantCulture);

            if (!response.IsSuccessStatusCode)
            {
                var reasonPhrase = response.ReasonPhrase ?? "unknown";

                return result
                    .Fail($"Backend API returned {(int)response.StatusCode} {response.StatusCode}")
                    .WithEvidence("Backend connectivity", e =>
                    {
                        e.Add("Url", healthUrl);
                        e.Add("StatusCode", statusCode);
                        e.Add("ReasonPhrase", reasonPhrase);
                    })
                    .WithCauses(
                        "Backend service is down",
                        "Backend is returning errors",
                        "Authentication/authorization failure")
                    .WithRemediation(r => r
                        .AddManualStep(1, "Check backend logs", "kubectl logs -l app=stellaops-backend")
                        .AddManualStep(2, "Verify backend health", $"curl -v {healthUrl}"))
                    .WithVerification("stella doctor --check check.servicegraph.backend")
                    .Build();
            }

            if (latencyMs > SlowResponseThresholdMs)
            {
                return result
                    .Warn($"Backend API responding slowly ({latencyMs}ms)")
                    .WithEvidence("Backend connectivity", e =>
                    {
                        e.Add("Url", healthUrl);
                        e.Add("StatusCode", statusCode);
                        e.Add("LatencyMs", latencyMs.ToString(CultureInfo.InvariantCulture));
                    })
                    .WithCauses(
                        "Network latency issues",
                        "Backend under heavy load",
                        "Firewall inspection delays")
                    .Build();
            }

            return result
                .Pass($"Backend API healthy ({latencyMs}ms)")
                .WithEvidence("Backend connectivity", e =>
                {
                    e.Add("Url", healthUrl);
                    e.Add("StatusCode", statusCode);
                    e.Add("LatencyMs", latencyMs.ToString(CultureInfo.InvariantCulture));
                })
                .Build();
        }
        catch (OperationCanceledException) when (ct.IsCancellationRequested)
        {
            // Caller-requested cancellation must propagate. Catching the base
            // OperationCanceledException (not only TaskCanceledException) keeps a
            // cancelled run from being misreported as a connectivity failure.
            throw;
        }
        catch (Exception ex)
        {
            // Includes the HttpClient timeout, which surfaces as a TaskCanceledException
            // while the caller's token is still not cancelled.
            return result
                .Fail($"Failed to connect to backend: {ex.Message}")
                .WithEvidence("Backend connectivity", e =>
                {
                    e.Add("Url", healthUrl);
                    e.Add("ErrorType", ex.GetType().Name);
                    e.Add("Error", ex.Message);
                })
                .WithCauses(
                    "Backend URL is incorrect",
                    "Network connectivity issues",
                    "DNS resolution failure",
                    "Firewall blocking connection")
                .WithRemediation(r => r
                    .AddManualStep(1, "Verify URL", "Check STELLAOPS_BACKEND_URL environment variable")
                    .AddManualStep(2, "Test connectivity", $"curl -v {healthUrl}"))
                .WithVerification("stella doctor --check check.servicegraph.backend")
                .Build();
        }
    }

    /// <summary>
    /// Reads the backend base URL from <c>StellaOps:BackendUrl</c>, falling back to <c>BackendUrl</c>.
    /// </summary>
    /// <returns>The configured value, or <c>null</c> when neither key is present.</returns>
    private static string? ResolveBackendUrl(DoctorPluginContext context)
        => context.Configuration.GetValue<string>("StellaOps:BackendUrl")
           ?? context.Configuration.GetValue<string>("BackendUrl");
}
|
||||
@@ -0,0 +1,96 @@
|
||||
using System.Globalization;
|
||||
using Microsoft.Extensions.Configuration;
|
||||
using StellaOps.Doctor.Models;
|
||||
using StellaOps.Doctor.Plugins;
|
||||
using StellaOps.Doctor.Plugins.Builders;
|
||||
|
||||
namespace StellaOps.Doctor.Plugins.ServiceGraph.Checks;
|
||||
|
||||
/// <summary>
/// Inspects the circuit breaker settings found in configuration and reports values
/// that look too aggressive. No live circuit state is queried.
/// </summary>
public sealed class CircuitBreakerStatusCheck : IDoctorCheck
{
    /// <inheritdoc />
    public string CheckId => "check.servicegraph.circuitbreaker";

    /// <inheritdoc />
    public string Name => "Circuit Breaker Status";

    /// <inheritdoc />
    public string Description => "Checks the status of circuit breakers for external service calls";

    /// <inheritdoc />
    public DoctorSeverity DefaultSeverity => DoctorSeverity.Warn;

    /// <inheritdoc />
    public IReadOnlyList<string> Tags => ["resilience", "circuit-breaker"];

    /// <inheritdoc />
    public TimeSpan EstimatedDuration => TimeSpan.FromSeconds(2);

    /// <inheritdoc />
    /// <remarks>Always runnable; an informational result is produced when resilience is not enabled.</remarks>
    public bool CanRun(DoctorPluginContext context) => true;

    /// <inheritdoc />
    public Task<DoctorCheckResult> RunAsync(DoctorPluginContext context, CancellationToken ct)
    {
        var result = context.CreateResult(CheckId, "stellaops.doctor.servicegraph", DoctorCategory.ServiceGraph.ToString());
        var configuration = context.Configuration;

        // The second key is consulted only when the first is absent (null),
        // not when it is explicitly false.
        var enabled = configuration.GetValue<bool?>("Resilience:Enabled");
        enabled ??= configuration.GetValue<bool?>("HttpClient:Resilience:Enabled");

        if (enabled is not true)
        {
            var notConfigured = result
                .Info("Circuit breakers not configured")
                .WithEvidence("Resilience configuration", e =>
                {
                    e.Add("ResilienceEnabled", "false");
                    e.Add("Recommendation", "Consider enabling HTTP resilience for external calls");
                })
                .Build();

            return Task.FromResult(notConfigured);
        }

        int ReadInt(string key, int fallback) => configuration.GetValue<int?>(key) ?? fallback;

        var breakSeconds = ReadInt("Resilience:CircuitBreaker:BreakDurationSeconds", 30);
        var threshold = ReadInt("Resilience:CircuitBreaker:FailureThreshold", 5);
        var samplingSeconds = ReadInt("Resilience:CircuitBreaker:SamplingDurationSeconds", 60);

        var evidence = new EvidenceBuilder(context);
        evidence.Add("Enabled", "true");
        evidence.Add("BreakDurationSeconds", breakSeconds.ToString(CultureInfo.InvariantCulture));
        evidence.Add("FailureThreshold", threshold.ToString(CultureInfo.InvariantCulture));
        evidence.Add("SamplingDurationSeconds", samplingSeconds.ToString(CultureInfo.InvariantCulture));

        // Break duration is evaluated first; only the first problem found is reported.
        if (breakSeconds < 5)
        {
            var shortBreak = result
                .Warn("Circuit breaker break duration is very short")
                .WithEvidence(evidence.Build("Circuit breaker configuration"))
                .WithCauses("Break duration less than 5 seconds may cause excessive retries")
                .WithRemediation(r => r
                    .AddManualStep(1, "Increase break duration", "Set Resilience:CircuitBreaker:BreakDurationSeconds to 30"))
                .Build();

            return Task.FromResult(shortBreak);
        }

        if (threshold < 2)
        {
            var lowThreshold = result
                .Warn("Circuit breaker failure threshold is very low")
                .WithEvidence(evidence.Build("Circuit breaker configuration"))
                .WithCauses("Threshold of 1 may cause circuit to open on transient failures")
                .WithRemediation(r => r
                    .AddManualStep(1, "Increase threshold", "Set Resilience:CircuitBreaker:FailureThreshold to 5"))
                .Build();

            return Task.FromResult(lowThreshold);
        }

        var healthy = result
            .Pass("Circuit breakers configured correctly")
            .WithEvidence(evidence.Build("Circuit breaker configuration"))
            .Build();

        return Task.FromResult(healthy);
    }
}
|
||||
@@ -0,0 +1,152 @@
|
||||
using System.Globalization;
|
||||
using System.Net.Sockets;
|
||||
using Microsoft.Extensions.Configuration;
|
||||
using StellaOps.Doctor.Models;
|
||||
using StellaOps.Doctor.Plugins;
|
||||
using StellaOps.Doctor.Plugins.Builders;
|
||||
|
||||
namespace StellaOps.Doctor.Plugins.ServiceGraph.Checks;
|
||||
|
||||
/// <summary>
/// Verifies that the configured message queue (RabbitMQ) accepts TCP connections.
/// </summary>
/// <remarks>
/// The host is read from <c>RabbitMQ:Host</c>, falling back to <c>Messaging:RabbitMQ:Host</c>;
/// the port from <c>RabbitMQ:Port</c>, then <c>Messaging:RabbitMQ:Port</c>, then 5672.
/// Only a TCP connect is attempted; nothing is sent over the connection.
/// </remarks>
public sealed class MessageQueueCheck : IDoctorCheck
{
    private const string PluginId = "stellaops.doctor.servicegraph";
    private const int DefaultPort = 5672;

    /// <summary>Maximum time allowed for the TCP connect before it is reported as a timeout.</summary>
    private static readonly TimeSpan ConnectTimeout = TimeSpan.FromSeconds(5);

    /// <inheritdoc />
    public string CheckId => "check.servicegraph.mq";

    /// <inheritdoc />
    public string Name => "Message Queue Connectivity";

    /// <inheritdoc />
    public string Description => "Verifies connectivity to the message queue service";

    /// <inheritdoc />
    public DoctorSeverity DefaultSeverity => DoctorSeverity.Warn;

    /// <inheritdoc />
    public IReadOnlyList<string> Tags => ["connectivity", "messaging", "rabbitmq"];

    /// <inheritdoc />
    public TimeSpan EstimatedDuration => TimeSpan.FromSeconds(5);

    /// <inheritdoc />
    public bool CanRun(DoctorPluginContext context)
        => !string.IsNullOrWhiteSpace(ResolveHost(context));

    /// <inheritdoc />
    public async Task<DoctorCheckResult> RunAsync(DoctorPluginContext context, CancellationToken ct)
    {
        var result = context.CreateResult(CheckId, PluginId, DoctorCategory.ServiceGraph.ToString());

        var rabbitHost = ResolveHost(context);
        var rabbitPort = context.Configuration.GetValue<int?>("RabbitMQ:Port")
            ?? context.Configuration.GetValue<int?>("Messaging:RabbitMQ:Port")
            ?? DefaultPort;

        if (string.IsNullOrWhiteSpace(rabbitHost))
        {
            return result
                .Skip("Message queue not configured")
                .WithEvidence("Configuration", e => e.Add("RabbitMQ:Host", "(not set)"))
                .Build();
        }

        try
        {
            using var client = new TcpClient();

            // A linked token source gives the connect attempt its own deadline while
            // still honouring the caller's token. This replaces racing the connect
            // against Task.Delay, which consumed the ValueTask twice, left the connect
            // attempt unobserved on timeout, and reported caller cancellation as a timeout.
            using var timeoutCts = CancellationTokenSource.CreateLinkedTokenSource(ct);
            timeoutCts.CancelAfter(ConnectTimeout);

            try
            {
                await client.ConnectAsync(rabbitHost, rabbitPort, timeoutCts.Token);
            }
            catch (OperationCanceledException) when (!ct.IsCancellationRequested)
            {
                // Only the deadline fired: the caller did not cancel.
                return result
                    .Fail($"Connection to RabbitMQ at {rabbitHost}:{rabbitPort} timed out")
                    .WithEvidence("Message queue connectivity", e =>
                    {
                        e.Add("Host", rabbitHost);
                        e.Add("Port", rabbitPort.ToString(CultureInfo.InvariantCulture));
                        e.Add("Status", "timeout");
                    })
                    .WithCauses(
                        "RabbitMQ server is not running",
                        "Network connectivity issues",
                        "Firewall blocking AMQP port")
                    .WithRemediation(r => r
                        .AddManualStep(1, "Check RabbitMQ status", "docker ps | grep rabbitmq")
                        .AddManualStep(2, "Check RabbitMQ logs", "docker logs rabbitmq")
                        .AddManualStep(3, "Start RabbitMQ", "docker-compose up -d rabbitmq"))
                    .WithVerification("stella doctor --check check.servicegraph.mq")
                    .Build();
            }

            if (client.Connected)
            {
                return result
                    .Pass($"Message queue reachable at {rabbitHost}:{rabbitPort}")
                    .WithEvidence("Message queue connectivity", e =>
                    {
                        e.Add("Host", rabbitHost);
                        e.Add("Port", rabbitPort.ToString(CultureInfo.InvariantCulture));
                        e.Add("Status", "connected");
                    })
                    .Build();
            }

            return result
                .Fail($"Failed to connect to message queue at {rabbitHost}:{rabbitPort}")
                .WithEvidence("Message queue connectivity", e =>
                {
                    e.Add("Host", rabbitHost);
                    e.Add("Port", rabbitPort.ToString(CultureInfo.InvariantCulture));
                    e.Add("Status", "connection_failed");
                })
                .Build();
        }
        catch (SocketException ex)
        {
            return result
                .Fail($"Socket error connecting to message queue: {ex.Message}")
                .WithEvidence("Message queue connectivity", e =>
                {
                    e.Add("Host", rabbitHost);
                    e.Add("Port", rabbitPort.ToString(CultureInfo.InvariantCulture));
                    e.Add("SocketErrorCode", ex.SocketErrorCode.ToString());
                    e.Add("Error", ex.Message);
                })
                .WithCauses(
                    "RabbitMQ server is not running",
                    "DNS resolution failed",
                    "Network unreachable")
                .WithRemediation(r => r
                    .AddManualStep(1, "Start RabbitMQ", "docker-compose up -d rabbitmq")
                    .AddManualStep(2, "Verify DNS", $"nslookup {rabbitHost}"))
                .WithVerification("stella doctor --check check.servicegraph.mq")
                .Build();
        }
        catch (Exception ex) when (ex is not OperationCanceledException)
        {
            // Caller cancellation is deliberately excluded so it propagates to the caller.
            return result
                .Fail($"Error connecting to message queue: {ex.Message}")
                .WithEvidence("Message queue connectivity", e =>
                {
                    e.Add("Host", rabbitHost);
                    e.Add("Port", rabbitPort.ToString(CultureInfo.InvariantCulture));
                    e.Add("ErrorType", ex.GetType().Name);
                    e.Add("Error", ex.Message);
                })
                .Build();
        }
    }

    /// <summary>
    /// Reads the broker host from <c>RabbitMQ:Host</c>, falling back to <c>Messaging:RabbitMQ:Host</c>.
    /// </summary>
    /// <returns>The configured host, or <c>null</c> when neither key is present.</returns>
    private static string? ResolveHost(DoctorPluginContext context)
        => context.Configuration.GetValue<string>("RabbitMQ:Host")
           ?? context.Configuration.GetValue<string>("Messaging:RabbitMQ:Host");
}
|
||||
@@ -0,0 +1,140 @@
|
||||
using System.Diagnostics;
|
||||
using System.Globalization;
|
||||
using Microsoft.Extensions.Configuration;
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using StellaOps.Doctor.Models;
|
||||
using StellaOps.Doctor.Plugins;
|
||||
using StellaOps.Doctor.Plugins.Builders;
|
||||
|
||||
namespace StellaOps.Doctor.Plugins.ServiceGraph.Checks;
|
||||
|
||||
/// <summary>
/// Probes the <c>/health</c> endpoint of every StellaOps service whose base URL is configured.
/// </summary>
/// <remarks>
/// Endpoints are requested one after another with the same HTTP client. An endpoint counts
/// as healthy when it answers with a success status code; any other status, a transport
/// error or a request timeout counts as a failure for that endpoint only.
/// </remarks>
public sealed class ServiceEndpointsCheck : IDoctorCheck
{
    private const string PluginId = "stellaops.doctor.servicegraph";
    private const string HealthPath = "/health";

    /// <summary>Per-request timeout applied to the HTTP client.</summary>
    private static readonly TimeSpan RequestTimeout = TimeSpan.FromSeconds(10);

    /// <summary>Display name and configuration key of each service that may be probed.</summary>
    private static readonly (string Name, string ConfigKey)[] KnownServices =
    [
        ("Authority", "StellaOps:AuthorityUrl"),
        ("Scanner", "StellaOps:ScannerUrl"),
        ("Concelier", "StellaOps:ConcelierUrl"),
        ("Excititor", "StellaOps:ExcititorUrl"),
        ("Attestor", "StellaOps:AttestorUrl"),
        ("VexLens", "StellaOps:VexLensUrl"),
        ("Gateway", "StellaOps:GatewayUrl"),
    ];

    /// <inheritdoc />
    public string CheckId => "check.servicegraph.endpoints";

    /// <inheritdoc />
    public string Name => "Service Endpoints";

    /// <inheritdoc />
    public string Description => "Verifies all configured service endpoints are reachable";

    /// <inheritdoc />
    public DoctorSeverity DefaultSeverity => DoctorSeverity.Fail;

    /// <inheritdoc />
    public IReadOnlyList<string> Tags => ["connectivity", "services", "full"];

    /// <inheritdoc />
    public TimeSpan EstimatedDuration => TimeSpan.FromSeconds(30);

    /// <inheritdoc />
    public bool CanRun(DoctorPluginContext context)
    {
        return context.Services.GetService<IHttpClientFactory>() != null;
    }

    /// <inheritdoc />
    public async Task<DoctorCheckResult> RunAsync(DoctorPluginContext context, CancellationToken ct)
    {
        var result = context.CreateResult(CheckId, PluginId, DoctorCategory.ServiceGraph.ToString());

        var httpClientFactory = context.Services.GetService<IHttpClientFactory>();
        if (httpClientFactory == null)
        {
            return result
                .Skip("IHttpClientFactory not available")
                .Build();
        }

        var endpoints = new List<(string Name, string Url)>();
        foreach (var (serviceName, configKey) in KnownServices)
        {
            AddEndpointIfConfigured(endpoints, context.Configuration, serviceName, configKey, HealthPath);
        }

        if (endpoints.Count == 0)
        {
            return result
                .Skip("No service endpoints configured")
                .WithEvidence("Configuration", e => e.Add("EndpointsFound", "0"))
                .Build();
        }

        using var client = httpClientFactory.CreateClient();
        client.Timeout = RequestTimeout;

        var results = new List<(string Name, string Url, bool Success, int StatusCode, long LatencyMs, string? Error)>();

        foreach (var (name, url) in endpoints)
        {
            try
            {
                var sw = Stopwatch.StartNew();
                using var response = await client.GetAsync(url, ct);
                sw.Stop();

                results.Add((name, url, response.IsSuccessStatusCode, (int)response.StatusCode, sw.ElapsedMilliseconds, null));
            }
            catch (Exception ex) when (ex is not OperationCanceledException || !ct.IsCancellationRequested)
            {
                // HttpClient reports its own timeout as a TaskCanceledException. That must be
                // recorded as a failure of this endpoint rather than aborting the whole check;
                // only cancellation requested through the caller's token is allowed to propagate.
                results.Add((name, url, false, 0, 0, ex.Message));
            }
        }

        var passed = results.Count(r => r.Success);
        var failed = results.Count - passed;

        var evidenceBuilder = new EvidenceBuilder(context);
        evidenceBuilder.Add("TotalEndpoints", endpoints.Count.ToString(CultureInfo.InvariantCulture));
        evidenceBuilder.Add("Healthy", passed.ToString(CultureInfo.InvariantCulture));
        evidenceBuilder.Add("Unhealthy", failed.ToString(CultureInfo.InvariantCulture));

        foreach (var probe in results)
        {
            // A null Error with Success == false means the endpoint answered with a non-success status.
            var status = probe.Success
                ? $"OK ({probe.LatencyMs}ms)"
                : $"FAIL: {probe.Error ?? $"HTTP {probe.StatusCode}"}";
            evidenceBuilder.Add(probe.Name, status);
        }

        if (failed > 0)
        {
            var causes = results
                .Where(r => !r.Success)
                .Select(r => $"{r.Name} service is down or unreachable")
                .ToArray();

            return result
                .Fail($"{failed} of {endpoints.Count} service endpoints are unreachable")
                .WithEvidence(evidenceBuilder.Build("Service endpoints"))
                .WithCauses(causes)
                .WithRemediation(r => r
                    .AddManualStep(1, "Check service status", "kubectl get pods -l app=stellaops")
                    .AddManualStep(2, "Check service logs", "kubectl logs -l app=stellaops --tail=100"))
                .WithVerification("stella doctor --check check.servicegraph.endpoints")
                .Build();
        }

        return result
            .Pass($"All {endpoints.Count} service endpoints are healthy")
            .WithEvidence(evidenceBuilder.Build("Service endpoints"))
            .Build();
    }

    /// <summary>
    /// Appends a probe target for <paramref name="name"/> when <paramref name="configKey"/> holds a non-blank URL.
    /// </summary>
    /// <param name="endpoints">List that receives the (name, health URL) pair.</param>
    /// <param name="configuration">Configuration to read the base URL from.</param>
    /// <param name="name">Display name of the service.</param>
    /// <param name="configKey">Configuration key holding the service base URL.</param>
    /// <param name="healthPath">Path appended to the base URL after trailing slashes are removed.</param>
    private static void AddEndpointIfConfigured(
        List<(string Name, string Url)> endpoints,
        IConfiguration configuration,
        string name,
        string configKey,
        string healthPath)
    {
        var url = configuration.GetValue<string>(configKey);
        if (!string.IsNullOrWhiteSpace(url))
        {
            endpoints.Add((name, url.TrimEnd('/') + healthPath));
        }
    }
}
|
||||
@@ -0,0 +1,102 @@
|
||||
using System.Globalization;
|
||||
using Microsoft.Extensions.Configuration;
|
||||
using StellaOps.Doctor.Models;
|
||||
using StellaOps.Doctor.Plugins;
|
||||
using StellaOps.Doctor.Plugins.Builders;
|
||||
|
||||
namespace StellaOps.Doctor.Plugins.ServiceGraph.Checks;
|
||||
|
||||
/// <summary>
/// Reviews the timeout values found in configuration and warns about values
/// that fall outside the ranges this check considers reasonable.
/// </summary>
public sealed class ServiceTimeoutCheck : IDoctorCheck
{
    /// <inheritdoc />
    public string CheckId => "check.servicegraph.timeouts";

    /// <inheritdoc />
    public string Name => "Service Timeouts";

    /// <inheritdoc />
    public string Description => "Validates timeout configurations for service calls";

    /// <inheritdoc />
    public DoctorSeverity DefaultSeverity => DoctorSeverity.Warn;

    /// <inheritdoc />
    public IReadOnlyList<string> Tags => ["configuration", "timeouts", "quick"];

    /// <inheritdoc />
    public TimeSpan EstimatedDuration => TimeSpan.FromSeconds(1);

    /// <inheritdoc />
    /// <remarks>Always runnable; missing settings fall back to built-in defaults.</remarks>
    public bool CanRun(DoctorPluginContext context) => true;

    /// <inheritdoc />
    public Task<DoctorCheckResult> RunAsync(DoctorPluginContext context, CancellationToken ct)
    {
        var result = context.CreateResult(CheckId, "stellaops.doctor.servicegraph", DoctorCategory.ServiceGraph.ToString());

        var findings = new List<string>();
        var evidence = new EvidenceBuilder(context);

        // Every value is read as an integer; the evidence and messages label them as seconds.
        int Seconds(string key, int fallback) => context.Configuration.GetValue<int?>(key) ?? fallback;

        var http = Seconds("HttpClient:Timeout", 100);
        var database = Seconds("Database:CommandTimeout", 30);
        var cache = Seconds("Cache:OperationTimeout", 5);
        var healthCheck = Seconds("HealthChecks:Timeout", 10);

        evidence.Add("HttpClientTimeout", $"{http}s");
        evidence.Add("DatabaseCommandTimeout", $"{database}s");
        evidence.Add("CacheOperationTimeout", $"{cache}s");
        evidence.Add("HealthCheckTimeout", $"{healthCheck}s");

        switch (http)
        {
            case > 300:
                findings.Add($"HTTP client timeout ({http}s) is very high - may cause resource exhaustion");
                break;
            case < 5:
                findings.Add($"HTTP client timeout ({http}s) is very low - may cause premature failures");
                break;
        }

        switch (database)
        {
            case > 120:
                findings.Add($"Database command timeout ({database}s) is very high - consider query optimization");
                break;
            case < 5:
                findings.Add($"Database command timeout ({database}s) is very low - complex queries may fail");
                break;
        }

        // The cache timeout has an upper bound only.
        if (cache > 30)
        {
            findings.Add($"Cache operation timeout ({cache}s) is too high for a cache service");
        }

        if (healthCheck > http)
        {
            findings.Add("Health check timeout exceeds HTTP client timeout - health checks may fail prematurely");
        }

        if (findings.Count == 0)
        {
            var ok = result
                .Pass("Service timeouts are configured appropriately")
                .WithEvidence(evidence.Build("Timeout configuration"))
                .Build();

            return Task.FromResult(ok);
        }

        var warning = result
            .Warn($"{findings.Count} timeout configuration issue(s) found")
            .WithEvidence(evidence.Build("Timeout configuration"))
            .WithCauses(findings.ToArray())
            .WithRemediation(r => r
                .AddManualStep(1, "Review timeout values", "Check configuration and adjust timeouts based on expected service latencies"))
            .WithVerification("stella doctor --check check.servicegraph.timeouts")
            .Build();

        return Task.FromResult(warning);
    }
}
|
||||
@@ -0,0 +1,192 @@
|
||||
using System.Globalization;
|
||||
using System.Net.Sockets;
|
||||
using Microsoft.Extensions.Configuration;
|
||||
using StellaOps.Doctor.Models;
|
||||
using StellaOps.Doctor.Plugins;
|
||||
using StellaOps.Doctor.Plugins.Builders;
|
||||
|
||||
namespace StellaOps.Doctor.Plugins.ServiceGraph.Checks;
|
||||
|
||||
/// <summary>
/// Verifies that the configured Valkey/Redis endpoint accepts TCP connections.
/// </summary>
/// <remarks>
/// The connection string is taken from the first of <c>Valkey:ConnectionString</c>,
/// <c>Redis:ConnectionString</c>, <c>ConnectionStrings:Valkey</c> and
/// <c>ConnectionStrings:Redis</c> that is present. Only the first comma-separated
/// segment is used as the endpoint, and only a TCP connect is attempted.
/// </remarks>
public sealed class ValkeyConnectivityCheck : IDoctorCheck
{
    private const string PluginId = "stellaops.doctor.servicegraph";
    private const int DefaultPort = 6379;

    /// <summary>Maximum time allowed for the TCP connect before it is reported as a timeout.</summary>
    private static readonly TimeSpan ConnectTimeout = TimeSpan.FromSeconds(5);

    /// <inheritdoc />
    public string CheckId => "check.servicegraph.valkey";

    /// <inheritdoc />
    public string Name => "Valkey/Redis Connectivity";

    /// <inheritdoc />
    public string Description => "Verifies the application can connect to the Valkey/Redis cache service";

    /// <inheritdoc />
    public DoctorSeverity DefaultSeverity => DoctorSeverity.Warn;

    /// <inheritdoc />
    public IReadOnlyList<string> Tags => ["connectivity", "cache", "valkey", "redis"];

    /// <inheritdoc />
    public TimeSpan EstimatedDuration => TimeSpan.FromSeconds(5);

    /// <inheritdoc />
    public bool CanRun(DoctorPluginContext context)
        => !string.IsNullOrWhiteSpace(ResolveConnectionString(context));

    /// <inheritdoc />
    public async Task<DoctorCheckResult> RunAsync(DoctorPluginContext context, CancellationToken ct)
    {
        var result = context.CreateResult(CheckId, PluginId, DoctorCategory.ServiceGraph.ToString());

        var connectionString = ResolveConnectionString(context);
        if (string.IsNullOrWhiteSpace(connectionString))
        {
            return result
                .Skip("Valkey/Redis not configured")
                .WithEvidence("Configuration", e => e.Add("ConnectionString", "(not set)"))
                .Build();
        }

        var (host, port) = ParseConnectionString(connectionString);

        if (string.IsNullOrWhiteSpace(host))
        {
            return result
                .Fail("Invalid Valkey connection string - cannot parse host")
                .WithEvidence("Configuration", e => e.Add("ConnectionString", RedactConnectionString(connectionString)))
                .WithCauses("Connection string format is invalid")
                .WithRemediation(r => r
                    .AddManualStep(1, "Fix connection string", "Use format: host:port or host:port,password=xxx"))
                .Build();
        }

        try
        {
            using var client = new TcpClient();

            // A linked token source gives the connect attempt its own deadline while
            // still honouring the caller's token. This replaces racing the connect
            // against Task.Delay, which consumed the ValueTask twice, left the connect
            // attempt unobserved on timeout, and reported caller cancellation as a timeout.
            using var timeoutCts = CancellationTokenSource.CreateLinkedTokenSource(ct);
            timeoutCts.CancelAfter(ConnectTimeout);

            try
            {
                await client.ConnectAsync(host, port, timeoutCts.Token);
            }
            catch (OperationCanceledException) when (!ct.IsCancellationRequested)
            {
                // Only the deadline fired: the caller did not cancel.
                return result
                    .Fail($"Connection to {host}:{port} timed out")
                    .WithEvidence("Valkey connectivity", e =>
                    {
                        e.Add("Host", host);
                        e.Add("Port", port.ToString(CultureInfo.InvariantCulture));
                        e.Add("Status", "timeout");
                    })
                    .WithCauses(
                        "Valkey server is not running",
                        "Network connectivity issues",
                        "Firewall blocking port " + port)
                    .WithRemediation(r => r
                        .AddManualStep(1, "Check Valkey status", "docker ps | grep valkey")
                        .AddManualStep(2, "Test port connectivity", $"nc -zv {host} {port}"))
                    .WithVerification("stella doctor --check check.servicegraph.valkey")
                    .Build();
            }

            if (client.Connected)
            {
                return result
                    .Pass($"Valkey reachable at {host}:{port}")
                    .WithEvidence("Valkey connectivity", e =>
                    {
                        e.Add("Host", host);
                        e.Add("Port", port.ToString(CultureInfo.InvariantCulture));
                        e.Add("Status", "connected");
                    })
                    .Build();
            }

            return result
                .Fail($"Failed to connect to Valkey at {host}:{port}")
                .WithEvidence("Valkey connectivity", e =>
                {
                    e.Add("Host", host);
                    e.Add("Port", port.ToString(CultureInfo.InvariantCulture));
                    e.Add("Status", "connection_failed");
                })
                .WithCauses(
                    "Valkey server refused connection",
                    "Network issues")
                .Build();
        }
        catch (SocketException ex)
        {
            return result
                .Fail($"Socket error connecting to Valkey: {ex.Message}")
                .WithEvidence("Valkey connectivity", e =>
                {
                    e.Add("Host", host);
                    e.Add("Port", port.ToString(CultureInfo.InvariantCulture));
                    e.Add("SocketErrorCode", ex.SocketErrorCode.ToString());
                    e.Add("Error", ex.Message);
                })
                .WithCauses(
                    "Valkey server is not running",
                    "DNS resolution failed",
                    "Network unreachable")
                .WithRemediation(r => r
                    .AddManualStep(1, "Start Valkey", "docker-compose up -d valkey")
                    .AddManualStep(2, "Check DNS", $"nslookup {host}"))
                .WithVerification("stella doctor --check check.servicegraph.valkey")
                .Build();
        }
        catch (Exception ex) when (ex is not OperationCanceledException)
        {
            // Caller cancellation is deliberately excluded so it propagates to the caller.
            return result
                .Fail($"Error connecting to Valkey: {ex.Message}")
                .WithEvidence("Valkey connectivity", e =>
                {
                    e.Add("Host", host);
                    e.Add("Port", port.ToString(CultureInfo.InvariantCulture));
                    e.Add("ErrorType", ex.GetType().Name);
                    e.Add("Error", ex.Message);
                })
                .Build();
        }
    }

    /// <summary>
    /// Returns the first connection string present among the supported configuration keys.
    /// </summary>
    /// <returns>The configured value, or <c>null</c> when none of the keys is present.</returns>
    private static string? ResolveConnectionString(DoctorPluginContext context)
        => context.Configuration.GetValue<string>("Valkey:ConnectionString")
           ?? context.Configuration.GetValue<string>("Redis:ConnectionString")
           ?? context.Configuration.GetValue<string>("ConnectionStrings:Valkey")
           ?? context.Configuration.GetValue<string>("ConnectionStrings:Redis");

    /// <summary>
    /// Extracts the host and port from the first comma-separated segment of a connection string.
    /// </summary>
    /// <param name="connectionString">Connection string such as <c>host:port,password=xxx</c>.</param>
    /// <returns>
    /// The host and port. The port defaults to 6379 when missing or not numeric. The host is
    /// empty when the first segment is an option (<c>key=value</c>) rather than an endpoint,
    /// so that an option value such as a password is never treated as a host name.
    /// </returns>
    private static (string Host, int Port) ParseConnectionString(string connectionString)
    {
        var endpoint = connectionString.Split(',')[0].Trim();

        // "password=secret" (no endpoint given) must not be reported or dialled as a host.
        if (endpoint.Contains('='))
        {
            return (string.Empty, DefaultPort);
        }

        string host;
        string portText;

        var closingBracket = endpoint.StartsWith('[') ? endpoint.IndexOf(']') : -1;
        if (closingBracket > 0)
        {
            // Bracketed IPv6 literal: [::1] or [::1]:6379.
            host = endpoint[1..closingBracket];
            var remainder = endpoint[(closingBracket + 1)..];
            portText = remainder.StartsWith(':') ? remainder[1..] : string.Empty;
        }
        else
        {
            var segments = endpoint.Split(':');
            host = segments[0];
            portText = segments.Length > 1 ? segments[1] : string.Empty;
        }

        var port = int.TryParse(portText, NumberStyles.Integer, CultureInfo.InvariantCulture, out var parsed)
            ? parsed
            : DefaultPort;

        return (host.Trim(), port);
    }

    /// <summary>
    /// Replaces the value of every <c>password</c> option in a connection string with <c>***</c>.
    /// </summary>
    /// <param name="connectionString">Comma-separated connection string.</param>
    /// <returns>The connection string with password values masked; other segments are unchanged.</returns>
    private static string RedactConnectionString(string connectionString)
    {
        var parts = connectionString.Split(',');
        for (var i = 0; i < parts.Length; i++)
        {
            var equalsIndex = parts[i].IndexOf('=');
            if (equalsIndex <= 0)
            {
                continue;
            }

            // Trim the key so " password=..." and "password = ..." are masked as well.
            var key = parts[i][..equalsIndex].Trim();
            if (key.Equals("password", StringComparison.OrdinalIgnoreCase))
            {
                parts[i] = "password=***";
            }
        }

        return string.Join(",", parts);
    }
}
|
||||
@@ -0,0 +1,20 @@
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using Microsoft.Extensions.DependencyInjection.Extensions;
|
||||
using StellaOps.Doctor.Plugins;
|
||||
|
||||
namespace StellaOps.Doctor.Plugins.ServiceGraph.DependencyInjection;
|
||||
|
||||
/// <summary>
/// Dependency injection helpers for the ServiceGraph doctor plugin.
/// </summary>
public static class ServiceGraphPluginExtensions
{
    /// <summary>
    /// Registers <see cref="ServiceGraphPlugin"/> as a singleton <see cref="IDoctorPlugin"/>.
    /// </summary>
    /// <param name="services">The service collection to add the plugin to.</param>
    /// <returns>The same <paramref name="services"/> instance, to allow call chaining.</returns>
    public static IServiceCollection AddDoctorServiceGraphPlugin(this IServiceCollection services)
    {
        // TryAddEnumerable leaves the collection unchanged when this exact
        // service/implementation pair is already registered.
        var pluginDescriptor = ServiceDescriptor.Singleton<IDoctorPlugin, ServiceGraphPlugin>();
        services.TryAddEnumerable(pluginDescriptor);

        return services;
    }
}
|
||||
@@ -0,0 +1,54 @@
|
||||
using StellaOps.Doctor.Models;
|
||||
using StellaOps.Doctor.Plugins;
|
||||
using StellaOps.Doctor.Plugins.ServiceGraph.Checks;
|
||||
|
||||
namespace StellaOps.Doctor.Plugins.ServiceGraph;
|
||||
|
||||
/// <summary>
/// Doctor plugin that bundles the service dependency and connectivity checks.
/// </summary>
public sealed class ServiceGraphPlugin : IDoctorPlugin
{
    /// <inheritdoc />
    public string PluginId => "stellaops.doctor.servicegraph";

    /// <inheritdoc />
    public string DisplayName => "Service Graph";

    /// <inheritdoc />
    public DoctorCategory Category => DoctorCategory.ServiceGraph;

    /// <inheritdoc />
    public Version Version => new(1, 0, 0);

    /// <inheritdoc />
    public Version MinEngineVersion => new(1, 0, 0);

    /// <inheritdoc />
    /// <remarks>Returns a new instance of each check on every call; <paramref name="context"/> is not used.</remarks>
    public IReadOnlyList<IDoctorCheck> GetChecks(DoctorPluginContext context) =>
    [
        new BackendConnectivityCheck(),
        new ValkeyConnectivityCheck(),
        new MessageQueueCheck(),
        new ServiceEndpointsCheck(),
        new CircuitBreakerStatusCheck(),
        new ServiceTimeoutCheck()
    ];

    /// <inheritdoc />
    /// <remarks>The plugin has no prerequisites, so it is always reported as available.</remarks>
    public bool IsAvailable(IServiceProvider services) => true;

    /// <inheritdoc />
    /// <remarks>Nothing needs to be initialized; an already completed task is returned.</remarks>
    public Task InitializeAsync(DoctorPluginContext context, CancellationToken ct) => Task.CompletedTask;
}
|
||||
@@ -0,0 +1,20 @@
|
||||
<!-- Project file for the ServiceGraph doctor plugin library. -->
<Project Sdk="Microsoft.NET.Sdk">
|
||||
|
||||
<!-- Compiler settings: nullable reference types are enabled and every warning fails the build. -->
<PropertyGroup>
|
||||
<TargetFramework>net10.0</TargetFramework>
|
||||
<ImplicitUsings>enable</ImplicitUsings>
|
||||
<Nullable>enable</Nullable>
|
||||
<LangVersion>preview</LangVersion>
|
||||
<TreatWarningsAsErrors>true</TreatWarningsAsErrors>
|
||||
</PropertyGroup>
|
||||
|
||||
<!-- Project reference to the sibling StellaOps.Doctor project. -->
<ItemGroup>
|
||||
<ProjectReference Include="..\StellaOps.Doctor\StellaOps.Doctor.csproj" />
|
||||
</ItemGroup>
|
||||
|
||||
<!-- NOTE(review): no Version attributes here; presumably versions come from central package management. Confirm. -->
<ItemGroup>
|
||||
<PackageReference Include="Microsoft.Extensions.DependencyInjection.Abstractions" />
|
||||
<PackageReference Include="Microsoft.Extensions.Http" />
|
||||
</ItemGroup>
|
||||
|
||||
</Project>
|
||||
Reference in New Issue
Block a user