sln build fix (again), tests fixes, audit work and doctors work
This commit is contained in:
@@ -0,0 +1,124 @@
|
||||
using Microsoft.Extensions.Configuration;
|
||||
using StellaOps.Doctor.Models;
|
||||
using StellaOps.Doctor.Plugins;
|
||||
|
||||
namespace StellaOps.Doctor.Plugins.Observability.Checks;
|
||||
|
||||
/// <summary>
/// Doctor check that reports whether alerting is enabled and which notification
/// destinations (AlertManager, Slack, email, PagerDuty) are present in configuration.
/// </summary>
public sealed class AlertingConfigurationCheck : IDoctorCheck
{
    private const string EvidenceTitle = "Alerting configuration";

    /// <inheritdoc />
    public string CheckId { get; } = "check.observability.alerting";

    /// <inheritdoc />
    public string Name { get; } = "Alerting Configuration";

    /// <inheritdoc />
    public string Description { get; } = "Validates alerting rules and notification destinations";

    /// <inheritdoc />
    public DoctorSeverity DefaultSeverity { get; } = DoctorSeverity.Info;

    /// <inheritdoc />
    public IReadOnlyList<string> Tags => ["observability", "alerting", "notifications"];

    /// <inheritdoc />
    public TimeSpan EstimatedDuration { get; } = TimeSpan.FromMilliseconds(50);

    /// <inheritdoc />
    public bool CanRun(DoctorPluginContext context)
    {
        return true;
    }

    /// <inheritdoc />
    /// <remarks>
    /// Every setting is looked up under an <c>Alerting:*</c> key first and then under a
    /// fallback key. The outcome is decided in this order: explicitly disabled (info),
    /// no destination at all (info), suspicious email recipients (warn), otherwise pass.
    /// </remarks>
    public Task<DoctorCheckResult> RunAsync(DoctorPluginContext context, CancellationToken ct)
    {
        var result = context.CreateResult(CheckId, "stellaops.doctor.observability", DoctorCategory.Observability.ToString());
        var config = context.Configuration;

        // Reads a string setting from its primary key, falling back to the alternative key.
        string? ReadString(string primaryKey, string fallbackKey) =>
            config.GetValue<string>(primaryKey) ?? config.GetValue<string>(fallbackKey);

        var alertingEnabled = config.GetValue<bool?>("Alerting:Enabled");
        alertingEnabled ??= config.GetValue<bool?>("Notifications:Alerts:Enabled");

        var alertManagerUrl = ReadString("Alerting:AlertManagerUrl", "Prometheus:AlertManager:Url");
        var slackWebhook = ReadString("Alerting:Slack:WebhookUrl", "Notifications:Slack:WebhookUrl");

        var emailRecipients = config.GetSection("Alerting:Email:Recipients").Get<string[]>();
        emailRecipients ??= config.GetSection("Notifications:Email:Recipients").Get<string[]>();

        var pagerDutyKey = ReadString("Alerting:PagerDuty:RoutingKey", "Notifications:PagerDuty:IntegrationKey");

        var hasAlertManager = !string.IsNullOrWhiteSpace(alertManagerUrl);
        var hasSlack = !string.IsNullOrWhiteSpace(slackWebhook);
        var hasEmail = emailRecipients is { Length: > 0 };
        var hasPagerDuty = !string.IsNullOrWhiteSpace(pagerDutyKey);

        // An explicit "false" wins over everything else; an unset flag does not.
        if (alertingEnabled == false)
        {
            var disabled = result
                .Info("Alerting is explicitly disabled")
                .WithEvidence(EvidenceTitle, e =>
                {
                    e.Add("Enabled", "false");
                })
                .Build();

            return Task.FromResult(disabled);
        }

        if (!(hasAlertManager || hasSlack || hasEmail || hasPagerDuty))
        {
            var nothingConfigured = result
                .Info("No alerting destinations configured")
                .WithEvidence(EvidenceTitle, e =>
                {
                    e.Add("AlertManagerConfigured", "false");
                    e.Add("SlackConfigured", "false");
                    e.Add("EmailConfigured", "false");
                    e.Add("PagerDutyConfigured", "false");
                    e.Add("Recommendation", "Configure at least one alert destination for production");
                })
                .Build();

            return Task.FromResult(nothingConfigured);
        }

        var issues = new List<string>();

        // Only a minimal plausibility test: every recipient must contain an '@'.
        if (emailRecipients is { Length: > 0 } recipients && !recipients.All(r => r.Contains('@')))
        {
            issues.Add("Some email recipients appear to be invalid");
        }

        // Destination labels, kept in a fixed order so the joined evidence string is stable.
        var destinations = new (string Label, bool Configured)[]
            {
                ("AlertManager", hasAlertManager),
                ("Slack", hasSlack),
                ("Email", hasEmail),
                ("PagerDuty", hasPagerDuty),
            }
            .Where(d => d.Configured)
            .Select(d => d.Label)
            .ToList();

        var enabledText = alertingEnabled?.ToString() ?? "default";
        var destinationText = string.Join(", ", destinations);

        if (issues.Count > 0)
        {
            static string Presence(bool isSet) => isSet ? "configured" : "(not set)";

            var warning = result
                .Warn($"{issues.Count} alerting configuration issue(s)")
                .WithEvidence(EvidenceTitle, e =>
                {
                    e.Add("Enabled", enabledText);
                    e.Add("ConfiguredDestinations", destinationText);
                    e.Add("AlertManagerUrl", Presence(hasAlertManager));
                    e.Add("SlackWebhook", Presence(hasSlack));
                    e.Add("EmailRecipients", (emailRecipients?.Length ?? 0).ToString());
                    e.Add("PagerDuty", Presence(hasPagerDuty));
                })
                .WithCauses(issues.ToArray())
                .Build();

            return Task.FromResult(warning);
        }

        var passed = result
            .Pass($"Alerting configured with {destinations.Count} destination(s)")
            .WithEvidence(EvidenceTitle, e =>
            {
                e.Add("Enabled", enabledText);
                e.Add("ConfiguredDestinations", destinationText);
                e.Add("DestinationCount", destinations.Count.ToString());
            })
            .Build();

        return Task.FromResult(passed);
    }
}
|
||||
@@ -0,0 +1,134 @@
|
||||
using System.Globalization;
|
||||
using Microsoft.Extensions.Configuration;
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using StellaOps.Doctor.Models;
|
||||
using StellaOps.Doctor.Plugins;
|
||||
|
||||
namespace StellaOps.Doctor.Plugins.Observability.Checks;
|
||||
|
||||
/// <summary>
/// Doctor check that validates the configured health endpoints: it optionally probes the
/// main health URL on localhost, sanity-checks the configured timeout and recommends
/// separate readiness/liveness paths.
/// </summary>
public sealed class HealthCheckEndpointsCheck : IDoctorCheck
{
    /// <inheritdoc />
    public string CheckId => "check.observability.healthchecks";

    /// <inheritdoc />
    public string Name => "Health Check Endpoints";

    /// <inheritdoc />
    public string Description => "Validates health check endpoints are properly configured";

    /// <inheritdoc />
    public DoctorSeverity DefaultSeverity => DoctorSeverity.Warn;

    /// <inheritdoc />
    public IReadOnlyList<string> Tags => ["observability", "health", "kubernetes"];

    /// <inheritdoc />
    public TimeSpan EstimatedDuration => TimeSpan.FromSeconds(3);

    /// <inheritdoc />
    public bool CanRun(DoctorPluginContext context) => true;

    /// <inheritdoc />
    /// <remarks>
    /// Settings are read from <c>HealthChecks:*</c> first and <c>Health:*</c> second, with
    /// built-in defaults for the paths and the timeout. The HTTP probe only runs when a port
    /// is configured and an <see cref="IHttpClientFactory"/> is registered.
    /// </remarks>
    public async Task<DoctorCheckResult> RunAsync(DoctorPluginContext context, CancellationToken ct)
    {
        var result = context.CreateResult(CheckId, "stellaops.doctor.observability", DoctorCategory.Observability.ToString());

        var healthPath = context.Configuration.GetValue<string>("HealthChecks:Path")
            ?? context.Configuration.GetValue<string>("Health:Path")
            ?? "/health";

        var readinessPath = context.Configuration.GetValue<string>("HealthChecks:ReadinessPath")
            ?? context.Configuration.GetValue<string>("Health:ReadinessPath")
            ?? "/health/ready";

        var livenessPath = context.Configuration.GetValue<string>("HealthChecks:LivenessPath")
            ?? context.Configuration.GetValue<string>("Health:LivenessPath")
            ?? "/health/live";

        var healthPort = context.Configuration.GetValue<int?>("HealthChecks:Port")
            ?? context.Configuration.GetValue<int?>("Health:Port");

        var timeout = context.Configuration.GetValue<int?>("HealthChecks:Timeout")
            ?? context.Configuration.GetValue<int?>("Health:TimeoutSeconds")
            ?? 30;

        var issues = new List<string>();

        var httpClientFactory = context.Services.GetService<IHttpClientFactory>();
        if (httpClientFactory is not null && healthPort is { } port)
        {
            var probeIssue = await ProbeHealthEndpointAsync(httpClientFactory, port, healthPath, ct);
            if (probeIssue is not null)
            {
                issues.Add(probeIssue);
            }
        }

        if (timeout > 60)
        {
            issues.Add($"Health check timeout ({timeout}s) is very long");
        }
        else if (timeout < 1)
        {
            issues.Add($"Health check timeout ({timeout}s) is too short");
        }

        var separateReadiness = !readinessPath.Equals(healthPath, StringComparison.OrdinalIgnoreCase);
        var separateLiveness = !livenessPath.Equals(healthPath, StringComparison.OrdinalIgnoreCase);

        // Only flagged when BOTH probes share the main health path.
        if (!separateReadiness && !separateLiveness)
        {
            issues.Add("Consider separate readiness and liveness endpoints for Kubernetes");
        }

        if (issues.Count > 0)
        {
            return result
                .Warn($"{issues.Count} health check configuration issue(s)")
                .WithEvidence("Health check configuration", e =>
                {
                    e.Add("HealthPath", healthPath);
                    e.Add("ReadinessPath", readinessPath);
                    e.Add("LivenessPath", livenessPath);
                    e.Add("Port", healthPort?.ToString(CultureInfo.InvariantCulture) ?? "(default)");
                    e.Add("TimeoutSeconds", timeout.ToString(CultureInfo.InvariantCulture));
                    e.Add("SeparateReadiness", separateReadiness.ToString());
                    e.Add("SeparateLiveness", separateLiveness.ToString());
                })
                .WithCauses(issues.ToArray())
                .WithRemediation(r => r
                    .AddManualStep(1, "Configure endpoints", "Set separate /health/ready and /health/live endpoints")
                    .AddManualStep(2, "Set timeout", "Configure reasonable timeout (5-30 seconds)"))
                .WithVerification("stella doctor --check check.observability.healthchecks")
                .Build();
        }

        return result
            .Pass("Health check endpoints are properly configured")
            .WithEvidence("Health check configuration", e =>
            {
                e.Add("HealthPath", healthPath);
                e.Add("ReadinessPath", readinessPath);
                e.Add("LivenessPath", livenessPath);
                e.Add("Port", healthPort?.ToString(CultureInfo.InvariantCulture) ?? "(default)");
                e.Add("TimeoutSeconds", timeout.ToString(CultureInfo.InvariantCulture));
            })
            .Build();
    }

    /// <summary>
    /// Issues a GET against <c>http://localhost:{port}{path}</c> with a 5 second client timeout.
    /// </summary>
    /// <returns>A description of the problem, or <c>null</c> when the endpoint answered with a success status.</returns>
    /// <exception cref="OperationCanceledException">Only when <paramref name="ct"/> itself was cancelled.</exception>
    private static async Task<string?> ProbeHealthEndpointAsync(
        IHttpClientFactory httpClientFactory,
        int port,
        string path,
        CancellationToken ct)
    {
        try
        {
            using var client = httpClientFactory.CreateClient();
            client.Timeout = TimeSpan.FromSeconds(5);

            var healthUrl = $"http://localhost:{port}{path}";
            using var response = await client.GetAsync(healthUrl, ct);

            return response.IsSuccessStatusCode
                ? null
                : $"Health endpoint returned {(int)response.StatusCode}";
        }
        // HttpClient signals its own Timeout with a TaskCanceledException (an
        // OperationCanceledException). That is a probe failure, not a caller cancellation,
        // so it is only allowed to propagate when the caller's token was actually cancelled.
        catch (Exception ex) when (ex is not OperationCanceledException || !ct.IsCancellationRequested)
        {
            return $"Cannot reach health endpoint: {ex.Message}";
        }
    }
}
|
||||
@@ -0,0 +1,108 @@
|
||||
using Microsoft.Extensions.Configuration;
|
||||
using StellaOps.Doctor.Models;
|
||||
using StellaOps.Doctor.Plugins;
|
||||
|
||||
namespace StellaOps.Doctor.Plugins.Observability.Checks;
|
||||
|
||||
/// <summary>
/// Doctor check that inspects the configured log levels and whether structured logging
/// (Serilog, an explicit <c>Logging:Structured</c> flag, or a JSON console formatter) is in use.
/// </summary>
public sealed class LoggingConfigurationCheck : IDoctorCheck
{
    /// <inheritdoc />
    public string CheckId => "check.observability.logging";

    /// <inheritdoc />
    public string Name => "Logging Configuration";

    /// <inheritdoc />
    public string Description => "Validates structured logging configuration and levels";

    /// <inheritdoc />
    public DoctorSeverity DefaultSeverity => DoctorSeverity.Warn;

    /// <inheritdoc />
    public IReadOnlyList<string> Tags => ["observability", "logging"];

    /// <inheritdoc />
    public TimeSpan EstimatedDuration => TimeSpan.FromMilliseconds(50);

    /// <inheritdoc />
    public bool CanRun(DoctorPluginContext context) => true;

    /// <inheritdoc />
    /// <remarks>
    /// Log levels are read from <c>Logging:LogLevel:*</c> first and the matching
    /// <c>Serilog:MinimumLevel:*</c> key second, with defaults of Information/Warning.
    /// </remarks>
    public Task<DoctorCheckResult> RunAsync(DoctorPluginContext context, CancellationToken ct)
    {
        var result = context.CreateResult(CheckId, "stellaops.doctor.observability", DoctorCategory.Observability.ToString());
        var config = context.Configuration;

        var issues = new List<string>();

        var defaultLogLevel = config.GetValue<string>("Logging:LogLevel:Default")
            ?? config.GetValue<string>("Serilog:MinimumLevel:Default")
            ?? "Information";

        var microsoftLogLevel = config.GetValue<string>("Logging:LogLevel:Microsoft")
            ?? config.GetValue<string>("Serilog:MinimumLevel:Override:Microsoft")
            ?? "Warning";

        var aspNetCoreLogLevel = config.GetValue<string>("Logging:LogLevel:Microsoft.AspNetCore")
            ?? config.GetValue<string>("Serilog:MinimumLevel:Override:Microsoft.AspNetCore")
            ?? "Warning";

        var serilogConfigured = config.GetSection("Serilog").Exists();

        // An explicit flag wins; otherwise the presence of a Serilog section counts as structured.
        var structuredLogging = config.GetValue<bool?>("Logging:Structured") ?? serilogConfigured;

        // FormatterName is a string setting (e.g. "json"). It must be read as a string:
        // asking the binder for bool? makes it throw on any non-boolean value.
        var consoleFormatter = config.GetValue<string>("Logging:Console:FormatterName");
        var jsonConsole = consoleFormatter?.Contains("Json", StringComparison.OrdinalIgnoreCase) ?? false;

        if (defaultLogLevel.Equals("Debug", StringComparison.OrdinalIgnoreCase)
            || defaultLogLevel.Equals("Trace", StringComparison.OrdinalIgnoreCase))
        {
            issues.Add($"Default log level '{defaultLogLevel}' is very verbose - may impact performance in production");
        }

        if (!microsoftLogLevel.Equals("Warning", StringComparison.OrdinalIgnoreCase)
            && !microsoftLogLevel.Equals("Error", StringComparison.OrdinalIgnoreCase)
            && !microsoftLogLevel.Equals("Critical", StringComparison.OrdinalIgnoreCase)
            && !microsoftLogLevel.Equals("None", StringComparison.OrdinalIgnoreCase))
        {
            issues.Add($"Microsoft log level '{microsoftLogLevel}' may produce excessive framework logs");
        }

        // The warning text offers "Serilog or JSON formatter" as remedies, so either one
        // being present suppresses it.
        if (!structuredLogging && !serilogConfigured && !jsonConsole)
        {
            issues.Add("Structured logging not detected - consider using Serilog or JSON formatter");
        }

        if (issues.Count > 0)
        {
            return Task.FromResult(result
                .Warn($"{issues.Count} logging configuration issue(s)")
                .WithEvidence("Logging configuration", e =>
                {
                    e.Add("DefaultLogLevel", defaultLogLevel);
                    e.Add("MicrosoftLogLevel", microsoftLogLevel);
                    e.Add("AspNetCoreLogLevel", aspNetCoreLogLevel);
                    e.Add("StructuredLogging", structuredLogging.ToString());
                    e.Add("JsonConsoleFormatter", jsonConsole.ToString());
                })
                .WithCauses(issues.ToArray())
                .WithRemediation(r => r
                    .AddManualStep(1, "Set appropriate level", "Use 'Information' or 'Warning' for production")
                    .AddManualStep(2, "Enable structured logging", "Configure Serilog or JSON console formatter"))
                .WithVerification("stella doctor --check check.observability.logging")
                .Build());
        }

        return Task.FromResult(result
            .Pass("Logging is properly configured")
            .WithEvidence("Logging configuration", e =>
            {
                e.Add("DefaultLogLevel", defaultLogLevel);
                e.Add("MicrosoftLogLevel", microsoftLogLevel);
                e.Add("AspNetCoreLogLevel", aspNetCoreLogLevel);
                e.Add("StructuredLogging", structuredLogging.ToString());
                e.Add("JsonConsoleFormatter", jsonConsole.ToString());
            })
            .Build());
    }
}
|
||||
@@ -0,0 +1,136 @@
|
||||
using System.Globalization;
|
||||
using Microsoft.Extensions.Configuration;
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using StellaOps.Doctor.Models;
|
||||
using StellaOps.Doctor.Plugins;
|
||||
|
||||
namespace StellaOps.Doctor.Plugins.Observability.Checks;
|
||||
|
||||
/// <summary>
/// Doctor check that validates metrics collection settings and, when a port is configured,
/// probes the metrics endpoint on localhost.
/// </summary>
public sealed class MetricsCollectionCheck : IDoctorCheck
{
    /// <inheritdoc />
    public string CheckId => "check.observability.metrics";

    /// <inheritdoc />
    public string Name => "Metrics Collection";

    /// <inheritdoc />
    public string Description => "Validates metrics endpoints and collection configuration";

    /// <inheritdoc />
    public DoctorSeverity DefaultSeverity => DoctorSeverity.Warn;

    /// <inheritdoc />
    public IReadOnlyList<string> Tags => ["observability", "metrics", "prometheus"];

    /// <inheritdoc />
    public TimeSpan EstimatedDuration => TimeSpan.FromSeconds(3);

    /// <inheritdoc />
    public bool CanRun(DoctorPluginContext context) => true;

    /// <inheritdoc />
    /// <remarks>
    /// Outcome order: no enable flag configured at all (info), no flag set to <c>true</c> (info),
    /// endpoint probe problems (warn), otherwise pass.
    /// </remarks>
    public async Task<DoctorCheckResult> RunAsync(DoctorPluginContext context, CancellationToken ct)
    {
        var result = context.CreateResult(CheckId, "stellaops.doctor.observability", DoctorCategory.Observability.ToString());

        var metricsEnabled = context.Configuration.GetValue<bool?>("Metrics:Enabled")
            ?? context.Configuration.GetValue<bool?>("Telemetry:Metrics:Enabled")
            ?? context.Configuration.GetValue<bool?>("OpenTelemetry:Metrics:Enabled");

        var prometheusEnabled = context.Configuration.GetValue<bool?>("Metrics:Prometheus:Enabled")
            ?? context.Configuration.GetValue<bool?>("Prometheus:Enabled");

        var metricsPath = context.Configuration.GetValue<string>("Metrics:Path")
            ?? context.Configuration.GetValue<string>("Prometheus:Path")
            ?? "/metrics";

        var metricsPort = context.Configuration.GetValue<int?>("Metrics:Port")
            ?? context.Configuration.GetValue<int?>("Prometheus:Port");

        if (metricsEnabled is null && prometheusEnabled is null)
        {
            return result
                .Info("Metrics configuration not found")
                .WithEvidence("Metrics configuration", e =>
                {
                    e.Add("Configured", "false");
                    e.Add("Recommendation", "Configure Prometheus metrics or OpenTelemetry metrics");
                })
                .Build();
        }

        // Disabled means "nothing is switched on": this also covers one flag being false while
        // the other is unset, which would otherwise be reported as a configured, passing setup.
        if (metricsEnabled != true && prometheusEnabled != true)
        {
            // In this branch a flag is either false or unset, never true.
            static string DescribeOff(bool? flag) => flag.HasValue ? "false" : "(not set)";

            return result
                .Info("Metrics collection is disabled")
                .WithEvidence("Metrics configuration", e =>
                {
                    e.Add("MetricsEnabled", DescribeOff(metricsEnabled));
                    e.Add("PrometheusEnabled", DescribeOff(prometheusEnabled));
                    e.Add("Recommendation", "Enable metrics for production observability");
                })
                .Build();
        }

        var issues = new List<string>();

        if (metricsPort is { } port)
        {
            var httpClientFactory = context.Services.GetService<IHttpClientFactory>();
            if (httpClientFactory is not null)
            {
                var probeIssue = await ProbeMetricsEndpointAsync(httpClientFactory, port, metricsPath, ct);
                if (probeIssue is not null)
                {
                    issues.Add(probeIssue);
                }
            }
        }

        if (issues.Count > 0)
        {
            return result
                .Warn($"{issues.Count} metrics configuration issue(s)")
                .WithEvidence("Metrics configuration", e =>
                {
                    e.Add("MetricsEnabled", metricsEnabled?.ToString() ?? "(not set)");
                    e.Add("PrometheusEnabled", prometheusEnabled?.ToString() ?? "(not set)");
                    e.Add("MetricsPath", metricsPath);
                    e.Add("MetricsPort", metricsPort?.ToString(CultureInfo.InvariantCulture) ?? "(default)");
                })
                .WithCauses(issues.ToArray())
                .WithRemediation(r => r
                    .AddManualStep(1, "Enable metrics", "Configure Metrics:Enabled or Prometheus:Enabled")
                    .AddManualStep(2, "Check endpoint", $"curl http://localhost:{metricsPort ?? 80}{metricsPath}"))
                .WithVerification("stella doctor --check check.observability.metrics")
                .Build();
        }

        return result
            .Pass("Metrics collection is configured")
            .WithEvidence("Metrics configuration", e =>
            {
                e.Add("MetricsEnabled", metricsEnabled?.ToString() ?? "(not set)");
                e.Add("PrometheusEnabled", prometheusEnabled?.ToString() ?? "(not set)");
                e.Add("MetricsPath", metricsPath);
                e.Add("MetricsPort", metricsPort?.ToString(CultureInfo.InvariantCulture) ?? "(default)");
            })
            .Build();
    }

    /// <summary>
    /// Issues a GET against <c>http://localhost:{port}{path}</c> with a 3 second client timeout.
    /// </summary>
    /// <returns>A description of the problem, or <c>null</c> when the endpoint answered with a success status.</returns>
    /// <exception cref="OperationCanceledException">Only when <paramref name="ct"/> itself was cancelled.</exception>
    private static async Task<string?> ProbeMetricsEndpointAsync(
        IHttpClientFactory httpClientFactory,
        int port,
        string path,
        CancellationToken ct)
    {
        try
        {
            using var client = httpClientFactory.CreateClient();
            client.Timeout = TimeSpan.FromSeconds(3);

            var metricsUrl = $"http://localhost:{port}{path}";
            using var response = await client.GetAsync(metricsUrl, ct);

            return response.IsSuccessStatusCode
                ? null
                : $"Metrics endpoint returned {(int)response.StatusCode}";
        }
        // HttpClient reports its own Timeout as a TaskCanceledException; treat that as an
        // unreachable endpoint and only let genuine caller cancellation propagate.
        catch (Exception ex) when (ex is not OperationCanceledException || !ct.IsCancellationRequested)
        {
            return $"Cannot reach metrics endpoint: {ex.Message}";
        }
    }
}
|
||||
@@ -0,0 +1,144 @@
|
||||
using Microsoft.Extensions.Configuration;
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using StellaOps.Doctor.Models;
|
||||
using StellaOps.Doctor.Plugins;
|
||||
|
||||
namespace StellaOps.Doctor.Plugins.Observability.Checks;
|
||||
|
||||
/// <summary>
/// Doctor check that validates OpenTelemetry settings (endpoint, service name, tracing and
/// metrics flags, sampling ratio) and probes the configured collector endpoint.
/// </summary>
public sealed class OpenTelemetryCheck : IDoctorCheck
{
    /// <inheritdoc />
    public string CheckId => "check.observability.otel";

    /// <inheritdoc />
    public string Name => "OpenTelemetry Configuration";

    /// <inheritdoc />
    public string Description => "Validates OpenTelemetry tracing and metrics configuration";

    /// <inheritdoc />
    public DoctorSeverity DefaultSeverity => DoctorSeverity.Warn;

    /// <inheritdoc />
    public IReadOnlyList<string> Tags => ["observability", "opentelemetry", "tracing", "metrics"];

    /// <inheritdoc />
    public TimeSpan EstimatedDuration => TimeSpan.FromSeconds(3);

    /// <inheritdoc />
    public bool CanRun(DoctorPluginContext context) => true;

    /// <inheritdoc />
    /// <remarks>
    /// The endpoint and service name fall back from configuration keys to the
    /// <c>OTEL_EXPORTER_OTLP_ENDPOINT</c> / <c>OTEL_SERVICE_NAME</c> environment variables.
    /// Tracing and metrics default to enabled and the sampling ratio defaults to 1.0.
    /// A missing endpoint yields an informational result and skips all other validation.
    /// </remarks>
    public async Task<DoctorCheckResult> RunAsync(DoctorPluginContext context, CancellationToken ct)
    {
        var result = context.CreateResult(CheckId, "stellaops.doctor.observability", DoctorCategory.Observability.ToString());

        // Fully qualified so this block does not depend on a System.Globalization using directive.
        var invariant = System.Globalization.CultureInfo.InvariantCulture;

        var issues = new List<string>();

        var otelEndpoint = context.Configuration.GetValue<string>("OpenTelemetry:Endpoint")
            ?? context.Configuration.GetValue<string>("OTEL_EXPORTER_OTLP_ENDPOINT")
            ?? Environment.GetEnvironmentVariable("OTEL_EXPORTER_OTLP_ENDPOINT");

        var tracingEnabled = context.Configuration.GetValue<bool?>("OpenTelemetry:Tracing:Enabled")
            ?? context.Configuration.GetValue<bool?>("Telemetry:Tracing:Enabled")
            ?? true;

        var metricsEnabled = context.Configuration.GetValue<bool?>("OpenTelemetry:Metrics:Enabled")
            ?? context.Configuration.GetValue<bool?>("Telemetry:Metrics:Enabled")
            ?? true;

        var serviceName = context.Configuration.GetValue<string>("OpenTelemetry:ServiceName")
            ?? context.Configuration.GetValue<string>("OTEL_SERVICE_NAME")
            ?? Environment.GetEnvironmentVariable("OTEL_SERVICE_NAME");

        var samplingRatio = context.Configuration.GetValue<double?>("OpenTelemetry:Tracing:SamplingRatio")
            ?? context.Configuration.GetValue<double?>("Telemetry:Tracing:SamplingRatio")
            ?? 1.0;

        if (string.IsNullOrWhiteSpace(otelEndpoint))
        {
            return result
                .Info("OpenTelemetry endpoint not configured")
                .WithEvidence("OpenTelemetry configuration", e =>
                {
                    e.Add("Endpoint", "(not set)");
                    e.Add("Recommendation", "Configure OTEL_EXPORTER_OTLP_ENDPOINT for distributed tracing");
                })
                .Build();
        }

        if (string.IsNullOrWhiteSpace(serviceName))
        {
            issues.Add("Service name not configured - set OTEL_SERVICE_NAME or OpenTelemetry:ServiceName");
        }

        if (!tracingEnabled)
        {
            issues.Add("Tracing is disabled");
        }

        if (!metricsEnabled)
        {
            issues.Add("Metrics collection is disabled");
        }

        if (samplingRatio < 0.01)
        {
            // One decimal place: a whole-percent format would round every value in this
            // range to 0% or 1% and hide the actual setting.
            issues.Add($"Sampling ratio ({samplingRatio.ToString("P1", invariant)}) is very low - may miss important traces");
        }

        var httpClientFactory = context.Services.GetService<IHttpClientFactory>();
        if (httpClientFactory is not null)
        {
            var probeIssue = await ProbeCollectorAsync(httpClientFactory, otelEndpoint, ct);
            if (probeIssue is not null)
            {
                issues.Add(probeIssue);
            }
        }

        var samplingText = samplingRatio.ToString("P0", invariant);

        if (issues.Count > 0)
        {
            return result
                .Warn($"{issues.Count} OpenTelemetry configuration issue(s)")
                .WithEvidence("OpenTelemetry configuration", e =>
                {
                    e.Add("Endpoint", otelEndpoint);
                    e.Add("ServiceName", serviceName ?? "(not set)");
                    e.Add("TracingEnabled", tracingEnabled.ToString());
                    e.Add("MetricsEnabled", metricsEnabled.ToString());
                    e.Add("SamplingRatio", samplingText);
                })
                .WithCauses(issues.ToArray())
                .WithRemediation(r => r
                    .AddManualStep(1, "Set service name", "Configure OTEL_SERVICE_NAME environment variable")
                    .AddManualStep(2, "Verify endpoint", "Ensure OpenTelemetry collector is running"))
                .WithVerification("stella doctor --check check.observability.otel")
                .Build();
        }

        return result
            .Pass("OpenTelemetry is properly configured")
            .WithEvidence("OpenTelemetry configuration", e =>
            {
                e.Add("Endpoint", otelEndpoint);
                e.Add("ServiceName", serviceName ?? "(not set)");
                e.Add("TracingEnabled", tracingEnabled.ToString());
                e.Add("MetricsEnabled", metricsEnabled.ToString());
                e.Add("SamplingRatio", samplingText);
            })
            .Build();
    }

    /// <summary>
    /// Sends a GET to the root of the endpoint's scheme/host/port with a 5 second client timeout.
    /// Any HTTP response, regardless of status code, counts as reachable.
    /// </summary>
    /// <returns>A description of the problem, or <c>null</c> when a response was received.</returns>
    /// <exception cref="OperationCanceledException">Only when <paramref name="ct"/> itself was cancelled.</exception>
    private static async Task<string?> ProbeCollectorAsync(
        IHttpClientFactory httpClientFactory,
        string endpoint,
        CancellationToken ct)
    {
        if (!Uri.TryCreate(endpoint, UriKind.Absolute, out var uri))
        {
            return $"Cannot reach OTEL endpoint: '{endpoint}' is not a valid absolute URI";
        }

        try
        {
            using var client = httpClientFactory.CreateClient();
            client.Timeout = TimeSpan.FromSeconds(5);

            var healthUrl = $"{uri.Scheme}://{uri.Host}:{uri.Port}/";
            using var response = await client.GetAsync(healthUrl, ct);

            return null;
        }
        // HttpClient reports its own Timeout as a TaskCanceledException; that is an
        // unreachable collector, so only genuine caller cancellation may propagate.
        catch (Exception ex) when (ex is not OperationCanceledException || !ct.IsCancellationRequested)
        {
            return $"Cannot reach OTEL endpoint: {ex.Message}";
        }
    }
}
|
||||
@@ -0,0 +1,135 @@
|
||||
using System.Globalization;
|
||||
using Microsoft.Extensions.Configuration;
|
||||
using StellaOps.Doctor.Models;
|
||||
using StellaOps.Doctor.Plugins;
|
||||
|
||||
namespace StellaOps.Doctor.Plugins.Observability.Checks;
|
||||
|
||||
/// <summary>
/// Doctor check that reviews distributed tracing settings: the enable flag, propagator,
/// exporter, sampling ratio and the HTTP/database instrumentation switches.
/// </summary>
public sealed class TracingConfigurationCheck : IDoctorCheck
{
    private const string EvidenceTitle = "Tracing configuration";

    /// <inheritdoc />
    public string CheckId { get; } = "check.observability.tracing";

    /// <inheritdoc />
    public string Name { get; } = "Distributed Tracing";

    /// <inheritdoc />
    public string Description { get; } = "Validates distributed tracing and correlation configuration";

    /// <inheritdoc />
    public DoctorSeverity DefaultSeverity { get; } = DoctorSeverity.Warn;

    /// <inheritdoc />
    public IReadOnlyList<string> Tags => ["observability", "tracing", "correlation"];

    /// <inheritdoc />
    public TimeSpan EstimatedDuration { get; } = TimeSpan.FromMilliseconds(50);

    /// <inheritdoc />
    public bool CanRun(DoctorPluginContext context)
    {
        return true;
    }

    /// <inheritdoc />
    /// <remarks>
    /// All settings are read up front. An explicit <c>false</c> on the enable flag produces an
    /// informational result; otherwise sampling and instrumentation problems produce a warning,
    /// and the absence of problems produces a pass.
    /// </remarks>
    public Task<DoctorCheckResult> RunAsync(DoctorPluginContext context, CancellationToken ct)
    {
        var result = context.CreateResult(CheckId, "stellaops.doctor.observability", DoctorCategory.Observability.ToString());
        var config = context.Configuration;

        var tracingEnabled = config.GetValue<bool?>("Tracing:Enabled");
        tracingEnabled ??= config.GetValue<bool?>("Telemetry:Tracing:Enabled");
        tracingEnabled ??= config.GetValue<bool?>("OpenTelemetry:Tracing:Enabled");

        var propagator = config.GetValue<string>("Tracing:Propagator")
            ?? config.GetValue<string>("OpenTelemetry:Propagator")
            ?? "W3CTraceContext";

        var samplingRatio = config.GetValue<double?>("Tracing:SamplingRatio")
            ?? config.GetValue<double?>("OpenTelemetry:Tracing:SamplingRatio")
            ?? 1.0;

        var exporterType = config.GetValue<string>("Tracing:Exporter")
            ?? config.GetValue<string>("OpenTelemetry:Exporter")
            ?? "otlp";

        var maxAttributeLength = config.GetValue<int?>("Tracing:MaxAttributeLength") ?? 2048;

        // Both instrumentation switches count as on unless configured otherwise.
        var httpInstrumentation = config.GetValue<bool?>("Tracing:Instrumentation:Http") ?? true;
        var dbInstrumentation = config.GetValue<bool?>("Tracing:Instrumentation:Database") ?? true;

        if (tracingEnabled == false)
        {
            var disabled = result
                .Info("Distributed tracing is disabled")
                .WithEvidence(EvidenceTitle, e =>
                {
                    e.Add("Enabled", "false");
                    e.Add("Recommendation", "Enable tracing for debugging distributed systems");
                })
                .Build();

            return Task.FromResult(disabled);
        }

        var issues = new List<string>();

        // The three ranges are mutually exclusive, so a single switch covers them.
        switch (samplingRatio)
        {
            case <= 0:
                issues.Add("Sampling ratio is 0 - no traces will be collected");
                break;
            case < 0.01:
                issues.Add($"Sampling ratio ({samplingRatio:P1}) is very low - important traces may be missed");
                break;
            case > 1.0:
                issues.Add($"Sampling ratio ({samplingRatio}) is greater than 1.0 - should be between 0 and 1");
                break;
        }

        if (!httpInstrumentation)
        {
            issues.Add("HTTP instrumentation is disabled - HTTP calls won't be traced");
        }

        if (!dbInstrumentation)
        {
            issues.Add("Database instrumentation is disabled - DB queries won't be traced");
        }

        var enabledText = tracingEnabled?.ToString() ?? "default";
        var samplingText = samplingRatio.ToString("P1", CultureInfo.InvariantCulture);

        if (issues.Count == 0)
        {
            var passed = result
                .Pass("Distributed tracing is properly configured")
                .WithEvidence(EvidenceTitle, e =>
                {
                    e.Add("Enabled", enabledText);
                    e.Add("Propagator", propagator);
                    e.Add("SamplingRatio", samplingText);
                    e.Add("Exporter", exporterType);
                    e.Add("MaxAttributeLength", maxAttributeLength.ToString(CultureInfo.InvariantCulture));
                })
                .Build();

            return Task.FromResult(passed);
        }

        var warning = result
            .Warn($"{issues.Count} tracing configuration issue(s)")
            .WithEvidence(EvidenceTitle, e =>
            {
                e.Add("Enabled", enabledText);
                e.Add("Propagator", propagator);
                e.Add("SamplingRatio", samplingText);
                e.Add("Exporter", exporterType);
                e.Add("HttpInstrumentation", httpInstrumentation.ToString());
                e.Add("DatabaseInstrumentation", dbInstrumentation.ToString());
            })
            .WithCauses(issues.ToArray())
            .WithRemediation(r => r
                .AddManualStep(1, "Set sampling ratio", "Configure Tracing:SamplingRatio between 0.01 and 1.0")
                .AddManualStep(2, "Enable instrumentation", "Enable HTTP and database instrumentation"))
            .WithVerification("stella doctor --check check.observability.tracing")
            .Build();

        return Task.FromResult(warning);
    }
}
|
||||
@@ -0,0 +1,20 @@
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using Microsoft.Extensions.DependencyInjection.Extensions;
|
||||
using StellaOps.Doctor.Plugins;
|
||||
|
||||
namespace StellaOps.Doctor.Plugins.Observability.DependencyInjection;
|
||||
|
||||
/// <summary>
/// Dependency-injection helpers for the Doctor Observability plugin.
/// </summary>
public static class ObservabilityPluginExtensions
{
    /// <summary>
    /// Registers <see cref="ObservabilityPlugin"/> as a singleton <see cref="IDoctorPlugin"/>.
    /// The registration goes through <c>TryAddEnumerable</c>, so repeating the call does not
    /// add a second descriptor for the same implementation.
    /// </summary>
    /// <param name="services">The service collection to add the plugin to.</param>
    /// <returns>The same <paramref name="services"/> instance, for chaining.</returns>
    public static IServiceCollection AddDoctorObservabilityPlugin(this IServiceCollection services)
    {
        var pluginDescriptor = ServiceDescriptor.Singleton<IDoctorPlugin, ObservabilityPlugin>();
        services.TryAddEnumerable(pluginDescriptor);

        return services;
    }
}
|
||||
@@ -0,0 +1,43 @@
|
||||
using StellaOps.Doctor.Models;
|
||||
using StellaOps.Doctor.Plugins;
|
||||
using StellaOps.Doctor.Plugins.Observability.Checks;
|
||||
|
||||
namespace StellaOps.Doctor.Plugins.Observability;
|
||||
|
||||
/// <summary>
/// Doctor plugin that bundles the observability and telemetry checks.
/// </summary>
public sealed class ObservabilityPlugin : IDoctorPlugin
{
    /// <inheritdoc />
    public string PluginId { get; } = "stellaops.doctor.observability";

    /// <inheritdoc />
    public string DisplayName { get; } = "Observability";

    /// <inheritdoc />
    public DoctorCategory Category { get; } = DoctorCategory.Observability;

    /// <inheritdoc />
    public Version Version
    {
        get { return new Version(1, 0, 0); }
    }

    /// <inheritdoc />
    public Version MinEngineVersion
    {
        get { return new Version(1, 0, 0); }
    }

    /// <inheritdoc />
    public bool IsAvailable(IServiceProvider services)
    {
        // No prerequisites are checked; the plugin always reports itself as available.
        return true;
    }

    /// <inheritdoc />
    /// <remarks>Creates a fresh instance of each of the six checks on every call.</remarks>
    public IReadOnlyList<IDoctorCheck> GetChecks(DoctorPluginContext context)
    {
        return
        [
            new OpenTelemetryCheck(),
            new LoggingConfigurationCheck(),
            new MetricsCollectionCheck(),
            new TracingConfigurationCheck(),
            new HealthCheckEndpointsCheck(),
            new AlertingConfigurationCheck()
        ];
    }

    /// <inheritdoc />
    public Task InitializeAsync(DoctorPluginContext context, CancellationToken ct)
    {
        // Nothing to initialise; complete synchronously.
        return Task.CompletedTask;
    }
}
|
||||
@@ -0,0 +1,20 @@
|
||||
<Project Sdk="Microsoft.NET.Sdk">
|
||||
|
||||
<PropertyGroup>
|
||||
<TargetFramework>net10.0</TargetFramework>
|
||||
<ImplicitUsings>enable</ImplicitUsings>
|
||||
<Nullable>enable</Nullable>
|
||||
<LangVersion>preview</LangVersion>
|
||||
<TreatWarningsAsErrors>true</TreatWarningsAsErrors>
|
||||
</PropertyGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<ProjectReference Include="..\StellaOps.Doctor\StellaOps.Doctor.csproj" />
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<PackageReference Include="Microsoft.Extensions.DependencyInjection.Abstractions" />
|
||||
<PackageReference Include="Microsoft.Extensions.Http" />
|
||||
</ItemGroup>
|
||||
|
||||
</Project>
|
||||
Reference in New Issue
Block a user