Solution build fix (again), test fixes, audit work, and Doctor work

This commit is contained in:
master
2026-01-12 22:15:51 +02:00
parent 9873f80830
commit 9330c64349
812 changed files with 48051 additions and 3891 deletions

View File

@@ -0,0 +1,124 @@
using Microsoft.Extensions.Configuration;
using StellaOps.Doctor.Models;
using StellaOps.Doctor.Plugins;
namespace StellaOps.Doctor.Plugins.Observability.Checks;
/// <summary>
/// Validates alerting configuration.
/// </summary>
public sealed class AlertingConfigurationCheck : IDoctorCheck
{
    /// <inheritdoc />
    public string CheckId => "check.observability.alerting";

    /// <inheritdoc />
    public string Name => "Alerting Configuration";

    /// <inheritdoc />
    public string Description => "Validates alerting rules and notification destinations";

    /// <inheritdoc />
    public DoctorSeverity DefaultSeverity => DoctorSeverity.Info;

    /// <inheritdoc />
    public IReadOnlyList<string> Tags => ["observability", "alerting", "notifications"];

    /// <inheritdoc />
    public TimeSpan EstimatedDuration => TimeSpan.FromMilliseconds(50);

    /// <inheritdoc />
    public bool CanRun(DoctorPluginContext context) => true;

    /// <summary>
    /// Inspects the alerting settings and reports, in order of precedence: alerting explicitly
    /// disabled (info), no destination configured (info), suspicious e-mail recipients (warning),
    /// or a pass listing the configured destinations.
    /// </summary>
    /// <param name="context">Plugin context supplying configuration and the result builder.</param>
    /// <param name="ct">Cancellation token; not observed because the check only reads configuration.</param>
    /// <returns>An already-completed task holding the check result.</returns>
    public Task<DoctorCheckResult> RunAsync(DoctorPluginContext context, CancellationToken ct)
    {
        var report = context.CreateResult(CheckId, "stellaops.doctor.observability", DoctorCategory.Observability.ToString());
        var settings = context.Configuration;

        // Every setting is read from its "Alerting:*" key first and from an alternate key second.
        var enabledFlag = settings.GetValue<bool?>("Alerting:Enabled")
            ?? settings.GetValue<bool?>("Notifications:Alerts:Enabled");
        var alertManager = settings.GetValue<string>("Alerting:AlertManagerUrl")
            ?? settings.GetValue<string>("Prometheus:AlertManager:Url");
        var slack = settings.GetValue<string>("Alerting:Slack:WebhookUrl")
            ?? settings.GetValue<string>("Notifications:Slack:WebhookUrl");
        var recipients = settings.GetSection("Alerting:Email:Recipients").Get<string[]>()
            ?? settings.GetSection("Notifications:Email:Recipients").Get<string[]>();
        var pagerDuty = settings.GetValue<string>("Alerting:PagerDuty:RoutingKey")
            ?? settings.GetValue<string>("Notifications:PagerDuty:IntegrationKey");

        var hasAlertManager = !string.IsNullOrWhiteSpace(alertManager);
        var hasSlack = !string.IsNullOrWhiteSpace(slack);
        var hasEmail = recipients is { Length: > 0 };
        var hasPagerDuty = !string.IsNullOrWhiteSpace(pagerDuty);

        if (enabledFlag is false)
        {
            var disabled = report
                .Info("Alerting is explicitly disabled")
                .WithEvidence("Alerting configuration", evidence =>
                {
                    evidence.Add("Enabled", "false");
                })
                .Build();
            return Task.FromResult(disabled);
        }

        // One entry per configured destination; an empty list means nothing is configured.
        var targets = new List<string>();
        if (hasAlertManager)
        {
            targets.Add("AlertManager");
        }

        if (hasSlack)
        {
            targets.Add("Slack");
        }

        if (hasEmail)
        {
            targets.Add("Email");
        }

        if (hasPagerDuty)
        {
            targets.Add("PagerDuty");
        }

        if (targets.Count == 0)
        {
            var unconfigured = report
                .Info("No alerting destinations configured")
                .WithEvidence("Alerting configuration", evidence =>
                {
                    evidence.Add("AlertManagerConfigured", "false");
                    evidence.Add("SlackConfigured", "false");
                    evidence.Add("EmailConfigured", "false");
                    evidence.Add("PagerDutyConfigured", "false");
                    evidence.Add("Recommendation", "Configure at least one alert destination for production");
                })
                .Build();
            return Task.FromResult(unconfigured);
        }

        // The only validation applied to recipients is the presence of an '@' character.
        var problems = new List<string>();
        if (recipients is { Length: > 0 } && !recipients.All(address => address.Contains('@')))
        {
            problems.Add("Some email recipients appear to be invalid");
        }

        var enabledText = enabledFlag.HasValue ? enabledFlag.Value.ToString() : "default";
        var targetSummary = string.Join(", ", targets);

        if (problems.Count == 0)
        {
            var passed = report
                .Pass($"Alerting configured with {targets.Count} destination(s)")
                .WithEvidence("Alerting configuration", evidence =>
                {
                    evidence.Add("Enabled", enabledText);
                    evidence.Add("ConfiguredDestinations", targetSummary);
                    evidence.Add("DestinationCount", targets.Count.ToString());
                })
                .Build();
            return Task.FromResult(passed);
        }

        var warned = report
            .Warn($"{problems.Count} alerting configuration issue(s)")
            .WithEvidence("Alerting configuration", evidence =>
            {
                evidence.Add("Enabled", enabledText);
                evidence.Add("ConfiguredDestinations", targetSummary);
                evidence.Add("AlertManagerUrl", Presence(hasAlertManager));
                evidence.Add("SlackWebhook", Presence(hasSlack));
                evidence.Add("EmailRecipients", (recipients?.Length ?? 0).ToString());
                evidence.Add("PagerDuty", Presence(hasPagerDuty));
            })
            .WithCauses(problems.ToArray())
            .Build();
        return Task.FromResult(warned);

        // Evidence reports only whether a value is present, not the value itself.
        static string Presence(bool configured) => configured ? "configured" : "(not set)";
    }
}

View File

@@ -0,0 +1,134 @@
using System.Globalization;
using Microsoft.Extensions.Configuration;
using Microsoft.Extensions.DependencyInjection;
using StellaOps.Doctor.Models;
using StellaOps.Doctor.Plugins;
namespace StellaOps.Doctor.Plugins.Observability.Checks;
/// <summary>
/// Validates health check endpoint configuration.
/// </summary>
public sealed class HealthCheckEndpointsCheck : IDoctorCheck
{
    /// <inheritdoc />
    public string CheckId => "check.observability.healthchecks";

    /// <inheritdoc />
    public string Name => "Health Check Endpoints";

    /// <inheritdoc />
    public string Description => "Validates health check endpoints are properly configured";

    /// <inheritdoc />
    public DoctorSeverity DefaultSeverity => DoctorSeverity.Warn;

    /// <inheritdoc />
    public IReadOnlyList<string> Tags => ["observability", "health", "kubernetes"];

    /// <inheritdoc />
    public TimeSpan EstimatedDuration => TimeSpan.FromSeconds(3);

    /// <inheritdoc />
    public bool CanRun(DoctorPluginContext context) => true;

    /// <summary>
    /// Reads the health-check settings, optionally probes the local health endpoint, and warns
    /// when the probe fails, the configured timeout is outside 1-60 seconds, or both the
    /// readiness and liveness paths equal the main health path.
    /// </summary>
    /// <param name="context">Plugin context supplying configuration, services and the result builder.</param>
    /// <param name="ct">Cancellation token flowed into the HTTP probe.</param>
    /// <returns>A warning result listing the issues found, or a pass result.</returns>
    /// <exception cref="OperationCanceledException">
    /// Propagated from the probe only when <paramref name="ct"/> itself was cancelled.
    /// </exception>
    public async Task<DoctorCheckResult> RunAsync(DoctorPluginContext context, CancellationToken ct)
    {
        var result = context.CreateResult(CheckId, "stellaops.doctor.observability", DoctorCategory.Observability.ToString());

        // Each setting is read from "HealthChecks:*" first, then "Health:*", then a built-in default.
        var healthPath = context.Configuration.GetValue<string>("HealthChecks:Path")
            ?? context.Configuration.GetValue<string>("Health:Path")
            ?? "/health";
        var readinessPath = context.Configuration.GetValue<string>("HealthChecks:ReadinessPath")
            ?? context.Configuration.GetValue<string>("Health:ReadinessPath")
            ?? "/health/ready";
        var livenessPath = context.Configuration.GetValue<string>("HealthChecks:LivenessPath")
            ?? context.Configuration.GetValue<string>("Health:LivenessPath")
            ?? "/health/live";
        var healthPort = context.Configuration.GetValue<int?>("HealthChecks:Port")
            ?? context.Configuration.GetValue<int?>("Health:Port");
        var timeout = context.Configuration.GetValue<int?>("HealthChecks:Timeout")
            ?? context.Configuration.GetValue<int?>("Health:TimeoutSeconds")
            ?? 30;

        var issues = new List<string>();

        // The live probe runs only when an HTTP client factory is registered and a port is configured.
        var httpClientFactory = context.Services.GetService<IHttpClientFactory>();
        if (httpClientFactory is not null && healthPort.HasValue)
        {
            try
            {
                using var client = httpClientFactory.CreateClient();
                client.Timeout = TimeSpan.FromSeconds(5);
                var healthUrl = $"http://localhost:{healthPort.Value}{healthPath}";
                using var response = await client.GetAsync(healthUrl, ct);
                if (!response.IsSuccessStatusCode)
                {
                    issues.Add($"Health endpoint returned {(int)response.StatusCode}");
                }
            }
            // HttpClient signals its own Timeout with TaskCanceledException, which derives from
            // OperationCanceledException. Treat that as an unreachable endpoint and let
            // cancellation escape only when the caller's token actually requested it.
            catch (Exception ex) when (ex is not OperationCanceledException || !ct.IsCancellationRequested)
            {
                issues.Add($"Cannot reach health endpoint: {ex.Message}");
            }
        }

        if (timeout > 60)
        {
            issues.Add($"Health check timeout ({timeout}s) is very long");
        }
        else if (timeout < 1)
        {
            issues.Add($"Health check timeout ({timeout}s) is too short");
        }

        var separateReadiness = !readinessPath.Equals(healthPath, StringComparison.OrdinalIgnoreCase);
        var separateLiveness = !livenessPath.Equals(healthPath, StringComparison.OrdinalIgnoreCase);

        // Flagged only when BOTH probes share the main health path.
        if (!separateReadiness && !separateLiveness)
        {
            issues.Add("Consider separate readiness and liveness endpoints for Kubernetes");
        }

        if (issues.Count > 0)
        {
            return result
                .Warn($"{issues.Count} health check configuration issue(s)")
                .WithEvidence("Health check configuration", e =>
                {
                    e.Add("HealthPath", healthPath);
                    e.Add("ReadinessPath", readinessPath);
                    e.Add("LivenessPath", livenessPath);
                    e.Add("Port", healthPort?.ToString(CultureInfo.InvariantCulture) ?? "(default)");
                    e.Add("TimeoutSeconds", timeout.ToString(CultureInfo.InvariantCulture));
                    e.Add("SeparateReadiness", separateReadiness.ToString());
                    e.Add("SeparateLiveness", separateLiveness.ToString());
                })
                .WithCauses(issues.ToArray())
                .WithRemediation(r => r
                    .AddManualStep(1, "Configure endpoints", "Set separate /health/ready and /health/live endpoints")
                    .AddManualStep(2, "Set timeout", "Configure reasonable timeout (5-30 seconds)"))
                .WithVerification("stella doctor --check check.observability.healthchecks")
                .Build();
        }

        return result
            .Pass("Health check endpoints are properly configured")
            .WithEvidence("Health check configuration", e =>
            {
                e.Add("HealthPath", healthPath);
                e.Add("ReadinessPath", readinessPath);
                e.Add("LivenessPath", livenessPath);
                e.Add("Port", healthPort?.ToString(CultureInfo.InvariantCulture) ?? "(default)");
                e.Add("TimeoutSeconds", timeout.ToString(CultureInfo.InvariantCulture));
            })
            .Build();
    }
}

View File

@@ -0,0 +1,108 @@
using Microsoft.Extensions.Configuration;
using StellaOps.Doctor.Models;
using StellaOps.Doctor.Plugins;
namespace StellaOps.Doctor.Plugins.Observability.Checks;
/// <summary>
/// Validates logging configuration.
/// </summary>
public sealed class LoggingConfigurationCheck : IDoctorCheck
{
    // Level names considered too chatty for a default level. "Verbose" is the Serilog
    // counterpart of Microsoft.Extensions.Logging's "Trace".
    private static readonly HashSet<string> VerboseLevels =
        new(StringComparer.OrdinalIgnoreCase) { "Trace", "Debug", "Verbose" };

    // Level names accepted for the "Microsoft" category. "Fatal" is the Serilog
    // counterpart of Microsoft.Extensions.Logging's "Critical".
    private static readonly HashSet<string> QuietFrameworkLevels =
        new(StringComparer.OrdinalIgnoreCase) { "Warning", "Error", "Critical", "Fatal", "None" };

    /// <inheritdoc />
    public string CheckId => "check.observability.logging";

    /// <inheritdoc />
    public string Name => "Logging Configuration";

    /// <inheritdoc />
    public string Description => "Validates structured logging configuration and levels";

    /// <inheritdoc />
    public DoctorSeverity DefaultSeverity => DoctorSeverity.Warn;

    /// <inheritdoc />
    public IReadOnlyList<string> Tags => ["observability", "logging"];

    /// <inheritdoc />
    public TimeSpan EstimatedDuration => TimeSpan.FromMilliseconds(50);

    /// <inheritdoc />
    public bool CanRun(DoctorPluginContext context) => true;

    /// <summary>
    /// Reads log levels and structured-logging settings from configuration and warns when the
    /// default level is very verbose, the "Microsoft" category is below Warning, or no form of
    /// structured logging (explicit flag, Serilog section, or JSON console formatter) is found.
    /// </summary>
    /// <param name="context">Plugin context supplying configuration and the result builder.</param>
    /// <param name="ct">Cancellation token; not observed because the check only reads configuration.</param>
    /// <returns>An already-completed task holding a warning or pass result.</returns>
    public Task<DoctorCheckResult> RunAsync(DoctorPluginContext context, CancellationToken ct)
    {
        var result = context.CreateResult(CheckId, "stellaops.doctor.observability", DoctorCategory.Observability.ToString());
        var issues = new List<string>();

        // Levels are read from the "Logging:LogLevel" keys first and the Serilog keys second.
        var defaultLogLevel = context.Configuration.GetValue<string>("Logging:LogLevel:Default")
            ?? context.Configuration.GetValue<string>("Serilog:MinimumLevel:Default")
            ?? "Information";
        var microsoftLogLevel = context.Configuration.GetValue<string>("Logging:LogLevel:Microsoft")
            ?? context.Configuration.GetValue<string>("Serilog:MinimumLevel:Override:Microsoft")
            ?? "Warning";
        var aspNetCoreLogLevel = context.Configuration.GetValue<string>("Logging:LogLevel:Microsoft.AspNetCore")
            ?? context.Configuration.GetValue<string>("Serilog:MinimumLevel:Override:Microsoft.AspNetCore")
            ?? "Warning";

        var serilogConfigured = context.Configuration.GetSection("Serilog").Exists();
        var structuredLogging = context.Configuration.GetValue<bool?>("Logging:Structured")
            ?? serilogConfigured;

        // FormatterName holds a formatter name, so it must be read as a string; binding it as
        // bool? throws for any real value such as "json".
        var consoleFormatter = context.Configuration.GetValue<string>("Logging:Console:FormatterName");
        var jsonConsole = consoleFormatter?.Contains("Json", StringComparison.OrdinalIgnoreCase) ?? false;

        if (VerboseLevels.Contains(defaultLogLevel))
        {
            issues.Add($"Default log level '{defaultLogLevel}' is very verbose - may impact performance in production");
        }

        if (!QuietFrameworkLevels.Contains(microsoftLogLevel))
        {
            issues.Add($"Microsoft log level '{microsoftLogLevel}' may produce excessive framework logs");
        }

        // A JSON console formatter counts as structured logging, matching the advice in the message.
        if (!structuredLogging && !serilogConfigured && !jsonConsole)
        {
            issues.Add("Structured logging not detected - consider using Serilog or JSON formatter");
        }

        if (issues.Count > 0)
        {
            return Task.FromResult(result
                .Warn($"{issues.Count} logging configuration issue(s)")
                .WithEvidence("Logging configuration", e =>
                {
                    e.Add("DefaultLogLevel", defaultLogLevel);
                    e.Add("MicrosoftLogLevel", microsoftLogLevel);
                    e.Add("AspNetCoreLogLevel", aspNetCoreLogLevel);
                    e.Add("StructuredLogging", structuredLogging.ToString());
                    e.Add("JsonConsoleFormatter", jsonConsole.ToString());
                })
                .WithCauses(issues.ToArray())
                .WithRemediation(r => r
                    .AddManualStep(1, "Set appropriate level", "Use 'Information' or 'Warning' for production")
                    .AddManualStep(2, "Enable structured logging", "Configure Serilog or JSON console formatter"))
                .WithVerification("stella doctor --check check.observability.logging")
                .Build());
        }

        return Task.FromResult(result
            .Pass("Logging is properly configured")
            .WithEvidence("Logging configuration", e =>
            {
                e.Add("DefaultLogLevel", defaultLogLevel);
                e.Add("MicrosoftLogLevel", microsoftLogLevel);
                e.Add("AspNetCoreLogLevel", aspNetCoreLogLevel);
                e.Add("StructuredLogging", structuredLogging.ToString());
                e.Add("JsonConsoleFormatter", jsonConsole.ToString());
            })
            .Build());
    }
}

View File

@@ -0,0 +1,136 @@
using System.Globalization;
using Microsoft.Extensions.Configuration;
using Microsoft.Extensions.DependencyInjection;
using StellaOps.Doctor.Models;
using StellaOps.Doctor.Plugins;
namespace StellaOps.Doctor.Plugins.Observability.Checks;
/// <summary>
/// Validates metrics collection configuration.
/// </summary>
public sealed class MetricsCollectionCheck : IDoctorCheck
{
    /// <inheritdoc />
    public string CheckId => "check.observability.metrics";

    /// <inheritdoc />
    public string Name => "Metrics Collection";

    /// <inheritdoc />
    public string Description => "Validates metrics endpoints and collection configuration";

    /// <inheritdoc />
    public DoctorSeverity DefaultSeverity => DoctorSeverity.Warn;

    /// <inheritdoc />
    public IReadOnlyList<string> Tags => ["observability", "metrics", "prometheus"];

    /// <inheritdoc />
    public TimeSpan EstimatedDuration => TimeSpan.FromSeconds(3);

    /// <inheritdoc />
    public bool CanRun(DoctorPluginContext context) => true;

    /// <summary>
    /// Reads the metrics settings and reports: info when no enable flag is configured, info when
    /// no enable flag is <c>true</c>, a warning when the local metrics endpoint probe fails, or
    /// a pass otherwise.
    /// </summary>
    /// <param name="context">Plugin context supplying configuration, services and the result builder.</param>
    /// <param name="ct">Cancellation token flowed into the HTTP probe.</param>
    /// <returns>The check result.</returns>
    /// <exception cref="OperationCanceledException">
    /// Propagated from the probe only when <paramref name="ct"/> itself was cancelled.
    /// </exception>
    public async Task<DoctorCheckResult> RunAsync(DoctorPluginContext context, CancellationToken ct)
    {
        var result = context.CreateResult(CheckId, "stellaops.doctor.observability", DoctorCategory.Observability.ToString());

        var metricsEnabled = context.Configuration.GetValue<bool?>("Metrics:Enabled")
            ?? context.Configuration.GetValue<bool?>("Telemetry:Metrics:Enabled")
            ?? context.Configuration.GetValue<bool?>("OpenTelemetry:Metrics:Enabled");
        var prometheusEnabled = context.Configuration.GetValue<bool?>("Metrics:Prometheus:Enabled")
            ?? context.Configuration.GetValue<bool?>("Prometheus:Enabled");
        var metricsPath = context.Configuration.GetValue<string>("Metrics:Path")
            ?? context.Configuration.GetValue<string>("Prometheus:Path")
            ?? "/metrics";
        var metricsPort = context.Configuration.GetValue<int?>("Metrics:Port")
            ?? context.Configuration.GetValue<int?>("Prometheus:Port");

        if (metricsEnabled is null && prometheusEnabled is null)
        {
            return result
                .Info("Metrics configuration not found")
                .WithEvidence("Metrics configuration", e =>
                {
                    e.Add("Configured", "false");
                    e.Add("Recommendation", "Configure Prometheus metrics or OpenTelemetry metrics");
                })
                .Build();
        }

        // Disabled means "no flag is true". This also covers one flag being false while the
        // other is unset, which must not be reported as a configured, passing setup.
        if (metricsEnabled != true && prometheusEnabled != true)
        {
            return result
                .Info("Metrics collection is disabled")
                .WithEvidence("Metrics configuration", e =>
                {
                    // In this branch each flag is either unset or false.
                    e.Add("MetricsEnabled", metricsEnabled is null ? "(not set)" : "false");
                    e.Add("PrometheusEnabled", prometheusEnabled is null ? "(not set)" : "false");
                    e.Add("Recommendation", "Enable metrics for production observability");
                })
                .Build();
        }

        var issues = new List<string>();

        // The live probe runs only when a port is configured and an HTTP client factory is registered.
        if (metricsPort.HasValue)
        {
            var httpClientFactory = context.Services.GetService<IHttpClientFactory>();
            if (httpClientFactory is not null)
            {
                try
                {
                    using var client = httpClientFactory.CreateClient();
                    client.Timeout = TimeSpan.FromSeconds(3);
                    var metricsUrl = $"http://localhost:{metricsPort.Value}{metricsPath}";
                    using var response = await client.GetAsync(metricsUrl, ct);
                    if (!response.IsSuccessStatusCode)
                    {
                        issues.Add($"Metrics endpoint returned {(int)response.StatusCode}");
                    }
                }
                // HttpClient signals its own Timeout with TaskCanceledException, which derives
                // from OperationCanceledException. Report that as an unreachable endpoint and let
                // cancellation escape only when the caller's token actually requested it.
                catch (Exception ex) when (ex is not OperationCanceledException || !ct.IsCancellationRequested)
                {
                    issues.Add($"Cannot reach metrics endpoint: {ex.Message}");
                }
            }
        }

        if (issues.Count > 0)
        {
            return result
                .Warn($"{issues.Count} metrics configuration issue(s)")
                .WithEvidence("Metrics configuration", e =>
                {
                    e.Add("MetricsEnabled", metricsEnabled?.ToString() ?? "(not set)");
                    e.Add("PrometheusEnabled", prometheusEnabled?.ToString() ?? "(not set)");
                    e.Add("MetricsPath", metricsPath);
                    e.Add("MetricsPort", metricsPort?.ToString(CultureInfo.InvariantCulture) ?? "(default)");
                })
                .WithCauses(issues.ToArray())
                .WithRemediation(r => r
                    .AddManualStep(1, "Enable metrics", "Configure Metrics:Enabled or Prometheus:Enabled")
                    .AddManualStep(2, "Check endpoint", $"curl http://localhost:{metricsPort ?? 80}{metricsPath}"))
                .WithVerification("stella doctor --check check.observability.metrics")
                .Build();
        }

        return result
            .Pass("Metrics collection is configured")
            .WithEvidence("Metrics configuration", e =>
            {
                e.Add("MetricsEnabled", metricsEnabled?.ToString() ?? "(not set)");
                e.Add("PrometheusEnabled", prometheusEnabled?.ToString() ?? "(not set)");
                e.Add("MetricsPath", metricsPath);
                e.Add("MetricsPort", metricsPort?.ToString(CultureInfo.InvariantCulture) ?? "(default)");
            })
            .Build();
    }
}

View File

@@ -0,0 +1,144 @@
using Microsoft.Extensions.Configuration;
using Microsoft.Extensions.DependencyInjection;
using StellaOps.Doctor.Models;
using StellaOps.Doctor.Plugins;
namespace StellaOps.Doctor.Plugins.Observability.Checks;
/// <summary>
/// Validates OpenTelemetry configuration.
/// </summary>
public sealed class OpenTelemetryCheck : IDoctorCheck
{
    /// <inheritdoc />
    public string CheckId => "check.observability.otel";

    /// <inheritdoc />
    public string Name => "OpenTelemetry Configuration";

    /// <inheritdoc />
    public string Description => "Validates OpenTelemetry tracing and metrics configuration";

    /// <inheritdoc />
    public DoctorSeverity DefaultSeverity => DoctorSeverity.Warn;

    /// <inheritdoc />
    public IReadOnlyList<string> Tags => ["observability", "opentelemetry", "tracing", "metrics"];

    /// <inheritdoc />
    public TimeSpan EstimatedDuration => TimeSpan.FromSeconds(3);

    /// <inheritdoc />
    public bool CanRun(DoctorPluginContext context) => true;

    /// <summary>
    /// Reads the OpenTelemetry settings (configuration first, then environment variables for the
    /// endpoint and service name), validates them, and probes the endpoint's host over HTTP when
    /// an <see cref="IHttpClientFactory"/> is registered.
    /// </summary>
    /// <param name="context">Plugin context supplying configuration, services and the result builder.</param>
    /// <param name="ct">Cancellation token flowed into the HTTP probe.</param>
    /// <returns>
    /// Info when no endpoint is configured, a warning listing the issues found, or a pass.
    /// </returns>
    /// <exception cref="OperationCanceledException">
    /// Propagated from the probe only when <paramref name="ct"/> itself was cancelled.
    /// </exception>
    public async Task<DoctorCheckResult> RunAsync(DoctorPluginContext context, CancellationToken ct)
    {
        var result = context.CreateResult(CheckId, "stellaops.doctor.observability", DoctorCategory.Observability.ToString());
        var issues = new List<string>();

        var otelEndpoint = context.Configuration.GetValue<string>("OpenTelemetry:Endpoint")
            ?? context.Configuration.GetValue<string>("OTEL_EXPORTER_OTLP_ENDPOINT")
            ?? Environment.GetEnvironmentVariable("OTEL_EXPORTER_OTLP_ENDPOINT");
        var tracingEnabled = context.Configuration.GetValue<bool?>("OpenTelemetry:Tracing:Enabled")
            ?? context.Configuration.GetValue<bool?>("Telemetry:Tracing:Enabled")
            ?? true;
        var metricsEnabled = context.Configuration.GetValue<bool?>("OpenTelemetry:Metrics:Enabled")
            ?? context.Configuration.GetValue<bool?>("Telemetry:Metrics:Enabled")
            ?? true;
        var serviceName = context.Configuration.GetValue<string>("OpenTelemetry:ServiceName")
            ?? context.Configuration.GetValue<string>("OTEL_SERVICE_NAME")
            ?? Environment.GetEnvironmentVariable("OTEL_SERVICE_NAME");
        var samplingRatio = context.Configuration.GetValue<double?>("OpenTelemetry:Tracing:SamplingRatio")
            ?? context.Configuration.GetValue<double?>("Telemetry:Tracing:SamplingRatio")
            ?? 1.0;

        if (string.IsNullOrWhiteSpace(otelEndpoint))
        {
            return result
                .Info("OpenTelemetry endpoint not configured")
                .WithEvidence("OpenTelemetry configuration", e =>
                {
                    e.Add("Endpoint", "(not set)");
                    e.Add("Recommendation", "Configure OTEL_EXPORTER_OTLP_ENDPOINT for distributed tracing");
                })
                .Build();
        }

        // Format once with the invariant culture so cause text and evidence agree on every host.
        var samplingPercent = samplingRatio.ToString("P0", CultureInfo.InvariantCulture);

        if (string.IsNullOrWhiteSpace(serviceName))
        {
            issues.Add("Service name not configured - set OTEL_SERVICE_NAME or OpenTelemetry:ServiceName");
        }

        if (!tracingEnabled)
        {
            issues.Add("Tracing is disabled");
        }

        if (!metricsEnabled)
        {
            issues.Add("Metrics collection is disabled");
        }

        if (samplingRatio < 0.01)
        {
            issues.Add($"Sampling ratio ({samplingPercent}) is very low - may miss important traces");
        }

        // Validate the endpoint explicitly so a malformed value is reported as a configuration
        // problem instead of a misleading "cannot reach" failure.
        if (!Uri.TryCreate(otelEndpoint, UriKind.Absolute, out var endpointUri))
        {
            issues.Add($"OTEL endpoint '{otelEndpoint}' is not a valid absolute URI");
        }
        else if (endpointUri.Scheme != Uri.UriSchemeHttp && endpointUri.Scheme != Uri.UriSchemeHttps)
        {
            issues.Add($"OTEL endpoint scheme '{endpointUri.Scheme}' is not supported - expected http or https");
        }
        else
        {
            var httpClientFactory = context.Services.GetService<IHttpClientFactory>();
            if (httpClientFactory is not null)
            {
                try
                {
                    using var client = httpClientFactory.CreateClient();
                    client.Timeout = TimeSpan.FromSeconds(5);

                    // Only reachability matters: the root of the host is requested and the
                    // response status is deliberately ignored.
                    var healthUrl = $"{endpointUri.Scheme}://{endpointUri.Host}:{endpointUri.Port}/";
                    using var response = await client.GetAsync(healthUrl, ct);
                }
                // HttpClient signals its own Timeout with TaskCanceledException, which derives
                // from OperationCanceledException. Report that as an unreachable endpoint and let
                // cancellation escape only when the caller's token actually requested it.
                catch (Exception ex) when (ex is not OperationCanceledException || !ct.IsCancellationRequested)
                {
                    issues.Add($"Cannot reach OTEL endpoint: {ex.Message}");
                }
            }
        }

        if (issues.Count > 0)
        {
            return result
                .Warn($"{issues.Count} OpenTelemetry configuration issue(s)")
                .WithEvidence("OpenTelemetry configuration", e =>
                {
                    e.Add("Endpoint", otelEndpoint);
                    e.Add("ServiceName", serviceName ?? "(not set)");
                    e.Add("TracingEnabled", tracingEnabled.ToString());
                    e.Add("MetricsEnabled", metricsEnabled.ToString());
                    e.Add("SamplingRatio", samplingPercent);
                })
                .WithCauses(issues.ToArray())
                .WithRemediation(r => r
                    .AddManualStep(1, "Set service name", "Configure OTEL_SERVICE_NAME environment variable")
                    .AddManualStep(2, "Verify endpoint", "Ensure OpenTelemetry collector is running"))
                .WithVerification("stella doctor --check check.observability.otel")
                .Build();
        }

        return result
            .Pass("OpenTelemetry is properly configured")
            .WithEvidence("OpenTelemetry configuration", e =>
            {
                e.Add("Endpoint", otelEndpoint);
                e.Add("ServiceName", serviceName ?? "(not set)");
                e.Add("TracingEnabled", tracingEnabled.ToString());
                e.Add("MetricsEnabled", metricsEnabled.ToString());
                e.Add("SamplingRatio", samplingPercent);
            })
            .Build();
    }
}

View File

@@ -0,0 +1,135 @@
using System.Globalization;
using Microsoft.Extensions.Configuration;
using StellaOps.Doctor.Models;
using StellaOps.Doctor.Plugins;
namespace StellaOps.Doctor.Plugins.Observability.Checks;
/// <summary>
/// Validates distributed tracing configuration.
/// </summary>
public sealed class TracingConfigurationCheck : IDoctorCheck
{
    /// <inheritdoc />
    public string CheckId => "check.observability.tracing";

    /// <inheritdoc />
    public string Name => "Distributed Tracing";

    /// <inheritdoc />
    public string Description => "Validates distributed tracing and correlation configuration";

    /// <inheritdoc />
    public DoctorSeverity DefaultSeverity => DoctorSeverity.Warn;

    /// <inheritdoc />
    public IReadOnlyList<string> Tags => ["observability", "tracing", "correlation"];

    /// <inheritdoc />
    public TimeSpan EstimatedDuration => TimeSpan.FromMilliseconds(50);

    /// <inheritdoc />
    public bool CanRun(DoctorPluginContext context) => true;

    /// <summary>
    /// Reads the tracing settings and reports info when tracing is explicitly disabled, a
    /// warning when the sampling ratio is out of range or very low or when HTTP/database
    /// instrumentation is switched off, and a pass otherwise.
    /// </summary>
    /// <param name="context">Plugin context supplying configuration and the result builder.</param>
    /// <param name="ct">Cancellation token; not observed because the check only reads configuration.</param>
    /// <returns>An already-completed task holding the check result.</returns>
    public Task<DoctorCheckResult> RunAsync(DoctorPluginContext context, CancellationToken ct)
    {
        var result = context.CreateResult(CheckId, "stellaops.doctor.observability", DoctorCategory.Observability.ToString());

        var tracingEnabled = context.Configuration.GetValue<bool?>("Tracing:Enabled")
            ?? context.Configuration.GetValue<bool?>("Telemetry:Tracing:Enabled")
            ?? context.Configuration.GetValue<bool?>("OpenTelemetry:Tracing:Enabled");
        var propagator = context.Configuration.GetValue<string>("Tracing:Propagator")
            ?? context.Configuration.GetValue<string>("OpenTelemetry:Propagator")
            ?? "W3CTraceContext";
        var samplingRatio = context.Configuration.GetValue<double?>("Tracing:SamplingRatio")
            ?? context.Configuration.GetValue<double?>("OpenTelemetry:Tracing:SamplingRatio")
            ?? 1.0;
        var exporterType = context.Configuration.GetValue<string>("Tracing:Exporter")
            ?? context.Configuration.GetValue<string>("OpenTelemetry:Exporter")
            ?? "otlp";
        var maxAttributeLength = context.Configuration.GetValue<int?>("Tracing:MaxAttributeLength")
            ?? 2048;
        var httpInstrumentation = context.Configuration.GetValue<bool?>("Tracing:Instrumentation:Http")
            ?? true;
        var dbInstrumentation = context.Configuration.GetValue<bool?>("Tracing:Instrumentation:Database")
            ?? true;

        // Only an explicit "false" short-circuits; an unset flag continues with validation.
        if (tracingEnabled == false)
        {
            return Task.FromResult(result
                .Info("Distributed tracing is disabled")
                .WithEvidence("Tracing configuration", e =>
                {
                    e.Add("Enabled", "false");
                    e.Add("Recommendation", "Enable tracing for debugging distributed systems");
                })
                .Build());
        }

        // Format once with the invariant culture so cause text and evidence agree on every host.
        var ratioPercent = samplingRatio.ToString("P1", CultureInfo.InvariantCulture);
        var ratioRaw = samplingRatio.ToString(CultureInfo.InvariantCulture);

        var issues = new List<string>();

        if (samplingRatio < 0)
        {
            // A negative ratio is out of range, not merely "zero".
            issues.Add($"Sampling ratio ({ratioRaw}) is negative - should be between 0 and 1");
        }
        else if (samplingRatio <= 0)
        {
            issues.Add("Sampling ratio is 0 - no traces will be collected");
        }
        else if (samplingRatio < 0.01)
        {
            issues.Add($"Sampling ratio ({ratioPercent}) is very low - important traces may be missed");
        }
        else if (samplingRatio > 1.0)
        {
            issues.Add($"Sampling ratio ({ratioRaw}) is greater than 1.0 - should be between 0 and 1");
        }

        if (!httpInstrumentation)
        {
            issues.Add("HTTP instrumentation is disabled - HTTP calls won't be traced");
        }

        if (!dbInstrumentation)
        {
            issues.Add("Database instrumentation is disabled - DB queries won't be traced");
        }

        if (issues.Count > 0)
        {
            return Task.FromResult(result
                .Warn($"{issues.Count} tracing configuration issue(s)")
                .WithEvidence("Tracing configuration", e =>
                {
                    e.Add("Enabled", tracingEnabled?.ToString() ?? "default");
                    e.Add("Propagator", propagator);
                    e.Add("SamplingRatio", ratioPercent);
                    e.Add("Exporter", exporterType);
                    e.Add("MaxAttributeLength", maxAttributeLength.ToString(CultureInfo.InvariantCulture));
                    e.Add("HttpInstrumentation", httpInstrumentation.ToString());
                    e.Add("DatabaseInstrumentation", dbInstrumentation.ToString());
                })
                .WithCauses(issues.ToArray())
                .WithRemediation(r => r
                    .AddManualStep(1, "Set sampling ratio", "Configure Tracing:SamplingRatio between 0.01 and 1.0")
                    .AddManualStep(2, "Enable instrumentation", "Enable HTTP and database instrumentation"))
                .WithVerification("stella doctor --check check.observability.tracing")
                .Build());
        }

        // Same evidence keys as the warning path so both outcomes can be compared directly.
        return Task.FromResult(result
            .Pass("Distributed tracing is properly configured")
            .WithEvidence("Tracing configuration", e =>
            {
                e.Add("Enabled", tracingEnabled?.ToString() ?? "default");
                e.Add("Propagator", propagator);
                e.Add("SamplingRatio", ratioPercent);
                e.Add("Exporter", exporterType);
                e.Add("MaxAttributeLength", maxAttributeLength.ToString(CultureInfo.InvariantCulture));
                e.Add("HttpInstrumentation", httpInstrumentation.ToString());
                e.Add("DatabaseInstrumentation", dbInstrumentation.ToString());
            })
            .Build());
    }
}

View File

@@ -0,0 +1,20 @@
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.DependencyInjection.Extensions;
using StellaOps.Doctor.Plugins;
namespace StellaOps.Doctor.Plugins.Observability.DependencyInjection;
/// <summary>
/// Extension methods for registering the Observability plugin.
/// </summary>
public static class ObservabilityPluginExtensions
{
    /// <summary>
    /// Registers <see cref="ObservabilityPlugin"/> as a singleton <see cref="IDoctorPlugin"/>.
    /// The registration is skipped when the same service/implementation pair is already present.
    /// </summary>
    /// <param name="services">The service collection to add the plugin to.</param>
    /// <returns>The same <paramref name="services"/> instance, to allow call chaining.</returns>
    public static IServiceCollection AddDoctorObservabilityPlugin(this IServiceCollection services)
    {
        var pluginDescriptor = ServiceDescriptor.Singleton<IDoctorPlugin, ObservabilityPlugin>();
        services.TryAddEnumerable(pluginDescriptor);

        return services;
    }
}

View File

@@ -0,0 +1,43 @@
using StellaOps.Doctor.Models;
using StellaOps.Doctor.Plugins;
using StellaOps.Doctor.Plugins.Observability.Checks;
namespace StellaOps.Doctor.Plugins.Observability;
/// <summary>
/// Plugin for observability and telemetry diagnostics.
/// </summary>
public sealed class ObservabilityPlugin : IDoctorPlugin
{
    /// <inheritdoc />
    public string PluginId => "stellaops.doctor.observability";

    /// <inheritdoc />
    public string DisplayName => "Observability";

    /// <inheritdoc />
    public DoctorCategory Category => DoctorCategory.Observability;

    /// <inheritdoc />
    public Version Version => new Version(1, 0, 0);

    /// <inheritdoc />
    public Version MinEngineVersion => new Version(1, 0, 0);

    /// <inheritdoc />
    /// <remarks>Always reports the plugin as available; <paramref name="services"/> is not inspected.</remarks>
    public bool IsAvailable(IServiceProvider services)
    {
        return true;
    }

    /// <inheritdoc />
    /// <remarks>Returns a fresh instance of each observability check on every call.</remarks>
    public IReadOnlyList<IDoctorCheck> GetChecks(DoctorPluginContext context)
    {
        IReadOnlyList<IDoctorCheck> checks =
        [
            new OpenTelemetryCheck(),
            new LoggingConfigurationCheck(),
            new MetricsCollectionCheck(),
            new TracingConfigurationCheck(),
            new HealthCheckEndpointsCheck(),
            new AlertingConfigurationCheck()
        ];

        return checks;
    }

    /// <inheritdoc />
    /// <remarks>No initialization work is performed; a completed task is returned.</remarks>
    public Task InitializeAsync(DoctorPluginContext context, CancellationToken ct)
    {
        return Task.CompletedTask;
    }
}

View File

@@ -0,0 +1,20 @@
<Project Sdk="Microsoft.NET.Sdk">
<!-- Build settings: .NET 10 target, implicit usings, nullable reference types, preview language features; every warning fails the build. -->
<PropertyGroup>
<TargetFramework>net10.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
<LangVersion>preview</LangVersion>
<TreatWarningsAsErrors>true</TreatWarningsAsErrors>
</PropertyGroup>
<!-- Project dependency on the sibling StellaOps.Doctor project. -->
<ItemGroup>
<ProjectReference Include="..\StellaOps.Doctor\StellaOps.Doctor.csproj" />
</ItemGroup>
<!-- NOTE(review): these PackageReference items carry no Version attribute; presumably versions are supplied centrally (e.g. Directory.Packages.props) - confirm. -->
<ItemGroup>
<PackageReference Include="Microsoft.Extensions.DependencyInjection.Abstractions" />
<PackageReference Include="Microsoft.Extensions.Http" />
</ItemGroup>
</Project>