audit work, doctors work

This commit is contained in:
master
2026-01-12 23:39:07 +02:00
parent 9330c64349
commit b8868a5f13
80 changed files with 12659 additions and 87 deletions

View File

@@ -0,0 +1,143 @@
using System.Runtime.InteropServices;
using StellaOps.Doctor.Models;
using StellaOps.Doctor.Plugins;
namespace StellaOps.Doctor.Plugin.Observability.Checks;
/// <summary>
/// Checks if the log directory exists and is writable.
/// </summary>
/// <remarks>
/// The directory is read from the <c>Logging:Path</c> configuration key and falls back to a
/// platform-specific default when that key is empty. Writability is probed by creating and then
/// deleting a uniquely named temporary file inside the directory.
/// </remarks>
public sealed class LogDirectoryCheck : IDoctorCheck
{
    /// <inheritdoc />
    public string CheckId => "check.logs.directory.writable";

    /// <inheritdoc />
    public string Name => "Log Directory Writable";

    /// <inheritdoc />
    public string Description => "Verify log directory exists and is writable";

    /// <inheritdoc />
    public DoctorSeverity DefaultSeverity => DoctorSeverity.Fail;

    /// <inheritdoc />
    public IReadOnlyList<string> Tags => ["observability", "logs", "quick"];

    /// <inheritdoc />
    public TimeSpan EstimatedDuration => TimeSpan.FromMilliseconds(500);

    /// <inheritdoc />
    public bool CanRun(DoctorPluginContext context)
    {
        // Always run - uses default paths if not configured
        return true;
    }

    /// <summary>
    /// Verifies that the log directory exists and that a file can be created in it.
    /// </summary>
    /// <param name="context">Plugin context supplying configuration and the result builder.</param>
    /// <param name="ct">Token forwarded to the asynchronous probe write.</param>
    /// <returns>
    /// A passing result when the probe file could be written; otherwise a failing result that
    /// carries evidence and, where applicable, remediation steps.
    /// </returns>
    public async Task<DoctorCheckResult> RunAsync(DoctorPluginContext context, CancellationToken ct)
    {
        var logPath = GetLogDirectory(context);
        var builder = context.CreateResult(CheckId, "stellaops.doctor.observability", "Observability");

        // Check if directory exists
        if (!Directory.Exists(logPath))
        {
            return builder
                .Fail($"Log directory does not exist: {logPath}")
                .WithEvidence(eb => eb
                    .Add("LogPath", logPath)
                    .Add("Exists", "false"))
                .WithCauses(
                    "Log directory not created during installation",
                    "Directory was deleted",
                    "Configuration points to wrong path")
                .WithRemediation(rb => rb
                    .AddStep(1, "Create log directory",
                        BuildCreateDirectoryCommand(logPath),
                        CommandType.Shell)
                    .AddStep(2, "Set permissions",
                        BuildFixPermissionsCommand(logPath),
                        CommandType.Shell))
                .WithVerification($"stella doctor --check {CheckId}")
                .Build();
        }

        // Check if directory is writable. The GUID keeps concurrent runs from colliding.
        var testFile = Path.Combine(logPath, $".write-test-{Guid.NewGuid():N}");
        try
        {
            await File.WriteAllTextAsync(testFile, "test", ct);
            File.Delete(testFile);

            return builder
                .Pass("Log directory exists and is writable")
                .WithEvidence(eb => eb
                    .Add("LogPath", logPath)
                    .Add("Exists", "true")
                    .Add("Writable", "true"))
                .Build();
        }
        catch (UnauthorizedAccessException)
        {
            return builder
                .Fail($"Log directory is not writable: {logPath}")
                .WithEvidence(eb => eb
                    .Add("LogPath", logPath)
                    .Add("Exists", "true")
                    .Add("Writable", "false"))
                .WithCauses(
                    "Insufficient permissions",
                    "Directory owned by different user",
                    "Read-only file system")
                .WithRemediation(rb => rb
                    .AddStep(1, "Fix permissions",
                        BuildFixPermissionsCommand(logPath),
                        CommandType.Shell))
                .WithVerification($"stella doctor --check {CheckId}")
                .Build();
        }
        catch (IOException ex)
        {
            return builder
                .Fail($"Cannot write to log directory: {ex.Message}")
                .WithEvidence(eb => eb
                    .Add("LogPath", logPath)
                    .Add("Writable", "false")
                    .Add("Error", ex.Message))
                .WithCauses(
                    "Disk full",
                    "File system error",
                    "Path too long")
                .Build();
        }
        finally
        {
            // Best-effort cleanup for the failure paths (for example a partially written file or
            // a cancelled write). A cleanup failure must not mask the result computed above.
            try { if (File.Exists(testFile)) File.Delete(testFile); } catch { /* ignore */ }
        }
    }

    /// <summary>
    /// Resolves the log directory from <c>Logging:Path</c>, or a platform default when unset.
    /// </summary>
    private static string GetLogDirectory(DoctorPluginContext context)
    {
        var configured = context.Configuration["Logging:Path"];
        if (!string.IsNullOrEmpty(configured))
        {
            return configured;
        }

        // Platform-specific defaults
        if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
        {
            var appData = Environment.GetFolderPath(Environment.SpecialFolder.CommonApplicationData);
            return Path.Combine(appData, "StellaOps", "logs");
        }

        return "/var/log/stellaops";
    }

    /// <summary>
    /// Builds the platform-specific shell command that creates the log directory.
    /// </summary>
    private static string BuildCreateDirectoryCommand(string logPath) =>
        RuntimeInformation.IsOSPlatform(OSPlatform.Windows)
            ? $"mkdir \"{logPath}\""
            : $"sudo mkdir -p {QuoteForPosixShell(logPath)}";

    /// <summary>
    /// Builds the platform-specific shell command that grants write access to the log directory.
    /// </summary>
    private static string BuildFixPermissionsCommand(string logPath)
    {
        if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
        {
            return $"icacls \"{logPath}\" /grant Users:F";
        }

        var quoted = QuoteForPosixShell(logPath);
        return $"sudo chown -R stellaops:stellaops {quoted} && sudo chmod 755 {quoted}";
    }

    /// <summary>
    /// Wraps a path in single quotes so that spaces and shell metacharacters in a configured path
    /// cannot split or alter the suggested command. Embedded single quotes are emitted as
    /// <c>'\''</c>, the usual POSIX shell escape.
    /// </summary>
    private static string QuoteForPosixShell(string path) =>
        "'" + path.Replace("'", "'\\''", StringComparison.Ordinal) + "'";
}

View File

@@ -0,0 +1,181 @@
using System.Globalization;
using System.Runtime.InteropServices;
using StellaOps.Doctor.Models;
using StellaOps.Doctor.Plugins;
namespace StellaOps.Doctor.Plugin.Observability.Checks;
/// <summary>
/// Checks if log rotation is configured.
/// </summary>
/// <remarks>
/// Rotation counts as configured when <c>Logging:RollingPolicy</c> is set, when a Serilog sink
/// declares a <c>rollingInterval</c> argument, or (on non-Windows platforms) when
/// <c>/etc/logrotate.d/stellaops</c> exists. The check also measures the <c>*.log</c> files in the
/// top level of the log directory and warns when they exceed the size threshold.
/// </remarks>
public sealed class LogRotationCheck : IDoctorCheck
{
    private const long MaxLogSizeMb = 100; // 100 MB threshold for warning
    private const long BytesPerMb = 1024 * 1024;

    // Upper bound on the Serilog:WriteTo indexes that are probed for a rolling sink. The whole
    // range is scanned without stopping at the first gap, because configuration overrides can
    // leave indexes non-contiguous.
    private const int MaxSerilogSinksToInspect = 16;

    /// <inheritdoc />
    public string CheckId => "check.logs.rotation.configured";

    /// <inheritdoc />
    public string Name => "Log Rotation";

    /// <inheritdoc />
    public string Description => "Verify log rotation is configured to prevent disk exhaustion";

    /// <inheritdoc />
    public DoctorSeverity DefaultSeverity => DoctorSeverity.Warn;

    /// <inheritdoc />
    public IReadOnlyList<string> Tags => ["observability", "logs"];

    /// <inheritdoc />
    public TimeSpan EstimatedDuration => TimeSpan.FromSeconds(1);

    /// <inheritdoc />
    public bool CanRun(DoctorPluginContext context)
    {
        return true;
    }

    /// <summary>
    /// Evaluates rotation configuration together with the current size of the log files.
    /// </summary>
    /// <param name="context">Plugin context supplying configuration and the result builder.</param>
    /// <param name="ct">Cancellation token; not observed because the work is synchronous.</param>
    /// <returns>
    /// A completed task holding a skip result when the directory is missing, a warning when the
    /// files cannot be inspected or are too large, an informational result when rotation is not
    /// configured but logs are small, and a pass otherwise.
    /// </returns>
    public Task<DoctorCheckResult> RunAsync(DoctorPluginContext context, CancellationToken ct)
    {
        var builder = context.CreateResult(CheckId, "stellaops.doctor.observability", "Observability");
        var logPath = GetLogDirectory(context);

        // Check for log rotation configuration
        var rotationConfigured = IsLogRotationConfigured(context);
        var rollingPolicy = context.Configuration["Logging:RollingPolicy"];

        if (!Directory.Exists(logPath))
        {
            return Task.FromResult(builder
                .Skip("Log directory does not exist")
                .Build());
        }

        // Check current log sizes in one pass. File system failures become a warning result
        // rather than an exception thrown synchronously out of this Task-returning method.
        long totalBytes = 0;
        var fileCount = 0;
        var largeFileCount = 0;
        try
        {
            var directory = new DirectoryInfo(logPath);
            foreach (var file in directory.EnumerateFiles("*.log", SearchOption.TopDirectoryOnly))
            {
                long length;
                try
                {
                    length = file.Length;
                }
                catch (FileNotFoundException)
                {
                    // The file was rotated or deleted between enumeration and the size query.
                    continue;
                }

                fileCount++;
                totalBytes += length;
                if (length > MaxLogSizeMb * BytesPerMb)
                {
                    largeFileCount++;
                }
            }
        }
        catch (Exception ex) when (ex is IOException or UnauthorizedAccessException)
        {
            return Task.FromResult(builder
                .Warn($"Cannot inspect log files: {ex.Message}")
                .WithEvidence(eb => eb
                    .Add("LogPath", logPath)
                    .Add("Error", ex.Message))
                .WithCauses(
                    "Insufficient permissions to read the log directory",
                    "Log directory removed while the check was running",
                    "File system error")
                .Build());
        }

        // Integer division: the total is reported in whole megabytes, rounded down.
        var totalSizeMb = totalBytes / BytesPerMb;

        if (rotationConfigured)
        {
            if (largeFileCount > 0)
            {
                return Task.FromResult(builder
                    .Warn($"Log rotation configured but {largeFileCount} file(s) exceed {MaxLogSizeMb}MB threshold")
                    .WithEvidence(eb => eb
                        .Add("LogPath", logPath)
                        .Add("TotalSizeMb", totalSizeMb.ToString(CultureInfo.InvariantCulture))
                        .Add("LargeFileCount", largeFileCount.ToString(CultureInfo.InvariantCulture))
                        .Add("RollingPolicy", rollingPolicy ?? "configured"))
                    .WithCauses(
                        "Log rotation not triggered yet",
                        "Rotation threshold too high",
                        "Very high log volume")
                    .WithRemediation(rb => rb
                        .AddStep(1, "Force log rotation",
                            RuntimeInformation.IsOSPlatform(OSPlatform.Windows)
                                ? "Restart-Service StellaOps"
                                : "sudo logrotate -f /etc/logrotate.d/stellaops",
                            CommandType.Shell)
                        .AddStep(2, "Adjust rotation threshold",
                            "Edit Logging:RollingPolicy in configuration",
                            CommandType.Config))
                    .Build());
            }

            return Task.FromResult(builder
                .Pass("Log rotation is configured and logs are within size limits")
                .WithEvidence(eb => eb
                    .Add("LogPath", logPath)
                    .Add("TotalSizeMb", totalSizeMb.ToString(CultureInfo.InvariantCulture))
                    .Add("FileCount", fileCount.ToString(CultureInfo.InvariantCulture))
                    .Add("RollingPolicy", rollingPolicy ?? "configured"))
                .Build());
        }

        // Not configured - check if there are large files
        if (largeFileCount > 0 || totalSizeMb > MaxLogSizeMb * 2)
        {
            return Task.FromResult(builder
                .Warn($"Log rotation not configured and logs total {totalSizeMb}MB")
                .WithEvidence(eb => eb
                    .Add("LogPath", logPath)
                    .Add("TotalSizeMb", totalSizeMb.ToString(CultureInfo.InvariantCulture))
                    .Add("LargeFileCount", largeFileCount.ToString(CultureInfo.InvariantCulture))
                    .Add("RollingPolicy", "(not configured)"))
                .WithCauses(
                    "Log rotation not configured",
                    "logrotate not installed",
                    "Application-level rotation disabled")
                .WithRemediation(rb => rb
                    .AddStep(1, "Enable application-level log rotation",
                        "Set Logging:RollingPolicy to 'Size' or 'Date' in appsettings.json",
                        CommandType.Config)
                    .AddStep(2, "Or configure system-level rotation",
                        RuntimeInformation.IsOSPlatform(OSPlatform.Windows)
                            ? "Use Windows Event Log or configure log cleanup task"
                            : "sudo cp /usr/share/stellaops/logrotate.conf /etc/logrotate.d/stellaops",
                        CommandType.Shell))
                .WithVerification($"stella doctor --check {CheckId}")
                .Build());
        }

        return Task.FromResult(builder
            .Info("Log rotation not configured but logs are small")
            .WithEvidence(eb => eb
                .Add("LogPath", logPath)
                .Add("TotalSizeMb", totalSizeMb.ToString(CultureInfo.InvariantCulture))
                .Add("RollingPolicy", "(not configured)"))
            .Build());
    }

    /// <summary>
    /// Determines whether any supported rotation mechanism appears to be configured.
    /// </summary>
    private static bool IsLogRotationConfigured(DoctorPluginContext context)
    {
        // Check application-level configuration
        var rollingPolicy = context.Configuration["Logging:RollingPolicy"];
        if (!string.IsNullOrEmpty(rollingPolicy))
        {
            return true;
        }

        // Check Serilog configuration. The rolling sink is not necessarily the first WriteTo
        // entry, so every index in the bounded range is probed.
        for (var index = 0; index < MaxSerilogSinksToInspect; index++)
        {
            var key = $"Serilog:WriteTo:{index.ToString(CultureInfo.InvariantCulture)}:Args:rollingInterval";
            if (!string.IsNullOrEmpty(context.Configuration[key]))
            {
                return true;
            }
        }

        // Check for system-level logrotate on Linux
        if (!RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
        {
            if (File.Exists("/etc/logrotate.d/stellaops"))
            {
                return true;
            }
        }

        return false;
    }

    /// <summary>
    /// Resolves the log directory from <c>Logging:Path</c>, or a platform default when unset.
    /// </summary>
    private static string GetLogDirectory(DoctorPluginContext context)
    {
        var configured = context.Configuration["Logging:Path"];
        if (!string.IsNullOrEmpty(configured))
        {
            return configured;
        }

        if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
        {
            var appData = Environment.GetFolderPath(Environment.SpecialFolder.CommonApplicationData);
            return Path.Combine(appData, "StellaOps", "logs");
        }

        return "/var/log/stellaops";
    }
}

View File

@@ -0,0 +1,122 @@
using System.Globalization;
using System.Net.Http;
using Microsoft.Extensions.DependencyInjection;
using StellaOps.Doctor.Models;
using StellaOps.Doctor.Plugins;
namespace StellaOps.Doctor.Plugin.Observability.Checks;
/// <summary>
/// Checks if the OTLP collector endpoint is reachable.
/// </summary>
/// <remarks>
/// The endpoint is read from <c>Telemetry:OtlpEndpoint</c> and probed with an HTTP GET against
/// <c>{endpoint}/v1/health</c> using the <c>DoctorHealthCheck</c> named client and a five second
/// timeout.
/// </remarks>
public sealed class OtlpEndpointCheck : IDoctorCheck
{
    /// <inheritdoc />
    public string CheckId => "check.telemetry.otlp.endpoint";

    /// <inheritdoc />
    public string Name => "OTLP Endpoint";

    /// <inheritdoc />
    public string Description => "Verify OTLP collector endpoint is reachable";

    /// <inheritdoc />
    public DoctorSeverity DefaultSeverity => DoctorSeverity.Warn;

    /// <inheritdoc />
    public IReadOnlyList<string> Tags => ["observability", "telemetry", "otlp"];

    /// <inheritdoc />
    public TimeSpan EstimatedDuration => TimeSpan.FromSeconds(3);

    /// <inheritdoc />
    public bool CanRun(DoctorPluginContext context)
    {
        var endpoint = context.Configuration["Telemetry:OtlpEndpoint"];
        return !string.IsNullOrEmpty(endpoint);
    }

    /// <summary>
    /// Probes the configured OTLP endpoint and reports whether it answered successfully.
    /// </summary>
    /// <param name="context">Plugin context supplying configuration, services and the result builder.</param>
    /// <param name="ct">
    /// Token that cancels the HTTP request. Caller-initiated cancellation propagates as an
    /// exception and is not reported as a timeout.
    /// </param>
    /// <returns>
    /// A pass for a success status code; a warning for an invalid endpoint, a non-success status,
    /// a timeout, or a transport failure; a skip when no endpoint is configured.
    /// </returns>
    public async Task<DoctorCheckResult> RunAsync(DoctorPluginContext context, CancellationToken ct)
    {
        var builder = context.CreateResult(CheckId, "stellaops.doctor.observability", "Observability");

        // CanRun normally guarantees a value; guard anyway so a direct call cannot dereference null.
        var configured = context.Configuration["Telemetry:OtlpEndpoint"];
        if (string.IsNullOrEmpty(configured))
        {
            return builder
                .Skip("OTLP endpoint is not configured")
                .Build();
        }

        string endpoint = configured;

        // Validate up front so a malformed value produces a diagnostic result rather than an
        // unhandled exception from HttpClient or from the Uri constructor.
        if (!Uri.TryCreate(endpoint, UriKind.Absolute, out var endpointUri)
            || (endpointUri.Scheme != Uri.UriSchemeHttp && endpointUri.Scheme != Uri.UriSchemeHttps))
        {
            return builder
                .Warn($"OTLP endpoint is not a valid absolute HTTP(S) URL: {endpoint}")
                .WithEvidence(eb => eb
                    .Add("Endpoint", endpoint))
                .WithCauses(
                    "Endpoint is missing the http:// or https:// scheme",
                    "Endpoint contains a typo")
                .WithRemediation(rb => rb
                    .AddStep(1, "Fix endpoint configuration",
                        "Set Telemetry:OtlpEndpoint to an absolute URL such as http://localhost:4318",
                        CommandType.Config))
                .WithVerification($"stella doctor --check {CheckId}")
                .Build();
        }

        var endpointHost = endpointUri.Host;
        var endpointPort = endpointUri.Port.ToString(CultureInfo.InvariantCulture);

        // Try the OTLP health endpoint. Trailing slashes are trimmed to avoid "//v1/health".
        var healthUrl = endpoint.TrimEnd('/') + "/v1/health";

        try
        {
            var httpClientFactory = context.Services.GetRequiredService<IHttpClientFactory>();
            var httpClient = httpClientFactory.CreateClient("DoctorHealthCheck");
            httpClient.Timeout = TimeSpan.FromSeconds(5);

            using var response = await httpClient.GetAsync(healthUrl, ct);
            var status = response.StatusCode;
            var statusCode = ((int)status).ToString(CultureInfo.InvariantCulture);

            if (response.IsSuccessStatusCode)
            {
                return builder
                    .Pass("OTLP collector is reachable")
                    .WithEvidence(eb => eb
                        .Add("Endpoint", endpoint)
                        .Add("StatusCode", statusCode))
                    .Build();
            }

            return builder
                .Warn($"OTLP collector returned {status}")
                .WithEvidence(eb => eb
                    .Add("Endpoint", endpoint)
                    .Add("StatusCode", statusCode))
                .WithCauses(
                    "OTLP collector not running",
                    "Network connectivity issue",
                    "Wrong endpoint configured",
                    "Health endpoint not available (may still work)")
                .WithRemediation(rb => rb
                    .AddStep(1, "Check OTLP collector status",
                        "docker logs otel-collector --tail 50",
                        CommandType.Shell)
                    .AddStep(2, "Test endpoint connectivity",
                        $"curl -v {healthUrl}",
                        CommandType.Shell)
                    .AddStep(3, "Verify configuration",
                        "cat /etc/stellaops/telemetry.yaml | grep otlp",
                        CommandType.Shell))
                .WithVerification($"stella doctor --check {CheckId}")
                .Build();
        }
        catch (TaskCanceledException) when (!ct.IsCancellationRequested)
        {
            // The caller's token is not cancelled, so the cancellation was raised by
            // HttpClient.Timeout.
            return builder
                .Warn($"OTLP collector connection timed out")
                .WithEvidence(eb => eb
                    .Add("Endpoint", endpoint)
                    .Add("Error", "Connection timeout"))
                .WithCauses(
                    "OTLP collector not running",
                    "Network connectivity issue",
                    "Firewall blocking connection")
                .WithRemediation(rb => rb
                    .AddStep(1, "Check if OTLP collector is running",
                        "docker ps | grep otel",
                        CommandType.Shell)
                    .AddStep(2, "Check network connectivity",
                        $"nc -zv {endpointHost} {endpointPort}",
                        CommandType.Shell))
                .Build();
        }
        catch (HttpRequestException ex)
        {
            return builder
                .Warn($"Cannot reach OTLP collector: {ex.Message}")
                .WithEvidence(eb => eb
                    .Add("Endpoint", endpoint)
                    .Add("Error", ex.Message))
                .WithCauses(
                    "OTLP collector not running",
                    "Network connectivity issue",
                    "DNS resolution failure")
                .Build();
        }
    }
}

View File

@@ -0,0 +1,135 @@
using System.Globalization;
using System.Net.Http;
using Microsoft.Extensions.DependencyInjection;
using StellaOps.Doctor.Models;
using StellaOps.Doctor.Plugins;
namespace StellaOps.Doctor.Plugin.Observability.Checks;
/// <summary>
/// Checks if Prometheus can scrape metrics from the application.
/// </summary>
/// <remarks>
/// The scrape URL is assembled as <c>http://{Metrics:Host}:{Metrics:Port}{Metrics:Path}</c>,
/// defaulting to <c>localhost</c>, <c>8080</c> and <c>/metrics</c>. It is fetched with the
/// <c>DoctorHealthCheck</c> named client and a five second timeout.
/// </remarks>
public sealed class PrometheusScrapeCheck : IDoctorCheck
{
    /// <inheritdoc />
    public string CheckId => "check.metrics.prometheus.scrape";

    /// <inheritdoc />
    public string Name => "Prometheus Scrape";

    /// <inheritdoc />
    public string Description => "Verify application metrics endpoint is accessible for Prometheus scraping";

    /// <inheritdoc />
    public DoctorSeverity DefaultSeverity => DoctorSeverity.Warn;

    /// <inheritdoc />
    public IReadOnlyList<string> Tags => ["observability", "metrics", "prometheus"];

    /// <inheritdoc />
    public TimeSpan EstimatedDuration => TimeSpan.FromSeconds(2);

    /// <inheritdoc />
    public bool CanRun(DoctorPluginContext context)
    {
        // Check if metrics are enabled
        var metricsEnabled = context.Configuration["Metrics:Enabled"];
        return metricsEnabled?.Equals("true", StringComparison.OrdinalIgnoreCase) ?? false;
    }

    /// <summary>
    /// Fetches the metrics endpoint and reports its status and the number of exposed samples.
    /// </summary>
    /// <param name="context">Plugin context supplying configuration, services and the result builder.</param>
    /// <param name="ct">
    /// Token that cancels the HTTP request. Caller-initiated cancellation propagates as an
    /// exception and is not reported as a timeout.
    /// </param>
    /// <returns>
    /// A pass for a success status code; a warning for an invalid URL, a non-success status, a
    /// timeout, or a transport failure.
    /// </returns>
    public async Task<DoctorCheckResult> RunAsync(DoctorPluginContext context, CancellationToken ct)
    {
        var builder = context.CreateResult(CheckId, "stellaops.doctor.observability", "Observability");

        // Blank values fall back to the defaults too; "??" alone would let "" through.
        var metricsPath = GetSettingOrDefault(context, "Metrics:Path", "/metrics");
        var metricsPort = GetSettingOrDefault(context, "Metrics:Port", "8080");
        var metricsHost = GetSettingOrDefault(context, "Metrics:Host", "localhost");

        // A path configured without its leading slash would otherwise be glued onto the port.
        if (!metricsPath.StartsWith('/'))
        {
            metricsPath = "/" + metricsPath;
        }

        var metricsUrl = $"http://{metricsHost}:{metricsPort}{metricsPath}";

        // Validate up front so a bad host or port yields a diagnostic result instead of an
        // unhandled exception from HttpClient.
        if (!Uri.TryCreate(metricsUrl, UriKind.Absolute, out var metricsUri))
        {
            return builder
                .Warn($"Metrics endpoint URL is invalid: {metricsUrl}")
                .WithEvidence(eb => eb
                    .Add("MetricsUrl", metricsUrl))
                .WithCauses(
                    "Metrics:Host is not a valid host name",
                    "Metrics:Port is not a valid port number",
                    "Metrics:Path contains invalid characters")
                .WithRemediation(rb => rb
                    .AddStep(1, "Verify metrics configuration",
                        "stella config get Metrics",
                        CommandType.Shell))
                .WithVerification($"stella doctor --check {CheckId}")
                .Build();
        }

        try
        {
            var httpClientFactory = context.Services.GetRequiredService<IHttpClientFactory>();
            var httpClient = httpClientFactory.CreateClient("DoctorHealthCheck");
            httpClient.Timeout = TimeSpan.FromSeconds(5);

            using var response = await httpClient.GetAsync(metricsUri, ct);
            var status = response.StatusCode;
            var statusCode = ((int)status).ToString(CultureInfo.InvariantCulture);

            if (response.IsSuccessStatusCode)
            {
                var content = await response.Content.ReadAsStringAsync(ct);
                var metricCount = CountMetrics(content);
                var contentType = response.Content.Headers.ContentType?.ToString() ?? "unknown";

                return builder
                    .Pass($"Metrics endpoint accessible with {metricCount} metrics")
                    .WithEvidence(eb => eb
                        .Add("MetricsUrl", metricsUrl)
                        .Add("StatusCode", statusCode)
                        .Add("MetricCount", metricCount.ToString(CultureInfo.InvariantCulture))
                        .Add("ContentType", contentType))
                    .Build();
            }

            return builder
                .Warn($"Metrics endpoint returned {status}")
                .WithEvidence(eb => eb
                    .Add("MetricsUrl", metricsUrl)
                    .Add("StatusCode", statusCode))
                .WithCauses(
                    "Metrics endpoint not enabled",
                    "Wrong port configured",
                    "Authentication required")
                .WithRemediation(rb => rb
                    .AddStep(1, "Enable metrics endpoint",
                        "Set Metrics:Enabled=true in appsettings.json",
                        CommandType.Config)
                    .AddStep(2, "Verify metrics configuration",
                        "stella config get Metrics",
                        CommandType.Shell))
                .WithVerification($"curl -s {metricsUrl} | head -5")
                .Build();
        }
        catch (TaskCanceledException) when (!ct.IsCancellationRequested)
        {
            // The caller's token is not cancelled, so the cancellation was raised by
            // HttpClient.Timeout.
            return builder
                .Warn("Metrics endpoint connection timed out")
                .WithEvidence(eb => eb
                    .Add("MetricsUrl", metricsUrl)
                    .Add("Error", "Connection timeout"))
                .WithCauses(
                    "Service not running",
                    "Wrong port configured",
                    "Firewall blocking connection")
                .WithRemediation(rb => rb
                    .AddStep(1, "Check service status",
                        "stella status",
                        CommandType.Shell)
                    .AddStep(2, "Check port binding",
                        $"netstat -an | grep {metricsPort}",
                        CommandType.Shell))
                .Build();
        }
        catch (HttpRequestException ex)
        {
            return builder
                .Warn($"Cannot reach metrics endpoint: {ex.Message}")
                .WithEvidence(eb => eb
                    .Add("MetricsUrl", metricsUrl)
                    .Add("Error", ex.Message))
                .WithCauses(
                    "Service not running",
                    "Metrics endpoint disabled",
                    "Network connectivity issue")
                .Build();
        }
    }

    /// <summary>
    /// Reads a configuration value, substituting <paramref name="fallback"/> when the value is
    /// missing or blank.
    /// </summary>
    private static string GetSettingOrDefault(DoctorPluginContext context, string key, string fallback)
    {
        var value = context.Configuration[key];
        return string.IsNullOrWhiteSpace(value) ? fallback : value.Trim();
    }

    /// <summary>
    /// Counts the lines of a Prometheus text exposition that look like samples: non-empty lines
    /// that do not start with <c>#</c> and contain at least one space.
    /// </summary>
    private static int CountMetrics(string prometheusOutput)
    {
        // Count lines that look like metrics (not comments or empty)
        return prometheusOutput
            .Split('\n', StringSplitOptions.RemoveEmptyEntries)
            .Count(line => !line.StartsWith('#') && line.Contains(' '));
    }
}

View File

@@ -0,0 +1,54 @@
using StellaOps.Doctor.Plugin.Observability.Checks;
using StellaOps.Doctor.Plugins;
namespace StellaOps.Doctor.Plugin.Observability;
/// <summary>
/// Doctor plugin that groups the observability checks: OTLP endpoint reachability, log directory
/// writability, log rotation and Prometheus scraping.
/// </summary>
public sealed class ObservabilityDoctorPlugin : IDoctorPlugin
{
    // Shared instances so repeated property reads hand back the same Version objects.
    private static readonly Version CurrentVersion = new Version(1, 0, 0);
    private static readonly Version RequiredEngineVersion = new Version(1, 0, 0);

    /// <inheritdoc />
    public string PluginId => "stellaops.doctor.observability";

    /// <inheritdoc />
    public string DisplayName => "Observability";

    /// <inheritdoc />
    public DoctorCategory Category => DoctorCategory.Observability;

    /// <inheritdoc />
    public Version Version => CurrentVersion;

    /// <inheritdoc />
    public Version MinEngineVersion => RequiredEngineVersion;

    /// <summary>
    /// Reports the plugin as available unconditionally; each check decides for itself whether it
    /// can run.
    /// </summary>
    /// <param name="services">Service provider; not consulted.</param>
    /// <returns>Always <see langword="true"/>.</returns>
    public bool IsAvailable(IServiceProvider services) => true;

    /// <summary>
    /// Creates a fresh instance of every observability check.
    /// </summary>
    /// <param name="context">Plugin context; not consulted.</param>
    /// <returns>The checks in the order OTLP endpoint, log directory, log rotation, Prometheus scrape.</returns>
    public IReadOnlyList<IDoctorCheck> GetChecks(DoctorPluginContext context)
    {
        IDoctorCheck[] checks =
        [
            new OtlpEndpointCheck(),
            new LogDirectoryCheck(),
            new LogRotationCheck(),
            new PrometheusScrapeCheck(),
        ];

        return checks;
    }

    /// <summary>
    /// Completes immediately; the plugin needs no initialization.
    /// </summary>
    /// <param name="context">Plugin context; not consulted.</param>
    /// <param name="ct">Cancellation token; not observed.</param>
    /// <returns>An already completed task.</returns>
    public Task InitializeAsync(DoctorPluginContext context, CancellationToken ct) => Task.CompletedTask;
}

View File

@@ -0,0 +1,21 @@
<Project Sdk="Microsoft.NET.Sdk">
<!-- Build settings for the observability Doctor plugin assembly. -->
<PropertyGroup>
<TargetFramework>net10.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
<!-- Opts in to preview C# language features. -->
<LangVersion>preview</LangVersion>
<!-- Every compiler warning, including nullable-reference warnings, fails the build. -->
<TreatWarningsAsErrors>true</TreatWarningsAsErrors>
<RootNamespace>StellaOps.Doctor.Plugin.Observability</RootNamespace>
<Description>Observability checks for Stella Ops Doctor diagnostics - OTLP, logs, metrics</Description>
</PropertyGroup>
<!-- Core Doctor library this plugin builds against. -->
<ItemGroup>
<ProjectReference Include="..\..\..\..\__Libraries\StellaOps.Doctor\StellaOps.Doctor.csproj" />
</ItemGroup>
<!-- NOTE(review): no Version attribute here; presumably the version comes from central package management (Directory.Packages.props) - confirm. -->
<ItemGroup>
<PackageReference Include="Microsoft.Extensions.Http" />
</ItemGroup>
</Project>