Implement remediation-aware health checks across all Doctor plugin modules (Agent, Attestor, Auth, BinaryAnalysis, Compliance, Crypto, Environment, EvidenceLocker, Notify, Observability, Operations, Policy, Postgres, Release, Scanner, Storage, Vex) and their backing library counterparts (AI, Attestation, Authority, Core, Cryptography, Database, Docker, Integration, Notify, Observability, Security, ServiceGraph, Sources, Verification). Each check now emits structured remediation metadata (severity, category, runbook links, and fix suggestions) consumed by the Doctor dashboard remediation panel. Also adds: - docs/doctor/articles/ knowledge base for check explanations - Advisory AI search seed and allowlist updates for doctor content - Sprint plan for doctor checks documentation Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
136 lines
6.9 KiB
C#
136 lines
6.9 KiB
C#
|
|
using Npgsql;
|
|
using StellaOps.Doctor.Models;
|
|
using StellaOps.Doctor.Plugins;
|
|
using StellaOps.Doctor.Plugins.Builders;
|
|
using System.Globalization;
|
|
|
|
namespace StellaOps.Doctor.Plugins.Database.Checks;
|
|
|
|
/// <summary>
/// Checks the health of the database connection pool.
/// </summary>
/// <remarks>
/// The check counts the sessions that <c>pg_stat_activity</c> reports for the current
/// database (excluding the check's own backend) and compares the total against the
/// server's <c>max_connections</c> setting. It warns when more than
/// <see cref="IdleInTransactionWarnThreshold"/> sessions are idle in transaction or when
/// usage exceeds <see cref="UsageWarnThresholdPercent"/> percent; otherwise it passes.
/// </remarks>
public sealed class ConnectionPoolHealthCheck : DatabaseCheckBase
{
    /// <summary>Number of "idle in transaction" sessions above which the check warns.</summary>
    private const long IdleInTransactionWarnThreshold = 5;

    /// <summary>Connection usage (percent of <c>max_connections</c>) above which the check warns.</summary>
    private const double UsageWarnThresholdPercent = 80;

    /// <summary>Value assumed when <c>SHOW max_connections</c> does not return a string.</summary>
    private const string DefaultMaxConnections = "100";

    /// <summary>Title shared by every evidence section this check emits.</summary>
    private const string EvidenceTitle = "Connection pool status";

    /// <summary>Runbook article attached to every remediation this check emits.</summary>
    private const string RunbookUrl = "docs/doctor/articles/postgres/db-pool-health.md";

    /// <summary>Command an operator can run to re-verify after applying a remediation.</summary>
    private const string VerificationCommand = "stella doctor --check check.db.pool.health";

    /// <inheritdoc />
    public override string CheckId => "check.db.pool.health";

    /// <inheritdoc />
    public override string Name => "Connection Pool Health";

    /// <inheritdoc />
    public override string Description => "Verifies the database connection pool is healthy";

    /// <inheritdoc />
    public override IReadOnlyList<string> Tags => ["database", "pool", "connectivity"];

    /// <summary>
    /// Collects connection statistics and classifies the pool as passing, warning or failing.
    /// </summary>
    /// <param name="context">Plugin context supplied by the Doctor runtime (not used by this check).</param>
    /// <param name="connectionString">Connection string of the database to inspect.</param>
    /// <param name="result">Builder used to produce the check result.</param>
    /// <param name="ct">Token used to cancel the database round-trips.</param>
    /// <returns>
    /// A warning when too many sessions are idle in transaction or usage is high, a failure
    /// when the statistics or <c>max_connections</c> cannot be read, and a pass otherwise.
    /// </returns>
    protected override async Task<DoctorCheckResult> ExecuteCheckAsync(
        DoctorPluginContext context,
        string connectionString,
        CheckResultBuilder result,
        CancellationToken ct)
    {
        // NOTE(review): CreateConnectionAsync comes from the base class; presumably it
        // returns an already-opened connection - confirm against DatabaseCheckBase.
        await using var connection = await CreateConnectionAsync(connectionString, ct);

        // Get connection statistics from pg_stat_activity.
        // The check's own backend is excluded so it does not count itself.
        // NOTE(review): PostgreSQL also sets wait_event for idle sessions (ClientRead),
        // so waiting_connections probably includes idle clients - confirm the intent.
        await using var cmd = new NpgsqlCommand(@"
            SELECT
                COUNT(*) AS total_connections,
                COUNT(*) FILTER (WHERE state = 'active') AS active_connections,
                COUNT(*) FILTER (WHERE state = 'idle') AS idle_connections,
                COUNT(*) FILTER (WHERE state = 'idle in transaction') AS idle_in_transaction,
                COUNT(*) FILTER (WHERE wait_event IS NOT NULL) AS waiting_connections,
                MAX(EXTRACT(EPOCH FROM (now() - backend_start))) AS oldest_connection_seconds
            FROM pg_stat_activity
            WHERE datname = current_database()
              AND pid <> pg_backend_pid()",
            connection);

        await using var reader = await cmd.ExecuteReaderAsync(ct);

        if (!await reader.ReadAsync(ct))
        {
            return result
                .Fail("Unable to retrieve connection pool statistics")
                .Build();
        }

        var totalConnections = reader.GetInt64(0);
        var activeConnections = reader.GetInt64(1);
        var idleConnections = reader.GetInt64(2);
        var idleInTransaction = reader.GetInt64(3);
        var waitingConnections = reader.GetInt64(4);
        // MAX() over zero rows is NULL, i.e. there are no other sessions.
        var oldestConnectionSeconds = reader.IsDBNull(5) ? 0 : reader.GetDouble(5);

        // The reader must be closed before another command can run on the same connection.
        await reader.CloseAsync();

        // Get max connections setting
        await using var maxCmd = new NpgsqlCommand("SHOW max_connections", connection);
        var maxConnectionsRaw = await maxCmd.ExecuteScalarAsync(ct) as string ?? DefaultMaxConnections;

        // TryParse instead of Parse: an unexpected value becomes an explicit failed check
        // rather than a FormatException, and a non-positive value cannot poison the
        // percentage with Infinity/NaN.
        if (!int.TryParse(maxConnectionsRaw, NumberStyles.Integer, CultureInfo.InvariantCulture, out var maxConnections)
            || maxConnections <= 0)
        {
            return result
                .Fail($"Unable to parse max_connections value '{maxConnectionsRaw}'")
                .Build();
        }

        var usagePercent = (double)totalConnections / maxConnections * 100;

        // Format every value once with the invariant culture so evidence and messages are
        // identical regardless of the host locale (interpolation alone would use the
        // current culture and could emit e.g. "85,3%").
        var totalText = totalConnections.ToString(CultureInfo.InvariantCulture);
        var activeText = activeConnections.ToString(CultureInfo.InvariantCulture);
        var idleText = idleConnections.ToString(CultureInfo.InvariantCulture);
        var idleInTransactionText = idleInTransaction.ToString(CultureInfo.InvariantCulture);
        var waitingText = waitingConnections.ToString(CultureInfo.InvariantCulture);
        var maxText = maxConnections.ToString(CultureInfo.InvariantCulture);
        var usageText = usagePercent.ToString("F1", CultureInfo.InvariantCulture) + "%";
        var oldestText = oldestConnectionSeconds.ToString("F0", CultureInfo.InvariantCulture) + "s";

        // Check for issues
        if (idleInTransaction > IdleInTransactionWarnThreshold)
        {
            return result
                .Warn($"{idleInTransactionText} connections idle in transaction")
                .WithEvidence(EvidenceTitle, e => e
                    .Add("TotalConnections", totalText)
                    .Add("ActiveConnections", activeText)
                    .Add("IdleConnections", idleText)
                    .Add("IdleInTransaction", idleInTransactionText)
                    .Add("WaitingConnections", waitingText)
                    .Add("MaxConnections", maxText)
                    .Add("UsagePercent", usageText))
                .WithCauses(
                    "Long-running transactions not committed",
                    "Application not properly closing transactions",
                    "Deadlock or lock contention")
                .WithRemediation(r => r
                    .AddShellStep(1, "Find idle transactions", "psql -c \"SELECT pid, query FROM pg_stat_activity WHERE state = 'idle in transaction'\"")
                    .AddManualStep(2, "Review application code", "Ensure transactions are properly committed or rolled back")
                    .WithRunbookUrl(RunbookUrl))
                .WithVerification(VerificationCommand)
                .Build();
        }

        if (usagePercent > UsageWarnThresholdPercent)
        {
            return result
                .Warn($"Connection pool usage at {usageText}")
                .WithEvidence(EvidenceTitle, e => e
                    .Add("TotalConnections", totalText)
                    .Add("ActiveConnections", activeText)
                    .Add("IdleConnections", idleText)
                    // Reported here as well so all outcomes expose the same core evidence.
                    .Add("IdleInTransaction", idleInTransactionText)
                    .Add("WaitingConnections", waitingText)
                    .Add("MaxConnections", maxText)
                    .Add("UsagePercent", usageText))
                .WithCauses(
                    "Connection leak in application",
                    "Too many concurrent requests",
                    "max_connections too low for workload")
                .WithRemediation(r => r
                    .AddManualStep(1, "Review connection pool settings", "Check Npgsql connection string pool size")
                    .AddManualStep(2, "Consider increasing max_connections", "Edit postgresql.conf if appropriate")
                    .WithRunbookUrl(RunbookUrl))
                .WithVerification(VerificationCommand)
                .Build();
        }

        return result
            .Pass($"Connection pool healthy: {totalText}/{maxText} connections ({usageText})")
            .WithEvidence(EvidenceTitle, e => e
                .Add("TotalConnections", totalText)
                .Add("ActiveConnections", activeText)
                .Add("IdleConnections", idleText)
                .Add("IdleInTransaction", idleInTransactionText)
                .Add("WaitingConnections", waitingText)
                .Add("MaxConnections", maxText)
                .Add("UsagePercent", usageText)
                .Add("OldestConnectionAge", oldestText))
            .Build();
    }
}
|