sln build fix (again), tests fixes, audit work and doctors work
This commit is contained in:
@@ -0,0 +1,132 @@
|
||||
using System.Globalization;
|
||||
using Npgsql;
|
||||
using StellaOps.Doctor.Models;
|
||||
using StellaOps.Doctor.Plugins;
|
||||
using StellaOps.Doctor.Plugins.Builders;
|
||||
|
||||
namespace StellaOps.Doctor.Plugins.Database.Checks;
|
||||
|
||||
/// <summary>
/// Checks the health of the database connection pool.
/// </summary>
/// <remarks>
/// The check aggregates <c>pg_stat_activity</c> rows for the current database (excluding the
/// backend that runs the check itself) and compares the session count against the
/// <c>max_connections</c> setting. A warning is produced when too many sessions are idle in a
/// transaction or when the usage percentage exceeds the configured threshold.
/// NOTE(review): <c>max_connections</c> is a server-wide limit while the session count is
/// filtered to one database, so the reported percentage can understate overall server usage.
/// </remarks>
public sealed class ConnectionPoolHealthCheck : DatabaseCheckBase
{
    /// <summary>More than this many "idle in transaction" sessions produces a warning.</summary>
    private const long IdleInTransactionWarnThreshold = 5;

    /// <summary>Usage (in percent of max_connections) above which a warning is produced.</summary>
    private const double UsageWarnThresholdPercent = 80;

    /// <summary>Fallback used when the max_connections setting cannot be read or parsed.</summary>
    private const int DefaultMaxConnections = 100;

    private const string EvidenceTitle = "Connection pool status";

    private const string VerificationCommand = "stella doctor --check check.db.pool.health";

    /// <summary>Single-row aggregate over the sessions connected to the current database.</summary>
    private const string PoolStatisticsSql = """
        SELECT
            COUNT(*) AS total_connections,
            COUNT(*) FILTER (WHERE state = 'active') AS active_connections,
            COUNT(*) FILTER (WHERE state = 'idle') AS idle_connections,
            COUNT(*) FILTER (WHERE state = 'idle in transaction') AS idle_in_transaction,
            COUNT(*) FILTER (WHERE wait_event IS NOT NULL) AS waiting_connections,
            MAX(EXTRACT(EPOCH FROM (now() - backend_start))) AS oldest_connection_seconds
        FROM pg_stat_activity
        WHERE datname = current_database()
        AND pid <> pg_backend_pid()
        """;

    /// <inheritdoc />
    public override string CheckId => "check.db.pool.health";

    /// <inheritdoc />
    public override string Name => "Connection Pool Health";

    /// <inheritdoc />
    public override string Description => "Verifies the database connection pool is healthy";

    /// <inheritdoc />
    public override IReadOnlyList<string> Tags => ["database", "pool", "connectivity"];

    /// <summary>
    /// Queries connection statistics and the <c>max_connections</c> setting, then reports
    /// a warning, pass or failure result.
    /// </summary>
    /// <param name="context">Plugin context supplied by the caller (not used by this check).</param>
    /// <param name="connectionString">Connection string passed to <c>CreateConnectionAsync</c>.</param>
    /// <param name="result">Builder used to produce the returned check result.</param>
    /// <param name="ct">Token forwarded to every database call.</param>
    /// <returns>
    /// A warning when more than <see cref="IdleInTransactionWarnThreshold"/> sessions are idle in
    /// a transaction, otherwise a warning when usage exceeds
    /// <see cref="UsageWarnThresholdPercent"/> percent, otherwise a pass. A failure is returned
    /// when the statistics query yields no row.
    /// </returns>
    /// <inheritdoc />
    protected override async Task<DoctorCheckResult> ExecuteCheckAsync(
        DoctorPluginContext context,
        string connectionString,
        CheckResultBuilder result,
        CancellationToken ct)
    {
        await using var connection = await CreateConnectionAsync(connectionString, ct);

        long totalConnections;
        long activeConnections;
        long idleConnections;
        long idleInTransaction;
        long waitingConnections;
        double oldestConnectionSeconds;

        // Get connection statistics from pg_stat_activity.
        await using (var cmd = new NpgsqlCommand(PoolStatisticsSql, connection))
        await using (var reader = await cmd.ExecuteReaderAsync(ct))
        {
            if (!await reader.ReadAsync(ct))
            {
                return result
                    .Fail("Unable to retrieve connection pool statistics")
                    .Build();
            }

            totalConnections = reader.GetInt64(0);
            activeConnections = reader.GetInt64(1);
            idleConnections = reader.GetInt64(2);
            idleInTransaction = reader.GetInt64(3);
            waitingConnections = reader.GetInt64(4);

            // MAX(...) is NULL when no other session matches the filter.
            oldestConnectionSeconds = reader.IsDBNull(5) ? 0 : reader.GetDouble(5);
        }

        // The reader is disposed at this point, so the connection can run the next command.
        await using var maxCmd = new NpgsqlCommand("SHOW max_connections", connection);
        var maxConnectionsRaw = await maxCmd.ExecuteScalarAsync(ct) as string;

        // Never throw on an unexpected setting value and never divide by a non-positive limit.
        if (!int.TryParse(maxConnectionsRaw, NumberStyles.Integer, CultureInfo.InvariantCulture, out var maxConnections)
            || maxConnections <= 0)
        {
            maxConnections = DefaultMaxConnections;
        }

        var usagePercent = (double)totalConnections / maxConnections * 100;

        // Format every number once, culture-invariantly, so evidence and messages are
        // identical regardless of the host's regional settings.
        var invariant = CultureInfo.InvariantCulture;
        var totalText = totalConnections.ToString(invariant);
        var activeText = activeConnections.ToString(invariant);
        var idleText = idleConnections.ToString(invariant);
        var idleInTransactionText = idleInTransaction.ToString(invariant);
        var waitingText = waitingConnections.ToString(invariant);
        var maxText = maxConnections.ToString(invariant);
        var usageText = usagePercent.ToString("F1", invariant) + "%";
        var oldestAgeText = oldestConnectionSeconds.ToString("F0", invariant) + "s";

        // Check for issues. Idle-in-transaction sessions take precedence over high usage.
        if (idleInTransaction > IdleInTransactionWarnThreshold)
        {
            return result
                .Warn($"{idleInTransactionText} connections idle in transaction")
                .WithEvidence(EvidenceTitle, e => e
                    .Add("TotalConnections", totalText)
                    .Add("ActiveConnections", activeText)
                    .Add("IdleConnections", idleText)
                    .Add("IdleInTransaction", idleInTransactionText)
                    .Add("WaitingConnections", waitingText)
                    .Add("MaxConnections", maxText)
                    .Add("UsagePercent", usageText))
                .WithCauses(
                    "Long-running transactions not committed",
                    "Application not properly closing transactions",
                    "Deadlock or lock contention")
                .WithRemediation(r => r
                    .AddShellStep(1, "Find idle transactions", "psql -c \"SELECT pid, query FROM pg_stat_activity WHERE state = 'idle in transaction'\"")
                    .AddManualStep(2, "Review application code", "Ensure transactions are properly committed or rolled back"))
                .WithVerification(VerificationCommand)
                .Build();
        }

        if (usagePercent > UsageWarnThresholdPercent)
        {
            return result
                .Warn($"Connection pool usage at {usageText}")
                .WithEvidence(EvidenceTitle, e => e
                    .Add("TotalConnections", totalText)
                    .Add("ActiveConnections", activeText)
                    .Add("IdleConnections", idleText)
                    .Add("MaxConnections", maxText)
                    .Add("UsagePercent", usageText))
                .WithCauses(
                    "Connection leak in application",
                    "Too many concurrent requests",
                    "max_connections too low for workload")
                .WithRemediation(r => r
                    .AddManualStep(1, "Review connection pool settings", "Check Npgsql connection string pool size")
                    .AddManualStep(2, "Consider increasing max_connections", "Edit postgresql.conf if appropriate"))
                .WithVerification(VerificationCommand)
                .Build();
        }

        return result
            .Pass($"Connection pool healthy: {totalText}/{maxText} connections ({usageText})")
            .WithEvidence(EvidenceTitle, e => e
                .Add("TotalConnections", totalText)
                .Add("ActiveConnections", activeText)
                .Add("IdleConnections", idleText)
                .Add("IdleInTransaction", idleInTransactionText)
                .Add("WaitingConnections", waitingText)
                .Add("MaxConnections", maxText)
                .Add("UsagePercent", usageText)
                .Add("OldestConnectionAge", oldestAgeText))
            .Build();
    }
}
|
||||
Reference in New Issue
Block a user