From bc569931d44a064f8349a861c268353ffe5c45e4 Mon Sep 17 00:00:00 2001 From: master <> Date: Fri, 10 Apr 2026 12:28:36 +0300 Subject: [PATCH] fix(authority): retry transient bootstrap failures with configurable attempts StandardPluginBootstrapper now makes up to 15 bootstrap attempts by default (2s delay between attempts) so the admin user and client seeds converge after PostgreSQL becomes reachable. Exceptions from each bootstrap step now propagate to the retry loop instead of being swallowed per-step; if the final attempt also fails, the error is logged and startup continues. Tests cover the retry path with a FlakyUserRepository that fails once then succeeds. Co-Authored-By: Claude Opus 4.6 (1M context) --- docs/modules/authority/README.md | 9 +- .../StandardPluginBootstrapperTests.cs | 224 +++++++++++++++++- .../Bootstrap/StandardPluginBootstrapper.cs | 84 +++++-- .../TASKS.md | 1 + 4 files changed, 285 insertions(+), 33 deletions(-) diff --git a/docs/modules/authority/README.md b/docs/modules/authority/README.md index d5fdacb22..93b546e4d 100644 --- a/docs/modules/authority/README.md +++ b/docs/modules/authority/README.md @@ -24,10 +24,11 @@ Authority is the platform OIDC/OAuth2 control plane that mints short-lived, send - CLI/UI for login flows and token management. - Scheduler/Scanner for machine-to-machine scope enforcement. -## Operational notes -- PostgreSQL (schema `authority`) for tenant, client, and token state. -- Key material in KMS/HSM with rotation runbooks (`operations/key-rotation.md`). -- Monitoring runbook (`operations/monitoring.md`) and offline-import Grafana JSON (`operations/grafana-dashboard.json`). +## Operational notes +- PostgreSQL (schema `authority`) for tenant, client, and token state. +- Standard plugin bootstrap provisioning retries transient storage failures during startup so seeded local users/clients converge after PostgreSQL becomes reachable. +- Key material in KMS/HSM with rotation runbooks (`operations/key-rotation.md`). +- Monitoring runbook (`operations/monitoring.md`) and offline-import Grafana JSON (`operations/grafana-dashboard.json`). 
## Related resources - ./operations/backup-restore.md diff --git a/src/Authority/StellaOps.Authority/StellaOps.Authority.Plugin.Standard.Tests/StandardPluginBootstrapperTests.cs b/src/Authority/StellaOps.Authority/StellaOps.Authority.Plugin.Standard.Tests/StandardPluginBootstrapperTests.cs index 80a4e0ac5..86efaf65b 100644 --- a/src/Authority/StellaOps.Authority/StellaOps.Authority.Plugin.Standard.Tests/StandardPluginBootstrapperTests.cs +++ b/src/Authority/StellaOps.Authority/StellaOps.Authority.Plugin.Standard.Tests/StandardPluginBootstrapperTests.cs @@ -1,5 +1,6 @@ using System; using System.Collections.Generic; +using System.Linq; using System.Threading; using System.Threading.Tasks; using Microsoft.Extensions.DependencyInjection; @@ -60,7 +61,12 @@ public class StandardPluginBootstrapperTests sp.GetRequiredService())); services.AddSingleton(sp => - new StandardPluginBootstrapper("standard", sp.GetRequiredService(), NullLogger.Instance)); + new StandardPluginBootstrapper( + "standard", + sp.GetRequiredService(), + NullLogger.Instance, + maxAttempts: 1, + retryDelay: TimeSpan.Zero)); using var provider = services.BuildServiceProvider(); var bootstrapper = provider.GetRequiredService(); @@ -89,7 +95,7 @@ public class StandardPluginBootstrapperTests options.BootstrapUser = new BootstrapUserOptions { Username = "bootstrap", - Password = "Password1!", + Password = "Password1234!", RequirePasswordReset = false }; }); @@ -119,7 +125,12 @@ public class StandardPluginBootstrapperTests }); services.AddSingleton(sp => - new StandardPluginBootstrapper("standard", sp.GetRequiredService(), NullLogger.Instance)); + new StandardPluginBootstrapper( + "standard", + sp.GetRequiredService(), + NullLogger.Instance, + maxAttempts: 1, + retryDelay: TimeSpan.Zero)); using var provider = services.BuildServiceProvider(); var bootstrapper = provider.GetRequiredService(); @@ -129,6 +140,69 @@ public class StandardPluginBootstrapperTests Assert.Null(exception); } + [Trait("Category", 
TestCategories.Unit)] + [Fact] + public async Task StartAsync_RetriesBootstrapUserAfterTransientFailure() + { + var services = new ServiceCollection(); + services.AddOptions("standard") + .Configure(options => + { + options.TenantId = "tenant-1"; + options.BootstrapUser = new BootstrapUserOptions + { + Username = "bootstrap", + Password = "Password1234!", + RequirePasswordReset = false, + Roles = Array.Empty() + }; + }); + + var userRepository = new FlakyUserRepository(); + services.AddSingleton(userRepository); + services.AddSingleton(new NoOpRoleRepository()); + services.AddSingleton(new NoOpPermissionRepository()); + services.AddSingleton(); + services.AddSingleton(new FakeTimeProvider(DateTimeOffset.Parse("2025-12-29T13:00:00Z"))); + services.AddSingleton(new FixedStandardIdGenerator()); + services.AddSingleton(new DefaultCryptoProvider()); + + services.AddSingleton(sp => + { + var optionsMonitor = sp.GetRequiredService>(); + var options = optionsMonitor.Get("standard"); + var cryptoProvider = sp.GetRequiredService(); + var auditLogger = sp.GetRequiredService(); + return new StandardUserCredentialStore( + "standard", + "tenant-1", + sp.GetRequiredService(), + options, + new CryptoPasswordHasher(options, cryptoProvider), + auditLogger, + sp.GetRequiredService(), + sp.GetRequiredService(), + NullLogger.Instance); + }); + + services.AddSingleton(sp => + new StandardPluginBootstrapper( + "standard", + sp.GetRequiredService(), + NullLogger.Instance, + maxAttempts: 2, + retryDelay: TimeSpan.Zero)); + + using var provider = services.BuildServiceProvider(); + var bootstrapper = provider.GetRequiredService(); + + await bootstrapper.StartAsync(TestContext.Current.CancellationToken); + + var bootstrapUsers = await userRepository.GetAllAsync("tenant-1", cancellationToken: TestContext.Current.CancellationToken); + Assert.Contains(bootstrapUsers, user => string.Equals(user.Username, "bootstrap", StringComparison.OrdinalIgnoreCase)); + 
Assert.True(userRepository.FailureObserved); + } + private sealed class ThrowingUserRepository : IUserRepository { public Task CreateAsync(UserEntity user, CancellationToken cancellationToken = default) @@ -165,6 +239,150 @@ public class StandardPluginBootstrapperTests => throw new InvalidOperationException("Simulated failure"); } + private sealed class FlakyUserRepository : IUserRepository + { + private readonly Dictionary usersById = new(); + private readonly Dictionary idsByTenantAndUsername = new(StringComparer.OrdinalIgnoreCase); + private bool failNextLookup = true; + + public bool FailureObserved { get; private set; } + + public Task CreateAsync(UserEntity user, CancellationToken cancellationToken = default) + { + var clone = Clone(user); + usersById[clone.Id] = clone; + idsByTenantAndUsername[GetUsernameKey(clone.TenantId, clone.Username)] = clone.Id; + return Task.FromResult(Clone(clone)); + } + + public Task GetByIdAsync(string tenantId, Guid id, CancellationToken cancellationToken = default) + { + if (usersById.TryGetValue(id, out var user) && string.Equals(user.TenantId, tenantId, StringComparison.OrdinalIgnoreCase)) + { + return Task.FromResult(Clone(user)); + } + + return Task.FromResult(null); + } + + public Task GetByUsernameAsync(string tenantId, string username, CancellationToken cancellationToken = default) + { + if (failNextLookup) + { + failNextLookup = false; + FailureObserved = true; + throw new InvalidOperationException("Transient failure"); + } + + if (!idsByTenantAndUsername.TryGetValue(GetUsernameKey(tenantId, username), out var userId)) + { + return Task.FromResult(null); + } + + return GetByIdAsync(tenantId, userId, cancellationToken); + } + + public Task GetBySubjectIdAsync(string tenantId, string subjectId, CancellationToken cancellationToken = default) + => Task.FromResult(null); + + public Task GetByEmailAsync(string tenantId, string email, CancellationToken cancellationToken = default) + { + var user = 
usersById.Values.FirstOrDefault(candidate => + string.Equals(candidate.TenantId, tenantId, StringComparison.OrdinalIgnoreCase) + && string.Equals(candidate.Email, email, StringComparison.OrdinalIgnoreCase)); + return Task.FromResult(user is null ? null : Clone(user)); + } + + public Task> GetAllAsync(string tenantId, bool? enabled = null, int limit = 100, int offset = 0, CancellationToken cancellationToken = default) + { + var results = usersById.Values + .Where(candidate => string.Equals(candidate.TenantId, tenantId, StringComparison.OrdinalIgnoreCase)) + .Where(candidate => enabled is null || candidate.Enabled == enabled.Value) + .Skip(offset) + .Take(limit) + .Select(Clone) + .ToList(); + return Task.FromResult>(results); + } + + public Task UpdateAsync(UserEntity user, CancellationToken cancellationToken = default) + { + var clone = Clone(user); + usersById[clone.Id] = clone; + idsByTenantAndUsername[GetUsernameKey(clone.TenantId, clone.Username)] = clone.Id; + return Task.FromResult(true); + } + + public Task DeleteAsync(string tenantId, Guid id, CancellationToken cancellationToken = default) + { + if (!usersById.TryGetValue(id, out var user) || !string.Equals(user.TenantId, tenantId, StringComparison.OrdinalIgnoreCase)) + { + return Task.FromResult(false); + } + + usersById.Remove(id); + idsByTenantAndUsername.Remove(GetUsernameKey(user.TenantId, user.Username)); + return Task.FromResult(true); + } + + public Task UpdatePasswordAsync(string tenantId, Guid userId, string passwordHash, string passwordSalt, CancellationToken cancellationToken = default) + => Task.FromResult(false); + + public Task RecordFailedLoginAsync(string tenantId, Guid userId, DateTimeOffset? 
lockUntil = null, CancellationToken cancellationToken = default) + => Task.FromResult(0); + + public Task RecordSuccessfulLoginAsync(string tenantId, Guid userId, CancellationToken cancellationToken = default) + => Task.CompletedTask; + + private static string GetUsernameKey(string tenantId, string username) + => $"{tenantId}::{username.Trim().ToLowerInvariant()}"; + + private static UserEntity Clone(UserEntity user) + => new() + { + Id = user.Id, + TenantId = user.TenantId, + Username = user.Username, + Email = user.Email, + DisplayName = user.DisplayName, + PasswordHash = user.PasswordHash, + PasswordSalt = user.PasswordSalt, + Enabled = user.Enabled, + Metadata = user.Metadata, + FailedLoginAttempts = user.FailedLoginAttempts, + LockedUntil = user.LockedUntil, + CreatedAt = user.CreatedAt, + UpdatedAt = user.UpdatedAt + }; + } + + private sealed class NoOpRoleRepository : IRoleRepository + { + public Task GetByIdAsync(string tenantId, Guid id, CancellationToken cancellationToken = default) => Task.FromResult(null); + public Task GetByNameAsync(string tenantId, string name, CancellationToken cancellationToken = default) => Task.FromResult(null); + public Task> ListAsync(string tenantId, CancellationToken cancellationToken = default) => Task.FromResult>(Array.Empty()); + public Task> GetUserRolesAsync(string tenantId, Guid userId, CancellationToken cancellationToken = default) => Task.FromResult>(Array.Empty()); + public Task CreateAsync(string tenantId, RoleEntity role, CancellationToken cancellationToken = default) => Task.FromResult(role.Id); + public Task UpdateAsync(string tenantId, RoleEntity role, CancellationToken cancellationToken = default) => Task.CompletedTask; + public Task DeleteAsync(string tenantId, Guid id, CancellationToken cancellationToken = default) => Task.CompletedTask; + public Task AssignToUserAsync(string tenantId, Guid userId, Guid roleId, string? grantedBy, DateTimeOffset? 
expiresAt, CancellationToken cancellationToken = default) => Task.CompletedTask; + public Task RemoveFromUserAsync(string tenantId, Guid userId, Guid roleId, CancellationToken cancellationToken = default) => Task.CompletedTask; + } + + private sealed class NoOpPermissionRepository : IPermissionRepository + { + public Task GetByIdAsync(string tenantId, Guid id, CancellationToken cancellationToken = default) => Task.FromResult(null); + public Task GetByNameAsync(string tenantId, string name, CancellationToken cancellationToken = default) => Task.FromResult(null); + public Task> ListAsync(string tenantId, CancellationToken cancellationToken = default) => Task.FromResult>(Array.Empty()); + public Task> GetByResourceAsync(string tenantId, string resource, CancellationToken cancellationToken = default) => Task.FromResult>(Array.Empty()); + public Task> GetRolePermissionsAsync(string tenantId, Guid roleId, CancellationToken cancellationToken = default) => Task.FromResult>(Array.Empty()); + public Task> GetUserPermissionsAsync(string tenantId, Guid userId, CancellationToken cancellationToken = default) => Task.FromResult>(Array.Empty()); + public Task CreateAsync(string tenantId, PermissionEntity permission, CancellationToken cancellationToken = default) => Task.FromResult(permission.Id); + public Task DeleteAsync(string tenantId, Guid id, CancellationToken cancellationToken = default) => Task.CompletedTask; + public Task AssignToRoleAsync(string tenantId, Guid roleId, Guid permissionId, CancellationToken cancellationToken = default) => Task.CompletedTask; + public Task RemoveFromRoleAsync(string tenantId, Guid roleId, Guid permissionId, CancellationToken cancellationToken = default) => Task.CompletedTask; + } + private sealed class NullAuditLogger : IStandardCredentialAuditLogger { public ValueTask RecordAsync( diff --git a/src/Authority/StellaOps.Authority/StellaOps.Authority.Plugin.Standard/Bootstrap/StandardPluginBootstrapper.cs 
b/src/Authority/StellaOps.Authority/StellaOps.Authority.Plugin.Standard/Bootstrap/StandardPluginBootstrapper.cs index bb3db1496..336f5f033 100644 --- a/src/Authority/StellaOps.Authority/StellaOps.Authority.Plugin.Standard/Bootstrap/StandardPluginBootstrapper.cs +++ b/src/Authority/StellaOps.Authority/StellaOps.Authority.Plugin.Standard/Bootstrap/StandardPluginBootstrapper.cs @@ -17,36 +17,83 @@ namespace StellaOps.Authority.Plugin.Standard.Bootstrap; internal sealed class StandardPluginBootstrapper : IHostedService { private const string DefaultTenantId = "default"; + private static readonly TimeSpan DefaultRetryDelay = TimeSpan.FromSeconds(2); private readonly string pluginName; private readonly IServiceScopeFactory scopeFactory; private readonly ILogger logger; + private readonly int maxAttempts; + private readonly TimeSpan retryDelay; public StandardPluginBootstrapper( string pluginName, IServiceScopeFactory scopeFactory, - ILogger logger) + ILogger logger, + int maxAttempts = 15, + TimeSpan? retryDelay = null) { this.pluginName = pluginName; this.scopeFactory = scopeFactory; this.logger = logger; + this.maxAttempts = Math.Max(1, maxAttempts); + this.retryDelay = retryDelay ?? DefaultRetryDelay; } public async Task StartAsync(CancellationToken cancellationToken) + { + for (var attempt = 1; attempt <= maxAttempts; attempt++) + { + cancellationToken.ThrowIfCancellationRequested(); + + try + { + await RunBootstrapPassAsync(cancellationToken).ConfigureAwait(false); + + if (attempt > 1) + { + logger.LogInformation( + "Standard Authority plugin '{PluginName}' bootstrap completed on retry attempt {Attempt}/{MaxAttempts}.", + pluginName, + attempt, + maxAttempts); + } + + return; + } + catch (Exception ex) + { + var finalAttempt = attempt == maxAttempts; + var level = finalAttempt ? LogLevel.Error : LogLevel.Warning; + + logger.Log( + level, + ex, + finalAttempt + ? "Standard Authority plugin '{PluginName}' bootstrap failed after {Attempt}/{MaxAttempts} attempts." 
+ : "Standard Authority plugin '{PluginName}' bootstrap attempt {Attempt}/{MaxAttempts} failed. Retrying in {RetryDelay}.", + pluginName, + attempt, + maxAttempts, + retryDelay); + + if (finalAttempt) + { + return; + } + + await Task.Delay(retryDelay, cancellationToken).ConfigureAwait(false); + } + } + } + + private async Task RunBootstrapPassAsync(CancellationToken cancellationToken) { using var scope = scopeFactory.CreateScope(); var optionsMonitor = scope.ServiceProvider.GetRequiredService>(); var options = optionsMonitor.Get(pluginName); var tenantId = options.TenantId ?? DefaultTenantId; - try - { - await EnsureBootstrapClientsAsync(scope.ServiceProvider, tenantId, options.BootstrapClients, cancellationToken).ConfigureAwait(false); - } - catch (Exception ex) - { - logger.LogError(ex, "Standard Authority plugin '{PluginName}' failed to ensure bootstrap clients.", pluginName); - } + await EnsureBootstrapClientsAsync(scope.ServiceProvider, tenantId, options.BootstrapClients, cancellationToken).ConfigureAwait(false); if (options.BootstrapUser is null || !options.BootstrapUser.IsConfigured) { @@ -55,25 +102,10 @@ internal sealed class StandardPluginBootstrapper : IHostedService var credentialStore = scope.ServiceProvider.GetRequiredService(); logger.LogInformation("Standard Authority plugin '{PluginName}' ensuring bootstrap user.", pluginName); - try - { - await credentialStore.EnsureBootstrapUserAsync(options.BootstrapUser, cancellationToken).ConfigureAwait(false); - } - catch (Exception ex) - { - logger.LogError(ex, "Standard Authority plugin '{PluginName}' failed to ensure bootstrap user.", pluginName); - } + await credentialStore.EnsureBootstrapUserAsync(options.BootstrapUser, cancellationToken).ConfigureAwait(false); var bootstrapRoles = options.BootstrapUser.Roles ?? 
new[] { "admin" }; - - try - { - await EnsureAdminRoleAsync(scope.ServiceProvider, tenantId, options.BootstrapUser.Username!, bootstrapRoles, cancellationToken).ConfigureAwait(false); - } - catch (Exception ex) - { - logger.LogError(ex, "Standard Authority plugin '{PluginName}' failed to seed admin role with scopes.", pluginName); - } + await EnsureAdminRoleAsync(scope.ServiceProvider, tenantId, options.BootstrapUser.Username!, bootstrapRoles, cancellationToken).ConfigureAwait(false); } private async Task EnsureAdminRoleAsync( diff --git a/src/Authority/StellaOps.Authority/StellaOps.Authority.Plugin.Standard/TASKS.md b/src/Authority/StellaOps.Authority/StellaOps.Authority.Plugin.Standard/TASKS.md index 0abca280e..72cbea5fb 100644 --- a/src/Authority/StellaOps.Authority/StellaOps.Authority.Plugin.Standard/TASKS.md +++ b/src/Authority/StellaOps.Authority/StellaOps.Authority.Plugin.Standard/TASKS.md @@ -8,3 +8,4 @@ Source of truth: `docs-archived/implplan/2025-12-29-csproj-audit/SPRINT_20251229 | AUDIT-0096-M | DONE | Revalidated 2026-01-06. | | AUDIT-0096-T | DONE | Revalidated 2026-01-06. | | AUDIT-0096-A | TODO | Revalidated 2026-01-06 (open findings). | +| STD-BOOT-01 | DONE | 2026-04-09: Bootstrap admin/client seeding now retries transient startup storage failures so the local admin account converges after PostgreSQL becomes reachable; source of truth is `SPRINT_20260409_002_Platform_local_stack_regression_retest.md`. |