diff --git a/docs/modules/authority/README.md b/docs/modules/authority/README.md index d5fdacb22..93b546e4d 100644 --- a/docs/modules/authority/README.md +++ b/docs/modules/authority/README.md @@ -24,10 +24,11 @@ Authority is the platform OIDC/OAuth2 control plane that mints short-lived, send - CLI/UI for login flows and token management. - Scheduler/Scanner for machine-to-machine scope enforcement. -## Operational notes -- PostgreSQL (schema `authority`) for tenant, client, and token state. -- Key material in KMS/HSM with rotation runbooks (`operations/key-rotation.md`). -- Monitoring runbook (`operations/monitoring.md`) and offline-import Grafana JSON (`operations/grafana-dashboard.json`). +## Operational notes +- PostgreSQL (schema `authority`) for tenant, client, and token state. +- Standard plugin bootstrap provisioning retries transient storage failures during startup so seeded local users/clients converge after PostgreSQL becomes reachable. +- Key material in KMS/HSM with rotation runbooks (`operations/key-rotation.md`). +- Monitoring runbook (`operations/monitoring.md`) and offline-import Grafana JSON (`operations/grafana-dashboard.json`). 
## Related resources - ./operations/backup-restore.md diff --git a/src/Authority/StellaOps.Authority/StellaOps.Authority.Plugin.Standard.Tests/StandardPluginBootstrapperTests.cs b/src/Authority/StellaOps.Authority/StellaOps.Authority.Plugin.Standard.Tests/StandardPluginBootstrapperTests.cs index 80a4e0ac5..86efaf65b 100644 --- a/src/Authority/StellaOps.Authority/StellaOps.Authority.Plugin.Standard.Tests/StandardPluginBootstrapperTests.cs +++ b/src/Authority/StellaOps.Authority/StellaOps.Authority.Plugin.Standard.Tests/StandardPluginBootstrapperTests.cs @@ -1,5 +1,6 @@ using System; using System.Collections.Generic; +using System.Linq; using System.Threading; using System.Threading.Tasks; using Microsoft.Extensions.DependencyInjection; @@ -60,7 +61,12 @@ public class StandardPluginBootstrapperTests sp.GetRequiredService())); services.AddSingleton(sp => - new StandardPluginBootstrapper("standard", sp.GetRequiredService(), NullLogger.Instance)); + new StandardPluginBootstrapper( + "standard", + sp.GetRequiredService(), + NullLogger.Instance, + maxAttempts: 1, + retryDelay: TimeSpan.Zero)); using var provider = services.BuildServiceProvider(); var bootstrapper = provider.GetRequiredService(); @@ -89,7 +95,7 @@ public class StandardPluginBootstrapperTests options.BootstrapUser = new BootstrapUserOptions { Username = "bootstrap", - Password = "Password1!", + Password = "Password1234!", RequirePasswordReset = false }; }); @@ -119,7 +125,12 @@ public class StandardPluginBootstrapperTests }); services.AddSingleton(sp => - new StandardPluginBootstrapper("standard", sp.GetRequiredService(), NullLogger.Instance)); + new StandardPluginBootstrapper( + "standard", + sp.GetRequiredService(), + NullLogger.Instance, + maxAttempts: 1, + retryDelay: TimeSpan.Zero)); using var provider = services.BuildServiceProvider(); var bootstrapper = provider.GetRequiredService(); @@ -129,6 +140,69 @@ public class StandardPluginBootstrapperTests Assert.Null(exception); } + [Trait("Category", 
TestCategories.Unit)] + [Fact] + public async Task StartAsync_RetriesBootstrapUserAfterTransientFailure() + { + var services = new ServiceCollection(); + services.AddOptions("standard") + .Configure(options => + { + options.TenantId = "tenant-1"; + options.BootstrapUser = new BootstrapUserOptions + { + Username = "bootstrap", + Password = "Password1234!", + RequirePasswordReset = false, + Roles = Array.Empty() + }; + }); + + var userRepository = new FlakyUserRepository(); + services.AddSingleton(userRepository); + services.AddSingleton(new NoOpRoleRepository()); + services.AddSingleton(new NoOpPermissionRepository()); + services.AddSingleton(); + services.AddSingleton(new FakeTimeProvider(DateTimeOffset.Parse("2025-12-29T13:00:00Z"))); + services.AddSingleton(new FixedStandardIdGenerator()); + services.AddSingleton(new DefaultCryptoProvider()); + + services.AddSingleton(sp => + { + var optionsMonitor = sp.GetRequiredService>(); + var options = optionsMonitor.Get("standard"); + var cryptoProvider = sp.GetRequiredService(); + var auditLogger = sp.GetRequiredService(); + return new StandardUserCredentialStore( + "standard", + "tenant-1", + sp.GetRequiredService(), + options, + new CryptoPasswordHasher(options, cryptoProvider), + auditLogger, + sp.GetRequiredService(), + sp.GetRequiredService(), + NullLogger.Instance); + }); + + services.AddSingleton(sp => + new StandardPluginBootstrapper( + "standard", + sp.GetRequiredService(), + NullLogger.Instance, + maxAttempts: 2, + retryDelay: TimeSpan.Zero)); + + using var provider = services.BuildServiceProvider(); + var bootstrapper = provider.GetRequiredService(); + + await bootstrapper.StartAsync(TestContext.Current.CancellationToken); + + var bootstrapUsers = await userRepository.GetAllAsync("tenant-1", cancellationToken: TestContext.Current.CancellationToken); + Assert.Contains(bootstrapUsers, user => string.Equals(user.Username, "bootstrap", StringComparison.OrdinalIgnoreCase)); + 
Assert.True(userRepository.FailureObserved); + } + private sealed class ThrowingUserRepository : IUserRepository { public Task CreateAsync(UserEntity user, CancellationToken cancellationToken = default) @@ -165,6 +239,150 @@ public class StandardPluginBootstrapperTests => throw new InvalidOperationException("Simulated failure"); } + private sealed class FlakyUserRepository : IUserRepository + { + private readonly Dictionary usersById = new(); + private readonly Dictionary idsByTenantAndUsername = new(StringComparer.OrdinalIgnoreCase); + private bool failNextLookup = true; + + public bool FailureObserved { get; private set; } + + public Task CreateAsync(UserEntity user, CancellationToken cancellationToken = default) + { + var clone = Clone(user); + usersById[clone.Id] = clone; + idsByTenantAndUsername[GetUsernameKey(clone.TenantId, clone.Username)] = clone.Id; + return Task.FromResult(Clone(clone)); + } + + public Task GetByIdAsync(string tenantId, Guid id, CancellationToken cancellationToken = default) + { + if (usersById.TryGetValue(id, out var user) && string.Equals(user.TenantId, tenantId, StringComparison.OrdinalIgnoreCase)) + { + return Task.FromResult(Clone(user)); + } + + return Task.FromResult(null); + } + + public Task GetByUsernameAsync(string tenantId, string username, CancellationToken cancellationToken = default) + { + if (failNextLookup) + { + failNextLookup = false; + FailureObserved = true; + throw new InvalidOperationException("Transient failure"); + } + + if (!idsByTenantAndUsername.TryGetValue(GetUsernameKey(tenantId, username), out var userId)) + { + return Task.FromResult(null); + } + + return GetByIdAsync(tenantId, userId, cancellationToken); + } + + public Task GetBySubjectIdAsync(string tenantId, string subjectId, CancellationToken cancellationToken = default) + => Task.FromResult(null); + + public Task GetByEmailAsync(string tenantId, string email, CancellationToken cancellationToken = default) + { + var user = 
usersById.Values.FirstOrDefault(candidate => + string.Equals(candidate.TenantId, tenantId, StringComparison.OrdinalIgnoreCase) + && string.Equals(candidate.Email, email, StringComparison.OrdinalIgnoreCase)); + return Task.FromResult(user is null ? null : Clone(user)); + } + + public Task> GetAllAsync(string tenantId, bool? enabled = null, int limit = 100, int offset = 0, CancellationToken cancellationToken = default) + { + var results = usersById.Values + .Where(candidate => string.Equals(candidate.TenantId, tenantId, StringComparison.OrdinalIgnoreCase)) + .Where(candidate => enabled is null || candidate.Enabled == enabled.Value) + .Skip(offset) + .Take(limit) + .Select(Clone) + .ToList(); + return Task.FromResult>(results); + } + + public Task UpdateAsync(UserEntity user, CancellationToken cancellationToken = default) + { + var clone = Clone(user); + usersById[clone.Id] = clone; + idsByTenantAndUsername[GetUsernameKey(clone.TenantId, clone.Username)] = clone.Id; + return Task.FromResult(true); + } + + public Task DeleteAsync(string tenantId, Guid id, CancellationToken cancellationToken = default) + { + if (!usersById.TryGetValue(id, out var user) || !string.Equals(user.TenantId, tenantId, StringComparison.OrdinalIgnoreCase)) + { + return Task.FromResult(false); + } + + usersById.Remove(id); + idsByTenantAndUsername.Remove(GetUsernameKey(user.TenantId, user.Username)); + return Task.FromResult(true); + } + + public Task UpdatePasswordAsync(string tenantId, Guid userId, string passwordHash, string passwordSalt, CancellationToken cancellationToken = default) + => Task.FromResult(false); + + public Task RecordFailedLoginAsync(string tenantId, Guid userId, DateTimeOffset? 
lockUntil = null, CancellationToken cancellationToken = default) + => Task.FromResult(0); + + public Task RecordSuccessfulLoginAsync(string tenantId, Guid userId, CancellationToken cancellationToken = default) + => Task.CompletedTask; + + private static string GetUsernameKey(string tenantId, string username) + => $"{tenantId}::{username.Trim().ToLowerInvariant()}"; + + private static UserEntity Clone(UserEntity user) + => new() + { + Id = user.Id, + TenantId = user.TenantId, + Username = user.Username, + Email = user.Email, + DisplayName = user.DisplayName, + PasswordHash = user.PasswordHash, + PasswordSalt = user.PasswordSalt, + Enabled = user.Enabled, + Metadata = user.Metadata, + FailedLoginAttempts = user.FailedLoginAttempts, + LockedUntil = user.LockedUntil, + CreatedAt = user.CreatedAt, + UpdatedAt = user.UpdatedAt + }; + } + + private sealed class NoOpRoleRepository : IRoleRepository + { + public Task GetByIdAsync(string tenantId, Guid id, CancellationToken cancellationToken = default) => Task.FromResult(null); + public Task GetByNameAsync(string tenantId, string name, CancellationToken cancellationToken = default) => Task.FromResult(null); + public Task> ListAsync(string tenantId, CancellationToken cancellationToken = default) => Task.FromResult>(Array.Empty()); + public Task> GetUserRolesAsync(string tenantId, Guid userId, CancellationToken cancellationToken = default) => Task.FromResult>(Array.Empty()); + public Task CreateAsync(string tenantId, RoleEntity role, CancellationToken cancellationToken = default) => Task.FromResult(role.Id); + public Task UpdateAsync(string tenantId, RoleEntity role, CancellationToken cancellationToken = default) => Task.CompletedTask; + public Task DeleteAsync(string tenantId, Guid id, CancellationToken cancellationToken = default) => Task.CompletedTask; + public Task AssignToUserAsync(string tenantId, Guid userId, Guid roleId, string? grantedBy, DateTimeOffset? 
expiresAt, CancellationToken cancellationToken = default) => Task.CompletedTask; + public Task RemoveFromUserAsync(string tenantId, Guid userId, Guid roleId, CancellationToken cancellationToken = default) => Task.CompletedTask; + } + + private sealed class NoOpPermissionRepository : IPermissionRepository + { + public Task GetByIdAsync(string tenantId, Guid id, CancellationToken cancellationToken = default) => Task.FromResult(null); + public Task GetByNameAsync(string tenantId, string name, CancellationToken cancellationToken = default) => Task.FromResult(null); + public Task> ListAsync(string tenantId, CancellationToken cancellationToken = default) => Task.FromResult>(Array.Empty()); + public Task> GetByResourceAsync(string tenantId, string resource, CancellationToken cancellationToken = default) => Task.FromResult>(Array.Empty()); + public Task> GetRolePermissionsAsync(string tenantId, Guid roleId, CancellationToken cancellationToken = default) => Task.FromResult>(Array.Empty()); + public Task> GetUserPermissionsAsync(string tenantId, Guid userId, CancellationToken cancellationToken = default) => Task.FromResult>(Array.Empty()); + public Task CreateAsync(string tenantId, PermissionEntity permission, CancellationToken cancellationToken = default) => Task.FromResult(permission.Id); + public Task DeleteAsync(string tenantId, Guid id, CancellationToken cancellationToken = default) => Task.CompletedTask; + public Task AssignToRoleAsync(string tenantId, Guid roleId, Guid permissionId, CancellationToken cancellationToken = default) => Task.CompletedTask; + public Task RemoveFromRoleAsync(string tenantId, Guid roleId, Guid permissionId, CancellationToken cancellationToken = default) => Task.CompletedTask; + } + private sealed class NullAuditLogger : IStandardCredentialAuditLogger { public ValueTask RecordAsync( diff --git a/src/Authority/StellaOps.Authority/StellaOps.Authority.Plugin.Standard/Bootstrap/StandardPluginBootstrapper.cs 
b/src/Authority/StellaOps.Authority/StellaOps.Authority.Plugin.Standard/Bootstrap/StandardPluginBootstrapper.cs index bb3db1496..336f5f033 100644 --- a/src/Authority/StellaOps.Authority/StellaOps.Authority.Plugin.Standard/Bootstrap/StandardPluginBootstrapper.cs +++ b/src/Authority/StellaOps.Authority/StellaOps.Authority.Plugin.Standard/Bootstrap/StandardPluginBootstrapper.cs @@ -17,36 +17,83 @@ namespace StellaOps.Authority.Plugin.Standard.Bootstrap; internal sealed class StandardPluginBootstrapper : IHostedService { private const string DefaultTenantId = "default"; + private static readonly TimeSpan DefaultRetryDelay = TimeSpan.FromSeconds(2); private readonly string pluginName; private readonly IServiceScopeFactory scopeFactory; private readonly ILogger logger; + private readonly int maxAttempts; + private readonly TimeSpan retryDelay; public StandardPluginBootstrapper( string pluginName, IServiceScopeFactory scopeFactory, - ILogger logger) + ILogger logger, + int maxAttempts = 15, + TimeSpan? retryDelay = null) { this.pluginName = pluginName; this.scopeFactory = scopeFactory; this.logger = logger; + this.maxAttempts = Math.Max(1, maxAttempts); + this.retryDelay = retryDelay ?? DefaultRetryDelay; } public async Task StartAsync(CancellationToken cancellationToken) + { + for (var attempt = 1; attempt <= maxAttempts; attempt++) + { + cancellationToken.ThrowIfCancellationRequested(); + + try + { + await RunBootstrapPassAsync(cancellationToken).ConfigureAwait(false); + + if (attempt > 1) + { + logger.LogInformation( + "Standard Authority plugin '{PluginName}' bootstrap completed on retry attempt {Attempt}/{MaxAttempts}.", + pluginName, + attempt, + maxAttempts); + } + + return; + } + catch (Exception ex) when (ex is not OperationCanceledException || !cancellationToken.IsCancellationRequested) // Host-requested cancellation must propagate: it is neither a transient storage failure to retry nor something to swallow on the final attempt. + { + var finalAttempt = attempt == maxAttempts; + var level = finalAttempt ? LogLevel.Error : LogLevel.Warning; + + logger.Log( + level, + ex, + finalAttempt + ? "Standard Authority plugin '{PluginName}' bootstrap failed after {Attempt}/{MaxAttempts} attempts." 
+ : "Standard Authority plugin '{PluginName}' bootstrap attempt {Attempt}/{MaxAttempts} failed. Retrying in {RetryDelay}.", + pluginName, + attempt, + maxAttempts, + retryDelay); + + if (finalAttempt) + { + return; + } + + await Task.Delay(retryDelay, cancellationToken).ConfigureAwait(false); + } + } + } + + private async Task RunBootstrapPassAsync(CancellationToken cancellationToken) { using var scope = scopeFactory.CreateScope(); var optionsMonitor = scope.ServiceProvider.GetRequiredService>(); var options = optionsMonitor.Get(pluginName); var tenantId = options.TenantId ?? DefaultTenantId; - try - { - await EnsureBootstrapClientsAsync(scope.ServiceProvider, tenantId, options.BootstrapClients, cancellationToken).ConfigureAwait(false); - } - catch (Exception ex) - { - logger.LogError(ex, "Standard Authority plugin '{PluginName}' failed to ensure bootstrap clients.", pluginName); - } + await EnsureBootstrapClientsAsync(scope.ServiceProvider, tenantId, options.BootstrapClients, cancellationToken).ConfigureAwait(false); if (options.BootstrapUser is null || !options.BootstrapUser.IsConfigured) { @@ -55,25 +102,10 @@ internal sealed class StandardPluginBootstrapper : IHostedService var credentialStore = scope.ServiceProvider.GetRequiredService(); logger.LogInformation("Standard Authority plugin '{PluginName}' ensuring bootstrap user.", pluginName); - try - { - await credentialStore.EnsureBootstrapUserAsync(options.BootstrapUser, cancellationToken).ConfigureAwait(false); - } - catch (Exception ex) - { - logger.LogError(ex, "Standard Authority plugin '{PluginName}' failed to ensure bootstrap user.", pluginName); - } + await credentialStore.EnsureBootstrapUserAsync(options.BootstrapUser, cancellationToken).ConfigureAwait(false); var bootstrapRoles = options.BootstrapUser.Roles ?? 
new[] { "admin" }; - - try - { - await EnsureAdminRoleAsync(scope.ServiceProvider, tenantId, options.BootstrapUser.Username!, bootstrapRoles, cancellationToken).ConfigureAwait(false); - } - catch (Exception ex) - { - logger.LogError(ex, "Standard Authority plugin '{PluginName}' failed to seed admin role with scopes.", pluginName); - } + await EnsureAdminRoleAsync(scope.ServiceProvider, tenantId, options.BootstrapUser.Username!, bootstrapRoles, cancellationToken).ConfigureAwait(false); } private async Task EnsureAdminRoleAsync( diff --git a/src/Authority/StellaOps.Authority/StellaOps.Authority.Plugin.Standard/TASKS.md b/src/Authority/StellaOps.Authority/StellaOps.Authority.Plugin.Standard/TASKS.md index 0abca280e..72cbea5fb 100644 --- a/src/Authority/StellaOps.Authority/StellaOps.Authority.Plugin.Standard/TASKS.md +++ b/src/Authority/StellaOps.Authority/StellaOps.Authority.Plugin.Standard/TASKS.md @@ -8,3 +8,4 @@ Source of truth: `docs-archived/implplan/2025-12-29-csproj-audit/SPRINT_20251229 | AUDIT-0096-M | DONE | Revalidated 2026-01-06. | | AUDIT-0096-T | DONE | Revalidated 2026-01-06. | | AUDIT-0096-A | TODO | Revalidated 2026-01-06 (open findings). | +| STD-BOOT-01 | DONE | 2026-04-09: Bootstrap admin/client seeding now retries transient startup storage failures so the local admin account converges after PostgreSQL becomes reachable; source of truth is `SPRINT_20260409_002_Platform_local_stack_regression_retest.md`. |