Restructure solution layout by module
This commit is contained in:
@@ -0,0 +1,26 @@
|
||||
using StellaOps.Zastava.Observer.Configuration;
|
||||
|
||||
namespace StellaOps.Zastava.Observer.Worker;
|
||||
|
||||
/// <summary>
/// Computes retry delays that double with every attempt, are capped at a configured
/// maximum and can optionally be randomised with symmetric jitter.
/// </summary>
internal static class BackoffCalculator
{
    /// <summary>
    /// Computes the delay to wait before the given retry attempt.
    /// </summary>
    /// <param name="options">Backoff settings; <c>Initial</c>, <c>Max</c> and <c>JitterRatio</c> are read.</param>
    /// <param name="attempt">1-based attempt number; values below 1 are treated as 1.</param>
    /// <param name="random">Random source, consulted only when the jitter ratio is positive.</param>
    /// <returns>
    /// <c>Initial * 2^(attempt - 1)</c> capped at <c>Max</c>; with jitter enabled the value is moved by up to
    /// <c>±JitterRatio</c> of itself and then kept within <c>[min(Initial, Max), Max]</c>.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="options"/> or <paramref name="random"/> is null.</exception>
    public static TimeSpan ComputeDelay(ObserverBackoffOptions options, int attempt, Random random)
    {
        ArgumentNullException.ThrowIfNull(options);
        ArgumentNullException.ThrowIfNull(random);

        var initialMs = options.Initial.TotalMilliseconds;
        var maxMs = options.Max.TotalMilliseconds;

        var effectiveAttempt = Math.Max(1, attempt);
        var baseDelayMs = initialMs * Math.Pow(2, effectiveAttempt - 1);
        if (double.IsNaN(baseDelayMs))
        {
            // For very large attempts Math.Pow returns +Infinity and a zero initial delay then
            // yields 0 * Infinity = NaN, which TimeSpan.FromMilliseconds rejects. The intended value is zero.
            baseDelayMs = 0;
        }

        baseDelayMs = Math.Min(baseDelayMs, maxMs);

        // Written as a negated comparison so that a NaN ratio is treated as "no jitter".
        if (!(options.JitterRatio > 0))
        {
            return TimeSpan.FromMilliseconds(baseDelayMs);
        }

        var jitterWindow = baseDelayMs * options.JitterRatio;
        var jitter = (random.NextDouble() * 2 - 1) * jitterWindow;

        // Math.Clamp throws when min > max; if Initial is configured above Max the jitter path
        // would fail while the no-jitter path succeeds, so the lower bound is limited to Max.
        var floorMs = Math.Min(initialMs, maxMs);
        var jittered = Math.Clamp(baseDelayMs + jitter, floorMs, maxMs);
        return TimeSpan.FromMilliseconds(jittered);
    }
}
|
||||
@@ -0,0 +1,197 @@
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using Microsoft.Extensions.Hosting;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
using StellaOps.Zastava.Core.Contracts;
|
||||
using StellaOps.Zastava.Core.Configuration;
|
||||
using StellaOps.Zastava.Core.Diagnostics;
|
||||
using StellaOps.Zastava.Observer.Configuration;
|
||||
using StellaOps.Zastava.Observer.ContainerRuntime;
|
||||
using StellaOps.Zastava.Observer.ContainerRuntime.Cri;
|
||||
using StellaOps.Zastava.Observer.Runtime;
|
||||
|
||||
namespace StellaOps.Zastava.Observer.Worker;
|
||||
|
||||
internal sealed class ContainerLifecycleHostedService : BackgroundService
|
||||
{
|
||||
private readonly ICriRuntimeClientFactory clientFactory;
|
||||
private readonly IOptionsMonitor<ZastavaObserverOptions> observerOptions;
|
||||
private readonly IOptionsMonitor<ZastavaRuntimeOptions> runtimeOptions;
|
||||
private readonly IZastavaLogScopeBuilder logScopeBuilder;
|
||||
private readonly IZastavaRuntimeMetrics runtimeMetrics;
|
||||
private readonly IRuntimeEventBuffer eventBuffer;
|
||||
private readonly ContainerStateTrackerFactory trackerFactory;
|
||||
private readonly ContainerRuntimePoller poller;
|
||||
private readonly IRuntimeProcessCollector processCollector;
|
||||
private readonly TimeProvider timeProvider;
|
||||
private readonly ILogger<ContainerLifecycleHostedService> logger;
|
||||
private readonly Random jitterRandom = new();
|
||||
|
||||
/// <summary>
/// Creates the lifecycle watcher; every dependency is required.
/// </summary>
/// <exception cref="ArgumentNullException">Any argument is null.</exception>
public ContainerLifecycleHostedService(
    ICriRuntimeClientFactory clientFactory,
    IOptionsMonitor<ZastavaObserverOptions> observerOptions,
    IOptionsMonitor<ZastavaRuntimeOptions> runtimeOptions,
    IZastavaLogScopeBuilder logScopeBuilder,
    IZastavaRuntimeMetrics runtimeMetrics,
    IRuntimeEventBuffer eventBuffer,
    ContainerStateTrackerFactory trackerFactory,
    ContainerRuntimePoller poller,
    IRuntimeProcessCollector processCollector,
    TimeProvider timeProvider,
    ILogger<ContainerLifecycleHostedService> logger)
{
    // Validate in declaration order so the first missing dependency is the one reported.
    ArgumentNullException.ThrowIfNull(clientFactory);
    ArgumentNullException.ThrowIfNull(observerOptions);
    ArgumentNullException.ThrowIfNull(runtimeOptions);
    ArgumentNullException.ThrowIfNull(logScopeBuilder);
    ArgumentNullException.ThrowIfNull(runtimeMetrics);
    ArgumentNullException.ThrowIfNull(eventBuffer);
    ArgumentNullException.ThrowIfNull(trackerFactory);
    ArgumentNullException.ThrowIfNull(poller);
    ArgumentNullException.ThrowIfNull(processCollector);
    ArgumentNullException.ThrowIfNull(timeProvider);
    ArgumentNullException.ThrowIfNull(logger);

    this.clientFactory = clientFactory;
    this.observerOptions = observerOptions;
    this.runtimeOptions = runtimeOptions;
    this.logScopeBuilder = logScopeBuilder;
    this.runtimeMetrics = runtimeMetrics;
    this.eventBuffer = eventBuffer;
    this.trackerFactory = trackerFactory;
    this.poller = poller;
    this.processCollector = processCollector;
    this.timeProvider = timeProvider;
    this.logger = logger;
}
|
||||
|
||||
/// <summary>
/// Starts one monitor per enabled runtime endpoint and completes when all of them have finished.
/// </summary>
/// <param name="stoppingToken">Token handed to every monitor.</param>
/// <returns>A completed task when no endpoint is enabled; otherwise the combined monitor task.</returns>
protected override Task ExecuteAsync(CancellationToken stoppingToken)
{
    // Snapshot the enabled endpoints first; monitors are only started once the list is complete.
    var enabledEndpoints = new List<ContainerRuntimeEndpointOptions>();
    foreach (var candidate in observerOptions.CurrentValue.Runtimes)
    {
        if (candidate.Enabled)
        {
            enabledEndpoints.Add(candidate);
        }
    }

    if (enabledEndpoints.Count == 0)
    {
        logger.LogWarning("No container runtime endpoints configured; lifecycle watcher idle.");
        return Task.CompletedTask;
    }

    var monitors = new Task[enabledEndpoints.Count];
    for (var index = 0; index < monitors.Length; index++)
    {
        monitors[index] = MonitorRuntimeAsync(enabledEndpoints[index], stoppingToken);
    }

    return Task.WhenAll(monitors);
}
|
||||
|
||||
/// <summary>
/// Watches a single runtime endpoint until cancellation: creates a client, resolves the runtime
/// identity, then polls in a loop and publishes any generated events. After a failure the client is
/// disposed and recreated once a backoff delay has elapsed.
/// </summary>
/// <param name="endpoint">Endpoint to monitor.</param>
/// <param name="cancellationToken">Stops the monitor when signalled.</param>
private async Task MonitorRuntimeAsync(ContainerRuntimeEndpointOptions endpoint, CancellationToken cancellationToken)
{
    // Upper bound for the attempt number passed to the backoff calculator. 2^63 times any non-zero
    // initial delay already exceeds what a TimeSpan can represent, so larger values add nothing.
    const int MaxBackoffAttempt = 64;

    // Options are read once; later configuration changes are not picked up by a running monitor.
    var runtime = runtimeOptions.CurrentValue;
    var tenant = runtime.Tenant;
    var nodeName = observerOptions.CurrentValue.NodeName;
    var pollInterval = endpoint.PollInterval ?? observerOptions.CurrentValue.PollInterval;
    var backoffOptions = observerOptions.CurrentValue.Backoff;

    // Counts failures since the last successful poll. It lives outside the reconnect loop so the
    // backoff keeps growing across client recreations; previously every failure was reported as
    // attempt 1 and the delay never increased beyond the initial value.
    var consecutiveFailures = 0;

    while (!cancellationToken.IsCancellationRequested)
    {
        await using var client = clientFactory.Create(endpoint);
        CriRuntimeIdentity identity;
        try
        {
            identity = await client.GetIdentityAsync(cancellationToken).ConfigureAwait(false);
        }
        catch (Exception ex) when (!cancellationToken.IsCancellationRequested)
        {
            consecutiveFailures = Math.Min(consecutiveFailures + 1, MaxBackoffAttempt);
            await HandleFailureAsync(endpoint, consecutiveFailures, backoffOptions, ex, cancellationToken).ConfigureAwait(false);
            continue;
        }

        // A fresh tracker is used for every client instance.
        var tracker = trackerFactory.Create();

        while (!cancellationToken.IsCancellationRequested)
        {
            try
            {
                var envelopes = await poller.PollAsync(
                    tracker,
                    client,
                    endpoint,
                    identity,
                    tenant,
                    nodeName,
                    timeProvider,
                    processCollector,
                    cancellationToken).ConfigureAwait(false);

                if (envelopes.Count > 0)
                {
                    await PublishAsync(endpoint, envelopes, cancellationToken).ConfigureAwait(false);
                }

                // Only a complete poll/publish cycle counts as recovery.
                consecutiveFailures = 0;
                await Task.Delay(pollInterval, cancellationToken).ConfigureAwait(false);
            }
            catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
            {
                return;
            }
            catch (Exception ex) when (!cancellationToken.IsCancellationRequested)
            {
                consecutiveFailures = Math.Min(consecutiveFailures + 1, MaxBackoffAttempt);
                await HandleFailureAsync(endpoint, consecutiveFailures, backoffOptions, ex, cancellationToken).ConfigureAwait(false);
                break; // recreate client
            }
        }
    }
}
|
||||
|
||||
/// <summary>
/// Increments the runtime-events counter and writes an informational log entry for each envelope,
/// then hands the whole batch to the event buffer.
/// </summary>
/// <param name="endpoint">Endpoint the envelopes were observed on; its resolved name is used for tags and log scope.</param>
/// <param name="envelopes">Envelopes to record and buffer.</param>
/// <param name="cancellationToken">Token passed to the buffer write.</param>
private async Task PublishAsync(ContainerRuntimeEndpointOptions endpoint, IReadOnlyList<RuntimeEventEnvelope> envelopes, CancellationToken cancellationToken)
{
    var endpointName = endpoint.ResolveName();

    foreach (var envelope in envelopes)
    {
        var runtimeEvent = envelope.Event;
        var kindName = runtimeEvent.Kind.ToString();

        // Metric tags: the default tags first, followed by the endpoint and the lower-cased event kind.
        var tagList = new List<KeyValuePair<string, object?>>(runtimeMetrics.DefaultTags)
        {
            new KeyValuePair<string, object?>("runtime_endpoint", endpointName),
            new KeyValuePair<string, object?>("event_kind", kindName.ToLowerInvariant())
        };
        runtimeMetrics.RuntimeEvents.Add(1, tagList.ToArray());

        var scopeExtras = new Dictionary<string, string>
        {
            ["runtimeEndpoint"] = endpointName,
            ["kind"] = kindName
        };

        var scope = logScopeBuilder.BuildScope(
            correlationId: runtimeEvent.EventId,
            node: runtimeEvent.Node,
            workload: runtimeEvent.Workload.ContainerId,
            eventId: runtimeEvent.EventId,
            additional: scopeExtras);

        using (logger.BeginScope(scope))
        {
            logger.LogInformation("Observed container {ContainerId} ({Kind}) for node {Node}.",
                runtimeEvent.Workload.ContainerId,
                runtimeEvent.Kind,
                runtimeEvent.Node);
        }
    }

    await eventBuffer.WriteBatchAsync(envelopes, cancellationToken).ConfigureAwait(false);
}
|
||||
|
||||
/// <summary>
/// Logs a watcher failure and waits for the computed backoff delay before returning.
/// </summary>
/// <param name="endpoint">Endpoint whose watcher failed; its resolved name is logged.</param>
/// <param name="failureCount">Attempt number passed to the backoff calculator and logged.</param>
/// <param name="backoffOptions">Backoff settings used to compute the delay.</param>
/// <param name="exception">Failure that triggered the retry.</param>
/// <param name="cancellationToken">Cancels the wait; cancellation is not surfaced to the caller.</param>
private async Task HandleFailureAsync(
    ContainerRuntimeEndpointOptions endpoint,
    int failureCount,
    ObserverBackoffOptions backoffOptions,
    Exception exception,
    CancellationToken cancellationToken)
{
    TimeSpan delay;

    // One monitor runs per enabled endpoint and they all share jitterRandom. System.Random is not
    // thread-safe, so access is serialised; the lock is released before anything is awaited.
    lock (jitterRandom)
    {
        delay = BackoffCalculator.ComputeDelay(backoffOptions, failureCount, jitterRandom);
    }

    logger.LogWarning(exception, "Runtime watcher for {Endpoint} encountered error (attempt {Attempt}); retrying after {Delay}.",
        endpoint.ResolveName(),
        failureCount,
        delay);

    try
    {
        await Task.Delay(delay, cancellationToken).ConfigureAwait(false);
    }
    catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
    {
        // Deliberately swallowed: the caller re-checks the token and leaves its loop.
    }
}
|
||||
}
|
||||
@@ -0,0 +1,124 @@
|
||||
using Microsoft.Extensions.Logging;
|
||||
using StellaOps.Zastava.Core.Contracts;
|
||||
using StellaOps.Zastava.Observer.Configuration;
|
||||
using StellaOps.Zastava.Observer.ContainerRuntime;
|
||||
using StellaOps.Zastava.Observer.ContainerRuntime.Cri;
|
||||
using StellaOps.Zastava.Observer.Cri;
|
||||
using StellaOps.Zastava.Observer.Posture;
|
||||
using StellaOps.Zastava.Observer.Runtime;
|
||||
|
||||
namespace StellaOps.Zastava.Observer.Worker;
|
||||
|
||||
/// <summary>
/// Performs a single poll of a CRI runtime client and turns the lifecycle events reported by the
/// state tracker into runtime event envelopes.
/// </summary>
internal sealed class ContainerRuntimePoller
{
    private readonly ILogger<ContainerRuntimePoller> logger;
    private readonly IRuntimePostureEvaluator? postureEvaluator;

    /// <summary>
    /// Creates a poller.
    /// </summary>
    /// <param name="logger">Required logger.</param>
    /// <param name="postureEvaluator">Optional evaluator; when absent no posture is attached to events.</param>
    /// <exception cref="ArgumentNullException"><paramref name="logger"/> is null.</exception>
    public ContainerRuntimePoller(ILogger<ContainerRuntimePoller> logger, IRuntimePostureEvaluator? postureEvaluator = null)
    {
        ArgumentNullException.ThrowIfNull(logger);

        this.logger = logger;
        this.postureEvaluator = postureEvaluator;
    }

    /// <summary>
    /// Runs one tracker cycle: lists running containers, marks each one as running, completes the
    /// cycle to obtain stop events, and returns the resulting envelopes ordered by event time and
    /// then by container id (ordinal).
    /// </summary>
    /// <param name="tracker">State tracker driven through begin / mark-running / complete.</param>
    /// <param name="client">Runtime client used to list containers and fetch their status.</param>
    /// <param name="endpoint">Endpoint configuration forwarded to the event factory.</param>
    /// <param name="identity">Runtime identity forwarded to the event factory.</param>
    /// <param name="tenant">Tenant forwarded to the event factory.</param>
    /// <param name="nodeName">Node name forwarded to the event factory.</param>
    /// <param name="timeProvider">Supplies the single timestamp used for the whole cycle.</param>
    /// <param name="processCollector">Optional collector, invoked only for start events.</param>
    /// <param name="cancellationToken">Token passed to every asynchronous call.</param>
    /// <returns>The ordered envelopes, or an empty array when nothing was generated.</returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="tracker"/>, <paramref name="client"/>, <paramref name="endpoint"/>,
    /// <paramref name="identity"/> or <paramref name="timeProvider"/> is null.
    /// </exception>
    public async Task<IReadOnlyList<RuntimeEventEnvelope>> PollAsync(
        ContainerStateTracker tracker,
        ICriRuntimeClient client,
        ContainerRuntimeEndpointOptions endpoint,
        CriRuntimeIdentity identity,
        string tenant,
        string nodeName,
        TimeProvider timeProvider,
        IRuntimeProcessCollector? processCollector,
        CancellationToken cancellationToken)
    {
        ArgumentNullException.ThrowIfNull(tracker);
        ArgumentNullException.ThrowIfNull(client);
        ArgumentNullException.ThrowIfNull(endpoint);
        ArgumentNullException.ThrowIfNull(identity);
        ArgumentNullException.ThrowIfNull(timeProvider);

        var cycleTimestamp = timeProvider.GetUtcNow();
        tracker.BeginCycle();

        var running = await client.ListContainersAsync(ContainerState.ContainerRunning, cancellationToken).ConfigureAwait(false);
        var envelopes = new List<RuntimeEventEnvelope>();

        foreach (var container in running)
        {
            // Use the detailed status when the runtime returns one; otherwise keep the list entry.
            var status = await client.GetContainerStatusAsync(container.Id, cancellationToken).ConfigureAwait(false);
            var enriched = status ?? container;

            var startEvent = tracker.MarkRunning(enriched, cycleTimestamp);
            if (startEvent is null)
            {
                // The tracker reported no lifecycle event for this container.
                continue;
            }

            RuntimeProcessCapture? capture = processCollector is not null && startEvent.Kind == ContainerLifecycleEventKind.Start
                ? await processCollector.CollectAsync(enriched, cancellationToken).ConfigureAwait(false)
                : null;

            RuntimePostureEvaluationResult? posture = this.postureEvaluator is null
                ? null
                : await this.postureEvaluator.EvaluateAsync(enriched, cancellationToken).ConfigureAwait(false);

            envelopes.Add(RuntimeEventFactory.Create(
                startEvent,
                endpoint,
                identity,
                tenant,
                nodeName,
                capture,
                posture?.Posture,
                posture?.Evidence));
        }

        var stopEvents = await tracker.CompleteCycleAsync(
            id => client.GetContainerStatusAsync(id, cancellationToken),
            cycleTimestamp,
            cancellationToken).ConfigureAwait(false);

        foreach (var stopEvent in stopEvents)
        {
            RuntimePostureEvaluationResult? posture = this.postureEvaluator is null
                ? null
                : await this.postureEvaluator.EvaluateAsync(stopEvent.Snapshot, cancellationToken).ConfigureAwait(false);

            // Events from cycle completion never carry a process capture.
            envelopes.Add(RuntimeEventFactory.Create(
                stopEvent,
                endpoint,
                identity,
                tenant,
                nodeName,
                null,
                posture?.Posture,
                posture?.Evidence));
        }

        if (envelopes.Count == 0)
        {
            return Array.Empty<RuntimeEventEnvelope>();
        }

        var ordered = envelopes
            .OrderBy(static item => item.Event.When)
            .ThenBy(static item => item.Event.Workload.ContainerId, StringComparer.Ordinal)
            .ToArray();

        logger.LogDebug("Generated {Count} runtime events for endpoint {EndpointName}.", ordered.Length, endpoint.ResolveName());
        return ordered;
    }
}
|
||||
@@ -0,0 +1,51 @@
|
||||
using Microsoft.Extensions.Hosting;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
using StellaOps.Zastava.Core.Configuration;
|
||||
using StellaOps.Zastava.Core.Diagnostics;
|
||||
using StellaOps.Zastava.Core.Security;
|
||||
|
||||
namespace StellaOps.Zastava.Observer.Worker;
|
||||
|
||||
/// <summary>
/// Minimal bootstrap worker ensuring runtime core wiring is exercised.
/// </summary>
internal sealed class ObserverBootstrapService : BackgroundService
{
    private readonly IZastavaLogScopeBuilder logScopeBuilder;
    private readonly IZastavaRuntimeMetrics runtimeMetrics;
    private readonly IZastavaAuthorityTokenProvider authorityTokenProvider;
    private readonly IHostApplicationLifetime applicationLifetime;
    private readonly ILogger<ObserverBootstrapService> logger;
    private readonly ZastavaRuntimeOptions runtimeOptions;

    /// <summary>
    /// Creates the bootstrap worker; every dependency is required.
    /// </summary>
    /// <exception cref="ArgumentNullException">Any argument is null.</exception>
    public ObserverBootstrapService(
        IZastavaLogScopeBuilder logScopeBuilder,
        IZastavaRuntimeMetrics runtimeMetrics,
        IZastavaAuthorityTokenProvider authorityTokenProvider,
        IOptions<ZastavaRuntimeOptions> runtimeOptions,
        IHostApplicationLifetime applicationLifetime,
        ILogger<ObserverBootstrapService> logger)
    {
        // Fail fast with a descriptive exception, matching the other workers in this namespace,
        // instead of a NullReferenceException here or later in ExecuteAsync.
        ArgumentNullException.ThrowIfNull(logScopeBuilder);
        ArgumentNullException.ThrowIfNull(runtimeMetrics);
        ArgumentNullException.ThrowIfNull(authorityTokenProvider);
        ArgumentNullException.ThrowIfNull(runtimeOptions);
        ArgumentNullException.ThrowIfNull(applicationLifetime);
        ArgumentNullException.ThrowIfNull(logger);

        this.logScopeBuilder = logScopeBuilder;
        this.runtimeMetrics = runtimeMetrics;
        this.authorityTokenProvider = authorityTokenProvider;
        this.applicationLifetime = applicationLifetime;
        this.logger = logger;
        this.runtimeOptions = runtimeOptions.Value;
    }

    /// <summary>
    /// Logs the configured tenant, component and metrics meter, registers a callback that logs once
    /// the application has started, and completes immediately.
    /// </summary>
    /// <param name="stoppingToken">Not used; the method performs no long-running work.</param>
    protected override Task ExecuteAsync(CancellationToken stoppingToken)
    {
        var scope = logScopeBuilder.BuildScope(eventId: "observer.bootstrap");
        using (logger.BeginScope(scope))
        {
            logger.LogInformation("Zastava observer runtime core initialised for tenant {Tenant}, component {Component}.", runtimeOptions.Tenant, runtimeOptions.Component);
            logger.LogDebug("Observer metrics meter {MeterName} registered with {TagCount} default tags.", runtimeMetrics.Meter.Name, runtimeMetrics.DefaultTags.Count);
        }

        // Observer implementation will hook into the authority token provider when connectors arrive.
        applicationLifetime.ApplicationStarted.Register(() => logger.LogInformation("Observer bootstrap complete."));
        return Task.CompletedTask;
    }
}
|
||||
@@ -0,0 +1,225 @@
|
||||
using System.Linq;
|
||||
using System.Net;
|
||||
using Microsoft.Extensions.Hosting;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
using StellaOps.Zastava.Observer.Backend;
|
||||
using StellaOps.Zastava.Observer.Configuration;
|
||||
using StellaOps.Zastava.Observer.Runtime;
|
||||
|
||||
namespace StellaOps.Zastava.Observer.Worker;
|
||||
|
||||
internal sealed class RuntimeEventDispatchService : BackgroundService
|
||||
{
|
||||
private readonly IRuntimeEventBuffer buffer;
|
||||
private readonly IRuntimeEventsClient eventsClient;
|
||||
private readonly IOptionsMonitor<ZastavaObserverOptions> observerOptions;
|
||||
private readonly TimeProvider timeProvider;
|
||||
private readonly ILogger<RuntimeEventDispatchService> logger;
|
||||
|
||||
/// <summary>
/// Creates the dispatch service; every dependency is required.
/// </summary>
/// <exception cref="ArgumentNullException">Any argument is null.</exception>
public RuntimeEventDispatchService(
    IRuntimeEventBuffer buffer,
    IRuntimeEventsClient eventsClient,
    IOptionsMonitor<ZastavaObserverOptions> observerOptions,
    TimeProvider timeProvider,
    ILogger<RuntimeEventDispatchService> logger)
{
    // Validate in declaration order so the first missing dependency is the one reported.
    ArgumentNullException.ThrowIfNull(buffer);
    ArgumentNullException.ThrowIfNull(eventsClient);
    ArgumentNullException.ThrowIfNull(observerOptions);
    ArgumentNullException.ThrowIfNull(timeProvider);
    ArgumentNullException.ThrowIfNull(logger);

    this.buffer = buffer;
    this.eventsClient = eventsClient;
    this.observerOptions = observerOptions;
    this.timeProvider = timeProvider;
    this.logger = logger;
}
|
||||
|
||||
/// <summary>
/// Drains the event buffer into batches and flushes a batch when it reaches the configured size
/// (clamped to 1..512) or when the flush timer started for a non-empty batch elapses.
/// </summary>
/// <param name="stoppingToken">Stops reading; a non-empty batch is not flushed in the final cleanup once it is signalled.</param>
protected override async Task ExecuteAsync(CancellationToken stoppingToken)
{
    var pending = new List<RuntimeEventBufferItem>();
    var reader = buffer.ReadAllAsync(stoppingToken).GetAsyncEnumerator(stoppingToken);
    Task<bool>? readTask = null;
    Task? flushTimer = null;
    CancellationTokenSource? flushTimerCts = null;

    try
    {
        while (!stoppingToken.IsCancellationRequested)
        {
            // Keep exactly one read in flight; it is reused across iterations until it completes.
            readTask ??= reader.MoveNextAsync().AsTask();

            // The timer only runs while there is something waiting to be flushed.
            if (flushTimer is null && pending.Count > 0)
            {
                StartFlushTimer(ref flushTimer, ref flushTimerCts, stoppingToken);
            }

            Task winner;
            if (flushTimer is not null)
            {
                winner = await Task.WhenAny(readTask, flushTimer).ConfigureAwait(false);
            }
            else
            {
                winner = await Task.WhenAny(readTask).ConfigureAwait(false);
            }

            if (winner != readTask)
            {
                // flush timer triggered
                ResetFlushTimer(ref flushTimer, ref flushTimerCts);
                if (pending.Count > 0)
                {
                    await FlushAsync(pending, stoppingToken).ConfigureAwait(false);
                }

                continue;
            }

            var hasItem = await readTask.ConfigureAwait(false);
            if (!hasItem)
            {
                // The buffer stream ended; the finally block flushes what is left.
                break;
            }

            pending.Add(reader.Current);
            readTask = null;

            var batchLimit = Math.Clamp(observerOptions.CurrentValue.PublishBatchSize, 1, 512);
            if (pending.Count >= batchLimit)
            {
                ResetFlushTimer(ref flushTimer, ref flushTimerCts);
                await FlushAsync(pending, stoppingToken).ConfigureAwait(false);
            }
        }
    }
    finally
    {
        ResetFlushTimer(ref flushTimer, ref flushTimerCts);

        if (!stoppingToken.IsCancellationRequested && pending.Count > 0)
        {
            await FlushAsync(pending, stoppingToken).ConfigureAwait(false);
        }

        if (readTask is not null)
        {
            // Observe the outstanding read before disposing the enumerator; its outcome is irrelevant here.
            try
            {
                await readTask.ConfigureAwait(false);
            }
            catch
            {
                /* ignored */
            }
        }

        await reader.DisposeAsync().ConfigureAwait(false);
    }
}
|
||||
|
||||
/// <summary>
/// Publishes the batch as one ingest request. On success every item is completed; on rate limiting,
/// an unclassified non-success result, or an exception the items are requeued and the method waits
/// before returning. The batch list is always cleared afterwards.
/// </summary>
/// <param name="batch">Items to publish; emptied before the method returns.</param>
/// <param name="cancellationToken">Token for publish, requeue and delay operations.</param>
private async Task FlushAsync(List<RuntimeEventBufferItem> batch, CancellationToken cancellationToken)
{
    if (batch.Count == 0)
    {
        return;
    }

    // The timestamp is formatted with the invariant culture so the batch id does not change with
    // the process culture (for example on hosts whose culture uses a non-Gregorian calendar).
    var batchTimestamp = timeProvider.GetUtcNow()
        .ToString("yyyyMMddTHHmmssfff", System.Globalization.CultureInfo.InvariantCulture);

    var request = new RuntimeEventsIngestRequest
    {
        BatchId = $"obs-{batchTimestamp}-{Guid.NewGuid():N}",
        Events = batch.Select(item => item.Envelope).ToArray()
    };

    try
    {
        var result = await eventsClient.PublishAsync(request, cancellationToken).ConfigureAwait(false);
        if (result.Success)
        {
            foreach (var item in batch)
            {
                await item.CompleteAsync().ConfigureAwait(false);
            }

            logger.LogInformation("Runtime events batch published (batchId={BatchId}, accepted={Accepted}, duplicates={Duplicates}).",
                request.BatchId,
                result.Accepted,
                result.Duplicates);
        }
        else if (result.RateLimited)
        {
            logger.LogWarning("Runtime events batch rate limited (batchId={BatchId}); retrying after {RetryAfter}.",
                request.BatchId,
                result.RetryAfter);
            await RequeueBatchAsync(batch, cancellationToken).ConfigureAwait(false);
            await DelayAsync(result.RetryAfter, cancellationToken).ConfigureAwait(false);
        }
        else
        {
            // Previously this case fell through and the batch was cleared without being completed
            // or requeued. Treat it like the other failure paths so the events are retried.
            logger.LogWarning("Runtime events batch was not accepted (batchId={BatchId}); batch will be retried.", request.BatchId);
            await RequeueBatchAsync(batch, cancellationToken).ConfigureAwait(false);
            await DelayAsync(TimeSpan.FromSeconds(2), cancellationToken).ConfigureAwait(false);
        }
    }
    catch (RuntimeEventsException ex) when (!cancellationToken.IsCancellationRequested)
    {
        logger.LogWarning(ex, "Runtime events publish failed (status={StatusCode}); batch will be retried.", (int)ex.StatusCode);
        await RequeueBatchAsync(batch, cancellationToken).ConfigureAwait(false);

        // Wait longer when the backend reports it is unavailable.
        var backoff = ex.StatusCode == HttpStatusCode.ServiceUnavailable
            ? TimeSpan.FromSeconds(5)
            : TimeSpan.FromSeconds(2);

        await DelayAsync(backoff, cancellationToken).ConfigureAwait(false);
    }
    catch (Exception ex) when (!cancellationToken.IsCancellationRequested)
    {
        logger.LogWarning(ex, "Runtime events publish encountered an unexpected error; batch will be retried.");
        await RequeueBatchAsync(batch, cancellationToken).ConfigureAwait(false);
        await DelayAsync(TimeSpan.FromSeconds(5), cancellationToken).ConfigureAwait(false);
    }
    finally
    {
        // The caller reuses the same list for the next batch.
        batch.Clear();
    }
}
|
||||
|
||||
/// <summary>
/// Requeues every item of the batch. An item whose requeue fails is logged and completed instead
/// (i.e. dropped); cancellation of <paramref name="cancellationToken"/> propagates to the caller.
/// </summary>
/// <param name="batch">Items to return to the buffer.</param>
/// <param name="cancellationToken">Token passed to each requeue call.</param>
private async Task RequeueBatchAsync(IEnumerable<RuntimeEventBufferItem> batch, CancellationToken cancellationToken)
{
    foreach (var entry in batch)
    {
        try
        {
            await entry.RequeueAsync(cancellationToken).ConfigureAwait(false);
        }
        catch (Exception failure) when (failure is not OperationCanceledException || !cancellationToken.IsCancellationRequested)
        {
            // The filter lets a cancellation caused by our own token escape untouched;
            // anything else means this single item cannot be requeued.
            logger.LogWarning(failure, "Failed to requeue runtime event {EventId}; dropping.", entry.Envelope.Event.EventId);
            await entry.CompleteAsync().ConfigureAwait(false);
        }
    }
}
|
||||
|
||||
/// <summary>
/// Waits for <paramref name="delay"/> when it is positive. Cancellation of
/// <paramref name="cancellationToken"/> ends the wait without throwing.
/// </summary>
/// <param name="delay">Time to wait; zero or negative values return immediately.</param>
/// <param name="cancellationToken">Token that can cut the wait short.</param>
private async Task DelayAsync(TimeSpan delay, CancellationToken cancellationToken)
{
    if (delay > TimeSpan.Zero)
    {
        try
        {
            await Task.Delay(delay, cancellationToken).ConfigureAwait(false);
        }
        catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
        {
            // Cancellation simply ends the wait early.
        }
    }
}
|
||||
|
||||
/// <summary>
/// Starts a flush timer using the configured flush interval, clamped to the range 0.1–30 seconds.
/// </summary>
/// <param name="flushTask">Receives the delay task that completes when the interval elapses.</param>
/// <param name="cts">Receives the token source (linked to <paramref name="stoppingToken"/>) that can cancel the timer.</param>
/// <param name="stoppingToken">Service stop token the timer is linked to.</param>
private void StartFlushTimer(ref Task? flushTask, ref CancellationTokenSource? cts, CancellationToken stoppingToken)
{
    var seconds = Math.Clamp(observerOptions.CurrentValue.PublishFlushIntervalSeconds, 0.1, 30);
    var interval = TimeSpan.FromSeconds(seconds);

    var linkedSource = CancellationTokenSource.CreateLinkedTokenSource(stoppingToken);
    cts = linkedSource;
    flushTask = Task.Delay(interval, linkedSource.Token);
}
|
||||
|
||||
/// <summary>
/// Cancels and disposes the timer's token source, if any, and clears both references.
/// </summary>
/// <param name="flushTask">Timer task reference; always set to null.</param>
/// <param name="cts">Token source reference; cancelled, disposed and set to null when present.</param>
private void ResetFlushTimer(ref Task? flushTask, ref CancellationTokenSource? cts)
{
    if (cts is { } activeSource)
    {
        try
        {
            activeSource.Cancel();
        }
        catch
        {
            /* ignore */
        }

        activeSource.Dispose();
        cts = null;
    }

    flushTask = null;
}
|
||||
}
|
||||
@@ -0,0 +1,148 @@
|
||||
using System.Collections.Generic;
|
||||
using System.Security.Cryptography;
|
||||
using System.Text;
|
||||
using StellaOps.Zastava.Core.Contracts;
|
||||
using StellaOps.Zastava.Observer.Configuration;
|
||||
using StellaOps.Zastava.Observer.ContainerRuntime;
|
||||
using StellaOps.Zastava.Observer.ContainerRuntime.Cri;
|
||||
using StellaOps.Zastava.Observer.Runtime;
|
||||
|
||||
namespace StellaOps.Zastava.Observer.Worker;
|
||||
|
||||
internal static class RuntimeEventFactory
|
||||
{
|
||||
/// <summary>
/// Builds a runtime event envelope from a container lifecycle event.
/// </summary>
/// <param name="lifecycleEvent">Source event; its snapshot, timestamp and kind are used.</param>
/// <param name="endpoint">Endpoint whose engine string is used for the runtime and the container id prefix.</param>
/// <param name="identity">Runtime identity; supplies the runtime version.</param>
/// <param name="tenant">Tenant recorded on the event.</param>
/// <param name="nodeName">Node recorded on the event and included in the event id.</param>
/// <param name="capture">Optional process capture (process, loaded libraries, evidence).</param>
/// <param name="posture">Optional posture recorded on the event.</param>
/// <param name="additionalEvidence">Optional evidence appended after the capture's evidence.</param>
/// <returns>An envelope created with the runtime-event contract version.</returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="lifecycleEvent"/>, <paramref name="endpoint"/>, <paramref name="identity"/>,
/// <paramref name="tenant"/> or <paramref name="nodeName"/> is null.
/// </exception>
public static RuntimeEventEnvelope Create(
    ContainerLifecycleEvent lifecycleEvent,
    ContainerRuntimeEndpointOptions endpoint,
    CriRuntimeIdentity identity,
    string tenant,
    string nodeName,
    RuntimeProcessCapture? capture = null,
    RuntimePosture? posture = null,
    IReadOnlyList<RuntimeEvidence>? additionalEvidence = null)
{
    ArgumentNullException.ThrowIfNull(lifecycleEvent);
    ArgumentNullException.ThrowIfNull(endpoint);
    ArgumentNullException.ThrowIfNull(identity);
    ArgumentNullException.ThrowIfNull(tenant);
    ArgumentNullException.ThrowIfNull(nodeName);

    var snapshot = lifecycleEvent.Snapshot;
    var engineName = endpoint.Engine.ToEngineString();
    var workloadLabels = snapshot.Labels ?? new Dictionary<string, string>(StringComparer.Ordinal);

    // Copy the annotations so the event owns its own ordinal-keyed collection.
    Dictionary<string, string> annotations;
    if (snapshot.Annotations is null)
    {
        annotations = new Dictionary<string, string>(StringComparer.Ordinal);
    }
    else
    {
        annotations = new Dictionary<string, string>(snapshot.Annotations, StringComparer.Ordinal);
    }

    // An empty annotation set is emitted as null; otherwise keys are sorted ordinally.
    SortedDictionary<string, string>? sortedAnnotations = annotations.Count > 0
        ? new SortedDictionary<string, string>(annotations, StringComparer.Ordinal)
        : null;

    var engine = new RuntimeEngine
    {
        Engine = engineName,
        Version = identity.RuntimeVersion
    };

    var workload = new RuntimeWorkload
    {
        Platform = ResolvePlatform(workloadLabels, endpoint),
        Namespace = TryGet(workloadLabels, CriLabelKeys.PodNamespace),
        Pod = TryGet(workloadLabels, CriLabelKeys.PodName),
        Container = TryGet(workloadLabels, CriLabelKeys.ContainerName) ?? snapshot.Name,
        ContainerId = $"{engineName}://{snapshot.Id}",
        ImageRef = ResolveImageRef(snapshot),
        Owner = null
    };

    var runtimeEvent = new RuntimeEvent
    {
        EventId = ComputeEventId(nodeName, lifecycleEvent),
        When = lifecycleEvent.Timestamp,
        // Every kind other than Start is reported as a container stop.
        Kind = lifecycleEvent.Kind switch
        {
            ContainerLifecycleEventKind.Start => RuntimeEventKind.ContainerStart,
            _ => RuntimeEventKind.ContainerStop
        },
        Tenant = tenant,
        Node = nodeName,
        Runtime = engine,
        Workload = workload,
        Process = capture?.Process,
        LoadedLibraries = capture?.Libraries ?? Array.Empty<RuntimeLoadedLibrary>(),
        Posture = posture,
        Evidence = MergeEvidence(capture?.Evidence, additionalEvidence),
        Annotations = sortedAnnotations
    };

    return RuntimeEventEnvelope.Create(runtimeEvent, ZastavaContractVersions.RuntimeEvent);
}
|
||||
|
||||
/// <summary>
/// Returns <c>"kubernetes"</c> when the labels contain the pod-name key; otherwise the endpoint's engine string.
/// </summary>
private static string ResolvePlatform(IReadOnlyDictionary<string, string> labels, ContainerRuntimeEndpointOptions endpoint)
    => labels.ContainsKey(CriLabelKeys.PodName)
        ? "kubernetes"
        : endpoint.Engine.ToEngineString();
|
||||
|
||||
/// <summary>
/// Combines two optional evidence lists, primary entries first. When only one list has entries that
/// same instance is returned; when neither has entries an empty array is returned.
/// </summary>
private static IReadOnlyList<RuntimeEvidence> MergeEvidence(
    IReadOnlyList<RuntimeEvidence>? primary,
    IReadOnlyList<RuntimeEvidence>? secondary)
{
    if (secondary is not { Count: > 0 })
    {
        // Nothing to append: hand back the primary list as-is, or an empty array if it has no entries either.
        return primary is { Count: > 0 } ? primary : Array.Empty<RuntimeEvidence>();
    }

    if (primary is not { Count: > 0 })
    {
        return secondary;
    }

    var combined = new List<RuntimeEvidence>(primary.Count + secondary.Count);
    combined.AddRange(primary);
    combined.AddRange(secondary);
    return combined;
}
|
||||
|
||||
/// <summary>
/// Returns the snapshot's <c>ImageRef</c> unless it is null, empty or whitespace, in which case <c>Image</c> is returned.
/// </summary>
private static string? ResolveImageRef(CriContainerInfo snapshot)
    => string.IsNullOrWhiteSpace(snapshot.ImageRef)
        ? snapshot.Image
        : snapshot.ImageRef;
|
||||
|
||||
/// <summary>
/// Looks up <paramref name="key"/> and returns its value, or null when the key is missing or the
/// value is null, empty or whitespace.
/// </summary>
private static string? TryGet(IReadOnlyDictionary<string, string> dictionary, string key)
    => dictionary.TryGetValue(key, out var candidate) && !string.IsNullOrWhiteSpace(candidate)
        ? candidate
        : null;
|
||||
|
||||
/// <summary>
/// Derives a deterministic event id from the node name, container id, event timestamp (UTC ticks)
/// and event kind.
/// </summary>
/// <param name="nodeName">Node the event was observed on.</param>
/// <param name="lifecycleEvent">Event supplying the snapshot id, timestamp and kind.</param>
/// <returns>The MD5 digest of the joined fields, rendered as a GUID in "N" format (32 hex digits).</returns>
private static string ComputeEventId(string nodeName, ContainerLifecycleEvent lifecycleEvent)
{
    var fingerprint = new StringBuilder()
        .Append(nodeName)
        .Append('|')
        .Append(lifecycleEvent.Snapshot.Id)
        .Append('|')
        .Append(lifecycleEvent.Timestamp.ToUniversalTime().Ticks)
        .Append('|')
        .Append((int)lifecycleEvent.Kind)
        .ToString();

    // An MD5 digest is always 16 bytes, exactly what the Guid constructor expects. The previous
    // TryHashData call with a 16-byte destination could not fail, so its fallback was unreachable.
    // Changing the hash algorithm would change every generated id.
    var digest = MD5.HashData(Encoding.UTF8.GetBytes(fingerprint));
    return new Guid(digest).ToString("N");
}
|
||||
}
|
||||
Reference in New Issue
Block a user