release orchestrator v1 draft and build fixes
This commit is contained in:
473
src/Plugin/StellaOps.Plugin.Sandbox/ProcessSandbox.cs
Normal file
473
src/Plugin/StellaOps.Plugin.Sandbox/ProcessSandbox.cs
Normal file
@@ -0,0 +1,473 @@
|
||||
using System.Runtime.CompilerServices;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using StellaOps.Plugin.Abstractions.Health;
|
||||
using StellaOps.Plugin.Abstractions.Manifest;
|
||||
using StellaOps.Plugin.Sandbox.Communication;
|
||||
using StellaOps.Plugin.Sandbox.Filesystem;
|
||||
using StellaOps.Plugin.Sandbox.Network;
|
||||
using StellaOps.Plugin.Sandbox.Process;
|
||||
using StellaOps.Plugin.Sandbox.Resources;
|
||||
using SystemProcess = System.Diagnostics.Process;
|
||||
|
||||
namespace StellaOps.Plugin.Sandbox;
|
||||
|
||||
/// <summary>
/// Process-based sandbox implementation for untrusted plugins.
/// </summary>
/// <remarks>
/// <para>
/// <see cref="StartAsync"/> prepares a working directory, applies the configured network policy,
/// launches the plugin host process through <see cref="IPluginProcessManager"/>, applies resource
/// limits, connects the gRPC bridge, initializes the plugin and starts a background loop that
/// samples resource usage.
/// </para>
/// <para>
/// State is kept in plain fields without locking; the type does not synchronise concurrent
/// calls to its lifecycle methods.
/// </para>
/// </remarks>
public sealed class ProcessSandbox : ISandbox
{
    /// <summary>
    /// Share of a configured limit (in percent) at or above which <see cref="ResourceWarning"/> is raised.
    /// </summary>
    private const int WarningThresholdPercent = 80;

    /// <summary>Pause between two bridge connection attempts while waiting for the plugin host.</summary>
    private static readonly TimeSpan ConnectRetryDelay = TimeSpan.FromMilliseconds(100);

    /// <summary>Pause between two resource usage samples.</summary>
    private static readonly TimeSpan MonitoringInterval = TimeSpan.FromMilliseconds(1000);

    private readonly SandboxConfiguration _config;
    private readonly IPluginProcessManager _processManager;
    private readonly IGrpcPluginBridge _bridge;
    private readonly IResourceLimiter _resourceLimiter;
    private readonly INetworkPolicyEnforcer _networkEnforcer;
    private readonly ILogger<ProcessSandbox> _logger;

    // Stored from the constructor; no member of this class currently reads it.
    private readonly TimeProvider _timeProvider;

    private SystemProcess? _process;
    private SandboxState _state = SandboxState.Created;
    private ResourceUsage _currentUsage = ResourceUsage.Empty;
    private CancellationTokenSource? _monitoringCts;
    private Task? _monitoringTask;
    private string? _workingDirectory;

    /// <inheritdoc />
    public string Id { get; }

    /// <inheritdoc />
    public SandboxState State => _state;

    /// <inheritdoc />
    public ResourceUsage CurrentUsage => _currentUsage;

    /// <inheritdoc />
    public event EventHandler<SandboxStateChangedEventArgs>? StateChanged;

    /// <inheritdoc />
    public event EventHandler<ResourceWarningEventArgs>? ResourceWarning;

    /// <summary>
    /// Creates a new process sandbox.
    /// </summary>
    /// <param name="id">Sandbox identifier; used in log messages, the default working directory and the socket/pipe name.</param>
    /// <param name="config">Resource limits, network policy, timeouts and environment for the sandbox.</param>
    /// <param name="processManager">Starts and stops the plugin host process.</param>
    /// <param name="bridge">gRPC bridge used to talk to the plugin host.</param>
    /// <param name="resourceLimiter">Applies, measures and removes resource limits.</param>
    /// <param name="networkEnforcer">Applies and removes the network policy.</param>
    /// <param name="logger">Logger for lifecycle and diagnostic messages.</param>
    /// <param name="timeProvider">Time source; stored but not currently read by this class.</param>
    /// <exception cref="ArgumentException"><paramref name="id"/> is null, empty or whitespace.</exception>
    /// <exception cref="ArgumentNullException">A required dependency is null.</exception>
    public ProcessSandbox(
        string id,
        SandboxConfiguration config,
        IPluginProcessManager processManager,
        IGrpcPluginBridge bridge,
        IResourceLimiter resourceLimiter,
        INetworkPolicyEnforcer networkEnforcer,
        ILogger<ProcessSandbox> logger,
        TimeProvider timeProvider)
    {
        // An empty id would make the default working directory the shared "stellaops-sandbox"
        // root, which PrepareWorkingDirectory deletes recursively.
        if (string.IsNullOrWhiteSpace(id))
        {
            throw new ArgumentException("Sandbox id must not be null, empty or whitespace.", nameof(id));
        }

        ArgumentNullException.ThrowIfNull(config);
        ArgumentNullException.ThrowIfNull(processManager);
        ArgumentNullException.ThrowIfNull(bridge);
        ArgumentNullException.ThrowIfNull(resourceLimiter);
        ArgumentNullException.ThrowIfNull(networkEnforcer);
        ArgumentNullException.ThrowIfNull(logger);

        Id = id;
        _config = config;
        _processManager = processManager;
        _bridge = bridge;
        _resourceLimiter = resourceLimiter;
        _networkEnforcer = networkEnforcer;
        _logger = logger;
        _timeProvider = timeProvider;
    }

    /// <inheritdoc />
    /// <remarks>
    /// On any failure the sandbox transitions to <see cref="SandboxState.Failed"/>, a best-effort
    /// attempt is made to disconnect the bridge and stop an already started process, the applied
    /// policies and the working directory are cleaned up, and the original exception is rethrown.
    /// </remarks>
    public async Task StartAsync(PluginManifest manifest, CancellationToken ct)
    {
        ArgumentNullException.ThrowIfNull(manifest);

        TransitionState(SandboxState.Starting);

        try
        {
            // 1. Create isolated working directory
            _workingDirectory = PrepareWorkingDirectory(manifest);

            // 2. Configure resource limits
            var resourceConfig = _resourceLimiter.CreateConfiguration(_config.ResourceLimits);

            // 3. Configure network policy
            await _networkEnforcer.ApplyPolicyAsync(Id, _config.NetworkPolicy, ct);

            // 4. Start the plugin host process
            var socketPath = GetSocketPath();
            _process = await _processManager.StartAsync(new ProcessStartRequest
            {
                PluginAssemblyPath = manifest.AssemblyPath!,
                EntryPoint = manifest.EntryPoint,
                WorkingDirectory = _workingDirectory,
                SocketPath = socketPath,
                ResourceConfiguration = resourceConfig,
                EnvironmentVariables = _config.EnvironmentVariables
            }, ct);

            // 5. Apply resource limits to the process
            await _resourceLimiter.ApplyLimitsAsync(_process, resourceConfig, ct);

            // 6. Wait for the process to be ready and connect
            await WaitForReadyAsync(socketPath, ct);

            // 7. Initialize the plugin
            await _bridge.InitializePluginAsync(manifest, ct);

            // 8. Start resource monitoring
            StartResourceMonitoring();

            TransitionState(SandboxState.Running);

            _logger.LogInformation("Sandbox {Id} started for plugin {PluginId}",
                Id, manifest.Info.Id);
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Failed to start sandbox {Id}", Id);
            TransitionState(SandboxState.Failed, ex.Message);

            // CleanupAsync removes the resource limits and network policy; make sure a process
            // that was already launched is not left running once those are gone.
            await ForceTerminateAsync(_config.Timeouts.ShutdownTimeout);
            await CleanupAsync();
            throw;
        }
    }

    /// <inheritdoc />
    /// <remarks>
    /// Returns immediately when the sandbox is already <see cref="SandboxState.Stopped"/>,
    /// <see cref="SandboxState.Failed"/> or <see cref="SandboxState.Killed"/>. If stopping fails,
    /// the sandbox transitions to <see cref="SandboxState.Failed"/>, a best-effort forced stop and
    /// cleanup are performed, and the exception is rethrown.
    /// </remarks>
    public async Task StopAsync(TimeSpan timeout, CancellationToken ct)
    {
        if (_state is SandboxState.Stopped or SandboxState.Failed or SandboxState.Killed)
        {
            return;
        }

        TransitionState(SandboxState.Stopping);

        try
        {
            // Stop monitoring
            _monitoringCts?.Cancel();
            if (_monitoringTask is not null)
            {
                try
                {
                    await _monitoringTask;
                }
                catch (Exception ex)
                {
                    // A failed monitoring loop must not prevent shutdown, but keep a trace of it.
                    _logger.LogDebug(ex, "Resource monitoring for sandbox {Id} ended with an error", Id);
                }
            }

            // 1. Signal graceful shutdown via gRPC
            using var timeoutCts = CancellationTokenSource.CreateLinkedTokenSource(ct);
            timeoutCts.CancelAfter(timeout);

            try
            {
                if (_bridge.IsConnected)
                {
                    await _bridge.ShutdownPluginAsync(timeoutCts.Token);
                }
            }
            catch (OperationCanceledException)
            {
                _logger.LogWarning("Sandbox {Id} did not shutdown gracefully, killing", Id);
            }

            // 2. Disconnect bridge
            await _bridge.DisconnectAsync(ct);

            // 3. Stop the process
            if (_process is not null)
            {
                await _processManager.StopAsync(_process, timeout, ct);
            }

            // 4. Cleanup resources
            await CleanupAsync();

            TransitionState(SandboxState.Stopped);

            _logger.LogInformation("Sandbox {Id} stopped", Id);
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Error stopping sandbox {Id}", Id);
            TransitionState(SandboxState.Failed, ex.Message);

            // The failure may have happened before the process was stopped (for example a
            // cancelled disconnect); do not remove its limits while it is still running.
            await ForceTerminateAsync(timeout);
            await CleanupAsync();
            throw;
        }
    }

    /// <inheritdoc />
    /// <exception cref="InvalidOperationException">The sandbox is not in the running state.</exception>
    /// <exception cref="TimeoutException">
    /// The operation did not complete within <paramref name="timeout"/> and <paramref name="ct"/> was not cancelled.
    /// </exception>
    public async Task<T> ExecuteAsync<T>(
        string operationName,
        object? parameters,
        TimeSpan timeout,
        CancellationToken ct)
    {
        EnsureRunning();

        using var timeoutCts = CancellationTokenSource.CreateLinkedTokenSource(ct);
        timeoutCts.CancelAfter(timeout);

        try
        {
            return await _bridge.InvokeAsync<T>(operationName, parameters, timeoutCts.Token);
        }
        catch (OperationCanceledException) when (timeoutCts.IsCancellationRequested && !ct.IsCancellationRequested)
        {
            // Only the local timeout fired: report it as a timeout rather than a cancellation.
            throw new TimeoutException($"Operation '{operationName}' timed out after {timeout}");
        }
    }

    /// <inheritdoc />
    /// <exception cref="InvalidOperationException">The sandbox is not in the running state.</exception>
    public async IAsyncEnumerable<TEvent> ExecuteStreamingAsync<TEvent>(
        string operationName,
        object? parameters,
        [EnumeratorCancellation] CancellationToken ct)
    {
        EnsureRunning();

        await foreach (var evt in _bridge.InvokeStreamingAsync<TEvent>(operationName, parameters, ct))
        {
            yield return evt;
        }
    }

    /// <inheritdoc />
    /// <remarks>
    /// Never throws: a sandbox that is not running, a bridge failure or a timeout is reported as
    /// an unhealthy result. A result obtained from the bridge is returned with the sandbox id and
    /// the most recently sampled memory and CPU usage added to its details.
    /// </remarks>
    public async Task<HealthCheckResult> HealthCheckAsync(CancellationToken ct)
    {
        if (_state != SandboxState.Running)
        {
            return HealthCheckResult.Unhealthy($"Sandbox is in state {_state}");
        }

        try
        {
            using var timeoutCts = CancellationTokenSource.CreateLinkedTokenSource(ct);
            timeoutCts.CancelAfter(_config.Timeouts.HealthCheckTimeout);

            var result = await _bridge.HealthCheckAsync(timeoutCts.Token);

            // Add resource usage to details
            var details = new Dictionary<string, object>(result.Details ?? new Dictionary<string, object>())
            {
                ["sandboxId"] = Id,
                ["memoryUsageMb"] = _currentUsage.MemoryUsageMb,
                ["cpuUsagePercent"] = _currentUsage.CpuUsagePercent
            };

            return result with { Details = details };
        }
        catch (Exception ex)
        {
            return HealthCheckResult.Unhealthy(ex);
        }
    }

    /// <inheritdoc />
    /// <remarks>
    /// Stops the sandbox first when it is running. The bridge and the monitoring cancellation
    /// source are disposed even if stopping throws; the exception still propagates.
    /// </remarks>
    public async ValueTask DisposeAsync()
    {
        try
        {
            if (_state == SandboxState.Running)
            {
                await StopAsync(_config.Timeouts.ShutdownTimeout, CancellationToken.None);
            }
        }
        finally
        {
            _bridge.Dispose();
            _monitoringCts?.Dispose();
        }
    }

    /// <summary>Throws when the sandbox is not in the <see cref="SandboxState.Running"/> state.</summary>
    /// <exception cref="InvalidOperationException">The sandbox is in any other state.</exception>
    private void EnsureRunning()
    {
        if (_state != SandboxState.Running)
        {
            throw new InvalidOperationException($"Sandbox is not running (state: {_state})");
        }
    }

    /// <summary>
    /// Records the new state, logs the change and raises <see cref="StateChanged"/>.
    /// </summary>
    /// <param name="newState">State to switch to.</param>
    /// <param name="reason">Optional explanation; logged as "N/A" when null and passed on to subscribers as given.</param>
    private void TransitionState(SandboxState newState, string? reason = null)
    {
        var oldState = _state;
        _state = newState;

        _logger.LogDebug("Sandbox {Id} state changed: {OldState} -> {NewState} ({Reason})",
            Id, oldState, newState, reason ?? "N/A");

        StateChanged?.Invoke(this, new SandboxStateChangedEventArgs
        {
            OldState = oldState,
            NewState = newState,
            Reason = reason
        });
    }

    /// <summary>
    /// Creates an empty working directory and copies the plugin's directory into it.
    /// </summary>
    /// <param name="manifest">Manifest whose assembly path identifies the plugin directory to copy.</param>
    /// <returns>
    /// The configured working directory, or <c>stellaops-sandbox/{Id}</c> under the temp path when none is configured.
    /// </returns>
    /// <remarks>An existing directory at the target path is deleted recursively first.</remarks>
    private string PrepareWorkingDirectory(PluginManifest manifest)
    {
        var workDir = _config.WorkingDirectory
            ?? Path.Combine(Path.GetTempPath(), "stellaops-sandbox", Id);

        if (Directory.Exists(workDir))
        {
            Directory.Delete(workDir, recursive: true);
        }

        Directory.CreateDirectory(workDir);

        // Copy plugin files to sandbox directory
        if (!string.IsNullOrEmpty(manifest.AssemblyPath))
        {
            var pluginDir = Path.GetDirectoryName(manifest.AssemblyPath);
            if (!string.IsNullOrEmpty(pluginDir) && Directory.Exists(pluginDir))
            {
                CopyDirectory(pluginDir, workDir);
            }
        }

        return workDir;
    }

    /// <summary>
    /// Best-effort teardown of the live parts of the sandbox: cancels monitoring, disconnects the
    /// bridge when it reports being connected and asks the process manager to stop a process that
    /// has not exited. Failures are logged as warnings and never thrown.
    /// </summary>
    /// <param name="timeout">Timeout handed to the process manager's stop call.</param>
    private async Task ForceTerminateAsync(TimeSpan timeout)
    {
        try
        {
            _monitoringCts?.Cancel();
        }
        catch (ObjectDisposedException)
        {
            // The source was already disposed, so there is nothing left to cancel.
        }

        try
        {
            if (_bridge.IsConnected)
            {
                await _bridge.DisconnectAsync(CancellationToken.None);
            }
        }
        catch (Exception ex)
        {
            _logger.LogWarning(ex, "Failed to disconnect bridge for sandbox {Id}", Id);
        }

        try
        {
            if (_process is { HasExited: false } running)
            {
                await _processManager.StopAsync(running, timeout, CancellationToken.None);
            }
        }
        catch (Exception ex)
        {
            _logger.LogWarning(ex, "Failed to stop plugin process for sandbox {Id}", Id);
        }
    }

    /// <summary>
    /// Removes the network policy and resource limits and deletes the working directory.
    /// Each step is attempted independently; failures are logged as warnings and not thrown.
    /// </summary>
    private async Task CleanupAsync()
    {
        // Cleanup network policy
        try
        {
            await _networkEnforcer.RemovePolicyAsync(Id, CancellationToken.None);
        }
        catch (Exception ex)
        {
            _logger.LogWarning(ex, "Failed to cleanup network policy for sandbox {Id}", Id);
        }

        // Cleanup resource limits
        if (_process is not null)
        {
            try
            {
                await _resourceLimiter.RemoveLimitsAsync(_process, CancellationToken.None);
            }
            catch (Exception ex)
            {
                _logger.LogWarning(ex, "Failed to cleanup resource limits for sandbox {Id}", Id);
            }
        }

        // Cleanup working directory
        CleanupWorkingDirectory();
    }

    /// <summary>
    /// Deletes the working directory recorded by <see cref="StartAsync"/>, or the default
    /// temp-path location when none was recorded. A failure is logged as a warning.
    /// </summary>
    private void CleanupWorkingDirectory()
    {
        var workDir = _workingDirectory
            ?? Path.Combine(Path.GetTempPath(), "stellaops-sandbox", Id);

        if (!Directory.Exists(workDir))
        {
            return;
        }

        try
        {
            Directory.Delete(workDir, recursive: true);
        }
        catch (Exception ex)
        {
            _logger.LogWarning(ex, "Failed to cleanup sandbox directory {WorkDir}", workDir);
        }
    }

    /// <summary>
    /// Builds the endpoint path handed to the plugin host and the bridge: a named-pipe path on
    /// Windows, otherwise a <c>.sock</c> file path under the temp directory.
    /// </summary>
    private string GetSocketPath()
    {
        if (OperatingSystem.IsWindows())
        {
            return $"\\\\.\\pipe\\stellaops-sandbox-{Id}";
        }

        return Path.Combine(Path.GetTempPath(), $"stellaops-sandbox-{Id}.sock");
    }

    /// <summary>
    /// Repeatedly tries to connect the bridge until it succeeds, the process exits, the startup
    /// timeout elapses or the caller cancels.
    /// </summary>
    /// <param name="socketPath">Endpoint to connect the bridge to.</param>
    /// <param name="ct">Caller's cancellation token.</param>
    /// <exception cref="InvalidOperationException">The plugin process exited before a connection was made.</exception>
    /// <exception cref="TimeoutException">
    /// The startup timeout elapsed first; the last connection failure, if any, is the inner exception.
    /// </exception>
    /// <exception cref="OperationCanceledException"><paramref name="ct"/> was cancelled.</exception>
    private async Task WaitForReadyAsync(string socketPath, CancellationToken ct)
    {
        using var timeoutCts = CancellationTokenSource.CreateLinkedTokenSource(ct);
        timeoutCts.CancelAfter(_config.Timeouts.StartupTimeout);

        Exception? lastConnectError = null;

        try
        {
            while (!timeoutCts.IsCancellationRequested)
            {
                if (_process is { HasExited: true } exited)
                {
                    throw new InvalidOperationException(
                        $"Plugin process exited with code {exited.ExitCode}");
                }

                // Try to connect
                try
                {
                    await _bridge.ConnectAsync(socketPath, timeoutCts.Token);
                    return;
                }
                catch (Exception ex) when (ex is not OperationCanceledException)
                {
                    // Not ready yet, wait and retry
                    lastConnectError = ex;
                    await Task.Delay(ConnectRetryDelay, timeoutCts.Token);
                }
            }
        }
        catch (OperationCanceledException) when (timeoutCts.IsCancellationRequested && !ct.IsCancellationRequested)
        {
            // Only the startup timeout fired. The connect call and the retry delay both observe
            // the timeout token, so without this filter a timeout would escape as a cancellation.
        }

        // The loop can also end because the caller cancelled; report that as a cancellation.
        ct.ThrowIfCancellationRequested();

        throw new TimeoutException("Plugin process did not become ready in time", lastConnectError);
    }

    /// <summary>
    /// Creates the monitoring cancellation source and starts the background monitoring loop.
    /// </summary>
    private void StartResourceMonitoring()
    {
        var cts = new CancellationTokenSource();
        _monitoringCts = cts;

        // Capture the token once so the loop never reads the mutable field again.
        var token = cts.Token;
        _monitoringTask = Task.Run(() => MonitorResourcesAsync(token));
    }

    /// <summary>
    /// Samples resource usage once per <see cref="MonitoringInterval"/>, raises threshold
    /// warnings and logs exceeded limits until cancelled or the sandbox leaves the
    /// starting/running states.
    /// </summary>
    /// <param name="token">Token that ends the loop when cancelled.</param>
    private async Task MonitorResourcesAsync(CancellationToken token)
    {
        // The loop is started just before the transition to Running, so Starting must be
        // accepted too; otherwise the loop could exit on its very first check.
        while (!token.IsCancellationRequested
            && (_state is SandboxState.Starting or SandboxState.Running))
        {
            try
            {
                if (_process is { HasExited: false } process)
                {
                    _currentUsage = await _resourceLimiter.GetUsageAsync(process, token);

                    // Check thresholds
                    CheckResourceThreshold(ResourceType.Memory,
                        _currentUsage.MemoryUsageMb,
                        _config.ResourceLimits.MaxMemoryMb);

                    CheckResourceThreshold(ResourceType.Cpu,
                        _currentUsage.CpuUsagePercent,
                        _config.ResourceLimits.MaxCpuPercent);

                    // Check if limits exceeded
                    var limitCheck = await _resourceLimiter.CheckLimitsAsync(
                        process, _config.ResourceLimits, token);

                    if (limitCheck.IsExceeded)
                    {
                        _logger.LogWarning("Sandbox {Id} exceeded resource limit: {Message}",
                            Id, limitCheck.Message);
                    }
                }
            }
            catch (OperationCanceledException)
            {
                break;
            }
            catch (Exception ex)
            {
                _logger.LogError(ex, "Error monitoring resources for sandbox {Id}", Id);
            }

            // Wait outside the sampling try-block so a persistently failing sample cannot turn
            // the loop into a busy spin that floods the log.
            try
            {
                await Task.Delay(MonitoringInterval, token);
            }
            catch (OperationCanceledException)
            {
                break;
            }
        }
    }

    /// <summary>
    /// Raises <see cref="ResourceWarning"/> when <paramref name="current"/> is at least
    /// <see cref="WarningThresholdPercent"/> percent of <paramref name="max"/>.
    /// </summary>
    /// <param name="resource">Resource the values belong to.</param>
    /// <param name="current">Current usage.</param>
    /// <param name="max">Configured limit; zero or a negative value skips the check.</param>
    private void CheckResourceThreshold(ResourceType resource, double current, double max)
    {
        if (max <= 0)
        {
            return;
        }

        var percent = (current / max) * 100;
        if (percent >= WarningThresholdPercent)
        {
            ResourceWarning?.Invoke(this, new ResourceWarningEventArgs
            {
                Resource = resource,
                CurrentUsagePercent = percent,
                ThresholdPercent = WarningThresholdPercent
            });
        }
    }

    /// <summary>
    /// Recursively copies the contents of <paramref name="source"/> into
    /// <paramref name="destination"/>, overwriting existing files.
    /// </summary>
    /// <param name="source">Directory to copy from.</param>
    /// <param name="destination">Directory to copy into.</param>
    private static void CopyDirectory(string source, string destination)
    {
        // Map each entry through its path relative to the source root. A plain string
        // replacement would rewrite every occurrence of the source text inside a path and
        // corrupt the target for relative or repeated-segment source paths.
        foreach (var dir in Directory.GetDirectories(source, "*", SearchOption.AllDirectories))
        {
            Directory.CreateDirectory(Path.Combine(destination, Path.GetRelativePath(source, dir)));
        }

        foreach (var file in Directory.GetFiles(source, "*", SearchOption.AllDirectories))
        {
            File.Copy(file, Path.Combine(destination, Path.GetRelativePath(source, file)), overwrite: true);
        }
    }
}
|
||||
Reference in New Issue
Block a user