475 lines
15 KiB
C#
475 lines
15 KiB
C#
|
|
using Microsoft.Extensions.Logging;
|
|
using StellaOps.Plugin.Abstractions.Health;
|
|
using StellaOps.Plugin.Abstractions.Manifest;
|
|
using StellaOps.Plugin.Sandbox.Communication;
|
|
using StellaOps.Plugin.Sandbox.Filesystem;
|
|
using StellaOps.Plugin.Sandbox.Network;
|
|
using StellaOps.Plugin.Sandbox.Process;
|
|
using StellaOps.Plugin.Sandbox.Resources;
|
|
using System.Runtime.CompilerServices;
|
|
using SystemProcess = System.Diagnostics.Process;
|
|
|
|
namespace StellaOps.Plugin.Sandbox;
|
|
|
|
/// <summary>
|
|
/// Process-based sandbox implementation for untrusted plugins.
|
|
/// </summary>
|
|
public sealed class ProcessSandbox : ISandbox
|
|
{
|
|
// ---- Collaborators: assigned once by the constructor ----

private readonly SandboxConfiguration _config;
private readonly IPluginProcessManager _processManager;
private readonly IGrpcPluginBridge _bridge;
private readonly IResourceLimiter _resourceLimiter;
private readonly INetworkPolicyEnforcer _networkEnforcer;
private readonly ILogger<ProcessSandbox> _logger;

// Stored by the constructor; no member visible in this file reads it.
private readonly TimeProvider _timeProvider;

// ---- Mutable runtime state ----

// Plugin host process; assigned by StartAsync once the process manager has launched it.
private SystemProcess? _process;

// Lifecycle state; written only by TransitionState.
private SandboxState _state = SandboxState.Created;

// Most recent usage sample; refreshed by the resource-monitoring loop.
private ResourceUsage _currentUsage = ResourceUsage.Empty;

// Cancellation source and task of the background monitoring loop (set by StartResourceMonitoring).
private CancellationTokenSource? _monitoringCts;
private Task? _monitoringTask;

// Directory returned by PrepareWorkingDirectory; removed again by CleanupWorkingDirectory.
private string? _workingDirectory;

/// <inheritdoc />
public string Id { get; }

/// <inheritdoc />
public SandboxState State
{
    get { return _state; }
}

/// <inheritdoc />
public ResourceUsage CurrentUsage
{
    get { return _currentUsage; }
}

/// <inheritdoc />
public event EventHandler<SandboxStateChangedEventArgs>? StateChanged;

/// <inheritdoc />
public event EventHandler<ResourceWarningEventArgs>? ResourceWarning;
|
|
|
|
/// <summary>
|
|
/// Creates a new process sandbox.
|
|
/// </summary>
|
|
/// <param name="id">
/// Sandbox identifier. It is embedded in the default working-directory path and in the socket/pipe
/// name, so it must be usable as a single file-name component.
/// </param>
/// <param name="config">Sandbox configuration (limits, network policy, timeouts, environment).</param>
/// <param name="processManager">Starts and stops the plugin host process.</param>
/// <param name="bridge">gRPC bridge used to talk to the plugin host.</param>
/// <param name="resourceLimiter">Applies, removes and samples resource limits.</param>
/// <param name="networkEnforcer">Applies and removes the network policy.</param>
/// <param name="logger">Logger for sandbox diagnostics.</param>
/// <param name="timeProvider">Time source.</param>
/// <exception cref="ArgumentException">
/// <paramref name="id"/> is null, blank, "." or "..", or contains characters that are not valid in a file name.
/// </exception>
/// <exception cref="ArgumentNullException">Any other argument is null.</exception>
public ProcessSandbox(
    string id,
    SandboxConfiguration config,
    IPluginProcessManager processManager,
    IGrpcPluginBridge bridge,
    IResourceLimiter resourceLimiter,
    INetworkPolicyEnforcer networkEnforcer,
    ILogger<ProcessSandbox> logger,
    TimeProvider timeProvider)
{
    if (string.IsNullOrWhiteSpace(id))
    {
        throw new ArgumentException("Sandbox id must not be null, empty, or whitespace.", nameof(id));
    }

    // The id becomes a path segment of directories that are deleted recursively
    // (PrepareWorkingDirectory / CleanupWorkingDirectory). Rejecting separators and
    // dot-segments keeps those deletes inside the sandbox root.
    if (id is "." or ".." || id.IndexOfAny(Path.GetInvalidFileNameChars()) >= 0)
    {
        throw new ArgumentException(
            "Sandbox id must be a valid file name without directory separators.", nameof(id));
    }

    Id = id;
    _config = config ?? throw new ArgumentNullException(nameof(config));
    _processManager = processManager ?? throw new ArgumentNullException(nameof(processManager));
    _bridge = bridge ?? throw new ArgumentNullException(nameof(bridge));
    _resourceLimiter = resourceLimiter ?? throw new ArgumentNullException(nameof(resourceLimiter));
    _networkEnforcer = networkEnforcer ?? throw new ArgumentNullException(nameof(networkEnforcer));
    _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    _timeProvider = timeProvider ?? throw new ArgumentNullException(nameof(timeProvider));
}
|
|
|
|
/// <inheritdoc />
|
|
/// <remarks>
/// Start-up order: prepare the working directory, build the resource configuration, apply the
/// network policy, launch the plugin host process, apply resource limits, connect the bridge,
/// initialize the plugin, mark the sandbox as running, then start resource monitoring.
/// If any step throws, the sandbox transitions to <see cref="SandboxState.Failed"/>, the bridge is
/// disconnected and an already-launched process is stopped (both best-effort), sandbox resources
/// are cleaned up, and the original exception is rethrown.
/// </remarks>
/// <exception cref="ArgumentNullException"><paramref name="manifest"/> is null.</exception>
/// <exception cref="InvalidOperationException">
/// The sandbox is already starting, running or stopping.
/// </exception>
public async Task StartAsync(PluginManifest manifest, CancellationToken ct)
{
    if (manifest is null)
    {
        throw new ArgumentNullException(nameof(manifest));
    }

    // Starting again while a process is (being) managed would overwrite _process and orphan it.
    if (_state is SandboxState.Starting or SandboxState.Running or SandboxState.Stopping)
    {
        throw new InvalidOperationException($"Sandbox cannot be started (state: {_state})");
    }

    TransitionState(SandboxState.Starting);

    try
    {
        // 1. Create isolated working directory
        _workingDirectory = PrepareWorkingDirectory(manifest);

        // 2. Configure resource limits
        var resourceConfig = _resourceLimiter.CreateConfiguration(_config.ResourceLimits);

        // 3. Configure network policy
        await _networkEnforcer.ApplyPolicyAsync(Id, _config.NetworkPolicy, ct);

        // 4. Start the plugin host process
        var socketPath = GetSocketPath();
        _process = await _processManager.StartAsync(new ProcessStartRequest
        {
            PluginAssemblyPath = manifest.AssemblyPath!,
            EntryPoint = manifest.EntryPoint,
            WorkingDirectory = _workingDirectory,
            SocketPath = socketPath,
            ResourceConfiguration = resourceConfig,
            EnvironmentVariables = _config.EnvironmentVariables
        }, ct);

        // 5. Apply resource limits to the process
        await _resourceLimiter.ApplyLimitsAsync(_process, resourceConfig, ct);

        // 6. Wait for the process to be ready and connect
        await WaitForReadyAsync(socketPath, ct);

        // 7. Initialize the plugin
        await _bridge.InitializePluginAsync(manifest, ct);

        // 8. Mark as running BEFORE starting the monitor: the monitoring loop stops as soon as
        //    it observes a non-running state, so starting it first could end it immediately.
        TransitionState(SandboxState.Running);

        // 9. Start resource monitoring
        StartResourceMonitoring();

        _logger.LogInformation("Sandbox {Id} started for plugin {PluginId}",
            Id, manifest.Info.Id);
    }
    catch (Exception ex)
    {
        _logger.LogError(ex, "Failed to start sandbox {Id}", Id);
        TransitionState(SandboxState.Failed, ex.Message);

        // Do not leave a half-started plugin process or an open bridge behind.
        await AbortPartialStartAsync();
        await CleanupAsync();
        throw;
    }

    // Best-effort teardown of whatever the failed start already brought up.
    async Task AbortPartialStartAsync()
    {
        try
        {
            if (_bridge.IsConnected)
            {
                await _bridge.DisconnectAsync(CancellationToken.None);
            }
        }
        catch (Exception disconnectError)
        {
            _logger.LogWarning(disconnectError,
                "Failed to disconnect bridge after failed start of sandbox {Id}", Id);
        }

        if (_process is null)
        {
            return;
        }

        try
        {
            await _processManager.StopAsync(
                _process, _config.Timeouts.ShutdownTimeout, CancellationToken.None);
        }
        catch (Exception stopError)
        {
            _logger.LogWarning(stopError,
                "Failed to stop plugin process after failed start of sandbox {Id}", Id);
        }
    }
}
|
|
|
|
/// <inheritdoc />
|
|
/// <remarks>
/// No-op when the sandbox is already stopped, failed or killed. Otherwise: stops resource
/// monitoring, asks the plugin to shut down over the bridge (bounded by <paramref name="timeout"/>),
/// disconnects the bridge, stops the process, cleans up sandbox resources and transitions to
/// <see cref="SandboxState.Stopped"/>. A failed graceful shutdown or bridge disconnect is logged and
/// does not prevent the process from being stopped. If stopping fails for another reason the sandbox
/// transitions to <see cref="SandboxState.Failed"/>, one more best-effort process stop is attempted,
/// resources are cleaned up, and the exception is rethrown.
/// </remarks>
public async Task StopAsync(TimeSpan timeout, CancellationToken ct)
{
    if (_state is SandboxState.Stopped or SandboxState.Failed or SandboxState.Killed)
    {
        return;
    }

    TransitionState(SandboxState.Stopping);

    // False while a launched process may still be alive; drives the last-resort stop below.
    var processStopped = _process is null;

    try
    {
        // Stop monitoring
        _monitoringCts?.Cancel();
        if (_monitoringTask is not null)
        {
            try { await _monitoringTask; } catch { /* Ignore: the monitor's outcome is irrelevant during shutdown */ }
        }

        // 1. Signal graceful shutdown via gRPC
        using var timeoutCts = CancellationTokenSource.CreateLinkedTokenSource(ct);
        timeoutCts.CancelAfter(timeout);

        try
        {
            if (_bridge.IsConnected)
            {
                await _bridge.ShutdownPluginAsync(timeoutCts.Token);
            }
        }
        catch (OperationCanceledException)
        {
            _logger.LogWarning("Sandbox {Id} did not shutdown gracefully, killing", Id);
        }
        catch (Exception shutdownError)
        {
            // A broken channel must not prevent the process from being stopped below.
            _logger.LogWarning(shutdownError, "Sandbox {Id} graceful shutdown failed, killing", Id);
        }

        // 2. Disconnect bridge
        try
        {
            await _bridge.DisconnectAsync(ct);
        }
        catch (Exception disconnectError) when (disconnectError is not OperationCanceledException)
        {
            _logger.LogWarning(disconnectError, "Failed to disconnect bridge for sandbox {Id}", Id);
        }

        // 3. Stop the process
        if (_process is not null)
        {
            await _processManager.StopAsync(_process, timeout, ct);
            processStopped = true;
        }

        // 4. Cleanup resources
        await CleanupAsync();

        TransitionState(SandboxState.Stopped);

        _logger.LogInformation("Sandbox {Id} stopped", Id);
    }
    catch (Exception ex)
    {
        _logger.LogError(ex, "Error stopping sandbox {Id}", Id);
        TransitionState(SandboxState.Failed, ex.Message);

        // The failure may have happened before the process was stopped (e.g. caller cancellation).
        // Try once more without the caller's token so the plugin process is not orphaned.
        if (!processStopped && _process is not null)
        {
            try
            {
                await _processManager.StopAsync(_process, timeout, CancellationToken.None);
            }
            catch (Exception stopError)
            {
                _logger.LogWarning(stopError, "Failed to stop plugin process for sandbox {Id}", Id);
            }
        }

        await CleanupAsync();
        throw;
    }
}
|
|
|
|
/// <inheritdoc />
|
|
/// <remarks>
/// Requires the sandbox to be running. The call is forwarded to the bridge with a token that is
/// cancelled either by <paramref name="ct"/> or after <paramref name="timeout"/>. A cancellation
/// caused by the timeout alone is reported as <see cref="TimeoutException"/>; caller cancellation
/// propagates unchanged.
/// </remarks>
public async Task<T> ExecuteAsync<T>(
    string operationName,
    object? parameters,
    TimeSpan timeout,
    CancellationToken ct)
{
    EnsureRunning();

    using var deadlineCts = CancellationTokenSource.CreateLinkedTokenSource(ct);
    deadlineCts.CancelAfter(timeout);
    var invocationToken = deadlineCts.Token;

    try
    {
        var response = await _bridge.InvokeAsync<T>(operationName, parameters, invocationToken);
        return response;
    }
    catch (OperationCanceledException) when (!ct.IsCancellationRequested && deadlineCts.IsCancellationRequested)
    {
        // Only the deadline fired, so surface it as a timeout rather than a cancellation.
        throw new TimeoutException($"Operation '{operationName}' timed out after {timeout}");
    }
}
|
|
|
|
/// <inheritdoc />
|
|
/// <remarks>
/// Relays every event produced by the bridge's streaming invocation. Because this is an async
/// iterator, the running-state check executes when enumeration begins, not when the method is called.
/// </remarks>
public async IAsyncEnumerable<TEvent> ExecuteStreamingAsync<TEvent>(
    string operationName,
    object? parameters,
    [EnumeratorCancellation] CancellationToken ct)
{
    EnsureRunning();

    var pluginEvents = _bridge.InvokeStreamingAsync<TEvent>(operationName, parameters, ct);

    await foreach (var pluginEvent in pluginEvents)
    {
        yield return pluginEvent;
    }
}
|
|
|
|
/// <inheritdoc />
|
|
/// <remarks>
/// Returns an unhealthy result without contacting the plugin when the sandbox is not running.
/// Otherwise the bridge health check runs under the configured health-check timeout and its
/// details are extended with the sandbox id and the latest memory/CPU usage sample. Any exception
/// is converted into an unhealthy result instead of being thrown.
/// </remarks>
public async Task<HealthCheckResult> HealthCheckAsync(CancellationToken ct)
{
    if (_state != SandboxState.Running)
    {
        return HealthCheckResult.Unhealthy($"Sandbox is in state {_state}");
    }

    try
    {
        using var deadlineCts = CancellationTokenSource.CreateLinkedTokenSource(ct);
        deadlineCts.CancelAfter(_config.Timeouts.HealthCheckTimeout);

        var pluginHealth = await _bridge.HealthCheckAsync(deadlineCts.Token);

        // Copy the plugin-reported details (if any) and append sandbox-level information.
        var enrichedDetails = new Dictionary<string, object>(pluginHealth.Details ?? new Dictionary<string, object>());
        enrichedDetails["sandboxId"] = Id;
        enrichedDetails["memoryUsageMb"] = _currentUsage.MemoryUsageMb;
        enrichedDetails["cpuUsagePercent"] = _currentUsage.CpuUsagePercent;

        return pluginHealth with { Details = enrichedDetails };
    }
    catch (Exception ex)
    {
        return HealthCheckResult.Unhealthy(ex);
    }
}
|
|
|
|
/// <inheritdoc />
|
|
/// <remarks>
/// Stops the sandbox first when it is still running, using the configured shutdown timeout.
/// A failure while stopping is logged instead of thrown, and the bridge and the monitoring
/// cancellation source are disposed regardless of whether stopping succeeded.
/// </remarks>
public async ValueTask DisposeAsync()
{
    try
    {
        if (_state == SandboxState.Running)
        {
            await StopAsync(_config.Timeouts.ShutdownTimeout, CancellationToken.None);
        }
    }
    catch (Exception ex)
    {
        // Disposal must not throw; StopAsync has already moved the sandbox to the failed state.
        _logger.LogWarning(ex, "Sandbox {Id} failed to stop cleanly during disposal", Id);
    }
    finally
    {
        // Always release owned resources, even when stopping failed.
        _bridge.Dispose();
        _monitoringCts?.Dispose();
    }
}
|
|
|
|
/// <summary>
/// Throws unless the sandbox is currently in the running state.
/// </summary>
/// <exception cref="InvalidOperationException">The sandbox is in any state other than running.</exception>
private void EnsureRunning()
{
    if (_state == SandboxState.Running)
    {
        return;
    }

    throw new InvalidOperationException($"Sandbox is not running (state: {_state})");
}
|
|
|
|
/// <summary>
/// Records the new lifecycle state, logs the change and notifies <see cref="StateChanged"/> subscribers.
/// </summary>
/// <param name="newState">State to move to.</param>
/// <param name="reason">Optional explanation; logged as "N/A" when absent and passed to subscribers as-is.</param>
/// <remarks>
/// Subscriber exceptions are logged and swallowed. This method is called from the failure paths of
/// start/stop right before cleanup, so a throwing handler must not skip that cleanup or replace the
/// exception being reported. Each subscriber is invoked independently so one failure does not
/// silence the others.
/// </remarks>
private void TransitionState(SandboxState newState, string? reason = null)
{
    var oldState = _state;
    _state = newState;

    _logger.LogDebug("Sandbox {Id} state changed: {OldState} -> {NewState} ({Reason})",
        Id, oldState, newState, reason ?? "N/A");

    var subscribers = StateChanged;
    if (subscribers is null)
    {
        return;
    }

    var args = new SandboxStateChangedEventArgs
    {
        OldState = oldState,
        NewState = newState,
        Reason = reason
    };

    foreach (var subscriber in subscribers.GetInvocationList())
    {
        try
        {
            ((EventHandler<SandboxStateChangedEventArgs>)subscriber).Invoke(this, args);
        }
        catch (Exception handlerError)
        {
            _logger.LogError(handlerError,
                "StateChanged handler threw for sandbox {Id} ({OldState} -> {NewState})",
                Id, oldState, newState);
        }
    }
}
|
|
|
|
/// <summary>
/// Creates a fresh working directory for the sandbox and copies the plugin's files into it.
/// </summary>
/// <param name="manifest">Manifest whose assembly path identifies the plugin directory to copy.</param>
/// <returns>The path of the prepared working directory.</returns>
/// <remarks>
/// Uses the configured working directory when set, otherwise a per-sandbox folder under the system
/// temp path. An existing directory at that location is deleted recursively before being recreated.
/// Nothing is copied when the manifest has no assembly path or the plugin directory does not exist.
/// </remarks>
private string PrepareWorkingDirectory(PluginManifest manifest)
{
    var sandboxDir = _config.WorkingDirectory;
    if (sandboxDir is null)
    {
        sandboxDir = Path.Combine(Path.GetTempPath(), "stellaops-sandbox", Id);
    }

    // Start from an empty directory so nothing from a previous run leaks in.
    if (Directory.Exists(sandboxDir))
    {
        Directory.Delete(sandboxDir, recursive: true);
    }

    Directory.CreateDirectory(sandboxDir);

    // Copy plugin files to sandbox directory
    if (string.IsNullOrEmpty(manifest.AssemblyPath))
    {
        return sandboxDir;
    }

    var pluginSourceDir = Path.GetDirectoryName(manifest.AssemblyPath);
    var hasPluginFiles = !string.IsNullOrEmpty(pluginSourceDir) && Directory.Exists(pluginSourceDir);
    if (hasPluginFiles)
    {
        CopyDirectory(pluginSourceDir!, sandboxDir);
    }

    return sandboxDir;
}
|
|
|
|
/// <summary>
/// Releases everything the sandbox set up: network policy, process resource limits and the
/// working directory.
/// </summary>
/// <remarks>
/// Each step is best-effort: a failure is logged as a warning and the remaining steps still run.
/// Cancellation is deliberately not honoured here (<see cref="CancellationToken.None"/>).
/// </remarks>
private async Task CleanupAsync()
{
    // Cleanup network policy
    try
    {
        await _networkEnforcer.RemovePolicyAsync(Id, CancellationToken.None);
    }
    catch (Exception policyError)
    {
        _logger.LogWarning(policyError, "Failed to cleanup network policy for sandbox {Id}", Id);
    }

    // Cleanup resource limits (only when a process was ever launched)
    if (_process is { } launchedProcess)
    {
        try
        {
            await _resourceLimiter.RemoveLimitsAsync(launchedProcess, CancellationToken.None);
        }
        catch (Exception limitsError)
        {
            _logger.LogWarning(limitsError, "Failed to cleanup resource limits for sandbox {Id}", Id);
        }
    }

    // Cleanup working directory
    CleanupWorkingDirectory();
}
|
|
|
|
/// <summary>
/// Deletes the sandbox working directory, if it exists.
/// </summary>
/// <remarks>
/// Targets the directory recorded at start-up, or the default per-sandbox temp location when none
/// was recorded. Deletion failures are logged as warnings and never thrown.
/// </remarks>
private void CleanupWorkingDirectory()
{
    var targetDir = _workingDirectory;
    if (targetDir is null)
    {
        targetDir = Path.Combine(Path.GetTempPath(), "stellaops-sandbox", Id);
    }

    if (!Directory.Exists(targetDir))
    {
        return;
    }

    try
    {
        Directory.Delete(targetDir, recursive: true);
    }
    catch (Exception deleteError)
    {
        _logger.LogWarning(deleteError, "Failed to cleanup sandbox directory {WorkDir}", targetDir);
    }
}
|
|
|
|
/// <summary>
/// Builds the IPC endpoint for this sandbox: a named-pipe path on Windows, otherwise a
/// <c>.sock</c> file path under the system temp directory. Both embed the sandbox id.
/// </summary>
private string GetSocketPath() =>
    OperatingSystem.IsWindows()
        ? $"\\\\.\\pipe\\stellaops-sandbox-{Id}"
        : Path.Combine(Path.GetTempPath(), $"stellaops-sandbox-{Id}.sock");
|
|
|
|
/// <summary>
/// Repeatedly tries to connect the bridge to the plugin host until it succeeds, the process exits,
/// the configured startup timeout elapses, or the caller cancels.
/// </summary>
/// <param name="socketPath">IPC endpoint the plugin host is expected to listen on.</param>
/// <param name="ct">Caller cancellation token.</param>
/// <exception cref="InvalidOperationException">The plugin process exited before becoming ready.</exception>
/// <exception cref="TimeoutException">
/// The startup timeout elapsed; the last connection error (if any) is attached as the inner exception.
/// </exception>
/// <exception cref="OperationCanceledException"><paramref name="ct"/> was cancelled.</exception>
private async Task WaitForReadyAsync(string socketPath, CancellationToken ct)
{
    const int RetryDelayMilliseconds = 100;

    using var timeoutCts = CancellationTokenSource.CreateLinkedTokenSource(ct);
    timeoutCts.CancelAfter(_config.Timeouts.StartupTimeout);

    Exception? lastConnectError = null;

    try
    {
        while (!timeoutCts.IsCancellationRequested)
        {
            if (_process is { HasExited: true } exitedProcess)
            {
                throw new InvalidOperationException(
                    $"Plugin process exited with code {exitedProcess.ExitCode}");
            }

            // Try to connect
            try
            {
                await _bridge.ConnectAsync(socketPath, timeoutCts.Token);
                return;
            }
            catch (Exception ex) when (ex is not OperationCanceledException)
            {
                // Not ready yet; remember why so a later timeout can report it.
                lastConnectError = ex;
            }

            await Task.Delay(RetryDelayMilliseconds, timeoutCts.Token);
        }
    }
    catch (OperationCanceledException) when (timeoutCts.IsCancellationRequested && !ct.IsCancellationRequested)
    {
        // The startup deadline fired mid-connect or mid-delay: report it as a timeout below.
    }

    // Leaving the loop because the caller cancelled is a cancellation, not a timeout.
    ct.ThrowIfCancellationRequested();

    throw new TimeoutException("Plugin process did not become ready in time", lastConnectError);
}
|
|
|
|
/// <summary>
/// Starts the background loop that samples the plugin process's resource usage about once per
/// second, raises threshold warnings and logs exceeded limits.
/// </summary>
/// <remarks>
/// The loop ends when the monitoring token is cancelled or the sandbox leaves the starting/running
/// states. The token is captured once so the loop never touches the cancellation source after it
/// has been disposed. The pause between samples also applies after a failed sample, so a
/// persistent error cannot turn the loop into a busy spin.
/// </remarks>
private void StartResourceMonitoring()
{
    const int SampleIntervalMilliseconds = 1000;

    var cts = new CancellationTokenSource();
    _monitoringCts = cts;
    var token = cts.Token;

    _monitoringTask = Task.Run(async () =>
    {
        // "Starting" is accepted because the loop may begin before the caller has finished
        // switching the sandbox to "Running".
        while (!token.IsCancellationRequested
            && (_state is SandboxState.Starting or SandboxState.Running))
        {
            try
            {
                var process = _process;
                if (process is not null && !process.HasExited)
                {
                    _currentUsage = await _resourceLimiter.GetUsageAsync(process, token);

                    // Check thresholds
                    CheckResourceThreshold(ResourceType.Memory,
                        _currentUsage.MemoryUsageMb,
                        _config.ResourceLimits.MaxMemoryMb);

                    CheckResourceThreshold(ResourceType.Cpu,
                        _currentUsage.CpuUsagePercent,
                        _config.ResourceLimits.MaxCpuPercent);

                    // Check if limits exceeded
                    var limitCheck = await _resourceLimiter.CheckLimitsAsync(
                        process, _config.ResourceLimits, token);

                    if (limitCheck.IsExceeded)
                    {
                        _logger.LogWarning("Sandbox {Id} exceeded resource limit: {Message}",
                            Id, limitCheck.Message);
                    }
                }
            }
            catch (OperationCanceledException)
            {
                break;
            }
            catch (Exception ex)
            {
                _logger.LogError(ex, "Error monitoring resources for sandbox {Id}", Id);
            }

            try
            {
                await Task.Delay(SampleIntervalMilliseconds, token);
            }
            catch (OperationCanceledException)
            {
                break;
            }
        }
    });
}
|
|
|
|
/// <summary>
/// Raises <see cref="ResourceWarning"/> when <paramref name="current"/> has reached at least 80%
/// of <paramref name="max"/>.
/// </summary>
/// <param name="resource">Resource the measurement belongs to.</param>
/// <param name="current">Current usage, in the same unit as <paramref name="max"/>.</param>
/// <param name="max">Configured limit; values of zero or less disable the check.</param>
private void CheckResourceThreshold(ResourceType resource, double current, double max)
{
    const int WarningThresholdPercent = 80;

    if (max <= 0)
    {
        return;
    }

    var usedPercent = (current / max) * 100;
    var reachedThreshold = usedPercent >= WarningThresholdPercent;

    if (reachedThreshold)
    {
        var warning = new ResourceWarningEventArgs
        {
            Resource = resource,
            CurrentUsagePercent = usedPercent,
            ThresholdPercent = WarningThresholdPercent
        };

        ResourceWarning?.Invoke(this, warning);
    }
}
|
|
|
|
/// <summary>
/// Recursively copies the contents of <paramref name="source"/> into <paramref name="destination"/>,
/// preserving the relative layout and overwriting files that already exist.
/// </summary>
/// <param name="source">Directory whose sub-directories and files are copied.</param>
/// <param name="destination">Directory that receives the copy; created if missing.</param>
/// <remarks>
/// Target paths are derived from each entry's path relative to <paramref name="source"/>. A textual
/// replace of the source string would also rewrite later occurrences of that text inside the path
/// (for example a relative source "a" containing "a/a.txt"), producing wrong targets.
/// </remarks>
private static void CopyDirectory(string source, string destination)
{
    Directory.CreateDirectory(destination);

    foreach (var dir in Directory.GetDirectories(source, "*", SearchOption.AllDirectories))
    {
        var relativeDir = Path.GetRelativePath(source, dir);
        Directory.CreateDirectory(Path.Combine(destination, relativeDir));
    }

    foreach (var file in Directory.GetFiles(source, "*", SearchOption.AllDirectories))
    {
        var relativeFile = Path.GetRelativePath(source, file);
        File.Copy(file, Path.Combine(destination, relativeFile), overwrite: true);
    }
}
|
|
}
|