audit work, fixed StellaOps.sln warnings/errors, fixed tests, sprints work, new advisories

This commit is contained in:
master
2026-01-07 18:49:59 +02:00
parent 04ec098046
commit 608a7f85c0
866 changed files with 56323 additions and 6231 deletions

View File

@@ -0,0 +1,63 @@
# StellaOps Timeline Service
# Multi-stage build: compile and publish with the SDK image, then run on the
# smaller ASP.NET Core runtime image.
FROM mcr.microsoft.com/dotnet/sdk:10.0-preview AS build
WORKDIR /src

# Copy only the project files and the repo-wide NuGet/MSBuild configuration
# first, so the restore layer below stays cached until a dependency changes.
COPY ["src/Timeline/StellaOps.Timeline.WebService/StellaOps.Timeline.WebService.csproj", "src/Timeline/StellaOps.Timeline.WebService/"]
COPY ["src/Timeline/__Libraries/StellaOps.Timeline.Core/StellaOps.Timeline.Core.csproj", "src/Timeline/__Libraries/StellaOps.Timeline.Core/"]
COPY ["src/__Libraries/StellaOps.Eventing/StellaOps.Eventing.csproj", "src/__Libraries/StellaOps.Eventing/"]
COPY ["src/__Libraries/StellaOps.HybridLogicalClock/StellaOps.HybridLogicalClock.csproj", "src/__Libraries/StellaOps.HybridLogicalClock/"]
COPY ["src/__Libraries/StellaOps.Microservice/StellaOps.Microservice.csproj", "src/__Libraries/StellaOps.Microservice/"]
COPY ["src/__Libraries/StellaOps.Replay.Core/StellaOps.Replay.Core.csproj", "src/__Libraries/StellaOps.Replay.Core/"]
COPY ["nuget.config", "."]
COPY ["Directory.Build.props", "."]
COPY ["Directory.Packages.props", "."]

# Restore dependencies for the web service and the projects it references.
RUN dotnet restore "src/Timeline/StellaOps.Timeline.WebService/StellaOps.Timeline.WebService.csproj"

# Copy the full source tree (invalidates the cache on any source change).
COPY ["src/", "src/"]

# Build into the project's default bin/obj output.
# NOTE(review): the previous `-o /app/build` redirected the build output, while
# the `--no-build` publish below resolves the already-built assemblies
# (including referenced projects) from their default output paths. Nothing
# consumed /app/build, so the override is dropped.
WORKDIR /src/src/Timeline/StellaOps.Timeline.WebService
RUN dotnet build -c Release --no-restore

# Publish: reuse the build output above; no apphost because the container
# starts the app through `dotnet <dll>`.
FROM build AS publish
RUN dotnet publish -c Release -o /app/publish --no-build /p:UseAppHost=false
# Runtime image
FROM mcr.microsoft.com/dotnet/aspnet:10.0-preview AS runtime
WORKDIR /app

# The ASP.NET Core runtime image does not ship curl, which the HEALTHCHECK
# below relies on; install it and remove the apt lists in the same layer.
# Also hand the (still empty) /app directory to the non-root user so the
# service can create files next to its binaries, as the former `chown -R` allowed.
RUN apt-get update && \
    apt-get install -y --no-install-recommends curl && \
    rm -rf /var/lib/apt/lists/* && \
    chown app:app /app

# Copy published files, owned by the non-root user. Setting ownership during
# COPY avoids a second layer that duplicates every published file.
COPY --from=publish --chown=app:app /app/publish .

# Run as the non-root "app" user that the .NET images provide (APP_UID).
# NOTE(review): this replaces the custom stellaops user with UID/GID 1000, which
# presumably collides with the pre-existing `ubuntu` account in the Ubuntu-based
# .NET 10 images - confirm, and update any deployment that pins UID 1000.
USER $APP_UID

# Environment configuration
ENV ASPNETCORE_URLS=http://+:8080 \
    ASPNETCORE_ENVIRONMENT=Production \
    DOTNET_EnableDiagnostics=0 \
    DOTNET_SYSTEM_GLOBALIZATION_INVARIANT=false

# Health check: probe the service's /health endpoint on the listening port.
HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
    CMD curl -fsS http://localhost:8080/health || exit 1

# Expose port (documentation only; matches ASPNETCORE_URLS above)
EXPOSE 8080

# Entry point
ENTRYPOINT ["dotnet", "StellaOps.Timeline.WebService.dll"]

View File

@@ -0,0 +1,119 @@
# HLC Queue Alerting Rules
# Sprint: SPRINT_20260105_002_004_BE_hlc_integration_tests
# Task: INT-018 - Create alerts for HLC anomalies
#
# One rule group, evaluated every minute. Each rule carries severity/team/runbook
# labels plus summary/description/impact/action annotations.
groups:
  - name: hlc_alerts
    interval: 1m
    rules:
      # Critical - any increase in chain verification failures over 5 minutes.
      - alert: HlcChainVerificationFailure
        expr: increase(scheduler_chain_verification_failures_total[5m]) > 0
        for: 1m
        labels:
          severity: critical
          team: scheduler
          runbook: https://docs.stellaops.internal/operations/runbooks/hlc-troubleshooting#chain-verification-failure
        annotations:
          summary: "HLC chain verification failure detected"
          description: "Chain verification failure on node {{ $labels.node_id }} for tenant {{ $labels.tenant_id }}. This may indicate data tampering or corruption."
          impact: "Audit trail integrity compromised. Investigation required."
          action: "1. Check scheduler_log table for gaps. 2. Verify no unauthorized changes. 3. Review chain head consistency."

      # Critical - more than 5 clock-skew rejections within 5 minutes.
      - alert: HlcClockSkewExceedsTolerance
        expr: increase(hlc_clock_skew_rejections_total[5m]) > 5
        for: 2m
        labels:
          severity: critical
          team: infrastructure
          runbook: https://docs.stellaops.internal/operations/runbooks/hlc-troubleshooting#clock-skew
        annotations:
          summary: "HLC clock skew rejections on {{ $labels.node_id }}"
          description: "Node {{ $labels.node_id }} is rejecting HLC updates due to clock skew. {{ $value }} rejections in last 5 minutes."
          impact: "Job ordering may be inconsistent. Distributed consistency at risk."
          action: "1. Check NTP synchronization on affected node. 2. Verify time sources. 3. Consider increasing skew tolerance temporarily."

      # Warning - absolute physical time offset above 0.5s, sustained for 5 minutes.
      - alert: HlcPhysicalTimeOffset
        expr: abs(hlc_physical_time_offset_seconds) > 0.5
        for: 5m
        labels:
          severity: warning
          team: infrastructure
          runbook: https://docs.stellaops.internal/operations/runbooks/hlc-troubleshooting#time-offset
        annotations:
          summary: "HLC physical time offset on {{ $labels.node_id }}"
          description: "HLC physical time is {{ $value }}s offset from wall clock on {{ $labels.node_id }}."
          impact: "May cause timestamp ordering anomalies in logs and diagnostics."
          action: "Monitor NTP status and consider clock synchronization."

      # Warning - more than 100 air-gap merge conflicts within one hour.
      - alert: HlcMergeConflictRateHigh
        expr: increase(airgap_merge_conflicts_total[1h]) > 100
        for: 10m
        labels:
          severity: warning
          team: scheduler
          runbook: https://docs.stellaops.internal/operations/runbooks/hlc-troubleshooting#merge-conflicts
        annotations:
          summary: "High HLC merge conflict rate during air-gap sync"
          description: "{{ $value }} merge conflicts detected in the last hour for conflict type {{ $labels.conflict_type }}."
          impact: "Air-gap sync may be producing unexpected results or dropping jobs."
          action: "1. Review conflict resolution logs. 2. Check for duplicate job submissions. 3. Verify offline node clocks."

      # Warning - p95 air-gap sync duration (15m rate window) above 30 seconds.
      - alert: HlcSyncDurationHigh
        expr: histogram_quantile(0.95, sum(rate(airgap_sync_duration_seconds_bucket[15m])) by (le)) > 30
        for: 10m
        labels:
          severity: warning
          team: scheduler
          runbook: https://docs.stellaops.internal/operations/runbooks/hlc-troubleshooting#slow-sync
        annotations:
          summary: "Air-gap sync duration is high"
          description: "95th percentile sync duration is {{ $value }}s, exceeding 30s threshold."
          impact: "Air-gap import operations are slow, may delay job processing."
          action: "1. Check bundle sizes. 2. Verify database performance. 3. Consider chunking large bundles."

      # Info - summed HLC enqueue rate is exactly zero (may be expected in some deployments).
      - alert: HlcEnqueueRateZero
        expr: sum(rate(scheduler_hlc_enqueues_total[10m])) == 0
        for: 30m
        labels:
          severity: info
          team: scheduler
          runbook: https://docs.stellaops.internal/operations/runbooks/hlc-troubleshooting#no-enqueues
        annotations:
          summary: "No HLC enqueues in last 30 minutes"
          description: "No jobs have been enqueued with HLC timestamps in the last 30 minutes."
          impact: "May be expected if no jobs are scheduled, or may indicate HLC ordering is disabled."
          action: "Verify EnableHlcOrdering configuration if HLC ordering is expected."

      # Warning - any batch snapshot creation failure within 5 minutes.
      - alert: HlcBatchSnapshotFailures
        expr: increase(scheduler_batch_snapshot_failures_total[5m]) > 0
        for: 2m
        labels:
          severity: warning
          team: scheduler
          runbook: https://docs.stellaops.internal/operations/runbooks/hlc-troubleshooting#batch-snapshot-failure
        annotations:
          summary: "Batch snapshot creation failures"
          description: "{{ $value }} batch snapshot creation failures in the last 5 minutes."
          impact: "DSSE-signed batch proofs may be missing for affected time ranges."
          action: "1. Check signing key availability. 2. Verify database connectivity. 3. Review batch size limits."

      # Critical - one node_id reported by more than one instance (configuration error).
      - alert: HlcDuplicateNodeId
        expr: count by (node_id) (group by (node_id, instance) (hlc_ticks_total)) > 1
        for: 5m
        labels:
          severity: critical
          team: scheduler
          runbook: https://docs.stellaops.internal/operations/runbooks/hlc-troubleshooting#duplicate-node-id
        annotations:
          summary: "Duplicate HLC node ID detected"
          description: "Multiple instances are using node_id={{ $labels.node_id }}. This will cause ordering conflicts."
          impact: "Critical: Job ordering and chain integrity will be compromised."
          action: "Immediately reconfigure affected instances with unique node IDs."

View File

@@ -0,0 +1,290 @@
{
"dashboard": {
"id": null,
"uid": "stellaops-hlc-metrics",
"title": "StellaOps HLC Queue Metrics",
"description": "Hybrid Logical Clock ordering metrics for the Scheduler queue",
"tags": ["stellaops", "hlc", "scheduler", "audit"],
"timezone": "utc",
"schemaVersion": 39,
"version": 1,
"refresh": "30s",
"time": {
"from": "now-1h",
"to": "now"
},
"panels": [
{
  "id": 1,
  "title": "HLC Tick Rate",
  "description": "Rate of HLC tick operations per second",
  "type": "timeseries",
  "gridPos": {
    "h": 8,
    "w": 12,
    "x": 0,
    "y": 0
  },
  "fieldConfig": {
    "defaults": {
      "unit": "ops",
      "custom": {
        "drawStyle": "line",
        "lineInterpolation": "smooth"
      }
    }
  },
  "targets": [
    {
      "expr": "rate(hlc_ticks_total[1m])",
      "legendFormat": "{{node_id}}",
      "refId": "A"
    }
  ]
},
{
  "id": 2,
  "title": "Clock Skew Rejections",
  "description": "HLC rejections due to clock skew exceeding tolerance",
  "type": "stat",
  "gridPos": {
    "h": 4,
    "w": 6,
    "x": 12,
    "y": 0
  },
  "fieldConfig": {
    "defaults": {
      "unit": "short",
      "thresholds": {
        "mode": "absolute",
        "steps": [
          { "color": "green", "value": null },
          { "color": "yellow", "value": 1 },
          { "color": "red", "value": 10 }
        ]
      }
    }
  },
  "targets": [
    {
      "expr": "sum(increase(hlc_clock_skew_rejections_total[1h]))",
      "refId": "A"
    }
  ]
},
{
  "id": 3,
  "title": "Physical Time Offset",
  "description": "Difference between HLC physical time and wall clock",
  "type": "gauge",
  "gridPos": {
    "h": 4,
    "w": 6,
    "x": 18,
    "y": 0
  },
  "fieldConfig": {
    "defaults": {
      "unit": "ms",
      "thresholds": {
        "mode": "absolute",
        "steps": [
          { "color": "green", "value": null },
          { "color": "yellow", "value": 100 },
          { "color": "red", "value": 1000 }
        ]
      },
      "min": 0,
      "max": 5000
    }
  },
  "targets": [
    {
      "expr": "max(abs(hlc_physical_time_offset_seconds)) * 1000",
      "refId": "A"
    }
  ]
},
{
  "id": 4,
  "title": "Scheduler HLC Enqueues",
  "description": "Rate of jobs enqueued with HLC timestamps",
  "type": "timeseries",
  "gridPos": {
    "h": 8,
    "w": 12,
    "x": 12,
    "y": 4
  },
  "fieldConfig": {
    "defaults": {
      "unit": "ops",
      "custom": {
        "drawStyle": "bars",
        "fillOpacity": 50
      }
    }
  },
  "targets": [
    {
      "expr": "rate(scheduler_hlc_enqueues_total[5m])",
      "legendFormat": "{{tenant_id}}",
      "refId": "A"
    }
  ]
},
{
  "id": 5,
  "title": "Chain Verifications",
  "description": "Chain verification operations by result",
  "type": "timeseries",
  "gridPos": {
    "h": 8,
    "w": 12,
    "x": 0,
    "y": 8
  },
  "fieldConfig": {
    "defaults": {
      "unit": "ops"
    },
    "overrides": [
      {
        "matcher": { "id": "byName", "options": "valid" },
        "properties": [
          { "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }
        ]
      },
      {
        "matcher": { "id": "byName", "options": "invalid" },
        "properties": [
          { "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }
        ]
      }
    ]
  },
  "targets": [
    {
      "expr": "rate(scheduler_chain_verifications_total[5m])",
      "legendFormat": "{{result}}",
      "refId": "A"
    }
  ]
},
{
  "id": 6,
  "title": "Verification Failures",
  "description": "Chain verification failures - indicates tampering or corruption",
  "type": "stat",
  "gridPos": {
    "h": 4,
    "w": 6,
    "x": 12,
    "y": 12
  },
  "fieldConfig": {
    "defaults": {
      "unit": "short",
      "thresholds": {
        "mode": "absolute",
        "steps": [
          { "color": "green", "value": null },
          { "color": "red", "value": 1 }
        ]
      }
    }
  },
  "targets": [
    {
      "expr": "sum(increase(scheduler_chain_verification_failures_total[1h]))",
      "refId": "A"
    }
  ]
},
{
  "id": 7,
  "title": "Batch Snapshots",
  "description": "Batch snapshot creation rate",
  "type": "stat",
  "gridPos": {
    "h": 4,
    "w": 6,
    "x": 18,
    "y": 12
  },
  "fieldConfig": {
    "defaults": {
      "unit": "short"
    }
  },
  "targets": [
    {
      "expr": "sum(increase(scheduler_batch_snapshots_total[1h]))",
      "refId": "A"
    }
  ]
},
{
  "id": 8,
  "title": "Air-Gap Bundle Exports",
  "description": "Rate of air-gap bundles exported",
  "type": "timeseries",
  "gridPos": {
    "h": 8,
    "w": 8,
    "x": 0,
    "y": 16
  },
  "fieldConfig": {
    "defaults": {
      "unit": "ops"
    }
  },
  "targets": [
    {
      "expr": "rate(airgap_bundles_exported_total[5m])",
      "legendFormat": "{{node_id}}",
      "refId": "A"
    }
  ]
},
{
  "id": 9,
  "title": "Air-Gap Bundle Imports",
  "description": "Rate of air-gap bundles imported",
  "type": "timeseries",
  "gridPos": {
    "h": 8,
    "w": 8,
    "x": 8,
    "y": 16
  },
  "fieldConfig": {
    "defaults": {
      "unit": "ops"
    }
  },
  "targets": [
    {
      "expr": "rate(airgap_bundles_imported_total[5m])",
      "legendFormat": "imported",
      "refId": "A"
    }
  ]
},
{
  "id": 10,
  "title": "Air-Gap Merge Conflicts",
  "description": "Merge conflicts by type during air-gap sync",
  "type": "stat",
  "gridPos": {
    "h": 4,
    "w": 8,
    "x": 16,
    "y": 16
  },
  "fieldConfig": {
    "defaults": {
      "unit": "short",
      "thresholds": {
        "mode": "absolute",
        "steps": [
          { "color": "green", "value": null },
          { "color": "yellow", "value": 1 },
          { "color": "red", "value": 10 }
        ]
      }
    }
  },
  "targets": [
    {
      "expr": "sum by (conflict_type) (increase(airgap_merge_conflicts_total[1h]))",
      "legendFormat": "{{conflict_type}}",
      "refId": "A"
    }
  ]
},
{
  "id": 11,
  "title": "Sync Duration",
  "description": "Air-gap sync operation duration percentiles",
  "type": "timeseries",
  "gridPos": {
    "h": 8,
    "w": 8,
    "x": 16,
    "y": 20
  },
  "fieldConfig": {
    "defaults": {
      "unit": "s"
    }
  },
  "targets": [
    {
      "expr": "histogram_quantile(0.50, sum(rate(airgap_sync_duration_seconds_bucket[5m])) by (le))",
      "legendFormat": "p50",
      "refId": "A"
    },
    {
      "expr": "histogram_quantile(0.95, sum(rate(airgap_sync_duration_seconds_bucket[5m])) by (le))",
      "legendFormat": "p95",
      "refId": "B"
    },
    {
      "expr": "histogram_quantile(0.99, sum(rate(airgap_sync_duration_seconds_bucket[5m])) by (le))",
      "legendFormat": "p99",
      "refId": "C"
    }
  ]
}
],
"annotations": {
"list": [
{
  "name": "Deployments",
  "datasource": "-- Grafana --",
  "enable": true,
  "iconColor": "blue"
}
]
}
},
"folderId": 0,
"overwrite": true
}