# Source: git.stella-ops.org/.gitea/workflows/control-plane-chaos.yml
# Repository file-view header: 188 lines, 7.0 KiB, YAML

# -----------------------------------------------------------------------------
# control-plane-chaos.yml
# Sprint: Testing Enhancement Advisory - Phase 3.3
# Description: CI workflow for control-plane outage chaos tests
# Schedule: Weekly (chaos tests are intensive)
# -----------------------------------------------------------------------------
name: Control-Plane Chaos Tests

# Triggers: a weekly scheduled run plus manual dispatch with optional inputs.
on:
  schedule:
    # Run weekly on Sundays at 3:00 AM UTC
    - cron: '0 3 * * 0'
  workflow_dispatch:
    inputs:
      # Optional extra test filter; the test steps combine it with each
      # suite's Category filter using '&'. Empty means "no extra filter".
      test_filter:
        description: 'Test filter (e.g., FullyQualifiedName~Authority)'
        required: false
        default: ''
        type: string
      # Forwarded to `dotnet test --verbosity` and to the console logger.
      verbosity:
        description: 'Test verbosity level'
        required: false
        default: 'normal'
        type: choice
        options:
          - minimal
          - normal
          - detailed
          - diagnostic

# Workflow-wide environment for the .NET CLI. Values are quoted so they are
# strings regardless of how the YAML loader types bare `true`.
env:
  DOTNET_NOLOGO: 'true'
  DOTNET_CLI_TELEMETRY_OPTOUT: 'true'
  DOTNET_SKIP_FIRST_TIME_EXPERIENCE: 'true'

jobs:
  # Builds the chaos test project and runs both chaos suites. Each suite step
  # uses continue-on-error so the other suite, the artifact upload and the
  # summary still execute; the final "Check test results" step is what turns
  # a non-successful suite into a failed job.
  chaos-tests:
    name: Control-Plane Chaos Tests
    runs-on: ubuntu-latest
    timeout-minutes: 60
    env:
      # Single definition of the test project used by restore/build/test.
      CHAOS_TEST_PROJECT: src/__Tests/chaos/StellaOps.Chaos.ControlPlane.Tests/StellaOps.Chaos.ControlPlane.Tests.csproj
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Setup .NET
        uses: actions/setup-dotnet@v4
        with:
          dotnet-version: '10.0.x'
          dotnet-quality: 'preview'

      - name: Restore dependencies
        run: |
          dotnet restore "$CHAOS_TEST_PROJECT"

      - name: Build chaos test project
        run: |
          dotnet build "$CHAOS_TEST_PROJECT" \
            --configuration Release \
            --no-restore

      - name: Run control-plane outage tests
        id: outage-tests
        continue-on-error: true
        env:
          # Dispatch inputs are handed over as environment variables instead
          # of being expanded into the script text, so their content is never
          # interpreted as shell syntax.
          TEST_FILTER: ${{ github.event.inputs.test_filter }}
          VERBOSITY: ${{ github.event.inputs.verbosity || 'normal' }}
        run: |
          # The optional user filter is parenthesised so the Category
          # restriction applies to the whole user expression, whatever
          # operators it contains.
          dotnet test "$CHAOS_TEST_PROJECT" \
            --configuration Release \
            --no-build \
            --verbosity "$VERBOSITY" \
            --logger "trx;LogFileName=chaos-outage-results.trx" \
            --logger "console;verbosity=$VERBOSITY" \
            --results-directory ./TestResults \
            --filter "Category=ControlPlane${TEST_FILTER:+&($TEST_FILTER)}" \
            -- \
            RunConfiguration.CollectSourceInformation=true

      - name: Run partial outage tests
        id: partial-tests
        continue-on-error: true
        env:
          # Same input hand-over as the outage step above.
          TEST_FILTER: ${{ github.event.inputs.test_filter }}
          VERBOSITY: ${{ github.event.inputs.verbosity || 'normal' }}
        run: |
          dotnet test "$CHAOS_TEST_PROJECT" \
            --configuration Release \
            --no-build \
            --verbosity "$VERBOSITY" \
            --logger "trx;LogFileName=chaos-partial-results.trx" \
            --logger "console;verbosity=$VERBOSITY" \
            --results-directory ./TestResults \
            --filter "Category=PartialOutage${TEST_FILTER:+&($TEST_FILTER)}" \
            -- \
            RunConfiguration.CollectSourceInformation=true

      - name: Upload test results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: chaos-test-results
          path: ./TestResults/*.trx
          retention-days: 30

      - name: Generate chaos test summary
        if: always()
        env:
          OUTAGE_OUTCOME: ${{ steps.outage-tests.outcome }}
          PARTIAL_OUTCOME: ${{ steps.partial-tests.outcome }}
        run: |
          # Emit one table row per suite. Outcomes other than success/failure
          # (e.g. skipped because restore or build failed first) are reported
          # as "Not run" rather than as a test failure.
          suite_row() {
            case "$2" in
              success) echo "| $1 | :white_check_mark: Passed |" ;;
              failure) echo "| $1 | :x: Failed |" ;;
              *)       echo "| $1 | :warning: Not run (${2:-unknown}) |" ;;
            esac
          }

          {
            echo "## Control-Plane Chaos Test Results"
            echo ""
            echo "### Test Execution"
            echo "| Test Suite | Status |"
            echo "|------------|--------|"
            suite_row "Full Outage Tests" "$OUTAGE_OUTCOME"
            suite_row "Partial Outage Tests" "$PARTIAL_OUTCOME"
            echo ""
            echo "### Test Categories Covered"
            echo "- Authority outage and cached token validation"
            echo "- Scheduler outage and job persistence"
            echo "- Full control-plane outage and data integrity"
            echo "- Partial failure rate scenarios"
            echo "- Latency injection and degraded service handling"
            echo "- Service isolation and cascading failure prevention"
          } >> "$GITHUB_STEP_SUMMARY"

      - name: Check test results
        if: always()
        env:
          OUTAGE_OUTCOME: ${{ steps.outage-tests.outcome }}
          PARTIAL_OUTCOME: ${{ steps.partial-tests.outcome }}
        run: |
          # The suites ran with continue-on-error, so the job result is
          # decided here: anything other than two successes fails the job.
          if [ "$OUTAGE_OUTCOME" != "success" ] || [ "$PARTIAL_OUTCOME" != "success" ]; then
            echo "::error::One or more chaos test suites failed"
            exit 1
          fi
          echo "All chaos tests passed successfully"

  # Runs after chaos-tests regardless of its result and writes a report of the
  # uploaded TRX files (plus a failure notice) to the step summary.
  chaos-report:
    name: Generate Chaos Report
    runs-on: ubuntu-latest
    needs: chaos-tests
    if: always()
    steps:
      - name: Download test results
        # The artifact may be absent when chaos-tests produced no TRX files;
        # tolerate that so the report and the failure notice still run.
        continue-on-error: true
        uses: actions/download-artifact@v4
        with:
          name: chaos-test-results
          path: ./TestResults

      - name: Parse TRX results
        run: |
          {
            echo "## Chaos Test Detailed Report"
            echo ""
            echo "Test results have been uploaded as artifacts."
            echo ""
            echo "### Artifact Location"
            echo "- chaos-test-results (TRX format)"
            echo ""
            # List TRX files
            echo "### Available Result Files"
            found=0
            for file in ./TestResults/*.trx; do
              # Guards against the unexpanded glob when nothing matches.
              if [ -f "$file" ]; then
                echo "- $(basename "$file")"
                found=1
              fi
            done
            if [ "$found" -eq 0 ]; then
              echo "- _No TRX files were found_"
            fi
          } >> "$GITHUB_STEP_SUMMARY"

      - name: Notify on failure
        if: needs.chaos-tests.result == 'failure'
        run: |
          echo "::warning::Chaos tests failed. Review the test results for details."
          {
            echo ""
            echo "### :warning: Action Required"
            echo "Chaos tests have failed. Please review:"
            echo "1. Download the test artifacts for detailed results"
            echo "2. Check if failures are due to test infrastructure or actual regressions"
            echo "3. Consider running tests locally with diagnostic verbosity"
          } >> "$GITHUB_STEP_SUMMARY"