# -----------------------------------------------------------------------------
# control-plane-chaos.yml
# Sprint: Testing Enhancement Advisory - Phase 3.3
# Description: CI workflow for control-plane outage chaos tests
# Schedule: Weekly (chaos tests are intensive)
#
# Jobs:
#   chaos-tests  - restores, builds and runs the ControlPlane and PartialOutage
#                  test categories, uploads the TRX results and fails the job
#                  if either suite did not succeed.
#   chaos-report - always runs afterwards; lists the uploaded TRX files in the
#                  step summary and adds a call to action when tests failed.
# -----------------------------------------------------------------------------

name: Control-Plane Chaos Tests

on:
  schedule:
    # Run weekly on Sundays at 3:00 AM UTC
    - cron: '0 3 * * 0'
  workflow_dispatch:
    inputs:
      test_filter:
        description: 'Test filter (e.g., FullyQualifiedName~Authority)'
        required: false
        default: ''
      verbosity:
        description: 'Test verbosity level'
        required: false
        default: 'normal'
        type: choice
        options:
          - minimal
          - normal
          - detailed
          - diagnostic

env:
  DOTNET_NOLOGO: true
  DOTNET_CLI_TELEMETRY_OPTOUT: true
  DOTNET_SKIP_FIRST_TIME_EXPERIENCE: true

jobs:
  chaos-tests:
    name: Control-Plane Chaos Tests
    runs-on: ubuntu-latest
    timeout-minutes: 60
    env:
      # Single source of truth for the project every dotnet step operates on.
      CHAOS_TEST_PROJECT: src/__Tests/chaos/StellaOps.Chaos.ControlPlane.Tests/StellaOps.Chaos.ControlPlane.Tests.csproj
      # Dispatch inputs are handed to the scripts as environment variables
      # instead of being expanded inside the script text, so their content can
      # never be interpreted as shell syntax. Both are empty on scheduled runs,
      # hence the 'normal' fallback for verbosity.
      TEST_FILTER: ${{ github.event.inputs.test_filter }}
      TEST_VERBOSITY: ${{ github.event.inputs.verbosity || 'normal' }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Setup .NET
        uses: actions/setup-dotnet@v4
        with:
          dotnet-version: '10.0.x'
          dotnet-quality: 'preview'

      - name: Restore dependencies
        run: |
          dotnet restore "$CHAOS_TEST_PROJECT"

      - name: Build chaos test project
        run: |
          dotnet build "$CHAOS_TEST_PROJECT" \
            --configuration Release \
            --no-restore

      # continue-on-error lets the second suite, the upload and the summary run
      # even when this suite fails; the "Check test results" step below turns
      # the recorded outcome back into a job failure.
      - name: Run control-plane outage tests
        id: outage-tests
        continue-on-error: true
        run: |
          # The optional user filter is AND-ed with the category and wrapped in
          # parentheses so an expression containing '|' cannot widen the run
          # beyond the ControlPlane category.
          dotnet test "$CHAOS_TEST_PROJECT" \
            --configuration Release \
            --no-build \
            --verbosity "$TEST_VERBOSITY" \
            --logger "trx;LogFileName=chaos-outage-results.trx" \
            --logger "console;verbosity=$TEST_VERBOSITY" \
            --results-directory ./TestResults \
            --filter "Category=ControlPlane${TEST_FILTER:+&(${TEST_FILTER})}" \
            -- \
            RunConfiguration.CollectSourceInformation=true

      - name: Run partial outage tests
        id: partial-tests
        continue-on-error: true
        run: |
          # Same filter composition as the outage suite, for the PartialOutage
          # category.
          dotnet test "$CHAOS_TEST_PROJECT" \
            --configuration Release \
            --no-build \
            --verbosity "$TEST_VERBOSITY" \
            --logger "trx;LogFileName=chaos-partial-results.trx" \
            --logger "console;verbosity=$TEST_VERBOSITY" \
            --results-directory ./TestResults \
            --filter "Category=PartialOutage${TEST_FILTER:+&(${TEST_FILTER})}" \
            -- \
            RunConfiguration.CollectSourceInformation=true

      - name: Upload test results
        uses: actions/upload-artifact@v4
        if: always()
        with:
          name: chaos-test-results
          path: ./TestResults/*.trx
          retention-days: 30

      - name: Generate chaos test summary
        if: always()
        run: |
          # Everything echoed inside the group is appended to the step summary.
          {
            echo "## Control-Plane Chaos Test Results"
            echo ""
            echo "### Test Execution"
            echo "| Test Suite | Status |"
            echo "|------------|--------|"

            # 'outcome' is the step result before continue-on-error is applied.
            if [ "${{ steps.outage-tests.outcome }}" == "success" ]; then
              echo "| Full Outage Tests | :white_check_mark: Passed |"
            else
              echo "| Full Outage Tests | :x: Failed |"
            fi

            if [ "${{ steps.partial-tests.outcome }}" == "success" ]; then
              echo "| Partial Outage Tests | :white_check_mark: Passed |"
            else
              echo "| Partial Outage Tests | :x: Failed |"
            fi

            echo ""
            echo "### Test Categories Covered"
            echo "- Authority outage and cached token validation"
            echo "- Scheduler outage and job persistence"
            echo "- Full control-plane outage and data integrity"
            echo "- Partial failure rate scenarios"
            echo "- Latency injection and degraded service handling"
            echo "- Service isolation and cascading failure prevention"
          } >> "$GITHUB_STEP_SUMMARY"

      # Gate step: fails the job unless both suites reported success.
      - name: Check test results
        if: always()
        run: |
          if [ "${{ steps.outage-tests.outcome }}" != "success" ] || [ "${{ steps.partial-tests.outcome }}" != "success" ]; then
            echo "::error::One or more chaos test suites failed"
            exit 1
          fi
          echo "All chaos tests passed successfully"

  chaos-report:
    name: Generate Chaos Report
    runs-on: ubuntu-latest
    needs: chaos-tests
    if: always()
    steps:
      # The artifact does not exist when the test job produced no TRX files
      # (for example when restore or build failed). Tolerate that so the
      # remaining report steps, including the failure notice, still run.
      - name: Download test results
        uses: actions/download-artifact@v4
        continue-on-error: true
        with:
          name: chaos-test-results
          path: ./TestResults

      - name: Parse TRX results
        run: |
          {
            echo "## Chaos Test Detailed Report"
            echo ""
            echo "Test results have been uploaded as artifacts."
            echo ""
            echo "### Artifact Location"
            echo "- chaos-test-results (TRX format)"
            echo ""

            # List TRX files; an unmatched glob stays literal, so the -f test
            # filters it out when nothing was downloaded.
            echo "### Available Result Files"
            found=0
            for file in ./TestResults/*.trx; do
              if [ -f "$file" ]; then
                echo "- $(basename "$file")"
                found=1
              fi
            done
            if [ "$found" -eq 0 ]; then
              echo "- _No TRX result files were found._"
            fi
          } >> "$GITHUB_STEP_SUMMARY"

      # Evaluated independently of earlier step results in this job so the
      # notice is not lost when a previous report step fails.
      - name: Notify on failure
        if: ${{ !cancelled() && needs.chaos-tests.result == 'failure' }}
        run: |
          echo "::warning::Chaos tests failed. Review the test results for details."
          {
            echo ""
            echo "### :warning: Action Required"
            echo "Chaos tests have failed. Please review:"
            echo "1. Download the test artifacts for detailed results"
            echo "2. Check if failures are due to test infrastructure or actual regressions"
            echo "3. Consider running tests locally with diagnostic verbosity"
          } >> "$GITHUB_STEP_SUMMARY"