# .gitea/workflows/rollback-lag.yml
# Rollback lag measurement for deployment SLO validation
# Sprint: SPRINT_20260105_002_005_TEST_cross_cutting
# Task: CCUT-025
#
# WORKFLOW PURPOSE:
# =================
# Measures the time required to rollback a deployment and restore service health.
# This validates the rollback SLO (< 5 minutes) and provides visibility into
# deployment reversibility characteristics.
#
# The workflow performs a controlled rollback, measures timing metrics, and
# restores the original version afterward.
#
# JOBS:
#   preflight - reads the current/previous image of the deployment, resolves
#               the dry-run mode and decides whether a rollback is feasible.
#   measure   - triggers the rollback (or simulates it in dry-run mode), times
#               rollout completion and readiness, then restores the original
#               image.
#   report    - renders a Markdown report, uploads it as an artifact and fails
#               the run when the SLO was not met.

name: Rollback Lag Measurement

on:
  workflow_dispatch:
    inputs:
      environment:
        description: 'Target environment'
        required: true
        type: choice
        options:
          - staging
          - production
      deployment:
        description: 'Deployment name to test'
        required: true
        type: string
        default: 'stellaops-api'
      namespace:
        description: 'Kubernetes namespace'
        required: true
        type: string
        default: 'stellaops'
      rollback_slo_seconds:
        description: 'Rollback SLO in seconds'
        required: false
        type: number
        default: 300
      dry_run:
        description: 'Dry run (do not actually rollback)'
        required: false
        type: boolean
        default: true
  schedule:
    # Run weekly on staging to track trends
    - cron: '0 3 * * 0'

# Fallbacks used when the workflow is started by the schedule trigger, which
# carries no inputs.
env:
  DEFAULT_NAMESPACE: stellaops
  DEFAULT_DEPLOYMENT: stellaops-api
  DEFAULT_SLO: 300

jobs:
  # ===========================================================================
  # PRE-FLIGHT CHECKS
  # ===========================================================================

  preflight:
    name: Pre-Flight Checks
    runs-on: ubuntu-22.04
    environment: ${{ inputs.environment || 'staging' }}
    outputs:
      current-version: ${{ steps.current.outputs.version }}
      current-image: ${{ steps.current.outputs.image }}
      previous-version: ${{ steps.previous.outputs.version }}
      previous-image: ${{ steps.previous.outputs.image }}
      can-rollback: ${{ steps.check.outputs.can_rollback }}
      replica-count: ${{ steps.current.outputs.replicas }}
      # Normalised to the string 'true' or 'false'; consumed by later jobs.
      dry-run: ${{ steps.mode.outputs.dry_run }}

    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Resolve Dry-Run Mode
        id: mode
        env:
          RAW_DRY_RUN: ${{ inputs.dry_run }}
        run: |
          # Compare the rendered input as a string. An expression such as
          # "inputs.dry_run || 'true'" cannot be used here because a boolean
          # false is falsy and would be replaced by the fallback. Anything
          # other than an explicit "false" (including the empty value of a
          # scheduled run) keeps the safe dry-run default.
          if [ "$RAW_DRY_RUN" = "false" ]; then
            echo "dry_run=false" >> "$GITHUB_OUTPUT"
            echo "Dry run disabled: a real rollback will be performed"
          else
            echo "dry_run=true" >> "$GITHUB_OUTPUT"
            echo "Dry run enabled"
          fi

      - name: Setup kubectl
        uses: azure/setup-kubectl@v4
        with:
          version: 'latest'

      - name: Configure Kubernetes
        env:
          KUBECONFIG_B64: ${{ secrets.KUBECONFIG }}
        run: |
          echo "$KUBECONFIG_B64" | base64 -d > kubeconfig.yaml
          chmod 600 kubeconfig.yaml
          # Each step runs in its own shell, so a plain "export" would be lost.
          # Writing to GITHUB_ENV makes KUBECONFIG visible to later steps.
          echo "KUBECONFIG=$PWD/kubeconfig.yaml" >> "$GITHUB_ENV"

      - name: Get Current Deployment State
        id: current
        run: |
          NAMESPACE="${{ inputs.namespace || env.DEFAULT_NAMESPACE }}"
          DEPLOYMENT="${{ inputs.deployment || env.DEFAULT_DEPLOYMENT }}"

          # Get current image
          CURRENT_IMAGE=$(kubectl get deployment "$DEPLOYMENT" -n "$NAMESPACE" \
            -o jsonpath='{.spec.template.spec.containers[0].image}' 2>/dev/null || echo "unknown")

          # Extract version from image tag
          CURRENT_VERSION=$(echo "$CURRENT_IMAGE" | sed 's/.*://')

          # Get replica count
          REPLICAS=$(kubectl get deployment "$DEPLOYMENT" -n "$NAMESPACE" \
            -o jsonpath='{.spec.replicas}' 2>/dev/null || echo "1")

          echo "image=$CURRENT_IMAGE" >> "$GITHUB_OUTPUT"
          echo "version=$CURRENT_VERSION" >> "$GITHUB_OUTPUT"
          echo "replicas=$REPLICAS" >> "$GITHUB_OUTPUT"

          echo "Current deployment: $DEPLOYMENT"
          echo "Current image: $CURRENT_IMAGE"
          echo "Current version: $CURRENT_VERSION"
          echo "Replicas: $REPLICAS"

      - name: Get Previous Version
        id: previous
        run: |
          NAMESPACE="${{ inputs.namespace || env.DEFAULT_NAMESPACE }}"
          DEPLOYMENT="${{ inputs.deployment || env.DEFAULT_DEPLOYMENT }}"

          # Get rollout history
          HISTORY=$(kubectl rollout history deployment "$DEPLOYMENT" -n "$NAMESPACE" 2>/dev/null || echo "")

          if [ -z "$HISTORY" ]; then
            echo "version=unknown" >> "$GITHUB_OUTPUT"
            echo "image=unknown" >> "$GITHUB_OUTPUT"
            echo "No rollout history available"
            exit 0
          fi

          # Get previous revision number: the second-to-last numbered row.
          # With a single revision this resolves to the current one, which the
          # feasibility check below rejects as "same version".
          PREV_REVISION=$(echo "$HISTORY" | grep -E '^[0-9]+' | tail -2 | head -1 | awk '{print $1}')

          if [ -z "$PREV_REVISION" ]; then
            echo "version=unknown" >> "$GITHUB_OUTPUT"
            echo "image=unknown" >> "$GITHUB_OUTPUT"
            echo "No previous revision found"
            exit 0
          fi

          # Get image from previous revision
          PREV_IMAGE=$(kubectl rollout history deployment "$DEPLOYMENT" -n "$NAMESPACE" \
            --revision="$PREV_REVISION" -o jsonpath='{.spec.template.spec.containers[0].image}' 2>/dev/null || echo "unknown")

          PREV_VERSION=$(echo "$PREV_IMAGE" | sed 's/.*://')

          echo "image=$PREV_IMAGE" >> "$GITHUB_OUTPUT"
          echo "version=$PREV_VERSION" >> "$GITHUB_OUTPUT"

          echo "Previous revision: $PREV_REVISION"
          echo "Previous image: $PREV_IMAGE"
          echo "Previous version: $PREV_VERSION"

      - name: Check Rollback Feasibility
        id: check
        run: |
          CURRENT="${{ steps.current.outputs.version }}"
          PREVIOUS="${{ steps.previous.outputs.version }}"

          if [ "$PREVIOUS" = "unknown" ] || [ -z "$PREVIOUS" ]; then
            echo "can_rollback=false" >> "$GITHUB_OUTPUT"
            echo "::warning::No previous version available for rollback"
          elif [ "$CURRENT" = "$PREVIOUS" ]; then
            echo "can_rollback=false" >> "$GITHUB_OUTPUT"
            echo "::warning::Current and previous versions are the same"
          else
            echo "can_rollback=true" >> "$GITHUB_OUTPUT"
            echo "Rollback feasible: $CURRENT -> $PREVIOUS"
          fi

  # ===========================================================================
  # MEASURE ROLLBACK LAG
  # ===========================================================================

  measure:
    name: Measure Rollback Lag
    needs: preflight
    if: needs.preflight.outputs.can-rollback == 'true'
    runs-on: ubuntu-22.04
    environment: ${{ inputs.environment || 'staging' }}
    outputs:
      rollback-time: ${{ steps.timing.outputs.rollback_time }}
      health-recovery-time: ${{ steps.timing.outputs.health_time }}
      total-lag: ${{ steps.timing.outputs.total_lag }}
      slo-met: ${{ steps.timing.outputs.slo_met }}

    steps:
      - name: Setup kubectl
        uses: azure/setup-kubectl@v4
        with:
          version: 'latest'

      - name: Configure Kubernetes
        env:
          KUBECONFIG_B64: ${{ secrets.KUBECONFIG }}
        run: |
          echo "$KUBECONFIG_B64" | base64 -d > kubeconfig.yaml
          chmod 600 kubeconfig.yaml
          # Persist KUBECONFIG for the following steps (separate shells).
          echo "KUBECONFIG=$PWD/kubeconfig.yaml" >> "$GITHUB_ENV"

      - name: Record Start Time
        id: start
        run: |
          START_TIME=$(date +%s)
          echo "time=$START_TIME" >> "$GITHUB_OUTPUT"
          echo "Rollback measurement started at: $(date -u +%Y-%m-%dT%H:%M:%SZ)"

      - name: Trigger Rollback
        id: rollback
        run: |
          NAMESPACE="${{ inputs.namespace || env.DEFAULT_NAMESPACE }}"
          DEPLOYMENT="${{ inputs.deployment || env.DEFAULT_DEPLOYMENT }}"
          DRY_RUN="${{ needs.preflight.outputs.dry-run }}"

          # Anything other than an explicit "false" is treated as a dry run.
          if [ "$DRY_RUN" != "false" ]; then
            echo "DRY RUN: Would execute rollback"
            echo "kubectl rollout undo deployment/$DEPLOYMENT -n $NAMESPACE"
            ROLLBACK_TIME=$(date +%s)
          else
            echo "Executing rollback..."
            kubectl rollout undo deployment/"$DEPLOYMENT" -n "$NAMESPACE"
            ROLLBACK_TIME=$(date +%s)
            # Marks that the cluster was really changed; the restore step keys
            # off this so it also runs when a later step fails.
            echo "executed=true" >> "$GITHUB_OUTPUT"
          fi

          echo "time=$ROLLBACK_TIME" >> "$GITHUB_OUTPUT"

      - name: Wait for Rollout Complete
        id: rollout
        run: |
          NAMESPACE="${{ inputs.namespace || env.DEFAULT_NAMESPACE }}"
          DEPLOYMENT="${{ inputs.deployment || env.DEFAULT_DEPLOYMENT }}"
          DRY_RUN="${{ needs.preflight.outputs.dry-run }}"

          if [ "$DRY_RUN" != "false" ]; then
            echo "DRY RUN: Simulating rollout wait"
            sleep 5
            ROLLOUT_COMPLETE_TIME=$(date +%s)
          else
            echo "Waiting for rollout to complete..."
            kubectl rollout status deployment/"$DEPLOYMENT" -n "$NAMESPACE" --timeout=600s
            ROLLOUT_COMPLETE_TIME=$(date +%s)
          fi

          echo "time=$ROLLOUT_COMPLETE_TIME" >> "$GITHUB_OUTPUT"

      - name: Wait for Health Recovery
        id: health
        run: |
          NAMESPACE="${{ inputs.namespace || env.DEFAULT_NAMESPACE }}"
          DEPLOYMENT="${{ inputs.deployment || env.DEFAULT_DEPLOYMENT }}"
          DRY_RUN="${{ needs.preflight.outputs.dry-run }}"
          REPLICAS="${{ needs.preflight.outputs.replica-count }}"

          if [ "$DRY_RUN" != "false" ]; then
            echo "DRY RUN: Simulating health check"
            sleep 3
            RECOVERED="true"
            HEALTH_TIME=$(date +%s)
          else
            echo "Waiting for health checks to pass..."

            # Wait for all pods to be ready
            MAX_WAIT=300
            WAITED=0
            RECOVERED="false"

            while [ "$WAITED" -lt "$MAX_WAIT" ]; do
              READY=$(kubectl get deployment "$DEPLOYMENT" -n "$NAMESPACE" \
                -o jsonpath='{.status.readyReplicas}' 2>/dev/null || echo "0")
              # The field is omitted while no replica is ready, which yields an
              # empty string rather than a kubectl failure.
              READY="${READY:-0}"

              if [ "$READY" = "$REPLICAS" ]; then
                echo "All $READY replicas are ready"
                RECOVERED="true"
                break
              fi

              echo "Ready: $READY / $REPLICAS (waited ${WAITED}s)"
              sleep 5
              WAITED=$((WAITED + 5))
            done

            if [ "$RECOVERED" != "true" ]; then
              echo "::warning::Deployment did not reach $REPLICAS ready replicas within ${MAX_WAIT}s"
            fi

            HEALTH_TIME=$(date +%s)
          fi

          echo "recovered=$RECOVERED" >> "$GITHUB_OUTPUT"
          echo "time=$HEALTH_TIME" >> "$GITHUB_OUTPUT"

      - name: Calculate Timing Metrics
        id: timing
        run: |
          START_TIME=${{ steps.start.outputs.time }}
          ROLLBACK_TIME=${{ steps.rollback.outputs.time }}
          ROLLOUT_TIME=${{ steps.rollout.outputs.time }}
          HEALTH_TIME=${{ steps.health.outputs.time }}
          RECOVERED="${{ steps.health.outputs.recovered }}"
          SLO_SECONDS="${{ inputs.rollback_slo_seconds || env.DEFAULT_SLO }}"

          # Calculate durations
          ROLLBACK_DURATION=$((ROLLOUT_TIME - ROLLBACK_TIME))
          HEALTH_DURATION=$((HEALTH_TIME - ROLLOUT_TIME))
          TOTAL_LAG=$((HEALTH_TIME - START_TIME))

          # Check SLO. A health wait that timed out never counts as meeting the
          # SLO, even when the configured SLO is longer than the wait limit.
          if [ "$RECOVERED" = "true" ] && [ "$TOTAL_LAG" -le "$SLO_SECONDS" ]; then
            SLO_MET="true"
          else
            SLO_MET="false"
          fi

          echo "rollback_time=$ROLLBACK_DURATION" >> "$GITHUB_OUTPUT"
          echo "health_time=$HEALTH_DURATION" >> "$GITHUB_OUTPUT"
          echo "total_lag=$TOTAL_LAG" >> "$GITHUB_OUTPUT"
          echo "slo_met=$SLO_MET" >> "$GITHUB_OUTPUT"

          echo "=== Rollback Timing Metrics ==="
          echo "Rollback execution: ${ROLLBACK_DURATION}s"
          echo "Health recovery: ${HEALTH_DURATION}s"
          echo "Total lag: ${TOTAL_LAG}s"
          echo "SLO (${SLO_SECONDS}s): $SLO_MET"

      - name: Restore Original Version
        # always(): restore even when the rollout wait or a later step failed,
        # but only if this run actually executed a rollback.
        if: always() && steps.rollback.outputs.executed == 'true'
        run: |
          NAMESPACE="${{ inputs.namespace || env.DEFAULT_NAMESPACE }}"
          DEPLOYMENT="${{ inputs.deployment || env.DEFAULT_DEPLOYMENT }}"
          ORIGINAL_IMAGE="${{ needs.preflight.outputs.current-image }}"

          # Target the same container (index 0) whose image preflight recorded;
          # fall back to the deployment name if the lookup yields nothing.
          CONTAINER=$(kubectl get deployment "$DEPLOYMENT" -n "$NAMESPACE" \
            -o jsonpath='{.spec.template.spec.containers[0].name}' 2>/dev/null || echo "")
          CONTAINER="${CONTAINER:-$DEPLOYMENT}"

          echo "Restoring original version: $ORIGINAL_IMAGE"
          # NOTE(review): this restores only the image of one container; other
          # pod-template differences between the two revisions are not
          # reapplied. Confirm that is acceptable for this deployment.
          kubectl set image deployment/"$DEPLOYMENT" \
            "$CONTAINER"="$ORIGINAL_IMAGE" \
            -n "$NAMESPACE"

          kubectl rollout status deployment/"$DEPLOYMENT" -n "$NAMESPACE" --timeout=600s
          echo "Original version restored"

  # ===========================================================================
  # GENERATE REPORT
  # ===========================================================================

  report:
    name: Generate Report
    needs: [preflight, measure]
    # Runs even when measure was skipped or failed, as long as preflight passed.
    if: always() && needs.preflight.result == 'success'
    runs-on: ubuntu-22.04

    steps:
      - name: Generate Report
        run: |
          SLO_SECONDS="${{ inputs.rollback_slo_seconds || env.DEFAULT_SLO }}"
          ROLLBACK_TIME="${{ needs.measure.outputs.rollback-time || 'N/A' }}"
          HEALTH_TIME="${{ needs.measure.outputs.health-recovery-time || 'N/A' }}"
          TOTAL_LAG="${{ needs.measure.outputs.total-lag || 'N/A' }}"
          SLO_MET="${{ needs.measure.outputs.slo-met || 'unknown' }}"

          # Append the unit only to real measurements so a missing value is
          # rendered as "N/A" rather than "N/As".
          fmt_seconds() {
            if [ "$1" = "N/A" ]; then
              echo "N/A"
            else
              echo "${1}s"
            fi
          }

          if [ "$SLO_MET" = "true" ]; then
            STATUS=":white_check_mark: PASSED"
          elif [ "$SLO_MET" = "false" ]; then
            STATUS=":x: FAILED"
          else
            STATUS=":grey_question: UNKNOWN"
          fi

          cat > rollback-lag-report.md << EOF
          ## Rollback Lag Measurement Report

          **Environment:** ${{ inputs.environment || 'staging' }}
          **Deployment:** ${{ inputs.deployment || env.DEFAULT_DEPLOYMENT }}
          **Dry Run:** ${{ needs.preflight.outputs.dry-run }}

          ### Version Information

          | Version | Image |
          |---------|-------|
          | Current | \`${{ needs.preflight.outputs.current-version }}\` |
          | Previous | \`${{ needs.preflight.outputs.previous-version }}\` |

          ### Timing Metrics

          | Metric | Value | SLO |
          |--------|-------|-----|
          | Rollback Execution | $(fmt_seconds "$ROLLBACK_TIME") | - |
          | Health Recovery | $(fmt_seconds "$HEALTH_TIME") | - |
          | **Total Lag** | **$(fmt_seconds "$TOTAL_LAG")** | < ${SLO_SECONDS}s |

          ### SLO Status: ${STATUS}

          ---

          *Report generated at $(date -u +%Y-%m-%dT%H:%M:%SZ)*

          ### Measurement Details

          - Can Rollback: ${{ needs.preflight.outputs.can-rollback }}
          - Replica Count: ${{ needs.preflight.outputs.replica-count }}
          - Current Image: \`${{ needs.preflight.outputs.current-image }}\`
          - Previous Image: \`${{ needs.preflight.outputs.previous-image }}\`
          EOF

          cat rollback-lag-report.md

          # Add to job summary
          cat rollback-lag-report.md >> "$GITHUB_STEP_SUMMARY"

      - name: Upload Report
        uses: actions/upload-artifact@v4
        with:
          name: rollback-lag-report
          path: rollback-lag-report.md

      - name: Check SLO and Fail if Exceeded
        if: needs.measure.outputs.slo-met == 'false'
        run: |
          TOTAL_LAG="${{ needs.measure.outputs.total-lag }}"
          SLO_SECONDS="${{ inputs.rollback_slo_seconds || env.DEFAULT_SLO }}"
          echo "::error::Rollback took ${TOTAL_LAG}s, exceeds SLO of ${SLO_SECONDS}s"
          exit 1