Files
git.stella-ops.org/.gitea/workflows/rollback-lag.yml
2026-01-08 08:54:27 +02:00

405 lines
14 KiB
YAML

# .gitea/workflows/rollback-lag.yml
# Rollback lag measurement for deployment SLO validation
# Sprint: SPRINT_20260105_002_005_TEST_cross_cutting
# Task: CCUT-025
#
# WORKFLOW PURPOSE:
# =================
# Measures the time required to roll back a deployment and restore service health.
# This validates the rollback SLO (< 5 minutes) and provides visibility into
# deployment reversibility characteristics.
#
# The workflow performs a controlled rollback, measures timing metrics, and
# restores the original version afterward.
name: Rollback Lag Measurement

# Two triggers: a manual dispatch with explicit parameters, and a weekly cron.
on:
  workflow_dispatch:
    inputs:
      # Also selects the deployment environment the jobs run under.
      environment:
        description: "Target environment"
        required: true
        type: choice
        options:
          - staging
          - production
      deployment:
        description: "Deployment name to test"
        required: true
        type: string
        default: "stellaops-api"
      namespace:
        description: "Kubernetes namespace"
        required: true
        type: string
        default: "stellaops"
      # Upper bound, in seconds, that the measured total lag is compared with.
      rollback_slo_seconds:
        description: "Rollback SLO in seconds"
        required: false
        type: number
        default: 300
      # Defaults to true so a manual run does not touch the cluster unless
      # the operator explicitly opts in.
      dry_run:
        description: "Dry run (do not actually rollback)"
        required: false
        type: boolean
        default: true
  schedule:
    # Run weekly on staging to track trends (03:00 every Sunday)
    - cron: "0 3 * * 0"

# Fallback values for runs that carry no inputs (the scheduled trigger).
env:
  DEFAULT_NAMESPACE: stellaops
  DEFAULT_DEPLOYMENT: stellaops-api
  DEFAULT_SLO: 300
jobs:
  # ===========================================================================
  # PRE-FLIGHT CHECKS
  # ===========================================================================
  # Reads the live deployment (image, first container name, replica count) and
  # the second-newest entry of its rollout history, then decides whether a
  # rollback to a different version is possible.
  preflight:
    name: Pre-Flight Checks
    runs-on: ${{ vars.LINUX_RUNNER_LABEL || 'ubuntu-latest' }}
    environment: ${{ inputs.environment || 'staging' }}
    env:
      # Inputs reach the scripts as environment variables instead of being
      # spliced into the script text, so their content is never parsed as shell.
      # Both are empty on scheduled runs; scripts then use the DEFAULT_* values.
      REQUESTED_NAMESPACE: ${{ inputs.namespace }}
      REQUESTED_DEPLOYMENT: ${{ inputs.deployment }}
    outputs:
      current-version: ${{ steps.current.outputs.version }}
      current-image: ${{ steps.current.outputs.image }}
      current-container: ${{ steps.current.outputs.container }}
      previous-version: ${{ steps.previous.outputs.version }}
      previous-image: ${{ steps.previous.outputs.image }}
      can-rollback: ${{ steps.check.outputs.can_rollback }}
      replica-count: ${{ steps.current.outputs.replicas }}
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Setup kubectl
        uses: azure/setup-kubectl@v4
        with:
          version: 'latest'

      - name: Configure Kubernetes
        env:
          KUBECONFIG_B64: ${{ secrets.KUBECONFIG }}
        run: |
          # A plain `export` only lasts for this step's shell. The path is
          # published through GITHUB_ENV so every later kubectl step sees it.
          KUBECONFIG_PATH="${RUNNER_TEMP:-/tmp}/kubeconfig.yaml"
          # umask keeps the credentials file private to the runner user.
          (umask 077 && printf '%s' "$KUBECONFIG_B64" | base64 -d > "$KUBECONFIG_PATH")
          echo "KUBECONFIG=$KUBECONFIG_PATH" >> "$GITHUB_ENV"

      - name: Get Current Deployment State
        id: current
        run: |
          NAMESPACE="${REQUESTED_NAMESPACE:-$DEFAULT_NAMESPACE}"
          DEPLOYMENT="${REQUESTED_DEPLOYMENT:-$DEFAULT_DEPLOYMENT}"

          # Lookups stay best-effort (a failure yields a placeholder), but
          # kubectl's stderr is left visible so the cause shows up in the log.
          CURRENT_IMAGE=$(kubectl get deployment "$DEPLOYMENT" -n "$NAMESPACE" \
            -o jsonpath='{.spec.template.spec.containers[0].image}' || echo "unknown")

          # Name of the same container; the measure job needs it to restore
          # the image afterwards.
          CURRENT_CONTAINER=$(kubectl get deployment "$DEPLOYMENT" -n "$NAMESPACE" \
            -o jsonpath='{.spec.template.spec.containers[0].name}' || echo "")

          # Version is whatever follows the last ':' of the image reference.
          CURRENT_VERSION="${CURRENT_IMAGE##*:}"

          REPLICAS=$(kubectl get deployment "$DEPLOYMENT" -n "$NAMESPACE" \
            -o jsonpath='{.spec.replicas}' || echo "1")

          {
            echo "image=$CURRENT_IMAGE"
            echo "container=$CURRENT_CONTAINER"
            echo "version=$CURRENT_VERSION"
            echo "replicas=$REPLICAS"
          } >> "$GITHUB_OUTPUT"

          echo "Current deployment: $DEPLOYMENT"
          echo "Current image: $CURRENT_IMAGE"
          echo "Current version: $CURRENT_VERSION"
          echo "Replicas: $REPLICAS"

      - name: Get Previous Version
        id: previous
        run: |
          NAMESPACE="${REQUESTED_NAMESPACE:-$DEFAULT_NAMESPACE}"
          DEPLOYMENT="${REQUESTED_DEPLOYMENT:-$DEFAULT_DEPLOYMENT}"

          # Get rollout history
          HISTORY=$(kubectl rollout history deployment "$DEPLOYMENT" -n "$NAMESPACE" || echo "")
          if [ -z "$HISTORY" ]; then
            echo "version=unknown" >> "$GITHUB_OUTPUT"
            echo "image=unknown" >> "$GITHUB_OUTPUT"
            echo "No rollout history available"
            exit 0
          fi

          # Second-to-last numbered row. With a single revision this resolves
          # to the current one, which the feasibility check then rejects.
          PREV_REVISION=$(echo "$HISTORY" | grep -E '^[0-9]+' | tail -2 | head -1 | awk '{print $1}')
          if [ -z "$PREV_REVISION" ]; then
            echo "version=unknown" >> "$GITHUB_OUTPUT"
            echo "image=unknown" >> "$GITHUB_OUTPUT"
            echo "No previous revision found"
            exit 0
          fi

          # Get image from previous revision
          PREV_IMAGE=$(kubectl rollout history deployment "$DEPLOYMENT" -n "$NAMESPACE" \
            --revision="$PREV_REVISION" \
            -o jsonpath='{.spec.template.spec.containers[0].image}' || echo "unknown")
          PREV_VERSION="${PREV_IMAGE##*:}"

          echo "image=$PREV_IMAGE" >> "$GITHUB_OUTPUT"
          echo "version=$PREV_VERSION" >> "$GITHUB_OUTPUT"

          echo "Previous revision: $PREV_REVISION"
          echo "Previous image: $PREV_IMAGE"
          echo "Previous version: $PREV_VERSION"

      - name: Check Rollback Feasibility
        id: check
        env:
          CURRENT: ${{ steps.current.outputs.version }}
          PREVIOUS: ${{ steps.previous.outputs.version }}
        run: |
          if [ "$PREVIOUS" = "unknown" ] || [ -z "$PREVIOUS" ]; then
            echo "can_rollback=false" >> "$GITHUB_OUTPUT"
            echo "::warning::No previous version available for rollback"
          elif [ "$CURRENT" = "$PREVIOUS" ]; then
            echo "can_rollback=false" >> "$GITHUB_OUTPUT"
            echo "::warning::Current and previous versions are the same"
          else
            echo "can_rollback=true" >> "$GITHUB_OUTPUT"
            echo "Rollback feasible: $CURRENT -> $PREVIOUS"
          fi

  # ===========================================================================
  # MEASURE ROLLBACK LAG
  # ===========================================================================
  # Timestamps four points (start, rollback issued, rollout complete, all
  # replicas ready), derives the durations, compares the total with the SLO and
  # finally puts the original image back.
  measure:
    name: Measure Rollback Lag
    needs: preflight
    if: needs.preflight.outputs.can-rollback == 'true'
    runs-on: ${{ vars.LINUX_RUNNER_LABEL || 'ubuntu-latest' }}
    environment: ${{ inputs.environment || 'staging' }}
    env:
      REQUESTED_NAMESPACE: ${{ inputs.namespace }}
      REQUESTED_DEPLOYMENT: ${{ inputs.deployment }}
      # Single source of truth for dry-run mode. `inputs.dry_run || 'true'`
      # cannot be used here: a false input is falsy, so `||` would turn it back
      # into 'true'. Only an explicit false on a manual dispatch disables dry
      # run; scheduled runs (no inputs) stay dry.
      DRY_RUN: >-
        ${{ (github.event_name == 'workflow_dispatch'
        && format('{0}', inputs.dry_run) == 'false')
        && 'false' || 'true' }}
    outputs:
      rollback-time: ${{ steps.timing.outputs.rollback_time }}
      health-recovery-time: ${{ steps.timing.outputs.health_time }}
      total-lag: ${{ steps.timing.outputs.total_lag }}
      slo-met: ${{ steps.timing.outputs.slo_met }}
      healthy: ${{ steps.health.outputs.healthy }}
    steps:
      - name: Setup kubectl
        uses: azure/setup-kubectl@v4
        with:
          version: 'latest'

      - name: Configure Kubernetes
        env:
          KUBECONFIG_B64: ${{ secrets.KUBECONFIG }}
        run: |
          # Published through GITHUB_ENV because an `export` would be gone by
          # the time the next step starts.
          KUBECONFIG_PATH="${RUNNER_TEMP:-/tmp}/kubeconfig.yaml"
          (umask 077 && printf '%s' "$KUBECONFIG_B64" | base64 -d > "$KUBECONFIG_PATH")
          echo "KUBECONFIG=$KUBECONFIG_PATH" >> "$GITHUB_ENV"

      - name: Record Start Time
        id: start
        run: |
          echo "time=$(date +%s)" >> "$GITHUB_OUTPUT"
          echo "Rollback measurement started at: $(date -u +%Y-%m-%dT%H:%M:%SZ)"

      - name: Trigger Rollback
        id: rollback
        run: |
          NAMESPACE="${REQUESTED_NAMESPACE:-$DEFAULT_NAMESPACE}"
          DEPLOYMENT="${REQUESTED_DEPLOYMENT:-$DEFAULT_DEPLOYMENT}"

          if [ "$DRY_RUN" = "true" ]; then
            echo "DRY RUN: Would execute rollback"
            echo "kubectl rollout undo deployment/$DEPLOYMENT -n $NAMESPACE"
          else
            echo "Executing rollback..."
            kubectl rollout undo deployment/"$DEPLOYMENT" -n "$NAMESPACE"
          fi
          echo "time=$(date +%s)" >> "$GITHUB_OUTPUT"

      - name: Wait for Rollout Complete
        id: rollout
        run: |
          NAMESPACE="${REQUESTED_NAMESPACE:-$DEFAULT_NAMESPACE}"
          DEPLOYMENT="${REQUESTED_DEPLOYMENT:-$DEFAULT_DEPLOYMENT}"

          if [ "$DRY_RUN" = "true" ]; then
            echo "DRY RUN: Simulating rollout wait"
            sleep 5
          else
            echo "Waiting for rollout to complete..."
            kubectl rollout status deployment/"$DEPLOYMENT" -n "$NAMESPACE" --timeout=600s
          fi
          echo "time=$(date +%s)" >> "$GITHUB_OUTPUT"

      - name: Wait for Health Recovery
        id: health
        env:
          REPLICAS: ${{ needs.preflight.outputs.replica-count }}
        run: |
          NAMESPACE="${REQUESTED_NAMESPACE:-$DEFAULT_NAMESPACE}"
          DEPLOYMENT="${REQUESTED_DEPLOYMENT:-$DEFAULT_DEPLOYMENT}"
          HEALTHY="true"

          if [ "$DRY_RUN" = "true" ]; then
            echo "DRY RUN: Simulating health check"
            sleep 3
          else
            echo "Waiting for health checks to pass..."
            # Poll every 5s, for at most MAX_WAIT seconds, until the ready
            # replica count matches what preflight recorded.
            MAX_WAIT=300
            WAITED=0
            HEALTHY="false"
            while [ "$WAITED" -lt "$MAX_WAIT" ]; do
              READY=$(kubectl get deployment "$DEPLOYMENT" -n "$NAMESPACE" \
                -o jsonpath='{.status.readyReplicas}' 2>/dev/null || echo "0")
              # The query prints nothing when the field is not set.
              READY="${READY:-0}"
              if [ "$READY" = "$REPLICAS" ]; then
                echo "All $READY replicas are ready"
                HEALTHY="true"
                break
              fi
              echo "Ready: $READY / $REPLICAS (waited ${WAITED}s)"
              sleep 5
              WAITED=$((WAITED + 5))
            done
            # A timeout must not be recorded as a successful recovery.
            if [ "$HEALTHY" != "true" ]; then
              echo "::error::Deployment did not reach $REPLICAS ready replicas within ${MAX_WAIT}s"
            fi
          fi

          echo "time=$(date +%s)" >> "$GITHUB_OUTPUT"
          echo "healthy=$HEALTHY" >> "$GITHUB_OUTPUT"

      - name: Calculate Timing Metrics
        id: timing
        env:
          START_TIME: ${{ steps.start.outputs.time }}
          ROLLBACK_TIME: ${{ steps.rollback.outputs.time }}
          ROLLOUT_TIME: ${{ steps.rollout.outputs.time }}
          HEALTH_TIME: ${{ steps.health.outputs.time }}
          HEALTHY: ${{ steps.health.outputs.healthy }}
          REQUESTED_SLO: ${{ inputs.rollback_slo_seconds }}
        run: |
          SLO_SECONDS="${REQUESTED_SLO:-$DEFAULT_SLO}"

          # Calculate durations
          ROLLBACK_DURATION=$((ROLLOUT_TIME - ROLLBACK_TIME))
          HEALTH_DURATION=$((HEALTH_TIME - ROLLOUT_TIME))
          TOTAL_LAG=$((HEALTH_TIME - START_TIME))

          # The SLO is met only if the service recovered AND did so in time.
          if [ "$HEALTHY" = "true" ] && [ "$TOTAL_LAG" -le "$SLO_SECONDS" ]; then
            SLO_MET="true"
          else
            SLO_MET="false"
          fi

          {
            echo "rollback_time=$ROLLBACK_DURATION"
            echo "health_time=$HEALTH_DURATION"
            echo "total_lag=$TOTAL_LAG"
            echo "slo_met=$SLO_MET"
          } >> "$GITHUB_OUTPUT"

          echo "=== Rollback Timing Metrics ==="
          echo "Rollback execution: ${ROLLBACK_DURATION}s"
          echo "Health recovery: ${HEALTH_DURATION}s"
          echo "Total lag: ${TOTAL_LAG}s"
          echo "SLO (${SLO_SECONDS}s): $SLO_MET"

      - name: Restore Original Version
        # always(): the original image must come back even when a wait step
        # above failed. Runs only after a real rollback was actually issued.
        if: ${{ always() && env.DRY_RUN == 'false' && steps.rollback.outcome == 'success' }}
        env:
          ORIGINAL_IMAGE: ${{ needs.preflight.outputs.current-image }}
          ORIGINAL_CONTAINER: ${{ needs.preflight.outputs.current-container }}
        run: |
          NAMESPACE="${REQUESTED_NAMESPACE:-$DEFAULT_NAMESPACE}"
          DEPLOYMENT="${REQUESTED_DEPLOYMENT:-$DEFAULT_DEPLOYMENT}"
          # Use the container name preflight read from the cluster; fall back
          # to the deployment name if it could not be determined.
          CONTAINER="${ORIGINAL_CONTAINER:-$DEPLOYMENT}"

          echo "Restoring original version: $ORIGINAL_IMAGE"
          # NOTE(review): this restores only the first container's image, not
          # any other pod-template fields that differ between revisions.
          kubectl set image deployment/"$DEPLOYMENT" \
            "$CONTAINER=$ORIGINAL_IMAGE" \
            -n "$NAMESPACE"
          kubectl rollout status deployment/"$DEPLOYMENT" -n "$NAMESPACE" --timeout=600s
          echo "Original version restored"

  # ===========================================================================
  # GENERATE REPORT
  # ===========================================================================
  # Renders the measurements as markdown (job summary + artifact) and fails the
  # run when the measure job reported the SLO as missed.
  report:
    name: Generate Report
    needs: [preflight, measure]
    if: always() && needs.preflight.result == 'success'
    runs-on: ${{ vars.LINUX_RUNNER_LABEL || 'ubuntu-latest' }}
    env:
      REQUESTED_SLO: ${{ inputs.rollback_slo_seconds }}
      TOTAL_LAG: ${{ needs.measure.outputs.total-lag }}
      SLO_MET: ${{ needs.measure.outputs.slo-met }}
      HEALTHY: ${{ needs.measure.outputs.healthy }}
    steps:
      - name: Generate Report
        env:
          TARGET_ENVIRONMENT: ${{ inputs.environment || 'staging' }}
          REQUESTED_DEPLOYMENT: ${{ inputs.deployment }}
          # Same rule as the measure job: only an explicit false on a manual
          # dispatch counts as a real run.
          DRY_RUN: >-
            ${{ (github.event_name == 'workflow_dispatch'
            && format('{0}', inputs.dry_run) == 'false')
            && 'false' || 'true' }}
          ROLLBACK_TIME: ${{ needs.measure.outputs.rollback-time }}
          HEALTH_RECOVERY_TIME: ${{ needs.measure.outputs.health-recovery-time }}
          CURRENT_VERSION: ${{ needs.preflight.outputs.current-version }}
          PREVIOUS_VERSION: ${{ needs.preflight.outputs.previous-version }}
          CURRENT_IMAGE: ${{ needs.preflight.outputs.current-image }}
          PREVIOUS_IMAGE: ${{ needs.preflight.outputs.previous-image }}
          CAN_ROLLBACK: ${{ needs.preflight.outputs.can-rollback }}
          REPLICA_COUNT: ${{ needs.preflight.outputs.replica-count }}
        run: |
          SLO_SECONDS="${REQUESTED_SLO:-$DEFAULT_SLO}"
          DEPLOYMENT="${REQUESTED_DEPLOYMENT:-$DEFAULT_DEPLOYMENT}"

          # Append the unit only to real measurements, so a skipped measure
          # job renders as "N/A" rather than "N/As".
          seconds_or_na() {
            if [ -n "$1" ]; then
              echo "${1}s"
            else
              echo "N/A"
            fi
          }

          case "${SLO_MET:-unknown}" in
            true) STATUS=":white_check_mark: PASSED" ;;
            false) STATUS=":x: FAILED" ;;
            *) STATUS=":grey_question: UNKNOWN" ;;
          esac

          cat > rollback-lag-report.md << EOF
          ## Rollback Lag Measurement Report

          **Environment:** ${TARGET_ENVIRONMENT}
          **Deployment:** ${DEPLOYMENT}
          **Dry Run:** ${DRY_RUN}

          ### Version Information

          | Revision | Version |
          |----------|---------|
          | Current | \`${CURRENT_VERSION}\` |
          | Previous | \`${PREVIOUS_VERSION}\` |

          ### Timing Metrics

          | Metric | Value | SLO |
          |--------|-------|-----|
          | Rollback Execution | $(seconds_or_na "$ROLLBACK_TIME") | - |
          | Health Recovery | $(seconds_or_na "$HEALTH_RECOVERY_TIME") | - |
          | **Total Lag** | **$(seconds_or_na "$TOTAL_LAG")** | < ${SLO_SECONDS}s |

          ### SLO Status: ${STATUS}

          ---
          *Report generated at $(date -u +%Y-%m-%dT%H:%M:%SZ)*

          <details>
          <summary>Measurement Details</summary>

          - Can Rollback: ${CAN_ROLLBACK}
          - Replica Count: ${REPLICA_COUNT}
          - Current Image: \`${CURRENT_IMAGE}\`
          - Previous Image: \`${PREVIOUS_IMAGE}\`

          </details>
          EOF

          cat rollback-lag-report.md

          # Add to job summary
          cat rollback-lag-report.md >> "$GITHUB_STEP_SUMMARY"

      - name: Upload Report
        uses: actions/upload-artifact@v4
        with:
          name: rollback-lag-report
          path: rollback-lag-report.md

      - name: Check SLO and Fail if Exceeded
        if: needs.measure.outputs.slo-met == 'false'
        run: |
          SLO_SECONDS="${REQUESTED_SLO:-$DEFAULT_SLO}"
          if [ "$HEALTHY" = "false" ]; then
            echo "::error::Deployment did not become healthy after rollback (gave up after ${TOTAL_LAG}s)"
          else
            echo "::error::Rollback took ${TOTAL_LAG}s, exceeds SLO of ${SLO_SECONDS}s"
          fi
          exit 1