404 lines
14 KiB
YAML
404 lines
14 KiB
YAML
# .gitea/workflows/rollback-lag.yml
# Rollback lag measurement for deployment SLO validation
# Sprint: SPRINT_20260105_002_005_TEST_cross_cutting
# Task: CCUT-025
#
# WORKFLOW PURPOSE:
# =================
# Measures the time required to rollback a deployment and restore service health.
# This validates the rollback SLO (< 5 minutes) and provides visibility into
# deployment reversibility characteristics.
#
# The workflow performs a controlled rollback, measures timing metrics, and
# restores the original version afterward.
|
|
|
|
name: Rollback Lag Measurement

on:
  # Manual trigger: the operator picks the target and the SLO to validate.
  workflow_dispatch:
    inputs:
      environment:
        type: choice
        description: 'Target environment'
        required: true
        options: [staging, production]
      deployment:
        type: string
        description: 'Deployment name to test'
        required: true
        default: 'stellaops-api'
      namespace:
        type: string
        description: 'Kubernetes namespace'
        required: true
        default: 'stellaops'
      rollback_slo_seconds:
        type: number
        description: 'Rollback SLO in seconds'
        required: false
        default: 300
      # Defaults to a dry run so a manual dispatch never rolls back by accident.
      dry_run:
        type: boolean
        description: 'Dry run (do not actually rollback)'
        required: false
        default: true
  schedule:
    # Run weekly on staging to track trends
    - cron: '0 3 * * 0'
|
|
|
|
# Fallbacks the jobs use when a dispatch input is empty (for example on the
# scheduled trigger, which carries no inputs).
env:
  DEFAULT_NAMESPACE: 'stellaops'
  DEFAULT_DEPLOYMENT: 'stellaops-api'
  DEFAULT_SLO: '300'
|
|
|
|
jobs:
|
|
# ===========================================================================
|
|
# PRE-FLIGHT CHECKS
|
|
# ===========================================================================
|
|
|
|
preflight:
  # Inspects the target deployment and decides whether a rollback can be
  # measured. Publishes the current/previous image and version, the replica
  # count and a can-rollback flag for the downstream jobs.
  name: Pre-Flight Checks
  runs-on: ubuntu-22.04
  environment: ${{ inputs.environment || 'staging' }}
  env:
    # Dispatch inputs reach the scripts as environment variables instead of
    # being pasted into the shell source, so their content is never parsed as
    # shell code. Both are empty on scheduled runs; the scripts then fall back
    # to the workflow-level DEFAULT_* variables.
    INPUT_NAMESPACE: ${{ inputs.namespace }}
    INPUT_DEPLOYMENT: ${{ inputs.deployment }}
  outputs:
    current-version: ${{ steps.current.outputs.version }}
    current-image: ${{ steps.current.outputs.image }}
    previous-version: ${{ steps.previous.outputs.version }}
    previous-image: ${{ steps.previous.outputs.image }}
    can-rollback: ${{ steps.check.outputs.can_rollback }}
    replica-count: ${{ steps.current.outputs.replicas }}
  steps:
    - name: Checkout
      uses: actions/checkout@v4

    - name: Setup kubectl
      uses: azure/setup-kubectl@v4
      with:
        version: 'latest'

    - name: Configure Kubernetes
      env:
        KUBECONFIG_B64: ${{ secrets.KUBECONFIG }}
      run: |
        # Every `run` step starts its own shell, so an `export` made here is
        # gone before the next step runs. Publishing the path through
        # GITHUB_ENV makes KUBECONFIG visible to all later kubectl calls.
        KUBECONFIG_FILE="${RUNNER_TEMP:-$PWD}/kubeconfig.yaml"
        # umask keeps the credentials file private to the runner user
        (umask 077; printf '%s' "$KUBECONFIG_B64" | base64 -d > "$KUBECONFIG_FILE")
        echo "KUBECONFIG=$KUBECONFIG_FILE" >> "$GITHUB_ENV"

    - name: Get Current Deployment State
      id: current
      run: |
        NAMESPACE="${INPUT_NAMESPACE:-$DEFAULT_NAMESPACE}"
        DEPLOYMENT="${INPUT_DEPLOYMENT:-$DEFAULT_DEPLOYMENT}"

        # Image of the first container. A failed or empty lookup becomes
        # "unknown" so the feasibility check can veto the rollback instead of
        # this step aborting the job.
        CURRENT_IMAGE=$(kubectl get deployment "$DEPLOYMENT" -n "$NAMESPACE" \
          -o jsonpath='{.spec.template.spec.containers[0].image}' 2>/dev/null || echo "unknown")
        CURRENT_IMAGE="${CURRENT_IMAGE:-unknown}"

        # Version is whatever follows the last ':' of the image reference
        CURRENT_VERSION=$(echo "$CURRENT_IMAGE" | sed 's/.*://')

        # Desired replica count; the measure job waits for this many ready pods
        REPLICAS=$(kubectl get deployment "$DEPLOYMENT" -n "$NAMESPACE" \
          -o jsonpath='{.spec.replicas}' 2>/dev/null || echo "1")

        {
          echo "image=$CURRENT_IMAGE"
          echo "version=$CURRENT_VERSION"
          echo "replicas=$REPLICAS"
        } >> "$GITHUB_OUTPUT"

        echo "Current deployment: $DEPLOYMENT"
        echo "Current image: $CURRENT_IMAGE"
        echo "Current version: $CURRENT_VERSION"
        echo "Replicas: $REPLICAS"

    - name: Get Previous Version
      id: previous
      run: |
        NAMESPACE="${INPUT_NAMESPACE:-$DEFAULT_NAMESPACE}"
        DEPLOYMENT="${INPUT_DEPLOYMENT:-$DEFAULT_DEPLOYMENT}"

        # Get rollout history
        HISTORY=$(kubectl rollout history deployment "$DEPLOYMENT" -n "$NAMESPACE" 2>/dev/null || echo "")

        if [ -z "$HISTORY" ]; then
          echo "version=unknown" >> "$GITHUB_OUTPUT"
          echo "image=unknown" >> "$GITHUB_OUTPUT"
          echo "No rollout history available"
          exit 0
        fi

        # Second-to-last numbered row of the history table. awk does the
        # filtering because, unlike grep, it exits 0 when nothing matches and
        # so cannot abort the step under `set -e -o pipefail`.
        PREV_REVISION=$(echo "$HISTORY" | awk '/^[0-9]+/ { print $1 }' | tail -n 2 | head -n 1)

        if [ -z "$PREV_REVISION" ]; then
          echo "version=unknown" >> "$GITHUB_OUTPUT"
          echo "image=unknown" >> "$GITHUB_OUTPUT"
          echo "No previous revision found"
          exit 0
        fi

        # Get image from previous revision
        PREV_IMAGE=$(kubectl rollout history deployment "$DEPLOYMENT" -n "$NAMESPACE" \
          --revision="$PREV_REVISION" -o jsonpath='{.spec.template.spec.containers[0].image}' 2>/dev/null || echo "unknown")
        PREV_IMAGE="${PREV_IMAGE:-unknown}"

        PREV_VERSION=$(echo "$PREV_IMAGE" | sed 's/.*://')

        echo "image=$PREV_IMAGE" >> "$GITHUB_OUTPUT"
        echo "version=$PREV_VERSION" >> "$GITHUB_OUTPUT"

        echo "Previous revision: $PREV_REVISION"
        echo "Previous image: $PREV_IMAGE"
        echo "Previous version: $PREV_VERSION"

    - name: Check Rollback Feasibility
      id: check
      env:
        CURRENT: ${{ steps.current.outputs.version }}
        PREVIOUS: ${{ steps.previous.outputs.version }}
      run: |
        if [ "$CURRENT" = "unknown" ] || [ -z "$CURRENT" ]; then
          # Without a known current image the measure job could not restore
          # the deployment afterwards, so the rollback must not be attempted.
          echo "can_rollback=false" >> "$GITHUB_OUTPUT"
          echo "::warning::Current version could not be determined"
        elif [ "$PREVIOUS" = "unknown" ] || [ -z "$PREVIOUS" ]; then
          echo "can_rollback=false" >> "$GITHUB_OUTPUT"
          echo "::warning::No previous version available for rollback"
        elif [ "$CURRENT" = "$PREVIOUS" ]; then
          echo "can_rollback=false" >> "$GITHUB_OUTPUT"
          echo "::warning::Current and previous versions are the same"
        else
          echo "can_rollback=true" >> "$GITHUB_OUTPUT"
          echo "Rollback feasible: $CURRENT -> $PREVIOUS"
        fi
|
|
|
|
# ===========================================================================
|
|
# MEASURE ROLLBACK LAG
|
|
# ===========================================================================
|
|
|
|
measure:
  # Performs the rollback (or simulates it on a dry run), times the rollout
  # and the health recovery, compares the total against the SLO and finally
  # puts the original image back.
  name: Measure Rollback Lag
  needs: preflight
  if: needs.preflight.outputs.can-rollback == 'true'
  runs-on: ubuntu-22.04
  environment: ${{ inputs.environment || 'staging' }}
  env:
    # Dispatch inputs are handed to the scripts as environment variables
    # rather than pasted into the shell source. Empty on scheduled runs; the
    # scripts then fall back to the workflow-level DEFAULT_* variables.
    INPUT_NAMESPACE: ${{ inputs.namespace }}
    INPUT_DEPLOYMENT: ${{ inputs.deployment }}
    # `inputs.dry_run || 'true'` yields 'true' even when the input is false
    # (false is falsy), which made a real rollback impossible. Rendering the
    # input as text first keeps 'false' intact; any trigger other than a
    # manual dispatch carries no inputs and stays a dry run.
    DRY_RUN: ${{ github.event_name == 'workflow_dispatch' && format('{0}', inputs.dry_run) || 'true' }}
  outputs:
    rollback-time: ${{ steps.timing.outputs.rollback_time }}
    health-recovery-time: ${{ steps.timing.outputs.health_time }}
    total-lag: ${{ steps.timing.outputs.total_lag }}
    slo-met: ${{ steps.timing.outputs.slo_met }}
  steps:
    - name: Setup kubectl
      uses: azure/setup-kubectl@v4
      with:
        version: 'latest'

    - name: Configure Kubernetes
      env:
        KUBECONFIG_B64: ${{ secrets.KUBECONFIG }}
      run: |
        # Every `run` step starts its own shell, so an `export` made here is
        # gone before the next step runs. Publishing the path through
        # GITHUB_ENV makes KUBECONFIG visible to all later kubectl calls.
        KUBECONFIG_FILE="${RUNNER_TEMP:-$PWD}/kubeconfig.yaml"
        # umask keeps the credentials file private to the runner user
        (umask 077; printf '%s' "$KUBECONFIG_B64" | base64 -d > "$KUBECONFIG_FILE")
        echo "KUBECONFIG=$KUBECONFIG_FILE" >> "$GITHUB_ENV"

    - name: Record Start Time
      id: start
      run: |
        echo "time=$(date +%s)" >> "$GITHUB_OUTPUT"
        echo "Rollback measurement started at: $(date -u +%Y-%m-%dT%H:%M:%SZ)"

    - name: Trigger Rollback
      id: rollback
      run: |
        NAMESPACE="${INPUT_NAMESPACE:-$DEFAULT_NAMESPACE}"
        DEPLOYMENT="${INPUT_DEPLOYMENT:-$DEFAULT_DEPLOYMENT}"

        if [ "$DRY_RUN" = "true" ]; then
          echo "DRY RUN: Would execute rollback"
          echo "kubectl rollout undo deployment/$DEPLOYMENT -n $NAMESPACE"
          echo "executed=false" >> "$GITHUB_OUTPUT"
        else
          echo "Executing rollback..."
          kubectl rollout undo deployment/"$DEPLOYMENT" -n "$NAMESPACE"
          # Written only after the undo was accepted; the restore step keys
          # off this flag to know the cluster was really changed.
          echo "executed=true" >> "$GITHUB_OUTPUT"
        fi

        echo "time=$(date +%s)" >> "$GITHUB_OUTPUT"

    - name: Wait for Rollout Complete
      id: rollout
      run: |
        NAMESPACE="${INPUT_NAMESPACE:-$DEFAULT_NAMESPACE}"
        DEPLOYMENT="${INPUT_DEPLOYMENT:-$DEFAULT_DEPLOYMENT}"

        if [ "$DRY_RUN" = "true" ]; then
          echo "DRY RUN: Simulating rollout wait"
          sleep 5
        else
          echo "Waiting for rollout to complete..."
          kubectl rollout status deployment/"$DEPLOYMENT" -n "$NAMESPACE" --timeout=600s
        fi

        echo "time=$(date +%s)" >> "$GITHUB_OUTPUT"

    - name: Wait for Health Recovery
      id: health
      env:
        REPLICAS: ${{ needs.preflight.outputs.replica-count }}
      run: |
        NAMESPACE="${INPUT_NAMESPACE:-$DEFAULT_NAMESPACE}"
        DEPLOYMENT="${INPUT_DEPLOYMENT:-$DEFAULT_DEPLOYMENT}"
        RECOVERED="false"

        if [ "$DRY_RUN" = "true" ]; then
          echo "DRY RUN: Simulating health check"
          sleep 3
          RECOVERED="true"
        else
          echo "Waiting for health checks to pass..."

          # Poll every 5s, for at most MAX_WAIT seconds, until the ready
          # replica count matches the count recorded by the preflight job.
          MAX_WAIT=300
          WAITED=0
          while [ "$WAITED" -lt "$MAX_WAIT" ]; do
            READY=$(kubectl get deployment "$DEPLOYMENT" -n "$NAMESPACE" \
              -o jsonpath='{.status.readyReplicas}' 2>/dev/null || echo "0")
            # The jsonpath query prints nothing when the field is unset
            READY="${READY:-0}"

            if [ "$READY" = "$REPLICAS" ]; then
              echo "All $READY replicas are ready"
              RECOVERED="true"
              break
            fi

            echo "Ready: $READY / $REPLICAS (waited ${WAITED}s)"
            sleep 5
            WAITED=$((WAITED + 5))
          done

          if [ "$RECOVERED" != "true" ]; then
            # Do not abort: the timing step records this as an SLO miss and
            # the restore step still has to run.
            echo "::warning::Deployment did not become healthy within ${MAX_WAIT}s"
          fi
        fi

        echo "recovered=$RECOVERED" >> "$GITHUB_OUTPUT"
        echo "time=$(date +%s)" >> "$GITHUB_OUTPUT"

    - name: Calculate Timing Metrics
      id: timing
      env:
        START_TIME: ${{ steps.start.outputs.time }}
        ROLLBACK_TIME: ${{ steps.rollback.outputs.time }}
        ROLLOUT_TIME: ${{ steps.rollout.outputs.time }}
        HEALTH_TIME: ${{ steps.health.outputs.time }}
        RECOVERED: ${{ steps.health.outputs.recovered }}
        SLO_SECONDS: ${{ inputs.rollback_slo_seconds || env.DEFAULT_SLO }}
      run: |
        # Calculate durations (all timestamps are epoch seconds)
        ROLLBACK_DURATION=$((ROLLOUT_TIME - ROLLBACK_TIME))
        HEALTH_DURATION=$((HEALTH_TIME - ROLLOUT_TIME))
        TOTAL_LAG=$((HEALTH_TIME - START_TIME))

        # The SLO is met only if the service actually recovered; a health
        # wait that timed out must never count as a pass.
        if [ "$RECOVERED" = "true" ] && [ "$TOTAL_LAG" -le "$SLO_SECONDS" ]; then
          SLO_MET="true"
        else
          SLO_MET="false"
        fi

        {
          echo "rollback_time=$ROLLBACK_DURATION"
          echo "health_time=$HEALTH_DURATION"
          echo "total_lag=$TOTAL_LAG"
          echo "slo_met=$SLO_MET"
        } >> "$GITHUB_OUTPUT"

        echo "=== Rollback Timing Metrics ==="
        echo "Rollback execution: ${ROLLBACK_DURATION}s"
        echo "Health recovery: ${HEALTH_DURATION}s"
        echo "Total lag: ${TOTAL_LAG}s"
        echo "SLO (${SLO_SECONDS}s): $SLO_MET"

    - name: Restore Original Version
      # always(): the deployment must be put back even when a wait or timing
      # step failed or the run was cancelled. The flag restricts this to runs
      # in which the rollback was really executed.
      if: always() && steps.rollback.outputs.executed == 'true'
      env:
        ORIGINAL_IMAGE: ${{ needs.preflight.outputs.current-image }}
      run: |
        NAMESPACE="${INPUT_NAMESPACE:-$DEFAULT_NAMESPACE}"
        DEPLOYMENT="${INPUT_DEPLOYMENT:-$DEFAULT_DEPLOYMENT}"

        # The preflight job read the image of containers[0]; look up that
        # container's name instead of assuming it equals the deployment name.
        CONTAINER=$(kubectl get deployment "$DEPLOYMENT" -n "$NAMESPACE" \
          -o jsonpath='{.spec.template.spec.containers[0].name}')

        echo "Restoring original version: $ORIGINAL_IMAGE"
        kubectl set image deployment/"$DEPLOYMENT" \
          "$CONTAINER"="$ORIGINAL_IMAGE" \
          -n "$NAMESPACE"

        kubectl rollout status deployment/"$DEPLOYMENT" -n "$NAMESPACE" --timeout=600s
        echo "Original version restored"
|
|
|
|
# ===========================================================================
|
|
# GENERATE REPORT
|
|
# ===========================================================================
|
|
|
|
report:
  # Renders the measurement as Markdown (job summary + artifact) and fails
  # the workflow when the measured lag exceeded the SLO. Runs whenever the
  # preflight job succeeded, even if the measure job was skipped or failed.
  name: Generate Report
  needs: [preflight, measure]
  if: always() && needs.preflight.result == 'success'
  runs-on: ubuntu-22.04
  steps:
    - name: Generate Report
      env:
        # All values are passed through the environment so their content is
        # never parsed as shell code inside the script below.
        ENVIRONMENT: ${{ inputs.environment || 'staging' }}
        DEPLOYMENT: ${{ inputs.deployment || env.DEFAULT_DEPLOYMENT }}
        # `inputs.dry_run || 'true'` would always print 'true' because false
        # is falsy; render the input as text first. Triggers other than a
        # manual dispatch carry no inputs and are reported as dry runs.
        DRY_RUN: ${{ github.event_name == 'workflow_dispatch' && format('{0}', inputs.dry_run) || 'true' }}
        SLO_SECONDS: ${{ inputs.rollback_slo_seconds || env.DEFAULT_SLO }}
        ROLLBACK_TIME: ${{ needs.measure.outputs.rollback-time }}
        HEALTH_TIME: ${{ needs.measure.outputs.health-recovery-time }}
        TOTAL_LAG: ${{ needs.measure.outputs.total-lag }}
        SLO_MET: ${{ needs.measure.outputs.slo-met || 'unknown' }}
        CAN_ROLLBACK: ${{ needs.preflight.outputs.can-rollback }}
        REPLICA_COUNT: ${{ needs.preflight.outputs.replica-count }}
        CURRENT_VERSION: ${{ needs.preflight.outputs.current-version }}
        PREVIOUS_VERSION: ${{ needs.preflight.outputs.previous-version }}
        CURRENT_IMAGE: ${{ needs.preflight.outputs.current-image }}
        PREVIOUS_IMAGE: ${{ needs.preflight.outputs.previous-image }}
      run: |
        # Append the unit only to real measurements, so a skipped
        # measurement reads "N/A" rather than "N/As".
        with_unit() {
          case "$1" in
            ''|*[!0-9-]*) echo "N/A" ;;
            *) echo "${1}s" ;;
          esac
        }

        if [ "$SLO_MET" = "true" ]; then
          STATUS=":white_check_mark: PASSED"
        elif [ "$SLO_MET" = "false" ]; then
          STATUS=":x: FAILED"
        else
          STATUS=":grey_question: UNKNOWN"
        fi

        # Unquoted delimiter: the shell expands the variables and $(date)
        # below; backticks are escaped so they stay literal Markdown.
        cat > rollback-lag-report.md << EOF
        ## Rollback Lag Measurement Report

        **Environment:** ${ENVIRONMENT}
        **Deployment:** ${DEPLOYMENT}
        **Dry Run:** ${DRY_RUN}

        ### Version Information

        | Revision | Version |
        |----------|---------|
        | Current | \`${CURRENT_VERSION}\` |
        | Previous | \`${PREVIOUS_VERSION}\` |

        ### Timing Metrics

        | Metric | Value | SLO |
        |--------|-------|-----|
        | Rollback Execution | $(with_unit "$ROLLBACK_TIME") | - |
        | Health Recovery | $(with_unit "$HEALTH_TIME") | - |
        | **Total Lag** | **$(with_unit "$TOTAL_LAG")** | < ${SLO_SECONDS}s |

        ### SLO Status: ${STATUS}

        ---

        *Report generated at $(date -u +%Y-%m-%dT%H:%M:%SZ)*

        <details>
        <summary>Measurement Details</summary>

        - Can Rollback: ${CAN_ROLLBACK}
        - Replica Count: ${REPLICA_COUNT}
        - Current Image: \`${CURRENT_IMAGE}\`
        - Previous Image: \`${PREVIOUS_IMAGE}\`

        </details>
        EOF

        cat rollback-lag-report.md

        # Add to job summary
        cat rollback-lag-report.md >> "$GITHUB_STEP_SUMMARY"

    - name: Upload Report
      uses: actions/upload-artifact@v4
      with:
        name: rollback-lag-report
        path: rollback-lag-report.md

    - name: Check SLO and Fail if Exceeded
      if: needs.measure.outputs.slo-met == 'false'
      env:
        TOTAL_LAG: ${{ needs.measure.outputs.total-lag }}
        SLO_SECONDS: ${{ inputs.rollback_slo_seconds || env.DEFAULT_SLO }}
      run: |
        echo "::error::Rollback took ${TOTAL_LAG}s, exceeds SLO of ${SLO_SECONDS}s"
        exit 1
|