Files
StellaOps Bot 6bee1fdcf5
Some checks failed
AOC Guard CI / aoc-guard (push) Has been cancelled
AOC Guard CI / aoc-verify (push) Has been cancelled
Concelier Attestation Tests / attestation-tests (push) Has been cancelled
Docs CI / lint-and-preview (push) Has been cancelled
work
2025-11-25 08:01:23 +02:00

135 lines
3.8 KiB
Bash

#!/usr/bin/env bash
set -euo pipefail
# Incident mode automation
# - Enables a feature-flag JSON when burn rate crosses threshold
# - Writes retention override parameters for downstream storage/ingest systems
# - Resets automatically after a cooldown period once burn subsides (the cooldown
#   counter advances by one for every invocation at or below the reset threshold)
# All inputs are provided via CLI flags or env vars to remain offline-friendly.
#######################################
# Print the command-line help: flags, environment overrides, the files
# produced in the state directory, and two example invocations.
# Arguments: none
# Outputs:   help text on stdout
#######################################
usage() {
  local help_text
  # read -d '' consumes the whole here-document and then reports EOF with a
  # non-zero status; that is expected, hence the "|| true".
  IFS= read -r -d '' help_text <<'USAGE' || true
Usage: incident-mode.sh --burn-rate <float> [--threshold 2.0] [--reset-threshold 0.5] \
[--state-dir out/incident-mode] [--retention-hours 24] \
[--cooldown-mins 30] [--note "text"]
Environment overrides:
INCIDENT_STATE_DIR default: out/incident-mode
INCIDENT_THRESHOLD default: 2.0 (fast burn multiple)
INCIDENT_RESET_TH default: 0.5 (burn multiple to exit)
INCIDENT_COOLDOWN default: 30 (minutes below reset threshold)
INCIDENT_RETENTION_H default: 24 (hours)
Outputs (in state dir):
flag.json feature flag payload (enabled/disabled + metadata)
retention.json retention override (hours, applied_at)
last_burn.txt last burn rate observed
cooldown.txt consecutive minutes below reset threshold
Examples:
incident-mode.sh --burn-rate 3.1 --note "fast burn" # enter incident mode
incident-mode.sh --burn-rate 0.2 # progress cooldown / exit
USAGE
  printf '%s' "$help_text"
}
# ---------------------------------------------------------------------------
# Argument parsing, input validation and state-file layout.
# Sets: BURN_RATE, NOTE, STATE_DIR, THRESHOLD, RESET_TH, COOLDOWN_MINS,
#       RETENTION_H, FLAG_FILE, RET_FILE, LAST_FILE, COOLDOWN_FILE
# ---------------------------------------------------------------------------

# With no arguments there is nothing to evaluate: show the help and fail.
if [[ $# -eq 0 ]]; then
  usage
  exit 1
fi

BURN_RATE=""
NOTE=""
# Environment variables provide the defaults; CLI flags below override them.
STATE_DIR=${INCIDENT_STATE_DIR:-out/incident-mode}
THRESHOLD=${INCIDENT_THRESHOLD:-2.0}
RESET_TH=${INCIDENT_RESET_TH:-0.5}
COOLDOWN_MINS=${INCIDENT_COOLDOWN:-30}
RETENTION_H=${INCIDENT_RETENTION_H:-24}

while [[ $# -gt 0 ]]; do
  # Every value-taking flag needs a following argument. Check it here so a
  # trailing flag yields a clear message instead of "$2: unbound variable".
  case "$1" in
    --burn-rate | --threshold | --reset-threshold | --state-dir | \
      --retention-hours | --cooldown-mins | --note)
      if [[ $# -lt 2 ]]; then
        echo "Missing value for $1" >&2
        usage
        exit 1
      fi
      ;;
  esac

  case "$1" in
    --burn-rate) BURN_RATE="$2"; shift 2 ;;
    --threshold) THRESHOLD="$2"; shift 2 ;;
    --reset-threshold) RESET_TH="$2"; shift 2 ;;
    --state-dir) STATE_DIR="$2"; shift 2 ;;
    --retention-hours) RETENTION_H="$2"; shift 2 ;;
    --cooldown-mins) COOLDOWN_MINS="$2"; shift 2 ;;
    --note) NOTE="$2"; shift 2 ;;
    -h | --help) usage; exit 0 ;;
    *) echo "Unknown arg: $1" >&2; usage; exit 1 ;;
  esac
done

if [[ -z "$BURN_RATE" ]]; then
  echo "--burn-rate is required" >&2
  exit 1
fi

# The numeric settings are later compared numerically, used in shell
# arithmetic and written verbatim into JSON, so reject anything that is not
# a plain number before any state is touched.
decimal_re='^-?([0-9]+[.]?[0-9]*|[.][0-9]+)$'
json_number_re='^(0|[1-9][0-9]*)([.][0-9]+)?$'
integer_re='^[0-9]+$'

if [[ ! "$THRESHOLD" =~ $decimal_re ]]; then
  echo "--threshold must be a plain decimal number: $THRESHOLD" >&2
  exit 1
fi
if [[ ! "$RESET_TH" =~ $decimal_re ]]; then
  echo "--reset-threshold must be a plain decimal number: $RESET_TH" >&2
  exit 1
fi
if [[ ! "$COOLDOWN_MINS" =~ $integer_re ]]; then
  echo "--cooldown-mins must be a non-negative integer: $COOLDOWN_MINS" >&2
  exit 1
fi
# Force base 10 so a value such as "08" is not rejected as invalid octal in
# later arithmetic.
COOLDOWN_MINS=$((10#$COOLDOWN_MINS))
if [[ ! "$RETENTION_H" =~ $json_number_re ]]; then
  echo "--retention-hours must be a non-negative number: $RETENTION_H" >&2
  exit 1
fi
if [[ -z "$STATE_DIR" ]]; then
  echo "--state-dir must not be empty" >&2
  exit 1
fi

mkdir -p -- "$STATE_DIR"
FLAG_FILE="$STATE_DIR/flag.json"
RET_FILE="$STATE_DIR/retention.json"
LAST_FILE="$STATE_DIR/last_burn.txt"
COOLDOWN_FILE="$STATE_DIR/cooldown.txt"
#######################################
# Encode a string as a JSON string literal (including the surrounding
# double quotes) using Python's json.dumps.
# Arguments: $1 - text to encode
# Outputs:   the JSON literal on stdout
# Returns:   0 on success; non-zero when no Python interpreter is available
#            or the interpreter fails
#######################################
jq_escape() {
  local py
  # Many current systems ship only "python3"; fall back to "python" for
  # hosts where that is the sole interpreter name.
  py=$(command -v python3 || command -v python) || {
    echo "jq_escape: python3 (or python) is required" >&2
    return 1
  }
  # Quoted delimiter: the Python source must never be expanded by the shell.
  "$py" - "$1" <<'PY'
import json
import sys

print(json.dumps(sys.argv[1]))
PY
}
# ---------------------------------------------------------------------------
# Evaluate the observed burn rate against the thresholds and update the
# cooldown counter.
# Reads: BURN_RATE, THRESHOLD, RESET_TH, COOLDOWN_MINS, COOLDOWN_FILE, LAST_FILE
# Sets:  now_utc, burn_float, cooldown_current, enter_incident, exit_incident
# ---------------------------------------------------------------------------

# Many current systems ship only "python3"; fall back to "python" otherwise.
py_bin=$(command -v python3 || command -v python) || {
  echo "incident-mode: python3 (or python) is required" >&2
  exit 1
}

if [[ ! "$COOLDOWN_MINS" =~ ^[0-9]+$ ]]; then
  echo "incident-mode: cooldown minutes must be a non-negative integer: $COOLDOWN_MINS" >&2
  exit 1
fi

now_utc=$(date -u +%Y-%m-%dT%H:%M:%SZ)

# Normalise the burn rate and classify it in one interpreter call. The
# comparison happens in Python because its float output may be in scientific
# notation (e.g. 1e-05), which bc cannot parse. Non-finite values are
# rejected because they are not valid JSON numbers.
# The closing parenthesis sits on its own line so that the here-document
# terminator is recognised.
burn_eval=$(
  "$py_bin" - "$BURN_RATE" "$THRESHOLD" "$RESET_TH" <<'PY'
import math
import sys

labels = ("--burn-rate", "--threshold", "--reset-threshold")
values = []
for label, raw in zip(labels, sys.argv[1:4]):
    try:
        number = float(raw)
    except ValueError:
        sys.exit("incident-mode: %s is not a number: %r" % (label, raw))
    if math.isnan(number) or math.isinf(number):
        sys.exit("incident-mode: %s must be finite: %r" % (label, raw))
    values.append(number)

burn, enter_at, reset_at = values
if burn >= enter_at:
    zone = "enter"
elif burn <= reset_at:
    zone = "calm"
else:
    zone = "between"
print("%s %s" % (burn, zone))
PY
) || exit 1
read -r burn_float burn_zone <<<"$burn_eval"

# Load the persisted counter, accepting only a plain non-negative integer so
# that stray file content can never reach shell arithmetic.
cooldown_current=0
if [[ -f "$COOLDOWN_FILE" ]]; then
  stored_cooldown=""
  IFS= read -r stored_cooldown <"$COOLDOWN_FILE" || true
  if [[ "$stored_cooldown" =~ ^[0-9]+$ ]]; then
    cooldown_current=$((10#$stored_cooldown))
  else
    echo "incident-mode: ignoring invalid cooldown state in $COOLDOWN_FILE" >&2
  fi
fi

enter_incident=false
exit_incident=false
case "$burn_zone" in
  enter)
    # At or above the entry threshold: (re)enter incident mode.
    enter_incident=true
    cooldown_current=0
    ;;
  calm)
    # At or below the reset threshold: one more quiet observation.
    cooldown_current=$((cooldown_current + 1))
    if ((cooldown_current >= 10#$COOLDOWN_MINS)); then
      exit_incident=true
    fi
    ;;
  *)
    # Between the two thresholds: the quiet streak is broken.
    cooldown_current=0
    ;;
esac

printf '%s\n' "$burn_float" >"$LAST_FILE"
printf '%s\n' "$cooldown_current" >"$COOLDOWN_FILE"
#######################################
# Write the feature-flag payload to $FLAG_FILE.
# Globals:   FLAG_FILE, now_utc, NOTE, burn_float (read)
# Arguments: $1 - value stored verbatim under "enabled" (true or false)
# Outputs:   overwrites $FLAG_FILE with a JSON object
# Returns:   0 on success; non-zero if the note cannot be encoded or the
#            file cannot be written
#######################################
write_flag() {
  local enabled="$1"
  local note_json
  # Encode the note before the file is opened. The exit status of a command
  # substitution inside a here-document is ignored, so a failing encoder
  # would otherwise truncate the flag file and leave invalid JSON behind.
  note_json=$(jq_escape "$NOTE") || return 1
  cat >"$FLAG_FILE" <<JSON
{
"enabled": $enabled,
"updated_at": "$now_utc",
"reason": "incident-mode",
"note": $note_json,
"burn_rate": $burn_float
}
JSON
}
# ---------------------------------------------------------------------------
# Act on the decision flags computed earlier (enter_incident / exit_incident)
# and report the outcome on stderr.
# ---------------------------------------------------------------------------
case "${enter_incident}/${exit_incident}" in
  true/*)
    # Entering (or staying in) incident mode: enable the flag and publish
    # the retention override.
    write_flag true
    printf '{\n"retention_hours": %s,\n"applied_at": "%s"\n}\n' \
      "$RETENTION_H" "$now_utc" >"$RET_FILE"
    printf '%s\n' "incident-mode: activated (burn_rate=$burn_float)" >&2
    ;;
  */true)
    # The cooldown target has been reached: switch the flag off.
    write_flag false
    printf '%s\n' "incident-mode: cleared after cooldown (burn_rate=$burn_float)" >&2
    ;;
  *)
    # No transition: keep an existing flag untouched, but make sure a
    # (disabled) flag file exists.
    [[ -f "$FLAG_FILE" ]] || write_flag false
    printf '%s\n' "incident-mode: steady (burn_rate=$burn_float, cooldown=$cooldown_current/$COOLDOWN_MINS)" >&2
    ;;
esac
exit 0