work
This commit is contained in:
134
scripts/observability/incident-mode.sh
Normal file
134
scripts/observability/incident-mode.sh
Normal file
@@ -0,0 +1,134 @@
|
||||
#!/usr/bin/env bash
# Strict mode: abort on command failure, on unset variables, and on a failure
# in any stage of a pipeline.
set -euo pipefail

# Incident mode automation
# - Enables a feature-flag JSON when burn rate crosses threshold
# - Writes retention override parameters for downstream storage/ingest systems
# - Resets automatically after a cooldown period once burn subsides
# All inputs are provided via CLI flags or env vars to remain offline-friendly.

#######################################
# Print the command-line help text.
# Arguments: none
# Outputs:   help text on stdout
#######################################
usage() {
  # Quoted delimiter: the text is emitted literally, with no expansion.
  cat <<'USAGE'
Usage: incident-mode.sh --burn-rate <float> [--threshold 2.0] [--reset-threshold 0.5] \
[--state-dir out/incident-mode] [--retention-hours 24] \
[--cooldown-mins 30] [--note "text"]

Environment overrides:
INCIDENT_STATE_DIR default: out/incident-mode
INCIDENT_THRESHOLD default: 2.0 (fast burn multiple)
INCIDENT_RESET_TH default: 0.5 (burn multiple to exit)
INCIDENT_COOLDOWN default: 30 (minutes below reset threshold)
INCIDENT_RETENTION_H default: 24 (hours)

Outputs (in state dir):
flag.json feature flag payload (enabled/disabled + metadata)
retention.json retention override (hours, applied_at)
last_burn.txt last burn rate observed
cooldown.txt consecutive minutes below reset threshold

Examples:
incident-mode.sh --burn-rate 3.1 --note "fast burn" # enter incident mode
incident-mode.sh --burn-rate 0.2 # progress cooldown / exit
USAGE
}

# With no arguments at all there is nothing to evaluate: show help and fail.
if [[ $# -eq 0 ]]; then
  usage
  exit 1
fi

# Defaults. Environment variables override the built-in values; the CLI flags
# parsed below override both.
BURN_RATE=""
NOTE=""
STATE_DIR=${INCIDENT_STATE_DIR:-out/incident-mode}
THRESHOLD=${INCIDENT_THRESHOLD:-2.0}
RESET_TH=${INCIDENT_RESET_TH:-0.5}
COOLDOWN_MINS=${INCIDENT_COOLDOWN:-30}
RETENTION_H=${INCIDENT_RETENTION_H:-24}

while [[ $# -gt 0 ]]; do
  case "$1" in
    --burn-rate | --threshold | --reset-threshold | --state-dir | \
      --retention-hours | --cooldown-mins | --note)
      # Every flag in this arm consumes a value. Without this check a trailing
      # flag would abort under `set -u` with "$2: unbound variable".
      if [[ $# -lt 2 ]]; then
        echo "Missing value for $1" >&2
        exit 1
      fi
      case "$1" in
        --burn-rate) BURN_RATE="$2" ;;
        --threshold) THRESHOLD="$2" ;;
        --reset-threshold) RESET_TH="$2" ;;
        --state-dir) STATE_DIR="$2" ;;
        --retention-hours) RETENTION_H="$2" ;;
        --cooldown-mins) COOLDOWN_MINS="$2" ;;
        --note) NOTE="$2" ;;
      esac
      shift 2
      ;;
    -h | --help)
      usage
      exit 0
      ;;
    *)
      echo "Unknown arg: $1" >&2
      usage
      exit 1
      ;;
  esac
done

if [[ -z "$BURN_RATE" ]]; then
  echo "--burn-rate is required" >&2
  exit 1
fi

# State directory and the files kept inside it.
mkdir -p -- "$STATE_DIR"
FLAG_FILE="$STATE_DIR/flag.json"
RET_FILE="$STATE_DIR/retention.json"
LAST_FILE="$STATE_DIR/last_burn.txt"
COOLDOWN_FILE="$STATE_DIR/cooldown.txt"

#######################################
# Encode a string as a JSON string literal, surrounding quotes included.
# Uses python3 (or python) when one is installed; otherwise falls back to a
# pure-Bash encoder so the script works on hosts without Python.
# Arguments: $1 - raw text to encode
# Outputs:   the JSON string literal on stdout
# Returns:   the interpreter's exit status, or 0 from the Bash fallback
#######################################
jq_escape() {
  local raw="${1-}"
  local interp

  for interp in python3 python; do
    if command -v "$interp" >/dev/null 2>&1; then
      "$interp" -c 'import json, sys; print(json.dumps(sys.argv[1]))' "$raw"
      return
    fi
  done

  # Fallback: escape backslash and double quote first, then control characters.
  # Non-ASCII text is emitted as-is (valid JSON) rather than as \uXXXX escapes.
  local escaped="$raw" code hex ctrl repl
  escaped=${escaped//\\/\\\\}
  escaped=${escaped//\"/\\\"}
  escaped=${escaped//$'\b'/\\b}
  escaped=${escaped//$'\f'/\\f}
  escaped=${escaped//$'\n'/\\n}
  escaped=${escaped//$'\r'/\\r}
  escaped=${escaped//$'\t'/\\t}
  # Any control character still present has no short escape; use \u00XX.
  for code in {1..31}; do
    printf -v hex '%02x' "$code"
    printf -v ctrl "\\x${hex}"
    printf -v repl '\\u00%s' "$hex"
    escaped=${escaped//"$ctrl"/"$repl"}
  done
  printf '"%s"\n' "$escaped"
}

# UTC timestamp shared by every JSON payload written during this run.
now_utc=$(date -u +%Y-%m-%dT%H:%M:%SZ)

# Decimal literal: optional sign, integer and/or fraction, optional exponent.
number_re='^[+-]?([0-9]+([.][0-9]*)?|[.][0-9]+)([eE][+-]?[0-9]+)?$'

#######################################
# Succeed when $1 >= $2, compared as floating-point numbers.
# awk is used because it understands exponent notation (e.g. 1e-05).
# A failure to run awk aborts the script instead of reading as "false".
# Arguments: $1 - left operand, $2 - right operand (validated numbers)
#######################################
float_ge() {
  local verdict
  verdict=$(awk -v lhs="$1" -v rhs="$2" \
    'BEGIN { r = ((lhs + 0) >= (rhs + 0)) ? 1 : 0; print r }') || {
    echo "incident-mode: awk is required for threshold comparison" >&2
    exit 1
  }
  [[ "$verdict" == 1 ]]
}

# Reject malformed thresholds up front; a bad value must never be silently
# treated as "threshold not reached".
for setting in THRESHOLD RESET_TH; do
  if [[ ! "${!setting}" =~ $number_re ]]; then
    echo "incident-mode: $setting must be a number, got '${!setting}'" >&2
    exit 1
  fi
done
if [[ ! "$COOLDOWN_MINS" =~ ^[0-9]+$ ]]; then
  echo "incident-mode: cooldown must be a non-negative integer, got '$COOLDOWN_MINS'" >&2
  exit 1
fi

# Normalise the burn rate through Python's float(), which defines the format
# stored in last_burn.txt and flag.json. Prefer python3, fall back to python.
py_bin=""
for candidate in python3 python; do
  if command -v "$candidate" >/dev/null 2>&1; then
    py_bin=$candidate
    break
  fi
done
if [[ -z "$py_bin" ]]; then
  echo "incident-mode: python3 (or python) is required to parse --burn-rate" >&2
  exit 1
fi

# NaN and infinities parse as floats but are neither comparable nor valid JSON.
if ! burn_float=$("$py_bin" -c '
import sys
value = float(sys.argv[1])
if value != value or value in (float("inf"), float("-inf")):
    sys.exit(1)
print(value)
' "$BURN_RATE" 2>/dev/null); then
  echo "incident-mode: --burn-rate must be a finite number, got '$BURN_RATE'" >&2
  exit 1
fi

# Load the persisted cooldown counter. Anything other than plain digits is
# discarded rather than being evaluated inside arithmetic expansion.
cooldown_current=0
if [[ -f "$COOLDOWN_FILE" ]]; then
  IFS= read -r cooldown_current < "$COOLDOWN_FILE" || true
  if [[ "$cooldown_current" =~ ^[0-9]+$ ]]; then
    # Force base 10 so a leading zero is not read as octal.
    cooldown_current=$((10#$cooldown_current))
  else
    echo "incident-mode: ignoring invalid cooldown state in $COOLDOWN_FILE" >&2
    cooldown_current=0
  fi
fi

enter_incident=false
exit_incident=false

if float_ge "$burn_float" "$THRESHOLD"; then
  # At or above the threshold: (re-)enter incident mode, restart the cooldown.
  enter_incident=true
  cooldown_current=0
elif float_ge "$RESET_TH" "$burn_float"; then
  # At or below the reset threshold: one more quiet observation.
  cooldown_current=$((cooldown_current + 1))
  if ((cooldown_current >= 10#$COOLDOWN_MINS)); then
    exit_incident=true
  fi
else
  # Between the two thresholds: the quiet streak is broken.
  cooldown_current=0
fi

printf '%s\n' "$burn_float" > "$LAST_FILE"
printf '%s\n' "$cooldown_current" > "$COOLDOWN_FILE"

#######################################
# Write the feature-flag payload to $FLAG_FILE.
# Globals:   FLAG_FILE, NOTE, now_utc, burn_float (read)
# Arguments: $1 - JSON boolean for the "enabled" field: true or false
# Returns:   0 on success; 2 for an invalid argument; 1 if the note cannot be
#            JSON-encoded. On either failure $FLAG_FILE is left untouched.
#######################################
write_flag() {
  local enabled="${1-}"
  local note_json

  # The value is interpolated raw into the JSON, so only literals are allowed.
  case "$enabled" in
    true | false) ;;
    *)
      echo "write_flag: expected 'true' or 'false', got '$enabled'" >&2
      return 2
      ;;
  esac

  # Encode before opening the file: a failing command substitution inside a
  # heredoc is ignored by the shell and would leave `"note": ,` on disk.
  note_json=$(jq_escape "$NOTE") || {
    echo "write_flag: failed to JSON-encode note" >&2
    return 1
  }

  cat > "$FLAG_FILE" <<JSON
{
"enabled": $enabled,
"updated_at": "$now_utc",
"reason": "incident-mode",
"note": $note_json,
"burn_rate": $burn_float
}
JSON
}

# Apply the decision held in enter_incident / exit_incident. The flags are
# compared as strings instead of being executed as commands.
if [[ "$enter_incident" == true ]]; then
  # Validate before writing anything so a bad value cannot leave the flag
  # enabled next to a malformed retention.json.
  if [[ ! "$RETENTION_H" =~ ^(0|[1-9][0-9]*)([.][0-9]+)?$ ]]; then
    echo "incident-mode: retention hours must be a non-negative number, got '$RETENTION_H'" >&2
    exit 1
  fi
  write_flag true
  cat > "$RET_FILE" <<JSON
{
"retention_hours": $RETENTION_H,
"applied_at": "$now_utc"
}
JSON
  echo "incident-mode: activated (burn_rate=$burn_float)" >&2
elif [[ "$exit_incident" == true ]]; then
  # NOTE(review): retention.json is left in place when incident mode clears;
  # confirm downstream consumers gate the override on flag.json.
  write_flag false
  echo "incident-mode: cleared after cooldown (burn_rate=$burn_float)" >&2
else
  # No transition: keep an existing flag as-is, create a disabled one if the
  # flag file does not exist yet.
  if [[ ! -f "$FLAG_FILE" ]]; then
    write_flag false
  fi
  echo "incident-mode: steady (burn_rate=$burn_float, cooldown=$cooldown_current/$COOLDOWN_MINS)" >&2
fi

exit 0
|
||||
Reference in New Issue
Block a user