{ "$schema": "https://json-schema.org/draft/2020-12/schema", "$id": "https://stella-ops.org/schemas/ops-incident-runbook.schema.json", "title": "StellaOps Operations Incident Runbook Schema", "description": "Schema for incident runbooks, escalation procedures, and operational checklists. Unblocks DOCS-RUNBOOK-55-001 (1+ tasks).", "type": "object", "definitions": { "Runbook": { "type": "object", "description": "Complete incident runbook", "required": ["runbook_id", "title", "severity", "steps"], "properties": { "runbook_id": { "type": "string", "pattern": "^RB-[A-Z]+-[0-9]+$", "description": "Unique runbook identifier (e.g., RB-VULN-001)" }, "title": { "type": "string" }, "description": { "type": "string" }, "severity": { "type": "string", "enum": ["critical", "high", "medium", "low"], "description": "Severity level this runbook addresses" }, "category": { "type": "string", "enum": ["vulnerability", "outage", "security", "performance", "data", "compliance"], "description": "Incident category" }, "trigger_conditions": { "type": "array", "items": { "$ref": "#/definitions/TriggerCondition" }, "description": "Conditions that trigger this runbook" }, "prerequisites": { "type": "array", "items": { "type": "string" }, "description": "Required access/tools before starting" }, "steps": { "type": "array", "items": { "$ref": "#/definitions/RunbookStep" } }, "escalation": { "$ref": "#/definitions/EscalationProcedure" }, "communication": { "$ref": "#/definitions/CommunicationPlan" }, "rollback": { "type": "array", "items": { "$ref": "#/definitions/RunbookStep" }, "description": "Rollback steps if resolution fails" }, "post_incident": { "$ref": "#/definitions/PostIncidentChecklist" }, "estimated_duration": { "type": "string", "description": "Expected time to resolve (e.g., 30m, 2h)" }, "last_updated": { "type": "string", "format": "date-time" }, "owner": { "type": "string", "description": "Team/person responsible for maintaining this runbook" }, "tags": { "type": "array", "items": { "type": "string" } } } }, "TriggerCondition": { "type": "object", "description": "Condition that triggers the runbook", "required": ["condition_type", "description"], "properties": { "condition_type": { "type": "string", "enum": ["alert", "metric_threshold", "user_report", "scheduled", "manual"] }, "description": { "type": "string" }, "alert_name": { "type": "string", "description": "Alert name if condition_type is 'alert'" }, "metric_expr": { "type": "string", "description": "PromQL expression if condition_type is 'metric_threshold'" }, "threshold": { "type": "number" } } }, "RunbookStep": { "type": "object", "description": "Individual runbook step", "required": ["step_number", "action"], "properties": { "step_number": { "type": "integer", "minimum": 1 }, "action": { "type": "string", "description": "What to do" }, "description": { "type": "string", "description": "Detailed explanation" }, "command": { "type": "string", "description": "CLI command to execute" }, "commands": { "type": "array", "items": { "$ref": "#/definitions/CommandSpec" }, "description": "Multiple commands if needed" }, "expected_output": { "type": "string", "description": "What success looks like" }, "timeout": { "type": "string", "description": "Max time for this step (e.g., 5m)" }, "decision_point": { "$ref": "#/definitions/DecisionPoint", "description": "If step requires a decision" }, "verification": { "type": "string", "description": "How to verify step completed successfully" }, "notes": { "type": "string", "description": "Additional context or warnings" }, "skip_conditions": { "type": "array", "items": { "type": "string" }, "description": "Conditions under which to skip this step" } } }, "CommandSpec": { "type": "object", "description": "Command specification", "required": ["command"], "properties": { "command": { "type": "string" }, "description": { "type": "string" }, "requires_sudo": { "type": "boolean", "default": false }, "environment": { "type": "object", "additionalProperties": { "type": "string" } } } }, "DecisionPoint": { "type": "object", "description": "Decision branch in runbook", "required": ["question", "options"], "properties": { "question": { "type": "string" }, "options": { "type": "array", "items": { "type": "object", "properties": { "condition": { "type": "string" }, "next_step": { "type": "integer" }, "action": { "type": "string" } }, "required": ["condition"] } } } }, "EscalationProcedure": { "type": "object", "description": "Escalation procedure", "properties": { "levels": { "type": "array", "items": { "$ref": "#/definitions/EscalationLevel" } }, "auto_escalate_after": { "type": "string", "description": "Time after which to auto-escalate (e.g., 30m)" }, "escalation_criteria": { "type": "array", "items": { "type": "string" }, "description": "Conditions that trigger escalation" } } }, "EscalationLevel": { "type": "object", "description": "Single escalation level", "required": ["level", "contacts"], "properties": { "level": { "type": "integer", "minimum": 1 }, "name": { "type": "string" }, "contacts": { "type": "array", "items": { "$ref": "#/definitions/Contact" } }, "response_time_sla": { "type": "string", "description": "Expected response time (e.g., 15m)" }, "notification_channels": { "type": "array", "items": { "type": "string", "enum": ["pagerduty", "slack", "email", "phone", "sms", "teams"] } } } }, "Contact": { "type": "object", "description": "Contact information", "required": ["name", "role"], "properties": { "name": { "type": "string" }, "role": { "type": "string" }, "email": { "type": "string", "format": "email" }, "phone": { "type": "string" }, "slack_handle": { "type": "string" }, "pagerduty_id": { "type": "string" } } }, "CommunicationPlan": { "type": "object", "description": "Communication during incident", "properties": { "status_page": { "type": "string", "format": "uri", "description": "Public status page URL" }, "internal_channel": { "type": "string", "description": "Internal communication channel (e.g., #incident-response)" }, "stakeholder_updates": { "type": "object", "properties": { "frequency": { "type": "string", "description": "Update frequency (e.g., every 30m)" }, "recipients": { "type": "array", "items": { "type": "string" } }, "template": { "type": "string", "description": "Status update template" } } }, "customer_notification": { "type": "object", "properties": { "required": { "type": "boolean" }, "template": { "type": "string" }, "approval_required": { "type": "boolean" } } } } }, "PostIncidentChecklist": { "type": "object", "description": "Post-incident activities", "properties": { "items": { "type": "array", "items": { "type": "object", "properties": { "task": { "type": "string" }, "owner": { "type": "string" }, "due": { "type": "string", "description": "Due timeframe (e.g., within 24h, within 1 week)" }, "required": { "type": "boolean", "default": true } }, "required": ["task"] } }, "postmortem_required": { "type": "boolean", "default": true }, "postmortem_due": { "type": "string", "description": "Timeframe for postmortem (e.g., 5 business days)" } } }, "IncidentChecklist": { "type": "object", "description": "Pre-flight checklist for incident response", "required": ["checklist_id", "name", "items"], "properties": { "checklist_id": { "type": "string" }, "name": { "type": "string" }, "description": { "type": "string" }, "items": { "type": "array", "items": { "type": "object", "properties": { "item_id": { "type": "string" }, "description": { "type": "string" }, "category": { "type": "string", "enum": ["access", "tools", "documentation", "communication", "monitoring"] }, "verification": { "type": "string" } }, "required": ["description"] } } } }, "RunbookCatalog": { "type": "object", "description": "Catalog of all runbooks", "required": ["catalog_id", "version", "runbooks"], "properties": { "catalog_id": { "type": "string" }, "version": { "type": "string" }, "updated_at": { "type": "string", "format": "date-time" }, "runbooks": { "type": "array", "items": { "$ref": "#/definitions/Runbook" } }, "checklists": { "type": "array", "items": { "$ref": "#/definitions/IncidentChecklist" } }, "global_contacts": { "type": "array", "items": { "$ref": "#/definitions/Contact" } } } } }, "properties": { "catalog": { "$ref": "#/definitions/RunbookCatalog" } }, "examples": [ { "catalog": { "catalog_id": "stellaops-runbooks", "version": "2025.10.0", "updated_at": "2025-12-06T10:00:00Z", "runbooks": [ { "runbook_id": "RB-VULN-001", "title": "Critical Vulnerability Spike Response", "description": "Response procedure when critical vulnerabilities spike significantly", "severity": "critical", "category": "vulnerability", "trigger_conditions": [ { "condition_type": "alert", "description": "Critical vulnerability count increased by >10 in 1 hour", "alert_name": "CriticalVulnerabilitySpike" } ], "prerequisites": [ "Access to StellaOps CLI (stella)", "Read access to Findings Ledger", "Access to #security-incidents Slack channel" ], "steps": [ { "step_number": 1, "action": "Acknowledge the alert", "description": "Acknowledge in PagerDuty/alerting system to stop escalation", "timeout": "5m" }, { "step_number": 2, "action": "Identify scope of new vulnerabilities", "command": "stella findings list --severity critical --since 1h --format table", "expected_output": "List of new critical findings with CVE IDs and affected assets", "verification": "Output shows findings with timestamps within last hour" }, { "step_number": 3, "action": "Determine if spike is from new scans or advisory updates", "commands": [ { "command": "stella scan jobs --status completed --since 1h", "description": "Check for recent scan completions" }, { "command": "stella advisory updates --since 1h", "description": "Check for recent advisory updates" } ], "decision_point": { "question": "What caused the spike?", "options": [ { "condition": "New scans completed", "next_step": 4, "action": "Review scan results" }, { "condition": "Advisory update", "next_step": 5, "action": "Review advisory impact" }, { "condition": "Unknown/Both", "next_step": 4, "action": "Continue with full investigation" } ] } }, { "step_number": 4, "action": "Review affected assets and determine business impact", "command": "stella findings group-by asset --severity critical --since 1h", "description": "Group findings by asset to understand impact scope" }, { "step_number": 5, "action": "Check VEX applicability", "command": "stella vex check --vuln-ids $(stella findings list --severity critical --since 1h --format ids)", "description": "Check if any vulnerabilities have VEX statements that reduce severity" }, { "step_number": 6, "action": "Update stakeholders", "description": "Post status update to #security-incidents with findings summary", "notes": "Use template: 'VULN SPIKE: [count] new critical vulns affecting [assets]. Investigation in progress.'" }, { "step_number": 7, "action": "Create remediation tickets if needed", "command": "stella findings export --severity critical --since 1h --format jira", "skip_conditions": [ "All vulnerabilities covered by VEX not_affected", "Vulnerabilities are duplicates from rescan" ] } ], "escalation": { "levels": [ { "level": 1, "name": "On-call Security Engineer", "contacts": [ { "name": "Security On-Call", "role": "Security Engineer", "slack_handle": "@security-oncall" } ], "response_time_sla": "15m", "notification_channels": ["pagerduty", "slack"] }, { "level": 2, "name": "Security Team Lead", "contacts": [ { "name": "Security Lead", "role": "Security Team Lead", "slack_handle": "@security-lead" } ], "response_time_sla": "30m", "notification_channels": ["pagerduty", "slack", "phone"] } ], "auto_escalate_after": "30m", "escalation_criteria": [ "No acknowledgment within 15 minutes", "More than 50 critical vulnerabilities", "Production systems affected" ] }, "communication": { "internal_channel": "#security-incidents", "stakeholder_updates": { "frequency": "every 30m during active incident", "recipients": ["security-team", "engineering-leads"], "template": "VULN INCIDENT UPDATE: Status: [status]. Critical count: [count]. Affected systems: [systems]. Next update: [time]." } }, "post_incident": { "items": [ { "task": "Document incident timeline", "owner": "Incident Commander", "due": "within 24h", "required": true }, { "task": "Update vulnerability scanning schedules if needed", "owner": "Security Team", "due": "within 1 week", "required": false }, { "task": "Review and update this runbook", "owner": "Runbook Owner", "due": "within 1 week", "required": true } ], "postmortem_required": true, "postmortem_due": "5 business days" }, "estimated_duration": "1h", "last_updated": "2025-12-06T10:00:00Z", "owner": "Security Operations Team", "tags": ["vulnerability", "security", "critical"] } ], "checklists": [ { "checklist_id": "incident-preflight", "name": "Incident Response Pre-flight Checklist", "description": "Verify access and tools before incident response", "items": [ { "item_id": "cli-access", "description": "StellaOps CLI is installed and authenticated", "category": "tools", "verification": "Run 'stella whoami' successfully" }, { "item_id": "slack-access", "description": "Access to #security-incidents channel", "category": "communication", "verification": "Can post messages to channel" }, { "item_id": "pagerduty-access", "description": "Can acknowledge alerts in PagerDuty", "category": "tools", "verification": "PagerDuty mobile app logged in" }, { "item_id": "runbooks-access", "description": "Can access runbook documentation", "category": "documentation", "verification": "docs.stella-ops.org/runbooks accessible" } ] } ], "global_contacts": [ { "name": "Security Operations", "role": "Primary Response Team", "email": "security-ops@example.com", "slack_handle": "@security-ops" } ] } } ] }