Implement incident mode management service and models
Some checks failed
Docs CI / lint-and-preview (push) Has been cancelled
Some checks failed
Docs CI / lint-and-preview (push) Has been cancelled
- Added IPackRunIncidentModeService interface for managing incident mode activation, deactivation, and status retrieval.
- Created PackRunIncidentModeService class implementing the service interface with methods for activating, deactivating, and escalating incident modes.
- Introduced incident mode status model (PackRunIncidentModeStatus) and related enums for escalation levels and activation sources.
- Developed retention policy, telemetry settings, and debug capture settings models to manage incident mode configurations.
- Implemented SLO breach notification handling to activate incident mode based on severity.
- Added in-memory store (InMemoryPackRunIncidentModeStore) for testing purposes.
- Created comprehensive unit tests for incident mode service, covering activation, deactivation, status retrieval, and SLO breach handling.
This commit is contained in:
686
docs/schemas/ops-incident-runbook.schema.json
Normal file
686
docs/schemas/ops-incident-runbook.schema.json
Normal file
@@ -0,0 +1,686 @@
|
||||
{
|
||||
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
||||
"$id": "https://stella-ops.org/schemas/ops-incident-runbook.schema.json",
|
||||
"title": "StellaOps Operations Incident Runbook Schema",
|
||||
"description": "Schema for incident runbooks, escalation procedures, and operational checklists. Unblocks DOCS-RUNBOOK-55-001 (1+ tasks).",
|
||||
"type": "object",
|
||||
"definitions": {
|
||||
"Runbook": {
|
||||
"type": "object",
|
||||
"description": "Complete incident runbook",
|
||||
"required": ["runbook_id", "title", "severity", "steps"],
|
||||
"properties": {
|
||||
"runbook_id": {
|
||||
"type": "string",
|
||||
"pattern": "^RB-[A-Z]+-[0-9]+$",
|
||||
"description": "Unique runbook identifier (e.g., RB-VULN-001)"
|
||||
},
|
||||
"title": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": {
|
||||
"type": "string"
|
||||
},
|
||||
"severity": {
|
||||
"type": "string",
|
||||
"enum": ["critical", "high", "medium", "low"],
|
||||
"description": "Severity level this runbook addresses"
|
||||
},
|
||||
"category": {
|
||||
"type": "string",
|
||||
"enum": ["vulnerability", "outage", "security", "performance", "data", "compliance"],
|
||||
"description": "Incident category"
|
||||
},
|
||||
"trigger_conditions": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"$ref": "#/definitions/TriggerCondition"
|
||||
},
|
||||
"description": "Conditions that trigger this runbook"
|
||||
},
|
||||
"prerequisites": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": "Required access/tools before starting"
|
||||
},
|
||||
"steps": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"$ref": "#/definitions/RunbookStep"
|
||||
}
|
||||
},
|
||||
"escalation": {
|
||||
"$ref": "#/definitions/EscalationProcedure"
|
||||
},
|
||||
"communication": {
|
||||
"$ref": "#/definitions/CommunicationPlan"
|
||||
},
|
||||
"rollback": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"$ref": "#/definitions/RunbookStep"
|
||||
},
|
||||
"description": "Rollback steps if resolution fails"
|
||||
},
|
||||
"post_incident": {
|
||||
"$ref": "#/definitions/PostIncidentChecklist"
|
||||
},
|
||||
"estimated_duration": {
|
||||
"type": "string",
|
||||
"description": "Expected time to resolve (e.g., 30m, 2h)"
|
||||
},
|
||||
"last_updated": {
|
||||
"type": "string",
|
||||
"format": "date-time"
|
||||
},
|
||||
"owner": {
|
||||
"type": "string",
|
||||
"description": "Team/person responsible for maintaining this runbook"
|
||||
},
|
||||
"tags": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"TriggerCondition": {
|
||||
"type": "object",
|
||||
"description": "Condition that triggers the runbook",
|
||||
"required": ["condition_type", "description"],
|
||||
"properties": {
|
||||
"condition_type": {
|
||||
"type": "string",
|
||||
"enum": ["alert", "metric_threshold", "user_report", "scheduled", "manual"]
|
||||
},
|
||||
"description": {
|
||||
"type": "string"
|
||||
},
|
||||
"alert_name": {
|
||||
"type": "string",
|
||||
"description": "Alert name if condition_type is 'alert'"
|
||||
},
|
||||
"metric_expr": {
|
||||
"type": "string",
|
||||
"description": "PromQL expression if condition_type is 'metric_threshold'"
|
||||
},
|
||||
"threshold": {
|
||||
"type": "number"
|
||||
}
|
||||
}
|
||||
},
|
||||
"RunbookStep": {
|
||||
"type": "object",
|
||||
"description": "Individual runbook step",
|
||||
"required": ["step_number", "action"],
|
||||
"properties": {
|
||||
"step_number": {
|
||||
"type": "integer",
|
||||
"minimum": 1
|
||||
},
|
||||
"action": {
|
||||
"type": "string",
|
||||
"description": "What to do"
|
||||
},
|
||||
"description": {
|
||||
"type": "string",
|
||||
"description": "Detailed explanation"
|
||||
},
|
||||
"command": {
|
||||
"type": "string",
|
||||
"description": "CLI command to execute"
|
||||
},
|
||||
"commands": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"$ref": "#/definitions/CommandSpec"
|
||||
},
|
||||
"description": "Multiple commands if needed"
|
||||
},
|
||||
"expected_output": {
|
||||
"type": "string",
|
||||
"description": "What success looks like"
|
||||
},
|
||||
"timeout": {
|
||||
"type": "string",
|
||||
"description": "Max time for this step (e.g., 5m)"
|
||||
},
|
||||
"decision_point": {
|
||||
"$ref": "#/definitions/DecisionPoint",
|
||||
"description": "Decision branch to evaluate if this step requires a decision"
|
||||
},
|
||||
"verification": {
|
||||
"type": "string",
|
||||
"description": "How to verify step completed successfully"
|
||||
},
|
||||
"notes": {
|
||||
"type": "string",
|
||||
"description": "Additional context or warnings"
|
||||
},
|
||||
"skip_conditions": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": "Conditions under which to skip this step"
|
||||
}
|
||||
}
|
||||
},
|
||||
"CommandSpec": {
|
||||
"type": "object",
|
||||
"description": "Command specification",
|
||||
"required": ["command"],
|
||||
"properties": {
|
||||
"command": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": {
|
||||
"type": "string"
|
||||
},
|
||||
"requires_sudo": {
|
||||
"type": "boolean",
|
||||
"default": false
|
||||
},
|
||||
"environment": {
|
||||
"type": "object",
|
||||
"additionalProperties": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"DecisionPoint": {
|
||||
"type": "object",
|
||||
"description": "Decision branch in runbook",
|
||||
"required": ["question", "options"],
|
||||
"properties": {
|
||||
"question": {
|
||||
"type": "string"
|
||||
},
|
||||
"options": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"condition": {
|
||||
"type": "string"
|
||||
},
|
||||
"next_step": {
|
||||
"type": "integer"
|
||||
},
|
||||
"action": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"required": ["condition"]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"EscalationProcedure": {
|
||||
"type": "object",
|
||||
"description": "Escalation procedure",
|
||||
"properties": {
|
||||
"levels": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"$ref": "#/definitions/EscalationLevel"
|
||||
}
|
||||
},
|
||||
"auto_escalate_after": {
|
||||
"type": "string",
|
||||
"description": "Time after which to auto-escalate (e.g., 30m)"
|
||||
},
|
||||
"escalation_criteria": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": "Conditions that trigger escalation"
|
||||
}
|
||||
}
|
||||
},
|
||||
"EscalationLevel": {
|
||||
"type": "object",
|
||||
"description": "Single escalation level",
|
||||
"required": ["level", "contacts"],
|
||||
"properties": {
|
||||
"level": {
|
||||
"type": "integer",
|
||||
"minimum": 1
|
||||
},
|
||||
"name": {
|
||||
"type": "string"
|
||||
},
|
||||
"contacts": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"$ref": "#/definitions/Contact"
|
||||
}
|
||||
},
|
||||
"response_time_sla": {
|
||||
"type": "string",
|
||||
"description": "Expected response time (e.g., 15m)"
|
||||
},
|
||||
"notification_channels": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string",
|
||||
"enum": ["pagerduty", "slack", "email", "phone", "sms", "teams"]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"Contact": {
|
||||
"type": "object",
|
||||
"description": "Contact information",
|
||||
"required": ["name", "role"],
|
||||
"properties": {
|
||||
"name": {
|
||||
"type": "string"
|
||||
},
|
||||
"role": {
|
||||
"type": "string"
|
||||
},
|
||||
"email": {
|
||||
"type": "string",
|
||||
"format": "email"
|
||||
},
|
||||
"phone": {
|
||||
"type": "string"
|
||||
},
|
||||
"slack_handle": {
|
||||
"type": "string"
|
||||
},
|
||||
"pagerduty_id": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
},
|
||||
"CommunicationPlan": {
|
||||
"type": "object",
|
||||
"description": "Communication during incident",
|
||||
"properties": {
|
||||
"status_page": {
|
||||
"type": "string",
|
||||
"format": "uri",
|
||||
"description": "Public status page URL"
|
||||
},
|
||||
"internal_channel": {
|
||||
"type": "string",
|
||||
"description": "Internal communication channel (e.g., #incident-response)"
|
||||
},
|
||||
"stakeholder_updates": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"frequency": {
|
||||
"type": "string",
|
||||
"description": "Update frequency (e.g., every 30m)"
|
||||
},
|
||||
"recipients": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"template": {
|
||||
"type": "string",
|
||||
"description": "Status update template"
|
||||
}
|
||||
}
|
||||
},
|
||||
"customer_notification": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"required": {
|
||||
"type": "boolean"
|
||||
},
|
||||
"template": {
|
||||
"type": "string"
|
||||
},
|
||||
"approval_required": {
|
||||
"type": "boolean"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"PostIncidentChecklist": {
|
||||
"type": "object",
|
||||
"description": "Post-incident activities",
|
||||
"properties": {
|
||||
"items": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"task": {
|
||||
"type": "string"
|
||||
},
|
||||
"owner": {
|
||||
"type": "string"
|
||||
},
|
||||
"due": {
|
||||
"type": "string",
|
||||
"description": "Due timeframe (e.g., within 24h, within 1 week)"
|
||||
},
|
||||
"required": {
|
||||
"type": "boolean",
|
||||
"default": true
|
||||
}
|
||||
},
|
||||
"required": ["task"]
|
||||
}
|
||||
},
|
||||
"postmortem_required": {
|
||||
"type": "boolean",
|
||||
"default": true
|
||||
},
|
||||
"postmortem_due": {
|
||||
"type": "string",
|
||||
"description": "Timeframe for postmortem (e.g., 5 business days)"
|
||||
}
|
||||
}
|
||||
},
|
||||
"IncidentChecklist": {
|
||||
"type": "object",
|
||||
"description": "Pre-flight checklist for incident response",
|
||||
"required": ["checklist_id", "name", "items"],
|
||||
"properties": {
|
||||
"checklist_id": {
|
||||
"type": "string"
|
||||
},
|
||||
"name": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": {
|
||||
"type": "string"
|
||||
},
|
||||
"items": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"item_id": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": {
|
||||
"type": "string"
|
||||
},
|
||||
"category": {
|
||||
"type": "string",
|
||||
"enum": ["access", "tools", "documentation", "communication", "monitoring"]
|
||||
},
|
||||
"verification": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"required": ["description"]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"RunbookCatalog": {
|
||||
"type": "object",
|
||||
"description": "Catalog of all runbooks",
|
||||
"required": ["catalog_id", "version", "runbooks"],
|
||||
"properties": {
|
||||
"catalog_id": {
|
||||
"type": "string"
|
||||
},
|
||||
"version": {
|
||||
"type": "string"
|
||||
},
|
||||
"updated_at": {
|
||||
"type": "string",
|
||||
"format": "date-time"
|
||||
},
|
||||
"runbooks": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"$ref": "#/definitions/Runbook"
|
||||
}
|
||||
},
|
||||
"checklists": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"$ref": "#/definitions/IncidentChecklist"
|
||||
}
|
||||
},
|
||||
"global_contacts": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"$ref": "#/definitions/Contact"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"properties": {
|
||||
"catalog": {
|
||||
"$ref": "#/definitions/RunbookCatalog"
|
||||
}
|
||||
},
|
||||
"examples": [
|
||||
{
|
||||
"catalog": {
|
||||
"catalog_id": "stellaops-runbooks",
|
||||
"version": "2025.10.0",
|
||||
"updated_at": "2025-12-06T10:00:00Z",
|
||||
"runbooks": [
|
||||
{
|
||||
"runbook_id": "RB-VULN-001",
|
||||
"title": "Critical Vulnerability Spike Response",
|
||||
"description": "Response procedure when critical vulnerabilities spike significantly",
|
||||
"severity": "critical",
|
||||
"category": "vulnerability",
|
||||
"trigger_conditions": [
|
||||
{
|
||||
"condition_type": "alert",
|
||||
"description": "Critical vulnerability count increased by >10 in 1 hour",
|
||||
"alert_name": "CriticalVulnerabilitySpike"
|
||||
}
|
||||
],
|
||||
"prerequisites": [
|
||||
"Access to StellaOps CLI (stella)",
|
||||
"Read access to Findings Ledger",
|
||||
"Access to #security-incidents Slack channel"
|
||||
],
|
||||
"steps": [
|
||||
{
|
||||
"step_number": 1,
|
||||
"action": "Acknowledge the alert",
|
||||
"description": "Acknowledge in PagerDuty/alerting system to stop escalation",
|
||||
"timeout": "5m"
|
||||
},
|
||||
{
|
||||
"step_number": 2,
|
||||
"action": "Identify scope of new vulnerabilities",
|
||||
"command": "stella findings list --severity critical --since 1h --format table",
|
||||
"expected_output": "List of new critical findings with CVE IDs and affected assets",
|
||||
"verification": "Output shows findings with timestamps within last hour"
|
||||
},
|
||||
{
|
||||
"step_number": 3,
|
||||
"action": "Determine if spike is from new scans or advisory updates",
|
||||
"commands": [
|
||||
{
|
||||
"command": "stella scan jobs --status completed --since 1h",
|
||||
"description": "Check for recent scan completions"
|
||||
},
|
||||
{
|
||||
"command": "stella advisory updates --since 1h",
|
||||
"description": "Check for recent advisory updates"
|
||||
}
|
||||
],
|
||||
"decision_point": {
|
||||
"question": "What caused the spike?",
|
||||
"options": [
|
||||
{
|
||||
"condition": "New scans completed",
|
||||
"next_step": 4,
|
||||
"action": "Review scan results"
|
||||
},
|
||||
{
|
||||
"condition": "Advisory update",
|
||||
"next_step": 5,
|
||||
"action": "Review advisory impact"
|
||||
},
|
||||
{
|
||||
"condition": "Unknown/Both",
|
||||
"next_step": 4,
|
||||
"action": "Continue with full investigation"
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
{
|
||||
"step_number": 4,
|
||||
"action": "Review affected assets and determine business impact",
|
||||
"command": "stella findings group-by asset --severity critical --since 1h",
|
||||
"description": "Group findings by asset to understand impact scope"
|
||||
},
|
||||
{
|
||||
"step_number": 5,
|
||||
"action": "Check VEX applicability",
|
||||
"command": "stella vex check --vuln-ids $(stella findings list --severity critical --since 1h --format ids)",
|
||||
"description": "Check if any vulnerabilities have VEX statements that reduce severity"
|
||||
},
|
||||
{
|
||||
"step_number": 6,
|
||||
"action": "Update stakeholders",
|
||||
"description": "Post status update to #security-incidents with findings summary",
|
||||
"notes": "Use template: 'VULN SPIKE: [count] new critical vulns affecting [assets]. Investigation in progress.'"
|
||||
},
|
||||
{
|
||||
"step_number": 7,
|
||||
"action": "Create remediation tickets if needed",
|
||||
"command": "stella findings export --severity critical --since 1h --format jira",
|
||||
"skip_conditions": [
|
||||
"All vulnerabilities covered by VEX not_affected",
|
||||
"Vulnerabilities are duplicates from rescan"
|
||||
]
|
||||
}
|
||||
],
|
||||
"escalation": {
|
||||
"levels": [
|
||||
{
|
||||
"level": 1,
|
||||
"name": "On-call Security Engineer",
|
||||
"contacts": [
|
||||
{
|
||||
"name": "Security On-Call",
|
||||
"role": "Security Engineer",
|
||||
"slack_handle": "@security-oncall"
|
||||
}
|
||||
],
|
||||
"response_time_sla": "15m",
|
||||
"notification_channels": ["pagerduty", "slack"]
|
||||
},
|
||||
{
|
||||
"level": 2,
|
||||
"name": "Security Team Lead",
|
||||
"contacts": [
|
||||
{
|
||||
"name": "Security Lead",
|
||||
"role": "Security Team Lead",
|
||||
"slack_handle": "@security-lead"
|
||||
}
|
||||
],
|
||||
"response_time_sla": "30m",
|
||||
"notification_channels": ["pagerduty", "slack", "phone"]
|
||||
}
|
||||
],
|
||||
"auto_escalate_after": "30m",
|
||||
"escalation_criteria": [
|
||||
"No acknowledgment within 15 minutes",
|
||||
"More than 50 critical vulnerabilities",
|
||||
"Production systems affected"
|
||||
]
|
||||
},
|
||||
"communication": {
|
||||
"internal_channel": "#security-incidents",
|
||||
"stakeholder_updates": {
|
||||
"frequency": "every 30m during active incident",
|
||||
"recipients": ["security-team", "engineering-leads"],
|
||||
"template": "VULN INCIDENT UPDATE: Status: [status]. Critical count: [count]. Affected systems: [systems]. Next update: [time]."
|
||||
}
|
||||
},
|
||||
"post_incident": {
|
||||
"items": [
|
||||
{
|
||||
"task": "Document incident timeline",
|
||||
"owner": "Incident Commander",
|
||||
"due": "within 24h",
|
||||
"required": true
|
||||
},
|
||||
{
|
||||
"task": "Update vulnerability scanning schedules if needed",
|
||||
"owner": "Security Team",
|
||||
"due": "within 1 week",
|
||||
"required": false
|
||||
},
|
||||
{
|
||||
"task": "Review and update this runbook",
|
||||
"owner": "Runbook Owner",
|
||||
"due": "within 1 week",
|
||||
"required": true
|
||||
}
|
||||
],
|
||||
"postmortem_required": true,
|
||||
"postmortem_due": "5 business days"
|
||||
},
|
||||
"estimated_duration": "1h",
|
||||
"last_updated": "2025-12-06T10:00:00Z",
|
||||
"owner": "Security Operations Team",
|
||||
"tags": ["vulnerability", "security", "critical"]
|
||||
}
|
||||
],
|
||||
"checklists": [
|
||||
{
|
||||
"checklist_id": "incident-preflight",
|
||||
"name": "Incident Response Pre-flight Checklist",
|
||||
"description": "Verify access and tools before incident response",
|
||||
"items": [
|
||||
{
|
||||
"item_id": "cli-access",
|
||||
"description": "StellaOps CLI is installed and authenticated",
|
||||
"category": "tools",
|
||||
"verification": "Run 'stella whoami' successfully"
|
||||
},
|
||||
{
|
||||
"item_id": "slack-access",
|
||||
"description": "Access to #security-incidents channel",
|
||||
"category": "communication",
|
||||
"verification": "Can post messages to channel"
|
||||
},
|
||||
{
|
||||
"item_id": "pagerduty-access",
|
||||
"description": "Can acknowledge alerts in PagerDuty",
|
||||
"category": "tools",
|
||||
"verification": "PagerDuty mobile app logged in"
|
||||
},
|
||||
{
|
||||
"item_id": "runbooks-access",
|
||||
"description": "Can access runbook documentation",
|
||||
"category": "documentation",
|
||||
"verification": "docs.stella-ops.org/runbooks accessible"
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"global_contacts": [
|
||||
{
|
||||
"name": "Security Operations",
|
||||
"role": "Primary Response Team",
|
||||
"email": "security-ops@example.com",
|
||||
"slack_handle": "@security-ops"
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
Reference in New Issue
Block a user