Implement incident mode management service and models
Some checks failed
Docs CI / lint-and-preview (push) Has been cancelled

- Added IPackRunIncidentModeService interface for managing incident mode activation, deactivation, and status retrieval.
- Created PackRunIncidentModeService class implementing the service interface with methods for activating, deactivating, and escalating incident modes.
- Introduced incident mode status model (PackRunIncidentModeStatus) and related enums for escalation levels and activation sources.
- Developed retention policy, telemetry settings, and debug capture settings models to manage incident mode configurations.
- Implemented SLO breach notification handling to activate incident mode based on severity.
- Added in-memory store (InMemoryPackRunIncidentModeStore) for testing purposes.
- Created comprehensive unit tests for incident mode service, covering activation, deactivation, status retrieval, and SLO breach handling.
This commit is contained in:
StellaOps Bot
2025-12-06 22:33:00 +02:00
parent 4042fc2184
commit 9bd6a73926
23 changed files with 7779 additions and 12 deletions

View File

@@ -0,0 +1,686 @@
{
"$schema": "https://json-schema.org/draft/2020-12/schema",
"$id": "https://stella-ops.org/schemas/ops-incident-runbook.schema.json",
"title": "StellaOps Operations Incident Runbook Schema",
"description": "Schema for incident runbooks, escalation procedures, and operational checklists. Unblocks DOCS-RUNBOOK-55-001 (one or more tasks).",
"type": "object",
"definitions": {
"Runbook": {
"type": "object",
"description": "Complete incident runbook",
"required": ["runbook_id", "title", "severity", "steps"],
"properties": {
"runbook_id": {
"type": "string",
"pattern": "^RB-[A-Z]+-[0-9]+$",
"description": "Unique runbook identifier (e.g., RB-VULN-001)"
},
"title": {
"type": "string"
},
"description": {
"type": "string"
},
"severity": {
"type": "string",
"enum": ["critical", "high", "medium", "low"],
"description": "Severity level this runbook addresses"
},
"category": {
"type": "string",
"enum": ["vulnerability", "outage", "security", "performance", "data", "compliance"],
"description": "Incident category"
},
"trigger_conditions": {
"type": "array",
"items": {
"$ref": "#/definitions/TriggerCondition"
},
"description": "Conditions that trigger this runbook"
},
"prerequisites": {
"type": "array",
"items": {
"type": "string"
},
"description": "Required access/tools before starting"
},
"steps": {
"type": "array",
"items": {
"$ref": "#/definitions/RunbookStep"
}
},
"escalation": {
"$ref": "#/definitions/EscalationProcedure"
},
"communication": {
"$ref": "#/definitions/CommunicationPlan"
},
"rollback": {
"type": "array",
"items": {
"$ref": "#/definitions/RunbookStep"
},
"description": "Rollback steps if resolution fails"
},
"post_incident": {
"$ref": "#/definitions/PostIncidentChecklist"
},
"estimated_duration": {
"type": "string",
"description": "Expected time to resolve (e.g., 30m, 2h)"
},
"last_updated": {
"type": "string",
"format": "date-time"
},
"owner": {
"type": "string",
"description": "Team/person responsible for maintaining this runbook"
},
"tags": {
"type": "array",
"items": {
"type": "string"
}
}
}
},
"TriggerCondition": {
"type": "object",
"description": "Condition that triggers the runbook",
"required": ["condition_type", "description"],
"properties": {
"condition_type": {
"type": "string",
"enum": ["alert", "metric_threshold", "user_report", "scheduled", "manual"]
},
"description": {
"type": "string"
},
"alert_name": {
"type": "string",
"description": "Alert name if condition_type is 'alert'"
},
"metric_expr": {
"type": "string",
"description": "PromQL expression if condition_type is 'metric_threshold'"
},
"threshold": {
"type": "number"
}
}
},
"RunbookStep": {
"type": "object",
"description": "Individual runbook step",
"required": ["step_number", "action"],
"properties": {
"step_number": {
"type": "integer",
"minimum": 1
},
"action": {
"type": "string",
"description": "What to do"
},
"description": {
"type": "string",
"description": "Detailed explanation"
},
"command": {
"type": "string",
"description": "CLI command to execute"
},
"commands": {
"type": "array",
"items": {
"$ref": "#/definitions/CommandSpec"
},
"description": "Command specifications to execute when the step needs more than a single command"
},
"expected_output": {
"type": "string",
"description": "What success looks like"
},
"timeout": {
"type": "string",
"description": "Max time for this step (e.g., 5m)"
},
"decision_point": {
"$ref": "#/definitions/DecisionPoint",
"description": "Decision branch to evaluate when this step requires a decision"
},
"verification": {
"type": "string",
"description": "How to verify step completed successfully"
},
"notes": {
"type": "string",
"description": "Additional context or warnings"
},
"skip_conditions": {
"type": "array",
"items": {
"type": "string"
},
"description": "Conditions under which to skip this step"
}
}
},
"CommandSpec": {
"type": "object",
"description": "Command specification",
"required": ["command"],
"properties": {
"command": {
"type": "string"
},
"description": {
"type": "string"
},
"requires_sudo": {
"type": "boolean",
"default": false
},
"environment": {
"type": "object",
"additionalProperties": {
"type": "string"
}
}
}
},
"DecisionPoint": {
"type": "object",
"description": "Decision branch in runbook",
"required": ["question", "options"],
"properties": {
"question": {
"type": "string"
},
"options": {
"type": "array",
"items": {
"type": "object",
"properties": {
"condition": {
"type": "string"
},
"next_step": {
"type": "integer"
},
"action": {
"type": "string"
}
},
"required": ["condition"]
}
}
}
},
"EscalationProcedure": {
"type": "object",
"description": "Escalation procedure",
"properties": {
"levels": {
"type": "array",
"items": {
"$ref": "#/definitions/EscalationLevel"
}
},
"auto_escalate_after": {
"type": "string",
"description": "Time after which to auto-escalate (e.g., 30m)"
},
"escalation_criteria": {
"type": "array",
"items": {
"type": "string"
},
"description": "Conditions that trigger escalation"
}
}
},
"EscalationLevel": {
"type": "object",
"description": "Single escalation level",
"required": ["level", "contacts"],
"properties": {
"level": {
"type": "integer",
"minimum": 1
},
"name": {
"type": "string"
},
"contacts": {
"type": "array",
"items": {
"$ref": "#/definitions/Contact"
}
},
"response_time_sla": {
"type": "string",
"description": "Expected response time (e.g., 15m)"
},
"notification_channels": {
"type": "array",
"items": {
"type": "string",
"enum": ["pagerduty", "slack", "email", "phone", "sms", "teams"]
}
}
}
},
"Contact": {
"type": "object",
"description": "Contact information",
"required": ["name", "role"],
"properties": {
"name": {
"type": "string"
},
"role": {
"type": "string"
},
"email": {
"type": "string",
"format": "email"
},
"phone": {
"type": "string"
},
"slack_handle": {
"type": "string"
},
"pagerduty_id": {
"type": "string"
}
}
},
"CommunicationPlan": {
"type": "object",
"description": "Communication during incident",
"properties": {
"status_page": {
"type": "string",
"format": "uri",
"description": "Public status page URL"
},
"internal_channel": {
"type": "string",
"description": "Internal communication channel (e.g., #incident-response)"
},
"stakeholder_updates": {
"type": "object",
"properties": {
"frequency": {
"type": "string",
"description": "Update frequency (e.g., every 30m)"
},
"recipients": {
"type": "array",
"items": {
"type": "string"
}
},
"template": {
"type": "string",
"description": "Status update template"
}
}
},
"customer_notification": {
"type": "object",
"properties": {
"required": {
"type": "boolean"
},
"template": {
"type": "string"
},
"approval_required": {
"type": "boolean"
}
}
}
}
},
"PostIncidentChecklist": {
"type": "object",
"description": "Post-incident activities",
"properties": {
"items": {
"type": "array",
"items": {
"type": "object",
"properties": {
"task": {
"type": "string"
},
"owner": {
"type": "string"
},
"due": {
"type": "string",
"description": "Due timeframe (e.g., within 24h, within 1 week)"
},
"required": {
"type": "boolean",
"default": true
}
},
"required": ["task"]
}
},
"postmortem_required": {
"type": "boolean",
"default": true
},
"postmortem_due": {
"type": "string",
"description": "Timeframe for postmortem (e.g., 5 business days)"
}
}
},
"IncidentChecklist": {
"type": "object",
"description": "Pre-flight checklist for incident response",
"required": ["checklist_id", "name", "items"],
"properties": {
"checklist_id": {
"type": "string"
},
"name": {
"type": "string"
},
"description": {
"type": "string"
},
"items": {
"type": "array",
"items": {
"type": "object",
"properties": {
"item_id": {
"type": "string"
},
"description": {
"type": "string"
},
"category": {
"type": "string",
"enum": ["access", "tools", "documentation", "communication", "monitoring"]
},
"verification": {
"type": "string"
}
},
"required": ["description"]
}
}
}
},
"RunbookCatalog": {
"type": "object",
"description": "Catalog of all runbooks",
"required": ["catalog_id", "version", "runbooks"],
"properties": {
"catalog_id": {
"type": "string"
},
"version": {
"type": "string"
},
"updated_at": {
"type": "string",
"format": "date-time"
},
"runbooks": {
"type": "array",
"items": {
"$ref": "#/definitions/Runbook"
}
},
"checklists": {
"type": "array",
"items": {
"$ref": "#/definitions/IncidentChecklist"
}
},
"global_contacts": {
"type": "array",
"items": {
"$ref": "#/definitions/Contact"
}
}
}
}
},
"properties": {
"catalog": {
"$ref": "#/definitions/RunbookCatalog"
}
},
"examples": [
{
"catalog": {
"catalog_id": "stellaops-runbooks",
"version": "2025.10.0",
"updated_at": "2025-12-06T10:00:00Z",
"runbooks": [
{
"runbook_id": "RB-VULN-001",
"title": "Critical Vulnerability Spike Response",
"description": "Response procedure when critical vulnerabilities spike significantly",
"severity": "critical",
"category": "vulnerability",
"trigger_conditions": [
{
"condition_type": "alert",
"description": "Critical vulnerability count increased by >10 in 1 hour",
"alert_name": "CriticalVulnerabilitySpike"
}
],
"prerequisites": [
"Access to StellaOps CLI (stella)",
"Read access to Findings Ledger",
"Access to #security-incidents Slack channel"
],
"steps": [
{
"step_number": 1,
"action": "Acknowledge the alert",
"description": "Acknowledge in PagerDuty/alerting system to stop escalation",
"timeout": "5m"
},
{
"step_number": 2,
"action": "Identify scope of new vulnerabilities",
"command": "stella findings list --severity critical --since 1h --format table",
"expected_output": "List of new critical findings with CVE IDs and affected assets",
"verification": "Output shows findings with timestamps within last hour"
},
{
"step_number": 3,
"action": "Determine if spike is from new scans or advisory updates",
"commands": [
{
"command": "stella scan jobs --status completed --since 1h",
"description": "Check for recent scan completions"
},
{
"command": "stella advisory updates --since 1h",
"description": "Check for recent advisory updates"
}
],
"decision_point": {
"question": "What caused the spike?",
"options": [
{
"condition": "New scans completed",
"next_step": 4,
"action": "Review scan results"
},
{
"condition": "Advisory update",
"next_step": 5,
"action": "Review advisory impact"
},
{
"condition": "Unknown/Both",
"next_step": 4,
"action": "Continue with full investigation"
}
]
}
},
{
"step_number": 4,
"action": "Review affected assets and determine business impact",
"command": "stella findings group-by asset --severity critical --since 1h",
"description": "Group findings by asset to understand impact scope"
},
{
"step_number": 5,
"action": "Check VEX applicability",
"command": "stella vex check --vuln-ids $(stella findings list --severity critical --since 1h --format ids)",
"description": "Check if any vulnerabilities have VEX statements that reduce severity"
},
{
"step_number": 6,
"action": "Update stakeholders",
"description": "Post status update to #security-incidents with findings summary",
"notes": "Use template: 'VULN SPIKE: [count] new critical vulns affecting [assets]. Investigation in progress.'"
},
{
"step_number": 7,
"action": "Create remediation tickets if needed",
"command": "stella findings export --severity critical --since 1h --format jira",
"skip_conditions": [
"All vulnerabilities covered by VEX not_affected",
"Vulnerabilities are duplicates from rescan"
]
}
],
"escalation": {
"levels": [
{
"level": 1,
"name": "On-call Security Engineer",
"contacts": [
{
"name": "Security On-Call",
"role": "Security Engineer",
"slack_handle": "@security-oncall"
}
],
"response_time_sla": "15m",
"notification_channels": ["pagerduty", "slack"]
},
{
"level": 2,
"name": "Security Team Lead",
"contacts": [
{
"name": "Security Lead",
"role": "Security Team Lead",
"slack_handle": "@security-lead"
}
],
"response_time_sla": "30m",
"notification_channels": ["pagerduty", "slack", "phone"]
}
],
"auto_escalate_after": "30m",
"escalation_criteria": [
"No acknowledgment within 15 minutes",
"More than 50 critical vulnerabilities",
"Production systems affected"
]
},
"communication": {
"internal_channel": "#security-incidents",
"stakeholder_updates": {
"frequency": "every 30m during active incident",
"recipients": ["security-team", "engineering-leads"],
"template": "VULN INCIDENT UPDATE: Status: [status]. Critical count: [count]. Affected systems: [systems]. Next update: [time]."
}
},
"post_incident": {
"items": [
{
"task": "Document incident timeline",
"owner": "Incident Commander",
"due": "within 24h",
"required": true
},
{
"task": "Update vulnerability scanning schedules if needed",
"owner": "Security Team",
"due": "within 1 week",
"required": false
},
{
"task": "Review and update this runbook",
"owner": "Runbook Owner",
"due": "within 1 week",
"required": true
}
],
"postmortem_required": true,
"postmortem_due": "5 business days"
},
"estimated_duration": "1h",
"last_updated": "2025-12-06T10:00:00Z",
"owner": "Security Operations Team",
"tags": ["vulnerability", "security", "critical"]
}
],
"checklists": [
{
"checklist_id": "incident-preflight",
"name": "Incident Response Pre-flight Checklist",
"description": "Verify access and tools before incident response",
"items": [
{
"item_id": "cli-access",
"description": "StellaOps CLI is installed and authenticated",
"category": "tools",
"verification": "Run 'stella whoami' successfully"
},
{
"item_id": "slack-access",
"description": "Access to #security-incidents channel",
"category": "communication",
"verification": "Can post messages to channel"
},
{
"item_id": "pagerduty-access",
"description": "Can acknowledge alerts in PagerDuty",
"category": "tools",
"verification": "PagerDuty mobile app logged in"
},
{
"item_id": "runbooks-access",
"description": "Can access runbook documentation",
"category": "documentation",
"verification": "docs.stella-ops.org/runbooks accessible"
}
]
}
],
"global_contacts": [
{
"name": "Security Operations",
"role": "Primary Response Team",
"email": "security-ops@example.com",
"slack_handle": "@security-ops"
}
]
}
}
]
}