-- TTFS (Time-to-First-Signal) Schema
-- Generated from SPRINT_0338_0001_0001_ttfs_foundation.md
-- Tables are placed in the scheduler schema to co-locate them with runs/jobs data

-- ============================================================================
-- FIRST SIGNAL SNAPSHOTS
-- ============================================================================
-- Caches the current signal state for each job, enabling sub-second lookups
-- without querying live job state.

CREATE TABLE IF NOT EXISTS scheduler.first_signal_snapshots (
    job_id UUID PRIMARY KEY,
    tenant_id UUID NOT NULL,
    run_id UUID REFERENCES scheduler.runs(id),
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),

    -- Signal state
    kind TEXT NOT NULL CHECK (kind IN (
        'queued',
        'started',
        'phase',
        'blocked',
        'failed',
        'succeeded',
        'canceled',
        'unavailable'
    )),
    phase TEXT NOT NULL CHECK (phase IN (
        'resolve',
        'fetch',
        'restore',
        'analyze',
        'policy',
        'report',
        'unknown'
    )),
    summary TEXT NOT NULL,
    eta_seconds INT NULL,

    -- Predictive context
    last_known_outcome JSONB NULL,
    -- Example: {"status": "succeeded", "finished_at": "2025-12-13T10:15:00Z", "findings_count": 12}

    next_actions JSONB NULL,
    -- Example: [{"label": "View previous run", "href": "/runs/abc-123"}]

    -- Diagnostics for debugging
    diagnostics JSONB NOT NULL DEFAULT '{}',
    -- Example: {"queue_position": 3, "worker_id": "worker-7", "retry_count": 0}

    -- Flexible payload for future extensibility
    payload_json JSONB NOT NULL DEFAULT '{}'
);

COMMENT ON TABLE scheduler.first_signal_snapshots IS 'Cached first-signal state for jobs, enabling sub-second TTFS lookups';
COMMENT ON COLUMN scheduler.first_signal_snapshots.kind IS 'Current signal kind: queued, started, phase, blocked, failed, succeeded, canceled, unavailable';
COMMENT ON COLUMN scheduler.first_signal_snapshots.phase IS 'Current execution phase: resolve, fetch, restore, analyze, policy, report, unknown';
COMMENT ON COLUMN scheduler.first_signal_snapshots.eta_seconds IS 'Estimated seconds until completion, null if unknown';
COMMENT ON COLUMN scheduler.first_signal_snapshots.last_known_outcome IS 'Previous run outcome for predictive context';
COMMENT ON COLUMN scheduler.first_signal_snapshots.next_actions IS 'Suggested user actions with labels and hrefs';

-- Indexes for common query patterns
CREATE INDEX IF NOT EXISTS idx_first_signal_snapshots_tenant
    ON scheduler.first_signal_snapshots(tenant_id);

CREATE INDEX IF NOT EXISTS idx_first_signal_snapshots_updated
    ON scheduler.first_signal_snapshots(updated_at DESC);

CREATE INDEX IF NOT EXISTS idx_first_signal_snapshots_kind
    ON scheduler.first_signal_snapshots(kind);

CREATE INDEX IF NOT EXISTS idx_first_signal_snapshots_run
    ON scheduler.first_signal_snapshots(run_id);

-- Composite index for tenant + kind queries (e.g., "all failed jobs for tenant")
CREATE INDEX IF NOT EXISTS idx_first_signal_snapshots_tenant_kind
    ON scheduler.first_signal_snapshots(tenant_id, kind);

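-- Illustrative read path (not executed here): the sub-second lookup this cache
-- serves is a single-row fetch by job_id, optionally scoped by tenant. The IDs
-- below are placeholders.
--
-- SELECT kind, phase, summary, eta_seconds, last_known_outcome, next_actions
-- FROM scheduler.first_signal_snapshots
-- WHERE job_id = '00000000-0000-0000-0000-000000000000'
--   AND tenant_id = '00000000-0000-0000-0000-000000000000';
--
-- The (tenant_id, kind) composite index serves list views such as
-- "all currently blocked jobs for a tenant":
--
-- SELECT job_id, summary, updated_at
-- FROM scheduler.first_signal_snapshots
-- WHERE tenant_id = '00000000-0000-0000-0000-000000000000'
--   AND kind = 'blocked'
-- ORDER BY updated_at DESC;
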
-- ============================================================================
-- TTFS EVENTS
-- ============================================================================
-- Telemetry storage for TTFS metrics, supporting SLO analysis and alerting.

CREATE TABLE IF NOT EXISTS scheduler.ttfs_events (
    id BIGSERIAL PRIMARY KEY,
    ts TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    tenant_id UUID NOT NULL,
    job_id UUID NOT NULL,
    run_id UUID NULL,

    -- Dimensions
    surface TEXT NOT NULL CHECK (surface IN ('ui', 'cli', 'ci')),
    event_type TEXT NOT NULL CHECK (event_type IN (
        'signal.start',
        'signal.rendered',
        'signal.timeout',
        'signal.error',
        'signal.cache_hit',
        'signal.cold_start'
    )),

    -- Measurements
    ttfs_ms INT NOT NULL,
    cache_hit BOOLEAN NOT NULL DEFAULT FALSE,

    -- Signal context
    signal_source TEXT CHECK (signal_source IN ('snapshot', 'cold_start', 'failure_index')),
    kind TEXT CHECK (kind IN (
        'queued', 'started', 'phase', 'blocked',
        'failed', 'succeeded', 'canceled', 'unavailable'
    )),
    phase TEXT CHECK (phase IN (
        'resolve', 'fetch', 'restore', 'analyze',
        'policy', 'report', 'unknown'
    )),

    -- Client context
    network_state TEXT NULL,  -- e.g., '4g', 'wifi', 'offline'
    device TEXT NULL,         -- e.g., 'desktop', 'mobile', 'cli'
    release TEXT NULL,        -- Application version

    -- Tracing
    correlation_id TEXT NULL,
    trace_id TEXT NULL,
    span_id TEXT NULL,

    -- Error context
    error_code TEXT NULL,
    error_message TEXT NULL,

    -- Extensible metadata
    metadata JSONB DEFAULT '{}'
);

COMMENT ON TABLE scheduler.ttfs_events IS 'Telemetry events for Time-to-First-Signal metrics and SLO tracking';
COMMENT ON COLUMN scheduler.ttfs_events.ttfs_ms IS 'Time-to-first-signal in milliseconds';
COMMENT ON COLUMN scheduler.ttfs_events.signal_source IS 'Source of signal: snapshot (cache), cold_start (computed), failure_index (predicted)';
COMMENT ON COLUMN scheduler.ttfs_events.event_type IS 'Type of TTFS event: start, rendered, timeout, error, cache_hit, cold_start';

-- Indexes for time-series queries
CREATE INDEX IF NOT EXISTS idx_ttfs_events_ts
    ON scheduler.ttfs_events(ts DESC);

CREATE INDEX IF NOT EXISTS idx_ttfs_events_tenant_ts
    ON scheduler.ttfs_events(tenant_id, ts DESC);

CREATE INDEX IF NOT EXISTS idx_ttfs_events_surface
    ON scheduler.ttfs_events(surface, ts DESC);

CREATE INDEX IF NOT EXISTS idx_ttfs_events_job
    ON scheduler.ttfs_events(job_id);

-- Partial index for errors (for alerting queries)
CREATE INDEX IF NOT EXISTS idx_ttfs_events_errors
    ON scheduler.ttfs_events(ts DESC, error_code)
    WHERE event_type = 'signal.error';

-- Composite index for SLO analysis
CREATE INDEX IF NOT EXISTS idx_ttfs_events_surface_cache
    ON scheduler.ttfs_events(surface, cache_hit, ts DESC);

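-- Illustrative SLO query (not executed here): p95 TTFS per surface over the
-- last hour for rendered signals. The 5000 ms threshold mirrors the p95 cutoff
-- used in the rollup views below.
--
-- SELECT surface,
--        percentile_cont(0.95) WITHIN GROUP (ORDER BY ttfs_ms) AS p95_ms,
--        COUNT(*) FILTER (WHERE ttfs_ms > 5000) AS p95_breaches
-- FROM scheduler.ttfs_events
-- WHERE ts >= NOW() - INTERVAL '1 hour'
--   AND event_type = 'signal.rendered'
-- GROUP BY surface;
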
-- ============================================================================
-- FAILURE SIGNATURES
-- ============================================================================
-- Historical failure patterns for predictive "last known outcome" enrichment.

CREATE TABLE IF NOT EXISTS scheduler.failure_signatures (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    tenant_id UUID NOT NULL,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),

    -- Signature identification
    signature_hash TEXT NOT NULL,  -- SHA-256 of pattern JSON
    signature_version INT NOT NULL DEFAULT 1,

    -- Pattern matching criteria
    pattern JSONB NOT NULL,
    -- Example: {
    --   "phase": "analyze",
    --   "error_code": "LAYER_EXTRACT_FAILED",
    --   "image_pattern": "registry.io/.*:v1.*"
    -- }

    -- Outcome prediction
    outcome JSONB NOT NULL,
    -- Example: {
    --   "likely_cause": "Registry rate limiting",
    --   "mttr_p50_seconds": 300,
    --   "mttr_p95_seconds": 900,
    --   "suggested_action": "Wait 5 minutes and retry",
    --   "remediation_url": "/docs/troubleshooting/rate-limits"
    -- }

    -- Confidence metrics
    confidence NUMERIC(4,3) NOT NULL DEFAULT 0.5 CHECK (confidence >= 0 AND confidence <= 1),
    sample_count INT NOT NULL DEFAULT 0,
    last_matched_at TIMESTAMPTZ NULL,

    -- Lifecycle
    enabled BOOLEAN NOT NULL DEFAULT TRUE,
    expires_at TIMESTAMPTZ NULL,

    -- Constraints
    UNIQUE (tenant_id, signature_hash)
);

COMMENT ON TABLE scheduler.failure_signatures IS 'Historical failure patterns for predictive outcome enrichment';
COMMENT ON COLUMN scheduler.failure_signatures.signature_hash IS 'SHA-256 hash of pattern JSON for deduplication';
COMMENT ON COLUMN scheduler.failure_signatures.pattern IS 'JSON pattern for matching job failures';
COMMENT ON COLUMN scheduler.failure_signatures.outcome IS 'Predicted outcome with cause, MTTR, and suggested actions';
COMMENT ON COLUMN scheduler.failure_signatures.confidence IS 'Confidence score 0.0-1.0 based on sample count and recency';

-- Indexes for failure signature lookups
CREATE INDEX IF NOT EXISTS idx_failure_signatures_tenant
    ON scheduler.failure_signatures(tenant_id)
    WHERE enabled = TRUE;

CREATE INDEX IF NOT EXISTS idx_failure_signatures_hash
    ON scheduler.failure_signatures(signature_hash);

CREATE INDEX IF NOT EXISTS idx_failure_signatures_confidence
    ON scheduler.failure_signatures(tenant_id, confidence DESC)
    WHERE enabled = TRUE;

-- GIN index for JSONB pattern matching
CREATE INDEX IF NOT EXISTS idx_failure_signatures_pattern
    ON scheduler.failure_signatures USING GIN (pattern);

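-- Illustrative pattern lookup (not executed here): the GIN index supports JSONB
-- containment, so exact-match criteria can be pre-filtered with @> before any
-- regex matching is applied (see scheduler.match_failure_signature below).
-- The tenant ID and error code are placeholders.
--
-- SELECT id, outcome, confidence
-- FROM scheduler.failure_signatures
-- WHERE tenant_id = '00000000-0000-0000-0000-000000000000'
--   AND enabled = TRUE
--   AND pattern @> '{"error_code": "LAYER_EXTRACT_FAILED"}'::jsonb
-- ORDER BY confidence DESC
-- LIMIT 1;
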
-- ============================================================================
-- HOURLY ROLLUP VIEW
-- ============================================================================
-- Hourly rollup of TTFS metrics for dashboard queries. This is a plain view,
-- computed at query time; materialize it if dashboard query cost becomes an issue.

CREATE OR REPLACE VIEW scheduler.ttfs_hourly_summary AS
SELECT
    date_trunc('hour', ts) AS hour,
    surface,
    cache_hit,
    COUNT(*) AS event_count,
    AVG(ttfs_ms) AS avg_ms,
    percentile_cont(0.50) WITHIN GROUP (ORDER BY ttfs_ms) AS p50_ms,
    percentile_cont(0.95) WITHIN GROUP (ORDER BY ttfs_ms) AS p95_ms,
    percentile_cont(0.99) WITHIN GROUP (ORDER BY ttfs_ms) AS p99_ms,
    MIN(ttfs_ms) AS min_ms,
    MAX(ttfs_ms) AS max_ms,
    COUNT(*) FILTER (WHERE ttfs_ms > 2000) AS over_p50_slo,
    COUNT(*) FILTER (WHERE ttfs_ms > 5000) AS over_p95_slo
FROM scheduler.ttfs_events
WHERE ts >= NOW() - INTERVAL '7 days'
GROUP BY date_trunc('hour', ts), surface, cache_hit;

COMMENT ON VIEW scheduler.ttfs_hourly_summary IS 'Hourly rollup of TTFS metrics for dashboard queries';

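-- Illustrative dashboard query (not executed here): p95 TTFS for cache misses
-- on the UI surface over the last 24 hours.
--
-- SELECT hour, p95_ms, event_count
-- FROM scheduler.ttfs_hourly_summary
-- WHERE surface = 'ui'
--   AND cache_hit = FALSE
--   AND hour >= NOW() - INTERVAL '24 hours'
-- ORDER BY hour;
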
-- ============================================================================
-- DAILY ROLLUP VIEW (for long-term trending)
-- ============================================================================

CREATE OR REPLACE VIEW scheduler.ttfs_daily_summary AS
SELECT
    date_trunc('day', ts) AS day,
    tenant_id,
    surface,
    COUNT(*) AS event_count,
    AVG(ttfs_ms) AS avg_ms,
    percentile_cont(0.50) WITHIN GROUP (ORDER BY ttfs_ms) AS p50_ms,
    percentile_cont(0.95) WITHIN GROUP (ORDER BY ttfs_ms) AS p95_ms,
    SUM(CASE WHEN cache_hit THEN 1 ELSE 0 END)::FLOAT / NULLIF(COUNT(*), 0) AS cache_hit_rate,
    COUNT(*) FILTER (WHERE event_type = 'signal.error') AS error_count
FROM scheduler.ttfs_events
WHERE ts >= NOW() - INTERVAL '90 days'
GROUP BY date_trunc('day', ts), tenant_id, surface;

COMMENT ON VIEW scheduler.ttfs_daily_summary IS 'Daily rollup of TTFS metrics for long-term trending';

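-- Illustrative trend query (not executed here): 30-day cache hit rate and p95
-- per surface for one tenant. The tenant ID is a placeholder.
--
-- SELECT day, surface, cache_hit_rate, p95_ms
-- FROM scheduler.ttfs_daily_summary
-- WHERE tenant_id = '00000000-0000-0000-0000-000000000000'
--   AND day >= NOW() - INTERVAL '30 days'
-- ORDER BY day, surface;
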
-- ============================================================================
-- SLO BREACH SUMMARY VIEW
-- ============================================================================

CREATE OR REPLACE VIEW scheduler.ttfs_slo_breaches AS
SELECT
    date_trunc('hour', ts) AS hour,
    tenant_id,
    surface,
    COUNT(*) AS total_signals,
    COUNT(*) FILTER (WHERE ttfs_ms > 2000) AS p50_breaches,
    COUNT(*) FILTER (WHERE ttfs_ms > 5000) AS p95_breaches,
    ROUND(100.0 * COUNT(*) FILTER (WHERE ttfs_ms <= 2000) / NULLIF(COUNT(*), 0), 2) AS p50_compliance_pct,
    ROUND(100.0 * COUNT(*) FILTER (WHERE ttfs_ms <= 5000) / NULLIF(COUNT(*), 0), 2) AS p95_compliance_pct
FROM scheduler.ttfs_events
WHERE ts >= NOW() - INTERVAL '24 hours'
  AND event_type = 'signal.rendered'
GROUP BY date_trunc('hour', ts), tenant_id, surface
HAVING COUNT(*) > 0;

COMMENT ON VIEW scheduler.ttfs_slo_breaches IS 'SLO compliance summary for alerting dashboards';

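-- Illustrative alerting query (not executed here): hours in the view's 24-hour
-- window where p95 compliance dropped below 95%. The 95% alert threshold is an
-- assumption for illustration, not a defined SLO target.
--
-- SELECT hour, tenant_id, surface, total_signals, p95_compliance_pct
-- FROM scheduler.ttfs_slo_breaches
-- WHERE p95_compliance_pct < 95.0
-- ORDER BY p95_compliance_pct ASC;
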
-- ============================================================================
-- RETENTION POLICY (for cleanup jobs)
-- ============================================================================
-- Note: Implement retention as a scheduled job, not as a database trigger

-- Recommended retention periods:
-- - ttfs_events: 90 days (telemetry data)
-- - first_signal_snapshots: 24 hours after job completion (cache)
-- - failure_signatures: indefinite (but expire low-confidence signatures)

-- Example cleanup queries (run via scheduler):
--
-- DELETE FROM scheduler.ttfs_events WHERE ts < NOW() - INTERVAL '90 days';
--
-- DELETE FROM scheduler.first_signal_snapshots
--   WHERE updated_at < NOW() - INTERVAL '24 hours'
--   AND kind IN ('succeeded', 'failed', 'canceled');
--
-- UPDATE scheduler.failure_signatures
--   SET enabled = FALSE
--   WHERE confidence < 0.3 AND updated_at < NOW() - INTERVAL '30 days';

-- ============================================================================
-- FUNCTIONS
-- ============================================================================

-- Function to upsert first signal snapshot
CREATE OR REPLACE FUNCTION scheduler.upsert_first_signal_snapshot(
    p_job_id UUID,
    p_tenant_id UUID,
    p_run_id UUID,
    p_kind TEXT,
    p_phase TEXT,
    p_summary TEXT,
    p_eta_seconds INT DEFAULT NULL,
    p_last_known_outcome JSONB DEFAULT NULL,
    p_next_actions JSONB DEFAULT NULL,
    p_diagnostics JSONB DEFAULT '{}'
)
RETURNS scheduler.first_signal_snapshots AS $$
DECLARE
    result scheduler.first_signal_snapshots;
BEGIN
    INSERT INTO scheduler.first_signal_snapshots (
        job_id, tenant_id, run_id, kind, phase, summary,
        eta_seconds, last_known_outcome, next_actions, diagnostics
    )
    VALUES (
        p_job_id, p_tenant_id, p_run_id, p_kind, p_phase, p_summary,
        p_eta_seconds, p_last_known_outcome, p_next_actions, p_diagnostics
    )
    ON CONFLICT (job_id) DO UPDATE SET
        kind = EXCLUDED.kind,
        phase = EXCLUDED.phase,
        summary = EXCLUDED.summary,
        eta_seconds = EXCLUDED.eta_seconds,
        last_known_outcome = COALESCE(EXCLUDED.last_known_outcome, scheduler.first_signal_snapshots.last_known_outcome),
        next_actions = EXCLUDED.next_actions,
        diagnostics = EXCLUDED.diagnostics,
        updated_at = NOW()
    RETURNING * INTO result;

    -- Notify listeners for real-time updates (air-gap mode)
    PERFORM pg_notify(
        'ttfs_signal_update',
        json_build_object(
            'job_id', p_job_id,
            'tenant_id', p_tenant_id,
            'kind', p_kind,
            'phase', p_phase
        )::text
    );

    RETURN result;
END;
$$ LANGUAGE plpgsql;

COMMENT ON FUNCTION scheduler.upsert_first_signal_snapshot IS 'Upsert signal snapshot with NOTIFY for air-gap real-time updates';

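-- Illustrative usage (not executed here): upsert a snapshot as a job enters the
-- analyze phase, then subscribe to the NOTIFY channel. IDs and the summary text
-- are placeholders; parameters with defaults are omitted.
--
-- SELECT scheduler.upsert_first_signal_snapshot(
--     p_job_id      => '00000000-0000-0000-0000-000000000000',
--     p_tenant_id   => '00000000-0000-0000-0000-000000000000',
--     p_run_id      => NULL,
--     p_kind        => 'phase',
--     p_phase       => 'analyze',
--     p_summary     => 'Analyzing image layers',
--     p_eta_seconds => 90
-- );
--
-- LISTEN ttfs_signal_update;
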
-- Function to record TTFS event
CREATE OR REPLACE FUNCTION scheduler.record_ttfs_event(
    p_tenant_id UUID,
    p_job_id UUID,
    p_surface TEXT,
    p_event_type TEXT,
    p_ttfs_ms INT,
    p_cache_hit BOOLEAN DEFAULT FALSE,
    p_signal_source TEXT DEFAULT NULL,
    p_kind TEXT DEFAULT NULL,
    p_phase TEXT DEFAULT NULL,
    p_run_id UUID DEFAULT NULL,
    p_correlation_id TEXT DEFAULT NULL,
    p_error_code TEXT DEFAULT NULL,
    p_metadata JSONB DEFAULT '{}'
)
RETURNS scheduler.ttfs_events AS $$
DECLARE
    result scheduler.ttfs_events;
BEGIN
    INSERT INTO scheduler.ttfs_events (
        tenant_id, job_id, run_id, surface, event_type, ttfs_ms,
        cache_hit, signal_source, kind, phase, correlation_id,
        error_code, metadata
    )
    VALUES (
        p_tenant_id, p_job_id, p_run_id, p_surface, p_event_type, p_ttfs_ms,
        p_cache_hit, p_signal_source, p_kind, p_phase, p_correlation_id,
        p_error_code, p_metadata
    )
    RETURNING * INTO result;

    RETURN result;
END;
$$ LANGUAGE plpgsql;

COMMENT ON FUNCTION scheduler.record_ttfs_event IS 'Record TTFS telemetry event for metrics and SLO analysis';

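-- Illustrative usage (not executed here): record a cache-hit render on the UI
-- surface. IDs and the 180 ms measurement are placeholders.
--
-- SELECT scheduler.record_ttfs_event(
--     p_tenant_id     => '00000000-0000-0000-0000-000000000000',
--     p_job_id        => '00000000-0000-0000-0000-000000000000',
--     p_surface       => 'ui',
--     p_event_type    => 'signal.rendered',
--     p_ttfs_ms       => 180,
--     p_cache_hit     => TRUE,
--     p_signal_source => 'snapshot',
--     p_kind          => 'phase',
--     p_phase         => 'analyze'
-- );
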
-- Function to match failure signatures
CREATE OR REPLACE FUNCTION scheduler.match_failure_signature(
    p_tenant_id UUID,
    p_phase TEXT,
    p_error_code TEXT,
    p_image_reference TEXT DEFAULT NULL
)
RETURNS TABLE (
    signature_id UUID,
    outcome JSONB,
    confidence NUMERIC
) AS $$
BEGIN
    RETURN QUERY
    SELECT
        fs.id,
        fs.outcome,
        fs.confidence
    FROM scheduler.failure_signatures fs
    WHERE fs.tenant_id = p_tenant_id
      AND fs.enabled = TRUE
      AND (fs.expires_at IS NULL OR fs.expires_at > NOW())
      AND (fs.pattern->>'phase' IS NULL OR fs.pattern->>'phase' = p_phase)
      AND (fs.pattern->>'error_code' IS NULL OR fs.pattern->>'error_code' = p_error_code)
      AND (
          fs.pattern->>'image_pattern' IS NULL
          OR (p_image_reference IS NOT NULL AND p_image_reference ~ (fs.pattern->>'image_pattern'))
      )
    ORDER BY fs.confidence DESC, fs.sample_count DESC
    LIMIT 1;
END;
$$ LANGUAGE plpgsql;

COMMENT ON FUNCTION scheduler.match_failure_signature IS 'Find best matching failure signature for predictive outcome';
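
-- Illustrative usage (not executed here): look up the best signature for an
-- analyze-phase failure. The tenant ID, error code, and image reference are
-- placeholders matching the example pattern documented above.
--
-- SELECT signature_id, outcome, confidence
-- FROM scheduler.match_failure_signature(
--     p_tenant_id       => '00000000-0000-0000-0000-000000000000',
--     p_phase           => 'analyze',
--     p_error_code      => 'LAYER_EXTRACT_FAILED',
--     p_image_reference => 'registry.io/app:v1.2'
-- );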