-- TTFS (Time-to-First-Signal) Schema
-- Generated from SPRINT_0338_0001_0001_ttfs_foundation.md
-- Tables are placed in scheduler schema to co-locate with runs/jobs data

-- ============================================================================
-- FIRST SIGNAL SNAPSHOTS
-- ============================================================================
-- Caches the current signal state for each job, enabling sub-second lookups
-- without querying live job state. One row per job (job_id is the primary
-- key); rows are upserted via scheduler.upsert_first_signal_snapshot and
-- purged by the retention cleanup job once the job reaches a terminal kind.

CREATE TABLE IF NOT EXISTS scheduler.first_signal_snapshots (
    job_id     UUID PRIMARY KEY,
    tenant_id  UUID NOT NULL,
    -- ON DELETE SET NULL: runs are purged by retention jobs independently of
    -- cached snapshots; a lingering snapshot must not block that purge.
    run_id     UUID REFERENCES scheduler.runs(id) ON DELETE SET NULL,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),

    -- Signal state
    kind TEXT NOT NULL CHECK (kind IN (
        'queued', 'started', 'phase', 'blocked',
        'failed', 'succeeded', 'canceled', 'unavailable'
    )),
    phase TEXT NOT NULL CHECK (phase IN (
        'resolve', 'fetch', 'restore', 'analyze', 'policy', 'report', 'unknown'
    )),
    summary     TEXT NOT NULL,
    eta_seconds INT NULL,

    -- Predictive context
    -- Example: {"status": "succeeded", "finished_at": "2025-12-13T10:15:00Z",
    --           "findings_count": 12}
    last_known_outcome JSONB NULL,
    -- Example: [{"label": "View previous run", "href": "/runs/abc-123"}]
    next_actions JSONB NULL,

    -- Diagnostics for debugging
    -- Example: {"queue_position": 3, "worker_id": "worker-7", "retry_count": 0}
    diagnostics JSONB NOT NULL DEFAULT '{}',

    -- Flexible payload for future extensibility
    payload_json JSONB NOT NULL DEFAULT '{}'
);

COMMENT ON TABLE scheduler.first_signal_snapshots IS
    'Cached first-signal state for jobs, enabling sub-second TTFS lookups';
COMMENT ON COLUMN scheduler.first_signal_snapshots.kind IS
    'Current signal kind: queued, started, phase, blocked, failed, succeeded, canceled, unavailable';
COMMENT ON COLUMN scheduler.first_signal_snapshots.phase IS
    'Current execution phase: resolve, fetch, restore, analyze, policy, report, unknown';
COMMENT ON COLUMN scheduler.first_signal_snapshots.eta_seconds IS
    'Estimated seconds until completion, null if unknown';
COMMENT ON COLUMN scheduler.first_signal_snapshots.last_known_outcome IS
    'Previous run outcome for predictive context';
COMMENT ON COLUMN scheduler.first_signal_snapshots.next_actions IS
    'Suggested user actions with labels and hrefs';

-- Indexes for common query patterns
CREATE INDEX IF NOT EXISTS idx_first_signal_snapshots_tenant
    ON scheduler.first_signal_snapshots(tenant_id);
CREATE INDEX IF NOT EXISTS idx_first_signal_snapshots_updated
    ON scheduler.first_signal_snapshots(updated_at DESC);
CREATE INDEX IF NOT EXISTS idx_first_signal_snapshots_kind
    ON scheduler.first_signal_snapshots(kind);
CREATE INDEX IF NOT EXISTS idx_first_signal_snapshots_run
    ON scheduler.first_signal_snapshots(run_id);
-- Composite index for tenant + kind queries (e.g., "all failed jobs for tenant")
CREATE INDEX IF NOT EXISTS idx_first_signal_snapshots_tenant_kind
    ON scheduler.first_signal_snapshots(tenant_id, kind);

-- ============================================================================
-- TTFS EVENTS
-- ============================================================================
-- Telemetry storage for TTFS metrics, supporting SLO analysis and alerting.
-- Append-only telemetry table: one row per observed TTFS measurement.
-- Retained for 90 days (see retention policy notes at the end of this file).
CREATE TABLE IF NOT EXISTS scheduler.ttfs_events (
    id BIGSERIAL PRIMARY KEY,
    ts TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    tenant_id UUID NOT NULL,
    job_id    UUID NOT NULL,
    -- No FK on run_id: telemetry rows should survive run purges.
    -- NOTE(review): presumed deliberate — confirm against retention policy.
    run_id    UUID NULL,

    -- Dimensions
    surface TEXT NOT NULL CHECK (surface IN ('ui', 'cli', 'ci')),
    event_type TEXT NOT NULL CHECK (event_type IN (
        'signal.start', 'signal.rendered', 'signal.timeout',
        'signal.error', 'signal.cache_hit', 'signal.cold_start'
    )),

    -- Measurements
    -- A duration in milliseconds can never be negative; reject skewed client
    -- clocks / arithmetic bugs at the source rather than skewing percentiles.
    ttfs_ms   INT NOT NULL CHECK (ttfs_ms >= 0),
    cache_hit BOOLEAN NOT NULL DEFAULT FALSE,

    -- Signal context
    signal_source TEXT CHECK (signal_source IN ('snapshot', 'cold_start', 'failure_index')),
    kind TEXT CHECK (kind IN (
        'queued', 'started', 'phase', 'blocked',
        'failed', 'succeeded', 'canceled', 'unavailable'
    )),
    phase TEXT CHECK (phase IN (
        'resolve', 'fetch', 'restore', 'analyze', 'policy', 'report', 'unknown'
    )),

    -- Client context
    network_state TEXT NULL, -- e.g., '4g', 'wifi', 'offline'
    device        TEXT NULL, -- e.g., 'desktop', 'mobile', 'cli'
    release       TEXT NULL, -- Application version

    -- Tracing
    correlation_id TEXT NULL,
    trace_id       TEXT NULL,
    span_id        TEXT NULL,

    -- Error context
    error_code    TEXT NULL,
    error_message TEXT NULL,

    -- Extensible metadata
    -- NOT NULL for consistency with the other JSONB-with-default columns in
    -- this schema (first_signal_snapshots.diagnostics / payload_json).
    metadata JSONB NOT NULL DEFAULT '{}'
);

COMMENT ON TABLE scheduler.ttfs_events IS
    'Telemetry events for Time-to-First-Signal metrics and SLO tracking';
COMMENT ON COLUMN scheduler.ttfs_events.ttfs_ms IS
    'Time-to-first-signal in milliseconds';
COMMENT ON COLUMN scheduler.ttfs_events.signal_source IS
    'Source of signal: snapshot (cache), cold_start (computed), failure_index (predicted)';
COMMENT ON COLUMN scheduler.ttfs_events.event_type IS
    'Type of TTFS event: start, rendered, timeout, error, cache_hit, cold_start';

-- Indexes for time-series queries
CREATE INDEX IF NOT EXISTS idx_ttfs_events_ts
    ON scheduler.ttfs_events(ts DESC);
CREATE INDEX IF NOT EXISTS idx_ttfs_events_tenant_ts
    ON scheduler.ttfs_events(tenant_id, ts DESC);
CREATE INDEX IF NOT EXISTS idx_ttfs_events_surface
    ON scheduler.ttfs_events(surface, ts DESC);
CREATE INDEX IF NOT EXISTS idx_ttfs_events_job
    ON scheduler.ttfs_events(job_id);
-- Partial index for errors (for alerting queries)
CREATE INDEX IF NOT EXISTS idx_ttfs_events_errors
    ON scheduler.ttfs_events(ts DESC, error_code)
    WHERE event_type = 'signal.error';
-- Composite index for SLO analysis
CREATE INDEX IF NOT EXISTS idx_ttfs_events_surface_cache
    ON scheduler.ttfs_events(surface, cache_hit, ts DESC);

-- ============================================================================
-- FAILURE SIGNATURES
-- ============================================================================
-- Historical failure patterns for predictive "last known outcome" enrichment.

CREATE TABLE IF NOT EXISTS scheduler.failure_signatures (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    tenant_id  UUID NOT NULL,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),

    -- Signature identification
    signature_hash    TEXT NOT NULL, -- SHA-256 of pattern JSON
    signature_version INT NOT NULL DEFAULT 1,

    -- Pattern matching criteria
    -- Example: {
    --   "phase": "analyze",
    --   "error_code": "LAYER_EXTRACT_FAILED",
    --   "image_pattern": "registry.io/.*:v1.*"
    -- }
    pattern JSONB NOT NULL,

    -- Outcome prediction
    -- Example: {
    --   "likely_cause": "Registry rate limiting",
    --   "mttr_p50_seconds": 300,
    --   "mttr_p95_seconds": 900,
    --   "suggested_action": "Wait 5 minutes and retry",
    --   "remediation_url": "/docs/troubleshooting/rate-limits"
    -- }
    outcome JSONB NOT NULL,

    -- Confidence metrics
    confidence NUMERIC(4,3) NOT NULL DEFAULT 0.5
        CHECK (confidence >= 0 AND confidence <= 1),
    -- A sample count is a tally; it can never go negative.
    sample_count INT NOT NULL DEFAULT 0 CHECK (sample_count >= 0),
    last_matched_at TIMESTAMPTZ NULL,

    -- Lifecycle
    enabled    BOOLEAN NOT NULL DEFAULT TRUE,
    expires_at TIMESTAMPTZ NULL,

    -- Constraints (named so constraint-violation errors are greppable)
    CONSTRAINT failure_signatures_tenant_signature_key
        UNIQUE (tenant_id, signature_hash)
);

COMMENT ON TABLE scheduler.failure_signatures IS
    'Historical failure patterns for predictive outcome enrichment';
COMMENT ON COLUMN scheduler.failure_signatures.signature_hash IS
    'SHA-256 hash of pattern JSON for deduplication';
COMMENT ON COLUMN scheduler.failure_signatures.pattern IS
    'JSON pattern for matching job failures';
COMMENT ON COLUMN scheduler.failure_signatures.outcome IS
    'Predicted outcome with cause, MTTR, and suggested actions';
COMMENT ON COLUMN scheduler.failure_signatures.confidence IS
    'Confidence score 0.0-1.0 based on sample count and recency';

-- Indexes for failure signature lookups
CREATE INDEX IF NOT EXISTS idx_failure_signatures_tenant
    ON scheduler.failure_signatures(tenant_id)
    WHERE enabled = TRUE;
CREATE INDEX IF NOT EXISTS idx_failure_signatures_hash
    ON scheduler.failure_signatures(signature_hash);
CREATE INDEX IF NOT EXISTS idx_failure_signatures_confidence
    ON scheduler.failure_signatures(tenant_id, confidence DESC)
    WHERE enabled = TRUE;
-- GIN index for JSONB pattern matching
CREATE INDEX IF NOT EXISTS idx_failure_signatures_pattern
    ON scheduler.failure_signatures USING GIN (pattern);

-- ============================================================================
-- HOURLY ROLLUP VIEW
-- ============================================================================
-- Pre-aggregated metrics for dashboard performance.
-- Hourly latency rollup over a rolling 7-day window, broken out by surface
-- and cache hit/miss. SLO thresholds used throughout: 2000 ms (p50 target)
-- and 5000 ms (p95 target).
CREATE OR REPLACE VIEW scheduler.ttfs_hourly_summary AS
SELECT
    date_trunc('hour', ts) AS hour,
    surface,
    cache_hit,
    COUNT(*) AS event_count,
    AVG(ttfs_ms) AS avg_ms,
    percentile_cont(0.50) WITHIN GROUP (ORDER BY ttfs_ms) AS p50_ms,
    percentile_cont(0.95) WITHIN GROUP (ORDER BY ttfs_ms) AS p95_ms,
    percentile_cont(0.99) WITHIN GROUP (ORDER BY ttfs_ms) AS p99_ms,
    MIN(ttfs_ms) AS min_ms,
    MAX(ttfs_ms) AS max_ms,
    COUNT(*) FILTER (WHERE ttfs_ms > 2000) AS over_p50_slo,
    COUNT(*) FILTER (WHERE ttfs_ms > 5000) AS over_p95_slo
FROM scheduler.ttfs_events
WHERE ts >= NOW() - INTERVAL '7 days'
GROUP BY date_trunc('hour', ts), surface, cache_hit;

COMMENT ON VIEW scheduler.ttfs_hourly_summary IS
    'Hourly rollup of TTFS metrics for dashboard queries';

-- ============================================================================
-- DAILY ROLLUP VIEW (for long-term trending)
-- ============================================================================
CREATE OR REPLACE VIEW scheduler.ttfs_daily_summary AS
SELECT
    date_trunc('day', ts) AS day,
    tenant_id,
    surface,
    COUNT(*) AS event_count,
    AVG(ttfs_ms) AS avg_ms,
    percentile_cont(0.50) WITHIN GROUP (ORDER BY ttfs_ms) AS p50_ms,
    percentile_cont(0.95) WITHIN GROUP (ORDER BY ttfs_ms) AS p95_ms,
    -- NULLIF guards the (unreachable after GROUP BY, but cheap) divide-by-zero.
    SUM(CASE WHEN cache_hit THEN 1 ELSE 0 END)::FLOAT
        / NULLIF(COUNT(*), 0) AS cache_hit_rate,
    COUNT(*) FILTER (WHERE event_type = 'signal.error') AS error_count
FROM scheduler.ttfs_events
WHERE ts >= NOW() - INTERVAL '90 days'
GROUP BY date_trunc('day', ts), tenant_id, surface;

COMMENT ON VIEW scheduler.ttfs_daily_summary IS
    'Daily rollup of TTFS metrics for long-term trending';

-- ============================================================================
-- SLO BREACH SUMMARY VIEW
-- ============================================================================
-- Compliance is computed only over rendered signals (event_type =
-- 'signal.rendered') in the last 24 hours.
CREATE OR REPLACE VIEW scheduler.ttfs_slo_breaches AS
SELECT
    date_trunc('hour', ts) AS hour,
    tenant_id,
    surface,
    COUNT(*) AS total_signals,
    COUNT(*) FILTER (WHERE ttfs_ms > 2000) AS p50_breaches,
    COUNT(*) FILTER (WHERE ttfs_ms > 5000) AS p95_breaches,
    ROUND(100.0 * COUNT(*) FILTER (WHERE ttfs_ms <= 2000)
        / NULLIF(COUNT(*), 0), 2) AS p50_compliance_pct,
    ROUND(100.0 * COUNT(*) FILTER (WHERE ttfs_ms <= 5000)
        / NULLIF(COUNT(*), 0), 2) AS p95_compliance_pct
FROM scheduler.ttfs_events
WHERE ts >= NOW() - INTERVAL '24 hours'
  AND event_type = 'signal.rendered'
GROUP BY date_trunc('hour', ts), tenant_id, surface
HAVING COUNT(*) > 0;

COMMENT ON VIEW scheduler.ttfs_slo_breaches IS
    'SLO compliance summary for alerting dashboards';

-- ============================================================================
-- RETENTION POLICY (for cleanup jobs)
-- ============================================================================
-- Note: Implement as scheduled job, not as database trigger
-- Recommended retention periods:
--   - ttfs_events: 90 days (telemetry data)
--   - first_signal_snapshots: 24 hours after job completion (cache)
--   - failure_signatures: indefinite (but expire low-confidence signatures)
--
-- Example cleanup queries (run via scheduler):
--
--   DELETE FROM scheduler.ttfs_events WHERE ts < NOW() - INTERVAL '90 days';
--
--   DELETE FROM scheduler.first_signal_snapshots
--   WHERE updated_at < NOW() - INTERVAL '24 hours'
--     AND kind IN ('succeeded', 'failed', 'canceled');
--
--   UPDATE scheduler.failure_signatures
--   SET enabled = FALSE
--   WHERE confidence < 0.3 AND updated_at < NOW() - INTERVAL '30 days';

-- ============================================================================
-- FUNCTIONS
-- ============================================================================

-- Upsert the cached signal snapshot for a job and NOTIFY listeners on the
-- 'ttfs_signal_update' channel so air-gapped UIs can react in real time.
-- last_known_outcome is sticky: a NULL argument preserves the cached value so
-- plain progress updates never erase predictive context.
CREATE OR REPLACE FUNCTION scheduler.upsert_first_signal_snapshot(
    p_job_id UUID,
    p_tenant_id UUID,
    p_run_id UUID,
    p_kind TEXT,
    p_phase TEXT,
    p_summary TEXT,
    p_eta_seconds INT DEFAULT NULL,
    p_last_known_outcome JSONB DEFAULT NULL,
    p_next_actions JSONB DEFAULT NULL,
    p_diagnostics JSONB DEFAULT '{}'
) RETURNS scheduler.first_signal_snapshots AS $$
DECLARE
    result scheduler.first_signal_snapshots;
BEGIN
    INSERT INTO scheduler.first_signal_snapshots (
        job_id, tenant_id, run_id, kind, phase, summary,
        eta_seconds, last_known_outcome, next_actions, diagnostics
    ) VALUES (
        p_job_id, p_tenant_id, p_run_id, p_kind, p_phase, p_summary,
        p_eta_seconds, p_last_known_outcome, p_next_actions, p_diagnostics
    )
    ON CONFLICT (job_id) DO UPDATE SET
        kind = EXCLUDED.kind,
        phase = EXCLUDED.phase,
        summary = EXCLUDED.summary,
        eta_seconds = EXCLUDED.eta_seconds,
        -- Preserve cached predictive context when the caller passes NULL.
        last_known_outcome = COALESCE(
            EXCLUDED.last_known_outcome,
            scheduler.first_signal_snapshots.last_known_outcome),
        next_actions = EXCLUDED.next_actions,
        diagnostics = EXCLUDED.diagnostics,
        updated_at = NOW()
    RETURNING * INTO result;

    -- Notify listeners for real-time updates (air-gap mode)
    PERFORM pg_notify(
        'ttfs_signal_update',
        json_build_object(
            'job_id', p_job_id,
            'tenant_id', p_tenant_id,
            'kind', p_kind,
            'phase', p_phase
        )::text
    );

    RETURN result;
END;
$$ LANGUAGE plpgsql;

COMMENT ON FUNCTION scheduler.upsert_first_signal_snapshot IS
    'Upsert signal snapshot with NOTIFY for air-gap real-time updates';

-- Drop any previously deployed narrower signature first: CREATE OR REPLACE
-- with added parameters would otherwise create a second overload and make
-- shorter positional calls ambiguous.
DROP FUNCTION IF EXISTS scheduler.record_ttfs_event(
    UUID, UUID, TEXT, TEXT, INT, BOOLEAN, TEXT, TEXT, TEXT, UUID, TEXT, TEXT, JSONB
);

-- Record one TTFS telemetry event. All optional columns of ttfs_events are
-- now reachable: error_message, client context (network_state/device/release)
-- and tracing ids (trace_id/span_id) are appended with NULL defaults, so
-- existing positional callers are unaffected.
CREATE OR REPLACE FUNCTION scheduler.record_ttfs_event(
    p_tenant_id UUID,
    p_job_id UUID,
    p_surface TEXT,
    p_event_type TEXT,
    p_ttfs_ms INT,
    p_cache_hit BOOLEAN DEFAULT FALSE,
    p_signal_source TEXT DEFAULT NULL,
    p_kind TEXT DEFAULT NULL,
    p_phase TEXT DEFAULT NULL,
    p_run_id UUID DEFAULT NULL,
    p_correlation_id TEXT DEFAULT NULL,
    p_error_code TEXT DEFAULT NULL,
    p_metadata JSONB DEFAULT '{}',
    -- New optional parameters (appended for backward compatibility):
    p_error_message TEXT DEFAULT NULL,
    p_network_state TEXT DEFAULT NULL,
    p_device TEXT DEFAULT NULL,
    p_release TEXT DEFAULT NULL,
    p_trace_id TEXT DEFAULT NULL,
    p_span_id TEXT DEFAULT NULL
) RETURNS scheduler.ttfs_events AS $$
DECLARE
    result scheduler.ttfs_events;
BEGIN
    INSERT INTO scheduler.ttfs_events (
        tenant_id, job_id, run_id, surface, event_type,
        ttfs_ms, cache_hit, signal_source, kind, phase,
        correlation_id, error_code, error_message,
        network_state, device, release,
        trace_id, span_id, metadata
    ) VALUES (
        p_tenant_id, p_job_id, p_run_id, p_surface, p_event_type,
        p_ttfs_ms, p_cache_hit, p_signal_source, p_kind, p_phase,
        p_correlation_id, p_error_code, p_error_message,
        p_network_state, p_device, p_release,
        p_trace_id, p_span_id, p_metadata
    )
    RETURNING * INTO result;

    RETURN result;
END;
$$ LANGUAGE plpgsql;

COMMENT ON FUNCTION scheduler.record_ttfs_event IS
    'Record TTFS telemetry event for metrics and SLO analysis';

-- Find the single best (highest-confidence, then most-sampled) enabled and
-- unexpired signature whose pattern matches the given failure. A pattern key
-- that is absent (JSON ->> yields NULL) acts as a wildcard for that field.
-- NOTE(review): pattern->>'image_pattern' is applied as a POSIX regex (~);
-- an invalid stored regex will raise at query time — assumed validated on
-- signature insert, confirm with the signature writer.
CREATE OR REPLACE FUNCTION scheduler.match_failure_signature(
    p_tenant_id UUID,
    p_phase TEXT,
    p_error_code TEXT,
    p_image_reference TEXT DEFAULT NULL
) RETURNS TABLE (
    signature_id UUID,
    outcome JSONB,
    confidence NUMERIC
) AS $$
BEGIN
    RETURN QUERY
    SELECT
        fs.id,
        fs.outcome,
        fs.confidence
    FROM scheduler.failure_signatures fs
    WHERE fs.tenant_id = p_tenant_id
      AND fs.enabled = TRUE
      AND (fs.expires_at IS NULL OR fs.expires_at > NOW())
      AND (fs.pattern->>'phase' IS NULL OR fs.pattern->>'phase' = p_phase)
      AND (fs.pattern->>'error_code' IS NULL
           OR fs.pattern->>'error_code' = p_error_code)
      AND (fs.pattern->>'image_pattern' IS NULL
           OR (p_image_reference IS NOT NULL
               AND p_image_reference ~ (fs.pattern->>'image_pattern')))
    ORDER BY fs.confidence DESC, fs.sample_count DESC
    LIMIT 1;
END;
$$ LANGUAGE plpgsql;

COMMENT ON FUNCTION scheduler.match_failure_signature IS
    'Find best matching failure signature for predictive outcome';