devops folders consolidate
This commit is contained in:
36
deploy/telemetry/alerts/alerts-slo.yaml
Normal file
36
deploy/telemetry/alerts/alerts-slo.yaml
Normal file
@@ -0,0 +1,36 @@
|
||||
groups:
|
||||
- name: slo-burn
|
||||
rules:
|
||||
- alert: SLOBurnRateFast
|
||||
expr: |
|
||||
(rate(service_request_errors_total[5m]) / rate(service_requests_total[5m])) >
|
||||
4 * (1 - 0.99)
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
team: devops
|
||||
annotations:
|
||||
summary: "Fast burn: 99% SLO breached"
|
||||
description: "Error budget burn (5m) exceeds fast threshold."
|
||||
- alert: SLOBurnRateSlow
|
||||
expr: |
|
||||
(rate(service_request_errors_total[1h]) / rate(service_requests_total[1h])) >
|
||||
1 * (1 - 0.99)
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
team: devops
|
||||
annotations:
|
||||
summary: "Slow burn: 99% SLO at risk"
|
||||
description: "Error budget burn (1h) exceeds slow threshold."
|
||||
- name: slo-webhook
|
||||
rules:
|
||||
- alert: SLOWebhookFailures
|
||||
expr: rate(slo_webhook_failures_total[5m]) > 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
team: devops
|
||||
annotations:
|
||||
summary: "SLO webhook failures"
|
||||
description: "Webhook emitter has failures in last 5m."
|
||||
164
deploy/telemetry/alerts/export-center-alerts.yaml
Normal file
164
deploy/telemetry/alerts/export-center-alerts.yaml
Normal file
@@ -0,0 +1,164 @@
|
||||
# ExportCenter Alert Rules
|
||||
# SLO Burn-rate alerts for export service reliability
|
||||
|
||||
groups:
|
||||
- name: export-center-slo
|
||||
interval: 30s
|
||||
rules:
|
||||
# SLO: 99.5% success rate target
|
||||
# Error budget: 0.5% (432 errors per day at 86400 requests/day)
|
||||
|
||||
# Fast burn - 2% budget consumption in 1 hour (critical)
|
||||
- alert: ExportCenterHighErrorBurnRate
|
||||
expr: |
|
||||
(
|
||||
sum(rate(export_runs_failed_total[1h]))
|
||||
/
|
||||
sum(rate(export_runs_total[1h]))
|
||||
) > (14.4 * 0.005)
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
service: export-center
|
||||
slo: availability
|
||||
annotations:
|
||||
summary: "ExportCenter high error burn rate"
|
||||
description: "Error rate is {{ $value | humanizePercentage }} over the last hour, consuming error budget at 14.4x the sustainable rate."
|
||||
runbook_url: "https://docs.stellaops.io/runbooks/export-center/high-error-rate"
|
||||
|
||||
# Slow burn - 10% budget consumption in 6 hours (warning)
|
||||
- alert: ExportCenterElevatedErrorBurnRate
|
||||
expr: |
|
||||
(
|
||||
sum(rate(export_runs_failed_total[6h]))
|
||||
/
|
||||
sum(rate(export_runs_total[6h]))
|
||||
) > (6 * 0.005)
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
service: export-center
|
||||
slo: availability
|
||||
annotations:
|
||||
summary: "ExportCenter elevated error burn rate"
|
||||
description: "Error rate is {{ $value | humanizePercentage }} over the last 6 hours, consuming error budget at 6x the sustainable rate."
|
||||
runbook_url: "https://docs.stellaops.io/runbooks/export-center/elevated-error-rate"
|
||||
|
||||
- name: export-center-latency
|
||||
interval: 30s
|
||||
rules:
|
||||
# SLO: 95% of exports complete within 120s
|
||||
# Fast burn - p95 latency exceeding threshold
|
||||
- alert: ExportCenterHighLatency
|
||||
expr: |
|
||||
histogram_quantile(0.95,
|
||||
sum(rate(export_run_duration_seconds_bucket[5m])) by (le)
|
||||
) > 120
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
service: export-center
|
||||
slo: latency
|
||||
annotations:
|
||||
summary: "ExportCenter high latency"
|
||||
description: "95th percentile export duration is {{ $value | humanizeDuration }}, exceeding 120s SLO target."
|
||||
runbook_url: "https://docs.stellaops.io/runbooks/export-center/high-latency"
|
||||
|
||||
# Critical latency - p99 exceeding 5 minutes
|
||||
- alert: ExportCenterCriticalLatency
|
||||
expr: |
|
||||
histogram_quantile(0.99,
|
||||
sum(rate(export_run_duration_seconds_bucket[5m])) by (le)
|
||||
) > 300
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
service: export-center
|
||||
slo: latency
|
||||
annotations:
|
||||
summary: "ExportCenter critical latency"
|
||||
description: "99th percentile export duration is {{ $value | humanizeDuration }}, indicating severe performance degradation."
|
||||
runbook_url: "https://docs.stellaops.io/runbooks/export-center/critical-latency"
|
||||
|
||||
- name: export-center-capacity
|
||||
interval: 60s
|
||||
rules:
|
||||
# Queue buildup warning
|
||||
- alert: ExportCenterHighConcurrency
|
||||
expr: sum(export_runs_in_progress) > 50
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
service: export-center
|
||||
annotations:
|
||||
summary: "ExportCenter high concurrency"
|
||||
description: "{{ $value }} exports currently in progress. Consider scaling or investigating slow exports."
|
||||
runbook_url: "https://docs.stellaops.io/runbooks/export-center/high-concurrency"
|
||||
|
||||
# Stuck exports - exports running longer than 30 minutes
|
||||
- alert: ExportCenterStuckExports
|
||||
expr: |
|
||||
histogram_quantile(0.99,
|
||||
sum(rate(export_run_duration_seconds_bucket{status!="completed"}[1h])) by (le)
|
||||
) > 1800
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
service: export-center
|
||||
annotations:
|
||||
summary: "ExportCenter potentially stuck exports"
|
||||
description: "Some exports may be stuck - 99th percentile duration for incomplete exports exceeds 30 minutes."
|
||||
runbook_url: "https://docs.stellaops.io/runbooks/export-center/stuck-exports"
|
||||
|
||||
- name: export-center-errors
|
||||
interval: 30s
|
||||
rules:
|
||||
# Specific error code spike
|
||||
- alert: ExportCenterErrorCodeSpike
|
||||
expr: |
|
||||
sum by (error_code) (
|
||||
rate(export_runs_failed_total[5m])
|
||||
) > 0.1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
service: export-center
|
||||
annotations:
|
||||
summary: "ExportCenter error code spike: {{ $labels.error_code }}"
|
||||
description: "Error code {{ $labels.error_code }} is occurring at {{ $value | humanize }}/s rate."
|
||||
runbook_url: "https://docs.stellaops.io/runbooks/export-center/error-codes"
|
||||
|
||||
# No successful exports in 15 minutes (when there is traffic)
|
||||
- alert: ExportCenterNoSuccessfulExports
|
||||
expr: |
|
||||
(
|
||||
sum(rate(export_runs_total[15m])) > 0
|
||||
)
|
||||
and
|
||||
(
|
||||
sum(rate(export_runs_success_total[15m])) == 0
|
||||
)
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
service: export-center
|
||||
annotations:
|
||||
summary: "ExportCenter no successful exports"
|
||||
description: "No exports have completed successfully in the last 15 minutes despite ongoing attempts."
|
||||
runbook_url: "https://docs.stellaops.io/runbooks/export-center/no-successful-exports"
|
||||
|
||||
- name: export-center-deprecation
|
||||
interval: 5m
|
||||
rules:
|
||||
# Deprecated endpoint usage
|
||||
- alert: ExportCenterDeprecatedEndpointUsage
|
||||
expr: |
|
||||
sum(rate(export_center_deprecated_endpoint_access_total[1h])) > 0
|
||||
for: 1h
|
||||
labels:
|
||||
severity: info
|
||||
service: export-center
|
||||
annotations:
|
||||
summary: "Deprecated export endpoints still in use"
|
||||
description: "Legacy /exports endpoints are still being accessed at {{ $value | humanize }}/s. Migration to v1 API recommended."
|
||||
runbook_url: "https://docs.stellaops.io/api/export-center/migration"
|
||||
52
deploy/telemetry/alerts/policy-alerts.yaml
Normal file
52
deploy/telemetry/alerts/policy-alerts.yaml
Normal file
@@ -0,0 +1,52 @@
|
||||
groups:
|
||||
- name: policy-pipeline
|
||||
rules:
|
||||
- alert: PolicyCompileLatencyP99High
|
||||
expr: histogram_quantile(0.99, sum(rate(policy_compile_duration_seconds_bucket[5m])) by (le)) > 5
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
service: policy
|
||||
annotations:
|
||||
summary: "Policy compile latency elevated (p99)"
|
||||
description: "p99 compile duration has been >5s for 10m"
|
||||
|
||||
- alert: PolicySimulationQueueBacklog
|
||||
expr: sum(policy_simulation_queue_depth) > 100
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
service: policy
|
||||
annotations:
|
||||
summary: "Policy simulation backlog"
|
||||
description: "Simulation queue depth above 100 for 10m"
|
||||
|
||||
- alert: PolicyApprovalLatencyHigh
|
||||
expr: histogram_quantile(0.95, sum(rate(policy_approval_latency_seconds_bucket[5m])) by (le)) > 30
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
service: policy
|
||||
annotations:
|
||||
summary: "Policy approval latency high"
|
||||
description: "p95 approval latency above 30s for 15m"
|
||||
|
||||
- alert: PolicyPromotionFailureRate
|
||||
expr: clamp_min(rate(policy_promotion_outcomes_total{outcome="failure"}[15m]), 0) / clamp_min(rate(policy_promotion_outcomes_total[15m]), 1) > 0.2
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
service: policy
|
||||
annotations:
|
||||
summary: "Policy promotion failure rate elevated"
|
||||
description: "Failures exceed 20% of promotions over 15m"
|
||||
|
||||
- alert: PolicyPromotionStall
|
||||
expr: rate(policy_promotion_outcomes_total{outcome="success"}[10m]) == 0 and sum(policy_simulation_queue_depth) > 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
service: policy
|
||||
annotations:
|
||||
summary: "Policy promotion stalled"
|
||||
description: "No successful promotions while work is queued"
|
||||
42
deploy/telemetry/alerts/scanner-fn-drift-alerts.yaml
Normal file
42
deploy/telemetry/alerts/scanner-fn-drift-alerts.yaml
Normal file
@@ -0,0 +1,42 @@
|
||||
# Scanner FN-Drift Alert Rules
|
||||
# SLO alerts for false-negative drift thresholds (30-day rolling window)
|
||||
|
||||
groups:
|
||||
- name: scanner-fn-drift
|
||||
interval: 30s
|
||||
rules:
|
||||
- alert: ScannerFnDriftWarning
|
||||
expr: scanner_fn_drift_percent > 1.0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
service: scanner
|
||||
slo: fn-drift
|
||||
annotations:
|
||||
summary: "Scanner FN-Drift rate above warning threshold"
|
||||
description: "FN-Drift is {{ $value | humanizePercentage }} (> 1.0%) over the 30-day rolling window."
|
||||
runbook_url: "https://docs.stellaops.io/runbooks/scanner/fn-drift-warning"
|
||||
|
||||
- alert: ScannerFnDriftCritical
|
||||
expr: scanner_fn_drift_percent > 2.5
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
service: scanner
|
||||
slo: fn-drift
|
||||
annotations:
|
||||
summary: "Scanner FN-Drift rate above critical threshold"
|
||||
description: "FN-Drift is {{ $value | humanizePercentage }} (> 2.5%) over the 30-day rolling window."
|
||||
runbook_url: "https://docs.stellaops.io/runbooks/scanner/fn-drift-critical"
|
||||
|
||||
- alert: ScannerFnDriftEngineViolation
|
||||
expr: scanner_fn_drift_cause_engine > 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: page
|
||||
service: scanner
|
||||
slo: determinism
|
||||
annotations:
|
||||
summary: "Engine-caused FN drift detected (determinism violation)"
|
||||
description: "Engine-caused FN drift count is {{ $value }} (> 0). This indicates non-feed, non-policy changes affecting outcomes."
|
||||
runbook_url: "https://docs.stellaops.io/runbooks/scanner/fn-drift-engine-violation"
|
||||
54
deploy/telemetry/alerts/signals-alerts.yaml
Normal file
54
deploy/telemetry/alerts/signals-alerts.yaml
Normal file
@@ -0,0 +1,54 @@
|
||||
groups:
|
||||
- name: signals-pipeline
|
||||
rules:
|
||||
- alert: SignalsScoringLatencyP95High
|
||||
expr: histogram_quantile(0.95, sum(rate(signals_reachability_scoring_duration_seconds_bucket[5m])) by (le)) > 2
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
service: signals
|
||||
annotations:
|
||||
summary: "Signals scoring latency high (p95)"
|
||||
description: "Reachability scoring p95 exceeds 2s for 10m"
|
||||
|
||||
- alert: SignalsCacheMissRateHigh
|
||||
expr: |
|
||||
clamp_min(rate(signals_cache_misses_total[5m]), 0)
|
||||
/ clamp_min(rate(signals_cache_hits_total[5m]) + rate(signals_cache_misses_total[5m]), 1) > 0.3
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
service: signals
|
||||
annotations:
|
||||
summary: "Signals cache miss rate high"
|
||||
description: "Cache miss ratio >30% over 10m; investigate Redis or key churn."
|
||||
|
||||
- alert: SignalsCacheDown
|
||||
expr: signals_cache_available == 0
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
service: signals
|
||||
annotations:
|
||||
summary: "Signals cache unavailable"
|
||||
description: "Redis cache reported unavailable for >2m"
|
||||
|
||||
- alert: SignalsSensorStaleness
|
||||
expr: time() - max(signals_sensor_last_seen_timestamp_seconds) by (sensor) > 900
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
service: signals
|
||||
annotations:
|
||||
summary: "Signals sensor stale"
|
||||
description: "No updates from sensor for >15 minutes"
|
||||
|
||||
- alert: SignalsIngestionErrorRate
|
||||
expr: clamp_min(rate(signals_ingestion_failures_total[5m]), 0) / clamp_min(rate(signals_ingestion_total[5m]), 1) > 0.05
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
service: signals
|
||||
annotations:
|
||||
summary: "Signals ingestion failures elevated"
|
||||
description: "Ingestion failure ratio above 5% over 5m"
|
||||
118
deploy/telemetry/alerts/stella-p0-alerts.yml
Normal file
118
deploy/telemetry/alerts/stella-p0-alerts.yml
Normal file
@@ -0,0 +1,118 @@
|
||||
# Sprint: SPRINT_20260117_028_Telemetry_p0_metrics
|
||||
# Task: P0M-006 - Alerting Rules
|
||||
# P0 Product Metrics Alert Rules
|
||||
|
||||
groups:
|
||||
- name: stella-p0-metrics
|
||||
rules:
|
||||
# P0M-001: Time to First Verified Release
|
||||
- alert: StellaTimeToFirstReleaseHigh
|
||||
expr: |
|
||||
histogram_quantile(0.90, sum(rate(stella_time_to_first_verified_release_seconds_bucket[24h])) by (le, tenant)) > 14400
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
category: adoption
|
||||
annotations:
|
||||
summary: "Time to first verified release is high for tenant {{ $labels.tenant }}"
|
||||
description: "P90 time to first verified release is {{ $value | humanizeDuration }} (threshold: 4 hours)"
|
||||
runbook_url: "https://docs.stella-ops.org/runbooks/adoption-onboarding"
|
||||
|
||||
- alert: StellaTimeToFirstReleaseCritical
|
||||
expr: |
|
||||
histogram_quantile(0.90, sum(rate(stella_time_to_first_verified_release_seconds_bucket[24h])) by (le, tenant)) > 86400
|
||||
for: 1h
|
||||
labels:
|
||||
severity: critical
|
||||
category: adoption
|
||||
annotations:
|
||||
summary: "Time to first verified release critically high for tenant {{ $labels.tenant }}"
|
||||
description: "P90 time to first verified release is {{ $value | humanizeDuration }} (threshold: 24 hours)"
|
||||
runbook_url: "https://docs.stella-ops.org/runbooks/adoption-onboarding"
|
||||
|
||||
# P0M-002: Why Blocked Latency
|
||||
- alert: StellaWhyBlockedLatencyHigh
|
||||
expr: |
|
||||
histogram_quantile(0.90, sum(rate(stella_why_blocked_latency_seconds_bucket[1h])) by (le, tenant)) > 300
|
||||
for: 30m
|
||||
labels:
|
||||
severity: warning
|
||||
category: usability
|
||||
annotations:
|
||||
summary: "Why-blocked latency is high for tenant {{ $labels.tenant }}"
|
||||
description: "P90 time to answer 'why blocked' is {{ $value | humanizeDuration }} (threshold: 5 minutes)"
|
||||
runbook_url: "https://docs.stella-ops.org/runbooks/usability-explain"
|
||||
|
||||
- alert: StellaWhyBlockedLatencyCritical
|
||||
expr: |
|
||||
histogram_quantile(0.90, sum(rate(stella_why_blocked_latency_seconds_bucket[1h])) by (le, tenant)) > 3600
|
||||
for: 30m
|
||||
labels:
|
||||
severity: critical
|
||||
category: usability
|
||||
annotations:
|
||||
summary: "Why-blocked latency critically high for tenant {{ $labels.tenant }}"
|
||||
description: "P90 time to answer 'why blocked' is {{ $value | humanizeDuration }} (threshold: 1 hour)"
|
||||
runbook_url: "https://docs.stella-ops.org/runbooks/usability-explain"
|
||||
|
||||
# P0M-003: Support Burden
|
||||
- alert: StellaSupportBurdenHigh
|
||||
expr: |
|
||||
sum by (tenant, month) (stella_support_burden_minutes_total) > 30
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
category: operations
|
||||
annotations:
|
||||
summary: "Support burden high for tenant {{ $labels.tenant }}"
|
||||
description: "Support time for {{ $labels.tenant }} in {{ $labels.month }} is {{ $value }} minutes (threshold: 30 minutes)"
|
||||
runbook_url: "https://docs.stella-ops.org/runbooks/support-optimization"
|
||||
|
||||
- alert: StellaSupportBurdenCritical
|
||||
expr: |
|
||||
sum by (tenant, month) (stella_support_burden_minutes_total) > 60
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
category: operations
|
||||
annotations:
|
||||
summary: "Support burden critically high for tenant {{ $labels.tenant }}"
|
||||
description: "Support time for {{ $labels.tenant }} in {{ $labels.month }} is {{ $value }} minutes (threshold: 60 minutes)"
|
||||
runbook_url: "https://docs.stella-ops.org/runbooks/support-optimization"
|
||||
|
||||
# P0M-004: Determinism Regressions
|
||||
- alert: StellaDeterminismRegression
|
||||
expr: |
|
||||
increase(stella_determinism_regressions_total{severity="policy"}[5m]) > 0
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
category: reliability
|
||||
annotations:
|
||||
summary: "Policy-level determinism regression detected for tenant {{ $labels.tenant }}"
|
||||
description: "Determinism failure in {{ $labels.component }} component - same inputs produced different policy decisions"
|
||||
runbook_url: "https://docs.stella-ops.org/runbooks/determinism-failure"
|
||||
|
||||
- alert: StellaDeterminismRegressionSemantic
|
||||
expr: |
|
||||
increase(stella_determinism_regressions_total{severity="semantic"}[1h]) > 0
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
category: reliability
|
||||
annotations:
|
||||
summary: "Semantic determinism regression detected for tenant {{ $labels.tenant }}"
|
||||
description: "Semantic-level determinism failure in {{ $labels.component }} - outputs differ but policy decision unchanged"
|
||||
runbook_url: "https://docs.stella-ops.org/runbooks/determinism-failure"
|
||||
|
||||
- alert: StellaDeterminismRegressionBitwise
|
||||
expr: |
|
||||
increase(stella_determinism_regressions_total{severity="bitwise"}[24h]) > 5
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
category: reliability
|
||||
annotations:
|
||||
summary: "Multiple bitwise determinism regressions for tenant {{ $labels.tenant }}"
|
||||
description: "{{ $value }} bitwise-level determinism failures in {{ $labels.component }} in last 24h"
|
||||
runbook_url: "https://docs.stella-ops.org/runbooks/determinism-failure"
|
||||
62
deploy/telemetry/alerts/triage-alerts.yaml
Normal file
62
deploy/telemetry/alerts/triage-alerts.yaml
Normal file
@@ -0,0 +1,62 @@
|
||||
groups:
|
||||
- name: triage-ttfs
|
||||
rules:
|
||||
- alert: TriageTtfsFirstEvidenceP95High
|
||||
expr: histogram_quantile(0.95, sum(rate(stellaops_ttfs_first_evidence_seconds_bucket[5m])) by (le)) > 1.5
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
service: triage
|
||||
annotations:
|
||||
summary: "TTFS first evidence p95 high"
|
||||
description: "TTFS first-evidence p95 exceeds 1.5s for 10m (triage experience degraded)."
|
||||
|
||||
- alert: TriageTtfsSkeletonP95High
|
||||
expr: histogram_quantile(0.95, sum(rate(stellaops_ttfs_skeleton_seconds_bucket[5m])) by (le)) > 0.2
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
service: triage
|
||||
annotations:
|
||||
summary: "TTFS skeleton p95 high"
|
||||
description: "TTFS skeleton p95 exceeds 200ms for 10m."
|
||||
|
||||
- alert: TriageTtfsFullEvidenceP95High
|
||||
expr: histogram_quantile(0.95, sum(rate(stellaops_ttfs_full_evidence_seconds_bucket[5m])) by (le)) > 1.5
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
service: triage
|
||||
annotations:
|
||||
summary: "TTFS full evidence p95 high"
|
||||
description: "TTFS full-evidence p95 exceeds 1.5s for 10m."
|
||||
|
||||
- alert: TriageClicksToClosureMedianHigh
|
||||
expr: histogram_quantile(0.50, sum(rate(stellaops_clicks_to_closure_bucket[5m])) by (le)) > 6
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
service: triage
|
||||
annotations:
|
||||
summary: "Clicks-to-closure median high"
|
||||
description: "Median clicks-to-closure exceeds 6 for 15m."
|
||||
|
||||
- alert: TriageEvidenceCompletenessAvgLow
|
||||
expr: (sum(rate(stellaops_evidence_completeness_score_sum[15m])) / clamp_min(sum(rate(stellaops_evidence_completeness_score_count[15m])), 1)) < 3.6
|
||||
for: 30m
|
||||
labels:
|
||||
severity: warning
|
||||
service: triage
|
||||
annotations:
|
||||
summary: "Evidence completeness below target"
|
||||
description: "Average evidence completeness score below 3.6 (90%) for 30m."
|
||||
|
||||
- alert: TriageBudgetViolationRateHigh
|
||||
expr: sum(rate(stellaops_performance_budget_violations_total[5m])) by (phase) > 0.05
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
service: triage
|
||||
annotations:
|
||||
summary: "Performance budget violations elevated"
|
||||
description: "Performance budget violation rate exceeds 0.05/s for 10m."
|
||||
92
deploy/telemetry/collectors/otel-collector-config.yaml
Normal file
92
deploy/telemetry/collectors/otel-collector-config.yaml
Normal file
@@ -0,0 +1,92 @@
|
||||
receivers:
|
||||
otlp:
|
||||
protocols:
|
||||
grpc:
|
||||
endpoint: 0.0.0.0:4317
|
||||
tls:
|
||||
cert_file: ${STELLAOPS_OTEL_TLS_CERT:?STELLAOPS_OTEL_TLS_CERT not set}
|
||||
key_file: ${STELLAOPS_OTEL_TLS_KEY:?STELLAOPS_OTEL_TLS_KEY not set}
|
||||
client_ca_file: ${STELLAOPS_OTEL_TLS_CA:?STELLAOPS_OTEL_TLS_CA not set}
|
||||
require_client_certificate: ${STELLAOPS_OTEL_REQUIRE_CLIENT_CERT:true}
|
||||
http:
|
||||
endpoint: 0.0.0.0:4318
|
||||
tls:
|
||||
cert_file: ${STELLAOPS_OTEL_TLS_CERT:?STELLAOPS_OTEL_TLS_CERT not set}
|
||||
key_file: ${STELLAOPS_OTEL_TLS_KEY:?STELLAOPS_OTEL_TLS_KEY not set}
|
||||
client_ca_file: ${STELLAOPS_OTEL_TLS_CA:?STELLAOPS_OTEL_TLS_CA not set}
|
||||
require_client_certificate: ${STELLAOPS_OTEL_REQUIRE_CLIENT_CERT:true}
|
||||
|
||||
processors:
|
||||
attributes/tenant-tag:
|
||||
actions:
|
||||
- key: tenant.id
|
||||
action: insert
|
||||
value: ${STELLAOPS_TENANT_ID:unknown}
|
||||
batch:
|
||||
send_batch_size: 1024
|
||||
timeout: 5s
|
||||
|
||||
exporters:
|
||||
logging:
|
||||
verbosity: normal
|
||||
prometheus:
|
||||
endpoint: ${STELLAOPS_OTEL_PROMETHEUS_ENDPOINT:0.0.0.0:9464}
|
||||
enable_open_metrics: true
|
||||
metric_expiration: 5m
|
||||
tls:
|
||||
cert_file: ${STELLAOPS_OTEL_TLS_CERT:?STELLAOPS_OTEL_TLS_CERT not set}
|
||||
key_file: ${STELLAOPS_OTEL_TLS_KEY:?STELLAOPS_OTEL_TLS_KEY not set}
|
||||
client_ca_file: ${STELLAOPS_OTEL_TLS_CA:?STELLAOPS_OTEL_TLS_CA not set}
|
||||
otlphttp/tempo:
|
||||
endpoint: ${STELLAOPS_TEMPO_ENDPOINT:https://stellaops-tempo:3200}
|
||||
compression: gzip
|
||||
tls:
|
||||
ca_file: ${STELLAOPS_TEMPO_TLS_CA_FILE:/etc/otel-collector/tls/ca.crt}
|
||||
cert_file: ${STELLAOPS_TEMPO_TLS_CERT_FILE:/etc/otel-collector/tls/client.crt}
|
||||
key_file: ${STELLAOPS_TEMPO_TLS_KEY_FILE:/etc/otel-collector/tls/client.key}
|
||||
insecure_skip_verify: false
|
||||
headers:
|
||||
"X-Scope-OrgID": ${STELLAOPS_TENANT_ID:unknown}
|
||||
loki/tenant:
|
||||
endpoint: ${STELLAOPS_LOKI_ENDPOINT:https://stellaops-loki:3100/loki/api/v1/push}
|
||||
tenant_id: ${STELLAOPS_TENANT_ID:unknown}
|
||||
tls:
|
||||
ca_file: ${STELLAOPS_LOKI_TLS_CA_FILE:/etc/otel-collector/tls/ca.crt}
|
||||
cert_file: ${STELLAOPS_LOKI_TLS_CERT_FILE:/etc/otel-collector/tls/client.crt}
|
||||
key_file: ${STELLAOPS_LOKI_TLS_KEY_FILE:/etc/otel-collector/tls/client.key}
|
||||
insecure_skip_verify: false
|
||||
default_labels_enabled:
|
||||
exporter: false
|
||||
job: false
|
||||
instance: false
|
||||
format: json
|
||||
drain_interval: 5s
|
||||
queue:
|
||||
enabled: true
|
||||
queue_size: 1024
|
||||
retry_on_failure: true
|
||||
|
||||
extensions:
|
||||
health_check:
|
||||
endpoint: ${STELLAOPS_OTEL_HEALTH_ENDPOINT:0.0.0.0:13133}
|
||||
pprof:
|
||||
endpoint: ${STELLAOPS_OTEL_PPROF_ENDPOINT:0.0.0.0:1777}
|
||||
|
||||
service:
|
||||
telemetry:
|
||||
logs:
|
||||
level: ${STELLAOPS_OTEL_LOG_LEVEL:info}
|
||||
extensions: [health_check, pprof]
|
||||
pipelines:
|
||||
traces:
|
||||
receivers: [otlp]
|
||||
processors: [attributes/tenant-tag, batch]
|
||||
exporters: [logging, otlphttp/tempo]
|
||||
metrics:
|
||||
receivers: [otlp]
|
||||
processors: [attributes/tenant-tag, batch]
|
||||
exporters: [logging, prometheus]
|
||||
logs:
|
||||
receivers: [otlp]
|
||||
processors: [attributes/tenant-tag, batch]
|
||||
exporters: [logging, loki/tenant]
|
||||
638
deploy/telemetry/dashboards/export-center.json
Normal file
638
deploy/telemetry/dashboards/export-center.json
Normal file
@@ -0,0 +1,638 @@
|
||||
{
|
||||
"annotations": {
|
||||
"list": [
|
||||
{
|
||||
"builtIn": 1,
|
||||
"datasource": { "type": "grafana", "uid": "-- Grafana --" },
|
||||
"enable": true,
|
||||
"hide": true,
|
||||
"iconColor": "rgba(0, 211, 255, 1)",
|
||||
"name": "Annotations & Alerts",
|
||||
"type": "dashboard"
|
||||
}
|
||||
]
|
||||
},
|
||||
"description": "ExportCenter service observability dashboard",
|
||||
"editable": true,
|
||||
"fiscalYearStartMonth": 0,
|
||||
"graphTooltip": 0,
|
||||
"id": null,
|
||||
"links": [],
|
||||
"liveNow": false,
|
||||
"panels": [
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
|
||||
"id": 1,
|
||||
"panels": [],
|
||||
"title": "Export Runs Overview",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null }
|
||||
]
|
||||
},
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 4, "x": 0, "y": 1 },
|
||||
"id": 2,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"calcs": ["lastNotNull"],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "auto"
|
||||
},
|
||||
"pluginVersion": "10.0.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"editorMode": "code",
|
||||
"expr": "sum(increase(export_runs_total{tenant=~\"$tenant\"}[$__range]))",
|
||||
"legendFormat": "Total Runs",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Total Export Runs",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null }
|
||||
]
|
||||
},
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 4, "x": 4, "y": 1 },
|
||||
"id": 3,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"calcs": ["lastNotNull"],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "auto"
|
||||
},
|
||||
"pluginVersion": "10.0.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"editorMode": "code",
|
||||
"expr": "sum(increase(export_runs_success_total{tenant=~\"$tenant\"}[$__range]))",
|
||||
"legendFormat": "Successful",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Successful Runs",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 5 }
|
||||
]
|
||||
},
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 4, "x": 8, "y": 1 },
|
||||
"id": 4,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"calcs": ["lastNotNull"],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "auto"
|
||||
},
|
||||
"pluginVersion": "10.0.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"editorMode": "code",
|
||||
"expr": "sum(increase(export_runs_failed_total{tenant=~\"$tenant\"}[$__range]))",
|
||||
"legendFormat": "Failed",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Failed Runs",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "yellow", "value": 95 },
|
||||
{ "color": "green", "value": 99 }
|
||||
]
|
||||
},
|
||||
"unit": "percent"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 4, "x": 12, "y": 1 },
|
||||
"id": 5,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"calcs": ["lastNotNull"],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "auto"
|
||||
},
|
||||
"pluginVersion": "10.0.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"editorMode": "code",
|
||||
"expr": "100 * sum(increase(export_runs_success_total{tenant=~\"$tenant\"}[$__range])) / sum(increase(export_runs_total{tenant=~\"$tenant\"}[$__range]))",
|
||||
"legendFormat": "Success Rate",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Success Rate",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null }
|
||||
]
|
||||
},
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 4, "x": 16, "y": 1 },
|
||||
"id": 6,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"calcs": ["lastNotNull"],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "auto"
|
||||
},
|
||||
"pluginVersion": "10.0.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"editorMode": "code",
|
||||
"expr": "sum(export_runs_in_progress{tenant=~\"$tenant\"})",
|
||||
"legendFormat": "In Progress",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Runs In Progress",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "",
|
||||
"axisPlacement": "auto",
|
||||
"barAlignment": 0,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 10,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 1,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": { "type": "linear" },
|
||||
"showPoints": "auto",
|
||||
"spanNulls": false,
|
||||
"stacking": { "group": "A", "mode": "none" },
|
||||
"thresholdsStyle": { "mode": "off" }
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "green", "value": null }]
|
||||
},
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 5 },
|
||||
"id": 7,
|
||||
"options": {
|
||||
"legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true },
|
||||
"tooltip": { "mode": "multi", "sort": "desc" }
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"editorMode": "code",
|
||||
"expr": "sum by (export_type) (rate(export_runs_total{tenant=~\"$tenant\"}[5m]))",
|
||||
"legendFormat": "{{export_type}}",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Export Runs by Type (rate/5m)",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "",
|
||||
"axisPlacement": "auto",
|
||||
"barAlignment": 0,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 10,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 1,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": { "type": "linear" },
|
||||
"showPoints": "auto",
|
||||
"spanNulls": false,
|
||||
"stacking": { "group": "A", "mode": "none" },
|
||||
"thresholdsStyle": { "mode": "off" }
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "green", "value": null }]
|
||||
},
|
||||
"unit": "s"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 5 },
|
||||
"id": 8,
|
||||
"options": {
|
||||
"legend": { "calcs": ["mean", "max", "p95"], "displayMode": "table", "placement": "bottom", "showLegend": true },
|
||||
"tooltip": { "mode": "multi", "sort": "desc" }
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"editorMode": "code",
|
||||
"expr": "histogram_quantile(0.50, sum by (le) (rate(export_run_duration_seconds_bucket{tenant=~\"$tenant\"}[5m])))",
|
||||
"legendFormat": "p50",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"editorMode": "code",
|
||||
"expr": "histogram_quantile(0.95, sum by (le) (rate(export_run_duration_seconds_bucket{tenant=~\"$tenant\"}[5m])))",
|
||||
"legendFormat": "p95",
|
||||
"range": true,
|
||||
"refId": "B"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"editorMode": "code",
|
||||
"expr": "histogram_quantile(0.99, sum by (le) (rate(export_run_duration_seconds_bucket{tenant=~\"$tenant\"}[5m])))",
|
||||
"legendFormat": "p99",
|
||||
"range": true,
|
||||
"refId": "C"
|
||||
}
|
||||
],
|
||||
"title": "Export Run Duration (latency percentiles)",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 13 },
|
||||
"id": 9,
|
||||
"panels": [],
|
||||
"title": "Artifacts & Bundle Sizes",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "",
|
||||
"axisPlacement": "auto",
|
||||
"barAlignment": 0,
|
||||
"drawStyle": "bars",
|
||||
"fillOpacity": 50,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 1,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": { "type": "linear" },
|
||||
"showPoints": "never",
|
||||
"spanNulls": false,
|
||||
"stacking": { "group": "A", "mode": "normal" },
|
||||
"thresholdsStyle": { "mode": "off" }
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "green", "value": null }]
|
||||
},
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 14 },
|
||||
"id": 10,
|
||||
"options": {
|
||||
"legend": { "calcs": ["sum"], "displayMode": "table", "placement": "bottom", "showLegend": true },
|
||||
"tooltip": { "mode": "multi", "sort": "desc" }
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"editorMode": "code",
|
||||
"expr": "sum by (artifact_type) (increase(export_artifacts_total{tenant=~\"$tenant\"}[1h]))",
|
||||
"legendFormat": "{{artifact_type}}",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Artifacts Exported by Type (per hour)",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "",
|
||||
"axisPlacement": "auto",
|
||||
"barAlignment": 0,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 10,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 1,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": { "type": "linear" },
|
||||
"showPoints": "auto",
|
||||
"spanNulls": false,
|
||||
"stacking": { "group": "A", "mode": "none" },
|
||||
"thresholdsStyle": { "mode": "off" }
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "green", "value": null }]
|
||||
},
|
||||
"unit": "bytes"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 14 },
|
||||
"id": 11,
|
||||
"options": {
|
||||
"legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true },
|
||||
"tooltip": { "mode": "multi", "sort": "desc" }
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"editorMode": "code",
|
||||
"expr": "histogram_quantile(0.50, sum by (le, export_type) (rate(export_bundle_size_bytes_bucket{tenant=~\"$tenant\"}[5m])))",
|
||||
"legendFormat": "{{export_type}} p50",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"editorMode": "code",
|
||||
"expr": "histogram_quantile(0.95, sum by (le, export_type) (rate(export_bundle_size_bytes_bucket{tenant=~\"$tenant\"}[5m])))",
|
||||
"legendFormat": "{{export_type}} p95",
|
||||
"range": true,
|
||||
"refId": "B"
|
||||
}
|
||||
],
|
||||
"title": "Bundle Size Distribution by Type",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 22 },
|
||||
"id": 12,
|
||||
"panels": [],
|
||||
"title": "Error Analysis",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"hideFrom": { "legend": false, "tooltip": false, "viz": false }
|
||||
},
|
||||
"mappings": [],
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 23 },
|
||||
"id": 13,
|
||||
"options": {
|
||||
"legend": { "displayMode": "table", "placement": "right", "showLegend": true },
|
||||
"pieType": "pie",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"tooltip": { "mode": "single", "sort": "none" }
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"editorMode": "code",
|
||||
"expr": "sum by (error_code) (increase(export_runs_failed_total{tenant=~\"$tenant\"}[$__range]))",
|
||||
"legendFormat": "{{error_code}}",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Failures by Error Code",
|
||||
"type": "piechart"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "",
|
||||
"axisPlacement": "auto",
|
||||
"barAlignment": 0,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 0,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 2,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": { "type": "linear" },
|
||||
"showPoints": "never",
|
||||
"spanNulls": false,
|
||||
"stacking": { "group": "A", "mode": "none" },
|
||||
"thresholdsStyle": { "mode": "line" }
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 0.01 }
|
||||
]
|
||||
},
|
||||
"unit": "percentunit"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 16, "x": 8, "y": 23 },
|
||||
"id": 14,
|
||||
"options": {
|
||||
"legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true },
|
||||
"tooltip": { "mode": "multi", "sort": "desc" }
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"editorMode": "code",
|
||||
"expr": "sum(rate(export_runs_failed_total{tenant=~\"$tenant\"}[5m])) / sum(rate(export_runs_total{tenant=~\"$tenant\"}[5m]))",
|
||||
"legendFormat": "Error Rate",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Error Rate (5m window)",
|
||||
"type": "timeseries"
|
||||
}
|
||||
],
|
||||
"refresh": "30s",
|
||||
"schemaVersion": 38,
|
||||
"style": "dark",
|
||||
"tags": ["export-center", "stellaops"],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"current": {},
|
||||
"hide": 0,
|
||||
"includeAll": false,
|
||||
"multi": false,
|
||||
"name": "datasource",
|
||||
"options": [],
|
||||
"query": "prometheus",
|
||||
"refresh": 1,
|
||||
"regex": "",
|
||||
"skipUrlSync": false,
|
||||
"type": "datasource"
|
||||
},
|
||||
{
|
||||
"allValue": ".*",
|
||||
"current": {},
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"definition": "label_values(export_runs_total, tenant)",
|
||||
"hide": 0,
|
||||
"includeAll": true,
|
||||
"multi": true,
|
||||
"name": "tenant",
|
||||
"options": [],
|
||||
"query": { "query": "label_values(export_runs_total, tenant)", "refId": "StandardVariableQuery" },
|
||||
"refresh": 2,
|
||||
"regex": "",
|
||||
"skipUrlSync": false,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
}
|
||||
]
|
||||
},
|
||||
"time": { "from": "now-6h", "to": "now" },
|
||||
"timepicker": {},
|
||||
"timezone": "utc",
|
||||
"title": "ExportCenter Service",
|
||||
"uid": "export-center-overview",
|
||||
"version": 1,
|
||||
"weekStart": ""
|
||||
}
|
||||
536
deploy/telemetry/dashboards/stella-ops-error-tracking.json
Normal file
536
deploy/telemetry/dashboards/stella-ops-error-tracking.json
Normal file
@@ -0,0 +1,536 @@
|
||||
{
|
||||
"annotations": {
|
||||
"list": [
|
||||
{
|
||||
"builtIn": 1,
|
||||
"datasource": "-- Grafana --",
|
||||
"enable": true,
|
||||
"hide": true,
|
||||
"iconColor": "rgba(0, 211, 255, 1)",
|
||||
"name": "Annotations & Alerts",
|
||||
"type": "dashboard"
|
||||
},
|
||||
{
|
||||
"datasource": "${datasource}",
|
||||
"enable": true,
|
||||
"expr": "increase(stella_error_total[1m]) > 0",
|
||||
"iconColor": "red",
|
||||
"name": "Error Spikes",
|
||||
"tagKeys": "error_type",
|
||||
"titleFormat": "Error: {{error_type}}"
|
||||
}
|
||||
]
|
||||
},
|
||||
"description": "Stella Ops Release Orchestrator - Error Tracking",
|
||||
"editable": true,
|
||||
"gnetId": null,
|
||||
"graphTooltip": 1,
|
||||
"id": null,
|
||||
"iteration": 1737158400000,
|
||||
"links": [],
|
||||
"panels": [
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
|
||||
"id": 1,
|
||||
"panels": [],
|
||||
"title": "Error Summary",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": "${datasource}",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 10 }
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 6, "x": 0, "y": 1 },
|
||||
"id": 2,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": { "calcs": ["sum"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"pluginVersion": "9.0.0",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(increase(stella_error_total[1h]))",
|
||||
"legendFormat": "",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Errors (1h)",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": "${datasource}",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 0.01 },
|
||||
{ "color": "red", "value": 0.05 }
|
||||
]
|
||||
},
|
||||
"unit": "percentunit"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 6, "x": 6, "y": 1 },
|
||||
"id": 3,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"pluginVersion": "9.0.0",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(stella_error_total[5m])) / sum(rate(stella_api_requests_total[5m]))",
|
||||
"legendFormat": "",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Error Rate",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": "${datasource}",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 5 }
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 6, "x": 12, "y": 1 },
|
||||
"id": 4,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": { "calcs": ["sum"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"pluginVersion": "9.0.0",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(increase(stella_release_failed_total[1h]))",
|
||||
"legendFormat": "",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Failed Releases (1h)",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": "${datasource}",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 3 }
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 6, "x": 18, "y": 1 },
|
||||
"id": 5,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": { "calcs": ["sum"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"pluginVersion": "9.0.0",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(increase(stella_gate_failed_total[1h]))",
|
||||
"legendFormat": "",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Gate Failures (1h)",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 },
|
||||
"id": 6,
|
||||
"panels": [],
|
||||
"title": "Error Trends",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": "${datasource}",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "",
|
||||
"axisPlacement": "auto",
|
||||
"barAlignment": 0,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 20,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
|
||||
"lineInterpolation": "smooth",
|
||||
"lineWidth": 2,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": { "type": "linear" },
|
||||
"showPoints": "never",
|
||||
"spanNulls": false,
|
||||
"stacking": { "group": "A", "mode": "normal" },
|
||||
"thresholdsStyle": { "mode": "off" }
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "green", "value": null }]
|
||||
},
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 6 },
|
||||
"id": 7,
|
||||
"options": {
|
||||
"legend": { "calcs": ["sum"], "displayMode": "table", "placement": "bottom" },
|
||||
"tooltip": { "mode": "multi", "sort": "desc" }
|
||||
},
|
||||
"pluginVersion": "9.0.0",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(stella_error_total[5m])) by (error_type)",
|
||||
"legendFormat": "{{error_type}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Errors by Type",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": "${datasource}",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "",
|
||||
"axisPlacement": "auto",
|
||||
"barAlignment": 0,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 20,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
|
||||
"lineInterpolation": "smooth",
|
||||
"lineWidth": 2,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": { "type": "linear" },
|
||||
"showPoints": "never",
|
||||
"spanNulls": false,
|
||||
"stacking": { "group": "A", "mode": "normal" },
|
||||
"thresholdsStyle": { "mode": "off" }
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "green", "value": null }]
|
||||
},
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 6 },
|
||||
"id": 8,
|
||||
"options": {
|
||||
"legend": { "calcs": ["sum"], "displayMode": "table", "placement": "bottom" },
|
||||
"tooltip": { "mode": "multi", "sort": "desc" }
|
||||
},
|
||||
"pluginVersion": "9.0.0",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(stella_error_total{environment=~\"$environment\"}[5m])) by (component)",
|
||||
"legendFormat": "{{component}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Errors by Component",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 14 },
|
||||
"id": 9,
|
||||
"panels": [],
|
||||
"title": "Release Failures",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": "${datasource}",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "",
|
||||
"axisPlacement": "auto",
|
||||
"fillOpacity": 80,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
|
||||
"lineWidth": 1,
|
||||
"scaleDistribution": { "type": "linear" },
|
||||
"thresholdsStyle": { "mode": "off" }
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "green", "value": null }]
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 15 },
|
||||
"id": 10,
|
||||
"options": {
|
||||
"barRadius": 0.1,
|
||||
"barWidth": 0.8,
|
||||
"groupWidth": 0.7,
|
||||
"legend": { "calcs": [], "displayMode": "list", "placement": "bottom" },
|
||||
"orientation": "horizontal",
|
||||
"showValue": "auto",
|
||||
"stacking": "none",
|
||||
"tooltip": { "mode": "single", "sort": "none" },
|
||||
"xTickLabelRotation": 0,
|
||||
"xTickLabelSpacing": 0
|
||||
},
|
||||
"pluginVersion": "9.0.0",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "topk(10, sum(increase(stella_release_failed_total[24h])) by (failure_reason))",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"legendFormat": "{{failure_reason}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Top Failure Reasons (24h)",
|
||||
"transformations": [
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": { "Time": true },
|
||||
"indexByName": {},
|
||||
"renameByName": { "Value": "Count", "failure_reason": "Reason" }
|
||||
}
|
||||
}
|
||||
],
|
||||
"type": "barchart"
|
||||
},
|
||||
{
|
||||
"datasource": "${datasource}",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "",
|
||||
"axisPlacement": "auto",
|
||||
"barAlignment": 0,
|
||||
"drawStyle": "bars",
|
||||
"fillOpacity": 80,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 1,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": { "type": "linear" },
|
||||
"showPoints": "never",
|
||||
"spanNulls": false,
|
||||
"stacking": { "group": "A", "mode": "normal" },
|
||||
"thresholdsStyle": { "mode": "off" }
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "green", "value": null }]
|
||||
},
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Failures" },
|
||||
"properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Rollbacks" },
|
||||
"properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }]
|
||||
}
|
||||
]
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 15 },
|
||||
"id": 11,
|
||||
"options": {
|
||||
"legend": { "calcs": ["sum"], "displayMode": "table", "placement": "bottom" },
|
||||
"tooltip": { "mode": "multi", "sort": "none" }
|
||||
},
|
||||
"pluginVersion": "9.0.0",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(increase(stella_release_failed_total{environment=~\"$environment\"}[1h])) by (environment)",
|
||||
"legendFormat": "{{environment}} Failures",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "sum(increase(stella_rollback_total{environment=~\"$environment\"}[1h])) by (environment)",
|
||||
"legendFormat": "{{environment}} Rollbacks",
|
||||
"refId": "B"
|
||||
}
|
||||
],
|
||||
"title": "Failures & Rollbacks by Environment",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 23 },
|
||||
"id": 12,
|
||||
"panels": [],
|
||||
"title": "Recent Errors",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": "${loki_datasource}",
|
||||
"fieldConfig": {
|
||||
"defaults": {},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 10, "w": 24, "x": 0, "y": 24 },
|
||||
"id": 13,
|
||||
"options": {
|
||||
"dedupStrategy": "none",
|
||||
"enableLogDetails": true,
|
||||
"prettifyLogMessage": false,
|
||||
"showCommonLabels": false,
|
||||
"showLabels": true,
|
||||
"showTime": true,
|
||||
"sortOrder": "Descending",
|
||||
"wrapLogMessage": true
|
||||
},
|
||||
"pluginVersion": "9.0.0",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "{app=\"stella-ops\"} |= \"error\" | json | level=~\"error|fatal\"",
|
||||
"legendFormat": "",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Error Logs",
|
||||
"type": "logs"
|
||||
}
|
||||
],
|
||||
"refresh": "30s",
|
||||
"schemaVersion": 36,
|
||||
"style": "dark",
|
||||
"tags": ["stella-ops", "errors"],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"current": { "selected": false, "text": "Prometheus", "value": "Prometheus" },
|
||||
"hide": 0,
|
||||
"includeAll": false,
|
||||
"label": "Metrics",
|
||||
"multi": false,
|
||||
"name": "datasource",
|
||||
"options": [],
|
||||
"query": "prometheus",
|
||||
"queryValue": "",
|
||||
"refresh": 1,
|
||||
"regex": "",
|
||||
"skipUrlSync": false,
|
||||
"type": "datasource"
|
||||
},
|
||||
{
|
||||
"current": { "selected": false, "text": "Loki", "value": "Loki" },
|
||||
"hide": 0,
|
||||
"includeAll": false,
|
||||
"label": "Logs",
|
||||
"multi": false,
|
||||
"name": "loki_datasource",
|
||||
"options": [],
|
||||
"query": "loki",
|
||||
"queryValue": "",
|
||||
"refresh": 1,
|
||||
"regex": "",
|
||||
"skipUrlSync": false,
|
||||
"type": "datasource"
|
||||
},
|
||||
{
|
||||
"allValue": ".*",
|
||||
"current": { "selected": true, "text": "All", "value": "$__all" },
|
||||
"datasource": "${datasource}",
|
||||
"definition": "label_values(stella_error_total, environment)",
|
||||
"hide": 0,
|
||||
"includeAll": true,
|
||||
"label": "Environment",
|
||||
"multi": true,
|
||||
"name": "environment",
|
||||
"options": [],
|
||||
"query": { "query": "label_values(stella_error_total, environment)", "refId": "StandardVariableQuery" },
|
||||
"refresh": 2,
|
||||
"regex": "",
|
||||
"skipUrlSync": false,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
}
|
||||
]
|
||||
},
|
||||
"time": { "from": "now-6h", "to": "now" },
|
||||
"timepicker": {},
|
||||
"timezone": "",
|
||||
"title": "Stella Ops - Error Tracking",
|
||||
"uid": "stella-ops-errors",
|
||||
"version": 1,
|
||||
"weekStart": ""
|
||||
}
|
||||
607
deploy/telemetry/dashboards/stella-ops-performance.json
Normal file
607
deploy/telemetry/dashboards/stella-ops-performance.json
Normal file
@@ -0,0 +1,607 @@
|
||||
{
|
||||
"annotations": {
|
||||
"list": [
|
||||
{
|
||||
"builtIn": 1,
|
||||
"datasource": "-- Grafana --",
|
||||
"enable": true,
|
||||
"hide": true,
|
||||
"iconColor": "rgba(0, 211, 255, 1)",
|
||||
"name": "Annotations & Alerts",
|
||||
"type": "dashboard"
|
||||
}
|
||||
]
|
||||
},
|
||||
"description": "Stella Ops Release Orchestrator - Performance Metrics",
|
||||
"editable": true,
|
||||
"gnetId": null,
|
||||
"graphTooltip": 1,
|
||||
"id": null,
|
||||
"iteration": 1737158400000,
|
||||
"links": [],
|
||||
"panels": [
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
|
||||
"id": 1,
|
||||
"panels": [],
|
||||
"title": "System Performance",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": "${datasource}",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 0.7 },
|
||||
{ "color": "red", "value": 0.9 }
|
||||
]
|
||||
},
|
||||
"unit": "percentunit"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 6, "x": 0, "y": 1 },
|
||||
"id": 2,
|
||||
"options": {
|
||||
"orientation": "auto",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"showThresholdLabels": false,
|
||||
"showThresholdMarkers": true
|
||||
},
|
||||
"pluginVersion": "9.0.0",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "avg(stella_cpu_usage_ratio{component=\"orchestrator\"})",
|
||||
"legendFormat": "",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "CPU Usage",
|
||||
"type": "gauge"
|
||||
},
|
||||
{
|
||||
"datasource": "${datasource}",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 0.7 },
|
||||
{ "color": "red", "value": 0.9 }
|
||||
]
|
||||
},
|
||||
"unit": "percentunit"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 6, "x": 6, "y": 1 },
|
||||
"id": 3,
|
||||
"options": {
|
||||
"orientation": "auto",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"showThresholdLabels": false,
|
||||
"showThresholdMarkers": true
|
||||
},
|
||||
"pluginVersion": "9.0.0",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "avg(stella_memory_usage_ratio{component=\"orchestrator\"})",
|
||||
"legendFormat": "",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Memory Usage",
|
||||
"type": "gauge"
|
||||
},
|
||||
{
|
||||
"datasource": "${datasource}",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 100 },
|
||||
{ "color": "red", "value": 500 }
|
||||
]
|
||||
},
|
||||
"unit": "ms"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 6, "x": 12, "y": 1 },
|
||||
"id": 4,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": { "calcs": ["mean"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"pluginVersion": "9.0.0",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.95, sum(rate(stella_api_request_duration_seconds_bucket[5m])) by (le)) * 1000",
|
||||
"legendFormat": "",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "API Latency (p95)",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": "${datasource}",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null }
|
||||
]
|
||||
},
|
||||
"unit": "reqps"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 6, "x": 18, "y": 1 },
|
||||
"id": 5,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"pluginVersion": "9.0.0",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(stella_api_requests_total[5m]))",
|
||||
"legendFormat": "",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Request Rate",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 },
|
||||
"id": 6,
|
||||
"panels": [],
|
||||
"title": "Gate Evaluation Performance",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": "${datasource}",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "",
|
||||
"axisPlacement": "auto",
|
||||
"barAlignment": 0,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 10,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
|
||||
"lineInterpolation": "smooth",
|
||||
"lineWidth": 2,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": { "type": "linear" },
|
||||
"showPoints": "never",
|
||||
"spanNulls": false,
|
||||
"stacking": { "group": "A", "mode": "none" },
|
||||
"thresholdsStyle": { "mode": "off" }
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "green", "value": null }]
|
||||
},
|
||||
"unit": "s"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 6 },
|
||||
"id": 7,
|
||||
"options": {
|
||||
"legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom" },
|
||||
"tooltip": { "mode": "multi", "sort": "desc" }
|
||||
},
|
||||
"pluginVersion": "9.0.0",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.99, sum(rate(stella_gate_evaluation_duration_seconds_bucket{gate_type=~\"$gate_type\"}[5m])) by (le, gate_type))",
|
||||
"legendFormat": "{{gate_type}} p99",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.50, sum(rate(stella_gate_evaluation_duration_seconds_bucket{gate_type=~\"$gate_type\"}[5m])) by (le, gate_type))",
|
||||
"legendFormat": "{{gate_type}} p50",
|
||||
"refId": "B"
|
||||
}
|
||||
],
|
||||
"title": "Gate Evaluation Duration by Type",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": "${datasource}",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "",
|
||||
"axisPlacement": "auto",
|
||||
"barAlignment": 0,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 10,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
|
||||
"lineInterpolation": "smooth",
|
||||
"lineWidth": 2,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": { "type": "linear" },
|
||||
"showPoints": "never",
|
||||
"spanNulls": false,
|
||||
"stacking": { "group": "A", "mode": "none" },
|
||||
"thresholdsStyle": { "mode": "off" }
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "green", "value": null }]
|
||||
},
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 6 },
|
||||
"id": 8,
|
||||
"options": {
|
||||
"legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom" },
|
||||
"tooltip": { "mode": "multi", "sort": "desc" }
|
||||
},
|
||||
"pluginVersion": "9.0.0",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(stella_gate_evaluations_total{gate_type=~\"$gate_type\"}[5m])) by (gate_type)",
|
||||
"legendFormat": "{{gate_type}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Gate Evaluations per Second",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 14 },
|
||||
"id": 9,
|
||||
"panels": [],
|
||||
"title": "Cache Performance",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": "${datasource}",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "yellow", "value": 0.7 },
|
||||
{ "color": "green", "value": 0.9 }
|
||||
]
|
||||
},
|
||||
"unit": "percentunit"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 6, "w": 6, "x": 0, "y": 15 },
|
||||
"id": 10,
|
||||
"options": {
|
||||
"orientation": "auto",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"showThresholdLabels": false,
|
||||
"showThresholdMarkers": true
|
||||
},
|
||||
"pluginVersion": "9.0.0",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(stella_cache_hits_total) / (sum(stella_cache_hits_total) + sum(stella_cache_misses_total))",
|
||||
"legendFormat": "",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Cache Hit Ratio",
|
||||
"type": "gauge"
|
||||
},
|
||||
{
|
||||
"datasource": "${datasource}",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "",
|
||||
"axisPlacement": "auto",
|
||||
"barAlignment": 0,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 10,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
|
||||
"lineInterpolation": "smooth",
|
||||
"lineWidth": 2,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": { "type": "linear" },
|
||||
"showPoints": "never",
|
||||
"spanNulls": false,
|
||||
"stacking": { "group": "A", "mode": "none" },
|
||||
"thresholdsStyle": { "mode": "off" }
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "green", "value": null }]
|
||||
},
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Hits" },
|
||||
"properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Misses" },
|
||||
"properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }]
|
||||
}
|
||||
]
|
||||
},
|
||||
"gridPos": { "h": 6, "w": 12, "x": 6, "y": 15 },
|
||||
"id": 11,
|
||||
"options": {
|
||||
"legend": { "calcs": ["sum"], "displayMode": "table", "placement": "bottom" },
|
||||
"tooltip": { "mode": "multi", "sort": "none" }
|
||||
},
|
||||
"pluginVersion": "9.0.0",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(stella_cache_hits_total[5m])) by (cache_name)",
|
||||
"legendFormat": "{{cache_name}} Hits",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(stella_cache_misses_total[5m])) by (cache_name)",
|
||||
"legendFormat": "{{cache_name}} Misses",
|
||||
"refId": "B"
|
||||
}
|
||||
],
|
||||
"title": "Cache Hits vs Misses",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": "${datasource}",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 0.7 },
|
||||
{ "color": "red", "value": 0.9 }
|
||||
]
|
||||
},
|
||||
"unit": "percentunit"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 6, "w": 6, "x": 18, "y": 15 },
|
||||
"id": 12,
|
||||
"options": {
|
||||
"orientation": "auto",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"showThresholdLabels": false,
|
||||
"showThresholdMarkers": true
|
||||
},
|
||||
"pluginVersion": "9.0.0",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "stella_cache_size_bytes / stella_cache_max_size_bytes",
|
||||
"legendFormat": "",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Cache Utilization",
|
||||
"type": "gauge"
|
||||
},
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 21 },
|
||||
"id": 13,
|
||||
"panels": [],
|
||||
"title": "Database Performance",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": "${datasource}",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "",
|
||||
"axisPlacement": "auto",
|
||||
"barAlignment": 0,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 10,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
|
||||
"lineInterpolation": "smooth",
|
||||
"lineWidth": 2,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": { "type": "linear" },
|
||||
"showPoints": "never",
|
||||
"spanNulls": false,
|
||||
"stacking": { "group": "A", "mode": "none" },
|
||||
"thresholdsStyle": { "mode": "off" }
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "green", "value": null }]
|
||||
},
|
||||
"unit": "ms"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 22 },
|
||||
"id": 14,
|
||||
"options": {
|
||||
"legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom" },
|
||||
"tooltip": { "mode": "multi", "sort": "desc" }
|
||||
},
|
||||
"pluginVersion": "9.0.0",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.95, sum(rate(stella_db_query_duration_seconds_bucket[5m])) by (le, query_type)) * 1000",
|
||||
"legendFormat": "{{query_type}} p95",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Database Query Duration (p95)",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": "${datasource}",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "",
|
||||
"axisPlacement": "auto",
|
||||
"barAlignment": 0,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 10,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
|
||||
"lineInterpolation": "smooth",
|
||||
"lineWidth": 2,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": { "type": "linear" },
|
||||
"showPoints": "never",
|
||||
"spanNulls": false,
|
||||
"stacking": { "group": "A", "mode": "none" },
|
||||
"thresholdsStyle": { "mode": "off" }
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "green", "value": null }]
|
||||
},
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 22 },
|
||||
"id": 15,
|
||||
"options": {
|
||||
"legend": { "calcs": [], "displayMode": "list", "placement": "bottom" },
|
||||
"tooltip": { "mode": "multi", "sort": "none" }
|
||||
},
|
||||
"pluginVersion": "9.0.0",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "stella_db_connections_active",
|
||||
"legendFormat": "Active",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "stella_db_connections_idle",
|
||||
"legendFormat": "Idle",
|
||||
"refId": "B"
|
||||
},
|
||||
{
|
||||
"expr": "stella_db_connections_max",
|
||||
"legendFormat": "Max",
|
||||
"refId": "C"
|
||||
}
|
||||
],
|
||||
"title": "Database Connection Pool",
|
||||
"type": "timeseries"
|
||||
}
|
||||
],
|
||||
"refresh": "30s",
|
||||
"schemaVersion": 36,
|
||||
"style": "dark",
|
||||
"tags": ["stella-ops", "performance"],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"current": { "selected": false, "text": "Prometheus", "value": "Prometheus" },
|
||||
"hide": 0,
|
||||
"includeAll": false,
|
||||
"label": "Data Source",
|
||||
"multi": false,
|
||||
"name": "datasource",
|
||||
"options": [],
|
||||
"query": "prometheus",
|
||||
"queryValue": "",
|
||||
"refresh": 1,
|
||||
"regex": "",
|
||||
"skipUrlSync": false,
|
||||
"type": "datasource"
|
||||
},
|
||||
{
|
||||
"allValue": ".*",
|
||||
"current": { "selected": true, "text": "All", "value": "$__all" },
|
||||
"datasource": "${datasource}",
|
||||
"definition": "label_values(stella_gate_evaluation_duration_seconds_bucket, gate_type)",
|
||||
"hide": 0,
|
||||
"includeAll": true,
|
||||
"label": "Gate Type",
|
||||
"multi": true,
|
||||
"name": "gate_type",
|
||||
"options": [],
|
||||
"query": { "query": "label_values(stella_gate_evaluation_duration_seconds_bucket, gate_type)", "refId": "StandardVariableQuery" },
|
||||
"refresh": 2,
|
||||
"regex": "",
|
||||
"skipUrlSync": false,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
}
|
||||
]
|
||||
},
|
||||
"time": { "from": "now-6h", "to": "now" },
|
||||
"timepicker": {},
|
||||
"timezone": "",
|
||||
"title": "Stella Ops - Performance Metrics",
|
||||
"uid": "stella-ops-performance",
|
||||
"version": 1,
|
||||
"weekStart": ""
|
||||
}
|
||||
566
deploy/telemetry/dashboards/stella-ops-release-overview.json
Normal file
566
deploy/telemetry/dashboards/stella-ops-release-overview.json
Normal file
@@ -0,0 +1,566 @@
|
||||
{
|
||||
"annotations": {
|
||||
"list": [
|
||||
{
|
||||
"builtIn": 1,
|
||||
"datasource": "-- Grafana --",
|
||||
"enable": true,
|
||||
"hide": true,
|
||||
"iconColor": "rgba(0, 211, 255, 1)",
|
||||
"name": "Annotations & Alerts",
|
||||
"type": "dashboard"
|
||||
},
|
||||
{
|
||||
"datasource": "${datasource}",
|
||||
"enable": true,
|
||||
"expr": "stella_release_promotion_completed{environment=~\"$environment\"}",
|
||||
"iconColor": "green",
|
||||
"name": "Promotions",
|
||||
"tagKeys": "version,environment",
|
||||
"titleFormat": "Promotion to {{environment}}"
|
||||
}
|
||||
]
|
||||
},
|
||||
"description": "Stella Ops Release Orchestrator - Release Overview",
|
||||
"editable": true,
|
||||
"gnetId": null,
|
||||
"graphTooltip": 1,
|
||||
"id": null,
|
||||
"iteration": 1737158400000,
|
||||
"links": [],
|
||||
"panels": [
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
|
||||
"id": 1,
|
||||
"panels": [],
|
||||
"title": "Release Summary",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": "${datasource}",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null }
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 4, "x": 0, "y": 1 },
|
||||
"id": 2,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"calcs": ["lastNotNull"],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "auto"
|
||||
},
|
||||
"pluginVersion": "9.0.0",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(stella_release_active{environment=~\"$environment\"})",
|
||||
"legendFormat": "",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Active Releases",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": "${datasource}",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 5 },
|
||||
{ "color": "red", "value": 10 }
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 4, "x": 4, "y": 1 },
|
||||
"id": 3,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"calcs": ["lastNotNull"],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "auto"
|
||||
},
|
||||
"pluginVersion": "9.0.0",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(stella_release_pending_approval{environment=~\"$environment\"})",
|
||||
"legendFormat": "",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Pending Approvals",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": "${datasource}",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null }
|
||||
]
|
||||
},
|
||||
"unit": "percentunit"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 4, "x": 8, "y": 1 },
|
||||
"id": 4,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"calcs": ["lastNotNull"],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "auto"
|
||||
},
|
||||
"pluginVersion": "9.0.0",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(stella_release_success_total{environment=~\"$environment\"}) / sum(stella_release_total{environment=~\"$environment\"})",
|
||||
"legendFormat": "",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Success Rate (24h)",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": "${datasource}",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 900 },
|
||||
{ "color": "red", "value": 1800 }
|
||||
]
|
||||
},
|
||||
"unit": "s"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 4, "x": 12, "y": 1 },
|
||||
"id": 5,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"calcs": ["mean"],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "auto"
|
||||
},
|
||||
"pluginVersion": "9.0.0",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.50, sum(rate(stella_release_duration_seconds_bucket{environment=~\"$environment\"}[24h])) by (le))",
|
||||
"legendFormat": "",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Median Release Time",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": "${datasource}",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "green", "value": 1 }
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 4, "x": 16, "y": 1 },
|
||||
"id": 6,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"calcs": ["lastNotNull"],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "auto"
|
||||
},
|
||||
"pluginVersion": "9.0.0",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(stella_gate_passed_total{environment=~\"$environment\"}) / sum(stella_gate_evaluated_total{environment=~\"$environment\"})",
|
||||
"legendFormat": "",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Gate Pass Rate",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": "${datasource}",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 4, "x": 20, "y": 1 },
|
||||
"id": 7,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"calcs": ["lastNotNull"],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "auto"
|
||||
},
|
||||
"pluginVersion": "9.0.0",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(stella_rollback_total{environment=~\"$environment\"})",
|
||||
"legendFormat": "",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Rollbacks (24h)",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 },
|
||||
"id": 8,
|
||||
"panels": [],
|
||||
"title": "Release Activity",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": "${datasource}",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "",
|
||||
"axisPlacement": "auto",
|
||||
"barAlignment": 0,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 10,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
|
||||
"lineInterpolation": "smooth",
|
||||
"lineWidth": 2,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": { "type": "linear" },
|
||||
"showPoints": "never",
|
||||
"spanNulls": false,
|
||||
"stacking": { "group": "A", "mode": "none" },
|
||||
"thresholdsStyle": { "mode": "off" }
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "green", "value": null }]
|
||||
},
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 6 },
|
||||
"id": 9,
|
||||
"options": {
|
||||
"legend": { "calcs": [], "displayMode": "list", "placement": "bottom" },
|
||||
"tooltip": { "mode": "multi", "sort": "none" }
|
||||
},
|
||||
"pluginVersion": "9.0.0",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(stella_release_total{environment=~\"$environment\"}[5m])) by (environment)",
|
||||
"legendFormat": "{{environment}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Releases per Minute",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": "${datasource}",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "",
|
||||
"axisPlacement": "auto",
|
||||
"barAlignment": 0,
|
||||
"drawStyle": "bars",
|
||||
"fillOpacity": 80,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 1,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": { "type": "linear" },
|
||||
"showPoints": "never",
|
||||
"spanNulls": false,
|
||||
"stacking": { "group": "A", "mode": "normal" },
|
||||
"thresholdsStyle": { "mode": "off" }
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "green", "value": null }]
|
||||
},
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Success" },
|
||||
"properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Failed" },
|
||||
"properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }]
|
||||
}
|
||||
]
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 6 },
|
||||
"id": 10,
|
||||
"options": {
|
||||
"legend": { "calcs": [], "displayMode": "list", "placement": "bottom" },
|
||||
"tooltip": { "mode": "multi", "sort": "none" }
|
||||
},
|
||||
"pluginVersion": "9.0.0",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(increase(stella_release_success_total{environment=~\"$environment\"}[1h]))",
|
||||
"legendFormat": "Success",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "sum(increase(stella_release_failed_total{environment=~\"$environment\"}[1h]))",
|
||||
"legendFormat": "Failed",
|
||||
"refId": "B"
|
||||
}
|
||||
],
|
||||
"title": "Release Outcomes (Hourly)",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 14 },
|
||||
"id": 11,
|
||||
"panels": [],
|
||||
"title": "Environment Health",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": "${datasource}",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"mappings": [
|
||||
{ "options": { "0": { "color": "red", "index": 0, "text": "Down" } }, "type": "value" },
|
||||
{ "options": { "1": { "color": "green", "index": 1, "text": "Up" } }, "type": "value" }
|
||||
],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "green", "value": 1 }
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 6, "w": 8, "x": 0, "y": 15 },
|
||||
"id": 12,
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"orientation": "horizontal",
|
||||
"reduceOptions": {
|
||||
"calcs": ["lastNotNull"],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "value_and_name"
|
||||
},
|
||||
"pluginVersion": "9.0.0",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "stella_environment_health{environment=~\"$environment\"}",
|
||||
"legendFormat": "{{environment}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Environment Status",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": "${datasource}",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "",
|
||||
"axisPlacement": "auto",
|
||||
"barAlignment": 0,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 0,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
|
||||
"lineInterpolation": "smooth",
|
||||
"lineWidth": 2,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": { "type": "linear" },
|
||||
"showPoints": "never",
|
||||
"spanNulls": false,
|
||||
"stacking": { "group": "A", "mode": "none" },
|
||||
"thresholdsStyle": { "mode": "off" }
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "green", "value": null }]
|
||||
},
|
||||
"unit": "s"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 6, "w": 16, "x": 8, "y": 15 },
|
||||
"id": 13,
|
||||
"options": {
|
||||
"legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "right" },
|
||||
"tooltip": { "mode": "multi", "sort": "desc" }
|
||||
},
|
||||
"pluginVersion": "9.0.0",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.95, sum(rate(stella_release_duration_seconds_bucket{environment=~\"$environment\"}[5m])) by (le, environment))",
|
||||
"legendFormat": "{{environment}} p95",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.50, sum(rate(stella_release_duration_seconds_bucket{environment=~\"$environment\"}[5m])) by (le, environment))",
|
||||
"legendFormat": "{{environment}} p50",
|
||||
"refId": "B"
|
||||
}
|
||||
],
|
||||
"title": "Release Duration by Environment",
|
||||
"type": "timeseries"
|
||||
}
|
||||
],
|
||||
"refresh": "30s",
|
||||
"schemaVersion": 36,
|
||||
"style": "dark",
|
||||
"tags": ["stella-ops", "releases"],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"current": { "selected": false, "text": "Prometheus", "value": "Prometheus" },
|
||||
"hide": 0,
|
||||
"includeAll": false,
|
||||
"label": "Data Source",
|
||||
"multi": false,
|
||||
"name": "datasource",
|
||||
"options": [],
|
||||
"query": "prometheus",
|
||||
"queryValue": "",
|
||||
"refresh": 1,
|
||||
"regex": "",
|
||||
"skipUrlSync": false,
|
||||
"type": "datasource"
|
||||
},
|
||||
{
|
||||
"allValue": ".*",
|
||||
"current": { "selected": true, "text": "All", "value": "$__all" },
|
||||
"datasource": "${datasource}",
|
||||
"definition": "label_values(stella_release_total, environment)",
|
||||
"hide": 0,
|
||||
"includeAll": true,
|
||||
"label": "Environment",
|
||||
"multi": true,
|
||||
"name": "environment",
|
||||
"options": [],
|
||||
"query": { "query": "label_values(stella_release_total, environment)", "refId": "StandardVariableQuery" },
|
||||
"refresh": 2,
|
||||
"regex": "",
|
||||
"skipUrlSync": false,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
}
|
||||
]
|
||||
},
|
||||
"time": { "from": "now-24h", "to": "now" },
|
||||
"timepicker": {},
|
||||
"timezone": "",
|
||||
"title": "Stella Ops - Release Overview",
|
||||
"uid": "stella-ops-releases",
|
||||
"version": 1,
|
||||
"weekStart": ""
|
||||
}
|
||||
541
deploy/telemetry/dashboards/stella-ops-sla-monitoring.json
Normal file
541
deploy/telemetry/dashboards/stella-ops-sla-monitoring.json
Normal file
@@ -0,0 +1,541 @@
|
||||
{
|
||||
"annotations": {
|
||||
"list": [
|
||||
{
|
||||
"builtIn": 1,
|
||||
"datasource": "-- Grafana --",
|
||||
"enable": true,
|
||||
"hide": true,
|
||||
"iconColor": "rgba(0, 211, 255, 1)",
|
||||
"name": "Annotations & Alerts",
|
||||
"type": "dashboard"
|
||||
},
|
||||
{
|
||||
"datasource": "${datasource}",
|
||||
"enable": true,
|
||||
"expr": "changes(stella_sla_breach_total[1m]) > 0",
|
||||
"iconColor": "red",
|
||||
"name": "SLA Breaches",
|
||||
"tagKeys": "sla_name",
|
||||
"titleFormat": "SLA Breach: {{sla_name}}"
|
||||
}
|
||||
]
|
||||
},
|
||||
"description": "Stella Ops Release Orchestrator - SLA Monitoring",
|
||||
"editable": true,
|
||||
"gnetId": null,
|
||||
"graphTooltip": 1,
|
||||
"id": null,
|
||||
"iteration": 1737158400000,
|
||||
"links": [],
|
||||
"panels": [
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
|
||||
"id": 1,
|
||||
"panels": [],
|
||||
"title": "SLA Overview",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": "${datasource}",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "yellow", "value": 0.99 },
|
||||
{ "color": "green", "value": 0.999 }
|
||||
]
|
||||
},
|
||||
"unit": "percentunit"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 5, "w": 6, "x": 0, "y": 1 },
|
||||
"id": 2,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"pluginVersion": "9.0.0",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "1 - (sum(increase(stella_release_failed_total[30d])) / sum(increase(stella_release_total[30d])))",
|
||||
"legendFormat": "",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Release Success Rate (30d SLA)",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": "${datasource}",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "yellow", "value": 0.99 },
|
||||
{ "color": "green", "value": 0.999 }
|
||||
]
|
||||
},
|
||||
"unit": "percentunit"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 5, "w": 6, "x": 6, "y": 1 },
|
||||
"id": 3,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"pluginVersion": "9.0.0",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "avg_over_time(stella_api_availability[30d])",
|
||||
"legendFormat": "",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "API Availability (30d SLA)",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": "${datasource}",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 300 },
|
||||
{ "color": "red", "value": 600 }
|
||||
]
|
||||
},
|
||||
"unit": "s"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 5, "w": 6, "x": 12, "y": 1 },
|
||||
"id": 4,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": { "calcs": ["mean"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"pluginVersion": "9.0.0",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.95, sum(rate(stella_release_duration_seconds_bucket[30d])) by (le))",
|
||||
"legendFormat": "",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Release Time p95 (Target: <10m)",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": "${datasource}",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 5, "w": 6, "x": 18, "y": 1 },
|
||||
"id": 5,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": { "calcs": ["sum"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"pluginVersion": "9.0.0",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(increase(stella_sla_breach_total[30d]))",
|
||||
"legendFormat": "",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "SLA Breaches (30d)",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 6 },
|
||||
"id": 6,
|
||||
"panels": [],
|
||||
"title": "Error Budget",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": "${datasource}",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"mappings": [],
|
||||
"max": 100,
|
||||
"min": 0,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "yellow", "value": 20 },
|
||||
{ "color": "green", "value": 50 }
|
||||
]
|
||||
},
|
||||
"unit": "percent"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 6, "w": 8, "x": 0, "y": 7 },
|
||||
"id": 7,
|
||||
"options": {
|
||||
"orientation": "auto",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"showThresholdLabels": false,
|
||||
"showThresholdMarkers": true
|
||||
},
|
||||
"pluginVersion": "9.0.0",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "((0.001 * sum(increase(stella_release_total[30d]))) - sum(increase(stella_release_failed_total[30d]))) / (0.001 * sum(increase(stella_release_total[30d]))) * 100",
|
||||
"legendFormat": "",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Error Budget Remaining (99.9% SLA)",
|
||||
"type": "gauge"
|
||||
},
|
||||
{
|
||||
"datasource": "${datasource}",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "",
|
||||
"axisPlacement": "auto",
|
||||
"barAlignment": 0,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 10,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
|
||||
"lineInterpolation": "smooth",
|
||||
"lineWidth": 2,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": { "type": "linear" },
|
||||
"showPoints": "never",
|
||||
"spanNulls": false,
|
||||
"stacking": { "group": "A", "mode": "none" },
|
||||
"thresholdsStyle": { "mode": "line" }
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 0 }
|
||||
]
|
||||
},
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 6, "w": 16, "x": 8, "y": 7 },
|
||||
"id": 8,
|
||||
"options": {
|
||||
"legend": { "calcs": [], "displayMode": "list", "placement": "bottom" },
|
||||
"tooltip": { "mode": "multi", "sort": "none" }
|
||||
},
|
||||
"pluginVersion": "9.0.0",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "(0.001 * sum(increase(stella_release_total[30d]))) - sum(increase(stella_release_failed_total[30d]))",
|
||||
"legendFormat": "Remaining Budget (failures allowed)",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Error Budget Burn Rate",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 13 },
|
||||
"id": 9,
|
||||
"panels": [],
|
||||
"title": "SLI Trends",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": "${datasource}",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "",
|
||||
"axisPlacement": "auto",
|
||||
"barAlignment": 0,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 0,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
|
||||
"lineInterpolation": "smooth",
|
||||
"lineWidth": 2,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": { "type": "linear" },
|
||||
"showPoints": "never",
|
||||
"spanNulls": false,
|
||||
"stacking": { "group": "A", "mode": "none" },
|
||||
"thresholdsStyle": { "mode": "line+area" }
|
||||
},
|
||||
"mappings": [],
|
||||
"max": 1,
|
||||
"min": 0.99,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "transparent", "value": 0.999 }
|
||||
]
|
||||
},
|
||||
"unit": "percentunit"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 14 },
|
||||
"id": 10,
|
||||
"options": {
|
||||
"legend": { "calcs": ["mean", "min"], "displayMode": "table", "placement": "bottom" },
|
||||
"tooltip": { "mode": "multi", "sort": "none" }
|
||||
},
|
||||
"pluginVersion": "9.0.0",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "1 - (sum(rate(stella_release_failed_total[1h])) / sum(rate(stella_release_total[1h])))",
|
||||
"legendFormat": "Success Rate",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Release Success Rate Over Time",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": "${datasource}",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "",
|
||||
"axisPlacement": "auto",
|
||||
"barAlignment": 0,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 0,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
|
||||
"lineInterpolation": "smooth",
|
||||
"lineWidth": 2,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": { "type": "linear" },
|
||||
"showPoints": "never",
|
||||
"spanNulls": false,
|
||||
"stacking": { "group": "A", "mode": "none" },
|
||||
"thresholdsStyle": { "mode": "line+area" }
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "transparent", "value": null },
|
||||
{ "color": "red", "value": 600 }
|
||||
]
|
||||
},
|
||||
"unit": "s"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 14 },
|
||||
"id": 11,
|
||||
"options": {
|
||||
"legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom" },
|
||||
"tooltip": { "mode": "multi", "sort": "none" }
|
||||
},
|
||||
"pluginVersion": "9.0.0",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.95, sum(rate(stella_release_duration_seconds_bucket[1h])) by (le))",
|
||||
"legendFormat": "p95 Duration",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.99, sum(rate(stella_release_duration_seconds_bucket[1h])) by (le))",
|
||||
"legendFormat": "p99 Duration",
|
||||
"refId": "B"
|
||||
}
|
||||
],
|
||||
"title": "Release Duration SLI",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 22 },
|
||||
"id": 12,
|
||||
"panels": [],
|
||||
"title": "SLA by Environment",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": "${datasource}",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"custom": {
|
||||
"align": "auto",
|
||||
"displayMode": "auto",
|
||||
"inspect": false
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "yellow", "value": 0.99 },
|
||||
{ "color": "green", "value": 0.999 }
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Success Rate" },
|
||||
"properties": [
|
||||
{ "id": "unit", "value": "percentunit" },
|
||||
{ "id": "custom.displayMode", "value": "color-background-solid" }
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Avg Duration" },
|
||||
"properties": [{ "id": "unit", "value": "s" }]
|
||||
}
|
||||
]
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 23 },
|
||||
"id": 13,
|
||||
"options": {
|
||||
"footer": { "fields": "", "reducer": ["sum"], "show": false },
|
||||
"showHeader": true,
|
||||
"sortBy": []
|
||||
},
|
||||
"pluginVersion": "9.0.0",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "1 - (sum(increase(stella_release_failed_total[7d])) by (environment) / sum(increase(stella_release_total[7d])) by (environment))",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"legendFormat": "",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "sum(increase(stella_release_total[7d])) by (environment)",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"legendFormat": "",
|
||||
"refId": "B"
|
||||
},
|
||||
{
|
||||
"expr": "avg(rate(stella_release_duration_seconds_sum[7d]) / rate(stella_release_duration_seconds_count[7d])) by (environment)",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"legendFormat": "",
|
||||
"refId": "C"
|
||||
}
|
||||
],
|
||||
"title": "SLA by Environment (7d)",
|
||||
"transformations": [
|
||||
{
|
||||
"id": "seriesToColumns",
|
||||
"options": { "byField": "environment" }
|
||||
},
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": { "Time 1": true, "Time 2": true, "Time 3": true },
|
||||
"indexByName": {},
|
||||
"renameByName": {
|
||||
"Value #A": "Success Rate",
|
||||
"Value #B": "Total Releases",
|
||||
"Value #C": "Avg Duration",
|
||||
"environment": "Environment"
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"type": "table"
|
||||
}
|
||||
],
|
||||
"refresh": "5m",
|
||||
"schemaVersion": 36,
|
||||
"style": "dark",
|
||||
"tags": ["stella-ops", "sla"],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"current": { "selected": false, "text": "Prometheus", "value": "Prometheus" },
|
||||
"hide": 0,
|
||||
"includeAll": false,
|
||||
"label": "Data Source",
|
||||
"multi": false,
|
||||
"name": "datasource",
|
||||
"options": [],
|
||||
"query": "prometheus",
|
||||
"queryValue": "",
|
||||
"refresh": 1,
|
||||
"regex": "",
|
||||
"skipUrlSync": false,
|
||||
"type": "datasource"
|
||||
}
|
||||
]
|
||||
},
|
||||
"time": { "from": "now-30d", "to": "now" },
|
||||
"timepicker": {},
|
||||
"timezone": "",
|
||||
"title": "Stella Ops - SLA Monitoring",
|
||||
"uid": "stella-ops-sla",
|
||||
"version": 1,
|
||||
"weekStart": ""
|
||||
}
|
||||
48
deploy/telemetry/storage/loki.yaml
Normal file
48
deploy/telemetry/storage/loki.yaml
Normal file
@@ -0,0 +1,48 @@
|
||||
auth_enabled: true
|
||||
|
||||
server:
|
||||
http_listen_port: 3100
|
||||
log_level: info
|
||||
|
||||
common:
|
||||
ring:
|
||||
instance_addr: 127.0.0.1
|
||||
kvstore:
|
||||
store: inmemory
|
||||
replication_factor: 1
|
||||
path_prefix: /var/loki
|
||||
|
||||
schema_config:
|
||||
configs:
|
||||
- from: 2024-01-01
|
||||
store: boltdb-shipper
|
||||
object_store: filesystem
|
||||
schema: v13
|
||||
index:
|
||||
prefix: loki_index_
|
||||
period: 24h
|
||||
|
||||
storage_config:
|
||||
filesystem:
|
||||
directory: /var/loki/chunks
|
||||
boltdb_shipper:
|
||||
active_index_directory: /var/loki/index
|
||||
cache_location: /var/loki/index_cache
|
||||
shared_store: filesystem
|
||||
|
||||
ruler:
|
||||
storage:
|
||||
type: local
|
||||
local:
|
||||
directory: /var/loki/rules
|
||||
rule_path: /tmp/loki-rules
|
||||
enable_api: true
|
||||
|
||||
limits_config:
|
||||
enforce_metric_name: false
|
||||
reject_old_samples: true
|
||||
reject_old_samples_max_age: 168h
|
||||
max_entries_limit_per_query: 5000
|
||||
ingestion_rate_mb: 10
|
||||
ingestion_burst_size_mb: 20
|
||||
per_tenant_override_config: /etc/telemetry/tenants/loki-overrides.yaml
|
||||
19
deploy/telemetry/storage/prometheus.yaml
Normal file
19
deploy/telemetry/storage/prometheus.yaml
Normal file
@@ -0,0 +1,19 @@
|
||||
global:
|
||||
scrape_interval: 15s
|
||||
evaluation_interval: 30s
|
||||
|
||||
scrape_configs:
|
||||
- job_name: "stellaops-otel-collector"
|
||||
scheme: https
|
||||
metrics_path: /
|
||||
tls_config:
|
||||
ca_file: ${PROMETHEUS_TLS_CA_FILE:-/etc/telemetry/tls/ca.crt}
|
||||
cert_file: ${PROMETHEUS_TLS_CERT_FILE:-/etc/telemetry/tls/client.crt}
|
||||
key_file: ${PROMETHEUS_TLS_KEY_FILE:-/etc/telemetry/tls/client.key}
|
||||
insecure_skip_verify: false
|
||||
authorization:
|
||||
type: Bearer
|
||||
credentials_file: ${PROMETHEUS_BEARER_TOKEN_FILE:-/etc/telemetry/auth/token}
|
||||
static_configs:
|
||||
- targets:
|
||||
- ${PROMETHEUS_COLLECTOR_TARGET:-stellaops-otel-collector:9464}
|
||||
56
deploy/telemetry/storage/tempo.yaml
Normal file
56
deploy/telemetry/storage/tempo.yaml
Normal file
@@ -0,0 +1,56 @@
|
||||
multitenancy_enabled: true
|
||||
usage_report:
|
||||
reporting_enabled: false
|
||||
|
||||
server:
|
||||
http_listen_port: 3200
|
||||
log_level: info
|
||||
|
||||
distributor:
|
||||
receivers:
|
||||
otlp:
|
||||
protocols:
|
||||
grpc:
|
||||
tls:
|
||||
cert_file: ${TEMPO_TLS_CERT_FILE:-/etc/telemetry/tls/server.crt}
|
||||
key_file: ${TEMPO_TLS_KEY_FILE:-/etc/telemetry/tls/server.key}
|
||||
client_ca_file: ${TEMPO_TLS_CA_FILE:-/etc/telemetry/tls/ca.crt}
|
||||
require_client_cert: true
|
||||
http:
|
||||
tls:
|
||||
cert_file: ${TEMPO_TLS_CERT_FILE:-/etc/telemetry/tls/server.crt}
|
||||
key_file: ${TEMPO_TLS_KEY_FILE:-/etc/telemetry/tls/server.key}
|
||||
client_ca_file: ${TEMPO_TLS_CA_FILE:-/etc/telemetry/tls/ca.crt}
|
||||
require_client_cert: true
|
||||
|
||||
ingester:
|
||||
lifecycler:
|
||||
ring:
|
||||
instance_availability_zone: ${TEMPO_ZONE:-zone-a}
|
||||
trace_idle_period: 10s
|
||||
max_block_bytes: 1_048_576
|
||||
|
||||
compactor:
|
||||
compaction:
|
||||
block_retention: 168h
|
||||
|
||||
metrics_generator:
|
||||
registry:
|
||||
external_labels:
|
||||
cluster: stellaops
|
||||
|
||||
storage:
|
||||
trace:
|
||||
backend: local
|
||||
local:
|
||||
path: /var/tempo/traces
|
||||
wal:
|
||||
path: /var/tempo/wal
|
||||
metrics:
|
||||
backend: prometheus
|
||||
|
||||
overrides:
|
||||
defaults:
|
||||
ingestion_rate_limit_bytes: 1048576
|
||||
max_traces_per_user: 200000
|
||||
per_tenant_override_config: /etc/telemetry/tenants/tempo-overrides.yaml
|
||||
Reference in New Issue
Block a user