From 9664b10ad1fa047020eac872bb2d184ea3197313 Mon Sep 17 00:00:00 2001 From: Dave Shrewsberry Date: Tue, 17 Mar 2026 10:08:45 -0400 Subject: [PATCH 1/2] ROX-33626: Add Prometheus metrics to alert manager Add instrumentation to AlertAndNotify and mergeManyAlerts to collect production data on alert processing performance before optimizing. Metrics added: - alert_and_notify_duration_ms: end-to-end duration histogram - alert_and_notify_incoming_count: incoming alerts per call - merge_many_alerts_duration_ms: merge duration histogram - merge_many_alerts_previous_count: previous alerts fetched from DB - alert_outcome_total: counter vec by outcome (new/updated/resolved) Partially generated by AI. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../alertmanager/alert_manager_impl.go | 13 +++++ central/detection/alertmanager/metrics.go | 57 +++++++++++++++++++ 2 files changed, 70 insertions(+) create mode 100644 central/detection/alertmanager/metrics.go diff --git a/central/detection/alertmanager/alert_manager_impl.go b/central/detection/alertmanager/alert_manager_impl.go index 79aa95f442d34..6939ca3f2f693 100644 --- a/central/detection/alertmanager/alert_manager_impl.go +++ b/central/detection/alertmanager/alert_manager_impl.go @@ -48,6 +48,11 @@ func getDeploymentIDsFromAlerts(alertSlices ...[]*storage.Alert) set.StringSet { // AlertAndNotify is the main function that implements the AlertManager interface func (d *alertManagerImpl) AlertAndNotify(ctx context.Context, currentAlerts []*storage.Alert, oldAlertFilters ...AlertFilterOption) (set.StringSet, error) { + defer func(start time.Time) { + alertAndNotifyDuration.Observe(float64(time.Since(start).Milliseconds())) + }(time.Now()) + alertAndNotifyIncomingCount.Observe(float64(len(currentAlerts))) + // Merge the old and the new alerts. newAlerts, updatedAlerts, toBeResolvedAlerts, err := d.mergeManyAlerts(ctx, currentAlerts, oldAlertFilters...) if err != nil { @@ -377,6 +382,13 @@ func (d *alertManagerImpl) mergeManyAlerts( incomingAlerts []*storage.Alert, oldAlertFilters ...AlertFilterOption, ) (newAlerts, updatedAlerts, toBeResolvedAlerts []*storage.Alert, err error) { + defer func(start time.Time) { + mergeManyAlertsDuration.Observe(float64(time.Since(start).Milliseconds())) + alertOutcomeTotal.WithLabelValues("new").Add(float64(len(newAlerts))) + alertOutcomeTotal.WithLabelValues("updated").Add(float64(len(updatedAlerts))) + alertOutcomeTotal.WithLabelValues("resolved").Add(float64(len(toBeResolvedAlerts))) + }(time.Now()) + qb := search.NewQueryBuilder().AddExactMatches( search.ViolationState, storage.ViolationState_ACTIVE.String(), @@ -389,6 +401,7 @@ func (d *alertManagerImpl) mergeManyAlerts( err = errors.Wrapf(err, "couldn't load previous alerts (query was %s)", qb.Query()) return } + mergeManyAlertsPreviousCount.Observe(float64(len(previousAlerts))) // Merge any alerts that have new and old alerts. for _, alert := range incomingAlerts { diff --git a/central/detection/alertmanager/metrics.go b/central/detection/alertmanager/metrics.go new file mode 100644 index 0000000000000..2f5e93b3b6d50 --- /dev/null +++ b/central/detection/alertmanager/metrics.go @@ -0,0 +1,57 @@ +package alertmanager + +import ( + "github.com/prometheus/client_golang/prometheus" + "github.com/stackrox/rox/pkg/metrics" +) + +var ( + alertAndNotifyDuration = prometheus.NewHistogram(prometheus.HistogramOpts{ + Namespace: metrics.PrometheusNamespace, + Subsystem: metrics.CentralSubsystem.String(), + Name: "alert_and_notify_duration_ms", + Help: "End-to-end duration of AlertAndNotify in milliseconds", + Buckets: prometheus.ExponentialBuckets(4, 2, 12), + }) + + alertAndNotifyIncomingCount = prometheus.NewHistogram(prometheus.HistogramOpts{ + Namespace: metrics.PrometheusNamespace, + Subsystem: metrics.CentralSubsystem.String(), + Name: "alert_and_notify_incoming_count", + Help: "Number of incoming alerts per AlertAndNotify call", + Buckets: prometheus.ExponentialBuckets(1, 2, 14), + }) + + mergeManyAlertsDuration = prometheus.NewHistogram(prometheus.HistogramOpts{ + Namespace: metrics.PrometheusNamespace, + Subsystem: metrics.CentralSubsystem.String(), + Name: "merge_many_alerts_duration_ms", + Help: "Duration of mergeManyAlerts in milliseconds", + Buckets: prometheus.ExponentialBuckets(4, 2, 12), + }) + + mergeManyAlertsPreviousCount = prometheus.NewHistogram(prometheus.HistogramOpts{ + Namespace: metrics.PrometheusNamespace, + Subsystem: metrics.CentralSubsystem.String(), + Name: "merge_many_alerts_previous_count", + Help: "Number of previous alerts fetched from DB per mergeManyAlerts call", + Buckets: prometheus.ExponentialBuckets(1, 2, 14), + }) + + alertOutcomeTotal = prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: metrics.PrometheusNamespace, + Subsystem: metrics.CentralSubsystem.String(), + Name: "alert_outcome_total", + Help: "Cumulative count of alert outcomes from mergeManyAlerts", + }, []string{"outcome"}) +) + +func init() { + metrics.EmplaceCollector( + alertAndNotifyDuration, + alertAndNotifyIncomingCount, + mergeManyAlertsDuration, + mergeManyAlertsPreviousCount, + alertOutcomeTotal, + ) +} From a634bf63121ce52600a68cdaa309b859bd286eb0 Mon Sep 17 00:00:00 2001 From: Dave Shrewsberry Date: Tue, 17 Mar 2026 12:58:21 -0400 Subject: [PATCH 2/2] ROX-33626: Improve readability of alert manager metrics Address PR feedback: extract observeDurationMs helper for the defer-based duration pattern, and move outcome counting to a plain recordAlertOutcomes call at the end of mergeManyAlerts instead of using closures over named return values. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../alertmanager/alert_manager_impl.go | 12 +++-------- central/detection/alertmanager/metrics.go | 20 +++++++++++++++++++ 2 files changed, 23 insertions(+), 9 deletions(-) diff --git a/central/detection/alertmanager/alert_manager_impl.go b/central/detection/alertmanager/alert_manager_impl.go index 6939ca3f2f693..2c067da5676fc 100644 --- a/central/detection/alertmanager/alert_manager_impl.go +++ b/central/detection/alertmanager/alert_manager_impl.go @@ -48,9 +48,7 @@ func getDeploymentIDsFromAlerts(alertSlices ...[]*storage.Alert) set.StringSet { // AlertAndNotify is the main function that implements the AlertManager interface func (d *alertManagerImpl) AlertAndNotify(ctx context.Context, currentAlerts []*storage.Alert, oldAlertFilters ...AlertFilterOption) (set.StringSet, error) { - defer func(start time.Time) { - alertAndNotifyDuration.Observe(float64(time.Since(start).Milliseconds())) - }(time.Now()) + defer observeDurationMs(alertAndNotifyDuration)() alertAndNotifyIncomingCount.Observe(float64(len(currentAlerts))) // Merge the old and the new alerts. @@ -382,12 +380,7 @@ func (d *alertManagerImpl) mergeManyAlerts( incomingAlerts []*storage.Alert, oldAlertFilters ...AlertFilterOption, ) (newAlerts, updatedAlerts, toBeResolvedAlerts []*storage.Alert, err error) { - defer func(start time.Time) { - mergeManyAlertsDuration.Observe(float64(time.Since(start).Milliseconds())) - alertOutcomeTotal.WithLabelValues("new").Add(float64(len(newAlerts))) - alertOutcomeTotal.WithLabelValues("updated").Add(float64(len(updatedAlerts))) - alertOutcomeTotal.WithLabelValues("resolved").Add(float64(len(toBeResolvedAlerts))) - }(time.Now()) + defer observeDurationMs(mergeManyAlertsDuration)() qb := search.NewQueryBuilder().AddExactMatches( search.ViolationState, @@ -458,6 +451,7 @@ func (d *alertManagerImpl) mergeManyAlerts( } } } + recordAlertOutcomes(len(newAlerts), len(updatedAlerts), len(toBeResolvedAlerts)) return } diff --git a/central/detection/alertmanager/metrics.go b/central/detection/alertmanager/metrics.go index 2f5e93b3b6d50..eb58d2c23230f 100644 --- a/central/detection/alertmanager/metrics.go +++ b/central/detection/alertmanager/metrics.go @@ -1,10 +1,24 @@ package alertmanager import ( + "time" + "github.com/prometheus/client_golang/prometheus" "github.com/stackrox/rox/pkg/metrics" ) +// observeDurationMs returns a function that, when called, observes the elapsed +// time in milliseconds since observeDurationMs was invoked. Intended for use +// with defer: +// +// defer observeDurationMs(myHistogram)() +func observeDurationMs(h prometheus.Histogram) func() { + start := time.Now() + return func() { + h.Observe(float64(time.Since(start).Milliseconds())) + } +} + var ( alertAndNotifyDuration = prometheus.NewHistogram(prometheus.HistogramOpts{ Namespace: metrics.PrometheusNamespace, @@ -46,6 +60,12 @@ var ( }, []string{"outcome"}) ) +func recordAlertOutcomes(newCount, updatedCount, resolvedCount int) { + alertOutcomeTotal.WithLabelValues("new").Add(float64(newCount)) + alertOutcomeTotal.WithLabelValues("updated").Add(float64(updatedCount)) + alertOutcomeTotal.WithLabelValues("resolved").Add(float64(resolvedCount)) +} + func init() { metrics.EmplaceCollector( alertAndNotifyDuration,