Datadog: Show Brittle Monitors

With the help of datadogs unofficial search_events endpoint we can see which of our monitors fail the most, a great place to start when trying to reduce alert spam.
(using Kennel)

rake brittle TAG=team:foo
analyzing 104 monitors ... 10.9s
Foo too high🔒
https://app.datadoghq.com/monitors/12345
Frequency: 18.95/h
success: 56x
warning: 44x

 

desc "Show how brittle selected teams monitors are TAG="
task brittle: "kennel:environment" do
  monitors = Kennel.send(:api).list("monitor", with_downtimes: false, monitor_tags: [ENV.fetch("TAG")])
  abort "No monitors found" if monitors.empty?

  hour = 60 * 60
  interval = 7 * 24 * hour
  now = Time.now.to_i
  max = 100

  data = Kennel::Progress.progress "analyzing #{monitors.size} monitors" do
    Kennel::Utils.parallel monitors do |monitor|
      events = Kennel.send(:api).list("monitor/#{monitor[:id]}/search_events", from_ts: now - interval, to_ts: now, count: max, start: 0)
      next if events.empty?

      duration = now - (events.last.fetch(:date_detected) / 1000)
      amount = events.size
      frequency = amount * (hour / duration.to_f)
      [monitor, frequency, events]
    end.compact
  end

  # spammy first
  data.sort_by! { |_, frequency, _| -frequency }

  data.each do |m, frequency, events|
    groups = events.group_by { |e| e.fetch(:alert_type) }
    groups.sort_by(&:first) # sort by alert_type

    puts m.fetch(:name)
    puts "https://zendesk.datadoghq.com/monitors/#{m.fetch(:id)}"
    puts "Frequency: #{frequency.round(2)}/h"
    groups.each do |type, grouped_events|
      puts "#{type}: #{grouped_events.size}x"
    end
    puts
  end
end

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out /  Change )

Google photo

You are commenting using your Google account. Log Out /  Change )

Twitter picture

You are commenting using your Twitter account. Log Out /  Change )

Facebook photo

You are commenting using your Facebook account. Log Out /  Change )

Connecting to %s