With the help of Datadog's unofficial `search_events` endpoint, we can see which of our monitors fail the most, which is a great place to start when trying to reduce alert spam.
(using Kennel)
```
rake brittle TAG=team:foo
analyzing 104 monitors ... 10.9s

Foo too high🔒
https://app.datadoghq.com/monitors/12345
Frequency: 18.95/h
success: 56x
warning: 44x
```
# Prints how often each monitor carrying the tag given in ENV["TAG"] produced
# events during the last week, noisiest monitor first, followed by a per
# alert-type breakdown. Meant as a starting point for reducing alert spam.
#
# Usage: rake brittle TAG=team:foo
#
# Environment:
#   TAG               - required (KeyError when missing); tag used to select monitors
#   DATADOG_SUBDOMAIN - optional; subdomain used in the printed links, defaults to "app"
#
# NOTE(review): the code relies on each event having :date_detected and
# :alert_type, and on the last returned event being the oldest one -- confirm
# against the current search_events response.
desc "Show how brittle selected teams monitors are TAG="
task brittle: "kennel:environment" do
  api = Kennel.send(:api) # called via `send`, presumably because `api` is not public
  monitors = api.list("monitor", with_downtimes: false, monitor_tags: [ENV.fetch("TAG")])
  abort "No monitors found" if monitors.empty?

  hour = 60 * 60
  interval = 7 * 24 * hour # look back one week
  now = Time.now.to_i
  max = 100 # passed as `count` to every search_events request

  data = Kennel::Progress.progress "analyzing #{monitors.size} monitors" do
    results = Kennel::Utils.parallel(monitors) do |monitor|
      events = api.list(
        "monitor/#{monitor[:id]}/search_events",
        from_ts: now - interval, to_ts: now, count: max, start: 0
      )
      next if events.empty?

      # Measure from the last returned event instead of the full interval, so the
      # rate stays meaningful when the result was cut off at `max` events.
      # :date_detected is divided by 1000 -- presumably milliseconds, confirm.
      # Clamp to 1s: an event detected this very second would otherwise yield a
      # zero duration and an infinite frequency.
      duration = [now - (events.last.fetch(:date_detected) / 1000), 1].max
      frequency = events.size * (hour / duration.to_f)
      [monitor, frequency, events]
    end
    results.compact # monitors without events returned nil
  end

  subdomain = ENV.fetch("DATADOG_SUBDOMAIN", "app")

  # spammy first
  data.sort_by { |_, frequency, _| -frequency }.each do |monitor, frequency, events|
    puts monitor.fetch(:name)
    puts "https://#{subdomain}.datadoghq.com/monitors/#{monitor.fetch(:id)}"
    puts "Frequency: #{frequency.round(2)}/h"

    # sort_by returns a new collection, so iterate over its result directly
    by_type = events.group_by { |event| event.fetch(:alert_type) }
    by_type.sort_by { |type, _| type.to_s }.each do |type, grouped_events|
      puts "#{type}: #{grouped_events.size}x"
    end
    puts
  end
end