With the help of Datadog's unofficial `search_events` endpoint, we can see which of our monitors fail the most, which is a great place to start when trying to reduce alert spam.
(using Kennel)
rake brittle TAG=team:foo
analyzing 104 monitors ... 10.9s
Foo too high🔒
https://app.datadoghq.com/monitors/12345
Frequency: 18.95/h
success: 56x
warning: 44x
# Reports how often each monitor carrying the given tag produced events during
# the last 7 days, printing the most frequently firing monitors first.
#
# Usage: rake brittle TAG=team:foo
#
# Environment:
#   TAG               - monitor tag used to select monitors (required; ENV.fetch raises if unset)
#   DATADOG_SUBDOMAIN - subdomain used in the printed monitor links (optional;
#                       defaults to the previously hard-coded "zendesk")
#
# Aborts with "No monitors found" when the tag matches nothing.
desc "Show how brittle selected teams monitors are TAG="
task brittle: "kennel:environment" do
  monitors = Kennel.send(:api).list("monitor", with_downtimes: false, monitor_tags: [ENV.fetch("TAG")])
  abort "No monitors found" if monitors.empty?

  hour = 60 * 60
  interval = 7 * 24 * hour
  now = Time.now.to_i
  max = 100 # number of events requested per monitor

  data = Kennel::Progress.progress "analyzing #{monitors.size} monitors" do
    Kennel::Utils.parallel(monitors) do |monitor|
      events = Kennel.send(:api).list("monitor/#{monitor[:id]}/search_events", from_ts: now - interval, to_ts: now, count: max, start: 0)
      next if events.empty? # monitors without events are dropped by the compact below

      # The observed window runs from the last returned event until now.
      # NOTE(review): assumes the endpoint returns events newest-first and that
      # date_detected is in milliseconds (hence the / 1000) -- confirm.
      oldest = events.last.fetch(:date_detected) / 1000

      # Clamp to one second so an event detected at (or, with clock skew, after)
      # `now` cannot produce an infinite or negative frequency.
      duration = [now - oldest, 1].max

      frequency = events.size * (hour / duration.to_f)
      [monitor, frequency, events]
    end.compact
  end

  subdomain = ENV.fetch("DATADOG_SUBDOMAIN", "zendesk")

  # spammy first
  data.sort_by { |_, frequency, _| -frequency }.each do |monitor, frequency, events|
    puts monitor.fetch(:name)
    puts "https://#{subdomain}.datadoghq.com/monitors/#{monitor.fetch(:id)}"
    puts "Frequency: #{frequency.round(2)}/h"

    # sort_by returns a new array of [alert_type, events] pairs; iterate that
    # result so the per-type counts are actually printed in alert_type order.
    by_type = events.group_by { |event| event.fetch(:alert_type) }.sort_by(&:first)
    by_type.each do |type, grouped_events|
      puts "#{type}: #{grouped_events.size}x"
    end
    puts
  end
end