We had a few incidents where on-call devs missed their calls because of various spam-blocking setups or “do not disturb” settings.
We now run a small service that test-notifies everyone once a month to make sure notifications go through. Notifications go out shortly before their ‘do not disturb’ stops so we do not wake them in the middle of the night, but still have a realistic situation.
Our setup has more logging/stats etc, but it goes something like this:
# configure user schedule
require 'yaml'
# One entry per user: "name" (used in the incident title), "id" (the PagerDuty
# user the test incident gets assigned to) and "cron" (when to send the test).
# The keys of an entry must be indented under its "- name:" line, otherwise the
# document is not valid YAML.
users = YAML.load <<~YAML
  - name: "John Doe"
    id: ABCD
    # cron: "* * * * * America/Los_Angeles" # every minute ... for local testing
    cron: "55 6 * * 2#1 America/Los_Angeles" # every first Tuesday of the month at 6:55am
  # ... more users here
YAML
# code to notify users
require 'json'
require 'faraday'
# Creates a PagerDuty test incident assigned to the given user.
#
# Sends a POST to the PagerDuty incidents API. When the response is 429
# (rate limited) it waits 60 seconds and tries one more time.
#
# @param user [Hash] user entry; "name" goes into the incident title and "id"
#   is used as the assignee
# @return [Object] the "id" of the "incident" object in the JSON response
# @raise [KeyError] when the PAGERDUTY_TOKEN / SERVICE_ID environment variables,
#   a user key, or the expected response keys are missing
# @raise [RuntimeError] when the final response status is 300 or higher
#   (including a 429 on the last attempt)
def create_test_incident(user)
  connection = Faraday.new
  payload = {
    incident: {
      type: "incident",
      title: "Pagerduty Tester: Incident for #{user.fetch("name")}, press resolve",
      service: {
        id: ENV.fetch("SERVICE_ID"),
        type: "service_reference"
      },
      assignments: [{
        assignee: {
          id: user.fetch("id"),
          type: "user_reference"
        }
      }]
    }
  }.to_json
  max_attempts = 2

  max_attempts.times do |attempt|
    response = connection.post do |req|
      req.url "https://api.pagerduty.com/incidents"
      req.headers['Content-Type'] = 'application/json'
      req.headers['Accept'] = 'application/vnd.pagerduty+json;version=2'
      req.headers['From'] = 'realusers@email.com' # incident owner
      req.headers['Authorization'] = "Token token=#{ENV.fetch("PAGERDUTY_TOKEN")}"
      req.body = payload
    end

    # pagerduty rate-limits to 6 incidents/min/service: wait and retry, but
    # only while another attempt is left, otherwise report the 429 below
    if response.status == 429 && attempt < max_attempts - 1
      sleep 60
      next
    end

    raise "Request failed #{response.status} -- #{response.body}" if response.status >= 300

    # Return right away: falling through to the next iteration would post a
    # second, duplicate incident for the same user.
    return JSON.parse(response.body).fetch("incident").fetch("id")
  end
end
# run on a schedule (no threading / forking)
require 'serial_scheduler'
require 'fugit'
# Register one scheduled job per configured user, then hand control to the scheduler.
scheduler = SerialScheduler.new
users.each do |user|
  # NOTE(review): timeout: 10 is shorter than the 60 second rate-limit sleep in
  # create_test_incident -- confirm how SerialScheduler enforces the timeout and
  # whether the retry can ever complete.
  scheduler.add("Notify #{user.fetch("name")}", cron: user.fetch("cron"), timeout: 10) do
    user_id = user.fetch("id")
    # create_test_incident is defined at the top level of this script and no
    # PagerdutyTester constant exists here, so call the method directly.
    incident_id = create_test_incident(user)
    puts "Created incident for #{user_id} https://#{ENV.fetch('SUBDOMAIN')}.pagerduty.com/incidents/#{incident_id}"
  rescue StandardError => e
    # Log and keep going so one failing user does not stop the other jobs.
    puts "Creating incident for #{user_id} failed #{e}"
  end
end
scheduler.run