Datadog: Show Metric Usage Warning From HPA Metrics

Problem

When editing a metric in the Datadog UI (e.g. /metrics/summary), a warning is shown if the metric is in use (i.e. a dashboard or monitor uses it). But if that metric is used by a Kubernetes HorizontalPodAutoscaler, no such warning is shown.

Solution

Generate a dashboard that uses 1 widget for every query an HPA uses.

require 'kennel'

# Keeps a single datadog dashboard in sync with the metrics that kubernetes
# HorizontalPodAutoscalers query: 1 widget per metric, so the metric shows up as "in use".
#
# All api access goes through Kennel::Api; its private `request` method is called via `send`.
class HpaDashboard
  # metric whose `metric` tag carries the (normalized) query each HPA uses
  SOURCE_METRIC = "datadog.cluster_agent.external_metrics.delay_seconds".freeze
  attr_reader :id

  # id: id of the dashboard that #update overwrites
  # timeframe: how many seconds to look back from now in all api queries
  def initialize(id, timeframe:)
    @id = id
    @api = Kennel::Api.new
    @from = Time.now.to_i - timeframe
  end

  # see https://docs.datadoghq.com/api/latest/metrics/#get-active-metrics-list
  # this has an undocumented limit of 250000 metrics so we can't just use super old @from
  # also tried /api/v2/metrics which returns similar results but is even slower (filtering it with 'queried' + big window did not help)
  #
  # Returns a Set of whatever the `metrics` key of the response holds.
  def available_metrics
    @api.send(
      :request, :get, "/api/v1/metrics",
      params: { from: @from }
    ).fetch(:metrics).to_set
  end

  # Queries SOURCE_METRIC grouped by its `metric` tag and returns the unique tag values,
  # 1 per query an HPA used between @from and now.
  def queries_used_by_any_hpa
    @api.send(
      :request, :get, "/api/v1/query",
      params: {
        query: "avg:#{SOURCE_METRIC}{*} by {metric}",
        from: @from,
        to: Time.now.to_i
      }
    ).fetch(:series).map do |data|
      # scope is a comma separated list of key:value tags, we only want the metric tag
      data.fetch(:scope).split(",").to_h { |t| t.split(":", 2) }["metric"]
    end.uniq
  end

  # convert fallout from query normalization to find actual metrics
  # for example default_zero(foo{a:b}) is converted to "default_zero_foo_a:b"
  # this ignores when multiple metrics are in a single query for example a / b * 100
  # since a and b are usually the same
  #
  # queries: Array of normalized query strings, neither the array nor its strings are modified
  # Returns a Set of metric names, inserted in sorted order.
  def extract_metrics(queries)
    metrics = queries.map do |query|
      without_math_suffix = query.sub(/\.total_\d+$/, ".total") # math leftover *.total_100 -> *.total
      without_math_suffix.sub(/^_*(ewma_\d+|default_zero)_*/, "") # remove math
    end
    # sorted for debug printing and to keep the dashboard stable
    metrics.uniq.sort.to_set
  end

  # since available_metrics is not reliable (hits limit or just has old data)
  # we verify each potentially unknown metric 1-by-1 by hitting this cheap endpoint
  # https://docs.datadoghq.com/api/latest/metrics/?code-lang=curl#get-metric-metadata
  #
  # Modifies `unknown` in place: only metrics whose response has an :error entry remain.
  def slow_filter_unknown!(unknown)
    unknown.select! do |metric|
      print "Verifying potentially unknown metric #{metric} ..."
      not_found = @api.send(:request, :get, "/api/v1/metrics/#{metric}", ignore_404: true)[:error]
      puts(not_found ? "not found" : "found")
      not_found # keep the truly not found
    end
  end

  # Overwrites the dashboard with 1 timeseries widget per entry of used_metrics.
  # The description records when and from where (terminal or CI) the update ran.
  def update(used_metrics)
    attributes = {
      title: "HPA metrics used",
      description: <<~DESC,
        1 widget for each metric used in compute maintained kubernetes clusters (anything that reports #{SOURCE_METRIC})
        Automatically filled by a `rake hpa_dashboard` cron from kennel GHA.
        Last updated: #{Time.now} #{$stdout.tty? ? "manually" : RakeHelper.ci_url}
      DESC
      layout_type: "ordered",
      reflow_type: "auto",
      tags: ["team:compute", "team:compute-accelerate"],
      widgets: used_metrics.map do |m|
        {
          definition: {
            title: m,
            type: "timeseries",
            requests: [
              {
                response_format: "timeseries",
                queries: [
                  {
                    name: "query1",
                    data_source: "metrics",
                    query: "avg:#{m}{*}"
                  }
                ],
                display_type: "line"
              }
            ]
          }
        }
      end
    }
    @api.update("dashboard", @id, attributes)
  end
end

desc "Update hpa dashboard to track all currently used external metrics people that change metrics in the UI see that they are used"
task hpa_dashboard: "kennel:environment" do
  # everything below only looks at the last day
  board = HpaDashboard.new(DASHBOARD_ID, timeframe: 24 * 60 * 60)

  known = board.available_metrics
  puts "Found #{known.size} available metrics"

  queries = board.queries_used_by_any_hpa
  puts "Found #{queries.size} used queries"

  metrics = board.extract_metrics(queries)
  puts "Found #{metrics.size} used metrics"

  # validate we found everything:
  # a used metric that is not available most likely got mangled by our parsing,
  # double-check 1-by-1 unless there are too many to make that worthwhile
  missing = metrics - known
  board.slow_filter_unknown!(missing) if missing.size < 100
  if missing.any?
    $stdout.flush # otherwise mixes with stderr in GHA
    abort <<~MSG
      #{missing.size} unknown metrics found, these would not be displayable on the dashboard, improve parsing code
      usually that means some part of the metrics got mangled and it cannot be found in datadog
      see https://datadoghq.com/metric/summary to find valid metrics

      #{missing.join("\n")}
    MSG
  end

  board.update(metrics)
  puts "Updated dashboard https://datadoghq.com/dashboard/#{board.id}"
rescue Exception # rubocop:disable Lint/RescueException
  # also catches the SystemExit from abort above, the error is always re-raised
  unless $stdout.tty? # do not spam slack when debugging
    send_to_slack <<~MSG
      HPA dashboard update failed #{RakeHelper.ci_url}, fix it!
    MSG
  end
  raise
end

Testing Rego With enforced code coverage

A ruby script we use to test our Rego policies. They need to be in the policies/ folder. Each line that is not exercised by tests will make it fail.

desc "Test policies"
# Runs `opa test` with coverage for everything in policies/ and aborts when
# - opa itself fails (its combined stdout+stderr is printed)
# - a file from policy_files is missing in the coverage report
# - a file has lines listed as not covered
task test: ["update:opa"] do
  output = `opa test --coverage --verbose policies/* 2>&1`
  abort output unless $?.success?

  # report is read as { "files" => { path => { "not_covered" => [{ "start" => { "row" => 1 }, "end" => { "row" => 2 } }] } } }
  coverage = JSON.parse(output).fetch("files")
  errors = policy_files.flat_map do |policy|
    result = coverage[policy]
    # `next` and not `return`: we only want to end this iteration,
    # `return` inside the task block would raise LocalJumpError / leave the script
    next [policy] unless result # untested

    (result["not_covered"] || []).map do |line|
      start = line.dig("start", "row")
      finish = line.dig("end", "row")
      "#{policy}:#{start}#{"-#{finish}" if start != finish}" # file:row or file:first-last
    end
  end
  abort "Missing coverage:\n#{errors.join("\n")}" if errors.any?
end

Simple Kubernetes Leader Election via Entrypoint script

Leader election in Kubernetes is often done via sidecars + Endpoints or Leases, which is a lot of complexity compared to ConfigMap-based locking (as used by operator-sdk), which also avoids having the leader move around during execution.

kube-leader produces a downloadable binary that implements leader election via a Docker ENTRYPOINT. Add it to your Dockerfile, add Kubernetes env vars/permissions, and you are done.

Faster Page Through Reused HTML Options with Rails & JS

We render an edit page that repeats many long option lists, so we made Rails render its options only once and made JavaScript reuse them, reducing render time by ~70% and page size by ~90%.

It resets the options when a user reloads a partially filled-out page, but otherwise supports keyboard and mouse navigation nicely.

 

# app/views/projects/_form.html.erb
<% list = [["Foo", "foo"], ["Bar", "bar"], ["Name", @project.name]] %>
  <%= form.reused_select :name, list %>
  <%= form.reused_select :name, list %>
  <%= form.reused_select :name, list %>

# config/initializers/reused_select.rb
ActionView::Helpers::FormBuilder.class_eval do
  # Renders a select for `column` that only contains the currently selected option
  # and points (via data-options-id) to an element holding the complete option list,
  # which is rendered once per template and per `values` object.
  #
  # column: attribute that is read from the form object
  # values: Array of [name, value] pairs, reuse the same object to share the rendered list
  # options: passed on to `select`, :html is extended with the data attributes, the hash itself is not modified
  #
  # Returns the rendered html.
  def reused_select(column, values, options={})
    value = object.public_send(column)
    options_id = "options_id-#{values.object_id}"
    html = (options[:html] || {}).merge("data-selected": value, "data-options-id": options_id)
    options = options.merge(html: html) # merge into a copy so the hash of the caller stays untouched
    placeholder_values = [values.detect { |_, v| v == value }] # make select look normal
    rendered = select column, placeholder_values, options

    # render the real values only once and reuse them via js
    if @template.instance_eval { (@reused_select ||= Set.new).add? options_id }
      # content_tag and not tag: `tag` renders `<span ... />` which html parsers treat as an
      # unclosed span, so everything that follows would end up inside the hidden element
      rendered << @template.content_tag(:span, "", id: options_id, style: "display: none", data: {options: [["", ""]] + values})
    end
    rendered
  end
end

# app/assets/javascripts/application.js
// Event handler: replaces the options of the select that triggered the event
// with the [name, value] pairs stored in the "options" data of the element
// whose id is in the select's data-options-id, and re-selects the value from data-selected.
function reuseSelect(e){
  var target = e.target;
  var $target = $(target);
  var sourceId = $target.data("options-id");
  var current = $target.data("selected"); // values come from json, so be careful to match the type
  target.innerHTML = ''; // clear out fake options
  var pairs = $("#" + sourceId).data("options");
  $(pairs).each(function(_, pair){
    var option = document.createElement("option");
    option.innerText = pair[0]; // name
    option.value = pair[1];
    if(pair[1] === current) { option.selected = "selected"; }
    target.appendChild(option);
  });
}

// fill in the real options on the first mousedown and on the first focus of each select
// (.one with multiple event types runs the handler once per type and element)
$("select[data-options-id]").one("mousedown focus", reuseSelect);

 

Automated Sudo Password Prompt with SshKit

Basically what sshkit-sudo gem promises, but:

  • 1 hack instead of multiple layers
  • obvious how to debug
  • 1 global password
  • does not capture the password prompt
  • does not print the output when capturing
  • works when not using SshKit::DSL

Hint: You might want to start with an extra “puts data” to see what your password prompt looks like.

SSHKit::Command.prepend(Module.new do
  # 1 password for all commands, asked for on first use.
  # A closure variable and not @@password: class variables in methods defined inside a
  # Module.new block are looked up at the toplevel, which raises on ruby 3.0+
  sudo_password = nil

  # Answers a sudo password prompt instead of passing it on as output,
  # all other data goes to the original on_stdout.
  define_method(:on_stdout) do |channel, data|
    if data.include?("[sudo] password for ")
      # prompt on stderr and read without echo
      # -r + printf '%s' keep backslashes and % in the password intact
      sudo_password ||= `echo password: 1>&2 && read -rs PASSWORD && printf '%s' "$PASSWORD"`
      channel.send_data("#{sudo_password}\n")
    else
      super(channel, data) # methods from define_method need explicit super arguments
    end
  end
end)

# Usage example: `servers` has to be defined elsewhere.
# Presumably runs `sudo ls` on each of them and returns its output, the password prompt
# is then answered by the on_stdout hook - verify against your sshkit version.
on servers do
  capture :sudo, "ls"
end