Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add metrics for other unhealthy VM states #2597

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 18 additions & 10 deletions src/bosh-director/lib/bosh/director/metrics_collector.rb
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,13 @@ def initialize(config)
labels: %i[name],
docstring: 'Number of unresponsive agents per deployment',
)

@unhealthy_instances = Prometheus::Client.registry.gauge(
:bosh_unhealthy_instances,
labels: %i[name],
docstring: 'Number of unhealthy instances per deployment',
)

@scheduler = Rufus::Scheduler.new
end

Expand Down Expand Up @@ -109,32 +116,33 @@ def populate_metrics

populate_network_metrics

populate_vm_metrics
populate_vm_metrics('/unresponsive_agents', @unresponsive_agents)
populate_vm_metrics('/unhealthy_instances', @unhealthy_instances)

@logger.info('populated metrics')
end

def populate_vm_metrics
response = Net::HTTP.get_response('127.0.0.1', '/unresponsive_agents', @config.health_monitor_port)
def populate_vm_metrics(metric, metric_call)
response = Net::HTTP.get_response('127.0.0.1', "#{metric}", @config.health_monitor_port)
return unless response.is_a?(Net::HTTPSuccess)

unresponsive_agent_counts = JSON.parse(response.body)
return unless unresponsive_agent_counts.is_a?(Hash)
metric_counts = JSON.parse(response.body)
return unless metric_counts.is_a?(Hash)

existing_deployment_names = @unresponsive_agents.values.map do |key, _|
existing_deployment_names = metric_call.values.map do |key, _|
# The keys within the Prometheus::Client::Metric#values method are actually hashes. So the
# data returned from values looks like:
# { { name: "deployment_a"} => 10, { name: "deployment_b "} => 0, ... }
key[:name]
end

unresponsive_agent_counts.each do |deployment, count|
@unresponsive_agents.set(count, labels: { name: deployment })
metric_counts.each do |deployment, count|
metric_call.set(count, labels: { name: deployment })
end

removed_deployments = existing_deployment_names - unresponsive_agent_counts.keys
removed_deployments = existing_deployment_names - metric_counts.keys
removed_deployments.each do |deployment|
@unresponsive_agents.set(0, labels: { name: deployment })
metric_call.set(0, labels: { name: deployment })
end
end

Expand Down
125 changes: 91 additions & 34 deletions src/bosh-director/spec/unit/metrics_collector_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ def tick
allow(Api::ConfigManager).to receive(:deploy_config_enabled?).and_return(true, false)
stub_request(:get, /unresponsive_agents/)
.to_return(status: 200, body: JSON.dump('flaky_deployment' => 1, 'good_deployment' => 0))
stub_request(:get, /unhealthy_instances/)
.to_return(status: 200, body: JSON.dump('flaky_deployment' => 1, 'good_deployment' => 0))
end

after do
Expand All @@ -44,6 +46,7 @@ def tick
Prometheus::Client.registry.unregister(:bosh_networks_dynamic_ips_total)
Prometheus::Client.registry.unregister(:bosh_networks_dynamic_free_ips_total)
Prometheus::Client.registry.unregister(:bosh_unresponsive_agents)
Prometheus::Client.registry.unregister(:bosh_unhealthy_instances)
end

describe '#prep' do
Expand Down Expand Up @@ -286,53 +289,107 @@ def tick
end

describe 'vm metrics' do
it 'emits the number of unresponsive agents for each deployment' do
metrics_collector.start
metric = Prometheus::Client.registry.get(:bosh_unresponsive_agents)
expect(metric.get(labels: { name: 'flaky_deployment' })).to eq(1)
expect(metric.get(labels: { name: 'good_deployment' })).to eq(0)
end

context 'when the health monitor returns a non 200 response' do
before do
stub_request(:get, '127.0.0.1:12345/unresponsive_agents')
.to_return(status: 404)
end

it 'does not emit the vm metrics' do
context 'unresponsive agents' do
it 'emits the number of unresponsive agents for each deployment' do
metrics_collector.start
metric = Prometheus::Client.registry.get(:bosh_unresponsive_agents)
expect(metric.values).to be_empty
expect(metric.get(labels: { name: 'flaky_deployment' })).to eq(1)
expect(metric.get(labels: { name: 'good_deployment' })).to eq(0)
end
end

context 'when the health monitor returns a non-json response' do
before do
stub_request(:get, '127.0.0.1:12345/unresponsive_agents')
.to_return(status: 200, body: JSON.dump('bad response'))
context 'when the health monitor returns a non 200 response' do
before do
stub_request(:get, '127.0.0.1:12345/unresponsive_agents')
.to_return(status: 404)
end

it 'does not emit the vm metrics' do
metrics_collector.start
metric = Prometheus::Client.registry.get(:bosh_unresponsive_agents)
expect(metric.values).to be_empty
end
end

it 'does not emit the vm metrics' do
metrics_collector.start
metric = Prometheus::Client.registry.get(:bosh_unresponsive_agents)
expect(metric.values).to be_empty
context 'when the health monitor returns a non-json response' do
before do
stub_request(:get, '127.0.0.1:12345/unresponsive_agents')
.to_return(status: 200, body: JSON.dump('bad response'))
end

it 'does not emit the vm metrics' do
metrics_collector.start
metric = Prometheus::Client.registry.get(:bosh_unresponsive_agents)
expect(metric.values).to be_empty
end
end

context 'when a deployment is deleted after metrics are gathered' do
before do
stub_request(:get, /unresponsive_agents/)
.to_return(status: 200, body: JSON.dump('flaky_deployment' => 1, 'good_deployment' => 0))
metrics_collector.start

stub_request(:get, /unresponsive_agents/)
.to_return(status: 200, body: JSON.dump('good_deployment' => 0))
scheduler.tick
end

it 'resets the metrics for the deleted deployment' do
metric = Prometheus::Client.registry.get(:bosh_unresponsive_agents)
expect(metric.get(labels: { name: 'flaky_deployment' })).to eq(0)
end
end
end

context 'when a deployment is deleted after metrics are gathered' do
before do
stub_request(:get, /unresponsive_agents/)
.to_return(status: 200, body: JSON.dump('flaky_deployment' => 1, 'good_deployment' => 0))
context 'unhealthy instances' do
it 'emits the number of unhealthy instances for each deployment' do
metrics_collector.start
metric = Prometheus::Client.registry.get(:bosh_unhealthy_instances)
expect(metric.get(labels: { name: 'flaky_deployment' })).to eq(1)
expect(metric.get(labels: { name: 'good_deployment' })).to eq(0)
end

stub_request(:get, /unresponsive_agents/)
.to_return(status: 200, body: JSON.dump('good_deployment' => 0))
scheduler.tick
context 'when the health monitor returns a non 200 response' do
before do
stub_request(:get, '127.0.0.1:12345/unhealthy_instances')
.to_return(status: 404)
end

it 'does not emit the vm metrics' do
metrics_collector.start
metric = Prometheus::Client.registry.get(:bosh_unhealthy_instances)
expect(metric.values).to be_empty
end
end

it 'resets the metrics for the deleted deployment' do
metric = Prometheus::Client.registry.get(:bosh_unresponsive_agents)
expect(metric.get(labels: { name: 'flaky_deployment' })).to eq(0)
context 'when the health monitor returns a non-json response' do
before do
stub_request(:get, '127.0.0.1:12345/unhealthy_instances')
.to_return(status: 200, body: JSON.dump('bad response'))
end

it 'does not emit the vm metrics' do
metrics_collector.start
metric = Prometheus::Client.registry.get(:bosh_unhealthy_instances)
expect(metric.values).to be_empty
end
end

context 'when a deployment is deleted after metrics are gathered' do
before do
stub_request(:get, /unhealthy_instances/)
.to_return(status: 200, body: JSON.dump('flaky_deployment' => 1, 'good_deployment' => 0))
metrics_collector.start

stub_request(:get, /unhealthy_instances/)
.to_return(status: 200, body: JSON.dump('good_deployment' => 0))
scheduler.tick
end

it 'resets the metrics for the deleted deployment' do
metric = Prometheus::Client.registry.get(:bosh_unhealthy_instances)
expect(metric.get(labels: { name: 'flaky_deployment' })).to eq(0)
end
end
end
end
Expand Down
10 changes: 9 additions & 1 deletion src/bosh-monitor/lib/bosh/monitor/agent.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ class Agent
attr_reader :discovered_at
attr_accessor :updated_at

ATTRIBUTES = %i[deployment job index instance_id cid].freeze
ATTRIBUTES = %i[deployment job index instance_id cid job_state has_processes].freeze

ATTRIBUTES.each do |attribute|
attr_accessor attribute
Expand All @@ -24,6 +24,8 @@ def initialize(id, opts = {})
@index = opts[:index]
@cid = opts[:cid]
@instance_id = opts[:instance_id]
@job_state = opts[:job_state]
@has_processes = opts[:has_processes]
end

def name
Expand Down Expand Up @@ -51,11 +53,17 @@ def rogue?
(Time.now - @discovered_at) > @intervals.rogue_agent_alert && @deployment.nil?
end

def is_not_running?
@job_state.to_s != 'running' || @has_processes == false
end

def update_instance(instance)
@job = instance.job
@index = instance.index
@cid = instance.cid
@instance_id = instance.id
@job_state = instance.job_state
@has_processes = instance.has_processes
end
end
end
8 changes: 8 additions & 0 deletions src/bosh-monitor/lib/bosh/monitor/api_controller.rb
Original file line number Diff line number Diff line change
Expand Up @@ -40,5 +40,13 @@ def initialize(heartbeat_interval = 1)
status(503)
end
end

get '/unhealthy_instances' do
if @instance_manager.director_initial_deployment_sync_done
JSON.generate(@instance_manager.unhealthy_instances)
else
status(503)
end
end
end
end
56 changes: 52 additions & 4 deletions src/bosh-monitor/lib/bosh/monitor/director.rb
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,52 @@ def get_deployment_instances(name)
parse_json(body, Array)
end

def get_deployment_instances_full(name, recursive_counter=0)
body, status, headers = perform_request(:get, "/deployments/#{name}/instances?format=full")
sleep_amount_seconds = 1
location = headers['location']
unless !location.nil? && location.include?('task')
raise DirectorError, "Can not find 'location' response header to retrieve the task location"
end
counter = 0
# States are documented here: https://bosh.io/docs/director-api-v1/#list-tasks
truthy_states = %w(cancelled cancelling done error timeout)
falsy_states = %w(queued processing)
body = nil, status = nil, state = nil
loop do
counter = counter + 1
body, status = perform_request(:get, location)
if status == 206 || body.nil? || body.empty?
sleep_amount_seconds = counter + sleep_amount_seconds
sleep(sleep_amount_seconds)
next
end
json_output = parse_json(body, Hash)
state = json_output['state']
if truthy_states.include?(state) || counter > 5
@logger.warn("The number of retries to fetch instance details for deployment '#{name}' has exceeded. Could not get the expected response from '#{location}'")
break
end
sleep_amount_seconds = counter + sleep_amount_seconds
sleep(sleep_amount_seconds)
end
updated_body = nil
if state == 'done'
body, status = perform_request(:get, "#{location}/output?type=result")
if status!= 200 || body.nil? || body.empty?
raise DirectorError, "Fetching full instance details for deployment '#{name}' failed"
end
updated_body = "[#{body.chomp.gsub(/\R+/, ',')}]"
return parse_json(updated_body, Array)
else
if recursive_counter > 0
return updated_body, state
end
@logger.warn("Could not fetch instance details for deployment '#{name}' in the first attempt, retrying once more ...")
return get_deployment_instances_full(name, recursive_counter + 1)
end
end

private

def endpoint
Expand All @@ -50,10 +96,12 @@ def parse_json(json, expected_type = nil)
end

def perform_request(method, request_path, options = {})
parsed_endpoint = URI.parse(endpoint + request_path)
headers = {}
if !request_path.nil?
request_path = request_path.sub(endpoint, '')
end
parsed_endpoint = URI.parse(endpoint + (request_path || ''))
headers = options['headers'] || {}
headers['authorization'] = auth_provider.auth_header unless options.fetch(:no_login, false)

ssl_context = OpenSSL::SSL::SSLContext.new
ssl_context.set_params(verify_mode: OpenSSL::SSL::VERIFY_NONE)
async_endpoint = Async::HTTP::Endpoint.parse(parsed_endpoint.to_s, ssl_context: ssl_context)
Expand All @@ -62,7 +110,7 @@ def perform_request(method, request_path, options = {})
body = response.read
status = response.status

[body, status]
[body, status, response.headers]
rescue URI::Error
raise DirectorError, "Invalid URI: #{endpoint + request_path}"
rescue => e
Expand Down
10 changes: 6 additions & 4 deletions src/bosh-monitor/lib/bosh/monitor/instance.rb
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
module Bosh::Monitor
class Instance
attr_reader :id, :agent_id, :job, :index, :cid, :expects_vm
attr_accessor :deployment
attr_reader :id, :agent_id, :job, :index, :cid, :expects_vm, :job_state, :has_processes
attr_accessor :deployment, :job_state

def initialize(instance_data)
@logger = Bosh::Monitor.logger
Expand All @@ -11,6 +11,8 @@ def initialize(instance_data)
@index = instance_data['index']
@cid = instance_data['cid']
@expects_vm = instance_data['expects_vm']
@job_state = instance_data['job_state']
@has_processes = instance_data['has_processes']
end

def self.create(instance_data)
Expand All @@ -30,11 +32,11 @@ def self.create(instance_data)
def name
if @job
identifier = "#{@job}(#{@id})"
attributes = create_optional_attributes(%i[agent_id index])
attributes = create_optional_attributes(%i[agent_id index job_state has_processes])
attributes += create_mandatory_attributes([:cid])
else
identifier = "instance #{@id}"
attributes = create_optional_attributes(%i[agent_id job index cid expects_vm])
attributes = create_optional_attributes(%i[agent_id job index cid expects_vm job_state has_processes])
end

"#{@deployment}: #{identifier} [#{attributes.join(', ')}]"
Expand Down
Loading
Loading