Add metrics for other unhealthy VM states #2597

Open

wants to merge 5 commits into base: main
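This PR extends the director's metrics collector and the health monitor so that, in addition to unresponsive agents, instances whose job state is not `running` (or that report no running processes) are counted per deployment. The collector registers a second gauge, `bosh_unhealthy_instances`, the health monitor exposes a matching `/unhealthy_instances` endpoint, and the monitor's `Agent`/`Instance` models gain `job_state` and `has_processes` so per-instance health can be derived. A rough sketch of the end-to-end data flow; the deployment names and counts below are invented for illustration, not taken from this PR:

```ruby
require 'prometheus/client'

# Illustrative only: deployment names and counts are made up.
unhealthy = Prometheus::Client.registry.gauge(
  :bosh_unhealthy_instances,
  labels: %i[name],
  docstring: 'Number of unhealthy instances per deployment',
)

# The health monitor serves per-deployment counts as JSON, e.g.
#   GET /unhealthy_instances  =>  {"cf": 2, "zookeeper": 0}
# and the director's MetricsCollector turns each entry into a gauge sample:
{ 'cf' => 2, 'zookeeper' => 0 }.each do |deployment, count|
  unhealthy.set(count, labels: { name: deployment })
end

# A Prometheus scrape of the director then sees samples such as:
#   bosh_unhealthy_instances{name="cf"} 2
#   bosh_unhealthy_instances{name="zookeeper"} 0
```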
Changes from 4 commits
28 changes: 18 additions & 10 deletions src/bosh-director/lib/bosh/director/metrics_collector.rb
@@ -42,6 +42,13 @@ def initialize(config)
labels: %i[name],
docstring: 'Number of unresponsive agents per deployment',
)

@unhealthy_instances = Prometheus::Client.registry.gauge(
:bosh_unhealthy_instances,
labels: %i[name],
docstring: 'Number of unhealthy instances per deployment',
)

@scheduler = Rufus::Scheduler.new
end

@@ -109,32 +116,33 @@ def populate_metrics

populate_network_metrics

populate_vm_metrics
populate_vm_metrics('/unresponsive_agents', @unresponsive_agents)
populate_vm_metrics('/unhealthy_instances', @unhealthy_instances)

@logger.info('populated metrics')
end

def populate_vm_metrics
response = Net::HTTP.get_response('127.0.0.1', '/unresponsive_agents', @config.health_monitor_port)
def populate_vm_metrics(metric, metric_call)
response = Net::HTTP.get_response('127.0.0.1', metric, @config.health_monitor_port)
return unless response.is_a?(Net::HTTPSuccess)

unresponsive_agent_counts = JSON.parse(response.body)
return unless unresponsive_agent_counts.is_a?(Hash)
metric_counts = JSON.parse(response.body)
return unless metric_counts.is_a?(Hash)

existing_deployment_names = @unresponsive_agents.values.map do |key, _|
existing_deployment_names = metric_call.values.map do |key, _|
# The keys of the hash returned by Prometheus::Client::Metric#values are themselves
# label hashes, so the data looks like:
# { { name: "deployment_a" } => 10, { name: "deployment_b" } => 0, ... }
key[:name]
end

unresponsive_agent_counts.each do |deployment, count|
@unresponsive_agents.set(count, labels: { name: deployment })
metric_counts.each do |deployment, count|
metric_call.set(count, labels: { name: deployment })
end

removed_deployments = existing_deployment_names - unresponsive_agent_counts.keys
removed_deployments = existing_deployment_names - metric_counts.keys
removed_deployments.each do |deployment|
@unresponsive_agents.set(0, labels: { name: deployment })
metric_call.set(0, labels: { name: deployment })
end
end

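With `populate_vm_metrics` parameterized on the health-monitor path and the gauge to populate, a further VM-state metric would only need another gauge registration and one extra call. A sketch of what that extension point could look like; the `/missing_vms` path and `:bosh_missing_vms` name are hypothetical, not part of this PR:

```ruby
# Hypothetical example only: this gauge and endpoint are not added by this PR.
@missing_vms = Prometheus::Client.registry.gauge(
  :bosh_missing_vms,
  labels: %i[name],
  docstring: 'Number of missing VMs per deployment',
)

# ...and in populate_metrics:
populate_vm_metrics('/missing_vms', @missing_vms)
```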
125 changes: 91 additions & 34 deletions src/bosh-director/spec/unit/metrics_collector_spec.rb
@@ -35,6 +35,8 @@ def tick
allow(Api::ConfigManager).to receive(:deploy_config_enabled?).and_return(true, false)
stub_request(:get, /unresponsive_agents/)
.to_return(status: 200, body: JSON.dump('flaky_deployment' => 1, 'good_deployment' => 0))
stub_request(:get, /unhealthy_instances/)
.to_return(status: 200, body: JSON.dump('flaky_deployment' => 1, 'good_deployment' => 0))
end

after do
@@ -44,6 +46,7 @@ def tick
Prometheus::Client.registry.unregister(:bosh_networks_dynamic_ips_total)
Prometheus::Client.registry.unregister(:bosh_networks_dynamic_free_ips_total)
Prometheus::Client.registry.unregister(:bosh_unresponsive_agents)
Prometheus::Client.registry.unregister(:bosh_unhealthy_instances)
end

describe '#prep' do
@@ -286,53 +289,107 @@ def tick
end

describe 'vm metrics' do
it 'emits the number of unresponsive agents for each deployment' do
metrics_collector.start
metric = Prometheus::Client.registry.get(:bosh_unresponsive_agents)
expect(metric.get(labels: { name: 'flaky_deployment' })).to eq(1)
expect(metric.get(labels: { name: 'good_deployment' })).to eq(0)
end

context 'when the health monitor returns a non 200 response' do
before do
stub_request(:get, '127.0.0.1:12345/unresponsive_agents')
.to_return(status: 404)
end

it 'does not emit the vm metrics' do
context 'unresponsive agents' do
it 'emits the number of unresponsive agents for each deployment' do
metrics_collector.start
metric = Prometheus::Client.registry.get(:bosh_unresponsive_agents)
expect(metric.values).to be_empty
expect(metric.get(labels: { name: 'flaky_deployment' })).to eq(1)
expect(metric.get(labels: { name: 'good_deployment' })).to eq(0)
end
end

context 'when the health monitor returns a non-json response' do
before do
stub_request(:get, '127.0.0.1:12345/unresponsive_agents')
.to_return(status: 200, body: JSON.dump('bad response'))
context 'when the health monitor returns a non 200 response' do
before do
stub_request(:get, '127.0.0.1:12345/unresponsive_agents')
.to_return(status: 404)
end

it 'does not emit the vm metrics' do
metrics_collector.start
metric = Prometheus::Client.registry.get(:bosh_unresponsive_agents)
expect(metric.values).to be_empty
end
end

it 'does not emit the vm metrics' do
metrics_collector.start
metric = Prometheus::Client.registry.get(:bosh_unresponsive_agents)
expect(metric.values).to be_empty
context 'when the health monitor returns a non-json response' do
before do
stub_request(:get, '127.0.0.1:12345/unresponsive_agents')
.to_return(status: 200, body: JSON.dump('bad response'))
end

it 'does not emit the vm metrics' do
metrics_collector.start
metric = Prometheus::Client.registry.get(:bosh_unresponsive_agents)
expect(metric.values).to be_empty
end
end

context 'when a deployment is deleted after metrics are gathered' do
before do
stub_request(:get, /unresponsive_agents/)
.to_return(status: 200, body: JSON.dump('flaky_deployment' => 1, 'good_deployment' => 0))
metrics_collector.start

stub_request(:get, /unresponsive_agents/)
.to_return(status: 200, body: JSON.dump('good_deployment' => 0))
scheduler.tick
end

it 'resets the metrics for the deleted deployment' do
metric = Prometheus::Client.registry.get(:bosh_unresponsive_agents)
expect(metric.get(labels: { name: 'flaky_deployment' })).to eq(0)
end
end
end

context 'when a deployment is deleted after metrics are gathered' do
before do
stub_request(:get, /unresponsive_agents/)
.to_return(status: 200, body: JSON.dump('flaky_deployment' => 1, 'good_deployment' => 0))
context 'unhealthy instances' do
it 'emits the number of unhealthy instances for each deployment' do
metrics_collector.start
metric = Prometheus::Client.registry.get(:bosh_unhealthy_instances)
expect(metric.get(labels: { name: 'flaky_deployment' })).to eq(1)
expect(metric.get(labels: { name: 'good_deployment' })).to eq(0)
end

stub_request(:get, /unresponsive_agents/)
.to_return(status: 200, body: JSON.dump('good_deployment' => 0))
scheduler.tick
context 'when the health monitor returns a non 200 response' do
before do
stub_request(:get, '127.0.0.1:12345/unhealthy_instances')
.to_return(status: 404)
end

it 'does not emit the vm metrics' do
metrics_collector.start
metric = Prometheus::Client.registry.get(:bosh_unhealthy_instances)
expect(metric.values).to be_empty
end
end

it 'resets the metrics for the deleted deployment' do
metric = Prometheus::Client.registry.get(:bosh_unresponsive_agents)
expect(metric.get(labels: { name: 'flaky_deployment' })).to eq(0)
context 'when the health monitor returns a non-json response' do
before do
stub_request(:get, '127.0.0.1:12345/unhealthy_instances')
.to_return(status: 200, body: JSON.dump('bad response'))
end

it 'does not emit the vm metrics' do
metrics_collector.start
metric = Prometheus::Client.registry.get(:bosh_unhealthy_instances)
expect(metric.values).to be_empty
end
end

context 'when a deployment is deleted after metrics are gathered' do
before do
stub_request(:get, /unhealthy_instances/)
.to_return(status: 200, body: JSON.dump('flaky_deployment' => 1, 'good_deployment' => 0))
metrics_collector.start

stub_request(:get, /unhealthy_instances/)
.to_return(status: 200, body: JSON.dump('good_deployment' => 0))
scheduler.tick
end

it 'resets the metrics for the deleted deployment' do
metric = Prometheus::Client.registry.get(:bosh_unhealthy_instances)
expect(metric.get(labels: { name: 'flaky_deployment' })).to eq(0)
end
end
end
end
10 changes: 9 additions & 1 deletion src/bosh-monitor/lib/bosh/monitor/agent.rb
@@ -4,7 +4,7 @@ class Agent
attr_reader :discovered_at
attr_accessor :updated_at

ATTRIBUTES = %i[deployment job index instance_id cid].freeze
ATTRIBUTES = %i[deployment job index instance_id cid job_state has_processes].freeze

ATTRIBUTES.each do |attribute|
attr_accessor attribute
@@ -24,6 +24,8 @@ def initialize(id, opts = {})
@index = opts[:index]
@cid = opts[:cid]
@instance_id = opts[:instance_id]
@job_state = opts[:job_state]
@has_processes = opts[:has_processes]
end

def name
@@ -51,11 +53,17 @@ def rogue?
(Time.now - @discovered_at) > @intervals.rogue_agent_alert && @deployment.nil?
end

def is_not_running?
@job_state.to_s != 'running' || @has_processes == false
end

def update_instance(instance)
@job = instance.job
@index = instance.index
@cid = instance.cid
@instance_id = instance.id
@job_state = instance.job_state
@has_processes = instance.has_processes
end
end
end
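`Agent#is_not_running?` is the per-instance health check that the new endpoint below builds on. The per-deployment aggregation lives in the instance manager, which is not shown in this excerpt; the following is only a rough sketch of how such a tally could be derived, and both the method body and the `@deployment_name_to_agents` structure are assumptions, not the PR's actual implementation:

```ruby
# Sketch only: the real aggregation is in the instance manager and may differ;
# @deployment_name_to_agents is an assumed internal structure.
def unhealthy_instances
  counts = {}
  @deployment_name_to_agents.each do |deployment, agents|
    counts[deployment] = agents.count(&:is_not_running?)
  end
  counts
end
```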
8 changes: 8 additions & 0 deletions src/bosh-monitor/lib/bosh/monitor/api_controller.rb
@@ -40,5 +40,13 @@ def initialize(heartbeat_interval = 1)
status(503)
end
end

get '/unhealthy_instances' do
if @instance_manager.director_initial_deployment_sync_done
JSON.generate(@instance_manager.unhealthy_instances)
else
status(503)
end
end
end
end
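The new route mirrors the existing `/unresponsive_agents` one, including the 503 returned until the initial deployment sync has completed. A small way to poke it from Ruby; the port (12345) matches the one stubbed in the specs, while in the director it comes from `@config.health_monitor_port`:

```ruby
require 'net/http'
require 'json'

# 12345 is the port stubbed in the specs; substitute the configured health monitor port.
response = Net::HTTP.get_response('127.0.0.1', '/unhealthy_instances', 12345)
if response.is_a?(Net::HTTPSuccess)
  counts = JSON.parse(response.body) # e.g. {"flaky_deployment" => 1, "good_deployment" => 0}
  counts.each { |deployment, count| puts "#{deployment}: #{count}" }
else
  # The route answers 503 until director_initial_deployment_sync_done is true.
  puts "health monitor not ready yet (HTTP #{response.code})"
end
```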
56 changes: 52 additions & 4 deletions src/bosh-monitor/lib/bosh/monitor/director.rb
@@ -31,6 +31,52 @@ def get_deployment_instances(name)
parse_json(body, Array)
end

def get_deployment_instances_full(name, recursive_counter=0)
body, status, headers = perform_request(:get, "/deployments/#{name}/instances?format=full")
sleep_amount_seconds = 1
location = headers['location']
if location.nil? || !location.include?('task')
raise DirectorError, "Cannot find any task to retrieve vm stats"
end
counter = 0
# States are documented here: https://bosh.io/docs/director-api-v1/#list-tasks
truthy_states = %w(cancelled cancelling done error timeout)
falsy_states = %w(queued processing)
body = status = state = nil
loop do
counter = counter + 1
body, status = perform_request(:get, location)
if status == 206 || body.nil? || body.empty?
sleep_amount_seconds = counter + sleep_amount_seconds
sleep(sleep_amount_seconds)
next
end
json_output = parse_json(body, Hash)
state = json_output['state']
if truthy_states.include?(state) || counter > 5
@logger.warn("The number of retries to fetch instance details for deployment #{name} has exceeded. Could not get the expected response from #{location}")
break
end
sleep_amount_seconds = counter + sleep_amount_seconds
sleep(sleep_amount_seconds)
end
updated_body = nil
if state == 'done'
body, status = perform_request(:get, "#{location}/output?type=result")
if status != 200 || body.nil? || body.empty?
raise DirectorError, "Fetching full instance details for deployment #{name} failed"
end
updated_body = "[#{body.chomp.gsub(/\R+/, ',')}]"
return parse_json(updated_body, Array)
else
if recursive_counter > 0
return updated_body, state
end
@logger.warn("Could not fetch instance details for deployment #{name} in the first attempt, retrying once more ...")
return get_deployment_instances_full(name, recursive_counter + 1)
end
end

private

def endpoint
@@ -50,10 +96,12 @@ def parse_json(json, expected_type = nil)
end

def perform_request(method, request_path, options = {})
parsed_endpoint = URI.parse(endpoint + request_path)
headers = {}
if !request_path.nil?
request_path = request_path.sub(endpoint, '')
end
parsed_endpoint = URI.parse(endpoint + (request_path || ''))
headers = options['headers'] || {}
headers['authorization'] = auth_provider.auth_header unless options.fetch(:no_login, false)

ssl_context = OpenSSL::SSL::SSLContext.new
ssl_context.set_params(verify_mode: OpenSSL::SSL::VERIFY_NONE)
async_endpoint = Async::HTTP::Endpoint.parse(parsed_endpoint.to_s, ssl_context: ssl_context)
@@ -62,7 +110,7 @@ def perform_request(method, request_path, options = {})
body = response.read
status = response.status

[body, status]
[body, status, response.headers]
rescue URI::Error
raise DirectorError, "Invalid URI: #{endpoint + request_path}"
rescue => e
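For context on `get_deployment_instances_full`: requesting `?format=full` makes the director queue a task whose URL comes back in the `Location` header; the method polls that task until it reaches a terminal state and then reads `/output?type=result`, which (as the wrapping code above implies) returns one JSON document per line. A minimal standalone illustration of that last step; the instance records are invented for the example:

```ruby
require 'json'

# Two made-up result lines, in the newline-delimited JSON shape the task output uses.
raw = <<~RESULT
  {"job":"web","index":0,"job_state":"running","has_processes":true}
  {"job":"web","index":1,"job_state":"failing","has_processes":true}
RESULT

# Same transformation as in get_deployment_instances_full: wrap in [] and turn
# line breaks into commas so the whole output parses as one JSON array.
instances = JSON.parse("[#{raw.chomp.gsub(/\R+/, ',')}]")
instances.count { |i| i['job_state'] != 'running' }  # => 1
```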
10 changes: 6 additions & 4 deletions src/bosh-monitor/lib/bosh/monitor/instance.rb
@@ -1,7 +1,7 @@
module Bosh::Monitor
class Instance
attr_reader :id, :agent_id, :job, :index, :cid, :expects_vm
attr_accessor :deployment
attr_reader :id, :agent_id, :job, :index, :cid, :expects_vm, :has_processes
attr_accessor :deployment, :job_state

def initialize(instance_data)
@logger = Bosh::Monitor.logger
@@ -11,6 +11,8 @@ def initialize(instance_data)
@index = instance_data['index']
@cid = instance_data['cid']
@expects_vm = instance_data['expects_vm']
@job_state = instance_data['job_state']
@has_processes = instance_data['has_processes']
end

def self.create(instance_data)
@@ -30,11 +32,11 @@ def self.create(instance_data)
def name
if @job
identifier = "#{@job}(#{@id})"
attributes = create_optional_attributes(%i[agent_id index])
attributes = create_optional_attributes(%i[agent_id index job_state has_processes])
attributes += create_mandatory_attributes([:cid])
else
identifier = "instance #{@id}"
attributes = create_optional_attributes(%i[agent_id job index cid expects_vm])
attributes = create_optional_attributes(%i[agent_id job index cid expects_vm job_state has_processes])
end

"#{@deployment}: #{identifier} [#{attributes.join(', ')}]"