From 26875cbbf65ba3755a43159c95b5391f9593a82f Mon Sep 17 00:00:00 2001 From: Abhilash Shetty Date: Wed, 22 Jan 2025 18:43:17 +0000 Subject: [PATCH] fix(reconciler/volume): disown replica from disowned volumes nexus While we disown a replica from volume we used to validate if the replica is owned by any nexus. In some cases involving Outage on volume target node we observed that the new nexus do get created after the volumeattachments are deleted from the node. however, we werent able to disown the unknown replica from volume since it was still owned by old target. Now we check if replica is owned by any of the current target of the volume rather then if it is owned by any target. Signed-off-by: Abhilash Shetty --- .../reconciler/volume/garbage_collector.rs | 15 ++++++++------- .../stor-port/src/types/v0/store/replica.rs | 12 ++++++++++-- 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/control-plane/agents/src/bin/core/controller/reconciler/volume/garbage_collector.rs b/control-plane/agents/src/bin/core/controller/reconciler/volume/garbage_collector.rs index 38b4ab568..a251eb856 100644 --- a/control-plane/agents/src/bin/core/controller/reconciler/volume/garbage_collector.rs +++ b/control-plane/agents/src/bin/core/controller/reconciler/volume/garbage_collector.rs @@ -10,7 +10,7 @@ use crate::controller::{ use agents::errors::SvcError; use stor_port::types::v0::{ store::{nexus_persistence::NexusInfo, replica::ReplicaSpec, volume::VolumeSpec}, - transport::{DestroyVolume, NexusOwners, ReplicaOwners, ResizeReplica, VolumeStatus}, + transport::{DestroyVolume, NexusId, NexusOwners, ReplicaOwners, ResizeReplica, VolumeStatus}, }; use tracing::Instrument; @@ -150,7 +150,6 @@ async fn disown_unused_nexuses( context: &PollContext, ) -> PollResult { let mut results = vec![]; - for nexus in context.specs().volume_nexuses(volume.uuid()) { if let Ok(mut nexus) = nexus.operation_guard() { if let Some(target) = volume.as_ref().target() { @@ -158,8 +157,9 @@ async fn disown_unused_nexuses( continue; } } - - nexus.warn_span(|| tracing::warn!("Attempting to disown unused nexus")); + nexus.info_span(|| { + tracing::info!("Disowning unused nexus from its volume"); + }); // the nexus garbage collector will destroy the disowned nexuses let owner = NexusOwners::Volume(volume.uuid().clone()); match nexus.remove_owners(context.registry(), &owner, false).await { @@ -206,7 +206,8 @@ async fn disown_unused_replicas( // don't attempt to disown the replicas if the volume state is faulted or unknown return PollResult::Ok(PollerState::Busy); } - + let vol_nexus = context.specs().volume_nexuses(volume.uuid()); + let vol_nexus_ids: Vec<&NexusId> = vol_nexus.iter().map(|n| n.uuid()).collect(); let mut nexus_info = None; // defer reading from the persistent store unless we find a candidate let mut results = vec![]; @@ -221,12 +222,12 @@ async fn disown_unused_replicas( if !replica_online && !replica.as_ref().dirty() && replica.as_ref().owners.owned_by(volume.uuid()) - && !replica.as_ref().owners.owned_by_a_nexus() + && !replica.as_ref().owned_by_nexus(&vol_nexus_ids) && !replica_in_target && !is_replica_healthy(context, &mut nexus_info, replica.as_ref(), volume.as_ref()) .await? { - volume.warn_span(|| tracing::warn!(replica.uuid = %replica.as_ref().uuid, "Attempting to disown unused replica")); + volume.info_span(|| tracing::info!(replica.uuid = %replica.as_ref().uuid, "Attempting to disown unused replica")); // the replica garbage collector will destroy the disowned replica match replica diff --git a/control-plane/stor-port/src/types/v0/store/replica.rs b/control-plane/stor-port/src/types/v0/store/replica.rs index 1f42a81bf..a256b2830 100644 --- a/control-plane/stor-port/src/types/v0/store/replica.rs +++ b/control-plane/stor-port/src/types/v0/store/replica.rs @@ -8,8 +8,9 @@ use crate::{ AsOperationSequencer, OperationSequence, SpecStatus, SpecTransaction, }, transport::{ - self, CreateReplica, HostNqn, PoolId, PoolUuid, Protocol, ReplicaId, ReplicaKind, - ReplicaName, ReplicaOwners, ReplicaShareProtocol, SnapshotCloneSpecParams, VolumeId, + self, CreateReplica, HostNqn, NexusId, PoolId, PoolUuid, Protocol, ReplicaId, + ReplicaKind, ReplicaName, ReplicaOwners, ReplicaShareProtocol, SnapshotCloneSpecParams, + VolumeId, }, }, IntoOption, @@ -109,6 +110,13 @@ impl ReplicaSpec { pub fn owned_by(&self, id: &VolumeId) -> bool { self.owners.owned_by(id) } + /// Check if this replica is owned by any of the nexus in volume spec. + pub fn owned_by_nexus(&self, nexus_id: &[&NexusId]) -> bool { + self.owners + .nexuses() + .iter() + .any(|owner| nexus_id.contains(&owner)) + } } /// Reference of a pool.