Skip to content

Commit

Permalink
fix(ha-node): use transport info from nvme connect response
Browse files Browse the repository at this point in the history
With tcpFallback option enabled, the csi-node can make a decision to
connect an rdma capable target over tcp if the initiator is not rdma
capable. However, after the NvmeConnectResponse is received at ha-node,
the ha-node doesn't know that fallback could've happened. The ha-node
blindly goes ahead validating the new path presence by matching using
rdma transport parsed from target uri, which it doesn't find because new
path got connected over tcp. This cascades a SubsystemNotFound error up
to ha-cluster agent that reinitiates republish and keeps looping in same
problem.
This fix passes the actual transport used for connect info from csi-node
to ha-node in NvmeConnectResponse, letting ha-node validate using
accurate transport.

Signed-off-by: Diwakar Sharma <[email protected]>
  • Loading branch information
dsharma-dc committed Jan 7, 2025
1 parent 27ab9dc commit e41dad4
Show file tree
Hide file tree
Showing 7 changed files with 83 additions and 20 deletions.
38 changes: 24 additions & 14 deletions control-plane/agents/src/bin/ha/node/server.rs
Original file line number Diff line number Diff line change
Expand Up @@ -211,21 +211,31 @@ impl NodeAgentSvc {
})
.await
{
Ok(_) => match Subsystem::get(
parsed_uri.host(),
&parsed_uri.port(),
parsed_uri.transport(),
parsed_uri.nqn().as_str(),
) {
Ok(subsystem) => {
tracing::info!(new_path, "Successfully connected to NVMe target");
Ok(subsystem)
Ok(resp) => {
let connect_xprt = if let Some(t) = resp.into_inner().transport_used {
TrType::try_from(t).unwrap_or(parsed_uri.transport())
} else {
parsed_uri.transport()
};
match Subsystem::get(
parsed_uri.host(),
&parsed_uri.port(),
connect_xprt,
parsed_uri.nqn().as_str(),
) {
Ok(subsystem) => {
tracing::info!(
new_path,
"Successfully connected({connect_xprt:?}) to NVMe target"
);
Ok(subsystem)
}
Err(error) => Err(SubsystemNotFound {
nqn: nqn.clone(),
details: error.to_string(),
}),
}
Err(error) => Err(SubsystemNotFound {
nqn: nqn.clone(),
details: error.to_string(),
}),
},
}
Err(error) => {
tracing::error!(
new_path,
Expand Down
19 changes: 19 additions & 0 deletions control-plane/csi-driver/src/bin/node/dev.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
//! }
//! ```
use nvmeadm::nvmf_discovery::TrType;
use std::{
collections::HashMap,
convert::TryFrom,
Expand All @@ -50,6 +51,21 @@ use crate::match_dev;

pub(crate) type DeviceName = String;

// Could've directly used NvmfAttach object but it contains an AtomicBool that
// doesn't have Clone implemented.
#[derive(Default)]
pub struct NvmfAttachParameters {
_host: String,
_port: u16,
pub transport: TrType,
}

pub enum AttachParameters {
Nbd,
Iscsi,
Nvmf(NvmfAttachParameters),
}

#[tonic::async_trait]
pub(crate) trait Attach: Sync + Send {
async fn parse_parameters(
Expand All @@ -60,6 +76,9 @@ pub(crate) trait Attach: Sync + Send {
async fn find(&self) -> Result<Option<DeviceName>, DeviceError>;
/// Fixup parameters which cannot be set during attach, eg IO timeout
async fn fixup(&self) -> Result<(), DeviceError>;
/// Get basic connections parameters like target address, transport and port,
/// for this attach object.
fn attach_parameters(&self) -> AttachParameters;
}

#[tonic::async_trait]
Expand Down
6 changes: 5 additions & 1 deletion control-plane/csi-driver/src/bin/node/dev/iscsi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ use uuid::Uuid;

use crate::{dev::util::extract_uuid, match_dev::match_iscsi_device};

use super::{Attach, Detach, DeviceError, DeviceName};
use super::{Attach, AttachParameters, Detach, DeviceError, DeviceName};

mod iscsiadm;
use iscsiadm::IscsiAdmin;
Expand Down Expand Up @@ -170,6 +170,10 @@ impl Attach for IscsiAttach {
async fn fixup(&self) -> Result<(), DeviceError> {
Ok(())
}

fn attach_parameters(&self) -> AttachParameters {
AttachParameters::Iscsi
}
}

pub(super) struct IscsiDetach {
Expand Down
6 changes: 5 additions & 1 deletion control-plane/csi-driver/src/bin/node/dev/nbd.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ use std::{collections::HashMap, convert::TryFrom};

use url::Url;

use super::{Attach, DeviceError, DeviceName};
use super::{Attach, AttachParameters, DeviceError, DeviceName};

pub(super) struct Nbd {
path: String,
Expand Down Expand Up @@ -51,4 +51,8 @@ impl Attach for Nbd {
async fn fixup(&self) -> Result<(), DeviceError> {
Ok(())
}

fn attach_parameters(&self) -> AttachParameters {
AttachParameters::Nbd
}
}
10 changes: 9 additions & 1 deletion control-plane/csi-driver/src/bin/node/dev/nvmf.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ use crate::{
runtime,
};

use super::{Attach, Detach, DeviceError, DeviceName};
use super::{Attach, AttachParameters, Detach, DeviceError, DeviceName};

lazy_static::lazy_static! {
static ref DEVICE_REGEX: Regex = Regex::new(r"nvme(\d{1,5})n(\d{1,5})").unwrap();
Expand Down Expand Up @@ -285,6 +285,14 @@ impl Attach for NvmfAttach {
Err(error) => Err(error),
}
}

fn attach_parameters(&self) -> AttachParameters {
AttachParameters::Nvmf(super::NvmfAttachParameters {
_host: self.host.clone(),
_port: self.port,
transport: self.transport,
})
}
}

pub(super) struct NvmfDetach {
Expand Down
14 changes: 12 additions & 2 deletions control-plane/csi-driver/src/bin/node/nodeplugin_nvme.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use crate::dev::{nvmf::volume_uuid_from_url_str, Device};
use crate::dev::{nvmf::volume_uuid_from_url_str, AttachParameters, Device};
use csi_driver::limiter::VolumeOpGuard;
use grpc::csi_node_nvme::{
nvme_operations_server::NvmeOperations, NvmeConnectRequest, NvmeConnectResponse,
Expand Down Expand Up @@ -42,6 +42,16 @@ impl NvmeOperations for NvmeOperationsSvc {
.await
.map_err(|error| warn!(error=%error, "Failed to do fixup after connect"));

Ok(tonic::Response::new(NvmeConnectResponse {}))
let conn_params = match device.attach_parameters() {
AttachParameters::Nvmf(p) => p,
// Must not reach here.
_ => {
unreachable!("Params must be of variant AttachParameters::Nvmf here.");
}
};

Ok(tonic::Response::new(NvmeConnectResponse {
transport_used: Some(conn_params.transport as i32),
}))
}
}
10 changes: 9 additions & 1 deletion control-plane/grpc/proto/v1/ha/csi_node_nvme.proto
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,16 @@ message NvmeConnectRequest {
common.MapWrapper publish_context = 2;
}

message NvmeConnectResponse {
// chosen enum variant values to match these values as TrType in nvmeadm.
enum TrType {
Invalid = 0;
Rdma = 1;
Fc = 2;
Tcp = 3;
}

message NvmeConnectResponse {
optional TrType transport_used = 1;
}

service NvmeOperations {
Expand Down

0 comments on commit e41dad4

Please sign in to comment.