Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 61 additions & 1 deletion utils/nvme/nvme.go
Original file line number Diff line number Diff line change
Expand Up @@ -300,6 +300,17 @@ func (nh *NVMeHandler) AttachNVMeVolume(

nvmeDev, err := nvmeSubsys.GetNVMeDevice(ctx, publishInfo.NVMeNamespaceUUID)
if err != nil {
// Subsystem is connected but the namespace device is absent, likely a missed namespace
// notification. Trigger a rescan so a subsequent attach retry can find the device.
if connectionStatus == NVMeSubsystemConnected && errors.IsNotFoundError(err) {
if rescanErr := nvmeSubsys.RescanNamespaces(ctx); rescanErr != nil {
Logc(ctx).WithError(rescanErr).Warning(
"Failed to rescan NVMe namespaces after namespace device was not found.")
} else {
Logc(ctx).WithField("namespace", publishInfo.NVMeNamespaceUUID).Debug(
"Triggered NVMe namespace rescan after namespace device was not found.")
}
}
return err
}
devPath := nvmeDev.GetPath()
Expand Down Expand Up @@ -679,6 +690,14 @@ func (nh *NVMeHandler) InspectNVMeSessions(
pubSessionData.SetRemediation(ConnectOp)
subsToFix = append(subsToFix, currSessionData.Subsystem)
continue
case NVMeSubsystemConnected:
// All paths are up but a published namespace may still be missing its device after a
// missed namespace notification. Schedule a rescan to recover it.
if nh.subsystemHasMissingNamespace(ctx, &currSessionData.Subsystem, pubSessionData) {
pubSessionData.SetRemediation(RescanOp)
subsToFix = append(subsToFix, currSessionData.Subsystem)
continue
}
}

// All/None of the paths are present for the subsystem
Expand All @@ -689,6 +708,39 @@ func (nh *NVMeHandler) InspectNVMeSessions(
return subsToFix
}

// subsystemHasMissingNamespace reports whether any namespace published on the subsystem is missing
// its device on the host, a condition a namespace rescan can recover.
//
// It returns true as soon as the device lookup for one of sessionData.Namespaces fails with a
// not-found error. It returns false when sub or sessionData is nil, when the subsystem directory
// (sub.Name) cannot be confirmed to exist, or when no lookup reports a not-found error. Lookup
// errors of any other kind are logged at debug level and are not treated as a missing namespace.
func (nh *NVMeHandler) subsystemHasMissingNamespace(
	ctx context.Context, sub *NVMeSubsystem, sessionData *NVMeSessionData,
) bool {
	if sub == nil || sessionData == nil {
		return false
	}

	// A failed existence check is logged instead of being silently dropped, so that a skipped
	// self-healing pass can be diagnosed from the logs.
	present, err := afero.DirExists(sub.osFs, sub.Name)
	if err != nil {
		Logc(ctx).WithError(err).WithField("subsystem", sub.NQN).Debug(
			"Could not check NVMe subsystem directory during self-healing; skipping namespace check.")
		return false
	}
	if !present {
		return false
	}

	for nsUUID := range sessionData.Namespaces {
		_, lookupErr := sub.GetNVMeDeviceAt(ctx, nsUUID)
		if lookupErr == nil {
			continue
		}

		// Only a not-found result means the device is absent; anything else is inconclusive.
		if errors.IsNotFoundError(lookupErr) {
			Logc(ctx).WithFields(LogFields{
				"subsystem": sub.NQN,
				"namespace": nsUUID,
			}).Warning("Published NVMe namespace has no device on host; scheduling namespace rescan.")
			return true
		}

		Logc(ctx).WithError(lookupErr).WithField("namespace", nsUUID).Debug(
			"Error while checking NVMe namespace device presence during self-healing.")
	}

	return false
}

// RectifyNVMeSession applies the required remediation on the subsystemToFix to make it working again.
func (nh *NVMeHandler) RectifyNVMeSession(
ctx context.Context, subsystemToFix NVMeSubsystem, pubSessions *NVMeSessions,
Expand All @@ -705,12 +757,20 @@ func (nh *NVMeHandler) RectifyNVMeSession(
// Updating the access time as we are trying to do some NVMeOperation on this subsystem.
pubSessionData.LastAccessTime = time.Now()

if pubSessionData.Remediation == ConnectOp {
switch pubSessionData.Remediation {
case ConnectOp:
if err := subsystemToFix.Connect(ctx, pubSessionData.NVMeTargetIPs, true); err != nil {
Logc(ctx).Errorf("NVMe Self healing failed for subsystem %s; %v", subsystemToFix.NQN, err)
} else {
Logc(ctx).Infof("NVMe Self healing succeeded for %s", subsystemToFix.NQN)
}
case RescanOp:
if err := subsystemToFix.RescanNamespaces(ctx); err != nil {
Logc(ctx).Errorf("NVMe Self healing (namespace rescan) failed for subsystem %s; %v",
subsystemToFix.NQN, err)
} else {
Logc(ctx).Infof("NVMe Self healing (namespace rescan) succeeded for %s", subsystemToFix.NQN)
}
}
}

Expand Down
7 changes: 7 additions & 0 deletions utils/nvme/nvme_darwin.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,13 @@ func (s *NVMeSubsystem) DisconnectSubsystemFromHost(ctx context.Context) error {
return errors.UnsupportedError("DisconnectSubsystemFromHost is not supported for darwin")
}

// RescanNamespaces is the darwin counterpart of the namespace re-enumeration call. It logs entry
// and exit at debug level and always returns an unsupported error.
func (s *NVMeSubsystem) RescanNamespaces(ctx context.Context) error {
	Logc(ctx).Debug(">>>> nvme_darwin.RescanNamespaces")
	defer Logc(ctx).Debug("<<<< nvme_darwin.RescanNamespaces")

	unsupported := errors.UnsupportedError("RescanNamespaces is not supported for darwin")
	return unsupported
}

func (nh *NVMeHandler) GetNVMeSubsystem(ctx context.Context, nqn string) (*NVMeSubsystem,
error,
) {
Expand Down
31 changes: 31 additions & 0 deletions utils/nvme/nvme_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -351,6 +351,37 @@ func (s *NVMeSubsystem) GetNVMeDeviceAt(ctx context.Context, nsUUID string) (*NV
return nil, errors.NotFoundError("no device found for the given namespace %v", nsUUID)
}

// RescanNamespaces runs "nvme ns-rescan" against the /dev controller node of every path of the
// subsystem so that namespaces which are mapped but not yet visible on the host get enumerated.
//
// It returns an error when the subsystem has no paths, or when the rescan command failed on every
// path; a single successful rescan is enough for a nil result. Per-controller failures are logged
// and do not stop the loop.
//
// NOTE(review): the rescan is expected to only add missing namespaces without renaming existing
// devices, which is why it is issued on a connected subsystem; confirm against nvme-cli docs.
func (s *NVMeSubsystem) RescanNamespaces(ctx context.Context) error {
	Logc(ctx).Debug(">>>> nvme_linux.RescanNamespaces")
	defer Logc(ctx).Debug("<<<< nvme_linux.RescanNamespaces")

	if len(s.Paths) == 0 {
		return fmt.Errorf("no paths present for subsystem %s; cannot rescan namespaces", s.NQN)
	}

	succeeded := 0
	for _, p := range s.Paths {
		// Keep only the last element of the path name (e.g. .../nvme0 -> nvme0) and address the
		// controller through its node under /dev.
		lastSlash := strings.LastIndex(p.Name, "/")
		devNode := "/dev/" + p.Name[lastSlash+1:]

		_, execErr := s.command.Execute(ctx, "nvme", "ns-rescan", devNode)
		if execErr == nil {
			Logc(ctx).WithField("controller", devNode).Debug("Rescanned NVMe namespaces on controller.")
			succeeded++
			continue
		}

		Logc(ctx).WithError(execErr).Errorf("Failed to rescan namespaces on controller %s.", devNode)
	}

	// One successful controller rescan is sufficient; only a total failure is reported.
	if succeeded > 0 {
		return nil
	}

	return fmt.Errorf("failed to rescan namespaces on all paths for subsystem %s", s.NQN)
}

// FlushNVMeDevice flushes any ongoing IOs present on the NVMe device.
func (d *NVMeDevice) FlushNVMeDevice(ctx context.Context) error {
Logc(ctx).Debug(">>>> nvme_linux.FlushNVMeDevice")
Expand Down
2 changes: 2 additions & 0 deletions utils/nvme/nvme_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@ type NVMeOperation int8
// Remediation operations that self-healing can schedule for a published NVMe session.
// The numeric values come from iota, so the declaration order is significant.
const (
// NoOp is the zero value of NVMeOperation.
NoOp NVMeOperation = iota
// ConnectOp is the remediation for which RectifyNVMeSession calls Connect on the subsystem.
ConnectOp
// RescanOp is the remediation for which RectifyNVMeSession calls RescanNamespaces on the
// subsystem; InspectNVMeSessions sets it when a connected subsystem has a published namespace
// without a device on the host.
RescanOp
)

// NVMeSessionData contains all the information related to any NVMe session. It has the subsystem information, the
Expand All @@ -136,6 +137,7 @@ type NVMeSessions struct {
type NVMeSubsystemInterface interface {
GetConnectionStatus() NVMeSubsystemConnectionStatus
Connect(ctx context.Context, nvmeTargetIps []string, connectOnly bool) error
RescanNamespaces(ctx context.Context) error
Disconnect(ctx context.Context) error
GetNamespaceCount(ctx context.Context) (int, error)
IsNetworkPathPresent(ip string) bool
Expand Down
7 changes: 7 additions & 0 deletions utils/nvme/nvme_windows.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,13 @@ func (s *NVMeSubsystem) DisconnectSubsystemFromHost(ctx context.Context) error {
return errors.UnsupportedError("DisconnectSubsystemFromHost is not supported for windows")
}

// RescanNamespaces is the windows counterpart of the namespace re-enumeration call. It logs entry
// and exit at debug level and always returns an unsupported error.
func (s *NVMeSubsystem) RescanNamespaces(ctx context.Context) error {
	Logc(ctx).Debug(">>>> nvme_windows.RescanNamespaces")
	defer Logc(ctx).Debug("<<<< nvme_windows.RescanNamespaces")

	unsupported := errors.UnsupportedError("RescanNamespaces is not supported for windows")
	return unsupported
}

func (nh *NVMeHandler) GetNVMeSubsystem(ctx context.Context, nqn string) (*NVMeSubsystem, error) {
Logc(ctx).Debug(">>>> nvme_windows.GetNVMeSubsystem")
defer Logc(ctx).Debug("<<<< nvme_windows.GetNVMeSubsystem")
Expand Down