nfs/blocklayout: Fix premature PR key unregistration
author Chuck Lever <chuck.lever@oracle.com>
Tue, 25 Jun 2024 20:02:06 +0000 (16:02 -0400)
committer Anna Schumaker <Anna.Schumaker@Netapp.com>
Mon, 8 Jul 2024 17:47:27 +0000 (13:47 -0400)
During generic/069 runs with pNFS SCSI layouts, the NFS client emits
the following in the system journal:

kernel: pNFS: failed to open device /dev/disk/by-id/dm-uuid-mpath-0x6001405e3366f045b7949eb8e4540b51 (-2)
kernel: pNFS: using block device sdb (reservation key 0x666b60901e7b26b3)
kernel: pNFS: failed to open device /dev/disk/by-id/dm-uuid-mpath-0x6001405e3366f045b7949eb8e4540b51 (-2)
kernel: pNFS: using block device sdb (reservation key 0x666b60901e7b26b3)
kernel: sd 6:0:0:1: reservation conflict
kernel: sd 6:0:0:1: [sdb] tag#16 FAILED Result: hostbyte=DID_OK driverbyte=DRIVER_OK cmd_age=0s
kernel: sd 6:0:0:1: [sdb] tag#16 CDB: Write(10) 2a 00 00 00 00 50 00 00 08 00
kernel: reservation conflict error, dev sdb, sector 80 op 0x1:(WRITE) flags 0x0 phys_seg 1 prio class 2
kernel: sd 6:0:0:1: reservation conflict
kernel: sd 6:0:0:1: reservation conflict
kernel: sd 6:0:0:1: [sdb] tag#18 FAILED Result: hostbyte=DID_OK driverbyte=DRIVER_OK cmd_age=0s
kernel: sd 6:0:0:1: [sdb] tag#17 FAILED Result: hostbyte=DID_OK driverbyte=DRIVER_OK cmd_age=0s
kernel: sd 6:0:0:1: [sdb] tag#18 CDB: Write(10) 2a 00 00 00 00 60 00 00 08 00
kernel: sd 6:0:0:1: [sdb] tag#17 CDB: Write(10) 2a 00 00 00 00 58 00 00 08 00
kernel: reservation conflict error, dev sdb, sector 96 op 0x1:(WRITE) flags 0x0 phys_seg 1 prio class 0
kernel: reservation conflict error, dev sdb, sector 88 op 0x1:(WRITE) flags 0x0 phys_seg 1 prio class 0
systemd[1]: fstests-generic-069.scope: Deactivated successfully.
systemd[1]: fstests-generic-069.scope: Consumed 5.092s CPU time.
systemd[1]: media-test.mount: Deactivated successfully.
systemd[1]: media-scratch.mount: Deactivated successfully.
kernel: sd 6:0:0:1: reservation conflict
kernel: failed to unregister PR key.

This appears to be due to a race. bl_alloc_lseg() calls this:

561 static struct nfs4_deviceid_node *
562 bl_find_get_deviceid(struct nfs_server *server,
563                 const struct nfs4_deviceid *id, const struct cred *cred,
564                 gfp_t gfp_mask)
565 {
566         struct nfs4_deviceid_node *node;
567         unsigned long start, end;
568
569 retry:
570         node = nfs4_find_get_deviceid(server, id, cred, gfp_mask);
571         if (!node)
572                 return ERR_PTR(-ENODEV);

nfs4_find_get_deviceid() does a lookup without the spin lock first.
If it can't find a matching deviceid, it creates a new device_info
(which calls bl_alloc_deviceid_node, and that registers the device's
PR key).

Then it takes the nfs4_deviceid_lock and looks up the deviceid again.
If it finds it this time, nfs4_find_get_deviceid() frees the spare
(new) device_info, which unregisters the PR key for the same device.

Any subsequent I/O from this client on that device gets EBADE.

The umount later unregisters the device's PR key again.

To prevent this problem, register the PR key after the deviceid_node
lookup.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Benjamin Coddington <bcodding@redhat.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
fs/nfs/blocklayout/blocklayout.c
fs/nfs/blocklayout/blocklayout.h
fs/nfs/blocklayout/dev.c

index 6be13e0..0becdec 100644 (file)
@@ -564,25 +564,32 @@ bl_find_get_deviceid(struct nfs_server *server,
                gfp_t gfp_mask)
 {
        struct nfs4_deviceid_node *node;
-       unsigned long start, end;
+       int err = -ENODEV;
 
 retry:
        node = nfs4_find_get_deviceid(server, id, cred, gfp_mask);
        if (!node)
                return ERR_PTR(-ENODEV);
 
-       if (test_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags) == 0)
-               return node;
+       if (test_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags)) {
+               unsigned long end = jiffies;
+               unsigned long start = end - PNFS_DEVICE_RETRY_TIMEOUT;
 
-       end = jiffies;
-       start = end - PNFS_DEVICE_RETRY_TIMEOUT;
-       if (!time_in_range(node->timestamp_unavailable, start, end)) {
-               nfs4_delete_deviceid(node->ld, node->nfs_client, id);
-               goto retry;
+               if (!time_in_range(node->timestamp_unavailable, start, end)) {
+                       nfs4_delete_deviceid(node->ld, node->nfs_client, id);
+                       goto retry;
+               }
+               goto out_put;
        }
 
+       if (!bl_register_dev(container_of(node, struct pnfs_block_dev, node)))
+               goto out_put;
+
+       return node;
+
+out_put:
        nfs4_put_deviceid_node(node);
-       return ERR_PTR(-ENODEV);
+       return ERR_PTR(err);
 }
 
 static int
index f1eeb49..6da40ca 100644 (file)
@@ -104,20 +104,26 @@ struct pnfs_block_dev {
        u64                             start;
        u64                             len;
 
+       enum pnfs_block_volume_type     type;
        u32                             nr_children;
        struct pnfs_block_dev           *children;
        u64                             chunk_size;
 
        struct file                     *bdev_file;
        u64                             disk_offset;
+       unsigned long                   flags;
 
        u64                             pr_key;
-       bool                            pr_registered;
 
        bool (*map)(struct pnfs_block_dev *dev, u64 offset,
                        struct pnfs_block_dev_map *map);
 };
 
+/* pnfs_block_dev flag bits */
+enum {
+       PNFS_BDEV_REGISTERED = 0,
+};
+
 /* sector_t fields are all in 512-byte sectors */
 struct pnfs_block_extent {
        union {
@@ -172,6 +178,7 @@ struct bl_msg_hdr {
 #define BL_DEVICE_REQUEST_ERR          0x2 /* User level process fails */
 
 /* dev.c */
+bool bl_register_dev(struct pnfs_block_dev *d);
 struct nfs4_deviceid_node *bl_alloc_deviceid_node(struct nfs_server *server,
                struct pnfs_device *pdev, gfp_t gfp_mask);
 void bl_free_deviceid_node(struct nfs4_deviceid_node *d);
index 93ef7f8..1785162 100644 (file)
 
 #define NFSDBG_FACILITY                NFSDBG_PNFS_LD
 
+/*
+ * Release the reservation key previously registered on @dev's block
+ * device.  Does nothing unless PNFS_BDEV_REGISTERED is set in dev->flags;
+ * the flag is tested and cleared in one step so the key is released at
+ * most once per successful registration.
+ */
+static void bl_unregister_scsi(struct pnfs_block_dev *dev)
+{
+       struct block_device *bdev;
+       const struct pr_ops *ops;
+
+       /*
+        * Check the flag before touching dev->bdev_file.  bl_free_device()
+        * calls down into here unconditionally, including for devices
+        * whose bdev_file was never opened (it still guards its own
+        * fput() with a NULL check), so dereferencing the file first
+        * would crash on such a device.  When the flag is set,
+        * bl_register_scsi() has already used bdev_file successfully.
+        */
+       if (!test_and_clear_bit(PNFS_BDEV_REGISTERED, &dev->flags))
+               return;
+
+       bdev = file_bdev(dev->bdev_file);
+       ops = bdev->bd_disk->fops->pr_ops;
+
+       /* old key = dev->pr_key, new key = 0: drop the registration */
+       if (ops->pr_register(bdev, dev->pr_key, 0, false))
+               pr_err("failed to unregister PR key.\n");
+}
+
+/*
+ * Register dev->pr_key as a reservation key on @dev's block device.
+ *
+ * PNFS_BDEV_REGISTERED in dev->flags records that the key is registered,
+ * so repeated calls for the same device issue pr_register() only once.
+ *
+ * Returns true if the key is (or already was) registered, false if the
+ * pr_register() call failed.
+ */
+static bool bl_register_scsi(struct pnfs_block_dev *dev)
+{
+       struct block_device *bdev = file_bdev(dev->bdev_file);
+       const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops;
+       int status;
+
+       if (test_and_set_bit(PNFS_BDEV_REGISTERED, &dev->flags))
+               return true;
+
+       /* old key = 0, new key = dev->pr_key: create the registration */
+       status = ops->pr_register(bdev, 0, dev->pr_key, true);
+       if (status) {
+               /*
+                * The flag was set optimistically above.  Drop it again so
+                * a later call retries the registration instead of
+                * reporting success, and so teardown does not try to
+                * unregister a key that was never registered.
+                */
+               clear_bit(PNFS_BDEV_REGISTERED, &dev->flags);
+               pr_err("pNFS: failed to register key for block device %s.",
+                      bdev->bd_disk->disk_name);
+               return false;
+       }
+       return true;
+}
+
+/*
+ * Undo bl_register_dev() for @dev.  A device that has children is only a
+ * container: every child is visited recursively and nothing else is done.
+ * A leaf device is handed to bl_unregister_scsi() when its volume type is
+ * PNFS_BLOCK_VOLUME_SCSI; other leaf types need no action.
+ */
+static void bl_unregister_dev(struct pnfs_block_dev *dev)
+{
+       u32 child;
+
+       if (!dev->nr_children) {
+               if (dev->type == PNFS_BLOCK_VOLUME_SCSI)
+                       bl_unregister_scsi(dev);
+               return;
+       }
+
+       for (child = 0; child < dev->nr_children; child++)
+               bl_unregister_dev(&dev->children[child]);
+}
+
+/*
+ * Register @dev and, recursively, all of its children.
+ *
+ * A leaf device of type PNFS_BLOCK_VOLUME_SCSI is passed to
+ * bl_register_scsi(); any other leaf type succeeds trivially.  For a
+ * device with children, the children are registered in index order; if
+ * one of them fails, the children that were already handled are
+ * unregistered in reverse order before the failure is reported.
+ *
+ * Returns true on success, false if any registration failed.
+ */
+bool bl_register_dev(struct pnfs_block_dev *dev)
+{
+       u32 idx;
+
+       if (!dev->nr_children) {
+               if (dev->type != PNFS_BLOCK_VOLUME_SCSI)
+                       return true;
+               return bl_register_scsi(dev);
+       }
+
+       for (idx = 0; idx < dev->nr_children; idx++) {
+               if (bl_register_dev(&dev->children[idx]))
+                       continue;
+
+               /* child idx failed: unwind children idx-1 down to 0 */
+               while (idx > 0)
+                       bl_unregister_dev(&dev->children[--idx]);
+               return false;
+       }
+
+       return true;
+}
+
 static void
 bl_free_device(struct pnfs_block_dev *dev)
 {
+       bl_unregister_dev(dev);
+
        if (dev->nr_children) {
                int i;
 
@@ -23,17 +89,6 @@ bl_free_device(struct pnfs_block_dev *dev)
                        bl_free_device(&dev->children[i]);
                kfree(dev->children);
        } else {
-               if (dev->pr_registered) {
-                       const struct pr_ops *ops =
-                               file_bdev(dev->bdev_file)->bd_disk->fops->pr_ops;
-                       int error;
-
-                       error = ops->pr_register(file_bdev(dev->bdev_file),
-                               dev->pr_key, 0, false);
-                       if (error)
-                               pr_err("failed to unregister PR key.\n");
-               }
-
                if (dev->bdev_file)
                        fput(dev->bdev_file);
        }
@@ -365,14 +420,6 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
                goto out_blkdev_put;
        }
 
-       error = ops->pr_register(file_bdev(d->bdev_file), 0, d->pr_key, true);
-       if (error) {
-               pr_err("pNFS: failed to register key for block device %s.",
-                               file_bdev(d->bdev_file)->bd_disk->disk_name);
-               goto out_blkdev_put;
-       }
-
-       d->pr_registered = true;
        return 0;
 
 out_blkdev_put:
@@ -458,7 +505,9 @@ static int
 bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d,
                struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
 {
-       switch (volumes[idx].type) {
+       d->type = volumes[idx].type;
+
+       switch (d->type) {
        case PNFS_BLOCK_VOLUME_SIMPLE:
                return bl_parse_simple(server, d, volumes, idx, gfp_mask);
        case PNFS_BLOCK_VOLUME_SLICE:
@@ -470,7 +519,7 @@ bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d,
        case PNFS_BLOCK_VOLUME_SCSI:
                return bl_parse_scsi(server, d, volumes, idx, gfp_mask);
        default:
-               dprintk("unsupported volume type: %d\n", volumes[idx].type);
+               dprintk("unsupported volume type: %d\n", d->type);
                return -EIO;
        }
 }