Merge tag 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost
author    Linus Torvalds <torvalds@linux-foundation.org>
          Sat, 11 Sep 2021 21:48:42 +0000 (14:48 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
          Sat, 11 Sep 2021 21:48:42 +0000 (14:48 -0700)
Pull virtio updates from Michael Tsirkin:

 - vduse driver ("vDPA Device in Userspace") supporting emulated virtio
   block devices

 - virtio-vsock support for end of record with SEQPACKET

 - vdpa: mac and mq support for ifcvf and mlx5

 - vdpa: management netlink for ifcvf

 - virtio-i2c, gpio dt bindings

 - misc fixes and cleanups

* tag 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost: (39 commits)
  Documentation: Add documentation for VDUSE
  vduse: Introduce VDUSE - vDPA Device in Userspace
  vduse: Implement an MMU-based software IOTLB
  vdpa: Support transferring virtual addressing during DMA mapping
  vdpa: factor out vhost_vdpa_pa_map() and vhost_vdpa_pa_unmap()
  vdpa: Add an opaque pointer for vdpa_config_ops.dma_map()
  vhost-iotlb: Add an opaque pointer for vhost IOTLB
  vhost-vdpa: Handle the failure of vdpa_reset()
  vdpa: Add reset callback in vdpa_config_ops
  vdpa: Fix some coding style issues
  file: Export receive_fd() to modules
  eventfd: Export eventfd_wake_count to modules
  iova: Export alloc_iova_fast() and free_iova_fast()
  virtio-blk: remove unneeded "likely" statements
  virtio-balloon: Use virtio_find_vqs() helper
  vdpa: Make use of PFN_PHYS/PFN_UP/PFN_DOWN helper macro
  vsock_test: update message bounds test for MSG_EOR
  af_vsock: rename variables in receive loop
  virtio/vsock: support MSG_EOR bit processing
  vhost/vsock: support MSG_EOR bit processing
  ...

41 files changed:
Documentation/devicetree/bindings/gpio/gpio-virtio.yaml [new file with mode: 0644]
Documentation/devicetree/bindings/i2c/i2c-virtio.yaml [new file with mode: 0644]
Documentation/devicetree/bindings/virtio/mmio.yaml
Documentation/devicetree/bindings/virtio/virtio-device.yaml [new file with mode: 0644]
Documentation/userspace-api/index.rst
Documentation/userspace-api/ioctl/ioctl-number.rst
Documentation/userspace-api/vduse.rst [new file with mode: 0644]
drivers/block/virtio_blk.c
drivers/iommu/iova.c
drivers/vdpa/Kconfig
drivers/vdpa/Makefile
drivers/vdpa/ifcvf/ifcvf_base.c
drivers/vdpa/ifcvf/ifcvf_base.h
drivers/vdpa/ifcvf/ifcvf_main.c
drivers/vdpa/mlx5/core/mlx5_vdpa.h
drivers/vdpa/mlx5/core/mr.c
drivers/vdpa/mlx5/core/resources.c
drivers/vdpa/mlx5/net/mlx5_vnet.c
drivers/vdpa/vdpa.c
drivers/vdpa/vdpa_sim/vdpa_sim.c
drivers/vdpa/vdpa_user/Makefile [new file with mode: 0644]
drivers/vdpa/vdpa_user/iova_domain.c [new file with mode: 0644]
drivers/vdpa/vdpa_user/iova_domain.h [new file with mode: 0644]
drivers/vdpa/vdpa_user/vduse_dev.c [new file with mode: 0644]
drivers/vdpa/virtio_pci/vp_vdpa.c
drivers/vhost/iotlb.c
drivers/vhost/scsi.c
drivers/vhost/vdpa.c
drivers/vhost/vsock.c
drivers/virtio/virtio.c
drivers/virtio/virtio_balloon.c
fs/file.c
include/linux/file.h
include/linux/vdpa.h
include/linux/vhost_iotlb.h
include/uapi/linux/vduse.h [new file with mode: 0644]
include/uapi/linux/virtio_ids.h
include/uapi/linux/virtio_vsock.h
net/vmw_vsock/af_vsock.c
net/vmw_vsock/virtio_transport_common.c
tools/testing/vsock/vsock_test.c

diff --git a/Documentation/devicetree/bindings/gpio/gpio-virtio.yaml b/Documentation/devicetree/bindings/gpio/gpio-virtio.yaml
new file mode 100644
index 0000000..601d857
--- /dev/null
@@ -0,0 +1,59 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/gpio/gpio-virtio.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Virtio GPIO controller
+
+maintainers:
+  - Viresh Kumar <viresh.kumar@linaro.org>
+
+allOf:
+  - $ref: /schemas/virtio/virtio-device.yaml#
+
+description:
+  Virtio GPIO controller, see /schemas/virtio/virtio-device.yaml for more
+  details.
+
+properties:
+  $nodename:
+    const: gpio
+
+  compatible:
+    const: virtio,device29
+
+  gpio-controller: true
+
+  "#gpio-cells":
+    const: 2
+
+  interrupt-controller: true
+
+  "#interrupt-cells":
+    const: 2
+
+required:
+  - compatible
+  - gpio-controller
+  - "#gpio-cells"
+
+unevaluatedProperties: false
+
+examples:
+  - |
+    virtio@3000 {
+        compatible = "virtio,mmio";
+        reg = <0x3000 0x100>;
+        interrupts = <41>;
+
+        gpio {
+            compatible = "virtio,device29";
+            gpio-controller;
+            #gpio-cells = <2>;
+            interrupt-controller;
+            #interrupt-cells = <2>;
+        };
+    };
+
+...
diff --git a/Documentation/devicetree/bindings/i2c/i2c-virtio.yaml b/Documentation/devicetree/bindings/i2c/i2c-virtio.yaml
new file mode 100644
index 0000000..7d87ed8
--- /dev/null
@@ -0,0 +1,51 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/i2c/i2c-virtio.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Virtio I2C Adapter
+
+maintainers:
+  - Viresh Kumar <viresh.kumar@linaro.org>
+
+allOf:
+  - $ref: /schemas/i2c/i2c-controller.yaml#
+  - $ref: /schemas/virtio/virtio-device.yaml#
+
+description:
+  Virtio I2C device, see /schemas/virtio/virtio-device.yaml for more details.
+
+properties:
+  $nodename:
+    const: i2c
+
+  compatible:
+    const: virtio,device22
+
+required:
+  - compatible
+
+unevaluatedProperties: false
+
+examples:
+  - |
+    virtio@3000 {
+        compatible = "virtio,mmio";
+        reg = <0x3000 0x100>;
+        interrupts = <41>;
+
+        i2c {
+            compatible = "virtio,device22";
+
+            #address-cells = <1>;
+            #size-cells = <0>;
+
+            light-sensor@20 {
+                compatible = "dynaimage,al3320a";
+                reg = <0x20>;
+            };
+        };
+    };
+
+...
diff --git a/Documentation/devicetree/bindings/virtio/mmio.yaml b/Documentation/devicetree/bindings/virtio/mmio.yaml
index d465970..4b7a027 100644
@@ -36,7 +36,8 @@ required:
   - reg
   - interrupts
 
-additionalProperties: false
+additionalProperties:
+  type: object
 
 examples:
   - |
diff --git a/Documentation/devicetree/bindings/virtio/virtio-device.yaml b/Documentation/devicetree/bindings/virtio/virtio-device.yaml
new file mode 100644
index 0000000..1778ea9
--- /dev/null
@@ -0,0 +1,41 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/virtio/virtio-device.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Virtio device bindings
+
+maintainers:
+  - Viresh Kumar <viresh.kumar@linaro.org>
+
+description:
+  These bindings are applicable to virtio devices irrespective of the bus they
+  are bound to, like mmio or pci.
+
+# We need a select here so we don't match all nodes with 'virtio,mmio'
+properties:
+  compatible:
+    pattern: "^virtio,device[0-9a-f]{1,8}$"
+    description: Virtio device nodes.
+      "virtio,deviceID", where ID is the virtio device id. The textual
+      representation of ID shall be in lower case hexadecimal with leading
+      zeroes suppressed.
+
+required:
+  - compatible
+
+additionalProperties: true
+
+examples:
+  - |
+    virtio@3000 {
+        compatible = "virtio,mmio";
+        reg = <0x3000 0x100>;
+        interrupts = <43>;
+
+        i2c {
+            compatible = "virtio,device22";
+        };
+    };
+...
diff --git a/Documentation/userspace-api/index.rst b/Documentation/userspace-api/index.rst
index 0b5eefe..c432be0 100644
@@ -27,6 +27,7 @@ place where this information is gathered.
    iommu
    media/index
    sysfs-platform_profile
+   vduse
 
 .. only::  subproject and html
 
diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst
index b7070d7..2e81340 100644
@@ -299,6 +299,7 @@ Code  Seq#    Include File                                           Comments
 'z'   10-4F  drivers/s390/crypto/zcrypt_api.h                        conflict!
 '|'   00-7F  linux/media.h
 0x80  00-1F  linux/fb.h
+0x81  00-1F  linux/vduse.h
 0x89  00-06  arch/x86/include/asm/sockios.h
 0x89  0B-DF  linux/sockios.h
 0x89  E0-EF  linux/sockios.h                                         SIOCPROTOPRIVATE range
diff --git a/Documentation/userspace-api/vduse.rst b/Documentation/userspace-api/vduse.rst
new file mode 100644
index 0000000..42ef59e
--- /dev/null
@@ -0,0 +1,233 @@
+==================================
+VDUSE - "vDPA Device in Userspace"
+==================================
+
+A vDPA (virtio data path acceleration) device is a device whose datapath
+complies with the virtio specification, but whose control path is vendor
+specific. vDPA devices can either be physically located on the hardware
+or emulated by software. VDUSE is a framework that makes it possible to
+implement software-emulated vDPA devices in userspace. To make the device
+emulation more secure, the emulated vDPA device's control path is handled
+in the kernel and only the data path is implemented in userspace.
+
+Note that only the virtio block device is supported by the VDUSE framework
+now, which limits the security risks when the userspace process that
+implements the data path is run by an unprivileged user. Support for other
+device types can be added once the security issues of the corresponding
+device drivers have been clarified or fixed.
+
+Create/Destroy VDUSE devices
+----------------------------
+
+VDUSE devices are created as follows:
+
+1. Create a new VDUSE instance with ioctl(VDUSE_CREATE_DEV) on
+   /dev/vduse/control.
+
+2. Setup each virtqueue with ioctl(VDUSE_VQ_SETUP) on /dev/vduse/$NAME.
+
+3. Begin processing VDUSE messages from /dev/vduse/$NAME. The first
+   messages will arrive while the VDUSE instance is being attached to
+   the vDPA bus.
+
+4. Send the VDPA_CMD_DEV_NEW netlink message to attach the VDUSE
+   instance to the vDPA bus.
+
+VDUSE devices are destroyed as follows:
+
+1. Send the VDPA_CMD_DEV_DEL netlink message to detach the VDUSE
+   instance from the vDPA bus.
+
+2. Close the file descriptor referring to /dev/vduse/$NAME.
+
+3. Destroy the VDUSE instance with ioctl(VDUSE_DESTROY_DEV) on
+   /dev/vduse/control.
+
+The netlink messages can be sent with the vdpa tool in iproute2 or with
+the sample code below:
+
+.. code-block:: c
+
+       static int netlink_add_vduse(const char *name, enum vdpa_command cmd)
+       {
+               struct nl_sock *nlsock;
+               struct nl_msg *msg;
+               int famid;
+
+               nlsock = nl_socket_alloc();
+               if (!nlsock)
+                       return -ENOMEM;
+
+               if (genl_connect(nlsock))
+                       goto free_sock;
+
+               famid = genl_ctrl_resolve(nlsock, VDPA_GENL_NAME);
+               if (famid < 0)
+                       goto close_sock;
+
+               msg = nlmsg_alloc();
+               if (!msg)
+                       goto close_sock;
+
+               if (!genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, famid, 0, 0, cmd, 0))
+                       goto nla_put_failure;
+
+               NLA_PUT_STRING(msg, VDPA_ATTR_DEV_NAME, name);
+               if (cmd == VDPA_CMD_DEV_NEW)
+                       NLA_PUT_STRING(msg, VDPA_ATTR_MGMTDEV_DEV_NAME, "vduse");
+
+               if (nl_send_sync(nlsock, msg))
+                       goto close_sock;
+
+               nl_close(nlsock);
+               nl_socket_free(nlsock);
+
+               return 0;
+       nla_put_failure:
+               nlmsg_free(msg);
+       close_sock:
+               nl_close(nlsock);
+       free_sock:
+               nl_socket_free(nlsock);
+               return -1;
+       }
+
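+With the helper above, netlink_add_vduse(name, VDPA_CMD_DEV_NEW) attaches
+the named VDUSE instance to the vDPA bus, and netlink_add_vduse(name,
+VDPA_CMD_DEV_DEL) detaches it again.
+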
+How VDUSE works
+---------------
+
+As mentioned above, a VDUSE device is created by ioctl(VDUSE_CREATE_DEV) on
+/dev/vduse/control. With this ioctl, userspace can specify some basic
+configuration for the emulated device, such as the device name (which
+uniquely identifies a VDUSE device), the virtio features, the virtio
+configuration space, the number of virtqueues and so on. Then a char device
+interface (/dev/vduse/$NAME) is exported to userspace for device emulation.
+Userspace can use the VDUSE_VQ_SETUP ioctl on /dev/vduse/$NAME to add
+per-virtqueue configuration, such as the maximum virtqueue size, to the
+device.
+
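+The following is a rough sketch of this creation path (error handling and
+header includes are omitted, as in the other samples in this document; the
+exact layout of struct vduse_dev_config and struct vduse_vq_config,
+including the vendor_id, device_id, features, vq_num, vq_align, config_size
+and max_size fields used here, should be taken from
+include/uapi/linux/vduse.h rather than from this example):
+
+.. code-block:: c
+
+       static int vduse_dev_create(const char *name, __u64 features,
+                                   void *config, __u32 config_size)
+       {
+               struct vduse_dev_config *dev_config;
+               struct vduse_vq_config vq_config = { 0 };
+               char path[64];
+               int ctrl_fd, dev_fd, ret = -1;
+
+               ctrl_fd = open("/dev/vduse/control", O_RDWR);
+               if (ctrl_fd < 0)
+                       return -1;
+
+               dev_config = calloc(1, sizeof(*dev_config) + config_size);
+               if (!dev_config)
+                       goto close_ctrl;
+
+               strncpy(dev_config->name, name, VDUSE_NAME_MAX - 1);
+               dev_config->device_id = VIRTIO_ID_BLOCK;
+               dev_config->features = features;
+               dev_config->vq_num = 1;
+               dev_config->vq_align = 4096;
+               dev_config->config_size = config_size;
+               memcpy(dev_config->config, config, config_size);
+
+               /* Step 1: create the VDUSE instance */
+               if (ioctl(ctrl_fd, VDUSE_CREATE_DEV, dev_config))
+                       goto free_config;
+
+               snprintf(path, sizeof(path), "/dev/vduse/%s", name);
+               dev_fd = open(path, O_RDWR);
+               if (dev_fd < 0)
+                       goto free_config;
+
+               /* Step 2: set the max size of virtqueue 0 before attaching */
+               vq_config.index = 0;
+               vq_config.max_size = 256;
+               if (ioctl(dev_fd, VDUSE_VQ_SETUP, &vq_config) == 0)
+                       ret = dev_fd;
+               else
+                       close(dev_fd);
+
+       free_config:
+               free(dev_config);
+       close_ctrl:
+               close(ctrl_fd);
+               return ret;
+       }
+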
+After the initialization, the VDUSE device can be attached to the vDPA bus
+via the VDPA_CMD_DEV_NEW netlink message. Userspace then needs to read()
+and write() on /dev/vduse/$NAME to receive control messages from the VDUSE
+kernel module and to reply to them, as follows:
+
+.. code-block:: c
+
+       static int vduse_message_handler(int dev_fd)
+       {
+               int len;
+               struct vduse_dev_request req;
+               struct vduse_dev_response resp;
+
+               len = read(dev_fd, &req, sizeof(req));
+               if (len != sizeof(req))
+                       return -1;
+
+               resp.request_id = req.request_id;
+
+               switch (req.type) {
+
+               /* handle different types of messages */
+
+               }
+
+               len = write(dev_fd, &resp, sizeof(resp));
+               if (len != sizeof(resp))
+                       return -1;
+
+               return 0;
+       }
+
+Three types of messages are currently introduced by the VDUSE framework:
+
+- VDUSE_GET_VQ_STATE: Get the state of a virtqueue. Userspace should return
+  the avail index for a split virtqueue, or the device/driver ring wrap
+  counters and the avail and used index for a packed virtqueue.
+
+- VDUSE_SET_STATUS: Set the device status. Userspace should follow the
+  virtio spec (https://docs.oasis-open.org/virtio/virtio/v1.1/virtio-v1.1.html)
+  when processing this message. For example, it should refuse to set the
+  FEATURES_OK device status bit if the device can not accept the negotiated
+  virtio features obtained from the VDUSE_DEV_GET_FEATURES ioctl.
+
+- VDUSE_UPDATE_IOTLB: Notify userspace to update the memory mapping for the
+  specified IOVA range. Userspace should first remove the old mapping, then
+  set up the new mapping via the VDUSE_IOTLB_GET_FD ioctl.
+
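+As an illustration, the switch statement in the message handler above could
+be filled in roughly as shown below. This is only a sketch: the request and
+response layouts used here (the vq_state, s and iova members of struct
+vduse_dev_request, the vq_state member and VDUSE_REQ_RESULT_* codes of
+struct vduse_dev_response) should be double-checked against
+include/uapi/linux/vduse.h, and vq_get_avail_index(), dev_update_status()
+and iotlb_invalidate() are placeholders for device-specific logic:
+
+.. code-block:: c
+
+       static void vduse_handle_request(struct vduse_dev_request *req,
+                                        struct vduse_dev_response *resp)
+       {
+               resp->request_id = req->request_id;
+               resp->result = VDUSE_REQ_RESULT_OK;
+
+               switch (req->type) {
+               case VDUSE_GET_VQ_STATE:
+                       /* Report the last seen avail index of a split virtqueue */
+                       resp->vq_state.split.avail_index =
+                                       vq_get_avail_index(req->vq_state.index);
+                       break;
+               case VDUSE_SET_STATUS:
+                       /* Placeholder: start/stop the dataplane per the new bits */
+                       dev_update_status(req->s.status);
+                       break;
+               case VDUSE_UPDATE_IOTLB:
+                       /*
+                        * Placeholder: munmap() cached mappings overlapping
+                        * [start, last]; re-create them later with the
+                        * VDUSE_IOTLB_GET_FD ioctl.
+                        */
+                       iotlb_invalidate(req->iova.start, req->iova.last);
+                       break;
+               default:
+                       resp->result = VDUSE_REQ_RESULT_FAILED;
+                       break;
+               }
+       }
+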
+After the DRIVER_OK status bit is set via the VDUSE_SET_STATUS message,
+userspace is able to start the dataplane processing as follows:
+
+1. Get the specified virtqueue's information with the VDUSE_VQ_GET_INFO
+   ioctl, including the size, the IOVAs of the descriptor table, available
+   ring and used ring, the state and the ready status.
+
+2. Pass the above IOVAs to the VDUSE_IOTLB_GET_FD ioctl so that those IOVA
+   regions can be mapped into userspace. Some sample code is shown below:
+
+.. code-block:: c
+
+       static int perm_to_prot(uint8_t perm)
+       {
+               int prot = 0;
+
+               switch (perm) {
+               case VDUSE_ACCESS_WO:
+                       prot |= PROT_WRITE;
+                       break;
+               case VDUSE_ACCESS_RO:
+                       prot |= PROT_READ;
+                       break;
+               case VDUSE_ACCESS_RW:
+                       prot |= PROT_READ | PROT_WRITE;
+                       break;
+               }
+
+               return prot;
+       }
+
+       static void *iova_to_va(int dev_fd, uint64_t iova, uint64_t *len)
+       {
+               int fd;
+               void *addr;
+               size_t size;
+               struct vduse_iotlb_entry entry;
+
+               entry.start = iova;
+               entry.last = iova;
+
+               /*
+                * Find the first IOVA region that overlaps with the specified
+                * range [start, last] and return the corresponding file descriptor.
+                */
+               fd = ioctl(dev_fd, VDUSE_IOTLB_GET_FD, &entry);
+               if (fd < 0)
+                       return NULL;
+
+               size = entry.last - entry.start + 1;
+               *len = entry.last - iova + 1;
+               addr = mmap(0, size, perm_to_prot(entry.perm), MAP_SHARED,
+                           fd, entry.offset);
+               close(fd);
+               if (addr == MAP_FAILED)
+                       return NULL;
+
+               /*
+                * Use a data structure such as a linked list to cache the
+                * iotlb mapping. munmap(2) should be called on the cached
+                * mapping when the corresponding VDUSE_UPDATE_IOTLB message
+                * is received or the device is reset.
+                */
+
+               return addr + iova - entry.start;
+       }
+
+3. Set up the kick eventfd for the specified virtqueues with the
+   VDUSE_VQ_SETUP_KICKFD ioctl. The kick eventfd is used by the VDUSE kernel
+   module to notify userspace to consume the available ring. This is optional
+   since userspace can choose to poll the available ring instead.
+
+4. Listen to the kick eventfd (optional) and consume the available ring. The
+   buffers described by the descriptors in the descriptor table should also
+   be mapped into userspace via the VDUSE_IOTLB_GET_FD ioctl before they are
+   accessed.
+
+5. Inject an interrupt for a specific virtqueue with the VDUSE_INJECT_VQ_IRQ
+   ioctl after the used ring is filled.
+
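+A rough sketch of steps 3-5 is shown below. It assumes a struct
+vduse_vq_eventfd with index and fd members for the VDUSE_VQ_SETUP_KICKFD
+ioctl and passes a pointer to the virtqueue index to VDUSE_INJECT_VQ_IRQ;
+both should be double-checked against include/uapi/linux/vduse.h.
+process_available_ring() is a placeholder for the device-specific dataplane
+logic:
+
+.. code-block:: c
+
+       static int vduse_vq_io_loop(int dev_fd, __u32 index)
+       {
+               struct vduse_vq_eventfd vq_eventfd;
+               uint64_t count;
+               int kick_fd;
+
+               kick_fd = eventfd(0, EFD_CLOEXEC);
+               if (kick_fd < 0)
+                       return -1;
+
+               /* Step 3: let the kernel signal this eventfd on each kick */
+               vq_eventfd.index = index;
+               vq_eventfd.fd = kick_fd;
+               if (ioctl(dev_fd, VDUSE_VQ_SETUP_KICKFD, &vq_eventfd))
+                       return -1;
+
+               for (;;) {
+                       /* Step 4: wait for a kick, then consume the avail ring */
+                       if (read(kick_fd, &count, sizeof(count)) != sizeof(count))
+                               continue;
+
+                       process_available_ring(dev_fd, index);
+
+                       /* Step 5: notify the driver after filling the used ring */
+                       ioctl(dev_fd, VDUSE_INJECT_VQ_IRQ, &index);
+               }
+
+               return 0;
+       }
+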
+For more details on the uAPI, please see include/uapi/linux/vduse.h.
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 57c6ae7..9b3bd08 100644
@@ -762,7 +762,7 @@ static int virtblk_probe(struct virtio_device *vdev)
                goto out_free_vblk;
 
        /* Default queue sizing is to fill the ring. */
-       if (likely(!virtblk_queue_depth)) {
+       if (!virtblk_queue_depth) {
                queue_depth = vblk->vqs[0].vq->num_free;
                /* ... but without indirect descs, we use 2 descs per req */
                if (!virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC))
@@ -836,7 +836,7 @@ static int virtblk_probe(struct virtio_device *vdev)
        else
                blk_size = queue_logical_block_size(q);
 
-       if (unlikely(blk_size < SECTOR_SIZE || blk_size > PAGE_SIZE)) {
+       if (blk_size < SECTOR_SIZE || blk_size > PAGE_SIZE) {
                dev_err(&vdev->dev,
                        "block size is changed unexpectedly, now is %u\n",
                        blk_size);
diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index 0af42fb..9e8bc80 100644
@@ -519,6 +519,7 @@ retry:
 
        return new_iova->pfn_lo;
 }
+EXPORT_SYMBOL_GPL(alloc_iova_fast);
 
 /**
  * free_iova_fast - free iova pfn range into rcache
@@ -536,6 +537,7 @@ free_iova_fast(struct iova_domain *iovad, unsigned long pfn, unsigned long size)
 
        free_iova(iovad, pfn);
 }
+EXPORT_SYMBOL_GPL(free_iova_fast);
 
 #define fq_ring_for_each(i, fq) \
        for ((i) = (fq)->head; (i) != (fq)->tail; (i) = ((i) + 1) % IOVA_FQ_SIZE)
diff --git a/drivers/vdpa/Kconfig b/drivers/vdpa/Kconfig
index a503c1b..3d91982 100644
@@ -33,6 +33,16 @@ config VDPA_SIM_BLOCK
          vDPA block device simulator which terminates IO request in a
          memory buffer.
 
+config VDPA_USER
+       tristate "VDUSE (vDPA Device in Userspace) support"
+       depends on EVENTFD && MMU && HAS_DMA
+       select DMA_OPS
+       select VHOST_IOTLB
+       select IOMMU_IOVA
+       help
+         With VDUSE it is possible to emulate a vDPA Device
+         in a userspace program.
+
 config IFCVF
        tristate "Intel IFC VF vDPA driver"
        depends on PCI_MSI
@@ -53,6 +63,7 @@ config MLX5_VDPA
 config MLX5_VDPA_NET
        tristate "vDPA driver for ConnectX devices"
        select MLX5_VDPA
+       select VHOST_RING
        depends on MLX5_CORE
        help
          VDPA network driver for ConnectX6 and newer. Provides offloading
diff --git a/drivers/vdpa/Makefile b/drivers/vdpa/Makefile
index 67fe7f3..f02ebed 100644
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0
 obj-$(CONFIG_VDPA) += vdpa.o
 obj-$(CONFIG_VDPA_SIM) += vdpa_sim/
+obj-$(CONFIG_VDPA_USER) += vdpa_user/
 obj-$(CONFIG_IFCVF)    += ifcvf/
 obj-$(CONFIG_MLX5_VDPA) += mlx5/
 obj-$(CONFIG_VP_VDPA)    += virtio_pci/
diff --git a/drivers/vdpa/ifcvf/ifcvf_base.c b/drivers/vdpa/ifcvf/ifcvf_base.c
index 6e197fe..2808f1b 100644
@@ -158,7 +158,9 @@ next:
                return -EIO;
        }
 
-       for (i = 0; i < IFCVF_MAX_QUEUE_PAIRS * 2; i++) {
+       hw->nr_vring = ifc_ioread16(&hw->common_cfg->num_queues);
+
+       for (i = 0; i < hw->nr_vring; i++) {
                ifc_iowrite16(i, &hw->common_cfg->queue_select);
                notify_off = ifc_ioread16(&hw->common_cfg->queue_notify_off);
                hw->vring[i].notify_addr = hw->notify_base +
@@ -304,7 +306,7 @@ u16 ifcvf_get_vq_state(struct ifcvf_hw *hw, u16 qid)
        u32 q_pair_id;
 
        ifcvf_lm = (struct ifcvf_lm_cfg __iomem *)hw->lm_cfg;
-       q_pair_id = qid / (IFCVF_MAX_QUEUE_PAIRS * 2);
+       q_pair_id = qid / hw->nr_vring;
        avail_idx_addr = &ifcvf_lm->vring_lm_cfg[q_pair_id].idx_addr[qid % 2];
        last_avail_idx = ifc_ioread16(avail_idx_addr);
 
@@ -318,7 +320,7 @@ int ifcvf_set_vq_state(struct ifcvf_hw *hw, u16 qid, u16 num)
        u32 q_pair_id;
 
        ifcvf_lm = (struct ifcvf_lm_cfg __iomem *)hw->lm_cfg;
-       q_pair_id = qid / (IFCVF_MAX_QUEUE_PAIRS * 2);
+       q_pair_id = qid / hw->nr_vring;
        avail_idx_addr = &ifcvf_lm->vring_lm_cfg[q_pair_id].idx_addr[qid % 2];
        hw->vring[qid].last_avail_idx = num;
        ifc_iowrite16(num, avail_idx_addr);
diff --git a/drivers/vdpa/ifcvf/ifcvf_base.h b/drivers/vdpa/ifcvf/ifcvf_base.h
index 2996db0..09918af 100644
 #define N3000_DEVICE_ID                0x1041
 #define N3000_SUBSYS_DEVICE_ID 0x001A
 
-#define IFCVF_NET_SUPPORTED_FEATURES \
-               ((1ULL << VIRTIO_NET_F_MAC)                     | \
-                (1ULL << VIRTIO_F_ANY_LAYOUT)                  | \
-                (1ULL << VIRTIO_F_VERSION_1)                   | \
-                (1ULL << VIRTIO_NET_F_STATUS)                  | \
-                (1ULL << VIRTIO_F_ORDER_PLATFORM)              | \
-                (1ULL << VIRTIO_F_ACCESS_PLATFORM)             | \
-                (1ULL << VIRTIO_NET_F_MRG_RXBUF))
-
-/* Only one queue pair for now. */
-#define IFCVF_MAX_QUEUE_PAIRS  1
+/* Max 8 data queue pairs(16 queues) and one control vq for now. */
+#define IFCVF_MAX_QUEUES       17
 
 #define IFCVF_QUEUE_ALIGNMENT  PAGE_SIZE
 #define IFCVF_QUEUE_MAX                32768
@@ -51,8 +42,6 @@
 #define ifcvf_private_to_vf(adapter) \
        (&((struct ifcvf_adapter *)adapter)->vf)
 
-#define IFCVF_MAX_INTR (IFCVF_MAX_QUEUE_PAIRS * 2 + 1)
-
 struct vring_info {
        u64 desc;
        u64 avail;
@@ -83,7 +72,7 @@ struct ifcvf_hw {
        u32 dev_type;
        struct virtio_pci_common_cfg __iomem *common_cfg;
        void __iomem *net_cfg;
-       struct vring_info vring[IFCVF_MAX_QUEUE_PAIRS * 2];
+       struct vring_info vring[IFCVF_MAX_QUEUES];
        void __iomem * const *base;
        char config_msix_name[256];
        struct vdpa_callback config_cb;
@@ -103,7 +92,13 @@ struct ifcvf_vring_lm_cfg {
 
 struct ifcvf_lm_cfg {
        u8 reserved[IFCVF_LM_RING_STATE_OFFSET];
-       struct ifcvf_vring_lm_cfg vring_lm_cfg[IFCVF_MAX_QUEUE_PAIRS];
+       struct ifcvf_vring_lm_cfg vring_lm_cfg[IFCVF_MAX_QUEUES];
+};
+
+struct ifcvf_vdpa_mgmt_dev {
+       struct vdpa_mgmt_dev mdev;
+       struct ifcvf_adapter *adapter;
+       struct pci_dev *pdev;
 };
 
 int ifcvf_init_hw(struct ifcvf_hw *hw, struct pci_dev *dev);
diff --git a/drivers/vdpa/ifcvf/ifcvf_main.c b/drivers/vdpa/ifcvf/ifcvf_main.c
index 351c6cf..dcd648e 100644
@@ -63,9 +63,13 @@ static int ifcvf_request_irq(struct ifcvf_adapter *adapter)
        struct pci_dev *pdev = adapter->pdev;
        struct ifcvf_hw *vf = &adapter->vf;
        int vector, i, ret, irq;
+       u16 max_intr;
 
-       ret = pci_alloc_irq_vectors(pdev, IFCVF_MAX_INTR,
-                                   IFCVF_MAX_INTR, PCI_IRQ_MSIX);
+       /* all queues and config interrupt  */
+       max_intr = vf->nr_vring + 1;
+
+       ret = pci_alloc_irq_vectors(pdev, max_intr,
+                                   max_intr, PCI_IRQ_MSIX);
        if (ret < 0) {
                IFCVF_ERR(pdev, "Failed to alloc IRQ vectors\n");
                return ret;
@@ -83,7 +87,7 @@ static int ifcvf_request_irq(struct ifcvf_adapter *adapter)
                return ret;
        }
 
-       for (i = 0; i < IFCVF_MAX_QUEUE_PAIRS * 2; i++) {
+       for (i = 0; i < vf->nr_vring; i++) {
                snprintf(vf->vring[i].msix_name, 256, "ifcvf[%s]-%d\n",
                         pci_name(pdev), i);
                vector = i + IFCVF_MSI_QUEUE_OFF;
@@ -112,7 +116,6 @@ static int ifcvf_start_datapath(void *private)
        u8 status;
        int ret;
 
-       vf->nr_vring = IFCVF_MAX_QUEUE_PAIRS * 2;
        ret = ifcvf_start_hw(vf);
        if (ret < 0) {
                status = ifcvf_get_status(vf);
@@ -128,7 +131,7 @@ static int ifcvf_stop_datapath(void *private)
        struct ifcvf_hw *vf = ifcvf_private_to_vf(private);
        int i;
 
-       for (i = 0; i < IFCVF_MAX_QUEUE_PAIRS * 2; i++)
+       for (i = 0; i < vf->nr_vring; i++)
                vf->vring[i].cb.callback = NULL;
 
        ifcvf_stop_hw(vf);
@@ -141,7 +144,7 @@ static void ifcvf_reset_vring(struct ifcvf_adapter *adapter)
        struct ifcvf_hw *vf = ifcvf_private_to_vf(adapter);
        int i;
 
-       for (i = 0; i < IFCVF_MAX_QUEUE_PAIRS * 2; i++) {
+       for (i = 0; i < vf->nr_vring; i++) {
                vf->vring[i].last_avail_idx = 0;
                vf->vring[i].desc = 0;
                vf->vring[i].avail = 0;
@@ -171,17 +174,12 @@ static u64 ifcvf_vdpa_get_features(struct vdpa_device *vdpa_dev)
        struct ifcvf_adapter *adapter = vdpa_to_adapter(vdpa_dev);
        struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev);
        struct pci_dev *pdev = adapter->pdev;
-
+       u32 type = vf->dev_type;
        u64 features;
 
-       switch (vf->dev_type) {
-       case VIRTIO_ID_NET:
-               features = ifcvf_get_features(vf) & IFCVF_NET_SUPPORTED_FEATURES;
-               break;
-       case VIRTIO_ID_BLOCK:
+       if (type == VIRTIO_ID_NET || type == VIRTIO_ID_BLOCK)
                features = ifcvf_get_features(vf);
-               break;
-       default:
+       else {
                features = 0;
                IFCVF_ERR(pdev, "VIRTIO ID %u not supported\n", vf->dev_type);
        }
@@ -218,23 +216,12 @@ static void ifcvf_vdpa_set_status(struct vdpa_device *vdpa_dev, u8 status)
        int ret;
 
        vf  = vdpa_to_vf(vdpa_dev);
-       adapter = dev_get_drvdata(vdpa_dev->dev.parent);
+       adapter = vdpa_to_adapter(vdpa_dev);
        status_old = ifcvf_get_status(vf);
 
        if (status_old == status)
                return;
 
-       if ((status_old & VIRTIO_CONFIG_S_DRIVER_OK) &&
-           !(status & VIRTIO_CONFIG_S_DRIVER_OK)) {
-               ifcvf_stop_datapath(adapter);
-               ifcvf_free_irq(adapter, IFCVF_MAX_QUEUE_PAIRS * 2);
-       }
-
-       if (status == 0) {
-               ifcvf_reset_vring(adapter);
-               return;
-       }
-
        if ((status & VIRTIO_CONFIG_S_DRIVER_OK) &&
            !(status_old & VIRTIO_CONFIG_S_DRIVER_OK)) {
                ret = ifcvf_request_irq(adapter);
@@ -254,6 +241,29 @@ static void ifcvf_vdpa_set_status(struct vdpa_device *vdpa_dev, u8 status)
        ifcvf_set_status(vf, status);
 }
 
+static int ifcvf_vdpa_reset(struct vdpa_device *vdpa_dev)
+{
+       struct ifcvf_adapter *adapter;
+       struct ifcvf_hw *vf;
+       u8 status_old;
+
+       vf  = vdpa_to_vf(vdpa_dev);
+       adapter = vdpa_to_adapter(vdpa_dev);
+       status_old = ifcvf_get_status(vf);
+
+       if (status_old == 0)
+               return 0;
+
+       if (status_old & VIRTIO_CONFIG_S_DRIVER_OK) {
+               ifcvf_stop_datapath(adapter);
+               ifcvf_free_irq(adapter, vf->nr_vring);
+       }
+
+       ifcvf_reset_vring(adapter);
+
+       return 0;
+}
+
 static u16 ifcvf_vdpa_get_vq_num_max(struct vdpa_device *vdpa_dev)
 {
        return IFCVF_QUEUE_MAX;
@@ -437,6 +447,7 @@ static const struct vdpa_config_ops ifc_vdpa_ops = {
        .set_features   = ifcvf_vdpa_set_features,
        .get_status     = ifcvf_vdpa_get_status,
        .set_status     = ifcvf_vdpa_set_status,
+       .reset          = ifcvf_vdpa_reset,
        .get_vq_num_max = ifcvf_vdpa_get_vq_num_max,
        .get_vq_state   = ifcvf_vdpa_get_vq_state,
        .set_vq_state   = ifcvf_vdpa_set_vq_state,
@@ -458,63 +469,63 @@ static const struct vdpa_config_ops ifc_vdpa_ops = {
        .get_vq_notification = ifcvf_get_vq_notification,
 };
 
-static int ifcvf_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+static struct virtio_device_id id_table_net[] = {
+       {VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID},
+       {0},
+};
+
+static struct virtio_device_id id_table_blk[] = {
+       {VIRTIO_ID_BLOCK, VIRTIO_DEV_ANY_ID},
+       {0},
+};
+
+static u32 get_dev_type(struct pci_dev *pdev)
 {
-       struct device *dev = &pdev->dev;
-       struct ifcvf_adapter *adapter;
-       struct ifcvf_hw *vf;
-       int ret, i;
+       u32 dev_type;
 
-       ret = pcim_enable_device(pdev);
-       if (ret) {
-               IFCVF_ERR(pdev, "Failed to enable device\n");
-               return ret;
-       }
+       /* This driver drives both modern virtio devices and transitional
+        * devices in modern mode.
+        * vDPA requires feature bit VIRTIO_F_ACCESS_PLATFORM,
+        * so legacy devices and transitional devices in legacy
+        * mode will not work for vDPA; this driver will not
+        * drive devices with a legacy interface.
+        */
 
-       ret = pcim_iomap_regions(pdev, BIT(0) | BIT(2) | BIT(4),
-                                IFCVF_DRIVER_NAME);
-       if (ret) {
-               IFCVF_ERR(pdev, "Failed to request MMIO region\n");
-               return ret;
-       }
+       if (pdev->device < 0x1040)
+               dev_type =  pdev->subsystem_device;
+       else
+               dev_type =  pdev->device - 0x1040;
 
-       ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64));
-       if (ret) {
-               IFCVF_ERR(pdev, "No usable DMA configuration\n");
-               return ret;
-       }
+       return dev_type;
+}
 
-       ret = devm_add_action_or_reset(dev, ifcvf_free_irq_vectors, pdev);
-       if (ret) {
-               IFCVF_ERR(pdev,
-                         "Failed for adding devres for freeing irq vectors\n");
-               return ret;
-       }
+static int ifcvf_vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name)
+{
+       struct ifcvf_vdpa_mgmt_dev *ifcvf_mgmt_dev;
+       struct ifcvf_adapter *adapter;
+       struct pci_dev *pdev;
+       struct ifcvf_hw *vf;
+       struct device *dev;
+       int ret, i;
 
+       ifcvf_mgmt_dev = container_of(mdev, struct ifcvf_vdpa_mgmt_dev, mdev);
+       if (ifcvf_mgmt_dev->adapter)
+               return -EOPNOTSUPP;
+
+       pdev = ifcvf_mgmt_dev->pdev;
+       dev = &pdev->dev;
        adapter = vdpa_alloc_device(struct ifcvf_adapter, vdpa,
-                                   dev, &ifc_vdpa_ops, NULL);
+                                   dev, &ifc_vdpa_ops, name, false);
        if (IS_ERR(adapter)) {
                IFCVF_ERR(pdev, "Failed to allocate vDPA structure");
                return PTR_ERR(adapter);
        }
 
-       pci_set_master(pdev);
-       pci_set_drvdata(pdev, adapter);
+       ifcvf_mgmt_dev->adapter = adapter;
+       pci_set_drvdata(pdev, ifcvf_mgmt_dev);
 
        vf = &adapter->vf;
-
-       /* This drirver drives both modern virtio devices and transitional
-        * devices in modern mode.
-        * vDPA requires feature bit VIRTIO_F_ACCESS_PLATFORM,
-        * so legacy devices and transitional devices in legacy
-        * mode will not work for vDPA, this driver will not
-        * drive devices with legacy interface.
-        */
-       if (pdev->device < 0x1040)
-               vf->dev_type =  pdev->subsystem_device;
-       else
-               vf->dev_type =  pdev->device - 0x1040;
-
+       vf->dev_type = get_dev_type(pdev);
        vf->base = pcim_iomap_table(pdev);
 
        adapter->pdev = pdev;
@@ -526,14 +537,15 @@ static int ifcvf_probe(struct pci_dev *pdev, const struct pci_device_id *id)
                goto err;
        }
 
-       for (i = 0; i < IFCVF_MAX_QUEUE_PAIRS * 2; i++)
+       for (i = 0; i < vf->nr_vring; i++)
                vf->vring[i].irq = -EINVAL;
 
        vf->hw_features = ifcvf_get_hw_features(vf);
 
-       ret = vdpa_register_device(&adapter->vdpa, IFCVF_MAX_QUEUE_PAIRS * 2);
+       adapter->vdpa.mdev = &ifcvf_mgmt_dev->mdev;
+       ret = _vdpa_register_device(&adapter->vdpa, vf->nr_vring);
        if (ret) {
-               IFCVF_ERR(pdev, "Failed to register ifcvf to vdpa bus");
+               IFCVF_ERR(pdev, "Failed to register to vDPA bus");
                goto err;
        }
 
@@ -544,11 +556,100 @@ err:
        return ret;
 }
 
+static void ifcvf_vdpa_dev_del(struct vdpa_mgmt_dev *mdev, struct vdpa_device *dev)
+{
+       struct ifcvf_vdpa_mgmt_dev *ifcvf_mgmt_dev;
+
+       ifcvf_mgmt_dev = container_of(mdev, struct ifcvf_vdpa_mgmt_dev, mdev);
+       _vdpa_unregister_device(dev);
+       ifcvf_mgmt_dev->adapter = NULL;
+}
+
+static const struct vdpa_mgmtdev_ops ifcvf_vdpa_mgmt_dev_ops = {
+       .dev_add = ifcvf_vdpa_dev_add,
+       .dev_del = ifcvf_vdpa_dev_del
+};
+
+static int ifcvf_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+       struct ifcvf_vdpa_mgmt_dev *ifcvf_mgmt_dev;
+       struct device *dev = &pdev->dev;
+       u32 dev_type;
+       int ret;
+
+       ifcvf_mgmt_dev = kzalloc(sizeof(struct ifcvf_vdpa_mgmt_dev), GFP_KERNEL);
+       if (!ifcvf_mgmt_dev) {
+               IFCVF_ERR(pdev, "Failed to alloc memory for the vDPA management device\n");
+               return -ENOMEM;
+       }
+
+       dev_type = get_dev_type(pdev);
+       switch (dev_type) {
+       case VIRTIO_ID_NET:
+               ifcvf_mgmt_dev->mdev.id_table = id_table_net;
+               break;
+       case VIRTIO_ID_BLOCK:
+               ifcvf_mgmt_dev->mdev.id_table = id_table_blk;
+               break;
+       default:
+               IFCVF_ERR(pdev, "VIRTIO ID %u not supported\n", dev_type);
+               ret = -EOPNOTSUPP;
+               goto err;
+       }
+
+       ifcvf_mgmt_dev->mdev.ops = &ifcvf_vdpa_mgmt_dev_ops;
+       ifcvf_mgmt_dev->mdev.device = dev;
+       ifcvf_mgmt_dev->pdev = pdev;
+
+       ret = pcim_enable_device(pdev);
+       if (ret) {
+               IFCVF_ERR(pdev, "Failed to enable device\n");
+               goto err;
+       }
+
+       ret = pcim_iomap_regions(pdev, BIT(0) | BIT(2) | BIT(4),
+                                IFCVF_DRIVER_NAME);
+       if (ret) {
+               IFCVF_ERR(pdev, "Failed to request MMIO region\n");
+               goto err;
+       }
+
+       ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64));
+       if (ret) {
+               IFCVF_ERR(pdev, "No usable DMA configuration\n");
+               goto err;
+       }
+
+       ret = devm_add_action_or_reset(dev, ifcvf_free_irq_vectors, pdev);
+       if (ret) {
+               IFCVF_ERR(pdev,
+                         "Failed for adding devres for freeing irq vectors\n");
+               goto err;
+       }
+
+       pci_set_master(pdev);
+
+       ret = vdpa_mgmtdev_register(&ifcvf_mgmt_dev->mdev);
+       if (ret) {
+               IFCVF_ERR(pdev,
+                         "Failed to initialize the management interfaces\n");
+               goto err;
+       }
+
+       return 0;
+
+err:
+       kfree(ifcvf_mgmt_dev);
+       return ret;
+}
+
 static void ifcvf_remove(struct pci_dev *pdev)
 {
-       struct ifcvf_adapter *adapter = pci_get_drvdata(pdev);
+       struct ifcvf_vdpa_mgmt_dev *ifcvf_mgmt_dev;
 
-       vdpa_unregister_device(&adapter->vdpa);
+       ifcvf_mgmt_dev = pci_get_drvdata(pdev);
+       vdpa_mgmtdev_unregister(&ifcvf_mgmt_dev->mdev);
+       kfree(ifcvf_mgmt_dev);
 }
 
 static struct pci_device_id ifcvf_pci_ids[] = {
diff --git a/drivers/vdpa/mlx5/core/mlx5_vdpa.h b/drivers/vdpa/mlx5/core/mlx5_vdpa.h
index 0002b21..01a848a 100644
@@ -5,7 +5,7 @@
 #define __MLX5_VDPA_H__
 
 #include <linux/etherdevice.h>
-#include <linux/if_vlan.h>
+#include <linux/vringh.h>
 #include <linux/vdpa.h>
 #include <linux/mlx5/driver.h>
 
@@ -48,6 +48,26 @@ struct mlx5_vdpa_resources {
        bool valid;
 };
 
+struct mlx5_control_vq {
+       struct vhost_iotlb *iotlb;
+       /* spinlock to synchronize iommu table */
+       spinlock_t iommu_lock;
+       struct vringh vring;
+       bool ready;
+       u64 desc_addr;
+       u64 device_addr;
+       u64 driver_addr;
+       struct vdpa_callback event_cb;
+       struct vringh_kiov riov;
+       struct vringh_kiov wiov;
+       unsigned short head;
+};
+
+struct mlx5_ctrl_wq_ent {
+       struct work_struct work;
+       struct mlx5_vdpa_dev *mvdev;
+};
+
 struct mlx5_vdpa_dev {
        struct vdpa_device vdev;
        struct mlx5_core_dev *mdev;
@@ -57,9 +77,12 @@ struct mlx5_vdpa_dev {
        u64 actual_features;
        u8 status;
        u32 max_vqs;
+       u16 max_idx;
        u32 generation;
 
        struct mlx5_vdpa_mr mr;
+       struct mlx5_control_vq cvq;
+       struct workqueue_struct *wq;
 };
 
 int mlx5_vdpa_alloc_pd(struct mlx5_vdpa_dev *dev, u32 *pdn, u16 uid);
@@ -68,6 +91,7 @@ int mlx5_vdpa_get_null_mkey(struct mlx5_vdpa_dev *dev, u32 *null_mkey);
 int mlx5_vdpa_create_tis(struct mlx5_vdpa_dev *mvdev, void *in, u32 *tisn);
 void mlx5_vdpa_destroy_tis(struct mlx5_vdpa_dev *mvdev, u32 tisn);
 int mlx5_vdpa_create_rqt(struct mlx5_vdpa_dev *mvdev, void *in, int inlen, u32 *rqtn);
+int mlx5_vdpa_modify_rqt(struct mlx5_vdpa_dev *mvdev, void *in, int inlen, u32 rqtn);
 void mlx5_vdpa_destroy_rqt(struct mlx5_vdpa_dev *mvdev, u32 rqtn);
 int mlx5_vdpa_create_tir(struct mlx5_vdpa_dev *mvdev, void *in, u32 *tirn);
 void mlx5_vdpa_destroy_tir(struct mlx5_vdpa_dev *mvdev, u32 tirn);
diff --git a/drivers/vdpa/mlx5/core/mr.c b/drivers/vdpa/mlx5/core/mr.c
index e59135f..ff010c6 100644
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
 /* Copyright (c) 2020 Mellanox Technologies Ltd. */
 
+#include <linux/vhost_types.h>
 #include <linux/vdpa.h>
 #include <linux/gcd.h>
 #include <linux/string.h>
@@ -451,33 +452,30 @@ static void destroy_dma_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_mr *mr)
        mlx5_vdpa_destroy_mkey(mvdev, &mr->mkey);
 }
 
-static int _mlx5_vdpa_create_mr(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb)
+static int dup_iotlb(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *src)
 {
-       struct mlx5_vdpa_mr *mr = &mvdev->mr;
+       struct vhost_iotlb_map *map;
+       u64 start = 0, last = ULLONG_MAX;
        int err;
 
-       if (mr->initialized)
-               return 0;
-
-       if (iotlb)
-               err = create_user_mr(mvdev, iotlb);
-       else
-               err = create_dma_mr(mvdev, mr);
-
-       if (!err)
-               mr->initialized = true;
+       if (!src) {
+               err = vhost_iotlb_add_range(mvdev->cvq.iotlb, start, last, start, VHOST_ACCESS_RW);
+               return err;
+       }
 
-       return err;
+       for (map = vhost_iotlb_itree_first(src, start, last); map;
+               map = vhost_iotlb_itree_next(map, start, last)) {
+               err = vhost_iotlb_add_range(mvdev->cvq.iotlb, map->start, map->last,
+                                           map->addr, map->perm);
+               if (err)
+                       return err;
+       }
+       return 0;
 }
 
-int mlx5_vdpa_create_mr(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb)
+static void prune_iotlb(struct mlx5_vdpa_dev *mvdev)
 {
-       int err;
-
-       mutex_lock(&mvdev->mr.mkey_mtx);
-       err = _mlx5_vdpa_create_mr(mvdev, iotlb);
-       mutex_unlock(&mvdev->mr.mkey_mtx);
-       return err;
+       vhost_iotlb_del_range(mvdev->cvq.iotlb, 0, ULLONG_MAX);
 }
 
 static void destroy_user_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_mr *mr)
@@ -501,6 +499,7 @@ void mlx5_vdpa_destroy_mr(struct mlx5_vdpa_dev *mvdev)
        if (!mr->initialized)
                goto out;
 
+       prune_iotlb(mvdev);
        if (mr->user_mr)
                destroy_user_mr(mvdev, mr);
        else
@@ -512,6 +511,48 @@ out:
        mutex_unlock(&mr->mkey_mtx);
 }
 
+static int _mlx5_vdpa_create_mr(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb)
+{
+       struct mlx5_vdpa_mr *mr = &mvdev->mr;
+       int err;
+
+       if (mr->initialized)
+               return 0;
+
+       if (iotlb)
+               err = create_user_mr(mvdev, iotlb);
+       else
+               err = create_dma_mr(mvdev, mr);
+
+       if (err)
+               return err;
+
+       err = dup_iotlb(mvdev, iotlb);
+       if (err)
+               goto out_err;
+
+       mr->initialized = true;
+       return 0;
+
+out_err:
+       if (iotlb)
+               destroy_user_mr(mvdev, mr);
+       else
+               destroy_dma_mr(mvdev, mr);
+
+       return err;
+}
+
+int mlx5_vdpa_create_mr(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb)
+{
+       int err;
+
+       mutex_lock(&mvdev->mr.mkey_mtx);
+       err = _mlx5_vdpa_create_mr(mvdev, iotlb);
+       mutex_unlock(&mvdev->mr.mkey_mtx);
+       return err;
+}
+
 int mlx5_vdpa_handle_set_map(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb,
                             bool *change_map)
 {
diff --git a/drivers/vdpa/mlx5/core/resources.c b/drivers/vdpa/mlx5/core/resources.c
index d460621..15e266d 100644
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
 /* Copyright (c) 2020 Mellanox Technologies Ltd. */
 
+#include <linux/iova.h>
 #include <linux/mlx5/driver.h>
 #include "mlx5_vdpa.h"
 
@@ -128,6 +129,16 @@ int mlx5_vdpa_create_rqt(struct mlx5_vdpa_dev *mvdev, void *in, int inlen, u32 *
        return err;
 }
 
+int mlx5_vdpa_modify_rqt(struct mlx5_vdpa_dev *mvdev, void *in, int inlen, u32 rqtn)
+{
+       u32 out[MLX5_ST_SZ_DW(create_rqt_out)] = {};
+
+       MLX5_SET(modify_rqt_in, in, uid, mvdev->res.uid);
+       MLX5_SET(modify_rqt_in, in, rqtn, rqtn);
+       MLX5_SET(modify_rqt_in, in, opcode, MLX5_CMD_OP_MODIFY_RQT);
+       return mlx5_cmd_exec(mvdev->mdev, in, inlen, out, sizeof(out));
+}
+
 void mlx5_vdpa_destroy_rqt(struct mlx5_vdpa_dev *mvdev, u32 rqtn)
 {
        u32 in[MLX5_ST_SZ_DW(destroy_rqt_in)] = {};
@@ -221,6 +232,22 @@ int mlx5_vdpa_destroy_mkey(struct mlx5_vdpa_dev *mvdev, struct mlx5_core_mkey *m
        return mlx5_cmd_exec_in(mvdev->mdev, destroy_mkey, in);
 }
 
+static int init_ctrl_vq(struct mlx5_vdpa_dev *mvdev)
+{
+       mvdev->cvq.iotlb = vhost_iotlb_alloc(0, 0);
+       if (!mvdev->cvq.iotlb)
+               return -ENOMEM;
+
+       vringh_set_iotlb(&mvdev->cvq.vring, mvdev->cvq.iotlb, &mvdev->cvq.iommu_lock);
+
+       return 0;
+}
+
+static void cleanup_ctrl_vq(struct mlx5_vdpa_dev *mvdev)
+{
+       vhost_iotlb_free(mvdev->cvq.iotlb);
+}
+
 int mlx5_vdpa_alloc_resources(struct mlx5_vdpa_dev *mvdev)
 {
        u64 offset = MLX5_CAP64_DEV_VDPA_EMULATION(mvdev->mdev, doorbell_bar_offset);
@@ -260,10 +287,17 @@ int mlx5_vdpa_alloc_resources(struct mlx5_vdpa_dev *mvdev)
                err = -ENOMEM;
                goto err_key;
        }
+
+       err = init_ctrl_vq(mvdev);
+       if (err)
+               goto err_ctrl;
+
        res->valid = true;
 
        return 0;
 
+err_ctrl:
+       iounmap(res->kick_addr);
 err_key:
        dealloc_pd(mvdev, res->pdn, res->uid);
 err_pd:
@@ -282,6 +316,7 @@ void mlx5_vdpa_free_resources(struct mlx5_vdpa_dev *mvdev)
        if (!res->valid)
                return;
 
+       cleanup_ctrl_vq(mvdev);
        iounmap(res->kick_addr);
        res->kick_addr = NULL;
        dealloc_pd(mvdev, res->pdn, res->uid);
diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 5906cad..294ba05 100644
@@ -45,6 +45,8 @@ MODULE_LICENSE("Dual BSD/GPL");
        (VIRTIO_CONFIG_S_ACKNOWLEDGE | VIRTIO_CONFIG_S_DRIVER | VIRTIO_CONFIG_S_DRIVER_OK |        \
         VIRTIO_CONFIG_S_FEATURES_OK | VIRTIO_CONFIG_S_NEEDS_RESET | VIRTIO_CONFIG_S_FAILED)
 
+#define MLX5_FEATURE(_mvdev, _feature) (!!((_mvdev)->actual_features & BIT_ULL(_feature)))
+
 struct mlx5_vdpa_net_resources {
        u32 tisn;
        u32 tdn;
@@ -90,7 +92,6 @@ struct mlx5_vq_restore_info {
        u16 avail_index;
        u16 used_index;
        bool ready;
-       struct vdpa_callback cb;
        bool restore;
 };
 
@@ -100,7 +101,6 @@ struct mlx5_vdpa_virtqueue {
        u64 device_addr;
        u64 driver_addr;
        u32 num_ent;
-       struct vdpa_callback event_cb;
 
        /* Resources for implementing the notification channel from the device
         * to the driver. fwqp is the firmware end of an RC connection; the
@@ -135,11 +135,20 @@ struct mlx5_vdpa_virtqueue {
  */
 #define MLX5_MAX_SUPPORTED_VQS 16
 
+static bool is_index_valid(struct mlx5_vdpa_dev *mvdev, u16 idx)
+{
+       if (unlikely(idx > mvdev->max_idx))
+               return false;
+
+       return true;
+}
+
 struct mlx5_vdpa_net {
        struct mlx5_vdpa_dev mvdev;
        struct mlx5_vdpa_net_resources res;
        struct virtio_net_config config;
        struct mlx5_vdpa_virtqueue vqs[MLX5_MAX_SUPPORTED_VQS];
+       struct vdpa_callback event_cbs[MLX5_MAX_SUPPORTED_VQS + 1];
 
        /* Serialize vq resources creation and destruction. This is required
         * since memory map might change and we need to destroy and create
@@ -151,15 +160,18 @@ struct mlx5_vdpa_net {
        struct mlx5_flow_handle *rx_rule;
        bool setup;
        u16 mtu;
+       u32 cur_num_vqs;
 };
 
 static void free_resources(struct mlx5_vdpa_net *ndev);
 static void init_mvqs(struct mlx5_vdpa_net *ndev);
-static int setup_driver(struct mlx5_vdpa_net *ndev);
+static int setup_driver(struct mlx5_vdpa_dev *mvdev);
 static void teardown_driver(struct mlx5_vdpa_net *ndev);
 
 static bool mlx5_vdpa_debug;
 
+#define MLX5_CVQ_MAX_ENT 16
+
 #define MLX5_LOG_VIO_FLAG(_feature)                                                                \
        do {                                                                                       \
                if (features & BIT_ULL(_feature))                                                  \
@@ -172,11 +184,41 @@ static bool mlx5_vdpa_debug;
                        mlx5_vdpa_info(mvdev, "%s\n", #_status);                                   \
        } while (0)
 
+/* TODO: cross-endian support */
+static inline bool mlx5_vdpa_is_little_endian(struct mlx5_vdpa_dev *mvdev)
+{
+       return virtio_legacy_is_little_endian() ||
+               (mvdev->actual_features & BIT_ULL(VIRTIO_F_VERSION_1));
+}
+
+static u16 mlx5vdpa16_to_cpu(struct mlx5_vdpa_dev *mvdev, __virtio16 val)
+{
+       return __virtio16_to_cpu(mlx5_vdpa_is_little_endian(mvdev), val);
+}
+
+static __virtio16 cpu_to_mlx5vdpa16(struct mlx5_vdpa_dev *mvdev, u16 val)
+{
+       return __cpu_to_virtio16(mlx5_vdpa_is_little_endian(mvdev), val);
+}
+
 static inline u32 mlx5_vdpa_max_qps(int max_vqs)
 {
        return max_vqs / 2;
 }
 
+static u16 ctrl_vq_idx(struct mlx5_vdpa_dev *mvdev)
+{
+       if (!(mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_MQ)))
+               return 2;
+
+       return 2 * mlx5_vdpa_max_qps(mvdev->max_vqs);
+}
+
+static bool is_ctrl_vq_idx(struct mlx5_vdpa_dev *mvdev, u16 idx)
+{
+       return idx == ctrl_vq_idx(mvdev);
+}
+
 static void print_status(struct mlx5_vdpa_dev *mvdev, u8 status, bool set)
 {
        if (status & ~VALID_STATUS_MASK)
@@ -481,6 +523,10 @@ static int mlx5_vdpa_poll_one(struct mlx5_vdpa_cq *vcq)
 
 static void mlx5_vdpa_handle_completions(struct mlx5_vdpa_virtqueue *mvq, int num)
 {
+       struct mlx5_vdpa_net *ndev = mvq->ndev;
+       struct vdpa_callback *event_cb;
+
+       event_cb = &ndev->event_cbs[mvq->index];
        mlx5_cq_set_ci(&mvq->cq.mcq);
 
        /* make sure CQ cosumer update is visible to the hardware before updating
@@ -488,8 +534,8 @@ static void mlx5_vdpa_handle_completions(struct mlx5_vdpa_virtqueue *mvq, int nu
         */
        dma_wmb();
        rx_post(&mvq->vqqp, num);
-       if (mvq->event_cb.callback)
-               mvq->event_cb.callback(mvq->event_cb.private);
+       if (event_cb->callback)
+               event_cb->callback(event_cb->private);
 }
 
 static void mlx5_vdpa_cq_comp(struct mlx5_core_cq *mcq, struct mlx5_eqe *eqe)
@@ -1100,10 +1146,8 @@ static int setup_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
        if (!mvq->num_ent)
                return 0;
 
-       if (mvq->initialized) {
-               mlx5_vdpa_warn(&ndev->mvdev, "attempt re init\n");
-               return -EINVAL;
-       }
+       if (mvq->initialized)
+               return 0;
 
        err = cq_create(ndev, idx, mvq->num_ent);
        if (err)
@@ -1190,19 +1234,20 @@ static void teardown_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *
 
 static int create_rqt(struct mlx5_vdpa_net *ndev)
 {
-       int log_max_rqt;
        __be32 *list;
+       int max_rqt;
        void *rqtc;
        int inlen;
        void *in;
        int i, j;
        int err;
 
-       log_max_rqt = min_t(int, 1, MLX5_CAP_GEN(ndev->mvdev.mdev, log_max_rqt_size));
-       if (log_max_rqt < 1)
+       max_rqt = min_t(int, MLX5_MAX_SUPPORTED_VQS / 2,
+                       1 << MLX5_CAP_GEN(ndev->mvdev.mdev, log_max_rqt_size));
+       if (max_rqt < 1)
                return -EOPNOTSUPP;
 
-       inlen = MLX5_ST_SZ_BYTES(create_rqt_in) + (1 << log_max_rqt) * MLX5_ST_SZ_BYTES(rq_num);
+       inlen = MLX5_ST_SZ_BYTES(create_rqt_in) + max_rqt * MLX5_ST_SZ_BYTES(rq_num);
        in = kzalloc(inlen, GFP_KERNEL);
        if (!in)
                return -ENOMEM;
@@ -1211,10 +1256,9 @@ static int create_rqt(struct mlx5_vdpa_net *ndev)
        rqtc = MLX5_ADDR_OF(create_rqt_in, in, rqt_context);
 
        MLX5_SET(rqtc, rqtc, list_q_type, MLX5_RQTC_LIST_Q_TYPE_VIRTIO_NET_Q);
-       MLX5_SET(rqtc, rqtc, rqt_max_size, 1 << log_max_rqt);
-       MLX5_SET(rqtc, rqtc, rqt_actual_size, 1);
+       MLX5_SET(rqtc, rqtc, rqt_max_size, max_rqt);
        list = MLX5_ADDR_OF(rqtc, rqtc, rq_num[0]);
-       for (i = 0, j = 0; j < ndev->mvdev.max_vqs; j++) {
+       for (i = 0, j = 0; j < max_rqt; j++) {
                if (!ndev->vqs[j].initialized)
                        continue;
 
@@ -1223,6 +1267,7 @@ static int create_rqt(struct mlx5_vdpa_net *ndev)
                        i++;
                }
        }
+       MLX5_SET(rqtc, rqtc, rqt_actual_size, i);
 
        err = mlx5_vdpa_create_rqt(&ndev->mvdev, in, inlen, &ndev->res.rqtn);
        kfree(in);
@@ -1232,6 +1277,52 @@ static int create_rqt(struct mlx5_vdpa_net *ndev)
        return 0;
 }
 
+#define MLX5_MODIFY_RQT_NUM_RQS ((u64)1)
+
+static int modify_rqt(struct mlx5_vdpa_net *ndev, int num)
+{
+       __be32 *list;
+       int max_rqt;
+       void *rqtc;
+       int inlen;
+       void *in;
+       int i, j;
+       int err;
+
+       max_rqt = min_t(int, ndev->cur_num_vqs / 2,
+                       1 << MLX5_CAP_GEN(ndev->mvdev.mdev, log_max_rqt_size));
+       if (max_rqt < 1)
+               return -EOPNOTSUPP;
+
+       inlen = MLX5_ST_SZ_BYTES(modify_rqt_in) + max_rqt * MLX5_ST_SZ_BYTES(rq_num);
+       in = kzalloc(inlen, GFP_KERNEL);
+       if (!in)
+               return -ENOMEM;
+
+       MLX5_SET(modify_rqt_in, in, uid, ndev->mvdev.res.uid);
+       MLX5_SET64(modify_rqt_in, in, bitmask, MLX5_MODIFY_RQT_NUM_RQS);
+       rqtc = MLX5_ADDR_OF(modify_rqt_in, in, ctx);
+       MLX5_SET(rqtc, rqtc, list_q_type, MLX5_RQTC_LIST_Q_TYPE_VIRTIO_NET_Q);
+
+       list = MLX5_ADDR_OF(rqtc, rqtc, rq_num[0]);
+       for (i = 0, j = 0; j < num; j++) {
+               if (!ndev->vqs[j].initialized)
+                       continue;
+
+               if (!vq_is_tx(ndev->vqs[j].index)) {
+                       list[i] = cpu_to_be32(ndev->vqs[j].virtq_id);
+                       i++;
+               }
+       }
+       MLX5_SET(rqtc, rqtc, rqt_actual_size, i);
+       err = mlx5_vdpa_modify_rqt(&ndev->mvdev, in, inlen, ndev->res.rqtn);
+       kfree(in);
+       if (err)
+               return err;
+
+       return 0;
+}
+
 static void destroy_rqt(struct mlx5_vdpa_net *ndev)
 {
        mlx5_vdpa_destroy_rqt(&ndev->mvdev, ndev->res.rqtn);
@@ -1345,12 +1436,206 @@ static void remove_fwd_to_tir(struct mlx5_vdpa_net *ndev)
        ndev->rx_rule = NULL;
 }
 
+static virtio_net_ctrl_ack handle_ctrl_mac(struct mlx5_vdpa_dev *mvdev, u8 cmd)
+{
+       struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
+       struct mlx5_control_vq *cvq = &mvdev->cvq;
+       virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
+       struct mlx5_core_dev *pfmdev;
+       size_t read;
+       u8 mac[ETH_ALEN];
+
+       pfmdev = pci_get_drvdata(pci_physfn(mvdev->mdev->pdev));
+       switch (cmd) {
+       case VIRTIO_NET_CTRL_MAC_ADDR_SET:
+               read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->riov, (void *)mac, ETH_ALEN);
+               if (read != ETH_ALEN)
+                       break;
+
+               if (!memcmp(ndev->config.mac, mac, 6)) {
+                       status = VIRTIO_NET_OK;
+                       break;
+               }
+
+               if (!is_zero_ether_addr(ndev->config.mac)) {
+                       if (mlx5_mpfs_del_mac(pfmdev, ndev->config.mac)) {
+                               mlx5_vdpa_warn(mvdev, "failed to delete old MAC %pM from MPFS table\n",
+                                              ndev->config.mac);
+                               break;
+                       }
+               }
+
+               if (mlx5_mpfs_add_mac(pfmdev, mac)) {
+                       mlx5_vdpa_warn(mvdev, "failed to insert new MAC %pM into MPFS table\n",
+                                      mac);
+                       break;
+               }
+
+               memcpy(ndev->config.mac, mac, ETH_ALEN);
+               status = VIRTIO_NET_OK;
+               break;
+
+       default:
+               break;
+       }
+
+       return status;
+}
+
+static int change_num_qps(struct mlx5_vdpa_dev *mvdev, int newqps)
+{
+       struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
+       int cur_qps = ndev->cur_num_vqs / 2;
+       int err;
+       int i;
+
+       if (cur_qps > newqps) {
+               err = modify_rqt(ndev, 2 * newqps);
+               if (err)
+                       return err;
+
+               for (i = ndev->cur_num_vqs - 1; i >= 2 * newqps; i--)
+                       teardown_vq(ndev, &ndev->vqs[i]);
+
+               ndev->cur_num_vqs = 2 * newqps;
+       } else {
+               ndev->cur_num_vqs = 2 * newqps;
+               for (i = cur_qps * 2; i < 2 * newqps; i++) {
+                       err = setup_vq(ndev, &ndev->vqs[i]);
+                       if (err)
+                               goto clean_added;
+               }
+               err = modify_rqt(ndev, 2 * newqps);
+               if (err)
+                       goto clean_added;
+       }
+       return 0;
+
+clean_added:
+       for (--i; i >= cur_qps; --i)
+               teardown_vq(ndev, &ndev->vqs[i]);
+
+       return err;
+}
+
+static virtio_net_ctrl_ack handle_ctrl_mq(struct mlx5_vdpa_dev *mvdev, u8 cmd)
+{
+       struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
+       virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
+       struct mlx5_control_vq *cvq = &mvdev->cvq;
+       struct virtio_net_ctrl_mq mq;
+       size_t read;
+       u16 newqps;
+
+       switch (cmd) {
+       case VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET:
+               read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->riov, (void *)&mq, sizeof(mq));
+               if (read != sizeof(mq))
+                       break;
+
+               newqps = mlx5vdpa16_to_cpu(mvdev, mq.virtqueue_pairs);
+               if (ndev->cur_num_vqs == 2 * newqps) {
+                       status = VIRTIO_NET_OK;
+                       break;
+               }
+
+               if (newqps & (newqps - 1))
+                       break;
+
+               if (!change_num_qps(mvdev, newqps))
+                       status = VIRTIO_NET_OK;
+
+               break;
+       default:
+               break;
+       }
+
+       return status;
+}
+
+static void mlx5_cvq_kick_handler(struct work_struct *work)
+{
+       virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
+       struct virtio_net_ctrl_hdr ctrl;
+       struct mlx5_ctrl_wq_ent *wqent;
+       struct mlx5_vdpa_dev *mvdev;
+       struct mlx5_control_vq *cvq;
+       struct mlx5_vdpa_net *ndev;
+       size_t read, write;
+       int err;
+
+       wqent = container_of(work, struct mlx5_ctrl_wq_ent, work);
+       mvdev = wqent->mvdev;
+       ndev = to_mlx5_vdpa_ndev(mvdev);
+       cvq = &mvdev->cvq;
+       if (!(ndev->mvdev.actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VQ)))
+               goto out;
+
+       if (!cvq->ready)
+               goto out;
+
+       while (true) {
+               err = vringh_getdesc_iotlb(&cvq->vring, &cvq->riov, &cvq->wiov, &cvq->head,
+                                          GFP_ATOMIC);
+               if (err <= 0)
+                       break;
+
+               read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->riov, &ctrl, sizeof(ctrl));
+               if (read != sizeof(ctrl))
+                       break;
+
+               switch (ctrl.class) {
+               case VIRTIO_NET_CTRL_MAC:
+                       status = handle_ctrl_mac(mvdev, ctrl.cmd);
+                       break;
+               case VIRTIO_NET_CTRL_MQ:
+                       status = handle_ctrl_mq(mvdev, ctrl.cmd);
+                       break;
+
+               default:
+                       break;
+               }
+
+               /* Make sure data is written before advancing index */
+               smp_wmb();
+
+               write = vringh_iov_push_iotlb(&cvq->vring, &cvq->wiov, &status, sizeof(status));
+               vringh_complete_iotlb(&cvq->vring, cvq->head, write);
+               vringh_kiov_cleanup(&cvq->riov);
+               vringh_kiov_cleanup(&cvq->wiov);
+
+               if (vringh_need_notify_iotlb(&cvq->vring))
+                       vringh_notify(&cvq->vring);
+       }
+out:
+       kfree(wqent);
+}
+
 static void mlx5_vdpa_kick_vq(struct vdpa_device *vdev, u16 idx)
 {
        struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
        struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
-       struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];
+       struct mlx5_vdpa_virtqueue *mvq;
+       struct mlx5_ctrl_wq_ent *wqent;
+
+       if (!is_index_valid(mvdev, idx))
+               return;
+
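+       /* Kicks on the control virtqueue are deferred to a workqueue */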
+       if (unlikely(is_ctrl_vq_idx(mvdev, idx))) {
+               if (!mvdev->cvq.ready)
+                       return;
+
+               wqent = kzalloc(sizeof(*wqent), GFP_ATOMIC);
+               if (!wqent)
+                       return;
 
+               wqent->mvdev = mvdev;
+               INIT_WORK(&wqent->work, mlx5_cvq_kick_handler);
+               queue_work(mvdev->wq, &wqent->work);
+               return;
+       }
+
+       mvq = &ndev->vqs[idx];
        if (unlikely(!mvq->ready))
                return;
 
@@ -1362,8 +1647,19 @@ static int mlx5_vdpa_set_vq_address(struct vdpa_device *vdev, u16 idx, u64 desc_
 {
        struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
        struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
-       struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];
+       struct mlx5_vdpa_virtqueue *mvq;
+
+       if (!is_index_valid(mvdev, idx))
+               return -EINVAL;
 
+       if (is_ctrl_vq_idx(mvdev, idx)) {
+               mvdev->cvq.desc_addr = desc_area;
+               mvdev->cvq.device_addr = device_area;
+               mvdev->cvq.driver_addr = driver_area;
+               return 0;
+       }
+
+       mvq = &ndev->vqs[idx];
        mvq->desc_addr = desc_area;
        mvq->device_addr = device_area;
        mvq->driver_addr = driver_area;
@@ -1376,6 +1672,9 @@ static void mlx5_vdpa_set_vq_num(struct vdpa_device *vdev, u16 idx, u32 num)
        struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
        struct mlx5_vdpa_virtqueue *mvq;
 
+       if (!is_index_valid(mvdev, idx) || is_ctrl_vq_idx(mvdev, idx))
+               return;
+
        mvq = &ndev->vqs[idx];
        mvq->num_ent = num;
 }
@@ -1384,17 +1683,46 @@ static void mlx5_vdpa_set_vq_cb(struct vdpa_device *vdev, u16 idx, struct vdpa_c
 {
        struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
        struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
-       struct mlx5_vdpa_virtqueue *vq = &ndev->vqs[idx];
 
-       vq->event_cb = *cb;
+       ndev->event_cbs[idx] = *cb;
+}
+
+static void mlx5_cvq_notify(struct vringh *vring)
+{
+       struct mlx5_control_vq *cvq = container_of(vring, struct mlx5_control_vq, vring);
+
+       if (!cvq->event_cb.callback)
+               return;
+
+       cvq->event_cb.callback(cvq->event_cb.private);
+}
+
+static void set_cvq_ready(struct mlx5_vdpa_dev *mvdev, bool ready)
+{
+       struct mlx5_control_vq *cvq = &mvdev->cvq;
+
+       cvq->ready = ready;
+       if (!ready)
+               return;
+
+       cvq->vring.notify = mlx5_cvq_notify;
 }
 
 static void mlx5_vdpa_set_vq_ready(struct vdpa_device *vdev, u16 idx, bool ready)
 {
        struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
        struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
-       struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];
+       struct mlx5_vdpa_virtqueue *mvq;
+
+       if (!is_index_valid(mvdev, idx))
+               return;
+
+       if (is_ctrl_vq_idx(mvdev, idx)) {
+               set_cvq_ready(mvdev, ready);
+               return;
+       }
 
+       mvq = &ndev->vqs[idx];
        if (!ready)
                suspend_vq(ndev, mvq);
 
@@ -1405,9 +1733,14 @@ static bool mlx5_vdpa_get_vq_ready(struct vdpa_device *vdev, u16 idx)
 {
        struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
        struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
-       struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];
 
-       return mvq->ready;
+       if (!is_index_valid(mvdev, idx))
+               return false;
+
+       if (is_ctrl_vq_idx(mvdev, idx))
+               return mvdev->cvq.ready;
+
+       return ndev->vqs[idx].ready;
 }
 
 static int mlx5_vdpa_set_vq_state(struct vdpa_device *vdev, u16 idx,
@@ -1415,8 +1748,17 @@ static int mlx5_vdpa_set_vq_state(struct vdpa_device *vdev, u16 idx,
 {
        struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
        struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
-       struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];
+       struct mlx5_vdpa_virtqueue *mvq;
 
+       if (!is_index_valid(mvdev, idx))
+               return -EINVAL;
+
+       if (is_ctrl_vq_idx(mvdev, idx)) {
+               mvdev->cvq.vring.last_avail_idx = state->split.avail_index;
+               return 0;
+       }
+
+       mvq = &ndev->vqs[idx];
        if (mvq->fw_state == MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY) {
                mlx5_vdpa_warn(mvdev, "can't modify available index\n");
                return -EINVAL;
@@ -1431,10 +1773,19 @@ static int mlx5_vdpa_get_vq_state(struct vdpa_device *vdev, u16 idx, struct vdpa
 {
        struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
        struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
-       struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];
+       struct mlx5_vdpa_virtqueue *mvq;
        struct mlx5_virtq_attr attr;
        int err;
 
+       if (!is_index_valid(mvdev, idx))
+               return -EINVAL;
+
+       if (is_ctrl_vq_idx(mvdev, idx)) {
+               state->split.avail_index = mvdev->cvq.vring.last_avail_idx;
+               return 0;
+       }
+
+       mvq = &ndev->vqs[idx];
        /* If the virtq object was destroyed, use the value saved at
         * the last minute of suspend_vq. This caters for userspace
         * that cares about emulating the index after vq is stopped.
@@ -1491,10 +1842,14 @@ static u64 mlx5_vdpa_get_features(struct vdpa_device *vdev)
        u16 dev_features;
 
        dev_features = MLX5_CAP_DEV_VDPA_EMULATION(mvdev->mdev, device_features_bits_mask);
-       ndev->mvdev.mlx_features = mlx_to_vritio_features(dev_features);
+       ndev->mvdev.mlx_features |= mlx_to_vritio_features(dev_features);
        if (MLX5_CAP_DEV_VDPA_EMULATION(mvdev->mdev, virtio_version_1_0))
                ndev->mvdev.mlx_features |= BIT_ULL(VIRTIO_F_VERSION_1);
        ndev->mvdev.mlx_features |= BIT_ULL(VIRTIO_F_ACCESS_PLATFORM);
+       ndev->mvdev.mlx_features |= BIT_ULL(VIRTIO_NET_F_CTRL_VQ);
+       ndev->mvdev.mlx_features |= BIT_ULL(VIRTIO_NET_F_CTRL_MAC_ADDR);
+       ndev->mvdev.mlx_features |= BIT_ULL(VIRTIO_NET_F_MQ);
+
        print_features(mvdev, ndev->mvdev.mlx_features, false);
        return ndev->mvdev.mlx_features;
 }
@@ -1507,17 +1862,29 @@ static int verify_min_features(struct mlx5_vdpa_dev *mvdev, u64 features)
        return 0;
 }
 
-static int setup_virtqueues(struct mlx5_vdpa_net *ndev)
+static int setup_virtqueues(struct mlx5_vdpa_dev *mvdev)
 {
+       struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
+       struct mlx5_control_vq *cvq = &mvdev->cvq;
        int err;
        int i;
 
-       for (i = 0; i < 2 * mlx5_vdpa_max_qps(ndev->mvdev.max_vqs); i++) {
+       for (i = 0; i < 2 * mlx5_vdpa_max_qps(mvdev->max_vqs); i++) {
                err = setup_vq(ndev, &ndev->vqs[i]);
                if (err)
                        goto err_vq;
        }
 
+       if (mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VQ)) {
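+               /* Back the control virtqueue with a kernel vringh accessed through the IOTLB */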
+               err = vringh_init_iotlb(&cvq->vring, mvdev->actual_features,
+                                       MLX5_CVQ_MAX_ENT, false,
+                                       (struct vring_desc *)(uintptr_t)cvq->desc_addr,
+                                       (struct vring_avail *)(uintptr_t)cvq->driver_addr,
+                                       (struct vring_used *)(uintptr_t)cvq->device_addr);
+               if (err)
+                       goto err_vq;
+       }
+
        return 0;
 
 err_vq:
@@ -1541,16 +1908,22 @@ static void teardown_virtqueues(struct mlx5_vdpa_net *ndev)
        }
 }
 
-/* TODO: cross-endian support */
-static inline bool mlx5_vdpa_is_little_endian(struct mlx5_vdpa_dev *mvdev)
-{
-       return virtio_legacy_is_little_endian() ||
-               (mvdev->actual_features & BIT_ULL(VIRTIO_F_VERSION_1));
-}
-
-static __virtio16 cpu_to_mlx5vdpa16(struct mlx5_vdpa_dev *mvdev, u16 val)
+static void update_cvq_info(struct mlx5_vdpa_dev *mvdev)
 {
-       return __cpu_to_virtio16(mlx5_vdpa_is_little_endian(mvdev), val);
+       if (MLX5_FEATURE(mvdev, VIRTIO_NET_F_CTRL_VQ)) {
+               if (MLX5_FEATURE(mvdev, VIRTIO_NET_F_MQ)) {
+                       /* MQ supported. CVQ index is right above the last data virtqueue's */
+                       mvdev->max_idx = mvdev->max_vqs;
+               } else {
+                       /* Only CVQ supported. Data virtqueues occupy indices 0 and 1;
+                        * the CVQ gets index 2.
+                        */
+                       mvdev->max_idx = 2;
+               }
+       } else {
+               /* Two data virtqueues only: one for rx and one for tx */
+               mvdev->max_idx = 1;
+       }
 }
 
 static int mlx5_vdpa_set_features(struct vdpa_device *vdev, u64 features)
@@ -1568,6 +1941,7 @@ static int mlx5_vdpa_set_features(struct vdpa_device *vdev, u64 features)
        ndev->mvdev.actual_features = features & ndev->mvdev.mlx_features;
        ndev->config.mtu = cpu_to_mlx5vdpa16(mvdev, ndev->mtu);
        ndev->config.status |= cpu_to_mlx5vdpa16(mvdev, VIRTIO_NET_S_LINK_UP);
+       update_cvq_info(mvdev);
        return err;
 }
 
@@ -1605,15 +1979,14 @@ static u8 mlx5_vdpa_get_status(struct vdpa_device *vdev)
 static int save_channel_info(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
 {
        struct mlx5_vq_restore_info *ri = &mvq->ri;
-       struct mlx5_virtq_attr attr;
+       struct mlx5_virtq_attr attr = {};
        int err;
 
-       if (!mvq->initialized)
-               return 0;
-
-       err = query_virtqueue(ndev, mvq, &attr);
-       if (err)
-               return err;
+       if (mvq->initialized) {
+               err = query_virtqueue(ndev, mvq, &attr);
+               if (err)
+                       return err;
+       }
 
        ri->avail_index = attr.available_index;
        ri->used_index = attr.used_index;
@@ -1622,7 +1995,6 @@ static int save_channel_info(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqu
        ri->desc_addr = mvq->desc_addr;
        ri->device_addr = mvq->device_addr;
        ri->driver_addr = mvq->driver_addr;
-       ri->cb = mvq->event_cb;
        ri->restore = true;
        return 0;
 }
@@ -1667,12 +2039,12 @@ static void restore_channels_info(struct mlx5_vdpa_net *ndev)
                mvq->desc_addr = ri->desc_addr;
                mvq->device_addr = ri->device_addr;
                mvq->driver_addr = ri->driver_addr;
-               mvq->event_cb = ri->cb;
        }
 }
 
-static int mlx5_vdpa_change_map(struct mlx5_vdpa_net *ndev, struct vhost_iotlb *iotlb)
+static int mlx5_vdpa_change_map(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb)
 {
+       struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
        int err;
 
        suspend_vqs(ndev);
@@ -1681,58 +2053,59 @@ static int mlx5_vdpa_change_map(struct mlx5_vdpa_net *ndev, struct vhost_iotlb *
                goto err_mr;
 
        teardown_driver(ndev);
-       mlx5_vdpa_destroy_mr(&ndev->mvdev);
-       err = mlx5_vdpa_create_mr(&ndev->mvdev, iotlb);
+       mlx5_vdpa_destroy_mr(mvdev);
+       err = mlx5_vdpa_create_mr(mvdev, iotlb);
        if (err)
                goto err_mr;
 
-       if (!(ndev->mvdev.status & VIRTIO_CONFIG_S_DRIVER_OK))
+       if (!(mvdev->status & VIRTIO_CONFIG_S_DRIVER_OK))
                return 0;
 
        restore_channels_info(ndev);
-       err = setup_driver(ndev);
+       err = setup_driver(mvdev);
        if (err)
                goto err_setup;
 
        return 0;
 
 err_setup:
-       mlx5_vdpa_destroy_mr(&ndev->mvdev);
+       mlx5_vdpa_destroy_mr(mvdev);
 err_mr:
        return err;
 }
 
-static int setup_driver(struct mlx5_vdpa_net *ndev)
+static int setup_driver(struct mlx5_vdpa_dev *mvdev)
 {
+       struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
        int err;
 
        mutex_lock(&ndev->reslock);
        if (ndev->setup) {
-               mlx5_vdpa_warn(&ndev->mvdev, "setup driver called for already setup driver\n");
+               mlx5_vdpa_warn(mvdev, "setup driver called for already setup driver\n");
                err = 0;
                goto out;
        }
-       err = setup_virtqueues(ndev);
+       err = setup_virtqueues(mvdev);
        if (err) {
-               mlx5_vdpa_warn(&ndev->mvdev, "setup_virtqueues\n");
+               mlx5_vdpa_warn(mvdev, "setup_virtqueues\n");
                goto out;
        }
 
        err = create_rqt(ndev);
        if (err) {
-               mlx5_vdpa_warn(&ndev->mvdev, "create_rqt\n");
+               mlx5_vdpa_warn(mvdev, "create_rqt\n");
                goto err_rqt;
        }
 
        err = create_tir(ndev);
        if (err) {
-               mlx5_vdpa_warn(&ndev->mvdev, "create_tir\n");
+               mlx5_vdpa_warn(mvdev, "create_tir\n");
                goto err_tir;
        }
 
        err = add_fwd_to_tir(ndev);
        if (err) {
-               mlx5_vdpa_warn(&ndev->mvdev, "add_fwd_to_tir\n");
+               mlx5_vdpa_warn(mvdev, "add_fwd_to_tir\n");
                goto err_fwd;
        }
        ndev->setup = true;
@@ -1781,24 +2154,10 @@ static void mlx5_vdpa_set_status(struct vdpa_device *vdev, u8 status)
        int err;
 
        print_status(mvdev, status, true);
-       if (!status) {
-               mlx5_vdpa_info(mvdev, "performing device reset\n");
-               teardown_driver(ndev);
-               clear_vqs_ready(ndev);
-               mlx5_vdpa_destroy_mr(&ndev->mvdev);
-               ndev->mvdev.status = 0;
-               ndev->mvdev.mlx_features = 0;
-               ++mvdev->generation;
-               if (MLX5_CAP_GEN(mvdev->mdev, umem_uid_0)) {
-                       if (mlx5_vdpa_create_mr(mvdev, NULL))
-                               mlx5_vdpa_warn(mvdev, "create MR failed\n");
-               }
-               return;
-       }
 
        if ((status ^ ndev->mvdev.status) & VIRTIO_CONFIG_S_DRIVER_OK) {
                if (status & VIRTIO_CONFIG_S_DRIVER_OK) {
-                       err = setup_driver(ndev);
+                       err = setup_driver(mvdev);
                        if (err) {
                                mlx5_vdpa_warn(mvdev, "failed to setup driver\n");
                                goto err_setup;
@@ -1817,6 +2176,29 @@ err_setup:
        ndev->mvdev.status |= VIRTIO_CONFIG_S_FAILED;
 }
 
+static int mlx5_vdpa_reset(struct vdpa_device *vdev)
+{
+       struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
+       struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
+
+       print_status(mvdev, 0, true);
+       mlx5_vdpa_info(mvdev, "performing device reset\n");
+       teardown_driver(ndev);
+       clear_vqs_ready(ndev);
+       mlx5_vdpa_destroy_mr(&ndev->mvdev);
+       ndev->mvdev.status = 0;
+       ndev->mvdev.mlx_features = 0;
+       memset(ndev->event_cbs, 0, sizeof(ndev->event_cbs));
+       ndev->mvdev.actual_features = 0;
+       ++mvdev->generation;
+       if (MLX5_CAP_GEN(mvdev->mdev, umem_uid_0)) {
+               if (mlx5_vdpa_create_mr(mvdev, NULL))
+                       mlx5_vdpa_warn(mvdev, "create MR failed\n");
+       }
+
+       return 0;
+}
+
 static size_t mlx5_vdpa_get_config_size(struct vdpa_device *vdev)
 {
        return sizeof(struct virtio_net_config);
@@ -1848,7 +2230,6 @@ static u32 mlx5_vdpa_get_generation(struct vdpa_device *vdev)
 static int mlx5_vdpa_set_map(struct vdpa_device *vdev, struct vhost_iotlb *iotlb)
 {
        struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
-       struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
        bool change_map;
        int err;
 
@@ -1859,7 +2240,7 @@ static int mlx5_vdpa_set_map(struct vdpa_device *vdev, struct vhost_iotlb *iotlb
        }
 
        if (change_map)
-               return mlx5_vdpa_change_map(ndev, iotlb);
+               return mlx5_vdpa_change_map(mvdev, iotlb);
 
        return 0;
 }
@@ -1889,6 +2270,9 @@ static struct vdpa_notification_area mlx5_get_vq_notification(struct vdpa_device
        struct mlx5_vdpa_net *ndev;
        phys_addr_t addr;
 
+       if (!is_index_valid(mvdev, idx) || is_ctrl_vq_idx(mvdev, idx))
+               return ret;
+
        /* If SF BAR size is smaller than PAGE_SIZE, do not use direct
         * notification to avoid the risk of mapping pages that contain BAR of more
         * than one SF
@@ -1928,6 +2312,7 @@ static const struct vdpa_config_ops mlx5_vdpa_ops = {
        .get_vendor_id = mlx5_vdpa_get_vendor_id,
        .get_status = mlx5_vdpa_get_status,
        .set_status = mlx5_vdpa_set_status,
+       .reset = mlx5_vdpa_reset,
        .get_config_size = mlx5_vdpa_get_config_size,
        .get_config = mlx5_vdpa_get_config,
        .set_config = mlx5_vdpa_set_config,
@@ -2040,7 +2425,7 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name)
        max_vqs = min_t(u32, max_vqs, MLX5_MAX_SUPPORTED_VQS);
 
        ndev = vdpa_alloc_device(struct mlx5_vdpa_net, mvdev.vdev, mdev->device, &mlx5_vdpa_ops,
-                                name);
+                                name, false);
        if (IS_ERR(ndev))
                return PTR_ERR(ndev);
 
@@ -2063,8 +2448,11 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name)
                err = mlx5_mpfs_add_mac(pfmdev, config->mac);
                if (err)
                        goto err_mtu;
+
+               ndev->mvdev.mlx_features |= BIT_ULL(VIRTIO_NET_F_MAC);
        }
 
+       config->max_virtqueue_pairs = cpu_to_mlx5vdpa16(mvdev, mlx5_vdpa_max_qps(max_vqs));
        mvdev->vdev.dma_dev = &mdev->pdev->dev;
        err = mlx5_vdpa_alloc_resources(&ndev->mvdev);
        if (err)
@@ -2080,8 +2468,15 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name)
        if (err)
                goto err_mr;
 
+       mvdev->wq = create_singlethread_workqueue("mlx5_vdpa_ctrl_wq");
+       if (!mvdev->wq) {
+               err = -ENOMEM;
+               goto err_res2;
+       }
+
+       ndev->cur_num_vqs = 2 * mlx5_vdpa_max_qps(max_vqs);
        mvdev->vdev.mdev = &mgtdev->mgtdev;
-       err = _vdpa_register_device(&mvdev->vdev, 2 * mlx5_vdpa_max_qps(max_vqs));
+       err = _vdpa_register_device(&mvdev->vdev, ndev->cur_num_vqs + 1);
        if (err)
                goto err_reg;
 
@@ -2089,6 +2484,8 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name)
        return 0;
 
 err_reg:
+       destroy_workqueue(mvdev->wq);
+err_res2:
        free_resources(ndev);
 err_mr:
        mlx5_vdpa_destroy_mr(mvdev);
@@ -2106,7 +2503,9 @@ err_mtu:
 static void mlx5_vdpa_dev_del(struct vdpa_mgmt_dev *v_mdev, struct vdpa_device *dev)
 {
        struct mlx5_vdpa_mgmtdev *mgtdev = container_of(v_mdev, struct mlx5_vdpa_mgmtdev, mgtdev);
+       struct mlx5_vdpa_dev *mvdev = to_mvdev(dev);
 
+       destroy_workqueue(mvdev->wq);
        _vdpa_unregister_device(dev);
        mgtdev->ndev = NULL;
 }
diff --git a/drivers/vdpa/vdpa.c b/drivers/vdpa/vdpa.c
index 3fc4525..1dc121a 100644 (file)
@@ -69,6 +69,7 @@ static void vdpa_release_dev(struct device *d)
  * @config: the bus operations that is supported by this device
  * @size: size of the parent structure that contains private data
  * @name: name of the vdpa device; optional.
+ * @use_va: indicate whether virtual addresses must be used by this device
  *
  * Driver should use vdpa_alloc_device() wrapper macro instead of
  * using this directly.
@@ -78,7 +79,8 @@ static void vdpa_release_dev(struct device *d)
  */
 struct vdpa_device *__vdpa_alloc_device(struct device *parent,
                                        const struct vdpa_config_ops *config,
-                                       size_t size, const char *name)
+                                       size_t size, const char *name,
+                                       bool use_va)
 {
        struct vdpa_device *vdev;
        int err = -EINVAL;
@@ -89,6 +91,10 @@ struct vdpa_device *__vdpa_alloc_device(struct device *parent,
        if (!!config->dma_map != !!config->dma_unmap)
                goto err;
 
+       /* It should only work for devices that use an on-chip IOMMU */
+       if (use_va && !(config->dma_map || config->set_map))
+               goto err;
+
        err = -ENOMEM;
        vdev = kzalloc(size, GFP_KERNEL);
        if (!vdev)
@@ -104,6 +110,7 @@ struct vdpa_device *__vdpa_alloc_device(struct device *parent,
        vdev->index = err;
        vdev->config = config;
        vdev->features_valid = false;
+       vdev->use_va = use_va;
 
        if (name)
                err = dev_set_name(&vdev->dev, "%s", name);
diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c
index c621cf7..5f484ff 100644 (file)
@@ -92,7 +92,7 @@ static void vdpasim_vq_reset(struct vdpasim *vdpasim,
        vq->vring.notify = NULL;
 }
 
-static void vdpasim_reset(struct vdpasim *vdpasim)
+static void vdpasim_do_reset(struct vdpasim *vdpasim)
 {
        int i;
 
@@ -137,7 +137,8 @@ static dma_addr_t vdpasim_map_range(struct vdpasim *vdpasim, phys_addr_t paddr,
        int ret;
 
        /* We set the limit_pfn to the maximum (ULONG_MAX - 1) */
-       iova = alloc_iova(&vdpasim->iova, size, ULONG_MAX - 1, true);
+       iova = alloc_iova(&vdpasim->iova, size >> iova_shift(&vdpasim->iova),
+                         ULONG_MAX - 1, true);
        if (!iova)
                return DMA_MAPPING_ERROR;
 
@@ -250,7 +251,7 @@ struct vdpasim *vdpasim_create(struct vdpasim_dev_attr *dev_attr)
                ops = &vdpasim_config_ops;
 
        vdpasim = vdpa_alloc_device(struct vdpasim, vdpa, NULL, ops,
-                                   dev_attr->name);
+                                   dev_attr->name, false);
        if (IS_ERR(vdpasim)) {
                ret = PTR_ERR(vdpasim);
                goto err_alloc;
@@ -459,11 +460,21 @@ static void vdpasim_set_status(struct vdpa_device *vdpa, u8 status)
 
        spin_lock(&vdpasim->lock);
        vdpasim->status = status;
-       if (status == 0)
-               vdpasim_reset(vdpasim);
        spin_unlock(&vdpasim->lock);
 }
 
+static int vdpasim_reset(struct vdpa_device *vdpa)
+{
+       struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+
+       spin_lock(&vdpasim->lock);
+       vdpasim->status = 0;
+       vdpasim_do_reset(vdpasim);
+       spin_unlock(&vdpasim->lock);
+
+       return 0;
+}
+
 static size_t vdpasim_get_config_size(struct vdpa_device *vdpa)
 {
        struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
@@ -544,14 +555,14 @@ err:
 }
 
 static int vdpasim_dma_map(struct vdpa_device *vdpa, u64 iova, u64 size,
-                          u64 pa, u32 perm)
+                          u64 pa, u32 perm, void *opaque)
 {
        struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
        int ret;
 
        spin_lock(&vdpasim->iommu_lock);
-       ret = vhost_iotlb_add_range(vdpasim->iommu, iova, iova + size - 1, pa,
-                                   perm);
+       ret = vhost_iotlb_add_range_ctx(vdpasim->iommu, iova, iova + size - 1,
+                                       pa, perm, opaque);
        spin_unlock(&vdpasim->iommu_lock);
 
        return ret;
@@ -607,6 +618,7 @@ static const struct vdpa_config_ops vdpasim_config_ops = {
        .get_vendor_id          = vdpasim_get_vendor_id,
        .get_status             = vdpasim_get_status,
        .set_status             = vdpasim_set_status,
+       .reset                  = vdpasim_reset,
        .get_config_size        = vdpasim_get_config_size,
        .get_config             = vdpasim_get_config,
        .set_config             = vdpasim_set_config,
@@ -635,6 +647,7 @@ static const struct vdpa_config_ops vdpasim_batch_config_ops = {
        .get_vendor_id          = vdpasim_get_vendor_id,
        .get_status             = vdpasim_get_status,
        .set_status             = vdpasim_set_status,
+       .reset                  = vdpasim_reset,
        .get_config_size        = vdpasim_get_config_size,
        .get_config             = vdpasim_get_config,
        .set_config             = vdpasim_set_config,
diff --git a/drivers/vdpa/vdpa_user/Makefile b/drivers/vdpa/vdpa_user/Makefile
new file mode 100644 (file)
index 0000000..260e0b2
--- /dev/null
@@ -0,0 +1,5 @@
+# SPDX-License-Identifier: GPL-2.0
+
+vduse-y := vduse_dev.o iova_domain.o
+
+obj-$(CONFIG_VDPA_USER) += vduse.o
diff --git a/drivers/vdpa/vdpa_user/iova_domain.c b/drivers/vdpa/vdpa_user/iova_domain.c
new file mode 100644 (file)
index 0000000..1daae26
--- /dev/null
@@ -0,0 +1,545 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * MMU-based software IOTLB.
+ *
+ * Copyright (C) 2020-2021 Bytedance Inc. and/or its affiliates. All rights reserved.
+ *
+ * Author: Xie Yongji <xieyongji@bytedance.com>
+ *
+ */
+
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/anon_inodes.h>
+#include <linux/highmem.h>
+#include <linux/vmalloc.h>
+#include <linux/vdpa.h>
+
+#include "iova_domain.h"
+
+static int vduse_iotlb_add_range(struct vduse_iova_domain *domain,
+                                u64 start, u64 last,
+                                u64 addr, unsigned int perm,
+                                struct file *file, u64 offset)
+{
+       struct vdpa_map_file *map_file;
+       int ret;
+
+       map_file = kmalloc(sizeof(*map_file), GFP_ATOMIC);
+       if (!map_file)
+               return -ENOMEM;
+
+       map_file->file = get_file(file);
+       map_file->offset = offset;
+
+       ret = vhost_iotlb_add_range_ctx(domain->iotlb, start, last,
+                                       addr, perm, map_file);
+       if (ret) {
+               fput(map_file->file);
+               kfree(map_file);
+               return ret;
+       }
+       return 0;
+}
+
+static void vduse_iotlb_del_range(struct vduse_iova_domain *domain,
+                                 u64 start, u64 last)
+{
+       struct vdpa_map_file *map_file;
+       struct vhost_iotlb_map *map;
+
+       while ((map = vhost_iotlb_itree_first(domain->iotlb, start, last))) {
+               map_file = (struct vdpa_map_file *)map->opaque;
+               fput(map_file->file);
+               kfree(map_file);
+               vhost_iotlb_map_free(domain->iotlb, map);
+       }
+}
+
+int vduse_domain_set_map(struct vduse_iova_domain *domain,
+                        struct vhost_iotlb *iotlb)
+{
+       struct vdpa_map_file *map_file;
+       struct vhost_iotlb_map *map;
+       u64 start = 0ULL, last = ULLONG_MAX;
+       int ret;
+
+       spin_lock(&domain->iotlb_lock);
+       vduse_iotlb_del_range(domain, start, last);
+
+       for (map = vhost_iotlb_itree_first(iotlb, start, last); map;
+            map = vhost_iotlb_itree_next(map, start, last)) {
+               map_file = (struct vdpa_map_file *)map->opaque;
+               ret = vduse_iotlb_add_range(domain, map->start, map->last,
+                                           map->addr, map->perm,
+                                           map_file->file,
+                                           map_file->offset);
+               if (ret)
+                       goto err;
+       }
+       spin_unlock(&domain->iotlb_lock);
+
+       return 0;
+err:
+       vduse_iotlb_del_range(domain, start, last);
+       spin_unlock(&domain->iotlb_lock);
+       return ret;
+}
+
+void vduse_domain_clear_map(struct vduse_iova_domain *domain,
+                           struct vhost_iotlb *iotlb)
+{
+       struct vhost_iotlb_map *map;
+       u64 start = 0ULL, last = ULLONG_MAX;
+
+       spin_lock(&domain->iotlb_lock);
+       for (map = vhost_iotlb_itree_first(iotlb, start, last); map;
+            map = vhost_iotlb_itree_next(map, start, last)) {
+               vduse_iotlb_del_range(domain, map->start, map->last);
+       }
+       spin_unlock(&domain->iotlb_lock);
+}
+
+static int vduse_domain_map_bounce_page(struct vduse_iova_domain *domain,
+                                        u64 iova, u64 size, u64 paddr)
+{
+       struct vduse_bounce_map *map;
+       u64 last = iova + size - 1;
+
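+       /* Allocate bounce pages on demand and record the original physical addresses */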
+       while (iova <= last) {
+               map = &domain->bounce_maps[iova >> PAGE_SHIFT];
+               if (!map->bounce_page) {
+                       map->bounce_page = alloc_page(GFP_ATOMIC);
+                       if (!map->bounce_page)
+                               return -ENOMEM;
+               }
+               map->orig_phys = paddr;
+               paddr += PAGE_SIZE;
+               iova += PAGE_SIZE;
+       }
+       return 0;
+}
+
+static void vduse_domain_unmap_bounce_page(struct vduse_iova_domain *domain,
+                                          u64 iova, u64 size)
+{
+       struct vduse_bounce_map *map;
+       u64 last = iova + size - 1;
+
+       while (iova <= last) {
+               map = &domain->bounce_maps[iova >> PAGE_SHIFT];
+               map->orig_phys = INVALID_PHYS_ADDR;
+               iova += PAGE_SIZE;
+       }
+}
+
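+/* Copy between the original physical pages and a kernel buffer, one page at a time */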
+static void do_bounce(phys_addr_t orig, void *addr, size_t size,
+                     enum dma_data_direction dir)
+{
+       unsigned long pfn = PFN_DOWN(orig);
+       unsigned int offset = offset_in_page(orig);
+       char *buffer;
+       unsigned int sz = 0;
+
+       while (size) {
+               sz = min_t(size_t, PAGE_SIZE - offset, size);
+
+               buffer = kmap_atomic(pfn_to_page(pfn));
+               if (dir == DMA_TO_DEVICE)
+                       memcpy(addr, buffer + offset, sz);
+               else
+                       memcpy(buffer + offset, addr, sz);
+               kunmap_atomic(buffer);
+
+               size -= sz;
+               pfn++;
+               addr += sz;
+               offset = 0;
+       }
+}
+
+static void vduse_domain_bounce(struct vduse_iova_domain *domain,
+                               dma_addr_t iova, size_t size,
+                               enum dma_data_direction dir)
+{
+       struct vduse_bounce_map *map;
+       unsigned int offset;
+       void *addr;
+       size_t sz;
+
+       if (iova >= domain->bounce_size)
+               return;
+
+       while (size) {
+               map = &domain->bounce_maps[iova >> PAGE_SHIFT];
+               offset = offset_in_page(iova);
+               sz = min_t(size_t, PAGE_SIZE - offset, size);
+
+               if (WARN_ON(!map->bounce_page ||
+                           map->orig_phys == INVALID_PHYS_ADDR))
+                       return;
+
+               addr = page_address(map->bounce_page) + offset;
+               do_bounce(map->orig_phys + offset, addr, sz, dir);
+               size -= sz;
+               iova += sz;
+       }
+}
+
+static struct page *
+vduse_domain_get_coherent_page(struct vduse_iova_domain *domain, u64 iova)
+{
+       u64 start = iova & PAGE_MASK;
+       u64 last = start + PAGE_SIZE - 1;
+       struct vhost_iotlb_map *map;
+       struct page *page = NULL;
+
+       spin_lock(&domain->iotlb_lock);
+       map = vhost_iotlb_itree_first(domain->iotlb, start, last);
+       if (!map)
+               goto out;
+
+       page = pfn_to_page((map->addr + iova - map->start) >> PAGE_SHIFT);
+       get_page(page);
+out:
+       spin_unlock(&domain->iotlb_lock);
+
+       return page;
+}
+
+static struct page *
+vduse_domain_get_bounce_page(struct vduse_iova_domain *domain, u64 iova)
+{
+       struct vduse_bounce_map *map;
+       struct page *page = NULL;
+
+       spin_lock(&domain->iotlb_lock);
+       map = &domain->bounce_maps[iova >> PAGE_SHIFT];
+       if (!map->bounce_page)
+               goto out;
+
+       page = map->bounce_page;
+       get_page(page);
+out:
+       spin_unlock(&domain->iotlb_lock);
+
+       return page;
+}
+
+static void
+vduse_domain_free_bounce_pages(struct vduse_iova_domain *domain)
+{
+       struct vduse_bounce_map *map;
+       unsigned long pfn, bounce_pfns;
+
+       bounce_pfns = domain->bounce_size >> PAGE_SHIFT;
+
+       for (pfn = 0; pfn < bounce_pfns; pfn++) {
+               map = &domain->bounce_maps[pfn];
+               if (WARN_ON(map->orig_phys != INVALID_PHYS_ADDR))
+                       continue;
+
+               if (!map->bounce_page)
+                       continue;
+
+               __free_page(map->bounce_page);
+               map->bounce_page = NULL;
+       }
+}
+
+void vduse_domain_reset_bounce_map(struct vduse_iova_domain *domain)
+{
+       if (!domain->bounce_map)
+               return;
+
+       spin_lock(&domain->iotlb_lock);
+       if (!domain->bounce_map)
+               goto unlock;
+
+       vduse_iotlb_del_range(domain, 0, domain->bounce_size - 1);
+       domain->bounce_map = 0;
+unlock:
+       spin_unlock(&domain->iotlb_lock);
+}
+
+static int vduse_domain_init_bounce_map(struct vduse_iova_domain *domain)
+{
+       int ret = 0;
+
+       if (domain->bounce_map)
+               return 0;
+
+       spin_lock(&domain->iotlb_lock);
+       if (domain->bounce_map)
+               goto unlock;
+
+       ret = vduse_iotlb_add_range(domain, 0, domain->bounce_size - 1,
+                                   0, VHOST_MAP_RW, domain->file, 0);
+       if (ret)
+               goto unlock;
+
+       domain->bounce_map = 1;
+unlock:
+       spin_unlock(&domain->iotlb_lock);
+       return ret;
+}
+
+static dma_addr_t
+vduse_domain_alloc_iova(struct iova_domain *iovad,
+                       unsigned long size, unsigned long limit)
+{
+       unsigned long shift = iova_shift(iovad);
+       unsigned long iova_len = iova_align(iovad, size) >> shift;
+       unsigned long iova_pfn;
+
+       /*
+        * Freeing non-power-of-two-sized allocations back into the IOVA caches
+        * will come back to bite us badly, so we have to waste a bit of space
+        * rounding up anything cacheable to make sure that can't happen. The
+        * order of the unadjusted size will still match upon freeing.
+        */
+       if (iova_len < (1 << (IOVA_RANGE_CACHE_MAX_SIZE - 1)))
+               iova_len = roundup_pow_of_two(iova_len);
+       iova_pfn = alloc_iova_fast(iovad, iova_len, limit >> shift, true);
+
+       return iova_pfn << shift;
+}
+
+static void vduse_domain_free_iova(struct iova_domain *iovad,
+                                  dma_addr_t iova, size_t size)
+{
+       unsigned long shift = iova_shift(iovad);
+       unsigned long iova_len = iova_align(iovad, size) >> shift;
+
+       free_iova_fast(iovad, iova >> shift, iova_len);
+}
+
+dma_addr_t vduse_domain_map_page(struct vduse_iova_domain *domain,
+                                struct page *page, unsigned long offset,
+                                size_t size, enum dma_data_direction dir,
+                                unsigned long attrs)
+{
+       struct iova_domain *iovad = &domain->stream_iovad;
+       unsigned long limit = domain->bounce_size - 1;
+       phys_addr_t pa = page_to_phys(page) + offset;
+       dma_addr_t iova = vduse_domain_alloc_iova(iovad, size, limit);
+
+       if (!iova)
+               return DMA_MAPPING_ERROR;
+
+       if (vduse_domain_init_bounce_map(domain))
+               goto err;
+
+       if (vduse_domain_map_bounce_page(domain, (u64)iova, (u64)size, pa))
+               goto err;
+
+       if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)
+               vduse_domain_bounce(domain, iova, size, DMA_TO_DEVICE);
+
+       return iova;
+err:
+       vduse_domain_free_iova(iovad, iova, size);
+       return DMA_MAPPING_ERROR;
+}
+
+void vduse_domain_unmap_page(struct vduse_iova_domain *domain,
+                            dma_addr_t dma_addr, size_t size,
+                            enum dma_data_direction dir, unsigned long attrs)
+{
+       struct iova_domain *iovad = &domain->stream_iovad;
+
+       if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
+               vduse_domain_bounce(domain, dma_addr, size, DMA_FROM_DEVICE);
+
+       vduse_domain_unmap_bounce_page(domain, (u64)dma_addr, (u64)size);
+       vduse_domain_free_iova(iovad, dma_addr, size);
+}
+
+void *vduse_domain_alloc_coherent(struct vduse_iova_domain *domain,
+                                 size_t size, dma_addr_t *dma_addr,
+                                 gfp_t flag, unsigned long attrs)
+{
+       struct iova_domain *iovad = &domain->consistent_iovad;
+       unsigned long limit = domain->iova_limit;
+       dma_addr_t iova = vduse_domain_alloc_iova(iovad, size, limit);
+       void *orig = alloc_pages_exact(size, flag);
+
+       if (!iova || !orig)
+               goto err;
+
+       spin_lock(&domain->iotlb_lock);
+       if (vduse_iotlb_add_range(domain, (u64)iova, (u64)iova + size - 1,
+                                 virt_to_phys(orig), VHOST_MAP_RW,
+                                 domain->file, (u64)iova)) {
+               spin_unlock(&domain->iotlb_lock);
+               goto err;
+       }
+       spin_unlock(&domain->iotlb_lock);
+
+       *dma_addr = iova;
+
+       return orig;
+err:
+       *dma_addr = DMA_MAPPING_ERROR;
+       if (orig)
+               free_pages_exact(orig, size);
+       if (iova)
+               vduse_domain_free_iova(iovad, iova, size);
+
+       return NULL;
+}
+
+void vduse_domain_free_coherent(struct vduse_iova_domain *domain, size_t size,
+                               void *vaddr, dma_addr_t dma_addr,
+                               unsigned long attrs)
+{
+       struct iova_domain *iovad = &domain->consistent_iovad;
+       struct vhost_iotlb_map *map;
+       struct vdpa_map_file *map_file;
+       phys_addr_t pa;
+
+       spin_lock(&domain->iotlb_lock);
+       map = vhost_iotlb_itree_first(domain->iotlb, (u64)dma_addr,
+                                     (u64)dma_addr + size - 1);
+       if (WARN_ON(!map)) {
+               spin_unlock(&domain->iotlb_lock);
+               return;
+       }
+       map_file = (struct vdpa_map_file *)map->opaque;
+       fput(map_file->file);
+       kfree(map_file);
+       pa = map->addr;
+       vhost_iotlb_map_free(domain->iotlb, map);
+       spin_unlock(&domain->iotlb_lock);
+
+       vduse_domain_free_iova(iovad, dma_addr, size);
+       free_pages_exact(phys_to_virt(pa), size);
+}
+
+static vm_fault_t vduse_domain_mmap_fault(struct vm_fault *vmf)
+{
+       struct vduse_iova_domain *domain = vmf->vma->vm_private_data;
+       unsigned long iova = vmf->pgoff << PAGE_SHIFT;
+       struct page *page;
+
+       if (!domain)
+               return VM_FAULT_SIGBUS;
+
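+       /* IOVAs below bounce_size are backed by bounce pages; the rest by coherent allocations */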
+       if (iova < domain->bounce_size)
+               page = vduse_domain_get_bounce_page(domain, iova);
+       else
+               page = vduse_domain_get_coherent_page(domain, iova);
+
+       if (!page)
+               return VM_FAULT_SIGBUS;
+
+       vmf->page = page;
+
+       return 0;
+}
+
+static const struct vm_operations_struct vduse_domain_mmap_ops = {
+       .fault = vduse_domain_mmap_fault,
+};
+
+static int vduse_domain_mmap(struct file *file, struct vm_area_struct *vma)
+{
+       struct vduse_iova_domain *domain = file->private_data;
+
+       vma->vm_flags |= VM_DONTDUMP | VM_DONTEXPAND;
+       vma->vm_private_data = domain;
+       vma->vm_ops = &vduse_domain_mmap_ops;
+
+       return 0;
+}
+
+static int vduse_domain_release(struct inode *inode, struct file *file)
+{
+       struct vduse_iova_domain *domain = file->private_data;
+
+       spin_lock(&domain->iotlb_lock);
+       vduse_iotlb_del_range(domain, 0, ULLONG_MAX);
+       vduse_domain_free_bounce_pages(domain);
+       spin_unlock(&domain->iotlb_lock);
+       put_iova_domain(&domain->stream_iovad);
+       put_iova_domain(&domain->consistent_iovad);
+       vhost_iotlb_free(domain->iotlb);
+       vfree(domain->bounce_maps);
+       kfree(domain);
+
+       return 0;
+}
+
+static const struct file_operations vduse_domain_fops = {
+       .owner = THIS_MODULE,
+       .mmap = vduse_domain_mmap,
+       .release = vduse_domain_release,
+};
+
+void vduse_domain_destroy(struct vduse_iova_domain *domain)
+{
+       fput(domain->file);
+}
+
+struct vduse_iova_domain *
+vduse_domain_create(unsigned long iova_limit, size_t bounce_size)
+{
+       struct vduse_iova_domain *domain;
+       struct file *file;
+       struct vduse_bounce_map *map;
+       unsigned long pfn, bounce_pfns;
+
+       bounce_pfns = PAGE_ALIGN(bounce_size) >> PAGE_SHIFT;
+       if (iova_limit <= bounce_size)
+               return NULL;
+
+       domain = kzalloc(sizeof(*domain), GFP_KERNEL);
+       if (!domain)
+               return NULL;
+
+       domain->iotlb = vhost_iotlb_alloc(0, 0);
+       if (!domain->iotlb)
+               goto err_iotlb;
+
+       domain->iova_limit = iova_limit;
+       domain->bounce_size = PAGE_ALIGN(bounce_size);
+       domain->bounce_maps = vzalloc(bounce_pfns *
+                               sizeof(struct vduse_bounce_map));
+       if (!domain->bounce_maps)
+               goto err_map;
+
+       for (pfn = 0; pfn < bounce_pfns; pfn++) {
+               map = &domain->bounce_maps[pfn];
+               map->orig_phys = INVALID_PHYS_ADDR;
+       }
+       file = anon_inode_getfile("[vduse-domain]", &vduse_domain_fops,
+                               domain, O_RDWR);
+       if (IS_ERR(file))
+               goto err_file;
+
+       domain->file = file;
+       spin_lock_init(&domain->iotlb_lock);
+       init_iova_domain(&domain->stream_iovad,
+                       PAGE_SIZE, IOVA_START_PFN);
+       init_iova_domain(&domain->consistent_iovad,
+                       PAGE_SIZE, bounce_pfns);
+
+       return domain;
+err_file:
+       vfree(domain->bounce_maps);
+err_map:
+       vhost_iotlb_free(domain->iotlb);
+err_iotlb:
+       kfree(domain);
+       return NULL;
+}
+
+int vduse_domain_init(void)
+{
+       return iova_cache_get();
+}
+
+void vduse_domain_exit(void)
+{
+       iova_cache_put();
+}
diff --git a/drivers/vdpa/vdpa_user/iova_domain.h b/drivers/vdpa/vdpa_user/iova_domain.h
new file mode 100644 (file)
index 0000000..2722d9b
--- /dev/null
@@ -0,0 +1,73 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * MMU-based software IOTLB.
+ *
+ * Copyright (C) 2020-2021 Bytedance Inc. and/or its affiliates. All rights reserved.
+ *
+ * Author: Xie Yongji <xieyongji@bytedance.com>
+ *
+ */
+
+#ifndef _VDUSE_IOVA_DOMAIN_H
+#define _VDUSE_IOVA_DOMAIN_H
+
+#include <linux/iova.h>
+#include <linux/dma-mapping.h>
+#include <linux/vhost_iotlb.h>
+
+#define IOVA_START_PFN 1
+
+#define INVALID_PHYS_ADDR (~(phys_addr_t)0)
+
+struct vduse_bounce_map {
+       struct page *bounce_page;
+       u64 orig_phys;
+};
+
+struct vduse_iova_domain {
+       struct iova_domain stream_iovad;
+       struct iova_domain consistent_iovad;
+       struct vduse_bounce_map *bounce_maps;
+       size_t bounce_size;
+       unsigned long iova_limit;
+       int bounce_map;
+       struct vhost_iotlb *iotlb;
+       spinlock_t iotlb_lock;
+       struct file *file;
+};
+
+int vduse_domain_set_map(struct vduse_iova_domain *domain,
+                        struct vhost_iotlb *iotlb);
+
+void vduse_domain_clear_map(struct vduse_iova_domain *domain,
+                           struct vhost_iotlb *iotlb);
+
+dma_addr_t vduse_domain_map_page(struct vduse_iova_domain *domain,
+                                struct page *page, unsigned long offset,
+                                size_t size, enum dma_data_direction dir,
+                                unsigned long attrs);
+
+void vduse_domain_unmap_page(struct vduse_iova_domain *domain,
+                            dma_addr_t dma_addr, size_t size,
+                            enum dma_data_direction dir, unsigned long attrs);
+
+void *vduse_domain_alloc_coherent(struct vduse_iova_domain *domain,
+                                 size_t size, dma_addr_t *dma_addr,
+                                 gfp_t flag, unsigned long attrs);
+
+void vduse_domain_free_coherent(struct vduse_iova_domain *domain, size_t size,
+                               void *vaddr, dma_addr_t dma_addr,
+                               unsigned long attrs);
+
+void vduse_domain_reset_bounce_map(struct vduse_iova_domain *domain);
+
+void vduse_domain_destroy(struct vduse_iova_domain *domain);
+
+struct vduse_iova_domain *vduse_domain_create(unsigned long iova_limit,
+                                             size_t bounce_size);
+
+int vduse_domain_init(void);
+
+void vduse_domain_exit(void);
+
+#endif /* _VDUSE_IOVA_DOMAIN_H */
diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
new file mode 100644 (file)
index 0000000..29a38ec
--- /dev/null
@@ -0,0 +1,1641 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * VDUSE: vDPA Device in Userspace
+ *
+ * Copyright (C) 2020-2021 Bytedance Inc. and/or its affiliates. All rights reserved.
+ *
+ * Author: Xie Yongji <xieyongji@bytedance.com>
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/cdev.h>
+#include <linux/device.h>
+#include <linux/eventfd.h>
+#include <linux/slab.h>
+#include <linux/wait.h>
+#include <linux/dma-map-ops.h>
+#include <linux/poll.h>
+#include <linux/file.h>
+#include <linux/uio.h>
+#include <linux/vdpa.h>
+#include <linux/nospec.h>
+#include <uapi/linux/vduse.h>
+#include <uapi/linux/vdpa.h>
+#include <uapi/linux/virtio_config.h>
+#include <uapi/linux/virtio_ids.h>
+#include <uapi/linux/virtio_blk.h>
+#include <linux/mod_devicetable.h>
+
+#include "iova_domain.h"
+
+#define DRV_AUTHOR   "Yongji Xie <xieyongji@bytedance.com>"
+#define DRV_DESC     "vDPA Device in Userspace"
+#define DRV_LICENSE  "GPL v2"
+
+#define VDUSE_DEV_MAX (1U << MINORBITS)
+#define VDUSE_BOUNCE_SIZE (64 * 1024 * 1024)
+#define VDUSE_IOVA_SIZE (128 * 1024 * 1024)
+#define VDUSE_MSG_DEFAULT_TIMEOUT 30
+
+struct vduse_virtqueue {
+       u16 index;
+       u16 num_max;
+       u32 num;
+       u64 desc_addr;
+       u64 driver_addr;
+       u64 device_addr;
+       struct vdpa_vq_state state;
+       bool ready;
+       bool kicked;
+       spinlock_t kick_lock;
+       spinlock_t irq_lock;
+       struct eventfd_ctx *kickfd;
+       struct vdpa_callback cb;
+       struct work_struct inject;
+       struct work_struct kick;
+};
+
+struct vduse_dev;
+
+struct vduse_vdpa {
+       struct vdpa_device vdpa;
+       struct vduse_dev *dev;
+};
+
+struct vduse_dev {
+       struct vduse_vdpa *vdev;
+       struct device *dev;
+       struct vduse_virtqueue *vqs;
+       struct vduse_iova_domain *domain;
+       char *name;
+       struct mutex lock;
+       spinlock_t msg_lock;
+       u64 msg_unique;
+       u32 msg_timeout;
+       wait_queue_head_t waitq;
+       struct list_head send_list;
+       struct list_head recv_list;
+       struct vdpa_callback config_cb;
+       struct work_struct inject;
+       spinlock_t irq_lock;
+       int minor;
+       bool broken;
+       bool connected;
+       u64 api_version;
+       u64 device_features;
+       u64 driver_features;
+       u32 device_id;
+       u32 vendor_id;
+       u32 generation;
+       u32 config_size;
+       void *config;
+       u8 status;
+       u32 vq_num;
+       u32 vq_align;
+};
+
+struct vduse_dev_msg {
+       struct vduse_dev_request req;
+       struct vduse_dev_response resp;
+       struct list_head list;
+       wait_queue_head_t waitq;
+       bool completed;
+};
+
+struct vduse_control {
+       u64 api_version;
+};
+
+static DEFINE_MUTEX(vduse_lock);
+static DEFINE_IDR(vduse_idr);
+
+static dev_t vduse_major;
+static struct class *vduse_class;
+static struct cdev vduse_ctrl_cdev;
+static struct cdev vduse_cdev;
+static struct workqueue_struct *vduse_irq_wq;
+
+static u32 allowed_device_id[] = {
+       VIRTIO_ID_BLOCK,
+};
+
+static inline struct vduse_dev *vdpa_to_vduse(struct vdpa_device *vdpa)
+{
+       struct vduse_vdpa *vdev = container_of(vdpa, struct vduse_vdpa, vdpa);
+
+       return vdev->dev;
+}
+
+static inline struct vduse_dev *dev_to_vduse(struct device *dev)
+{
+       struct vdpa_device *vdpa = dev_to_vdpa(dev);
+
+       return vdpa_to_vduse(vdpa);
+}
+
+static struct vduse_dev_msg *vduse_find_msg(struct list_head *head,
+                                           uint32_t request_id)
+{
+       struct vduse_dev_msg *msg;
+
+       list_for_each_entry(msg, head, list) {
+               if (msg->req.request_id == request_id) {
+                       list_del(&msg->list);
+                       return msg;
+               }
+       }
+
+       return NULL;
+}
+
+static struct vduse_dev_msg *vduse_dequeue_msg(struct list_head *head)
+{
+       struct vduse_dev_msg *msg = NULL;
+
+       if (!list_empty(head)) {
+               msg = list_first_entry(head, struct vduse_dev_msg, list);
+               list_del(&msg->list);
+       }
+
+       return msg;
+}
+
+static void vduse_enqueue_msg(struct list_head *head,
+                             struct vduse_dev_msg *msg)
+{
+       list_add_tail(&msg->list, head);
+}
+
+static void vduse_dev_broken(struct vduse_dev *dev)
+{
+       struct vduse_dev_msg *msg, *tmp;
+
+       if (unlikely(dev->broken))
+               return;
+
+       list_splice_init(&dev->recv_list, &dev->send_list);
+       list_for_each_entry_safe(msg, tmp, &dev->send_list, list) {
+               list_del(&msg->list);
+               msg->completed = 1;
+               msg->resp.result = VDUSE_REQ_RESULT_FAILED;
+               wake_up(&msg->waitq);
+       }
+       dev->broken = true;
+       wake_up(&dev->waitq);
+}
+
+static int vduse_dev_msg_sync(struct vduse_dev *dev,
+                             struct vduse_dev_msg *msg)
+{
+       int ret;
+
+       if (unlikely(dev->broken))
+               return -EIO;
+
+       init_waitqueue_head(&msg->waitq);
+       spin_lock(&dev->msg_lock);
+       if (unlikely(dev->broken)) {
+               spin_unlock(&dev->msg_lock);
+               return -EIO;
+       }
+       msg->req.request_id = dev->msg_unique++;
+       vduse_enqueue_msg(&dev->send_list, msg);
+       wake_up(&dev->waitq);
+       spin_unlock(&dev->msg_lock);
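+       /* Wait for userspace to complete the request, optionally bounded by msg_timeout */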
+       if (dev->msg_timeout)
+               ret = wait_event_killable_timeout(msg->waitq, msg->completed,
+                                                 (long)dev->msg_timeout * HZ);
+       else
+               ret = wait_event_killable(msg->waitq, msg->completed);
+
+       spin_lock(&dev->msg_lock);
+       if (!msg->completed) {
+               list_del(&msg->list);
+               msg->resp.result = VDUSE_REQ_RESULT_FAILED;
+               /* Mark the device as malfunctioning when there is a timeout */
+               if (!ret)
+                       vduse_dev_broken(dev);
+       }
+       ret = (msg->resp.result == VDUSE_REQ_RESULT_OK) ? 0 : -EIO;
+       spin_unlock(&dev->msg_lock);
+
+       return ret;
+}
+
+static int vduse_dev_get_vq_state_packed(struct vduse_dev *dev,
+                                        struct vduse_virtqueue *vq,
+                                        struct vdpa_vq_state_packed *packed)
+{
+       struct vduse_dev_msg msg = { 0 };
+       int ret;
+
+       msg.req.type = VDUSE_GET_VQ_STATE;
+       msg.req.vq_state.index = vq->index;
+
+       ret = vduse_dev_msg_sync(dev, &msg);
+       if (ret)
+               return ret;
+
+       packed->last_avail_counter =
+                       msg.resp.vq_state.packed.last_avail_counter & 0x0001;
+       packed->last_avail_idx =
+                       msg.resp.vq_state.packed.last_avail_idx & 0x7FFF;
+       packed->last_used_counter =
+                       msg.resp.vq_state.packed.last_used_counter & 0x0001;
+       packed->last_used_idx =
+                       msg.resp.vq_state.packed.last_used_idx & 0x7FFF;
+
+       return 0;
+}
+
+static int vduse_dev_get_vq_state_split(struct vduse_dev *dev,
+                                       struct vduse_virtqueue *vq,
+                                       struct vdpa_vq_state_split *split)
+{
+       struct vduse_dev_msg msg = { 0 };
+       int ret;
+
+       msg.req.type = VDUSE_GET_VQ_STATE;
+       msg.req.vq_state.index = vq->index;
+
+       ret = vduse_dev_msg_sync(dev, &msg);
+       if (ret)
+               return ret;
+
+       split->avail_index = msg.resp.vq_state.split.avail_index;
+
+       return 0;
+}
+
+static int vduse_dev_set_status(struct vduse_dev *dev, u8 status)
+{
+       struct vduse_dev_msg msg = { 0 };
+
+       msg.req.type = VDUSE_SET_STATUS;
+       msg.req.s.status = status;
+
+       return vduse_dev_msg_sync(dev, &msg);
+}
+
+static int vduse_dev_update_iotlb(struct vduse_dev *dev,
+                                 u64 start, u64 last)
+{
+       struct vduse_dev_msg msg = { 0 };
+
+       if (last < start)
+               return -EINVAL;
+
+       msg.req.type = VDUSE_UPDATE_IOTLB;
+       msg.req.iova.start = start;
+       msg.req.iova.last = last;
+
+       return vduse_dev_msg_sync(dev, &msg);
+}
+
+static ssize_t vduse_dev_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+       struct file *file = iocb->ki_filp;
+       struct vduse_dev *dev = file->private_data;
+       struct vduse_dev_msg *msg;
+       int size = sizeof(struct vduse_dev_request);
+       ssize_t ret;
+
+       if (iov_iter_count(to) < size)
+               return -EINVAL;
+
+       spin_lock(&dev->msg_lock);
+       while (1) {
+               msg = vduse_dequeue_msg(&dev->send_list);
+               if (msg)
+                       break;
+
+               ret = -EAGAIN;
+               if (file->f_flags & O_NONBLOCK)
+                       goto unlock;
+
+               spin_unlock(&dev->msg_lock);
+               ret = wait_event_interruptible_exclusive(dev->waitq,
+                                       !list_empty(&dev->send_list));
+               if (ret)
+                       return ret;
+
+               spin_lock(&dev->msg_lock);
+       }
+       spin_unlock(&dev->msg_lock);
+       ret = copy_to_iter(&msg->req, size, to);
+       spin_lock(&dev->msg_lock);
+       if (ret != size) {
+               ret = -EFAULT;
+               vduse_enqueue_msg(&dev->send_list, msg);
+               goto unlock;
+       }
+       vduse_enqueue_msg(&dev->recv_list, msg);
+unlock:
+       spin_unlock(&dev->msg_lock);
+
+       return ret;
+}
+
+static bool is_mem_zero(const char *ptr, int size)
+{
+       int i;
+
+       for (i = 0; i < size; i++) {
+               if (ptr[i])
+                       return false;
+       }
+       return true;
+}
+
+static ssize_t vduse_dev_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+       struct file *file = iocb->ki_filp;
+       struct vduse_dev *dev = file->private_data;
+       struct vduse_dev_response resp;
+       struct vduse_dev_msg *msg;
+       size_t ret;
+
+       ret = copy_from_iter(&resp, sizeof(resp), from);
+       if (ret != sizeof(resp))
+               return -EINVAL;
+
+       if (!is_mem_zero((const char *)resp.reserved, sizeof(resp.reserved)))
+               return -EINVAL;
+
+       spin_lock(&dev->msg_lock);
+       msg = vduse_find_msg(&dev->recv_list, resp.request_id);
+       if (!msg) {
+               ret = -ENOENT;
+               goto unlock;
+       }
+
+       memcpy(&msg->resp, &resp, sizeof(resp));
+       msg->completed = 1;
+       wake_up(&msg->waitq);
+unlock:
+       spin_unlock(&dev->msg_lock);
+
+       return ret;
+}
+
+static __poll_t vduse_dev_poll(struct file *file, poll_table *wait)
+{
+       struct vduse_dev *dev = file->private_data;
+       __poll_t mask = 0;
+
+       poll_wait(file, &dev->waitq, wait);
+
+       spin_lock(&dev->msg_lock);
+
+       if (unlikely(dev->broken))
+               mask |= EPOLLERR;
+       if (!list_empty(&dev->send_list))
+               mask |= EPOLLIN | EPOLLRDNORM;
+       if (!list_empty(&dev->recv_list))
+               mask |= EPOLLOUT | EPOLLWRNORM;
+
+       spin_unlock(&dev->msg_lock);
+
+       return mask;
+}
+
+static void vduse_dev_reset(struct vduse_dev *dev)
+{
+       int i;
+       struct vduse_iova_domain *domain = dev->domain;
+
+       /* The coherent mappings are handled in vduse_dev_free_coherent() */
+       if (domain->bounce_map)
+               vduse_domain_reset_bounce_map(domain);
+
+       dev->status = 0;
+       dev->driver_features = 0;
+       dev->generation++;
+       spin_lock(&dev->irq_lock);
+       dev->config_cb.callback = NULL;
+       dev->config_cb.private = NULL;
+       spin_unlock(&dev->irq_lock);
+       flush_work(&dev->inject);
+
+       for (i = 0; i < dev->vq_num; i++) {
+               struct vduse_virtqueue *vq = &dev->vqs[i];
+
+               vq->ready = false;
+               vq->desc_addr = 0;
+               vq->driver_addr = 0;
+               vq->device_addr = 0;
+               vq->num = 0;
+               memset(&vq->state, 0, sizeof(vq->state));
+
+               spin_lock(&vq->kick_lock);
+               vq->kicked = false;
+               if (vq->kickfd)
+                       eventfd_ctx_put(vq->kickfd);
+               vq->kickfd = NULL;
+               spin_unlock(&vq->kick_lock);
+
+               spin_lock(&vq->irq_lock);
+               vq->cb.callback = NULL;
+               vq->cb.private = NULL;
+               spin_unlock(&vq->irq_lock);
+               flush_work(&vq->inject);
+               flush_work(&vq->kick);
+       }
+}
+
+static int vduse_vdpa_set_vq_address(struct vdpa_device *vdpa, u16 idx,
+                               u64 desc_area, u64 driver_area,
+                               u64 device_area)
+{
+       struct vduse_dev *dev = vdpa_to_vduse(vdpa);
+       struct vduse_virtqueue *vq = &dev->vqs[idx];
+
+       vq->desc_addr = desc_area;
+       vq->driver_addr = driver_area;
+       vq->device_addr = device_area;
+
+       return 0;
+}
+
+static void vduse_vq_kick(struct vduse_virtqueue *vq)
+{
+       spin_lock(&vq->kick_lock);
+       if (!vq->ready)
+               goto unlock;
+
+       if (vq->kickfd)
+               eventfd_signal(vq->kickfd, 1);
+       else
+               vq->kicked = true;
+unlock:
+       spin_unlock(&vq->kick_lock);
+}
+
+static void vduse_vq_kick_work(struct work_struct *work)
+{
+       struct vduse_virtqueue *vq = container_of(work,
+                                       struct vduse_virtqueue, kick);
+
+       vduse_vq_kick(vq);
+}
+
+static void vduse_vdpa_kick_vq(struct vdpa_device *vdpa, u16 idx)
+{
+       struct vduse_dev *dev = vdpa_to_vduse(vdpa);
+       struct vduse_virtqueue *vq = &dev->vqs[idx];
+
+       if (!eventfd_signal_allowed()) {
+               schedule_work(&vq->kick);
+               return;
+       }
+       vduse_vq_kick(vq);
+}
+
+static void vduse_vdpa_set_vq_cb(struct vdpa_device *vdpa, u16 idx,
+                             struct vdpa_callback *cb)
+{
+       struct vduse_dev *dev = vdpa_to_vduse(vdpa);
+       struct vduse_virtqueue *vq = &dev->vqs[idx];
+
+       spin_lock(&vq->irq_lock);
+       vq->cb.callback = cb->callback;
+       vq->cb.private = cb->private;
+       spin_unlock(&vq->irq_lock);
+}
+
+static void vduse_vdpa_set_vq_num(struct vdpa_device *vdpa, u16 idx, u32 num)
+{
+       struct vduse_dev *dev = vdpa_to_vduse(vdpa);
+       struct vduse_virtqueue *vq = &dev->vqs[idx];
+
+       vq->num = num;
+}
+
+static void vduse_vdpa_set_vq_ready(struct vdpa_device *vdpa,
+                                       u16 idx, bool ready)
+{
+       struct vduse_dev *dev = vdpa_to_vduse(vdpa);
+       struct vduse_virtqueue *vq = &dev->vqs[idx];
+
+       vq->ready = ready;
+}
+
+static bool vduse_vdpa_get_vq_ready(struct vdpa_device *vdpa, u16 idx)
+{
+       struct vduse_dev *dev = vdpa_to_vduse(vdpa);
+       struct vduse_virtqueue *vq = &dev->vqs[idx];
+
+       return vq->ready;
+}
+
+static int vduse_vdpa_set_vq_state(struct vdpa_device *vdpa, u16 idx,
+                               const struct vdpa_vq_state *state)
+{
+       struct vduse_dev *dev = vdpa_to_vduse(vdpa);
+       struct vduse_virtqueue *vq = &dev->vqs[idx];
+
+       if (dev->driver_features & BIT_ULL(VIRTIO_F_RING_PACKED)) {
+               vq->state.packed.last_avail_counter =
+                               state->packed.last_avail_counter;
+               vq->state.packed.last_avail_idx = state->packed.last_avail_idx;
+               vq->state.packed.last_used_counter =
+                               state->packed.last_used_counter;
+               vq->state.packed.last_used_idx = state->packed.last_used_idx;
+       } else
+               vq->state.split.avail_index = state->split.avail_index;
+
+       return 0;
+}
+
+static int vduse_vdpa_get_vq_state(struct vdpa_device *vdpa, u16 idx,
+                               struct vdpa_vq_state *state)
+{
+       struct vduse_dev *dev = vdpa_to_vduse(vdpa);
+       struct vduse_virtqueue *vq = &dev->vqs[idx];
+
+       if (dev->driver_features & BIT_ULL(VIRTIO_F_RING_PACKED))
+               return vduse_dev_get_vq_state_packed(dev, vq, &state->packed);
+
+       return vduse_dev_get_vq_state_split(dev, vq, &state->split);
+}
+
+static u32 vduse_vdpa_get_vq_align(struct vdpa_device *vdpa)
+{
+       struct vduse_dev *dev = vdpa_to_vduse(vdpa);
+
+       return dev->vq_align;
+}
+
+static u64 vduse_vdpa_get_features(struct vdpa_device *vdpa)
+{
+       struct vduse_dev *dev = vdpa_to_vduse(vdpa);
+
+       return dev->device_features;
+}
+
+static int vduse_vdpa_set_features(struct vdpa_device *vdpa, u64 features)
+{
+       struct vduse_dev *dev = vdpa_to_vduse(vdpa);
+
+       dev->driver_features = features;
+       return 0;
+}
+
+static void vduse_vdpa_set_config_cb(struct vdpa_device *vdpa,
+                                 struct vdpa_callback *cb)
+{
+       struct vduse_dev *dev = vdpa_to_vduse(vdpa);
+
+       spin_lock(&dev->irq_lock);
+       dev->config_cb.callback = cb->callback;
+       dev->config_cb.private = cb->private;
+       spin_unlock(&dev->irq_lock);
+}
+
+static u16 vduse_vdpa_get_vq_num_max(struct vdpa_device *vdpa)
+{
+       struct vduse_dev *dev = vdpa_to_vduse(vdpa);
+       u16 num_max = 0;
+       int i;
+
+       for (i = 0; i < dev->vq_num; i++)
+               if (num_max < dev->vqs[i].num_max)
+                       num_max = dev->vqs[i].num_max;
+
+       return num_max;
+}
+
+static u32 vduse_vdpa_get_device_id(struct vdpa_device *vdpa)
+{
+       struct vduse_dev *dev = vdpa_to_vduse(vdpa);
+
+       return dev->device_id;
+}
+
+static u32 vduse_vdpa_get_vendor_id(struct vdpa_device *vdpa)
+{
+       struct vduse_dev *dev = vdpa_to_vduse(vdpa);
+
+       return dev->vendor_id;
+}
+
+static u8 vduse_vdpa_get_status(struct vdpa_device *vdpa)
+{
+       struct vduse_dev *dev = vdpa_to_vduse(vdpa);
+
+       return dev->status;
+}
+
+static void vduse_vdpa_set_status(struct vdpa_device *vdpa, u8 status)
+{
+       struct vduse_dev *dev = vdpa_to_vduse(vdpa);
+
+       if (vduse_dev_set_status(dev, status))
+               return;
+
+       dev->status = status;
+}
+
+static size_t vduse_vdpa_get_config_size(struct vdpa_device *vdpa)
+{
+       struct vduse_dev *dev = vdpa_to_vduse(vdpa);
+
+       return dev->config_size;
+}
+
+static void vduse_vdpa_get_config(struct vdpa_device *vdpa, unsigned int offset,
+                                 void *buf, unsigned int len)
+{
+       struct vduse_dev *dev = vdpa_to_vduse(vdpa);
+
+       if (len > dev->config_size - offset)
+               return;
+
+       memcpy(buf, dev->config + offset, len);
+}
+
+static void vduse_vdpa_set_config(struct vdpa_device *vdpa, unsigned int offset,
+                       const void *buf, unsigned int len)
+{
+       /* For now, only a read-only configuration space is supported */
+}
+
+static int vduse_vdpa_reset(struct vdpa_device *vdpa)
+{
+       struct vduse_dev *dev = vdpa_to_vduse(vdpa);
+
+       if (vduse_dev_set_status(dev, 0))
+               return -EIO;
+
+       vduse_dev_reset(dev);
+
+       return 0;
+}
+
+static u32 vduse_vdpa_get_generation(struct vdpa_device *vdpa)
+{
+       struct vduse_dev *dev = vdpa_to_vduse(vdpa);
+
+       return dev->generation;
+}
+
+static int vduse_vdpa_set_map(struct vdpa_device *vdpa,
+                               struct vhost_iotlb *iotlb)
+{
+       struct vduse_dev *dev = vdpa_to_vduse(vdpa);
+       int ret;
+
+       ret = vduse_domain_set_map(dev->domain, iotlb);
+       if (ret)
+               return ret;
+
+       ret = vduse_dev_update_iotlb(dev, 0ULL, ULLONG_MAX);
+       if (ret) {
+               vduse_domain_clear_map(dev->domain, iotlb);
+               return ret;
+       }
+
+       return 0;
+}
+
+static void vduse_vdpa_free(struct vdpa_device *vdpa)
+{
+       struct vduse_dev *dev = vdpa_to_vduse(vdpa);
+
+       dev->vdev = NULL;
+}
+
+static const struct vdpa_config_ops vduse_vdpa_config_ops = {
+       .set_vq_address         = vduse_vdpa_set_vq_address,
+       .kick_vq                = vduse_vdpa_kick_vq,
+       .set_vq_cb              = vduse_vdpa_set_vq_cb,
+       .set_vq_num             = vduse_vdpa_set_vq_num,
+       .set_vq_ready           = vduse_vdpa_set_vq_ready,
+       .get_vq_ready           = vduse_vdpa_get_vq_ready,
+       .set_vq_state           = vduse_vdpa_set_vq_state,
+       .get_vq_state           = vduse_vdpa_get_vq_state,
+       .get_vq_align           = vduse_vdpa_get_vq_align,
+       .get_features           = vduse_vdpa_get_features,
+       .set_features           = vduse_vdpa_set_features,
+       .set_config_cb          = vduse_vdpa_set_config_cb,
+       .get_vq_num_max         = vduse_vdpa_get_vq_num_max,
+       .get_device_id          = vduse_vdpa_get_device_id,
+       .get_vendor_id          = vduse_vdpa_get_vendor_id,
+       .get_status             = vduse_vdpa_get_status,
+       .set_status             = vduse_vdpa_set_status,
+       .get_config_size        = vduse_vdpa_get_config_size,
+       .get_config             = vduse_vdpa_get_config,
+       .set_config             = vduse_vdpa_set_config,
+       .get_generation         = vduse_vdpa_get_generation,
+       .reset                  = vduse_vdpa_reset,
+       .set_map                = vduse_vdpa_set_map,
+       .free                   = vduse_vdpa_free,
+};
+
+static dma_addr_t vduse_dev_map_page(struct device *dev, struct page *page,
+                                    unsigned long offset, size_t size,
+                                    enum dma_data_direction dir,
+                                    unsigned long attrs)
+{
+       struct vduse_dev *vdev = dev_to_vduse(dev);
+       struct vduse_iova_domain *domain = vdev->domain;
+
+       return vduse_domain_map_page(domain, page, offset, size, dir, attrs);
+}
+
+static void vduse_dev_unmap_page(struct device *dev, dma_addr_t dma_addr,
+                               size_t size, enum dma_data_direction dir,
+                               unsigned long attrs)
+{
+       struct vduse_dev *vdev = dev_to_vduse(dev);
+       struct vduse_iova_domain *domain = vdev->domain;
+
+       return vduse_domain_unmap_page(domain, dma_addr, size, dir, attrs);
+}
+
+static void *vduse_dev_alloc_coherent(struct device *dev, size_t size,
+                                       dma_addr_t *dma_addr, gfp_t flag,
+                                       unsigned long attrs)
+{
+       struct vduse_dev *vdev = dev_to_vduse(dev);
+       struct vduse_iova_domain *domain = vdev->domain;
+       unsigned long iova;
+       void *addr;
+
+       *dma_addr = DMA_MAPPING_ERROR;
+       addr = vduse_domain_alloc_coherent(domain, size,
+                               (dma_addr_t *)&iova, flag, attrs);
+       if (!addr)
+               return NULL;
+
+       *dma_addr = (dma_addr_t)iova;
+
+       return addr;
+}
+
+static void vduse_dev_free_coherent(struct device *dev, size_t size,
+                                       void *vaddr, dma_addr_t dma_addr,
+                                       unsigned long attrs)
+{
+       struct vduse_dev *vdev = dev_to_vduse(dev);
+       struct vduse_iova_domain *domain = vdev->domain;
+
+       vduse_domain_free_coherent(domain, size, vaddr, dma_addr, attrs);
+}
+
+static size_t vduse_dev_max_mapping_size(struct device *dev)
+{
+       struct vduse_dev *vdev = dev_to_vduse(dev);
+       struct vduse_iova_domain *domain = vdev->domain;
+
+       return domain->bounce_size;
+}
+
+static const struct dma_map_ops vduse_dev_dma_ops = {
+       .map_page = vduse_dev_map_page,
+       .unmap_page = vduse_dev_unmap_page,
+       .alloc = vduse_dev_alloc_coherent,
+       .free = vduse_dev_free_coherent,
+       .max_mapping_size = vduse_dev_max_mapping_size,
+};
+
+static unsigned int perm_to_file_flags(u8 perm)
+{
+       unsigned int flags = 0;
+
+       switch (perm) {
+       case VDUSE_ACCESS_WO:
+               flags |= O_WRONLY;
+               break;
+       case VDUSE_ACCESS_RO:
+               flags |= O_RDONLY;
+               break;
+       case VDUSE_ACCESS_RW:
+               flags |= O_RDWR;
+               break;
+       default:
+               WARN(1, "invalid vhost IOTLB permission\n");
+               break;
+       }
+
+       return flags;
+}
+
+static int vduse_kickfd_setup(struct vduse_dev *dev,
+                       struct vduse_vq_eventfd *eventfd)
+{
+       struct eventfd_ctx *ctx = NULL;
+       struct vduse_virtqueue *vq;
+       u32 index;
+
+       if (eventfd->index >= dev->vq_num)
+               return -EINVAL;
+
+       index = array_index_nospec(eventfd->index, dev->vq_num);
+       vq = &dev->vqs[index];
+       if (eventfd->fd >= 0) {
+               ctx = eventfd_ctx_fdget(eventfd->fd);
+               if (IS_ERR(ctx))
+                       return PTR_ERR(ctx);
+       } else if (eventfd->fd != VDUSE_EVENTFD_DEASSIGN)
+               return 0;
+
+       spin_lock(&vq->kick_lock);
+       if (vq->kickfd)
+               eventfd_ctx_put(vq->kickfd);
+       vq->kickfd = ctx;
+       if (vq->ready && vq->kicked && vq->kickfd) {
+               eventfd_signal(vq->kickfd, 1);
+               vq->kicked = false;
+       }
+       spin_unlock(&vq->kick_lock);
+
+       return 0;
+}
+
+static bool vduse_dev_is_ready(struct vduse_dev *dev)
+{
+       int i;
+
+       for (i = 0; i < dev->vq_num; i++)
+               if (!dev->vqs[i].num_max)
+                       return false;
+
+       return true;
+}
+
+static void vduse_dev_irq_inject(struct work_struct *work)
+{
+       struct vduse_dev *dev = container_of(work, struct vduse_dev, inject);
+
+       spin_lock_irq(&dev->irq_lock);
+       if (dev->config_cb.callback)
+               dev->config_cb.callback(dev->config_cb.private);
+       spin_unlock_irq(&dev->irq_lock);
+}
+
+static void vduse_vq_irq_inject(struct work_struct *work)
+{
+       struct vduse_virtqueue *vq = container_of(work,
+                                       struct vduse_virtqueue, inject);
+
+       spin_lock_irq(&vq->irq_lock);
+       if (vq->ready && vq->cb.callback)
+               vq->cb.callback(vq->cb.private);
+       spin_unlock_irq(&vq->irq_lock);
+}
+
+static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
+                           unsigned long arg)
+{
+       struct vduse_dev *dev = file->private_data;
+       void __user *argp = (void __user *)arg;
+       int ret;
+
+       if (unlikely(dev->broken))
+               return -EPERM;
+
+       switch (cmd) {
+       case VDUSE_IOTLB_GET_FD: {
+               struct vduse_iotlb_entry entry;
+               struct vhost_iotlb_map *map;
+               struct vdpa_map_file *map_file;
+               struct vduse_iova_domain *domain = dev->domain;
+               struct file *f = NULL;
+
+               ret = -EFAULT;
+               if (copy_from_user(&entry, argp, sizeof(entry)))
+                       break;
+
+               ret = -EINVAL;
+               if (entry.start > entry.last)
+                       break;
+
+               spin_lock(&domain->iotlb_lock);
+               map = vhost_iotlb_itree_first(domain->iotlb,
+                                             entry.start, entry.last);
+               if (map) {
+                       map_file = (struct vdpa_map_file *)map->opaque;
+                       f = get_file(map_file->file);
+                       entry.offset = map_file->offset;
+                       entry.start = map->start;
+                       entry.last = map->last;
+                       entry.perm = map->perm;
+               }
+               spin_unlock(&domain->iotlb_lock);
+               ret = -EINVAL;
+               if (!f)
+                       break;
+
+               ret = -EFAULT;
+               if (copy_to_user(argp, &entry, sizeof(entry))) {
+                       fput(f);
+                       break;
+               }
+               ret = receive_fd(f, perm_to_file_flags(entry.perm));
+               fput(f);
+               break;
+       }
+       case VDUSE_DEV_GET_FEATURES:
+               /*
+                * Just mirror what the driver wrote here.
+                * The driver is expected to check FEATURES_OK later.
+                */
+               ret = put_user(dev->driver_features, (u64 __user *)argp);
+               break;
+       case VDUSE_DEV_SET_CONFIG: {
+               struct vduse_config_data config;
+               unsigned long size = offsetof(struct vduse_config_data,
+                                             buffer);
+
+               ret = -EFAULT;
+               if (copy_from_user(&config, argp, size))
+                       break;
+
+               ret = -EINVAL;
+               if (config.length == 0 ||
+                   config.length > dev->config_size - config.offset)
+                       break;
+
+               ret = -EFAULT;
+               if (copy_from_user(dev->config + config.offset, argp + size,
+                                  config.length))
+                       break;
+
+               ret = 0;
+               break;
+       }
+       case VDUSE_DEV_INJECT_CONFIG_IRQ:
+               ret = 0;
+               queue_work(vduse_irq_wq, &dev->inject);
+               break;
+       case VDUSE_VQ_SETUP: {
+               struct vduse_vq_config config;
+               u32 index;
+
+               ret = -EFAULT;
+               if (copy_from_user(&config, argp, sizeof(config)))
+                       break;
+
+               ret = -EINVAL;
+               if (config.index >= dev->vq_num)
+                       break;
+
+               if (!is_mem_zero((const char *)config.reserved,
+                                sizeof(config.reserved)))
+                       break;
+
+               index = array_index_nospec(config.index, dev->vq_num);
+               dev->vqs[index].num_max = config.max_size;
+               ret = 0;
+               break;
+       }
+       case VDUSE_VQ_GET_INFO: {
+               struct vduse_vq_info vq_info;
+               struct vduse_virtqueue *vq;
+               u32 index;
+
+               ret = -EFAULT;
+               if (copy_from_user(&vq_info, argp, sizeof(vq_info)))
+                       break;
+
+               ret = -EINVAL;
+               if (vq_info.index >= dev->vq_num)
+                       break;
+
+               index = array_index_nospec(vq_info.index, dev->vq_num);
+               vq = &dev->vqs[index];
+               vq_info.desc_addr = vq->desc_addr;
+               vq_info.driver_addr = vq->driver_addr;
+               vq_info.device_addr = vq->device_addr;
+               vq_info.num = vq->num;
+
+               if (dev->driver_features & BIT_ULL(VIRTIO_F_RING_PACKED)) {
+                       vq_info.packed.last_avail_counter =
+                               vq->state.packed.last_avail_counter;
+                       vq_info.packed.last_avail_idx =
+                               vq->state.packed.last_avail_idx;
+                       vq_info.packed.last_used_counter =
+                               vq->state.packed.last_used_counter;
+                       vq_info.packed.last_used_idx =
+                               vq->state.packed.last_used_idx;
+               } else
+                       vq_info.split.avail_index =
+                               vq->state.split.avail_index;
+
+               vq_info.ready = vq->ready;
+
+               ret = -EFAULT;
+               if (copy_to_user(argp, &vq_info, sizeof(vq_info)))
+                       break;
+
+               ret = 0;
+               break;
+       }
+       case VDUSE_VQ_SETUP_KICKFD: {
+               struct vduse_vq_eventfd eventfd;
+
+               ret = -EFAULT;
+               if (copy_from_user(&eventfd, argp, sizeof(eventfd)))
+                       break;
+
+               ret = vduse_kickfd_setup(dev, &eventfd);
+               break;
+       }
+       case VDUSE_VQ_INJECT_IRQ: {
+               u32 index;
+
+               ret = -EFAULT;
+               if (get_user(index, (u32 __user *)argp))
+                       break;
+
+               ret = -EINVAL;
+               if (index >= dev->vq_num)
+                       break;
+
+               ret = 0;
+               index = array_index_nospec(index, dev->vq_num);
+               queue_work(vduse_irq_wq, &dev->vqs[index].inject);
+               break;
+       }
+       default:
+               ret = -ENOIOCTLCMD;
+               break;
+       }
+
+       return ret;
+}
+
+static int vduse_dev_release(struct inode *inode, struct file *file)
+{
+       struct vduse_dev *dev = file->private_data;
+
+       spin_lock(&dev->msg_lock);
+       /* Make sure the inflight messages can be processed after reconnection */
+       list_splice_init(&dev->recv_list, &dev->send_list);
+       spin_unlock(&dev->msg_lock);
+       dev->connected = false;
+
+       return 0;
+}
+
+static struct vduse_dev *vduse_dev_get_from_minor(int minor)
+{
+       struct vduse_dev *dev;
+
+       mutex_lock(&vduse_lock);
+       dev = idr_find(&vduse_idr, minor);
+       mutex_unlock(&vduse_lock);
+
+       return dev;
+}
+
+static int vduse_dev_open(struct inode *inode, struct file *file)
+{
+       int ret;
+       struct vduse_dev *dev = vduse_dev_get_from_minor(iminor(inode));
+
+       if (!dev)
+               return -ENODEV;
+
+       ret = -EBUSY;
+       mutex_lock(&dev->lock);
+       if (dev->connected)
+               goto unlock;
+
+       ret = 0;
+       dev->connected = true;
+       file->private_data = dev;
+unlock:
+       mutex_unlock(&dev->lock);
+
+       return ret;
+}
+
+static const struct file_operations vduse_dev_fops = {
+       .owner          = THIS_MODULE,
+       .open           = vduse_dev_open,
+       .release        = vduse_dev_release,
+       .read_iter      = vduse_dev_read_iter,
+       .write_iter     = vduse_dev_write_iter,
+       .poll           = vduse_dev_poll,
+       .unlocked_ioctl = vduse_dev_ioctl,
+       .compat_ioctl   = compat_ptr_ioctl,
+       .llseek         = noop_llseek,
+};
+
+static struct vduse_dev *vduse_dev_create(void)
+{
+       struct vduse_dev *dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+
+       if (!dev)
+               return NULL;
+
+       mutex_init(&dev->lock);
+       spin_lock_init(&dev->msg_lock);
+       INIT_LIST_HEAD(&dev->send_list);
+       INIT_LIST_HEAD(&dev->recv_list);
+       spin_lock_init(&dev->irq_lock);
+
+       INIT_WORK(&dev->inject, vduse_dev_irq_inject);
+       init_waitqueue_head(&dev->waitq);
+
+       return dev;
+}
+
+static void vduse_dev_destroy(struct vduse_dev *dev)
+{
+       kfree(dev);
+}
+
+static struct vduse_dev *vduse_find_dev(const char *name)
+{
+       struct vduse_dev *dev;
+       int id;
+
+       idr_for_each_entry(&vduse_idr, dev, id)
+               if (!strcmp(dev->name, name))
+                       return dev;
+
+       return NULL;
+}
+
+static int vduse_destroy_dev(char *name)
+{
+       struct vduse_dev *dev = vduse_find_dev(name);
+
+       if (!dev)
+               return -EINVAL;
+
+       mutex_lock(&dev->lock);
+       if (dev->vdev || dev->connected) {
+               mutex_unlock(&dev->lock);
+               return -EBUSY;
+       }
+       dev->connected = true;
+       mutex_unlock(&dev->lock);
+
+       vduse_dev_reset(dev);
+       device_destroy(vduse_class, MKDEV(MAJOR(vduse_major), dev->minor));
+       idr_remove(&vduse_idr, dev->minor);
+       kvfree(dev->config);
+       kfree(dev->vqs);
+       vduse_domain_destroy(dev->domain);
+       kfree(dev->name);
+       vduse_dev_destroy(dev);
+       module_put(THIS_MODULE);
+
+       return 0;
+}
+
+static bool device_is_allowed(u32 device_id)
+{
+       int i;
+
+       for (i = 0; i < ARRAY_SIZE(allowed_device_id); i++)
+               if (allowed_device_id[i] == device_id)
+                       return true;
+
+       return false;
+}
+
+static bool features_is_valid(u64 features)
+{
+       if (!(features & (1ULL << VIRTIO_F_ACCESS_PLATFORM)))
+               return false;
+
+       /* For now, only a read-only configuration space is supported */
+       if (features & (1ULL << VIRTIO_BLK_F_CONFIG_WCE))
+               return false;
+
+       return true;
+}
+
+static bool vduse_validate_config(struct vduse_dev_config *config)
+{
+       if (!is_mem_zero((const char *)config->reserved,
+                        sizeof(config->reserved)))
+               return false;
+
+       if (config->vq_align > PAGE_SIZE)
+               return false;
+
+       if (config->config_size > PAGE_SIZE)
+               return false;
+
+       if (!device_is_allowed(config->device_id))
+               return false;
+
+       if (!features_is_valid(config->features))
+               return false;
+
+       return true;
+}
+
+static ssize_t msg_timeout_show(struct device *device,
+                               struct device_attribute *attr, char *buf)
+{
+       struct vduse_dev *dev = dev_get_drvdata(device);
+
+       return sysfs_emit(buf, "%u\n", dev->msg_timeout);
+}
+
+static ssize_t msg_timeout_store(struct device *device,
+                                struct device_attribute *attr,
+                                const char *buf, size_t count)
+{
+       struct vduse_dev *dev = dev_get_drvdata(device);
+       int ret;
+
+       ret = kstrtouint(buf, 10, &dev->msg_timeout);
+       if (ret < 0)
+               return ret;
+
+       return count;
+}
+
+static DEVICE_ATTR_RW(msg_timeout);
+
+static struct attribute *vduse_dev_attrs[] = {
+       &dev_attr_msg_timeout.attr,
+       NULL
+};
+
+ATTRIBUTE_GROUPS(vduse_dev);
+
+static int vduse_create_dev(struct vduse_dev_config *config,
+                           void *config_buf, u64 api_version)
+{
+       int i, ret;
+       struct vduse_dev *dev;
+
+       ret = -EEXIST;
+       if (vduse_find_dev(config->name))
+               goto err;
+
+       ret = -ENOMEM;
+       dev = vduse_dev_create();
+       if (!dev)
+               goto err;
+
+       dev->api_version = api_version;
+       dev->device_features = config->features;
+       dev->device_id = config->device_id;
+       dev->vendor_id = config->vendor_id;
+       dev->name = kstrdup(config->name, GFP_KERNEL);
+       if (!dev->name)
+               goto err_str;
+
+       dev->domain = vduse_domain_create(VDUSE_IOVA_SIZE - 1,
+                                         VDUSE_BOUNCE_SIZE);
+       if (!dev->domain)
+               goto err_domain;
+
+       dev->config = config_buf;
+       dev->config_size = config->config_size;
+       dev->vq_align = config->vq_align;
+       dev->vq_num = config->vq_num;
+       dev->vqs = kcalloc(dev->vq_num, sizeof(*dev->vqs), GFP_KERNEL);
+       if (!dev->vqs)
+               goto err_vqs;
+
+       for (i = 0; i < dev->vq_num; i++) {
+               dev->vqs[i].index = i;
+               INIT_WORK(&dev->vqs[i].inject, vduse_vq_irq_inject);
+               INIT_WORK(&dev->vqs[i].kick, vduse_vq_kick_work);
+               spin_lock_init(&dev->vqs[i].kick_lock);
+               spin_lock_init(&dev->vqs[i].irq_lock);
+       }
+
+       ret = idr_alloc(&vduse_idr, dev, 1, VDUSE_DEV_MAX, GFP_KERNEL);
+       if (ret < 0)
+               goto err_idr;
+
+       dev->minor = ret;
+       dev->msg_timeout = VDUSE_MSG_DEFAULT_TIMEOUT;
+       dev->dev = device_create(vduse_class, NULL,
+                                MKDEV(MAJOR(vduse_major), dev->minor),
+                                dev, "%s", config->name);
+       if (IS_ERR(dev->dev)) {
+               ret = PTR_ERR(dev->dev);
+               goto err_dev;
+       }
+       __module_get(THIS_MODULE);
+
+       return 0;
+err_dev:
+       idr_remove(&vduse_idr, dev->minor);
+err_idr:
+       kfree(dev->vqs);
+err_vqs:
+       vduse_domain_destroy(dev->domain);
+err_domain:
+       kfree(dev->name);
+err_str:
+       vduse_dev_destroy(dev);
+err:
+       kvfree(config_buf);
+       return ret;
+}
+
+static long vduse_ioctl(struct file *file, unsigned int cmd,
+                       unsigned long arg)
+{
+       int ret;
+       void __user *argp = (void __user *)arg;
+       struct vduse_control *control = file->private_data;
+
+       mutex_lock(&vduse_lock);
+       switch (cmd) {
+       case VDUSE_GET_API_VERSION:
+               ret = put_user(control->api_version, (u64 __user *)argp);
+               break;
+       case VDUSE_SET_API_VERSION: {
+               u64 api_version;
+
+               ret = -EFAULT;
+               if (get_user(api_version, (u64 __user *)argp))
+                       break;
+
+               ret = -EINVAL;
+               if (api_version > VDUSE_API_VERSION)
+                       break;
+
+               ret = 0;
+               control->api_version = api_version;
+               break;
+       }
+       case VDUSE_CREATE_DEV: {
+               struct vduse_dev_config config;
+               unsigned long size = offsetof(struct vduse_dev_config, config);
+               void *buf;
+
+               ret = -EFAULT;
+               if (copy_from_user(&config, argp, size))
+                       break;
+
+               ret = -EINVAL;
+               if (!vduse_validate_config(&config))
+                       break;
+
+               buf = vmemdup_user(argp + size, config.config_size);
+               if (IS_ERR(buf)) {
+                       ret = PTR_ERR(buf);
+                       break;
+               }
+               config.name[VDUSE_NAME_MAX - 1] = '\0';
+               ret = vduse_create_dev(&config, buf, control->api_version);
+               break;
+       }
+       case VDUSE_DESTROY_DEV: {
+               char name[VDUSE_NAME_MAX];
+
+               ret = -EFAULT;
+               if (copy_from_user(name, argp, VDUSE_NAME_MAX))
+                       break;
+
+               name[VDUSE_NAME_MAX - 1] = '\0';
+               ret = vduse_destroy_dev(name);
+               break;
+       }
+       default:
+               ret = -EINVAL;
+               break;
+       }
+       mutex_unlock(&vduse_lock);
+
+       return ret;
+}
+
+static int vduse_release(struct inode *inode, struct file *file)
+{
+       struct vduse_control *control = file->private_data;
+
+       kfree(control);
+       return 0;
+}
+
+static int vduse_open(struct inode *inode, struct file *file)
+{
+       struct vduse_control *control;
+
+       control = kmalloc(sizeof(struct vduse_control), GFP_KERNEL);
+       if (!control)
+               return -ENOMEM;
+
+       control->api_version = VDUSE_API_VERSION;
+       file->private_data = control;
+
+       return 0;
+}
+
+static const struct file_operations vduse_ctrl_fops = {
+       .owner          = THIS_MODULE,
+       .open           = vduse_open,
+       .release        = vduse_release,
+       .unlocked_ioctl = vduse_ioctl,
+       .compat_ioctl   = compat_ptr_ioctl,
+       .llseek         = noop_llseek,
+};
+
+static char *vduse_devnode(struct device *dev, umode_t *mode)
+{
+       return kasprintf(GFP_KERNEL, "vduse/%s", dev_name(dev));
+}
+
+static void vduse_mgmtdev_release(struct device *dev)
+{
+}
+
+static struct device vduse_mgmtdev = {
+       .init_name = "vduse",
+       .release = vduse_mgmtdev_release,
+};
+
+static struct vdpa_mgmt_dev mgmt_dev;
+
+static int vduse_dev_init_vdpa(struct vduse_dev *dev, const char *name)
+{
+       struct vduse_vdpa *vdev;
+       int ret;
+
+       if (dev->vdev)
+               return -EEXIST;
+
+       vdev = vdpa_alloc_device(struct vduse_vdpa, vdpa, dev->dev,
+                                &vduse_vdpa_config_ops, name, true);
+       if (IS_ERR(vdev))
+               return PTR_ERR(vdev);
+
+       dev->vdev = vdev;
+       vdev->dev = dev;
+       vdev->vdpa.dev.dma_mask = &vdev->vdpa.dev.coherent_dma_mask;
+       ret = dma_set_mask_and_coherent(&vdev->vdpa.dev, DMA_BIT_MASK(64));
+       if (ret) {
+               put_device(&vdev->vdpa.dev);
+               return ret;
+       }
+       set_dma_ops(&vdev->vdpa.dev, &vduse_dev_dma_ops);
+       vdev->vdpa.dma_dev = &vdev->vdpa.dev;
+       vdev->vdpa.mdev = &mgmt_dev;
+
+       return 0;
+}
+
+static int vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name)
+{
+       struct vduse_dev *dev;
+       int ret;
+
+       mutex_lock(&vduse_lock);
+       dev = vduse_find_dev(name);
+       if (!dev || !vduse_dev_is_ready(dev)) {
+               mutex_unlock(&vduse_lock);
+               return -EINVAL;
+       }
+       ret = vduse_dev_init_vdpa(dev, name);
+       mutex_unlock(&vduse_lock);
+       if (ret)
+               return ret;
+
+       ret = _vdpa_register_device(&dev->vdev->vdpa, dev->vq_num);
+       if (ret) {
+               put_device(&dev->vdev->vdpa.dev);
+               return ret;
+       }
+
+       return 0;
+}
+
+static void vdpa_dev_del(struct vdpa_mgmt_dev *mdev, struct vdpa_device *dev)
+{
+       _vdpa_unregister_device(dev);
+}
+
+static const struct vdpa_mgmtdev_ops vdpa_dev_mgmtdev_ops = {
+       .dev_add = vdpa_dev_add,
+       .dev_del = vdpa_dev_del,
+};
+
+static struct virtio_device_id id_table[] = {
+       { VIRTIO_ID_BLOCK, VIRTIO_DEV_ANY_ID },
+       { 0 },
+};
+
+static struct vdpa_mgmt_dev mgmt_dev = {
+       .device = &vduse_mgmtdev,
+       .id_table = id_table,
+       .ops = &vdpa_dev_mgmtdev_ops,
+};
+
+static int vduse_mgmtdev_init(void)
+{
+       int ret;
+
+       ret = device_register(&vduse_mgmtdev);
+       if (ret)
+               return ret;
+
+       ret = vdpa_mgmtdev_register(&mgmt_dev);
+       if (ret)
+               goto err;
+
+       return 0;
+err:
+       device_unregister(&vduse_mgmtdev);
+       return ret;
+}
+
+static void vduse_mgmtdev_exit(void)
+{
+       vdpa_mgmtdev_unregister(&mgmt_dev);
+       device_unregister(&vduse_mgmtdev);
+}
+
+static int vduse_init(void)
+{
+       int ret;
+       struct device *dev;
+
+       vduse_class = class_create(THIS_MODULE, "vduse");
+       if (IS_ERR(vduse_class))
+               return PTR_ERR(vduse_class);
+
+       vduse_class->devnode = vduse_devnode;
+       vduse_class->dev_groups = vduse_dev_groups;
+
+       ret = alloc_chrdev_region(&vduse_major, 0, VDUSE_DEV_MAX, "vduse");
+       if (ret)
+               goto err_chardev_region;
+
+       /* /dev/vduse/control */
+       cdev_init(&vduse_ctrl_cdev, &vduse_ctrl_fops);
+       vduse_ctrl_cdev.owner = THIS_MODULE;
+       ret = cdev_add(&vduse_ctrl_cdev, vduse_major, 1);
+       if (ret)
+               goto err_ctrl_cdev;
+
+       dev = device_create(vduse_class, NULL, vduse_major, NULL, "control");
+       if (IS_ERR(dev)) {
+               ret = PTR_ERR(dev);
+               goto err_device;
+       }
+
+       /* /dev/vduse/$DEVICE */
+       cdev_init(&vduse_cdev, &vduse_dev_fops);
+       vduse_cdev.owner = THIS_MODULE;
+       ret = cdev_add(&vduse_cdev, MKDEV(MAJOR(vduse_major), 1),
+                      VDUSE_DEV_MAX - 1);
+       if (ret)
+               goto err_cdev;
+
+       vduse_irq_wq = alloc_workqueue("vduse-irq",
+                               WQ_HIGHPRI | WQ_SYSFS | WQ_UNBOUND, 0);
+       if (!vduse_irq_wq) {
+               ret = -ENOMEM;
+               goto err_wq;
+       }
+
+       ret = vduse_domain_init();
+       if (ret)
+               goto err_domain;
+
+       ret = vduse_mgmtdev_init();
+       if (ret)
+               goto err_mgmtdev;
+
+       return 0;
+err_mgmtdev:
+       vduse_domain_exit();
+err_domain:
+       destroy_workqueue(vduse_irq_wq);
+err_wq:
+       cdev_del(&vduse_cdev);
+err_cdev:
+       device_destroy(vduse_class, vduse_major);
+err_device:
+       cdev_del(&vduse_ctrl_cdev);
+err_ctrl_cdev:
+       unregister_chrdev_region(vduse_major, VDUSE_DEV_MAX);
+err_chardev_region:
+       class_destroy(vduse_class);
+       return ret;
+}
+module_init(vduse_init);
+
+static void vduse_exit(void)
+{
+       vduse_mgmtdev_exit();
+       vduse_domain_exit();
+       destroy_workqueue(vduse_irq_wq);
+       cdev_del(&vduse_cdev);
+       device_destroy(vduse_class, vduse_major);
+       cdev_del(&vduse_ctrl_cdev);
+       unregister_chrdev_region(vduse_major, VDUSE_DEV_MAX);
+       class_destroy(vduse_class);
+}
+module_exit(vduse_exit);
+
+MODULE_LICENSE(DRV_LICENSE);
+MODULE_AUTHOR(DRV_AUTHOR);
+MODULE_DESCRIPTION(DRV_DESC);
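
For orientation, the control flow implemented above can be driven from userspace roughly as in the sketch below. It is illustrative and untested: the ioctl names and struct fields are the ones used by the code above (and defined in the new include/uapi/linux/vduse.h added by this series), but the device name, feature bits, omitted config-space payload, and minimal error handling are placeholder choices.

/* Rough sketch: create an emulated virtio-blk device via VDUSE (assumptions noted above). */
#include <fcntl.h>
#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/vduse.h>
#include <linux/virtio_ids.h>
#include <linux/virtio_config.h>

static int vduse_blk_create(void)
{
	uint64_t api = VDUSE_API_VERSION;
	struct vduse_dev_config cfg;
	int ctrl = open("/dev/vduse/control", O_RDWR);

	if (ctrl < 0)
		return -1;

	/* Negotiate the control API version first. */
	if (ioctl(ctrl, VDUSE_SET_API_VERSION, &api))
		goto err;

	memset(&cfg, 0, sizeof(cfg));
	strncpy(cfg.name, "vduse-blk0", sizeof(cfg.name) - 1);
	cfg.device_id = VIRTIO_ID_BLOCK;	/* only block devices pass device_is_allowed() */
	cfg.features = 1ULL << VIRTIO_F_ACCESS_PLATFORM;	/* required by features_is_valid() */
	cfg.vq_num = 1;
	cfg.vq_align = 4096;
	cfg.config_size = 0;	/* a real device would append its virtio config space here */

	if (ioctl(ctrl, VDUSE_CREATE_DEV, &cfg))
		goto err;

	/*
	 * The per-device chardev now appears as /dev/vduse/vduse-blk0.
	 * Open it, size the queues with VDUSE_VQ_SETUP, install eventfds
	 * with VDUSE_VQ_SETUP_KICKFD, and service vduse_dev_msg requests
	 * through the read()/write() loop implemented by
	 * vduse_dev_read_iter() and vduse_dev_write_iter() above.
	 */
	return ctrl;
err:
	close(ctrl);
	return -1;
}
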
index fe05273..5bcd002 100644 (file)
@@ -189,10 +189,20 @@ static void vp_vdpa_set_status(struct vdpa_device *vdpa, u8 status)
        }
 
        vp_modern_set_status(mdev, status);
+}
 
-       if (!(status & VIRTIO_CONFIG_S_DRIVER_OK) &&
-           (s & VIRTIO_CONFIG_S_DRIVER_OK))
+static int vp_vdpa_reset(struct vdpa_device *vdpa)
+{
+       struct vp_vdpa *vp_vdpa = vdpa_to_vp(vdpa);
+       struct virtio_pci_modern_device *mdev = &vp_vdpa->mdev;
+       u8 s = vp_vdpa_get_status(vdpa);
+
+       vp_modern_set_status(mdev, 0);
+
+       if (s & VIRTIO_CONFIG_S_DRIVER_OK)
                vp_vdpa_free_irq(vp_vdpa);
+
+       return 0;
 }
 
 static u16 vp_vdpa_get_vq_num_max(struct vdpa_device *vdpa)
@@ -398,6 +408,7 @@ static const struct vdpa_config_ops vp_vdpa_ops = {
        .set_features   = vp_vdpa_set_features,
        .get_status     = vp_vdpa_get_status,
        .set_status     = vp_vdpa_set_status,
+       .reset          = vp_vdpa_reset,
        .get_vq_num_max = vp_vdpa_get_vq_num_max,
        .get_vq_state   = vp_vdpa_get_vq_state,
        .get_vq_notification = vp_vdpa_get_vq_notification,
@@ -435,7 +446,7 @@ static int vp_vdpa_probe(struct pci_dev *pdev, const struct pci_device_id *id)
                return ret;
 
        vp_vdpa = vdpa_alloc_device(struct vp_vdpa, vdpa,
-                                   dev, &vp_vdpa_ops, NULL);
+                                   dev, &vp_vdpa_ops, NULL, false);
        if (IS_ERR(vp_vdpa)) {
                dev_err(dev, "vp_vdpa: Failed to allocate vDPA structure\n");
                return PTR_ERR(vp_vdpa);
index 0582079..670d56c 100644 (file)
@@ -36,19 +36,21 @@ void vhost_iotlb_map_free(struct vhost_iotlb *iotlb,
 EXPORT_SYMBOL_GPL(vhost_iotlb_map_free);
 
 /**
- * vhost_iotlb_add_range - add a new range to vhost IOTLB
+ * vhost_iotlb_add_range_ctx - add a new range to vhost IOTLB
  * @iotlb: the IOTLB
  * @start: start of the IOVA range
  * @last: last of IOVA range
  * @addr: the address that is mapped to @start
  * @perm: access permission of this range
+ * @opaque: the opaque pointer for the new mapping
  *
  * Returns an error if last is smaller than start or memory allocation
  * fails
  */
-int vhost_iotlb_add_range(struct vhost_iotlb *iotlb,
-                         u64 start, u64 last,
-                         u64 addr, unsigned int perm)
+int vhost_iotlb_add_range_ctx(struct vhost_iotlb *iotlb,
+                             u64 start, u64 last,
+                             u64 addr, unsigned int perm,
+                             void *opaque)
 {
        struct vhost_iotlb_map *map;
 
@@ -71,6 +73,7 @@ int vhost_iotlb_add_range(struct vhost_iotlb *iotlb,
        map->last = last;
        map->addr = addr;
        map->perm = perm;
+       map->opaque = opaque;
 
        iotlb->nmaps++;
        vhost_iotlb_itree_insert(map, &iotlb->root);
@@ -80,6 +83,15 @@ int vhost_iotlb_add_range(struct vhost_iotlb *iotlb,
 
        return 0;
 }
+EXPORT_SYMBOL_GPL(vhost_iotlb_add_range_ctx);
+
+int vhost_iotlb_add_range(struct vhost_iotlb *iotlb,
+                         u64 start, u64 last,
+                         u64 addr, unsigned int perm)
+{
+       return vhost_iotlb_add_range_ctx(iotlb, start, last,
+                                        addr, perm, NULL);
+}
 EXPORT_SYMBOL_GPL(vhost_iotlb_add_range);
 
 /**
index 46f897e..532e204 100644 (file)
@@ -1,24 +1,12 @@
+// SPDX-License-Identifier: GPL-2.0+
 /*******************************************************************************
  * Vhost kernel TCM fabric driver for virtio SCSI initiators
  *
  * (C) Copyright 2010-2013 Datera, Inc.
  * (C) Copyright 2010-2012 IBM Corp.
  *
- * Licensed to the Linux Foundation under the General Public License (GPL) version 2.
- *
  * Authors: Nicholas A. Bellinger <nab@daterainc.com>
  *          Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
  ****************************************************************************/
 
 #include <linux/module.h>
index 9479f7f..f41d081 100644 (file)
@@ -116,12 +116,13 @@ static void vhost_vdpa_unsetup_vq_irq(struct vhost_vdpa *v, u16 qid)
        irq_bypass_unregister_producer(&vq->call_ctx.producer);
 }
 
-static void vhost_vdpa_reset(struct vhost_vdpa *v)
+static int vhost_vdpa_reset(struct vhost_vdpa *v)
 {
        struct vdpa_device *vdpa = v->vdpa;
 
-       vdpa_reset(vdpa);
        v->in_batch = 0;
+
+       return vdpa_reset(vdpa);
 }
 
 static long vhost_vdpa_get_device_id(struct vhost_vdpa *v, u8 __user *argp)
@@ -157,7 +158,7 @@ static long vhost_vdpa_set_status(struct vhost_vdpa *v, u8 __user *statusp)
        struct vdpa_device *vdpa = v->vdpa;
        const struct vdpa_config_ops *ops = vdpa->config;
        u8 status, status_old;
-       int nvqs = v->nvqs;
+       int ret, nvqs = v->nvqs;
        u16 i;
 
        if (copy_from_user(&status, statusp, sizeof(status)))
@@ -172,7 +173,12 @@ static long vhost_vdpa_set_status(struct vhost_vdpa *v, u8 __user *statusp)
        if (status != 0 && (ops->get_status(vdpa) & ~status) != 0)
                return -EINVAL;
 
-       ops->set_status(vdpa, status);
+       if (status == 0) {
+               ret = ops->reset(vdpa);
+               if (ret)
+                       return ret;
+       } else
+               ops->set_status(vdpa, status);
 
        if ((status & VIRTIO_CONFIG_S_DRIVER_OK) && !(status_old & VIRTIO_CONFIG_S_DRIVER_OK))
                for (i = 0; i < nvqs; i++)
@@ -498,7 +504,7 @@ static long vhost_vdpa_unlocked_ioctl(struct file *filep,
        return r;
 }
 
-static void vhost_vdpa_iotlb_unmap(struct vhost_vdpa *v, u64 start, u64 last)
+static void vhost_vdpa_pa_unmap(struct vhost_vdpa *v, u64 start, u64 last)
 {
        struct vhost_dev *dev = &v->vdev;
        struct vhost_iotlb *iotlb = dev->iotlb;
@@ -507,19 +513,44 @@ static void vhost_vdpa_iotlb_unmap(struct vhost_vdpa *v, u64 start, u64 last)
        unsigned long pfn, pinned;
 
        while ((map = vhost_iotlb_itree_first(iotlb, start, last)) != NULL) {
-               pinned = map->size >> PAGE_SHIFT;
-               for (pfn = map->addr >> PAGE_SHIFT;
+               pinned = PFN_DOWN(map->size);
+               for (pfn = PFN_DOWN(map->addr);
                     pinned > 0; pfn++, pinned--) {
                        page = pfn_to_page(pfn);
                        if (map->perm & VHOST_ACCESS_WO)
                                set_page_dirty_lock(page);
                        unpin_user_page(page);
                }
-               atomic64_sub(map->size >> PAGE_SHIFT, &dev->mm->pinned_vm);
+               atomic64_sub(PFN_DOWN(map->size), &dev->mm->pinned_vm);
                vhost_iotlb_map_free(iotlb, map);
        }
 }
 
+static void vhost_vdpa_va_unmap(struct vhost_vdpa *v, u64 start, u64 last)
+{
+       struct vhost_dev *dev = &v->vdev;
+       struct vhost_iotlb *iotlb = dev->iotlb;
+       struct vhost_iotlb_map *map;
+       struct vdpa_map_file *map_file;
+
+       while ((map = vhost_iotlb_itree_first(iotlb, start, last)) != NULL) {
+               map_file = (struct vdpa_map_file *)map->opaque;
+               fput(map_file->file);
+               kfree(map_file);
+               vhost_iotlb_map_free(iotlb, map);
+       }
+}
+
+static void vhost_vdpa_iotlb_unmap(struct vhost_vdpa *v, u64 start, u64 last)
+{
+       struct vdpa_device *vdpa = v->vdpa;
+
+       if (vdpa->use_va)
+               return vhost_vdpa_va_unmap(v, start, last);
+
+       return vhost_vdpa_pa_unmap(v, start, last);
+}
+
 static void vhost_vdpa_iotlb_free(struct vhost_vdpa *v)
 {
        struct vhost_dev *dev = &v->vdev;
@@ -551,21 +582,21 @@ static int perm_to_iommu_flags(u32 perm)
        return flags | IOMMU_CACHE;
 }
 
-static int vhost_vdpa_map(struct vhost_vdpa *v,
-                         u64 iova, u64 size, u64 pa, u32 perm)
+static int vhost_vdpa_map(struct vhost_vdpa *v, u64 iova,
+                         u64 size, u64 pa, u32 perm, void *opaque)
 {
        struct vhost_dev *dev = &v->vdev;
        struct vdpa_device *vdpa = v->vdpa;
        const struct vdpa_config_ops *ops = vdpa->config;
        int r = 0;
 
-       r = vhost_iotlb_add_range(dev->iotlb, iova, iova + size - 1,
-                                 pa, perm);
+       r = vhost_iotlb_add_range_ctx(dev->iotlb, iova, iova + size - 1,
+                                     pa, perm, opaque);
        if (r)
                return r;
 
        if (ops->dma_map) {
-               r = ops->dma_map(vdpa, iova, size, pa, perm);
+               r = ops->dma_map(vdpa, iova, size, pa, perm, opaque);
        } else if (ops->set_map) {
                if (!v->in_batch)
                        r = ops->set_map(vdpa, dev->iotlb);
@@ -573,13 +604,15 @@ static int vhost_vdpa_map(struct vhost_vdpa *v,
                r = iommu_map(v->domain, iova, pa, size,
                              perm_to_iommu_flags(perm));
        }
-
-       if (r)
+       if (r) {
                vhost_iotlb_del_range(dev->iotlb, iova, iova + size - 1);
-       else
-               atomic64_add(size >> PAGE_SHIFT, &dev->mm->pinned_vm);
+               return r;
+       }
 
-       return r;
+       if (!vdpa->use_va)
+               atomic64_add(PFN_DOWN(size), &dev->mm->pinned_vm);
+
+       return 0;
 }
 
 static void vhost_vdpa_unmap(struct vhost_vdpa *v, u64 iova, u64 size)
@@ -600,38 +633,78 @@ static void vhost_vdpa_unmap(struct vhost_vdpa *v, u64 iova, u64 size)
        }
 }
 
-static int vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v,
-                                          struct vhost_iotlb_msg *msg)
+static int vhost_vdpa_va_map(struct vhost_vdpa *v,
+                            u64 iova, u64 size, u64 uaddr, u32 perm)
+{
+       struct vhost_dev *dev = &v->vdev;
+       u64 offset, map_size, map_iova = iova;
+       struct vdpa_map_file *map_file;
+       struct vm_area_struct *vma;
+       int ret = 0;
+
+       mmap_read_lock(dev->mm);
+
+       while (size) {
+               vma = find_vma(dev->mm, uaddr);
+               if (!vma) {
+                       ret = -EINVAL;
+                       break;
+               }
+               map_size = min(size, vma->vm_end - uaddr);
+               if (!(vma->vm_file && (vma->vm_flags & VM_SHARED) &&
+                       !(vma->vm_flags & (VM_IO | VM_PFNMAP))))
+                       goto next;
+
+               map_file = kzalloc(sizeof(*map_file), GFP_KERNEL);
+               if (!map_file) {
+                       ret = -ENOMEM;
+                       break;
+               }
+               offset = (vma->vm_pgoff << PAGE_SHIFT) + uaddr - vma->vm_start;
+               map_file->offset = offset;
+               map_file->file = get_file(vma->vm_file);
+               ret = vhost_vdpa_map(v, map_iova, map_size, uaddr,
+                                    perm, map_file);
+               if (ret) {
+                       fput(map_file->file);
+                       kfree(map_file);
+                       break;
+               }
+next:
+               size -= map_size;
+               uaddr += map_size;
+               map_iova += map_size;
+       }
+       if (ret)
+               vhost_vdpa_unmap(v, iova, map_iova - iova);
+
+       mmap_read_unlock(dev->mm);
+
+       return ret;
+}
+
+static int vhost_vdpa_pa_map(struct vhost_vdpa *v,
+                            u64 iova, u64 size, u64 uaddr, u32 perm)
 {
        struct vhost_dev *dev = &v->vdev;
-       struct vhost_iotlb *iotlb = dev->iotlb;
        struct page **page_list;
        unsigned long list_size = PAGE_SIZE / sizeof(struct page *);
        unsigned int gup_flags = FOLL_LONGTERM;
        unsigned long npages, cur_base, map_pfn, last_pfn = 0;
        unsigned long lock_limit, sz2pin, nchunks, i;
-       u64 iova = msg->iova;
+       u64 start = iova;
        long pinned;
        int ret = 0;
 
-       if (msg->iova < v->range.first || !msg->size ||
-           msg->iova > U64_MAX - msg->size + 1 ||
-           msg->iova + msg->size - 1 > v->range.last)
-               return -EINVAL;
-
-       if (vhost_iotlb_itree_first(iotlb, msg->iova,
-                                   msg->iova + msg->size - 1))
-               return -EEXIST;
-
        /* Limit the use of memory for bookkeeping */
        page_list = (struct page **) __get_free_page(GFP_KERNEL);
        if (!page_list)
                return -ENOMEM;
 
-       if (msg->perm & VHOST_ACCESS_WO)
+       if (perm & VHOST_ACCESS_WO)
                gup_flags |= FOLL_WRITE;
 
-       npages = PAGE_ALIGN(msg->size + (iova & ~PAGE_MASK)) >> PAGE_SHIFT;
+       npages = PFN_UP(size + (iova & ~PAGE_MASK));
        if (!npages) {
                ret = -EINVAL;
                goto free;
@@ -639,13 +712,13 @@ static int vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v,
 
        mmap_read_lock(dev->mm);
 
-       lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+       lock_limit = PFN_DOWN(rlimit(RLIMIT_MEMLOCK));
        if (npages + atomic64_read(&dev->mm->pinned_vm) > lock_limit) {
                ret = -ENOMEM;
                goto unlock;
        }
 
-       cur_base = msg->uaddr & PAGE_MASK;
+       cur_base = uaddr & PAGE_MASK;
        iova &= PAGE_MASK;
        nchunks = 0;
 
@@ -673,10 +746,10 @@ static int vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v,
 
                        if (last_pfn && (this_pfn != last_pfn + 1)) {
                                /* Pin a contiguous chunk of memory */
-                               csize = (last_pfn - map_pfn + 1) << PAGE_SHIFT;
+                               csize = PFN_PHYS(last_pfn - map_pfn + 1);
                                ret = vhost_vdpa_map(v, iova, csize,
-                                                    map_pfn << PAGE_SHIFT,
-                                                    msg->perm);
+                                                    PFN_PHYS(map_pfn),
+                                                    perm, NULL);
                                if (ret) {
                                        /*
                                         * Unpin the pages that are left unmapped
@@ -699,13 +772,13 @@ static int vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v,
                        last_pfn = this_pfn;
                }
 
-               cur_base += pinned << PAGE_SHIFT;
+               cur_base += PFN_PHYS(pinned);
                npages -= pinned;
        }
 
        /* Pin the rest chunk */
-       ret = vhost_vdpa_map(v, iova, (last_pfn - map_pfn + 1) << PAGE_SHIFT,
-                            map_pfn << PAGE_SHIFT, msg->perm);
+       ret = vhost_vdpa_map(v, iova, PFN_PHYS(last_pfn - map_pfn + 1),
+                            PFN_PHYS(map_pfn), perm, NULL);
 out:
        if (ret) {
                if (nchunks) {
@@ -724,13 +797,38 @@ out:
                        for (pfn = map_pfn; pfn <= last_pfn; pfn++)
                                unpin_user_page(pfn_to_page(pfn));
                }
-               vhost_vdpa_unmap(v, msg->iova, msg->size);
+               vhost_vdpa_unmap(v, start, size);
        }
 unlock:
        mmap_read_unlock(dev->mm);
 free:
        free_page((unsigned long)page_list);
        return ret;
+}
+
+static int vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v,
+                                          struct vhost_iotlb_msg *msg)
+{
+       struct vhost_dev *dev = &v->vdev;
+       struct vdpa_device *vdpa = v->vdpa;
+       struct vhost_iotlb *iotlb = dev->iotlb;
+
+       if (msg->iova < v->range.first || !msg->size ||
+           msg->iova > U64_MAX - msg->size + 1 ||
+           msg->iova + msg->size - 1 > v->range.last)
+               return -EINVAL;
+
+       if (vhost_iotlb_itree_first(iotlb, msg->iova,
+                                   msg->iova + msg->size - 1))
+               return -EEXIST;
+
+       if (vdpa->use_va)
+               return vhost_vdpa_va_map(v, msg->iova, msg->size,
+                                        msg->uaddr, msg->perm);
+
+       return vhost_vdpa_pa_map(v, msg->iova, msg->size, msg->uaddr,
+                                msg->perm);
 }
 
 static int vhost_vdpa_process_iotlb_msg(struct vhost_dev *dev,
@@ -860,7 +958,9 @@ static int vhost_vdpa_open(struct inode *inode, struct file *filep)
                return -EBUSY;
 
        nvqs = v->nvqs;
-       vhost_vdpa_reset(v);
+       r = vhost_vdpa_reset(v);
+       if (r)
+               goto err;
 
        vqs = kmalloc_array(nvqs, sizeof(*vqs), GFP_KERNEL);
        if (!vqs) {
@@ -945,7 +1045,7 @@ static vm_fault_t vhost_vdpa_fault(struct vm_fault *vmf)
 
        vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
        if (remap_pfn_range(vma, vmf->address & PAGE_MASK,
-                           notify.addr >> PAGE_SHIFT, PAGE_SIZE,
+                           PFN_DOWN(notify.addr), PAGE_SIZE,
                            vma->vm_page_prot))
                return VM_FAULT_SIGBUS;
 
index f249622..938aefb 100644 (file)
@@ -114,7 +114,7 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
                size_t nbytes;
                size_t iov_len, payload_len;
                int head;
-               bool restore_flag = false;
+               u32 flags_to_restore = 0;
 
                spin_lock_bh(&vsock->send_pkt_list_lock);
                if (list_empty(&vsock->send_pkt_list)) {
@@ -178,16 +178,21 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
                         * small rx buffers, headers of packets in rx queue are
                         * created dynamically and are initialized with header
                         * of current packet(except length). But in case of
-                        * SOCK_SEQPACKET, we also must clear record delimeter
-                        * bit(VIRTIO_VSOCK_SEQ_EOR). Otherwise, instead of one
-                        * packet with delimeter(which marks end of record),
-                        * there will be sequence of packets with delimeter
-                        * bit set. After initialized header will be copied to
-                        * rx buffer, this bit will be restored.
+                        * SOCK_SEQPACKET, we also must clear the message
+                        * delimiter bit (VIRTIO_VSOCK_SEQ_EOM) and the MSG_EOR
+                        * bit (VIRTIO_VSOCK_SEQ_EOR) if set. Otherwise, there
+                        * will be a sequence of packets with these bits set.
+                        * After the initialized header has been copied to the
+                        * rx buffer, these bits will be restored.
                         */
-                       if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SEQ_EOR) {
-                               pkt->hdr.flags &= ~cpu_to_le32(VIRTIO_VSOCK_SEQ_EOR);
-                               restore_flag = true;
+                       if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SEQ_EOM) {
+                               pkt->hdr.flags &= ~cpu_to_le32(VIRTIO_VSOCK_SEQ_EOM);
+                               flags_to_restore |= VIRTIO_VSOCK_SEQ_EOM;
+
+                               if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SEQ_EOR) {
+                                       pkt->hdr.flags &= ~cpu_to_le32(VIRTIO_VSOCK_SEQ_EOR);
+                                       flags_to_restore |= VIRTIO_VSOCK_SEQ_EOR;
+                               }
                        }
                }
 
@@ -224,8 +229,7 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
                 * to send it with the next available buffer.
                 */
                if (pkt->off < pkt->len) {
-                       if (restore_flag)
-                               pkt->hdr.flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOR);
+                       pkt->hdr.flags |= cpu_to_le32(flags_to_restore);
 
                        /* We are queueing the same virtio_vsock_pkt to handle
                         * the remaining bytes, and we want to deliver it
index 1ea0c1f..588e02f 100644 (file)
@@ -4,6 +4,7 @@
 #include <linux/virtio_config.h>
 #include <linux/module.h>
 #include <linux/idr.h>
+#include <linux/of.h>
 #include <uapi/linux/virtio_ids.h>
 
 /* Unique numbering for virtio devices. */
@@ -292,6 +293,8 @@ static void virtio_dev_remove(struct device *_d)
 
        /* Acknowledge the device's existence again. */
        virtio_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE);
+
+       of_node_put(dev->dev.of_node);
 }
 
 static struct bus_type virtio_bus = {
@@ -318,6 +321,43 @@ void unregister_virtio_driver(struct virtio_driver *driver)
 }
 EXPORT_SYMBOL_GPL(unregister_virtio_driver);
 
+static int virtio_device_of_init(struct virtio_device *dev)
+{
+       struct device_node *np, *pnode = dev_of_node(dev->dev.parent);
+       char compat[] = "virtio,deviceXXXXXXXX";
+       int ret, count;
+
+       if (!pnode)
+               return 0;
+
+       count = of_get_available_child_count(pnode);
+       if (!count)
+               return 0;
+
+       /* There can be only 1 child node */
+       if (WARN_ON(count > 1))
+               return -EINVAL;
+
+       np = of_get_next_available_child(pnode, NULL);
+       if (WARN_ON(!np))
+               return -ENODEV;
+
+       ret = snprintf(compat, sizeof(compat), "virtio,device%x", dev->id.device);
+       BUG_ON(ret >= sizeof(compat));
+
+       if (!of_device_is_compatible(np, compat)) {
+               ret = -EINVAL;
+               goto out;
+       }
+
+       dev->dev.of_node = np;
+       return 0;
+
+out:
+       of_node_put(np);
+       return ret;
+}
+
 /**
  * register_virtio_device - register virtio device
  * @dev        : virtio device to be registered
@@ -342,6 +382,10 @@ int register_virtio_device(struct virtio_device *dev)
        dev->index = err;
        dev_set_name(&dev->dev, "virtio%u", dev->index);
 
+       err = virtio_device_of_init(dev);
+       if (err)
+               goto out_ida_remove;
+
        spin_lock_init(&dev->config_lock);
        dev->config_enabled = false;
        dev->config_change_pending = false;
@@ -362,10 +406,16 @@ int register_virtio_device(struct virtio_device *dev)
         */
        err = device_add(&dev->dev);
        if (err)
-               ida_simple_remove(&virtio_index_ida, dev->index);
+               goto out_of_node_put;
+
+       return 0;
+
+out_of_node_put:
+       of_node_put(dev->dev.of_node);
+out_ida_remove:
+       ida_simple_remove(&virtio_index_ida, dev->index);
 out:
-       if (err)
-               virtio_add_status(dev, VIRTIO_CONFIG_S_FAILED);
+       virtio_add_status(dev, VIRTIO_CONFIG_S_FAILED);
        return err;
 }
 EXPORT_SYMBOL_GPL(register_virtio_device);
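With virtio_device_of_init() above, the virtio core takes a reference on the single child node of the transport's DT node when its compatible string matches "virtio,device%x" (the virtio device ID in hex), and drops it again on remove or on registration failure. A driver bound to such a device can then read properties from its of_node. A hedged sketch; the property name is hypothetical:

#include <linux/of.h>
#include <linux/virtio.h>

static int example_probe(struct virtio_device *vdev)
{
        struct device_node *np = dev_of_node(&vdev->dev);
        u32 nr_lines = 0;

        /* np is NULL when the transport has no matching child node */
        if (np)
                of_property_read_u32(np, "example,nr-lines", &nr_lines);

        dev_info(&vdev->dev, "nr-lines = %u\n", nr_lines);
        return 0;
}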
index 47dce91..c22ff01 100644 (file)
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -531,8 +531,8 @@ static int init_vqs(struct virtio_balloon *vb)
                callbacks[VIRTIO_BALLOON_VQ_REPORTING] = balloon_ack;
        }
 
-       err = vb->vdev->config->find_vqs(vb->vdev, VIRTIO_BALLOON_VQ_MAX,
-                                        vqs, callbacks, names, NULL, NULL);
+       err = virtio_find_vqs(vb->vdev, VIRTIO_BALLOON_VQ_MAX, vqs,
+                             callbacks, names, NULL);
        if (err)
                return err;
 
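virtio_find_vqs() is the helper in linux/virtio_config.h that forwards to the find_vqs() config op while supplying a NULL per-virtqueue context array, so the balloon driver no longer has to open-code the extra NULL. A small sketch of the helper in use, with illustrative queue names and callbacks:

#include <linux/virtio_config.h>

static int example_init_vqs(struct virtio_device *vdev,
                            struct virtqueue *vqs[2],
                            vq_callback_t *callbacks[2])
{
        static const char * const names[] = { "rx", "tx" };

        /* Last argument is the optional struct irq_affinity descriptor;
         * the helper passes ctx = NULL to the find_vqs() config op.
         */
        return virtio_find_vqs(vdev, 2, vqs, callbacks, names, NULL);
}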
index d8afa82..8627dac 100644 (file)
--- a/fs/file.c
+++ b/fs/file.c
@@ -1150,6 +1150,12 @@ int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags)
        return new_fd;
 }
 
+int receive_fd(struct file *file, unsigned int o_flags)
+{
+       return __receive_fd(file, NULL, o_flags);
+}
+EXPORT_SYMBOL_GPL(receive_fd);
+
 static int ksys_dup3(unsigned int oldfd, unsigned int newfd, int flags)
 {
        int err = -EBADF;
index 2de2e46..51e830b 100644 (file)
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -94,6 +94,9 @@ extern void fd_install(unsigned int fd, struct file *file);
 
 extern int __receive_fd(struct file *file, int __user *ufd,
                        unsigned int o_flags);
+
+extern int receive_fd(struct file *file, unsigned int o_flags);
+
 static inline int receive_fd_user(struct file *file, int __user *ufd,
                                  unsigned int o_flags)
 {
@@ -101,10 +104,6 @@ static inline int receive_fd_user(struct file *file, int __user *ufd,
                return -EFAULT;
        return __receive_fd(file, ufd, o_flags);
 }
-static inline int receive_fd(struct file *file, unsigned int o_flags)
-{
-       return __receive_fd(file, NULL, o_flags);
-}
 int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags);
 
 extern void flush_delayed_fput(void);
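Exporting receive_fd() gives modules the plain "install this struct file as a new fd in the current task" path without the __user pointer handling of receive_fd_user(); in this series VDUSE uses it to return the file backing an IOTLB region from the VDUSE_IOTLB_GET_FD ioctl. A hedged in-kernel sketch:

#include <linux/file.h>
#include <linux/fcntl.h>

/* Illustrative helper: hand an existing struct file to userspace.
 * receive_fd() takes its own reference on success and returns the new
 * fd, or a negative errno on failure.
 */
static int example_install_fd(struct file *file)
{
        return receive_fd(file, O_CLOEXEC);
}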
index 8cfe49d..3972ab7 100644 (file)
--- a/include/linux/vdpa.h
+++ b/include/linux/vdpa.h
@@ -43,17 +43,17 @@ struct vdpa_vq_state_split {
  * @last_used_idx: used index
  */
 struct vdpa_vq_state_packed {
-        u16    last_avail_counter:1;
-        u16    last_avail_idx:15;
-        u16    last_used_counter:1;
-        u16    last_used_idx:15;
+       u16     last_avail_counter:1;
+       u16     last_avail_idx:15;
+       u16     last_used_counter:1;
+       u16     last_used_idx:15;
 };
 
 struct vdpa_vq_state {
-     union {
-          struct vdpa_vq_state_split split;
-          struct vdpa_vq_state_packed packed;
-     };
+       union {
+               struct vdpa_vq_state_split split;
+               struct vdpa_vq_state_packed packed;
+       };
 };
 
 struct vdpa_mgmt_dev;
@@ -65,6 +65,7 @@ struct vdpa_mgmt_dev;
  * @config: the configuration ops for this device.
  * @index: device index
  * @features_valid: were features initialized? for legacy guests
+ * @use_va: indicate whether virtual address must be used by this device
  * @nvqs: maximum number of supported virtqueues
  * @mdev: management device pointer; caller must setup when registering device as part
  *       of dev_add() mgmtdev ops callback before invoking _vdpa_register_device().
@@ -75,6 +76,7 @@ struct vdpa_device {
        const struct vdpa_config_ops *config;
        unsigned int index;
        bool features_valid;
+       bool use_va;
        int nvqs;
        struct vdpa_mgmt_dev *mdev;
 };
@@ -89,6 +91,16 @@ struct vdpa_iova_range {
        u64 last;
 };
 
+/**
+ * struct vdpa_map_file - file area for device memory mapping
+ * @file: vma->vm_file for the mapping
+ * @offset: mapping offset in the vm_file
+ */
+struct vdpa_map_file {
+       struct file *file;
+       u64 offset;
+};
+
 /**
  * struct vdpa_config_ops - operations for configuring a vDPA device.
  * Note: vDPA device drivers are required to implement all of the
@@ -131,7 +143,7 @@ struct vdpa_iova_range {
  *                             @vdev: vdpa device
  *                             @idx: virtqueue index
  *                             @state: pointer to returned state (last_avail_idx)
- * @get_vq_notification:       Get the notification area for a virtqueue
+ * @get_vq_notification:       Get the notification area for a virtqueue
  *                             @vdev: vdpa device
  *                             @idx: virtqueue index
  *                             Returns the notification area
@@ -171,6 +183,9 @@ struct vdpa_iova_range {
  * @set_status:                        Set the device status
  *                             @vdev: vdpa device
  *                             @status: virtio device status
+ * @reset:                     Reset device
+ *                             @vdev: vdpa device
+ *                             Returns integer: success (0) or error (< 0)
  * @get_config_size:           Get the size of the configuration space
  *                             @vdev: vdpa device
  *                             Returns size_t: configuration size
@@ -255,6 +270,7 @@ struct vdpa_config_ops {
        u32 (*get_vendor_id)(struct vdpa_device *vdev);
        u8 (*get_status)(struct vdpa_device *vdev);
        void (*set_status)(struct vdpa_device *vdev, u8 status);
+       int (*reset)(struct vdpa_device *vdev);
        size_t (*get_config_size)(struct vdpa_device *vdev);
        void (*get_config)(struct vdpa_device *vdev, unsigned int offset,
                           void *buf, unsigned int len);
@@ -266,7 +282,7 @@ struct vdpa_config_ops {
        /* DMA ops */
        int (*set_map)(struct vdpa_device *vdev, struct vhost_iotlb *iotlb);
        int (*dma_map)(struct vdpa_device *vdev, u64 iova, u64 size,
-                      u64 pa, u32 perm);
+                      u64 pa, u32 perm, void *opaque);
        int (*dma_unmap)(struct vdpa_device *vdev, u64 iova, u64 size);
 
        /* Free device resources */
@@ -275,7 +291,8 @@ struct vdpa_config_ops {
 
 struct vdpa_device *__vdpa_alloc_device(struct device *parent,
                                        const struct vdpa_config_ops *config,
-                                       size_t size, const char *name);
+                                       size_t size, const char *name,
+                                       bool use_va);
 
 /**
  * vdpa_alloc_device - allocate and initialize a vDPA device
@@ -285,15 +302,16 @@ struct vdpa_device *__vdpa_alloc_device(struct device *parent,
  * @parent: the parent device
  * @config: the bus operations that is supported by this device
  * @name: name of the vdpa device
+ * @use_va: indicate whether virtual address must be used by this device
  *
  * Return allocated data structure or ERR_PTR upon error
  */
-#define vdpa_alloc_device(dev_struct, member, parent, config, name)   \
+#define vdpa_alloc_device(dev_struct, member, parent, config, name, use_va)   \
                          container_of(__vdpa_alloc_device( \
                                       parent, config, \
                                       sizeof(dev_struct) + \
                                       BUILD_BUG_ON_ZERO(offsetof( \
-                                      dev_struct, member)), name), \
+                                      dev_struct, member)), name, use_va), \
                                       dev_struct, member)
 
 int vdpa_register_device(struct vdpa_device *vdev, int nvqs);
@@ -348,27 +366,27 @@ static inline struct device *vdpa_get_dma_dev(struct vdpa_device *vdev)
        return vdev->dma_dev;
 }
 
-static inline void vdpa_reset(struct vdpa_device *vdev)
+static inline int vdpa_reset(struct vdpa_device *vdev)
 {
-        const struct vdpa_config_ops *ops = vdev->config;
+       const struct vdpa_config_ops *ops = vdev->config;
 
        vdev->features_valid = false;
-        ops->set_status(vdev, 0);
+       return ops->reset(vdev);
 }
 
 static inline int vdpa_set_features(struct vdpa_device *vdev, u64 features)
 {
-        const struct vdpa_config_ops *ops = vdev->config;
+       const struct vdpa_config_ops *ops = vdev->config;
 
        vdev->features_valid = true;
-        return ops->set_features(vdev, features);
+       return ops->set_features(vdev, features);
 }
 
-
-static inline void vdpa_get_config(struct vdpa_device *vdev, unsigned offset,
-                                  void *buf, unsigned int len)
+static inline void vdpa_get_config(struct vdpa_device *vdev,
+                                  unsigned int offset, void *buf,
+                                  unsigned int len)
 {
-        const struct vdpa_config_ops *ops = vdev->config;
+       const struct vdpa_config_ops *ops = vdev->config;
 
        /*
         * Config accesses aren't supposed to trigger before features are set.
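For parent drivers the visible changes are the mandatory reset() op (vdpa_reset() now propagates its return value instead of writing status 0 directly) and the extra use_va argument to vdpa_alloc_device(), which declares whether dma_map() will be fed virtual addresses. A hedged sketch of a driver adapting to both, with illustrative names:

#include <linux/vdpa.h>

struct example_vdpa {
        struct vdpa_device vdpa;        /* must be the first member */
};

static int example_reset(struct vdpa_device *vdev)
{
        /* May now fail; vhost-vdpa propagates a negative errno */
        return 0;
}

static const struct vdpa_config_ops example_ops = {
        .reset = example_reset,
        /* ... remaining ops elided for brevity ... */
};

static struct example_vdpa *example_alloc(struct device *parent)
{
        /* use_va = false keeps the usual IOVA -> physical address model;
         * VDUSE passes true and is handed userspace virtual addresses.
         */
        return vdpa_alloc_device(struct example_vdpa, vdpa, parent,
                                 &example_ops, "example", false);
}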
index 6b09b78..2d0e2f5 100644 (file)
--- a/include/linux/vhost_iotlb.h
+++ b/include/linux/vhost_iotlb.h
@@ -17,6 +17,7 @@ struct vhost_iotlb_map {
        u32 perm;
        u32 flags_padding;
        u64 __subtree_last;
+       void *opaque;
 };
 
 #define VHOST_IOTLB_FLAG_RETIRE 0x1
@@ -29,6 +30,8 @@ struct vhost_iotlb {
        unsigned int flags;
 };
 
+int vhost_iotlb_add_range_ctx(struct vhost_iotlb *iotlb, u64 start, u64 last,
+                             u64 addr, unsigned int perm, void *opaque);
 int vhost_iotlb_add_range(struct vhost_iotlb *iotlb, u64 start, u64 last,
                          u64 addr, unsigned int perm);
 void vhost_iotlb_del_range(struct vhost_iotlb *iotlb, u64 start, u64 last);
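vhost_iotlb_add_range_ctx() is the counterpart of the new opaque field: the owner of the IOTLB can attach a per-mapping cookie and get it back when walking the tree (VDUSE stores a struct vdpa_map_file there). A hedged sketch, assuming the existing VHOST_MAP_RW permission define and the vhost_iotlb_itree_first() helper from this header:

#include <linux/vhost_iotlb.h>

static int example_map(struct vhost_iotlb *iotlb, u64 iova, u64 size,
                       u64 addr, void *cookie)
{
        /* Ranges are inclusive: [iova, iova + size - 1] */
        return vhost_iotlb_add_range_ctx(iotlb, iova, iova + size - 1,
                                         addr, VHOST_MAP_RW, cookie);
}

static void *example_lookup(struct vhost_iotlb *iotlb, u64 iova)
{
        struct vhost_iotlb_map *map;

        map = vhost_iotlb_itree_first(iotlb, iova, iova);
        return map ? map->opaque : NULL;
}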
diff --git a/include/uapi/linux/vduse.h b/include/uapi/linux/vduse.h
new file mode 100644 (file)
index 0000000..7cfe1c1
--- /dev/null
@@ -0,0 +1,306 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI_VDUSE_H_
+#define _UAPI_VDUSE_H_
+
+#include <linux/types.h>
+
+#define VDUSE_BASE     0x81
+
+/* The ioctls for control device (/dev/vduse/control) */
+
+#define VDUSE_API_VERSION      0
+
+/*
+ * Get the version of the VDUSE API that the kernel supports (VDUSE_API_VERSION).
+ * This is used for future extension.
+ */
+#define VDUSE_GET_API_VERSION  _IOR(VDUSE_BASE, 0x00, __u64)
+
+/* Set the version of the VDUSE API that userspace supports. */
+#define VDUSE_SET_API_VERSION  _IOW(VDUSE_BASE, 0x01, __u64)
+
+/**
+ * struct vduse_dev_config - basic configuration of a VDUSE device
+ * @name: VDUSE device name, needs to be NUL terminated
+ * @vendor_id: virtio vendor id
+ * @device_id: virtio device id
+ * @features: virtio features
+ * @vq_num: the number of virtqueues
+ * @vq_align: the allocation alignment of virtqueue's metadata
+ * @reserved: for future use, needs to be initialized to zero
+ * @config_size: the size of the configuration space
+ * @config: the buffer of the configuration space
+ *
+ * Structure used by VDUSE_CREATE_DEV ioctl to create VDUSE device.
+ */
+struct vduse_dev_config {
+#define VDUSE_NAME_MAX 256
+       char name[VDUSE_NAME_MAX];
+       __u32 vendor_id;
+       __u32 device_id;
+       __u64 features;
+       __u32 vq_num;
+       __u32 vq_align;
+       __u32 reserved[13];
+       __u32 config_size;
+       __u8 config[];
+};
+
+/* Create a VDUSE device which is represented by a char device (/dev/vduse/$NAME) */
+#define VDUSE_CREATE_DEV       _IOW(VDUSE_BASE, 0x02, struct vduse_dev_config)
+
+/*
+ * Destroy a VDUSE device. Make sure there are no more references
+ * to the char device (/dev/vduse/$NAME).
+ */
+#define VDUSE_DESTROY_DEV      _IOW(VDUSE_BASE, 0x03, char[VDUSE_NAME_MAX])
+
+/* The ioctls for VDUSE device (/dev/vduse/$NAME) */
+
+/**
+ * struct vduse_iotlb_entry - entry of IOTLB to describe one IOVA region [start, last]
+ * @offset: the mmap offset on returned file descriptor
+ * @start: start of the IOVA region
+ * @last: last of the IOVA region
+ * @perm: access permission of the IOVA region
+ *
+ * Structure used by VDUSE_IOTLB_GET_FD ioctl to find an overlapped IOVA region.
+ */
+struct vduse_iotlb_entry {
+       __u64 offset;
+       __u64 start;
+       __u64 last;
+#define VDUSE_ACCESS_RO 0x1
+#define VDUSE_ACCESS_WO 0x2
+#define VDUSE_ACCESS_RW 0x3
+       __u8 perm;
+};
+
+/*
+ * Find the first IOVA region that overlaps with the range [start, last]
+ * and return the corresponding file descriptor. A return value of -EINVAL
+ * means the IOVA region doesn't exist. Caller should set start and last fields.
+ */
+#define VDUSE_IOTLB_GET_FD     _IOWR(VDUSE_BASE, 0x10, struct vduse_iotlb_entry)
+
+/*
+ * Get the negotiated virtio features. It's a subset of the features in
+ * struct vduse_dev_config which can be accepted by the virtio driver. It's
+ * only valid after the FEATURES_OK status bit is set.
+ */
+#define VDUSE_DEV_GET_FEATURES _IOR(VDUSE_BASE, 0x11, __u64)
+
+/**
+ * struct vduse_config_data - data used to update configuration space
+ * @offset: the offset from the beginning of configuration space
+ * @length: the length to write to configuration space
+ * @buffer: the buffer used to write from
+ *
+ * Structure used by VDUSE_DEV_SET_CONFIG ioctl to update device
+ * configuration space.
+ */
+struct vduse_config_data {
+       __u32 offset;
+       __u32 length;
+       __u8 buffer[];
+};
+
+/* Set device configuration space */
+#define VDUSE_DEV_SET_CONFIG   _IOW(VDUSE_BASE, 0x12, struct vduse_config_data)
+
+/*
+ * Inject a config interrupt. It's usually used to notify the virtio driver
+ * that the device configuration space has changed.
+ */
+#define VDUSE_DEV_INJECT_CONFIG_IRQ    _IO(VDUSE_BASE, 0x13)
+
+/**
+ * struct vduse_vq_config - basic configuration of a virtqueue
+ * @index: virtqueue index
+ * @max_size: the max size of virtqueue
+ * @reserved: for future use, needs to be initialized to zero
+ *
+ * Structure used by VDUSE_VQ_SETUP ioctl to setup a virtqueue.
+ */
+struct vduse_vq_config {
+       __u32 index;
+       __u16 max_size;
+       __u16 reserved[13];
+};
+
+/*
+ * Set up the specified virtqueue. Make sure all virtqueues have been
+ * configured before the device is attached to the vDPA bus.
+ */
+#define VDUSE_VQ_SETUP         _IOW(VDUSE_BASE, 0x14, struct vduse_vq_config)
+
+/**
+ * struct vduse_vq_state_split - split virtqueue state
+ * @avail_index: available index
+ */
+struct vduse_vq_state_split {
+       __u16 avail_index;
+};
+
+/**
+ * struct vduse_vq_state_packed - packed virtqueue state
+ * @last_avail_counter: last driver ring wrap counter observed by device
+ * @last_avail_idx: device available index
+ * @last_used_counter: device ring wrap counter
+ * @last_used_idx: used index
+ */
+struct vduse_vq_state_packed {
+       __u16 last_avail_counter;
+       __u16 last_avail_idx;
+       __u16 last_used_counter;
+       __u16 last_used_idx;
+};
+
+/**
+ * struct vduse_vq_info - information of a virtqueue
+ * @index: virtqueue index
+ * @num: the size of virtqueue
+ * @desc_addr: address of desc area
+ * @driver_addr: address of driver area
+ * @device_addr: address of device area
+ * @split: split virtqueue state
+ * @packed: packed virtqueue state
+ * @ready: ready status of virtqueue
+ *
+ * Structure used by VDUSE_VQ_GET_INFO ioctl to get virtqueue's information.
+ */
+struct vduse_vq_info {
+       __u32 index;
+       __u32 num;
+       __u64 desc_addr;
+       __u64 driver_addr;
+       __u64 device_addr;
+       union {
+               struct vduse_vq_state_split split;
+               struct vduse_vq_state_packed packed;
+       };
+       __u8 ready;
+};
+
+/* Get the specified virtqueue's information. Caller should set index field. */
+#define VDUSE_VQ_GET_INFO      _IOWR(VDUSE_BASE, 0x15, struct vduse_vq_info)
+
+/**
+ * struct vduse_vq_eventfd - eventfd configuration for a virtqueue
+ * @index: virtqueue index
+ * @fd: eventfd, -1 means de-assigning the eventfd
+ *
+ * Structure used by VDUSE_VQ_SETUP_KICKFD ioctl to setup kick eventfd.
+ */
+struct vduse_vq_eventfd {
+       __u32 index;
+#define VDUSE_EVENTFD_DEASSIGN -1
+       int fd;
+};
+
+/*
+ * Set up the kick eventfd for the specified virtqueue. It is used by the
+ * VDUSE kernel module to notify userspace to consume the avail vring.
+ */
+#define VDUSE_VQ_SETUP_KICKFD  _IOW(VDUSE_BASE, 0x16, struct vduse_vq_eventfd)
+
+/*
+ * Inject an interrupt for a specific virtqueue. It's used to notify the
+ * virtio driver to consume the used vring.
+ */
+#define VDUSE_VQ_INJECT_IRQ    _IOW(VDUSE_BASE, 0x17, __u32)
+
+/* The control messages definition for read(2)/write(2) on /dev/vduse/$NAME */
+
+/**
+ * enum vduse_req_type - request type
+ * @VDUSE_GET_VQ_STATE: get the state for specified virtqueue from userspace
+ * @VDUSE_SET_STATUS: set the device status
+ * @VDUSE_UPDATE_IOTLB: Notify userspace to update the memory mapping for
+ *                      specified IOVA range via VDUSE_IOTLB_GET_FD ioctl
+ */
+enum vduse_req_type {
+       VDUSE_GET_VQ_STATE,
+       VDUSE_SET_STATUS,
+       VDUSE_UPDATE_IOTLB,
+};
+
+/**
+ * struct vduse_vq_state - virtqueue state
+ * @index: virtqueue index
+ * @split: split virtqueue state
+ * @packed: packed virtqueue state
+ */
+struct vduse_vq_state {
+       __u32 index;
+       union {
+               struct vduse_vq_state_split split;
+               struct vduse_vq_state_packed packed;
+       };
+};
+
+/**
+ * struct vduse_dev_status - device status
+ * @status: device status
+ */
+struct vduse_dev_status {
+       __u8 status;
+};
+
+/**
+ * struct vduse_iova_range - IOVA range [start, last]
+ * @start: start of the IOVA range
+ * @last: last of the IOVA range
+ */
+struct vduse_iova_range {
+       __u64 start;
+       __u64 last;
+};
+
+/**
+ * struct vduse_dev_request - control request
+ * @type: request type
+ * @request_id: request id
+ * @reserved: for future use
+ * @vq_state: virtqueue state, only index field is available
+ * @s: device status
+ * @iova: IOVA range for updating
+ * @padding: padding
+ *
+ * Structure used by read(2) on /dev/vduse/$NAME.
+ */
+struct vduse_dev_request {
+       __u32 type;
+       __u32 request_id;
+       __u32 reserved[4];
+       union {
+               struct vduse_vq_state vq_state;
+               struct vduse_dev_status s;
+               struct vduse_iova_range iova;
+               __u32 padding[32];
+       };
+};
+
+/**
+ * struct vduse_dev_response - response to control request
+ * @request_id: corresponding request id
+ * @result: the result of request
+ * @reserved: for future use, needs to be initialized to zero
+ * @vq_state: virtqueue state
+ * @padding: padding
+ *
+ * Structure used by write(2) on /dev/vduse/$NAME.
+ */
+struct vduse_dev_response {
+       __u32 request_id;
+#define VDUSE_REQ_RESULT_OK    0x00
+#define VDUSE_REQ_RESULT_FAILED        0x01
+       __u32 result;
+       __u32 reserved[4];
+       union {
+               struct vduse_vq_state vq_state;
+               __u32 padding[32];
+       };
+};
+
+#endif /* _UAPI_VDUSE_H_ */
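Putting the control-device ioctls together, device creation is a two-step handshake on /dev/vduse/control: agree on an API version, then pass a struct vduse_dev_config. A hedged userspace sketch follows; the IDs, sizes and feature bits are placeholders, and this is not a complete device backend (the per-device fd, virtqueue setup and request loop are omitted):

#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/vduse.h>

int main(void)
{
        struct vduse_dev_config *cfg;
        __u64 version = VDUSE_API_VERSION;
        int ctrl;

        ctrl = open("/dev/vduse/control", O_RDWR);
        if (ctrl < 0)
                return 1;

        /* Declare which API revision this program understands */
        if (ioctl(ctrl, VDUSE_SET_API_VERSION, &version) < 0)
                return 1;

        /* 64 bytes of virtio config space, zero-initialized */
        cfg = calloc(1, sizeof(*cfg) + 64);
        if (!cfg)
                return 1;

        strcpy(cfg->name, "vduse-example");     /* /dev/vduse/vduse-example */
        cfg->device_id = 2;                     /* e.g. VIRTIO_ID_BLOCK */
        cfg->vendor_id = 0;                     /* placeholder */
        cfg->features = 0;                      /* placeholder feature bits */
        cfg->vq_num = 1;
        cfg->vq_align = 4096;
        cfg->config_size = 64;

        if (ioctl(ctrl, VDUSE_CREATE_DEV, cfg) < 0)
                return 1;

        close(ctrl);
        return 0;
}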
index 50d352f..80d76b7 100644 (file)
--- a/include/uapi/linux/virtio_ids.h
+++ b/include/uapi/linux/virtio_ids.h
 #define VIRTIO_ID_SOUND                        25 /* virtio sound */
 #define VIRTIO_ID_FS                   26 /* virtio filesystem */
 #define VIRTIO_ID_PMEM                 27 /* virtio pmem */
+#define VIRTIO_ID_RPMB                 28 /* virtio rpmb */
 #define VIRTIO_ID_MAC80211_HWSIM       29 /* virtio mac80211-hwsim */
+#define VIRTIO_ID_VIDEO_ENCODER                30 /* virtio video encoder */
+#define VIRTIO_ID_VIDEO_DECODER                31 /* virtio video decoder */
 #define VIRTIO_ID_SCMI                 32 /* virtio SCMI */
+#define VIRTIO_ID_NITRO_SEC_MOD                33 /* virtio nitro secure module */
 #define VIRTIO_ID_I2C_ADAPTER          34 /* virtio i2c adapter */
+#define VIRTIO_ID_WATCHDOG             35 /* virtio watchdog */
+#define VIRTIO_ID_CAN                  36 /* virtio can */
+#define VIRTIO_ID_DMABUF               37 /* virtio dmabuf */
+#define VIRTIO_ID_PARAM_SERV           38 /* virtio parameter server */
+#define VIRTIO_ID_AUDIO_POLICY         39 /* virtio audio policy */
 #define VIRTIO_ID_BT                   40 /* virtio bluetooth */
 #define VIRTIO_ID_GPIO                 41 /* virtio gpio */
 
index 3dd3555..6473883 100644 (file)
--- a/include/uapi/linux/virtio_vsock.h
+++ b/include/uapi/linux/virtio_vsock.h
@@ -97,7 +97,8 @@ enum virtio_vsock_shutdown {
 
 /* VIRTIO_VSOCK_OP_RW flags values */
 enum virtio_vsock_rw {
-       VIRTIO_VSOCK_SEQ_EOR = 1,
+       VIRTIO_VSOCK_SEQ_EOM = 1,
+       VIRTIO_VSOCK_SEQ_EOR = 2,
 };
 
 #endif /* _UAPI_LINUX_VIRTIO_VSOCK_H */
index 3e02cc3..e2c0cfb 100644 (file)
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -2014,7 +2014,7 @@ static int __vsock_seqpacket_recvmsg(struct sock *sk, struct msghdr *msg,
 {
        const struct vsock_transport *transport;
        struct vsock_sock *vsk;
-       ssize_t record_len;
+       ssize_t msg_len;
        long timeout;
        int err = 0;
        DEFINE_WAIT(wait);
@@ -2028,9 +2028,9 @@ static int __vsock_seqpacket_recvmsg(struct sock *sk, struct msghdr *msg,
        if (err <= 0)
                goto out;
 
-       record_len = transport->seqpacket_dequeue(vsk, msg, flags);
+       msg_len = transport->seqpacket_dequeue(vsk, msg, flags);
 
-       if (record_len < 0) {
+       if (msg_len < 0) {
                err = -ENOMEM;
                goto out;
        }
@@ -2044,14 +2044,14 @@ static int __vsock_seqpacket_recvmsg(struct sock *sk, struct msghdr *msg,
                 * packet.
                 */
                if (flags & MSG_TRUNC)
-                       err = record_len;
+                       err = msg_len;
                else
                        err = len - msg_data_left(msg);
 
                /* Always set MSG_TRUNC if real length of packet is
                 * bigger than user's buffer.
                 */
-               if (record_len > len)
+               if (msg_len > len)
                        msg->msg_flags |= MSG_TRUNC;
        }
 
index 081e7ae..59ee1be 100644 (file)
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -76,8 +76,12 @@ virtio_transport_alloc_pkt(struct virtio_vsock_pkt_info *info,
                        goto out;
 
                if (msg_data_left(info->msg) == 0 &&
-                   info->type == VIRTIO_VSOCK_TYPE_SEQPACKET)
-                       pkt->hdr.flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOR);
+                   info->type == VIRTIO_VSOCK_TYPE_SEQPACKET) {
+                       pkt->hdr.flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOM);
+
+                       if (info->msg->msg_flags & MSG_EOR)
+                               pkt->hdr.flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOR);
+               }
        }
 
        trace_virtio_transport_alloc_pkt(src_cid, src_port,
@@ -457,9 +461,12 @@ static int virtio_transport_seqpacket_do_dequeue(struct vsock_sock *vsk,
                                dequeued_len += pkt_len;
                }
 
-               if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SEQ_EOR) {
+               if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SEQ_EOM) {
                        msg_ready = true;
                        vvs->msg_count--;
+
+                       if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SEQ_EOR)
+                               msg->msg_flags |= MSG_EOR;
                }
 
                virtio_transport_dec_rx_pkt(vvs, pkt);
@@ -1029,7 +1036,7 @@ virtio_transport_recv_enqueue(struct vsock_sock *vsk,
                goto out;
        }
 
-       if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SEQ_EOR)
+       if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SEQ_EOM)
                vvs->msg_count++;
 
        /* Try to copy small packets into the buffer of last packet queued,
@@ -1044,12 +1051,12 @@ virtio_transport_recv_enqueue(struct vsock_sock *vsk,
 
                /* If there is space in the last packet queued, we copy the
                 * new packet in its buffer. We avoid this if the last packet
-                * queued has VIRTIO_VSOCK_SEQ_EOR set, because this is
-                * delimiter of SEQPACKET record, so 'pkt' is the first packet
-                * of a new record.
+                * queued has VIRTIO_VSOCK_SEQ_EOM set, because this is
+                * delimiter of SEQPACKET message, so 'pkt' is the first packet
+                * of a new message.
                 */
                if ((pkt->len <= last_pkt->buf_len - last_pkt->len) &&
-                   !(le32_to_cpu(last_pkt->hdr.flags) & VIRTIO_VSOCK_SEQ_EOR)) {
+                   !(le32_to_cpu(last_pkt->hdr.flags) & VIRTIO_VSOCK_SEQ_EOM)) {
                        memcpy(last_pkt->buf + last_pkt->len, pkt->buf,
                               pkt->len);
                        last_pkt->len += pkt->len;
index 67766bf..2a3638c 100644 (file)
--- a/tools/testing/vsock/vsock_test.c
+++ b/tools/testing/vsock/vsock_test.c
@@ -282,6 +282,7 @@ static void test_stream_msg_peek_server(const struct test_opts *opts)
 }
 
 #define MESSAGES_CNT 7
+#define MSG_EOR_IDX (MESSAGES_CNT / 2)
 static void test_seqpacket_msg_bounds_client(const struct test_opts *opts)
 {
        int fd;
@@ -294,7 +295,7 @@ static void test_seqpacket_msg_bounds_client(const struct test_opts *opts)
 
        /* Send several messages, one with MSG_EOR flag */
        for (int i = 0; i < MESSAGES_CNT; i++)
-               send_byte(fd, 1, 0);
+               send_byte(fd, 1, (i == MSG_EOR_IDX) ? MSG_EOR : 0);
 
        control_writeln("SENDDONE");
        close(fd);
@@ -324,6 +325,11 @@ static void test_seqpacket_msg_bounds_server(const struct test_opts *opts)
                        perror("message bound violated");
                        exit(EXIT_FAILURE);
                }
+
+               if ((i == MSG_EOR_IDX) ^ !!(msg.msg_flags & MSG_EOR)) {
+                       perror("MSG_EOR");
+                       exit(EXIT_FAILURE);
+               }
        }
 
        close(fd);