From: Linus Torvalds Date: Thu, 24 Dec 2020 20:06:46 +0000 (-0800) Subject: Merge tag 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost X-Git-Tag: microblaze-v5.12~31 X-Git-Url: http://git.monstr.eu/?p=linux-2.6-microblaze.git;a=commitdiff_plain;h=64145482d3339d71f58857591d021588040543f4;hp=58cf05f597b03a8212d9ecf2c79ee046d3ee8ad9 Merge tag 'for_linus' of git://git./linux/kernel/git/mst/vhost Pull virtio updates from Michael Tsirkin: - vdpa sim refactoring - virtio mem: Big Block Mode support - misc cleanus, fixes * tag 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost: (61 commits) vdpa: Use simpler version of ida allocation vdpa: Add missing comment for virtqueue count uapi: virtio_ids: add missing device type IDs from OASIS spec uapi: virtio_ids.h: consistent indentions vhost scsi: fix error return code in vhost_scsi_set_endpoint() virtio_ring: Fix two use after free bugs virtio_net: Fix error code in probe() virtio_ring: Cut and paste bugs in vring_create_virtqueue_packed() tools/virtio: add barrier for aarch64 tools/virtio: add krealloc_array tools/virtio: include asm/bug.h vdpa/mlx5: Use write memory barrier after updating CQ index vdpa: split vdpasim to core and net modules vdpa_sim: split vdpasim_virtqueue's iov field in out_iov and in_iov vdpa_sim: make vdpasim->buffer size configurable vdpa_sim: use kvmalloc to allocate vdpasim->buffer vdpa_sim: set vringh notify callback vdpa_sim: add set_config callback in vdpasim_dev_attr vdpa_sim: add get_config callback in vdpasim_dev_attr vdpa_sim: make 'config' generic and usable for any device type ... --- diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 052975ea0af4..4c41df624dbb 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -3072,6 +3072,7 @@ static int virtnet_probe(struct virtio_device *vdev) dev_err(&vdev->dev, "device MTU appears to have changed it is now %d < %d", mtu, dev->min_mtu); + err = -EINVAL; goto free; } diff --git a/drivers/vdpa/Kconfig b/drivers/vdpa/Kconfig index 6caf539091e5..92a6396f8a73 100644 --- a/drivers/vdpa/Kconfig +++ b/drivers/vdpa/Kconfig @@ -9,21 +9,24 @@ menuconfig VDPA if VDPA config VDPA_SIM - tristate "vDPA device simulator" + tristate "vDPA device simulator core" depends on RUNTIME_TESTING_MENU && HAS_DMA select DMA_OPS select VHOST_RING + help + Enable this module to support vDPA device simulators. These devices + are used for testing, prototyping and development of vDPA. + +config VDPA_SIM_NET + tristate "vDPA simulator for networking device" + depends on VDPA_SIM select GENERIC_NET_UTILS - default n help - vDPA networking device simulator which loop TX traffic back - to RX. This device is used for testing, prototyping and - development of vDPA. + vDPA networking device simulator which loops TX traffic back to RX. config IFCVF tristate "Intel IFC VF vDPA driver" depends on PCI_MSI - default n help This kernel module can drive Intel IFC VF NIC to offload virtio dataplane traffic to hardware. @@ -42,7 +45,6 @@ config MLX5_VDPA_NET tristate "vDPA driver for ConnectX devices" select MLX5_VDPA depends on MLX5_CORE - default n help VDPA network driver for ConnectX6 and newer. Provides offloading of virtio net datapath such that descriptors put on the ring will diff --git a/drivers/vdpa/ifcvf/ifcvf_main.c b/drivers/vdpa/ifcvf/ifcvf_main.c index 8b4028556cb6..fa1af301cf55 100644 --- a/drivers/vdpa/ifcvf/ifcvf_main.c +++ b/drivers/vdpa/ifcvf/ifcvf_main.c @@ -417,16 +417,9 @@ static int ifcvf_probe(struct pci_dev *pdev, const struct pci_device_id *id) return ret; } - ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(64)); + ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64)); if (ret) { - IFCVF_ERR(pdev, "No usable DMA confiugration\n"); - return ret; - } - - ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); - if (ret) { - IFCVF_ERR(pdev, - "No usable coherent DMA confiugration\n"); + IFCVF_ERR(pdev, "No usable DMA configuration\n"); return ret; } diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index f1d54814db97..88dde3455bfd 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -479,6 +479,11 @@ static int mlx5_vdpa_poll_one(struct mlx5_vdpa_cq *vcq) static void mlx5_vdpa_handle_completions(struct mlx5_vdpa_virtqueue *mvq, int num) { mlx5_cq_set_ci(&mvq->cq.mcq); + + /* make sure CQ cosumer update is visible to the hardware before updating + * RX doorbell record. + */ + dma_wmb(); rx_post(&mvq->vqqp, num); if (mvq->event_cb.callback) mvq->event_cb.callback(mvq->event_cb.private); diff --git a/drivers/vdpa/vdpa.c b/drivers/vdpa/vdpa.c index a69ffc991e13..c0825650c055 100644 --- a/drivers/vdpa/vdpa.c +++ b/drivers/vdpa/vdpa.c @@ -89,7 +89,7 @@ struct vdpa_device *__vdpa_alloc_device(struct device *parent, if (!vdev) goto err; - err = ida_simple_get(&vdpa_index_ida, 0, 0, GFP_KERNEL); + err = ida_alloc(&vdpa_index_ida, GFP_KERNEL); if (err < 0) goto err_ida; diff --git a/drivers/vdpa/vdpa_sim/Makefile b/drivers/vdpa/vdpa_sim/Makefile index b40278f65e04..79d4536d347e 100644 --- a/drivers/vdpa/vdpa_sim/Makefile +++ b/drivers/vdpa/vdpa_sim/Makefile @@ -1,2 +1,3 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_VDPA_SIM) += vdpa_sim.o +obj-$(CONFIG_VDPA_SIM_NET) += vdpa_sim_net.o diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index 6a90fdb9cbfc..b3fcc67bfdf0 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * VDPA networking device simulator. + * VDPA device simulator core. * * Copyright (c) 2020, Red Hat Inc. All rights reserved. * Author: Jason Wang @@ -11,97 +11,32 @@ #include #include #include -#include -#include #include #include -#include -#include -#include #include -#include -#include -#include #include #include -#include #include -#include -#include + +#include "vdpa_sim.h" #define DRV_VERSION "0.1" #define DRV_AUTHOR "Jason Wang " -#define DRV_DESC "vDPA Device Simulator" +#define DRV_DESC "vDPA Device Simulator core" #define DRV_LICENSE "GPL v2" static int batch_mapping = 1; module_param(batch_mapping, int, 0444); MODULE_PARM_DESC(batch_mapping, "Batched mapping 1 -Enable; 0 - Disable"); -static char *macaddr; -module_param(macaddr, charp, 0); -MODULE_PARM_DESC(macaddr, "Ethernet MAC address"); - -struct vdpasim_virtqueue { - struct vringh vring; - struct vringh_kiov iov; - unsigned short head; - bool ready; - u64 desc_addr; - u64 device_addr; - u64 driver_addr; - u32 num; - void *private; - irqreturn_t (*cb)(void *data); -}; +static int max_iotlb_entries = 2048; +module_param(max_iotlb_entries, int, 0444); +MODULE_PARM_DESC(max_iotlb_entries, + "Maximum number of iotlb entries. 0 means unlimited. (default: 2048)"); #define VDPASIM_QUEUE_ALIGN PAGE_SIZE #define VDPASIM_QUEUE_MAX 256 -#define VDPASIM_DEVICE_ID 0x1 #define VDPASIM_VENDOR_ID 0 -#define VDPASIM_VQ_NUM 0x2 -#define VDPASIM_NAME "vdpasim-netdev" - -static u64 vdpasim_features = (1ULL << VIRTIO_F_ANY_LAYOUT) | - (1ULL << VIRTIO_F_VERSION_1) | - (1ULL << VIRTIO_F_ACCESS_PLATFORM) | - (1ULL << VIRTIO_NET_F_MAC); - -/* State of each vdpasim device */ -struct vdpasim { - struct vdpa_device vdpa; - struct vdpasim_virtqueue vqs[VDPASIM_VQ_NUM]; - struct work_struct work; - /* spinlock to synchronize virtqueue state */ - spinlock_t lock; - struct virtio_net_config config; - struct vhost_iotlb *iommu; - void *buffer; - u32 status; - u32 generation; - u64 features; - /* spinlock to synchronize iommu table */ - spinlock_t iommu_lock; -}; - -/* TODO: cross-endian support */ -static inline bool vdpasim_is_little_endian(struct vdpasim *vdpasim) -{ - return virtio_legacy_is_little_endian() || - (vdpasim->features & (1ULL << VIRTIO_F_VERSION_1)); -} - -static inline u16 vdpasim16_to_cpu(struct vdpasim *vdpasim, __virtio16 val) -{ - return __virtio16_to_cpu(vdpasim_is_little_endian(vdpasim), val); -} - -static inline __virtio16 cpu_to_vdpasim16(struct vdpasim *vdpasim, u16 val) -{ - return __cpu_to_virtio16(vdpasim_is_little_endian(vdpasim), val); -} - -static struct vdpasim *vdpasim_dev; static struct vdpasim *vdpa_to_sim(struct vdpa_device *vdpa) { @@ -115,20 +50,34 @@ static struct vdpasim *dev_to_sim(struct device *dev) return vdpa_to_sim(vdpa); } +static void vdpasim_vq_notify(struct vringh *vring) +{ + struct vdpasim_virtqueue *vq = + container_of(vring, struct vdpasim_virtqueue, vring); + + if (!vq->cb) + return; + + vq->cb(vq->private); +} + static void vdpasim_queue_ready(struct vdpasim *vdpasim, unsigned int idx) { struct vdpasim_virtqueue *vq = &vdpasim->vqs[idx]; - vringh_init_iotlb(&vq->vring, vdpasim_features, + vringh_init_iotlb(&vq->vring, vdpasim->dev_attr.supported_features, VDPASIM_QUEUE_MAX, false, (struct vring_desc *)(uintptr_t)vq->desc_addr, (struct vring_avail *) (uintptr_t)vq->driver_addr, (struct vring_used *) (uintptr_t)vq->device_addr); + + vq->vring.notify = vdpasim_vq_notify; } -static void vdpasim_vq_reset(struct vdpasim_virtqueue *vq) +static void vdpasim_vq_reset(struct vdpasim *vdpasim, + struct vdpasim_virtqueue *vq) { vq->ready = false; vq->desc_addr = 0; @@ -136,16 +85,18 @@ static void vdpasim_vq_reset(struct vdpasim_virtqueue *vq) vq->device_addr = 0; vq->cb = NULL; vq->private = NULL; - vringh_init_iotlb(&vq->vring, vdpasim_features, VDPASIM_QUEUE_MAX, - false, NULL, NULL, NULL); + vringh_init_iotlb(&vq->vring, vdpasim->dev_attr.supported_features, + VDPASIM_QUEUE_MAX, false, NULL, NULL, NULL); + + vq->vring.notify = NULL; } static void vdpasim_reset(struct vdpasim *vdpasim) { int i; - for (i = 0; i < VDPASIM_VQ_NUM; i++) - vdpasim_vq_reset(&vdpasim->vqs[i]); + for (i = 0; i < vdpasim->dev_attr.nvqs; i++) + vdpasim_vq_reset(vdpasim, &vdpasim->vqs[i]); spin_lock(&vdpasim->iommu_lock); vhost_iotlb_reset(vdpasim->iommu); @@ -156,80 +107,6 @@ static void vdpasim_reset(struct vdpasim *vdpasim) ++vdpasim->generation; } -static void vdpasim_work(struct work_struct *work) -{ - struct vdpasim *vdpasim = container_of(work, struct - vdpasim, work); - struct vdpasim_virtqueue *txq = &vdpasim->vqs[1]; - struct vdpasim_virtqueue *rxq = &vdpasim->vqs[0]; - ssize_t read, write; - size_t total_write; - int pkts = 0; - int err; - - spin_lock(&vdpasim->lock); - - if (!(vdpasim->status & VIRTIO_CONFIG_S_DRIVER_OK)) - goto out; - - if (!txq->ready || !rxq->ready) - goto out; - - while (true) { - total_write = 0; - err = vringh_getdesc_iotlb(&txq->vring, &txq->iov, NULL, - &txq->head, GFP_ATOMIC); - if (err <= 0) - break; - - err = vringh_getdesc_iotlb(&rxq->vring, NULL, &rxq->iov, - &rxq->head, GFP_ATOMIC); - if (err <= 0) { - vringh_complete_iotlb(&txq->vring, txq->head, 0); - break; - } - - while (true) { - read = vringh_iov_pull_iotlb(&txq->vring, &txq->iov, - vdpasim->buffer, - PAGE_SIZE); - if (read <= 0) - break; - - write = vringh_iov_push_iotlb(&rxq->vring, &rxq->iov, - vdpasim->buffer, read); - if (write <= 0) - break; - - total_write += write; - } - - /* Make sure data is wrote before advancing index */ - smp_wmb(); - - vringh_complete_iotlb(&txq->vring, txq->head, 0); - vringh_complete_iotlb(&rxq->vring, rxq->head, total_write); - - /* Make sure used is visible before rasing the interrupt. */ - smp_wmb(); - - local_bh_disable(); - if (txq->cb) - txq->cb(txq->private); - if (rxq->cb) - rxq->cb(rxq->private); - local_bh_enable(); - - if (++pkts > 4) { - schedule_work(&vdpasim->work); - goto out; - } - } - -out: - spin_unlock(&vdpasim->lock); -} - static int dir_to_perm(enum dma_data_direction dir) { int perm = -EFAULT; @@ -342,26 +219,28 @@ static const struct dma_map_ops vdpasim_dma_ops = { .free = vdpasim_free_coherent, }; -static const struct vdpa_config_ops vdpasim_net_config_ops; -static const struct vdpa_config_ops vdpasim_net_batch_config_ops; +static const struct vdpa_config_ops vdpasim_config_ops; +static const struct vdpa_config_ops vdpasim_batch_config_ops; -static struct vdpasim *vdpasim_create(void) +struct vdpasim *vdpasim_create(struct vdpasim_dev_attr *dev_attr) { const struct vdpa_config_ops *ops; struct vdpasim *vdpasim; struct device *dev; - int ret = -ENOMEM; + int i, ret = -ENOMEM; if (batch_mapping) - ops = &vdpasim_net_batch_config_ops; + ops = &vdpasim_batch_config_ops; else - ops = &vdpasim_net_config_ops; + ops = &vdpasim_config_ops; - vdpasim = vdpa_alloc_device(struct vdpasim, vdpa, NULL, ops, VDPASIM_VQ_NUM); + vdpasim = vdpa_alloc_device(struct vdpasim, vdpa, NULL, ops, + dev_attr->nvqs); if (!vdpasim) goto err_alloc; - INIT_WORK(&vdpasim->work, vdpasim_work); + vdpasim->dev_attr = *dev_attr; + INIT_WORK(&vdpasim->work, dev_attr->work_fn); spin_lock_init(&vdpasim->lock); spin_lock_init(&vdpasim->iommu_lock); @@ -371,31 +250,27 @@ static struct vdpasim *vdpasim_create(void) goto err_iommu; set_dma_ops(dev, &vdpasim_dma_ops); - vdpasim->iommu = vhost_iotlb_alloc(2048, 0); + vdpasim->config = kzalloc(dev_attr->config_size, GFP_KERNEL); + if (!vdpasim->config) + goto err_iommu; + + vdpasim->vqs = kcalloc(dev_attr->nvqs, sizeof(struct vdpasim_virtqueue), + GFP_KERNEL); + if (!vdpasim->vqs) + goto err_iommu; + + vdpasim->iommu = vhost_iotlb_alloc(max_iotlb_entries, 0); if (!vdpasim->iommu) goto err_iommu; - vdpasim->buffer = kmalloc(PAGE_SIZE, GFP_KERNEL); + vdpasim->buffer = kvmalloc(dev_attr->buffer_size, GFP_KERNEL); if (!vdpasim->buffer) goto err_iommu; - if (macaddr) { - mac_pton(macaddr, vdpasim->config.mac); - if (!is_valid_ether_addr(vdpasim->config.mac)) { - ret = -EADDRNOTAVAIL; - goto err_iommu; - } - } else { - eth_random_addr(vdpasim->config.mac); - } - - vringh_set_iotlb(&vdpasim->vqs[0].vring, vdpasim->iommu); - vringh_set_iotlb(&vdpasim->vqs[1].vring, vdpasim->iommu); + for (i = 0; i < dev_attr->nvqs; i++) + vringh_set_iotlb(&vdpasim->vqs[i].vring, vdpasim->iommu); vdpasim->vdpa.dma_dev = dev; - ret = vdpa_register_device(&vdpasim->vdpa); - if (ret) - goto err_iommu; return vdpasim; @@ -404,6 +279,7 @@ err_iommu: err_alloc: return ERR_PTR(ret); } +EXPORT_SYMBOL_GPL(vdpasim_create); static int vdpasim_set_vq_address(struct vdpa_device *vdpa, u16 idx, u64 desc_area, u64 driver_area, @@ -498,28 +374,21 @@ static u32 vdpasim_get_vq_align(struct vdpa_device *vdpa) static u64 vdpasim_get_features(struct vdpa_device *vdpa) { - return vdpasim_features; + struct vdpasim *vdpasim = vdpa_to_sim(vdpa); + + return vdpasim->dev_attr.supported_features; } static int vdpasim_set_features(struct vdpa_device *vdpa, u64 features) { struct vdpasim *vdpasim = vdpa_to_sim(vdpa); - struct virtio_net_config *config = &vdpasim->config; /* DMA mapping must be done by driver */ if (!(features & (1ULL << VIRTIO_F_ACCESS_PLATFORM))) return -EINVAL; - vdpasim->features = features & vdpasim_features; - - /* We generally only know whether guest is using the legacy interface - * here, so generally that's the earliest we can set config fields. - * Note: We actually require VIRTIO_F_ACCESS_PLATFORM above which - * implies VIRTIO_F_VERSION_1, but let's not try to be clever here. - */ + vdpasim->features = features & vdpasim->dev_attr.supported_features; - config->mtu = cpu_to_vdpasim16(vdpasim, 1500); - config->status = cpu_to_vdpasim16(vdpasim, VIRTIO_NET_S_LINK_UP); return 0; } @@ -536,7 +405,9 @@ static u16 vdpasim_get_vq_num_max(struct vdpa_device *vdpa) static u32 vdpasim_get_device_id(struct vdpa_device *vdpa) { - return VDPASIM_DEVICE_ID; + struct vdpasim *vdpasim = vdpa_to_sim(vdpa); + + return vdpasim->dev_attr.id; } static u32 vdpasim_get_vendor_id(struct vdpa_device *vdpa) @@ -572,14 +443,27 @@ static void vdpasim_get_config(struct vdpa_device *vdpa, unsigned int offset, { struct vdpasim *vdpasim = vdpa_to_sim(vdpa); - if (offset + len < sizeof(struct virtio_net_config)) - memcpy(buf, (u8 *)&vdpasim->config + offset, len); + if (offset + len > vdpasim->dev_attr.config_size) + return; + + if (vdpasim->dev_attr.get_config) + vdpasim->dev_attr.get_config(vdpasim, vdpasim->config); + + memcpy(buf, vdpasim->config + offset, len); } static void vdpasim_set_config(struct vdpa_device *vdpa, unsigned int offset, const void *buf, unsigned int len) { - /* No writable config supportted by vdpasim */ + struct vdpasim *vdpasim = vdpa_to_sim(vdpa); + + if (offset + len > vdpasim->dev_attr.config_size) + return; + + memcpy(vdpasim->config + offset, buf, len); + + if (vdpasim->dev_attr.set_config) + vdpasim->dev_attr.set_config(vdpasim, vdpasim->config); } static u32 vdpasim_get_generation(struct vdpa_device *vdpa) @@ -656,12 +540,14 @@ static void vdpasim_free(struct vdpa_device *vdpa) struct vdpasim *vdpasim = vdpa_to_sim(vdpa); cancel_work_sync(&vdpasim->work); - kfree(vdpasim->buffer); + kvfree(vdpasim->buffer); if (vdpasim->iommu) vhost_iotlb_free(vdpasim->iommu); + kfree(vdpasim->vqs); + kfree(vdpasim->config); } -static const struct vdpa_config_ops vdpasim_net_config_ops = { +static const struct vdpa_config_ops vdpasim_config_ops = { .set_vq_address = vdpasim_set_vq_address, .set_vq_num = vdpasim_set_vq_num, .kick_vq = vdpasim_kick_vq, @@ -688,7 +574,7 @@ static const struct vdpa_config_ops vdpasim_net_config_ops = { .free = vdpasim_free, }; -static const struct vdpa_config_ops vdpasim_net_batch_config_ops = { +static const struct vdpa_config_ops vdpasim_batch_config_ops = { .set_vq_address = vdpasim_set_vq_address, .set_vq_num = vdpasim_set_vq_num, .kick_vq = vdpasim_kick_vq, @@ -714,26 +600,6 @@ static const struct vdpa_config_ops vdpasim_net_batch_config_ops = { .free = vdpasim_free, }; -static int __init vdpasim_dev_init(void) -{ - vdpasim_dev = vdpasim_create(); - - if (!IS_ERR(vdpasim_dev)) - return 0; - - return PTR_ERR(vdpasim_dev); -} - -static void __exit vdpasim_dev_exit(void) -{ - struct vdpa_device *vdpa = &vdpasim_dev->vdpa; - - vdpa_unregister_device(vdpa); -} - -module_init(vdpasim_dev_init) -module_exit(vdpasim_dev_exit) - MODULE_VERSION(DRV_VERSION); MODULE_LICENSE(DRV_LICENSE); MODULE_AUTHOR(DRV_AUTHOR); diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.h b/drivers/vdpa/vdpa_sim/vdpa_sim.h new file mode 100644 index 000000000000..b02142293d5b --- /dev/null +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.h @@ -0,0 +1,105 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2020, Red Hat Inc. All rights reserved. + */ + +#ifndef _VDPA_SIM_H +#define _VDPA_SIM_H + +#include +#include +#include +#include +#include + +#define VDPASIM_FEATURES ((1ULL << VIRTIO_F_ANY_LAYOUT) | \ + (1ULL << VIRTIO_F_VERSION_1) | \ + (1ULL << VIRTIO_F_ACCESS_PLATFORM)) + +struct vdpasim; + +struct vdpasim_virtqueue { + struct vringh vring; + struct vringh_kiov in_iov; + struct vringh_kiov out_iov; + unsigned short head; + bool ready; + u64 desc_addr; + u64 device_addr; + u64 driver_addr; + u32 num; + void *private; + irqreturn_t (*cb)(void *data); +}; + +struct vdpasim_dev_attr { + u64 supported_features; + size_t config_size; + size_t buffer_size; + int nvqs; + u32 id; + + work_func_t work_fn; + void (*get_config)(struct vdpasim *vdpasim, void *config); + void (*set_config)(struct vdpasim *vdpasim, const void *config); +}; + +/* State of each vdpasim device */ +struct vdpasim { + struct vdpa_device vdpa; + struct vdpasim_virtqueue *vqs; + struct work_struct work; + struct vdpasim_dev_attr dev_attr; + /* spinlock to synchronize virtqueue state */ + spinlock_t lock; + /* virtio config according to device type */ + void *config; + struct vhost_iotlb *iommu; + void *buffer; + u32 status; + u32 generation; + u64 features; + /* spinlock to synchronize iommu table */ + spinlock_t iommu_lock; +}; + +struct vdpasim *vdpasim_create(struct vdpasim_dev_attr *attr); + +/* TODO: cross-endian support */ +static inline bool vdpasim_is_little_endian(struct vdpasim *vdpasim) +{ + return virtio_legacy_is_little_endian() || + (vdpasim->features & (1ULL << VIRTIO_F_VERSION_1)); +} + +static inline u16 vdpasim16_to_cpu(struct vdpasim *vdpasim, __virtio16 val) +{ + return __virtio16_to_cpu(vdpasim_is_little_endian(vdpasim), val); +} + +static inline __virtio16 cpu_to_vdpasim16(struct vdpasim *vdpasim, u16 val) +{ + return __cpu_to_virtio16(vdpasim_is_little_endian(vdpasim), val); +} + +static inline u32 vdpasim32_to_cpu(struct vdpasim *vdpasim, __virtio32 val) +{ + return __virtio32_to_cpu(vdpasim_is_little_endian(vdpasim), val); +} + +static inline __virtio32 cpu_to_vdpasim32(struct vdpasim *vdpasim, u32 val) +{ + return __cpu_to_virtio32(vdpasim_is_little_endian(vdpasim), val); +} + +static inline u64 vdpasim64_to_cpu(struct vdpasim *vdpasim, __virtio64 val) +{ + return __virtio64_to_cpu(vdpasim_is_little_endian(vdpasim), val); +} + +static inline __virtio64 cpu_to_vdpasim64(struct vdpasim *vdpasim, u64 val) +{ + return __cpu_to_virtio64(vdpasim_is_little_endian(vdpasim), val); +} + +#endif diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim_net.c b/drivers/vdpa/vdpa_sim/vdpa_sim_net.c new file mode 100644 index 000000000000..c10b6981fdab --- /dev/null +++ b/drivers/vdpa/vdpa_sim/vdpa_sim_net.c @@ -0,0 +1,177 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * VDPA simulator for networking device. + * + * Copyright (c) 2020, Red Hat Inc. All rights reserved. + * Author: Jason Wang + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "vdpa_sim.h" + +#define DRV_VERSION "0.1" +#define DRV_AUTHOR "Jason Wang " +#define DRV_DESC "vDPA Device Simulator for networking device" +#define DRV_LICENSE "GPL v2" + +#define VDPASIM_NET_FEATURES (VDPASIM_FEATURES | \ + (1ULL << VIRTIO_NET_F_MAC)) + +#define VDPASIM_NET_VQ_NUM 2 + +static char *macaddr; +module_param(macaddr, charp, 0); +MODULE_PARM_DESC(macaddr, "Ethernet MAC address"); + +u8 macaddr_buf[ETH_ALEN]; + +static struct vdpasim *vdpasim_net_dev; + +static void vdpasim_net_work(struct work_struct *work) +{ + struct vdpasim *vdpasim = container_of(work, struct vdpasim, work); + struct vdpasim_virtqueue *txq = &vdpasim->vqs[1]; + struct vdpasim_virtqueue *rxq = &vdpasim->vqs[0]; + ssize_t read, write; + size_t total_write; + int pkts = 0; + int err; + + spin_lock(&vdpasim->lock); + + if (!(vdpasim->status & VIRTIO_CONFIG_S_DRIVER_OK)) + goto out; + + if (!txq->ready || !rxq->ready) + goto out; + + while (true) { + total_write = 0; + err = vringh_getdesc_iotlb(&txq->vring, &txq->out_iov, NULL, + &txq->head, GFP_ATOMIC); + if (err <= 0) + break; + + err = vringh_getdesc_iotlb(&rxq->vring, NULL, &rxq->in_iov, + &rxq->head, GFP_ATOMIC); + if (err <= 0) { + vringh_complete_iotlb(&txq->vring, txq->head, 0); + break; + } + + while (true) { + read = vringh_iov_pull_iotlb(&txq->vring, &txq->out_iov, + vdpasim->buffer, + PAGE_SIZE); + if (read <= 0) + break; + + write = vringh_iov_push_iotlb(&rxq->vring, &rxq->in_iov, + vdpasim->buffer, read); + if (write <= 0) + break; + + total_write += write; + } + + /* Make sure data is wrote before advancing index */ + smp_wmb(); + + vringh_complete_iotlb(&txq->vring, txq->head, 0); + vringh_complete_iotlb(&rxq->vring, rxq->head, total_write); + + /* Make sure used is visible before rasing the interrupt. */ + smp_wmb(); + + local_bh_disable(); + if (vringh_need_notify_iotlb(&txq->vring) > 0) + vringh_notify(&txq->vring); + if (vringh_need_notify_iotlb(&rxq->vring) > 0) + vringh_notify(&rxq->vring); + local_bh_enable(); + + if (++pkts > 4) { + schedule_work(&vdpasim->work); + goto out; + } + } + +out: + spin_unlock(&vdpasim->lock); +} + +static void vdpasim_net_get_config(struct vdpasim *vdpasim, void *config) +{ + struct virtio_net_config *net_config = + (struct virtio_net_config *)config; + + net_config->mtu = cpu_to_vdpasim16(vdpasim, 1500); + net_config->status = cpu_to_vdpasim16(vdpasim, VIRTIO_NET_S_LINK_UP); + memcpy(net_config->mac, macaddr_buf, ETH_ALEN); +} + +static int __init vdpasim_net_init(void) +{ + struct vdpasim_dev_attr dev_attr = {}; + int ret; + + if (macaddr) { + mac_pton(macaddr, macaddr_buf); + if (!is_valid_ether_addr(macaddr_buf)) { + ret = -EADDRNOTAVAIL; + goto out; + } + } else { + eth_random_addr(macaddr_buf); + } + + dev_attr.id = VIRTIO_ID_NET; + dev_attr.supported_features = VDPASIM_NET_FEATURES; + dev_attr.nvqs = VDPASIM_NET_VQ_NUM; + dev_attr.config_size = sizeof(struct virtio_net_config); + dev_attr.get_config = vdpasim_net_get_config; + dev_attr.work_fn = vdpasim_net_work; + dev_attr.buffer_size = PAGE_SIZE; + + vdpasim_net_dev = vdpasim_create(&dev_attr); + if (IS_ERR(vdpasim_net_dev)) { + ret = PTR_ERR(vdpasim_net_dev); + goto out; + } + + ret = vdpa_register_device(&vdpasim_net_dev->vdpa); + if (ret) + goto put_dev; + + return 0; + +put_dev: + put_device(&vdpasim_net_dev->vdpa.dev); +out: + return ret; +} + +static void __exit vdpasim_net_exit(void) +{ + struct vdpa_device *vdpa = &vdpasim_net_dev->vdpa; + + vdpa_unregister_device(vdpa); +} + +module_init(vdpasim_net_init); +module_exit(vdpasim_net_exit); + +MODULE_VERSION(DRV_VERSION); +MODULE_LICENSE(DRV_LICENSE); +MODULE_AUTHOR(DRV_AUTHOR); +MODULE_DESCRIPTION(DRV_DESC); diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c index 6ff8a5096691..4ce9f00ae10e 100644 --- a/drivers/vhost/scsi.c +++ b/drivers/vhost/scsi.c @@ -1643,7 +1643,8 @@ vhost_scsi_set_endpoint(struct vhost_scsi *vs, if (!vhost_vq_is_setup(vq)) continue; - if (vhost_scsi_setup_vq_cmds(vq, vq->num)) + ret = vhost_scsi_setup_vq_cmds(vq, vq->num); + if (ret) goto destroy_vq_cmds; } diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index 29ed4173f04e..ef688c8c0e0e 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -245,14 +245,10 @@ static long vhost_vdpa_set_config(struct vhost_vdpa *v, return -EFAULT; if (vhost_vdpa_config_validate(v, &config)) return -EINVAL; - buf = kvzalloc(config.len, GFP_KERNEL); - if (!buf) - return -ENOMEM; - if (copy_from_user(buf, c->buf, config.len)) { - kvfree(buf); - return -EFAULT; - } + buf = vmemdup_user(c->buf, config.len); + if (IS_ERR(buf)) + return PTR_ERR(buf); ops->set_config(vdpa, config.off, buf, config.len); diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c index 181e2f18beae..9fc9ec4a25f5 100644 --- a/drivers/virtio/virtio_mem.c +++ b/drivers/virtio/virtio_mem.c @@ -27,20 +27,74 @@ static bool unplug_online = true; module_param(unplug_online, bool, 0644); MODULE_PARM_DESC(unplug_online, "Try to unplug online memory"); -enum virtio_mem_mb_state { +static bool force_bbm; +module_param(force_bbm, bool, 0444); +MODULE_PARM_DESC(force_bbm, + "Force Big Block Mode. Default is 0 (auto-selection)"); + +static unsigned long bbm_block_size; +module_param(bbm_block_size, ulong, 0444); +MODULE_PARM_DESC(bbm_block_size, + "Big Block size in bytes. Default is 0 (auto-detection)."); + +static bool bbm_safe_unplug = true; +module_param(bbm_safe_unplug, bool, 0444); +MODULE_PARM_DESC(bbm_safe_unplug, + "Use a safe unplug mechanism in BBM, avoiding long/endless loops"); + +/* + * virtio-mem currently supports the following modes of operation: + * + * * Sub Block Mode (SBM): A Linux memory block spans 2..X subblocks (SB). The + * size of a Sub Block (SB) is determined based on the device block size, the + * pageblock size, and the maximum allocation granularity of the buddy. + * Subblocks within a Linux memory block might either be plugged or unplugged. + * Memory is added/removed to Linux MM in Linux memory block granularity. + * + * * Big Block Mode (BBM): A Big Block (BB) spans 1..X Linux memory blocks. + * Memory is added/removed to Linux MM in Big Block granularity. + * + * The mode is determined automatically based on the Linux memory block size + * and the device block size. + * + * User space / core MM (auto onlining) is responsible for onlining added + * Linux memory blocks - and for selecting a zone. Linux Memory Blocks are + * always onlined separately, and all memory within a Linux memory block is + * onlined to the same zone - virtio-mem relies on this behavior. + */ + +/* + * State of a Linux memory block in SBM. + */ +enum virtio_mem_sbm_mb_state { /* Unplugged, not added to Linux. Can be reused later. */ - VIRTIO_MEM_MB_STATE_UNUSED = 0, + VIRTIO_MEM_SBM_MB_UNUSED = 0, /* (Partially) plugged, not added to Linux. Error on add_memory(). */ - VIRTIO_MEM_MB_STATE_PLUGGED, + VIRTIO_MEM_SBM_MB_PLUGGED, /* Fully plugged, fully added to Linux, offline. */ - VIRTIO_MEM_MB_STATE_OFFLINE, + VIRTIO_MEM_SBM_MB_OFFLINE, /* Partially plugged, fully added to Linux, offline. */ - VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL, + VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL, /* Fully plugged, fully added to Linux, online. */ - VIRTIO_MEM_MB_STATE_ONLINE, + VIRTIO_MEM_SBM_MB_ONLINE, /* Partially plugged, fully added to Linux, online. */ - VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL, - VIRTIO_MEM_MB_STATE_COUNT + VIRTIO_MEM_SBM_MB_ONLINE_PARTIAL, + VIRTIO_MEM_SBM_MB_COUNT +}; + +/* + * State of a Big Block (BB) in BBM, covering 1..X Linux memory blocks. + */ +enum virtio_mem_bbm_bb_state { + /* Unplugged, not added to Linux. Can be reused later. */ + VIRTIO_MEM_BBM_BB_UNUSED = 0, + /* Plugged, not added to Linux. Error on add_memory(). */ + VIRTIO_MEM_BBM_BB_PLUGGED, + /* Plugged and added to Linux. */ + VIRTIO_MEM_BBM_BB_ADDED, + /* All online parts are fake-offline, ready to remove. */ + VIRTIO_MEM_BBM_BB_FAKE_OFFLINE, + VIRTIO_MEM_BBM_BB_COUNT }; struct virtio_mem { @@ -51,6 +105,7 @@ struct virtio_mem { /* Workqueue that processes the plug/unplug requests. */ struct work_struct wq; + atomic_t wq_active; atomic_t config_changed; /* Virtqueue for guest->host requests. */ @@ -70,27 +125,13 @@ struct virtio_mem { /* The device block size (for communicating with the device). */ uint64_t device_block_size; - /* The translated node id. NUMA_NO_NODE in case not specified. */ + /* The determined node id for all memory of the device. */ int nid; /* Physical start address of the memory region. */ uint64_t addr; /* Maximum region size in bytes. */ uint64_t region_size; - /* The subblock size. */ - uint64_t subblock_size; - /* The number of subblocks per memory block. */ - uint32_t nb_sb_per_mb; - - /* Id of the first memory block of this device. */ - unsigned long first_mb_id; - /* Id of the last memory block of this device. */ - unsigned long last_mb_id; - /* Id of the last usable memory block of this device. */ - unsigned long last_usable_mb_id; - /* Id of the next memory bock to prepare when needed. */ - unsigned long next_mb_id; - /* The parent resource for all memory added via this device. */ struct resource *parent_resource; /* @@ -99,31 +140,79 @@ struct virtio_mem { */ const char *resource_name; - /* Summary of all memory block states. */ - unsigned long nb_mb_state[VIRTIO_MEM_MB_STATE_COUNT]; -#define VIRTIO_MEM_NB_OFFLINE_THRESHOLD 10 - - /* - * One byte state per memory block. - * - * Allocated via vmalloc(). When preparing new blocks, resized - * (alloc+copy+free) when needed (crossing pages with the next mb). - * (when crossing pages). - * - * With 128MB memory blocks, we have states for 512GB of memory in one - * page. - */ - uint8_t *mb_state; - /* - * $nb_sb_per_mb bit per memory block. Handled similar to mb_state. - * - * With 4MB subblocks, we manage 128GB of memory in one page. + * We don't want to add too much memory if it's not getting onlined, + * to avoid running OOM. Besides this threshold, we allow to have at + * least two offline blocks at a time (whatever is bigger). */ - unsigned long *sb_bitmap; +#define VIRTIO_MEM_DEFAULT_OFFLINE_THRESHOLD (1024 * 1024 * 1024) + atomic64_t offline_size; + uint64_t offline_threshold; + + /* If set, the driver is in SBM, otherwise in BBM. */ + bool in_sbm; + + union { + struct { + /* Id of the first memory block of this device. */ + unsigned long first_mb_id; + /* Id of the last usable memory block of this device. */ + unsigned long last_usable_mb_id; + /* Id of the next memory bock to prepare when needed. */ + unsigned long next_mb_id; + + /* The subblock size. */ + uint64_t sb_size; + /* The number of subblocks per Linux memory block. */ + uint32_t sbs_per_mb; + + /* Summary of all memory block states. */ + unsigned long mb_count[VIRTIO_MEM_SBM_MB_COUNT]; + + /* + * One byte state per memory block. Allocated via + * vmalloc(). Resized (alloc+copy+free) on demand. + * + * With 128 MiB memory blocks, we have states for 512 + * GiB of memory in one 4 KiB page. + */ + uint8_t *mb_states; + + /* + * Bitmap: one bit per subblock. Allocated similar to + * sbm.mb_states. + * + * A set bit means the corresponding subblock is + * plugged, otherwise it's unblocked. + * + * With 4 MiB subblocks, we manage 128 GiB of memory + * in one 4 KiB page. + */ + unsigned long *sb_states; + } sbm; + + struct { + /* Id of the first big block of this device. */ + unsigned long first_bb_id; + /* Id of the last usable big block of this device. */ + unsigned long last_usable_bb_id; + /* Id of the next device bock to prepare when needed. */ + unsigned long next_bb_id; + + /* Summary of all big block states. */ + unsigned long bb_count[VIRTIO_MEM_BBM_BB_COUNT]; + + /* One byte state per big block. See sbm.mb_states. */ + uint8_t *bb_states; + + /* The block size used for plugging/adding/removing. */ + uint64_t bb_size; + } bbm; + }; /* - * Mutex that protects the nb_mb_state, mb_state, and sb_bitmap. + * Mutex that protects the sbm.mb_count, sbm.mb_states, + * sbm.sb_states, bbm.bb_count, and bbm.bb_states * * When this lock is held the pointers can't change, ONLINE and * OFFLINE blocks can't change the state and no subblocks will get @@ -160,6 +249,11 @@ static DEFINE_MUTEX(virtio_mem_mutex); static LIST_HEAD(virtio_mem_devices); static void virtio_mem_online_page_cb(struct page *page, unsigned int order); +static void virtio_mem_fake_offline_going_offline(unsigned long pfn, + unsigned long nr_pages); +static void virtio_mem_fake_offline_cancel_offline(unsigned long pfn, + unsigned long nr_pages); +static void virtio_mem_retry(struct virtio_mem *vm); /* * Register a virtio-mem device so it will be considered for the online_page @@ -212,6 +306,24 @@ static unsigned long virtio_mem_mb_id_to_phys(unsigned long mb_id) return mb_id * memory_block_size_bytes(); } +/* + * Calculate the big block id of a given address. + */ +static unsigned long virtio_mem_phys_to_bb_id(struct virtio_mem *vm, + uint64_t addr) +{ + return addr / vm->bbm.bb_size; +} + +/* + * Calculate the physical start address of a given big block id. + */ +static uint64_t virtio_mem_bb_id_to_phys(struct virtio_mem *vm, + unsigned long bb_id) +{ + return bb_id * vm->bbm.bb_size; +} + /* * Calculate the subblock id of a given address. */ @@ -221,89 +333,164 @@ static unsigned long virtio_mem_phys_to_sb_id(struct virtio_mem *vm, const unsigned long mb_id = virtio_mem_phys_to_mb_id(addr); const unsigned long mb_addr = virtio_mem_mb_id_to_phys(mb_id); - return (addr - mb_addr) / vm->subblock_size; + return (addr - mb_addr) / vm->sbm.sb_size; } +/* + * Set the state of a big block, taking care of the state counter. + */ +static void virtio_mem_bbm_set_bb_state(struct virtio_mem *vm, + unsigned long bb_id, + enum virtio_mem_bbm_bb_state state) +{ + const unsigned long idx = bb_id - vm->bbm.first_bb_id; + enum virtio_mem_bbm_bb_state old_state; + + old_state = vm->bbm.bb_states[idx]; + vm->bbm.bb_states[idx] = state; + + BUG_ON(vm->bbm.bb_count[old_state] == 0); + vm->bbm.bb_count[old_state]--; + vm->bbm.bb_count[state]++; +} + +/* + * Get the state of a big block. + */ +static enum virtio_mem_bbm_bb_state virtio_mem_bbm_get_bb_state(struct virtio_mem *vm, + unsigned long bb_id) +{ + return vm->bbm.bb_states[bb_id - vm->bbm.first_bb_id]; +} + +/* + * Prepare the big block state array for the next big block. + */ +static int virtio_mem_bbm_bb_states_prepare_next_bb(struct virtio_mem *vm) +{ + unsigned long old_bytes = vm->bbm.next_bb_id - vm->bbm.first_bb_id; + unsigned long new_bytes = old_bytes + 1; + int old_pages = PFN_UP(old_bytes); + int new_pages = PFN_UP(new_bytes); + uint8_t *new_array; + + if (vm->bbm.bb_states && old_pages == new_pages) + return 0; + + new_array = vzalloc(new_pages * PAGE_SIZE); + if (!new_array) + return -ENOMEM; + + mutex_lock(&vm->hotplug_mutex); + if (vm->bbm.bb_states) + memcpy(new_array, vm->bbm.bb_states, old_pages * PAGE_SIZE); + vfree(vm->bbm.bb_states); + vm->bbm.bb_states = new_array; + mutex_unlock(&vm->hotplug_mutex); + + return 0; +} + +#define virtio_mem_bbm_for_each_bb(_vm, _bb_id, _state) \ + for (_bb_id = vm->bbm.first_bb_id; \ + _bb_id < vm->bbm.next_bb_id && _vm->bbm.bb_count[_state]; \ + _bb_id++) \ + if (virtio_mem_bbm_get_bb_state(_vm, _bb_id) == _state) + +#define virtio_mem_bbm_for_each_bb_rev(_vm, _bb_id, _state) \ + for (_bb_id = vm->bbm.next_bb_id - 1; \ + _bb_id >= vm->bbm.first_bb_id && _vm->bbm.bb_count[_state]; \ + _bb_id--) \ + if (virtio_mem_bbm_get_bb_state(_vm, _bb_id) == _state) + /* * Set the state of a memory block, taking care of the state counter. */ -static void virtio_mem_mb_set_state(struct virtio_mem *vm, unsigned long mb_id, - enum virtio_mem_mb_state state) +static void virtio_mem_sbm_set_mb_state(struct virtio_mem *vm, + unsigned long mb_id, uint8_t state) { - const unsigned long idx = mb_id - vm->first_mb_id; - enum virtio_mem_mb_state old_state; + const unsigned long idx = mb_id - vm->sbm.first_mb_id; + uint8_t old_state; - old_state = vm->mb_state[idx]; - vm->mb_state[idx] = state; + old_state = vm->sbm.mb_states[idx]; + vm->sbm.mb_states[idx] = state; - BUG_ON(vm->nb_mb_state[old_state] == 0); - vm->nb_mb_state[old_state]--; - vm->nb_mb_state[state]++; + BUG_ON(vm->sbm.mb_count[old_state] == 0); + vm->sbm.mb_count[old_state]--; + vm->sbm.mb_count[state]++; } /* * Get the state of a memory block. */ -static enum virtio_mem_mb_state virtio_mem_mb_get_state(struct virtio_mem *vm, - unsigned long mb_id) +static uint8_t virtio_mem_sbm_get_mb_state(struct virtio_mem *vm, + unsigned long mb_id) { - const unsigned long idx = mb_id - vm->first_mb_id; + const unsigned long idx = mb_id - vm->sbm.first_mb_id; - return vm->mb_state[idx]; + return vm->sbm.mb_states[idx]; } /* * Prepare the state array for the next memory block. */ -static int virtio_mem_mb_state_prepare_next_mb(struct virtio_mem *vm) +static int virtio_mem_sbm_mb_states_prepare_next_mb(struct virtio_mem *vm) { - unsigned long old_bytes = vm->next_mb_id - vm->first_mb_id + 1; - unsigned long new_bytes = vm->next_mb_id - vm->first_mb_id + 2; - int old_pages = PFN_UP(old_bytes); - int new_pages = PFN_UP(new_bytes); - uint8_t *new_mb_state; + int old_pages = PFN_UP(vm->sbm.next_mb_id - vm->sbm.first_mb_id); + int new_pages = PFN_UP(vm->sbm.next_mb_id - vm->sbm.first_mb_id + 1); + uint8_t *new_array; - if (vm->mb_state && old_pages == new_pages) + if (vm->sbm.mb_states && old_pages == new_pages) return 0; - new_mb_state = vzalloc(new_pages * PAGE_SIZE); - if (!new_mb_state) + new_array = vzalloc(new_pages * PAGE_SIZE); + if (!new_array) return -ENOMEM; mutex_lock(&vm->hotplug_mutex); - if (vm->mb_state) - memcpy(new_mb_state, vm->mb_state, old_pages * PAGE_SIZE); - vfree(vm->mb_state); - vm->mb_state = new_mb_state; + if (vm->sbm.mb_states) + memcpy(new_array, vm->sbm.mb_states, old_pages * PAGE_SIZE); + vfree(vm->sbm.mb_states); + vm->sbm.mb_states = new_array; mutex_unlock(&vm->hotplug_mutex); return 0; } -#define virtio_mem_for_each_mb_state(_vm, _mb_id, _state) \ - for (_mb_id = _vm->first_mb_id; \ - _mb_id < _vm->next_mb_id && _vm->nb_mb_state[_state]; \ +#define virtio_mem_sbm_for_each_mb(_vm, _mb_id, _state) \ + for (_mb_id = _vm->sbm.first_mb_id; \ + _mb_id < _vm->sbm.next_mb_id && _vm->sbm.mb_count[_state]; \ _mb_id++) \ - if (virtio_mem_mb_get_state(_vm, _mb_id) == _state) + if (virtio_mem_sbm_get_mb_state(_vm, _mb_id) == _state) -#define virtio_mem_for_each_mb_state_rev(_vm, _mb_id, _state) \ - for (_mb_id = _vm->next_mb_id - 1; \ - _mb_id >= _vm->first_mb_id && _vm->nb_mb_state[_state]; \ +#define virtio_mem_sbm_for_each_mb_rev(_vm, _mb_id, _state) \ + for (_mb_id = _vm->sbm.next_mb_id - 1; \ + _mb_id >= _vm->sbm.first_mb_id && _vm->sbm.mb_count[_state]; \ _mb_id--) \ - if (virtio_mem_mb_get_state(_vm, _mb_id) == _state) + if (virtio_mem_sbm_get_mb_state(_vm, _mb_id) == _state) + +/* + * Calculate the bit number in the subblock bitmap for the given subblock + * inside the given memory block. + */ +static int virtio_mem_sbm_sb_state_bit_nr(struct virtio_mem *vm, + unsigned long mb_id, int sb_id) +{ + return (mb_id - vm->sbm.first_mb_id) * vm->sbm.sbs_per_mb + sb_id; +} /* * Mark all selected subblocks plugged. * * Will not modify the state of the memory block. */ -static void virtio_mem_mb_set_sb_plugged(struct virtio_mem *vm, - unsigned long mb_id, int sb_id, - int count) +static void virtio_mem_sbm_set_sb_plugged(struct virtio_mem *vm, + unsigned long mb_id, int sb_id, + int count) { - const int bit = (mb_id - vm->first_mb_id) * vm->nb_sb_per_mb + sb_id; + const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id); - __bitmap_set(vm->sb_bitmap, bit, count); + __bitmap_set(vm->sbm.sb_states, bit, count); } /* @@ -311,105 +498,114 @@ static void virtio_mem_mb_set_sb_plugged(struct virtio_mem *vm, * * Will not modify the state of the memory block. */ -static void virtio_mem_mb_set_sb_unplugged(struct virtio_mem *vm, - unsigned long mb_id, int sb_id, - int count) +static void virtio_mem_sbm_set_sb_unplugged(struct virtio_mem *vm, + unsigned long mb_id, int sb_id, + int count) { - const int bit = (mb_id - vm->first_mb_id) * vm->nb_sb_per_mb + sb_id; + const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id); - __bitmap_clear(vm->sb_bitmap, bit, count); + __bitmap_clear(vm->sbm.sb_states, bit, count); } /* * Test if all selected subblocks are plugged. */ -static bool virtio_mem_mb_test_sb_plugged(struct virtio_mem *vm, - unsigned long mb_id, int sb_id, - int count) +static bool virtio_mem_sbm_test_sb_plugged(struct virtio_mem *vm, + unsigned long mb_id, int sb_id, + int count) { - const int bit = (mb_id - vm->first_mb_id) * vm->nb_sb_per_mb + sb_id; + const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id); if (count == 1) - return test_bit(bit, vm->sb_bitmap); + return test_bit(bit, vm->sbm.sb_states); /* TODO: Helper similar to bitmap_set() */ - return find_next_zero_bit(vm->sb_bitmap, bit + count, bit) >= + return find_next_zero_bit(vm->sbm.sb_states, bit + count, bit) >= bit + count; } /* * Test if all selected subblocks are unplugged. */ -static bool virtio_mem_mb_test_sb_unplugged(struct virtio_mem *vm, - unsigned long mb_id, int sb_id, - int count) +static bool virtio_mem_sbm_test_sb_unplugged(struct virtio_mem *vm, + unsigned long mb_id, int sb_id, + int count) { - const int bit = (mb_id - vm->first_mb_id) * vm->nb_sb_per_mb + sb_id; + const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id); /* TODO: Helper similar to bitmap_set() */ - return find_next_bit(vm->sb_bitmap, bit + count, bit) >= bit + count; + return find_next_bit(vm->sbm.sb_states, bit + count, bit) >= + bit + count; } /* - * Find the first unplugged subblock. Returns vm->nb_sb_per_mb in case there is + * Find the first unplugged subblock. Returns vm->sbm.sbs_per_mb in case there is * none. */ -static int virtio_mem_mb_first_unplugged_sb(struct virtio_mem *vm, +static int virtio_mem_sbm_first_unplugged_sb(struct virtio_mem *vm, unsigned long mb_id) { - const int bit = (mb_id - vm->first_mb_id) * vm->nb_sb_per_mb; + const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, 0); - return find_next_zero_bit(vm->sb_bitmap, bit + vm->nb_sb_per_mb, bit) - - bit; + return find_next_zero_bit(vm->sbm.sb_states, + bit + vm->sbm.sbs_per_mb, bit) - bit; } /* * Prepare the subblock bitmap for the next memory block. */ -static int virtio_mem_sb_bitmap_prepare_next_mb(struct virtio_mem *vm) +static int virtio_mem_sbm_sb_states_prepare_next_mb(struct virtio_mem *vm) { - const unsigned long old_nb_mb = vm->next_mb_id - vm->first_mb_id; - const unsigned long old_nb_bits = old_nb_mb * vm->nb_sb_per_mb; - const unsigned long new_nb_bits = (old_nb_mb + 1) * vm->nb_sb_per_mb; + const unsigned long old_nb_mb = vm->sbm.next_mb_id - vm->sbm.first_mb_id; + const unsigned long old_nb_bits = old_nb_mb * vm->sbm.sbs_per_mb; + const unsigned long new_nb_bits = (old_nb_mb + 1) * vm->sbm.sbs_per_mb; int old_pages = PFN_UP(BITS_TO_LONGS(old_nb_bits) * sizeof(long)); int new_pages = PFN_UP(BITS_TO_LONGS(new_nb_bits) * sizeof(long)); - unsigned long *new_sb_bitmap, *old_sb_bitmap; + unsigned long *new_bitmap, *old_bitmap; - if (vm->sb_bitmap && old_pages == new_pages) + if (vm->sbm.sb_states && old_pages == new_pages) return 0; - new_sb_bitmap = vzalloc(new_pages * PAGE_SIZE); - if (!new_sb_bitmap) + new_bitmap = vzalloc(new_pages * PAGE_SIZE); + if (!new_bitmap) return -ENOMEM; mutex_lock(&vm->hotplug_mutex); - if (new_sb_bitmap) - memcpy(new_sb_bitmap, vm->sb_bitmap, old_pages * PAGE_SIZE); + if (new_bitmap) + memcpy(new_bitmap, vm->sbm.sb_states, old_pages * PAGE_SIZE); - old_sb_bitmap = vm->sb_bitmap; - vm->sb_bitmap = new_sb_bitmap; + old_bitmap = vm->sbm.sb_states; + vm->sbm.sb_states = new_bitmap; mutex_unlock(&vm->hotplug_mutex); - vfree(old_sb_bitmap); + vfree(old_bitmap); return 0; } /* - * Try to add a memory block to Linux. This will usually only fail - * if out of memory. + * Test if we could add memory without creating too much offline memory - + * to avoid running OOM if memory is getting onlined deferred. + */ +static bool virtio_mem_could_add_memory(struct virtio_mem *vm, uint64_t size) +{ + if (WARN_ON_ONCE(size > vm->offline_threshold)) + return false; + + return atomic64_read(&vm->offline_size) + size <= vm->offline_threshold; +} + +/* + * Try adding memory to Linux. Will usually only fail if out of memory. * * Must not be called with the vm->hotplug_mutex held (possible deadlock with * onlining code). * - * Will not modify the state of the memory block. + * Will not modify the state of memory blocks in virtio-mem. */ -static int virtio_mem_mb_add(struct virtio_mem *vm, unsigned long mb_id) +static int virtio_mem_add_memory(struct virtio_mem *vm, uint64_t addr, + uint64_t size) { - const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id); - int nid = vm->nid; - - if (nid == NUMA_NO_NODE) - nid = memory_add_physaddr_to_nid(addr); + int rc; /* * When force-unloading the driver and we still have memory added to @@ -422,53 +618,155 @@ static int virtio_mem_mb_add(struct virtio_mem *vm, unsigned long mb_id) return -ENOMEM; } - dev_dbg(&vm->vdev->dev, "adding memory block: %lu\n", mb_id); - return add_memory_driver_managed(nid, addr, memory_block_size_bytes(), - vm->resource_name, - MEMHP_MERGE_RESOURCE); + dev_dbg(&vm->vdev->dev, "adding memory: 0x%llx - 0x%llx\n", addr, + addr + size - 1); + /* Memory might get onlined immediately. */ + atomic64_add(size, &vm->offline_size); + rc = add_memory_driver_managed(vm->nid, addr, size, vm->resource_name, + MEMHP_MERGE_RESOURCE); + if (rc) { + atomic64_sub(size, &vm->offline_size); + dev_warn(&vm->vdev->dev, "adding memory failed: %d\n", rc); + /* + * TODO: Linux MM does not properly clean up yet in all cases + * where adding of memory failed - especially on -ENOMEM. + */ + } + return rc; +} + +/* + * See virtio_mem_add_memory(): Try adding a single Linux memory block. + */ +static int virtio_mem_sbm_add_mb(struct virtio_mem *vm, unsigned long mb_id) +{ + const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id); + const uint64_t size = memory_block_size_bytes(); + + return virtio_mem_add_memory(vm, addr, size); +} + +/* + * See virtio_mem_add_memory(): Try adding a big block. + */ +static int virtio_mem_bbm_add_bb(struct virtio_mem *vm, unsigned long bb_id) +{ + const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id); + const uint64_t size = vm->bbm.bb_size; + + return virtio_mem_add_memory(vm, addr, size); } /* - * Try to remove a memory block from Linux. Will only fail if the memory block - * is not offline. + * Try removing memory from Linux. Will only fail if memory blocks aren't + * offline. * * Must not be called with the vm->hotplug_mutex held (possible deadlock with * onlining code). * - * Will not modify the state of the memory block. + * Will not modify the state of memory blocks in virtio-mem. + */ +static int virtio_mem_remove_memory(struct virtio_mem *vm, uint64_t addr, + uint64_t size) +{ + int rc; + + dev_dbg(&vm->vdev->dev, "removing memory: 0x%llx - 0x%llx\n", addr, + addr + size - 1); + rc = remove_memory(vm->nid, addr, size); + if (!rc) { + atomic64_sub(size, &vm->offline_size); + /* + * We might have freed up memory we can now unplug, retry + * immediately instead of waiting. + */ + virtio_mem_retry(vm); + } else { + dev_dbg(&vm->vdev->dev, "removing memory failed: %d\n", rc); + } + return rc; +} + +/* + * See virtio_mem_remove_memory(): Try removing a single Linux memory block. */ -static int virtio_mem_mb_remove(struct virtio_mem *vm, unsigned long mb_id) +static int virtio_mem_sbm_remove_mb(struct virtio_mem *vm, unsigned long mb_id) { const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id); - int nid = vm->nid; + const uint64_t size = memory_block_size_bytes(); - if (nid == NUMA_NO_NODE) - nid = memory_add_physaddr_to_nid(addr); + return virtio_mem_remove_memory(vm, addr, size); +} + +/* + * See virtio_mem_remove_memory(): Try to remove all Linux memory blocks covered + * by the big block. + */ +static int virtio_mem_bbm_remove_bb(struct virtio_mem *vm, unsigned long bb_id) +{ + const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id); + const uint64_t size = vm->bbm.bb_size; - dev_dbg(&vm->vdev->dev, "removing memory block: %lu\n", mb_id); - return remove_memory(nid, addr, memory_block_size_bytes()); + return virtio_mem_remove_memory(vm, addr, size); } /* - * Try to offline and remove a memory block from Linux. + * Try offlining and removing memory from Linux. * * Must not be called with the vm->hotplug_mutex held (possible deadlock with * onlining code). * - * Will not modify the state of the memory block. + * Will not modify the state of memory blocks in virtio-mem. */ -static int virtio_mem_mb_offline_and_remove(struct virtio_mem *vm, - unsigned long mb_id) +static int virtio_mem_offline_and_remove_memory(struct virtio_mem *vm, + uint64_t addr, + uint64_t size) +{ + int rc; + + dev_dbg(&vm->vdev->dev, + "offlining and removing memory: 0x%llx - 0x%llx\n", addr, + addr + size - 1); + + rc = offline_and_remove_memory(vm->nid, addr, size); + if (!rc) { + atomic64_sub(size, &vm->offline_size); + /* + * We might have freed up memory we can now unplug, retry + * immediately instead of waiting. + */ + virtio_mem_retry(vm); + } else { + dev_dbg(&vm->vdev->dev, + "offlining and removing memory failed: %d\n", rc); + } + return rc; +} + +/* + * See virtio_mem_offline_and_remove_memory(): Try offlining and removing + * a single Linux memory block. + */ +static int virtio_mem_sbm_offline_and_remove_mb(struct virtio_mem *vm, + unsigned long mb_id) { const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id); - int nid = vm->nid; + const uint64_t size = memory_block_size_bytes(); - if (nid == NUMA_NO_NODE) - nid = memory_add_physaddr_to_nid(addr); + return virtio_mem_offline_and_remove_memory(vm, addr, size); +} + +/* + * See virtio_mem_offline_and_remove_memory(): Try to offline and remove a + * all Linux memory blocks covered by the big block. + */ +static int virtio_mem_bbm_offline_and_remove_bb(struct virtio_mem *vm, + unsigned long bb_id) +{ + const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id); + const uint64_t size = vm->bbm.bb_size; - dev_dbg(&vm->vdev->dev, "offlining and removing memory block: %lu\n", - mb_id); - return offline_and_remove_memory(nid, addr, memory_block_size_bytes()); + return virtio_mem_offline_and_remove_memory(vm, addr, size); } /* @@ -499,31 +797,28 @@ static int virtio_mem_translate_node_id(struct virtio_mem *vm, uint16_t node_id) * Test if a virtio-mem device overlaps with the given range. Can be called * from (notifier) callbacks lockless. */ -static bool virtio_mem_overlaps_range(struct virtio_mem *vm, - unsigned long start, unsigned long size) +static bool virtio_mem_overlaps_range(struct virtio_mem *vm, uint64_t start, + uint64_t size) { - unsigned long dev_start = virtio_mem_mb_id_to_phys(vm->first_mb_id); - unsigned long dev_end = virtio_mem_mb_id_to_phys(vm->last_mb_id) + - memory_block_size_bytes(); - - return start < dev_end && dev_start < start + size; + return start < vm->addr + vm->region_size && vm->addr < start + size; } /* - * Test if a virtio-mem device owns a memory block. Can be called from + * Test if a virtio-mem device contains a given range. Can be called from * (notifier) callbacks lockless. */ -static bool virtio_mem_owned_mb(struct virtio_mem *vm, unsigned long mb_id) +static bool virtio_mem_contains_range(struct virtio_mem *vm, uint64_t start, + uint64_t size) { - return mb_id >= vm->first_mb_id && mb_id <= vm->last_mb_id; + return start >= vm->addr && start + size <= vm->addr + vm->region_size; } -static int virtio_mem_notify_going_online(struct virtio_mem *vm, - unsigned long mb_id) +static int virtio_mem_sbm_notify_going_online(struct virtio_mem *vm, + unsigned long mb_id) { - switch (virtio_mem_mb_get_state(vm, mb_id)) { - case VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL: - case VIRTIO_MEM_MB_STATE_OFFLINE: + switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) { + case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL: + case VIRTIO_MEM_SBM_MB_OFFLINE: return NOTIFY_OK; default: break; @@ -533,108 +828,100 @@ static int virtio_mem_notify_going_online(struct virtio_mem *vm, return NOTIFY_BAD; } -static void virtio_mem_notify_offline(struct virtio_mem *vm, - unsigned long mb_id) +static void virtio_mem_sbm_notify_offline(struct virtio_mem *vm, + unsigned long mb_id) { - switch (virtio_mem_mb_get_state(vm, mb_id)) { - case VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL: - virtio_mem_mb_set_state(vm, mb_id, - VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL); + switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) { + case VIRTIO_MEM_SBM_MB_ONLINE_PARTIAL: + virtio_mem_sbm_set_mb_state(vm, mb_id, + VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL); break; - case VIRTIO_MEM_MB_STATE_ONLINE: - virtio_mem_mb_set_state(vm, mb_id, - VIRTIO_MEM_MB_STATE_OFFLINE); + case VIRTIO_MEM_SBM_MB_ONLINE: + virtio_mem_sbm_set_mb_state(vm, mb_id, + VIRTIO_MEM_SBM_MB_OFFLINE); break; default: BUG(); break; } - - /* - * Trigger the workqueue, maybe we can now unplug memory. Also, - * when we offline and remove a memory block, this will re-trigger - * us immediately - which is often nice because the removal of - * the memory block (e.g., memmap) might have freed up memory - * on other memory blocks we manage. - */ - virtio_mem_retry(vm); } -static void virtio_mem_notify_online(struct virtio_mem *vm, unsigned long mb_id) +static void virtio_mem_sbm_notify_online(struct virtio_mem *vm, + unsigned long mb_id) { - unsigned long nb_offline; - - switch (virtio_mem_mb_get_state(vm, mb_id)) { - case VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL: - virtio_mem_mb_set_state(vm, mb_id, - VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL); + switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) { + case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL: + virtio_mem_sbm_set_mb_state(vm, mb_id, + VIRTIO_MEM_SBM_MB_ONLINE_PARTIAL); break; - case VIRTIO_MEM_MB_STATE_OFFLINE: - virtio_mem_mb_set_state(vm, mb_id, VIRTIO_MEM_MB_STATE_ONLINE); + case VIRTIO_MEM_SBM_MB_OFFLINE: + virtio_mem_sbm_set_mb_state(vm, mb_id, + VIRTIO_MEM_SBM_MB_ONLINE); break; default: BUG(); break; } - nb_offline = vm->nb_mb_state[VIRTIO_MEM_MB_STATE_OFFLINE] + - vm->nb_mb_state[VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL]; - - /* see if we can add new blocks now that we onlined one block */ - if (nb_offline == VIRTIO_MEM_NB_OFFLINE_THRESHOLD - 1) - virtio_mem_retry(vm); } -static void virtio_mem_notify_going_offline(struct virtio_mem *vm, - unsigned long mb_id) +static void virtio_mem_sbm_notify_going_offline(struct virtio_mem *vm, + unsigned long mb_id) { - const unsigned long nr_pages = PFN_DOWN(vm->subblock_size); - struct page *page; + const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size); unsigned long pfn; - int sb_id, i; + int sb_id; - for (sb_id = 0; sb_id < vm->nb_sb_per_mb; sb_id++) { - if (virtio_mem_mb_test_sb_plugged(vm, mb_id, sb_id, 1)) + for (sb_id = 0; sb_id < vm->sbm.sbs_per_mb; sb_id++) { + if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1)) continue; - /* - * Drop our reference to the pages so the memory can get - * offlined and add the unplugged pages to the managed - * page counters (so offlining code can correctly subtract - * them again). - */ pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) + - sb_id * vm->subblock_size); - adjust_managed_page_count(pfn_to_page(pfn), nr_pages); - for (i = 0; i < nr_pages; i++) { - page = pfn_to_page(pfn + i); - if (WARN_ON(!page_ref_dec_and_test(page))) - dump_page(page, "unplugged page referenced"); - } + sb_id * vm->sbm.sb_size); + virtio_mem_fake_offline_going_offline(pfn, nr_pages); } } -static void virtio_mem_notify_cancel_offline(struct virtio_mem *vm, - unsigned long mb_id) +static void virtio_mem_sbm_notify_cancel_offline(struct virtio_mem *vm, + unsigned long mb_id) { - const unsigned long nr_pages = PFN_DOWN(vm->subblock_size); + const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size); unsigned long pfn; - int sb_id, i; + int sb_id; - for (sb_id = 0; sb_id < vm->nb_sb_per_mb; sb_id++) { - if (virtio_mem_mb_test_sb_plugged(vm, mb_id, sb_id, 1)) + for (sb_id = 0; sb_id < vm->sbm.sbs_per_mb; sb_id++) { + if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1)) continue; - /* - * Get the reference we dropped when going offline and - * subtract the unplugged pages from the managed page - * counters. - */ pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) + - sb_id * vm->subblock_size); - adjust_managed_page_count(pfn_to_page(pfn), -nr_pages); - for (i = 0; i < nr_pages; i++) - page_ref_inc(pfn_to_page(pfn + i)); + sb_id * vm->sbm.sb_size); + virtio_mem_fake_offline_cancel_offline(pfn, nr_pages); } } +static void virtio_mem_bbm_notify_going_offline(struct virtio_mem *vm, + unsigned long bb_id, + unsigned long pfn, + unsigned long nr_pages) +{ + /* + * When marked as "fake-offline", all online memory of this device block + * is allocated by us. Otherwise, we don't have any memory allocated. + */ + if (virtio_mem_bbm_get_bb_state(vm, bb_id) != + VIRTIO_MEM_BBM_BB_FAKE_OFFLINE) + return; + virtio_mem_fake_offline_going_offline(pfn, nr_pages); +} + +static void virtio_mem_bbm_notify_cancel_offline(struct virtio_mem *vm, + unsigned long bb_id, + unsigned long pfn, + unsigned long nr_pages) +{ + if (virtio_mem_bbm_get_bb_state(vm, bb_id) != + VIRTIO_MEM_BBM_BB_FAKE_OFFLINE) + return; + virtio_mem_fake_offline_cancel_offline(pfn, nr_pages); +} + /* * This callback will either be called synchronously from add_memory() or * asynchronously (e.g., triggered via user space). We have to be careful @@ -648,20 +935,33 @@ static int virtio_mem_memory_notifier_cb(struct notifier_block *nb, struct memory_notify *mhp = arg; const unsigned long start = PFN_PHYS(mhp->start_pfn); const unsigned long size = PFN_PHYS(mhp->nr_pages); - const unsigned long mb_id = virtio_mem_phys_to_mb_id(start); int rc = NOTIFY_OK; + unsigned long id; if (!virtio_mem_overlaps_range(vm, start, size)) return NOTIFY_DONE; - /* - * Memory is onlined/offlined in memory block granularity. We cannot - * cross virtio-mem device boundaries and memory block boundaries. Bail - * out if this ever changes. - */ - if (WARN_ON_ONCE(size != memory_block_size_bytes() || - !IS_ALIGNED(start, memory_block_size_bytes()))) - return NOTIFY_BAD; + if (vm->in_sbm) { + id = virtio_mem_phys_to_mb_id(start); + /* + * In SBM, we add memory in separate memory blocks - we expect + * it to be onlined/offlined in the same granularity. Bail out + * if this ever changes. + */ + if (WARN_ON_ONCE(size != memory_block_size_bytes() || + !IS_ALIGNED(start, memory_block_size_bytes()))) + return NOTIFY_BAD; + } else { + id = virtio_mem_phys_to_bb_id(vm, start); + /* + * In BBM, we only care about onlining/offlining happening + * within a single big block, we don't care about the + * actual granularity as we don't track individual Linux + * memory blocks. + */ + if (WARN_ON_ONCE(id != virtio_mem_phys_to_bb_id(vm, start + size - 1))) + return NOTIFY_BAD; + } /* * Avoid circular locking lockdep warnings. We lock the mutex @@ -680,7 +980,12 @@ static int virtio_mem_memory_notifier_cb(struct notifier_block *nb, break; } vm->hotplug_active = true; - virtio_mem_notify_going_offline(vm, mb_id); + if (vm->in_sbm) + virtio_mem_sbm_notify_going_offline(vm, id); + else + virtio_mem_bbm_notify_going_offline(vm, id, + mhp->start_pfn, + mhp->nr_pages); break; case MEM_GOING_ONLINE: mutex_lock(&vm->hotplug_mutex); @@ -690,22 +995,51 @@ static int virtio_mem_memory_notifier_cb(struct notifier_block *nb, break; } vm->hotplug_active = true; - rc = virtio_mem_notify_going_online(vm, mb_id); + if (vm->in_sbm) + rc = virtio_mem_sbm_notify_going_online(vm, id); break; case MEM_OFFLINE: - virtio_mem_notify_offline(vm, mb_id); + if (vm->in_sbm) + virtio_mem_sbm_notify_offline(vm, id); + + atomic64_add(size, &vm->offline_size); + /* + * Trigger the workqueue. Now that we have some offline memory, + * maybe we can handle pending unplug requests. + */ + if (!unplug_online) + virtio_mem_retry(vm); + vm->hotplug_active = false; mutex_unlock(&vm->hotplug_mutex); break; case MEM_ONLINE: - virtio_mem_notify_online(vm, mb_id); + if (vm->in_sbm) + virtio_mem_sbm_notify_online(vm, id); + + atomic64_sub(size, &vm->offline_size); + /* + * Start adding more memory once we onlined half of our + * threshold. Don't trigger if it's possibly due to our actipn + * (e.g., us adding memory which gets onlined immediately from + * the core). + */ + if (!atomic_read(&vm->wq_active) && + virtio_mem_could_add_memory(vm, vm->offline_threshold / 2)) + virtio_mem_retry(vm); + vm->hotplug_active = false; mutex_unlock(&vm->hotplug_mutex); break; case MEM_CANCEL_OFFLINE: if (!vm->hotplug_active) break; - virtio_mem_notify_cancel_offline(vm, mb_id); + if (vm->in_sbm) + virtio_mem_sbm_notify_cancel_offline(vm, id); + else + virtio_mem_bbm_notify_cancel_offline(vm, id, + mhp->start_pfn, + mhp->nr_pages); vm->hotplug_active = false; mutex_unlock(&vm->hotplug_mutex); break; @@ -729,7 +1063,7 @@ static int virtio_mem_memory_notifier_cb(struct notifier_block *nb, * (via generic_online_page()) using PageDirty(). */ static void virtio_mem_set_fake_offline(unsigned long pfn, - unsigned int nr_pages, bool onlined) + unsigned long nr_pages, bool onlined) { for (; nr_pages--; pfn++) { struct page *page = pfn_to_page(pfn); @@ -748,7 +1082,7 @@ static void virtio_mem_set_fake_offline(unsigned long pfn, * (via generic_online_page()), clear PageDirty(). */ static void virtio_mem_clear_fake_offline(unsigned long pfn, - unsigned int nr_pages, bool onlined) + unsigned long nr_pages, bool onlined) { for (; nr_pages--; pfn++) { struct page *page = pfn_to_page(pfn); @@ -763,16 +1097,17 @@ static void virtio_mem_clear_fake_offline(unsigned long pfn, * Release a range of fake-offline pages to the buddy, effectively * fake-onlining them. */ -static void virtio_mem_fake_online(unsigned long pfn, unsigned int nr_pages) +static void virtio_mem_fake_online(unsigned long pfn, unsigned long nr_pages) { - const int order = MAX_ORDER - 1; - int i; + const unsigned long max_nr_pages = MAX_ORDER_NR_PAGES; + unsigned long i; /* - * We are always called with subblock granularity, which is at least - * aligned to MAX_ORDER - 1. + * We are always called at least with MAX_ORDER_NR_PAGES + * granularity/alignment (e.g., the way subblocks work). All pages + * inside such a block are alike. */ - for (i = 0; i < nr_pages; i += 1 << order) { + for (i = 0; i < nr_pages; i += max_nr_pages) { struct page *page = pfn_to_page(pfn + i); /* @@ -782,42 +1117,128 @@ static void virtio_mem_fake_online(unsigned long pfn, unsigned int nr_pages) * alike. */ if (PageDirty(page)) { - virtio_mem_clear_fake_offline(pfn + i, 1 << order, + virtio_mem_clear_fake_offline(pfn + i, max_nr_pages, false); - generic_online_page(page, order); + generic_online_page(page, MAX_ORDER - 1); } else { - virtio_mem_clear_fake_offline(pfn + i, 1 << order, + virtio_mem_clear_fake_offline(pfn + i, max_nr_pages, true); - free_contig_range(pfn + i, 1 << order); - adjust_managed_page_count(page, 1 << order); + free_contig_range(pfn + i, max_nr_pages); + adjust_managed_page_count(page, max_nr_pages); } } } +/* + * Try to allocate a range, marking pages fake-offline, effectively + * fake-offlining them. + */ +static int virtio_mem_fake_offline(unsigned long pfn, unsigned long nr_pages) +{ + const bool is_movable = zone_idx(page_zone(pfn_to_page(pfn))) == + ZONE_MOVABLE; + int rc, retry_count; + + /* + * TODO: We want an alloc_contig_range() mode that tries to allocate + * harder (e.g., dealing with temporarily pinned pages, PCP), especially + * with ZONE_MOVABLE. So for now, retry a couple of times with + * ZONE_MOVABLE before giving up - because that zone is supposed to give + * some guarantees. + */ + for (retry_count = 0; retry_count < 5; retry_count++) { + rc = alloc_contig_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE, + GFP_KERNEL); + if (rc == -ENOMEM) + /* whoops, out of memory */ + return rc; + else if (rc && !is_movable) + break; + else if (rc) + continue; + + virtio_mem_set_fake_offline(pfn, nr_pages, true); + adjust_managed_page_count(pfn_to_page(pfn), -nr_pages); + return 0; + } + + return -EBUSY; +} + +/* + * Handle fake-offline pages when memory is going offline - such that the + * pages can be skipped by mm-core when offlining. + */ +static void virtio_mem_fake_offline_going_offline(unsigned long pfn, + unsigned long nr_pages) +{ + struct page *page; + unsigned long i; + + /* + * Drop our reference to the pages so the memory can get offlined + * and add the unplugged pages to the managed page counters (so + * offlining code can correctly subtract them again). + */ + adjust_managed_page_count(pfn_to_page(pfn), nr_pages); + /* Drop our reference to the pages so the memory can get offlined. */ + for (i = 0; i < nr_pages; i++) { + page = pfn_to_page(pfn + i); + if (WARN_ON(!page_ref_dec_and_test(page))) + dump_page(page, "fake-offline page referenced"); + } +} + +/* + * Handle fake-offline pages when memory offlining is canceled - to undo + * what we did in virtio_mem_fake_offline_going_offline(). + */ +static void virtio_mem_fake_offline_cancel_offline(unsigned long pfn, + unsigned long nr_pages) +{ + unsigned long i; + + /* + * Get the reference we dropped when going offline and subtract the + * unplugged pages from the managed page counters. + */ + adjust_managed_page_count(pfn_to_page(pfn), -nr_pages); + for (i = 0; i < nr_pages; i++) + page_ref_inc(pfn_to_page(pfn + i)); +} + static void virtio_mem_online_page_cb(struct page *page, unsigned int order) { const unsigned long addr = page_to_phys(page); - const unsigned long mb_id = virtio_mem_phys_to_mb_id(addr); + unsigned long id, sb_id; struct virtio_mem *vm; - int sb_id; + bool do_online; - /* - * We exploit here that subblocks have at least MAX_ORDER - 1 - * size/alignment and that this callback is is called with such a - * size/alignment. So we cannot cross subblocks and therefore - * also not memory blocks. - */ rcu_read_lock(); list_for_each_entry_rcu(vm, &virtio_mem_devices, next) { - if (!virtio_mem_owned_mb(vm, mb_id)) + if (!virtio_mem_contains_range(vm, addr, PFN_PHYS(1 << order))) continue; - sb_id = virtio_mem_phys_to_sb_id(vm, addr); - /* - * If plugged, online the pages, otherwise, set them fake - * offline (PageOffline). - */ - if (virtio_mem_mb_test_sb_plugged(vm, mb_id, sb_id, 1)) + if (vm->in_sbm) { + /* + * We exploit here that subblocks have at least + * MAX_ORDER_NR_PAGES size/alignment - so we cannot + * cross subblocks within one call. + */ + id = virtio_mem_phys_to_mb_id(addr); + sb_id = virtio_mem_phys_to_sb_id(vm, addr); + do_online = virtio_mem_sbm_test_sb_plugged(vm, id, + sb_id, 1); + } else { + /* + * If the whole block is marked fake offline, keep + * everything that way. + */ + id = virtio_mem_phys_to_bb_id(vm, addr); + do_online = virtio_mem_bbm_get_bb_state(vm, id) != + VIRTIO_MEM_BBM_BB_FAKE_OFFLINE; + } + if (do_online) generic_online_page(page, order); else virtio_mem_set_fake_offline(PFN_DOWN(addr), 1 << order, @@ -870,23 +1291,33 @@ static int virtio_mem_send_plug_request(struct virtio_mem *vm, uint64_t addr, .u.plug.addr = cpu_to_virtio64(vm->vdev, addr), .u.plug.nb_blocks = cpu_to_virtio16(vm->vdev, nb_vm_blocks), }; + int rc = -ENOMEM; if (atomic_read(&vm->config_changed)) return -EAGAIN; + dev_dbg(&vm->vdev->dev, "plugging memory: 0x%llx - 0x%llx\n", addr, + addr + size - 1); + switch (virtio_mem_send_request(vm, &req)) { case VIRTIO_MEM_RESP_ACK: vm->plugged_size += size; return 0; case VIRTIO_MEM_RESP_NACK: - return -EAGAIN; + rc = -EAGAIN; + break; case VIRTIO_MEM_RESP_BUSY: - return -ETXTBSY; + rc = -ETXTBSY; + break; case VIRTIO_MEM_RESP_ERROR: - return -EINVAL; + rc = -EINVAL; + break; default: - return -ENOMEM; + break; } + + dev_dbg(&vm->vdev->dev, "plugging memory failed: %d\n", rc); + return rc; } static int virtio_mem_send_unplug_request(struct virtio_mem *vm, uint64_t addr, @@ -898,21 +1329,30 @@ static int virtio_mem_send_unplug_request(struct virtio_mem *vm, uint64_t addr, .u.unplug.addr = cpu_to_virtio64(vm->vdev, addr), .u.unplug.nb_blocks = cpu_to_virtio16(vm->vdev, nb_vm_blocks), }; + int rc = -ENOMEM; if (atomic_read(&vm->config_changed)) return -EAGAIN; + dev_dbg(&vm->vdev->dev, "unplugging memory: 0x%llx - 0x%llx\n", addr, + addr + size - 1); + switch (virtio_mem_send_request(vm, &req)) { case VIRTIO_MEM_RESP_ACK: vm->plugged_size -= size; return 0; case VIRTIO_MEM_RESP_BUSY: - return -ETXTBSY; + rc = -ETXTBSY; + break; case VIRTIO_MEM_RESP_ERROR: - return -EINVAL; + rc = -EINVAL; + break; default: - return -ENOMEM; + break; } + + dev_dbg(&vm->vdev->dev, "unplugging memory failed: %d\n", rc); + return rc; } static int virtio_mem_send_unplug_all_request(struct virtio_mem *vm) @@ -920,6 +1360,9 @@ static int virtio_mem_send_unplug_all_request(struct virtio_mem *vm) const struct virtio_mem_req req = { .type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_UNPLUG_ALL), }; + int rc = -ENOMEM; + + dev_dbg(&vm->vdev->dev, "unplugging all memory"); switch (virtio_mem_send_request(vm, &req)) { case VIRTIO_MEM_RESP_ACK: @@ -929,30 +1372,31 @@ static int virtio_mem_send_unplug_all_request(struct virtio_mem *vm) atomic_set(&vm->config_changed, 1); return 0; case VIRTIO_MEM_RESP_BUSY: - return -ETXTBSY; + rc = -ETXTBSY; + break; default: - return -ENOMEM; + break; } + + dev_dbg(&vm->vdev->dev, "unplugging all memory failed: %d\n", rc); + return rc; } /* * Plug selected subblocks. Updates the plugged state, but not the state * of the memory block. */ -static int virtio_mem_mb_plug_sb(struct virtio_mem *vm, unsigned long mb_id, - int sb_id, int count) +static int virtio_mem_sbm_plug_sb(struct virtio_mem *vm, unsigned long mb_id, + int sb_id, int count) { const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id) + - sb_id * vm->subblock_size; - const uint64_t size = count * vm->subblock_size; + sb_id * vm->sbm.sb_size; + const uint64_t size = count * vm->sbm.sb_size; int rc; - dev_dbg(&vm->vdev->dev, "plugging memory block: %lu : %i - %i\n", mb_id, - sb_id, sb_id + count - 1); - rc = virtio_mem_send_plug_request(vm, addr, size); if (!rc) - virtio_mem_mb_set_sb_plugged(vm, mb_id, sb_id, count); + virtio_mem_sbm_set_sb_plugged(vm, mb_id, sb_id, count); return rc; } @@ -960,23 +1404,46 @@ static int virtio_mem_mb_plug_sb(struct virtio_mem *vm, unsigned long mb_id, * Unplug selected subblocks. Updates the plugged state, but not the state * of the memory block. */ -static int virtio_mem_mb_unplug_sb(struct virtio_mem *vm, unsigned long mb_id, - int sb_id, int count) +static int virtio_mem_sbm_unplug_sb(struct virtio_mem *vm, unsigned long mb_id, + int sb_id, int count) { const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id) + - sb_id * vm->subblock_size; - const uint64_t size = count * vm->subblock_size; + sb_id * vm->sbm.sb_size; + const uint64_t size = count * vm->sbm.sb_size; int rc; - dev_dbg(&vm->vdev->dev, "unplugging memory block: %lu : %i - %i\n", - mb_id, sb_id, sb_id + count - 1); - rc = virtio_mem_send_unplug_request(vm, addr, size); if (!rc) - virtio_mem_mb_set_sb_unplugged(vm, mb_id, sb_id, count); + virtio_mem_sbm_set_sb_unplugged(vm, mb_id, sb_id, count); return rc; } +/* + * Request to unplug a big block. + * + * Will not modify the state of the big block. + */ +static int virtio_mem_bbm_unplug_bb(struct virtio_mem *vm, unsigned long bb_id) +{ + const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id); + const uint64_t size = vm->bbm.bb_size; + + return virtio_mem_send_unplug_request(vm, addr, size); +} + +/* + * Request to plug a big block. + * + * Will not modify the state of the big block. + */ +static int virtio_mem_bbm_plug_bb(struct virtio_mem *vm, unsigned long bb_id) +{ + const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id); + const uint64_t size = vm->bbm.bb_size; + + return virtio_mem_send_plug_request(vm, addr, size); +} + /* * Unplug the desired number of plugged subblocks of a offline or not-added * memory block. Will fail if any subblock cannot get unplugged (instead of @@ -986,29 +1453,29 @@ static int virtio_mem_mb_unplug_sb(struct virtio_mem *vm, unsigned long mb_id, * * Note: can fail after some subblocks were unplugged. */ -static int virtio_mem_mb_unplug_any_sb(struct virtio_mem *vm, - unsigned long mb_id, uint64_t *nb_sb) +static int virtio_mem_sbm_unplug_any_sb(struct virtio_mem *vm, + unsigned long mb_id, uint64_t *nb_sb) { int sb_id, count; int rc; - sb_id = vm->nb_sb_per_mb - 1; + sb_id = vm->sbm.sbs_per_mb - 1; while (*nb_sb) { /* Find the next candidate subblock */ while (sb_id >= 0 && - virtio_mem_mb_test_sb_unplugged(vm, mb_id, sb_id, 1)) + virtio_mem_sbm_test_sb_unplugged(vm, mb_id, sb_id, 1)) sb_id--; if (sb_id < 0) break; /* Try to unplug multiple subblocks at a time */ count = 1; while (count < *nb_sb && sb_id > 0 && - virtio_mem_mb_test_sb_plugged(vm, mb_id, sb_id - 1, 1)) { + virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id - 1, 1)) { count++; sb_id--; } - rc = virtio_mem_mb_unplug_sb(vm, mb_id, sb_id, count); + rc = virtio_mem_sbm_unplug_sb(vm, mb_id, sb_id, count); if (rc) return rc; *nb_sb -= count; @@ -1025,63 +1492,50 @@ static int virtio_mem_mb_unplug_any_sb(struct virtio_mem *vm, * * Note: can fail after some subblocks were unplugged. */ -static int virtio_mem_mb_unplug(struct virtio_mem *vm, unsigned long mb_id) +static int virtio_mem_sbm_unplug_mb(struct virtio_mem *vm, unsigned long mb_id) { - uint64_t nb_sb = vm->nb_sb_per_mb; + uint64_t nb_sb = vm->sbm.sbs_per_mb; - return virtio_mem_mb_unplug_any_sb(vm, mb_id, &nb_sb); + return virtio_mem_sbm_unplug_any_sb(vm, mb_id, &nb_sb); } /* * Prepare tracking data for the next memory block. */ -static int virtio_mem_prepare_next_mb(struct virtio_mem *vm, - unsigned long *mb_id) +static int virtio_mem_sbm_prepare_next_mb(struct virtio_mem *vm, + unsigned long *mb_id) { int rc; - if (vm->next_mb_id > vm->last_usable_mb_id) + if (vm->sbm.next_mb_id > vm->sbm.last_usable_mb_id) return -ENOSPC; /* Resize the state array if required. */ - rc = virtio_mem_mb_state_prepare_next_mb(vm); + rc = virtio_mem_sbm_mb_states_prepare_next_mb(vm); if (rc) return rc; /* Resize the subblock bitmap if required. */ - rc = virtio_mem_sb_bitmap_prepare_next_mb(vm); + rc = virtio_mem_sbm_sb_states_prepare_next_mb(vm); if (rc) return rc; - vm->nb_mb_state[VIRTIO_MEM_MB_STATE_UNUSED]++; - *mb_id = vm->next_mb_id++; + vm->sbm.mb_count[VIRTIO_MEM_SBM_MB_UNUSED]++; + *mb_id = vm->sbm.next_mb_id++; return 0; } -/* - * Don't add too many blocks that are not onlined yet to avoid running OOM. - */ -static bool virtio_mem_too_many_mb_offline(struct virtio_mem *vm) -{ - unsigned long nb_offline; - - nb_offline = vm->nb_mb_state[VIRTIO_MEM_MB_STATE_OFFLINE] + - vm->nb_mb_state[VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL]; - return nb_offline >= VIRTIO_MEM_NB_OFFLINE_THRESHOLD; -} - /* * Try to plug the desired number of subblocks and add the memory block * to Linux. * * Will modify the state of the memory block. */ -static int virtio_mem_mb_plug_and_add(struct virtio_mem *vm, - unsigned long mb_id, - uint64_t *nb_sb) +static int virtio_mem_sbm_plug_and_add_mb(struct virtio_mem *vm, + unsigned long mb_id, uint64_t *nb_sb) { - const int count = min_t(int, *nb_sb, vm->nb_sb_per_mb); - int rc, rc2; + const int count = min_t(int, *nb_sb, vm->sbm.sbs_per_mb); + int rc; if (WARN_ON_ONCE(!count)) return -EINVAL; @@ -1090,7 +1544,7 @@ static int virtio_mem_mb_plug_and_add(struct virtio_mem *vm, * Plug the requested number of subblocks before adding it to linux, * so that onlining will directly online all plugged subblocks. */ - rc = virtio_mem_mb_plug_sb(vm, mb_id, 0, count); + rc = virtio_mem_sbm_plug_sb(vm, mb_id, 0, count); if (rc) return rc; @@ -1098,29 +1552,21 @@ static int virtio_mem_mb_plug_and_add(struct virtio_mem *vm, * Mark the block properly offline before adding it to Linux, * so the memory notifiers will find the block in the right state. */ - if (count == vm->nb_sb_per_mb) - virtio_mem_mb_set_state(vm, mb_id, - VIRTIO_MEM_MB_STATE_OFFLINE); + if (count == vm->sbm.sbs_per_mb) + virtio_mem_sbm_set_mb_state(vm, mb_id, + VIRTIO_MEM_SBM_MB_OFFLINE); else - virtio_mem_mb_set_state(vm, mb_id, - VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL); + virtio_mem_sbm_set_mb_state(vm, mb_id, + VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL); /* Add the memory block to linux - if that fails, try to unplug. */ - rc = virtio_mem_mb_add(vm, mb_id); + rc = virtio_mem_sbm_add_mb(vm, mb_id); if (rc) { - enum virtio_mem_mb_state new_state = VIRTIO_MEM_MB_STATE_UNUSED; - - dev_err(&vm->vdev->dev, - "adding memory block %lu failed with %d\n", mb_id, rc); - rc2 = virtio_mem_mb_unplug_sb(vm, mb_id, 0, count); + int new_state = VIRTIO_MEM_SBM_MB_UNUSED; - /* - * TODO: Linux MM does not properly clean up yet in all cases - * where adding of memory failed - especially on -ENOMEM. - */ - if (rc2) - new_state = VIRTIO_MEM_MB_STATE_PLUGGED; - virtio_mem_mb_set_state(vm, mb_id, new_state); + if (virtio_mem_sbm_unplug_sb(vm, mb_id, 0, count)) + new_state = VIRTIO_MEM_SBM_MB_PLUGGED; + virtio_mem_sbm_set_mb_state(vm, mb_id, new_state); return rc; } @@ -1136,8 +1582,9 @@ static int virtio_mem_mb_plug_and_add(struct virtio_mem *vm, * * Note: Can fail after some subblocks were successfully plugged. */ -static int virtio_mem_mb_plug_any_sb(struct virtio_mem *vm, unsigned long mb_id, - uint64_t *nb_sb, bool online) +static int virtio_mem_sbm_plug_any_sb(struct virtio_mem *vm, + unsigned long mb_id, uint64_t *nb_sb, + bool online) { unsigned long pfn, nr_pages; int sb_id, count; @@ -1147,17 +1594,16 @@ static int virtio_mem_mb_plug_any_sb(struct virtio_mem *vm, unsigned long mb_id, return -EINVAL; while (*nb_sb) { - sb_id = virtio_mem_mb_first_unplugged_sb(vm, mb_id); - if (sb_id >= vm->nb_sb_per_mb) + sb_id = virtio_mem_sbm_first_unplugged_sb(vm, mb_id); + if (sb_id >= vm->sbm.sbs_per_mb) break; count = 1; while (count < *nb_sb && - sb_id + count < vm->nb_sb_per_mb && - !virtio_mem_mb_test_sb_plugged(vm, mb_id, sb_id + count, - 1)) + sb_id + count < vm->sbm.sbs_per_mb && + !virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id + count, 1)) count++; - rc = virtio_mem_mb_plug_sb(vm, mb_id, sb_id, count); + rc = virtio_mem_sbm_plug_sb(vm, mb_id, sb_id, count); if (rc) return rc; *nb_sb -= count; @@ -1166,29 +1612,26 @@ static int virtio_mem_mb_plug_any_sb(struct virtio_mem *vm, unsigned long mb_id, /* fake-online the pages if the memory block is online */ pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) + - sb_id * vm->subblock_size); - nr_pages = PFN_DOWN(count * vm->subblock_size); + sb_id * vm->sbm.sb_size); + nr_pages = PFN_DOWN(count * vm->sbm.sb_size); virtio_mem_fake_online(pfn, nr_pages); } - if (virtio_mem_mb_test_sb_plugged(vm, mb_id, 0, vm->nb_sb_per_mb)) { + if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) { if (online) - virtio_mem_mb_set_state(vm, mb_id, - VIRTIO_MEM_MB_STATE_ONLINE); + virtio_mem_sbm_set_mb_state(vm, mb_id, + VIRTIO_MEM_SBM_MB_ONLINE); else - virtio_mem_mb_set_state(vm, mb_id, - VIRTIO_MEM_MB_STATE_OFFLINE); + virtio_mem_sbm_set_mb_state(vm, mb_id, + VIRTIO_MEM_SBM_MB_OFFLINE); } return 0; } -/* - * Try to plug the requested amount of memory. - */ -static int virtio_mem_plug_request(struct virtio_mem *vm, uint64_t diff) +static int virtio_mem_sbm_plug_request(struct virtio_mem *vm, uint64_t diff) { - uint64_t nb_sb = diff / vm->subblock_size; + uint64_t nb_sb = diff / vm->sbm.sb_size; unsigned long mb_id; int rc; @@ -1199,18 +1642,18 @@ static int virtio_mem_plug_request(struct virtio_mem *vm, uint64_t diff) mutex_lock(&vm->hotplug_mutex); /* Try to plug subblocks of partially plugged online blocks. */ - virtio_mem_for_each_mb_state(vm, mb_id, - VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL) { - rc = virtio_mem_mb_plug_any_sb(vm, mb_id, &nb_sb, true); + virtio_mem_sbm_for_each_mb(vm, mb_id, + VIRTIO_MEM_SBM_MB_ONLINE_PARTIAL) { + rc = virtio_mem_sbm_plug_any_sb(vm, mb_id, &nb_sb, true); if (rc || !nb_sb) goto out_unlock; cond_resched(); } /* Try to plug subblocks of partially plugged offline blocks. */ - virtio_mem_for_each_mb_state(vm, mb_id, - VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL) { - rc = virtio_mem_mb_plug_any_sb(vm, mb_id, &nb_sb, false); + virtio_mem_sbm_for_each_mb(vm, mb_id, + VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL) { + rc = virtio_mem_sbm_plug_any_sb(vm, mb_id, &nb_sb, false); if (rc || !nb_sb) goto out_unlock; cond_resched(); @@ -1223,11 +1666,11 @@ static int virtio_mem_plug_request(struct virtio_mem *vm, uint64_t diff) mutex_unlock(&vm->hotplug_mutex); /* Try to plug and add unused blocks */ - virtio_mem_for_each_mb_state(vm, mb_id, VIRTIO_MEM_MB_STATE_UNUSED) { - if (virtio_mem_too_many_mb_offline(vm)) + virtio_mem_sbm_for_each_mb(vm, mb_id, VIRTIO_MEM_SBM_MB_UNUSED) { + if (!virtio_mem_could_add_memory(vm, memory_block_size_bytes())) return -ENOSPC; - rc = virtio_mem_mb_plug_and_add(vm, mb_id, &nb_sb); + rc = virtio_mem_sbm_plug_and_add_mb(vm, mb_id, &nb_sb); if (rc || !nb_sb) return rc; cond_resched(); @@ -1235,13 +1678,13 @@ static int virtio_mem_plug_request(struct virtio_mem *vm, uint64_t diff) /* Try to prepare, plug and add new blocks */ while (nb_sb) { - if (virtio_mem_too_many_mb_offline(vm)) + if (!virtio_mem_could_add_memory(vm, memory_block_size_bytes())) return -ENOSPC; - rc = virtio_mem_prepare_next_mb(vm, &mb_id); + rc = virtio_mem_sbm_prepare_next_mb(vm, &mb_id); if (rc) return rc; - rc = virtio_mem_mb_plug_and_add(vm, mb_id, &nb_sb); + rc = virtio_mem_sbm_plug_and_add_mb(vm, mb_id, &nb_sb); if (rc) return rc; cond_resched(); @@ -1253,6 +1696,112 @@ out_unlock: return rc; } +/* + * Plug a big block and add it to Linux. + * + * Will modify the state of the big block. + */ +static int virtio_mem_bbm_plug_and_add_bb(struct virtio_mem *vm, + unsigned long bb_id) +{ + int rc; + + if (WARN_ON_ONCE(virtio_mem_bbm_get_bb_state(vm, bb_id) != + VIRTIO_MEM_BBM_BB_UNUSED)) + return -EINVAL; + + rc = virtio_mem_bbm_plug_bb(vm, bb_id); + if (rc) + return rc; + virtio_mem_bbm_set_bb_state(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED); + + rc = virtio_mem_bbm_add_bb(vm, bb_id); + if (rc) { + if (!virtio_mem_bbm_unplug_bb(vm, bb_id)) + virtio_mem_bbm_set_bb_state(vm, bb_id, + VIRTIO_MEM_BBM_BB_UNUSED); + else + /* Retry from the main loop. */ + virtio_mem_bbm_set_bb_state(vm, bb_id, + VIRTIO_MEM_BBM_BB_PLUGGED); + return rc; + } + return 0; +} + +/* + * Prepare tracking data for the next big block. + */ +static int virtio_mem_bbm_prepare_next_bb(struct virtio_mem *vm, + unsigned long *bb_id) +{ + int rc; + + if (vm->bbm.next_bb_id > vm->bbm.last_usable_bb_id) + return -ENOSPC; + + /* Resize the big block state array if required. */ + rc = virtio_mem_bbm_bb_states_prepare_next_bb(vm); + if (rc) + return rc; + + vm->bbm.bb_count[VIRTIO_MEM_BBM_BB_UNUSED]++; + *bb_id = vm->bbm.next_bb_id; + vm->bbm.next_bb_id++; + return 0; +} + +static int virtio_mem_bbm_plug_request(struct virtio_mem *vm, uint64_t diff) +{ + uint64_t nb_bb = diff / vm->bbm.bb_size; + unsigned long bb_id; + int rc; + + if (!nb_bb) + return 0; + + /* Try to plug and add unused big blocks */ + virtio_mem_bbm_for_each_bb(vm, bb_id, VIRTIO_MEM_BBM_BB_UNUSED) { + if (!virtio_mem_could_add_memory(vm, vm->bbm.bb_size)) + return -ENOSPC; + + rc = virtio_mem_bbm_plug_and_add_bb(vm, bb_id); + if (!rc) + nb_bb--; + if (rc || !nb_bb) + return rc; + cond_resched(); + } + + /* Try to prepare, plug and add new big blocks */ + while (nb_bb) { + if (!virtio_mem_could_add_memory(vm, vm->bbm.bb_size)) + return -ENOSPC; + + rc = virtio_mem_bbm_prepare_next_bb(vm, &bb_id); + if (rc) + return rc; + rc = virtio_mem_bbm_plug_and_add_bb(vm, bb_id); + if (!rc) + nb_bb--; + if (rc) + return rc; + cond_resched(); + } + + return 0; +} + +/* + * Try to plug the requested amount of memory. + */ +static int virtio_mem_plug_request(struct virtio_mem *vm, uint64_t diff) +{ + if (vm->in_sbm) + return virtio_mem_sbm_plug_request(vm, diff); + return virtio_mem_bbm_plug_request(vm, diff); +} + /* * Unplug the desired number of plugged subblocks of an offline memory block. * Will fail if any subblock cannot get unplugged (instead of skipping it). @@ -1262,33 +1811,33 @@ out_unlock: * * Note: Can fail after some subblocks were successfully unplugged. */ -static int virtio_mem_mb_unplug_any_sb_offline(struct virtio_mem *vm, - unsigned long mb_id, - uint64_t *nb_sb) +static int virtio_mem_sbm_unplug_any_sb_offline(struct virtio_mem *vm, + unsigned long mb_id, + uint64_t *nb_sb) { int rc; - rc = virtio_mem_mb_unplug_any_sb(vm, mb_id, nb_sb); + rc = virtio_mem_sbm_unplug_any_sb(vm, mb_id, nb_sb); /* some subblocks might have been unplugged even on failure */ - if (!virtio_mem_mb_test_sb_plugged(vm, mb_id, 0, vm->nb_sb_per_mb)) - virtio_mem_mb_set_state(vm, mb_id, - VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL); + if (!virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) + virtio_mem_sbm_set_mb_state(vm, mb_id, + VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL); if (rc) return rc; - if (virtio_mem_mb_test_sb_unplugged(vm, mb_id, 0, vm->nb_sb_per_mb)) { + if (virtio_mem_sbm_test_sb_unplugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) { /* * Remove the block from Linux - this should never fail. * Hinder the block from getting onlined by marking it * unplugged. Temporarily drop the mutex, so * any pending GOING_ONLINE requests can be serviced/rejected. */ - virtio_mem_mb_set_state(vm, mb_id, - VIRTIO_MEM_MB_STATE_UNUSED); + virtio_mem_sbm_set_mb_state(vm, mb_id, + VIRTIO_MEM_SBM_MB_UNUSED); mutex_unlock(&vm->hotplug_mutex); - rc = virtio_mem_mb_remove(vm, mb_id); + rc = virtio_mem_sbm_remove_mb(vm, mb_id); BUG_ON(rc); mutex_lock(&vm->hotplug_mutex); } @@ -1300,38 +1849,31 @@ static int virtio_mem_mb_unplug_any_sb_offline(struct virtio_mem *vm, * * Will modify the state of the memory block. */ -static int virtio_mem_mb_unplug_sb_online(struct virtio_mem *vm, - unsigned long mb_id, int sb_id, - int count) +static int virtio_mem_sbm_unplug_sb_online(struct virtio_mem *vm, + unsigned long mb_id, int sb_id, + int count) { - const unsigned long nr_pages = PFN_DOWN(vm->subblock_size) * count; + const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size) * count; unsigned long start_pfn; int rc; start_pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) + - sb_id * vm->subblock_size); - rc = alloc_contig_range(start_pfn, start_pfn + nr_pages, - MIGRATE_MOVABLE, GFP_KERNEL); - if (rc == -ENOMEM) - /* whoops, out of memory */ - return rc; - if (rc) - return -EBUSY; + sb_id * vm->sbm.sb_size); - /* Mark it as fake-offline before unplugging it */ - virtio_mem_set_fake_offline(start_pfn, nr_pages, true); - adjust_managed_page_count(pfn_to_page(start_pfn), -nr_pages); + rc = virtio_mem_fake_offline(start_pfn, nr_pages); + if (rc) + return rc; /* Try to unplug the allocated memory */ - rc = virtio_mem_mb_unplug_sb(vm, mb_id, sb_id, count); + rc = virtio_mem_sbm_unplug_sb(vm, mb_id, sb_id, count); if (rc) { /* Return the memory to the buddy. */ virtio_mem_fake_online(start_pfn, nr_pages); return rc; } - virtio_mem_mb_set_state(vm, mb_id, - VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL); + virtio_mem_sbm_set_mb_state(vm, mb_id, + VIRTIO_MEM_SBM_MB_ONLINE_PARTIAL); return 0; } @@ -1345,34 +1887,34 @@ static int virtio_mem_mb_unplug_sb_online(struct virtio_mem *vm, * Note: Can fail after some subblocks were successfully unplugged. Can * return 0 even if subblocks were busy and could not get unplugged. */ -static int virtio_mem_mb_unplug_any_sb_online(struct virtio_mem *vm, - unsigned long mb_id, - uint64_t *nb_sb) +static int virtio_mem_sbm_unplug_any_sb_online(struct virtio_mem *vm, + unsigned long mb_id, + uint64_t *nb_sb) { int rc, sb_id; /* If possible, try to unplug the complete block in one shot. */ - if (*nb_sb >= vm->nb_sb_per_mb && - virtio_mem_mb_test_sb_plugged(vm, mb_id, 0, vm->nb_sb_per_mb)) { - rc = virtio_mem_mb_unplug_sb_online(vm, mb_id, 0, - vm->nb_sb_per_mb); + if (*nb_sb >= vm->sbm.sbs_per_mb && + virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) { + rc = virtio_mem_sbm_unplug_sb_online(vm, mb_id, 0, + vm->sbm.sbs_per_mb); if (!rc) { - *nb_sb -= vm->nb_sb_per_mb; + *nb_sb -= vm->sbm.sbs_per_mb; goto unplugged; } else if (rc != -EBUSY) return rc; } /* Fallback to single subblocks. */ - for (sb_id = vm->nb_sb_per_mb - 1; sb_id >= 0 && *nb_sb; sb_id--) { + for (sb_id = vm->sbm.sbs_per_mb - 1; sb_id >= 0 && *nb_sb; sb_id--) { /* Find the next candidate subblock */ while (sb_id >= 0 && - !virtio_mem_mb_test_sb_plugged(vm, mb_id, sb_id, 1)) + !virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1)) sb_id--; if (sb_id < 0) break; - rc = virtio_mem_mb_unplug_sb_online(vm, mb_id, sb_id, 1); + rc = virtio_mem_sbm_unplug_sb_online(vm, mb_id, sb_id, 1); if (rc == -EBUSY) continue; else if (rc) @@ -1386,24 +1928,21 @@ unplugged: * remove it. This will usually not fail, as no memory is in use * anymore - however some other notifiers might NACK the request. */ - if (virtio_mem_mb_test_sb_unplugged(vm, mb_id, 0, vm->nb_sb_per_mb)) { + if (virtio_mem_sbm_test_sb_unplugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) { mutex_unlock(&vm->hotplug_mutex); - rc = virtio_mem_mb_offline_and_remove(vm, mb_id); + rc = virtio_mem_sbm_offline_and_remove_mb(vm, mb_id); mutex_lock(&vm->hotplug_mutex); if (!rc) - virtio_mem_mb_set_state(vm, mb_id, - VIRTIO_MEM_MB_STATE_UNUSED); + virtio_mem_sbm_set_mb_state(vm, mb_id, + VIRTIO_MEM_SBM_MB_UNUSED); } return 0; } -/* - * Try to unplug the requested amount of memory. - */ -static int virtio_mem_unplug_request(struct virtio_mem *vm, uint64_t diff) +static int virtio_mem_sbm_unplug_request(struct virtio_mem *vm, uint64_t diff) { - uint64_t nb_sb = diff / vm->subblock_size; + uint64_t nb_sb = diff / vm->sbm.sb_size; unsigned long mb_id; int rc; @@ -1418,20 +1957,17 @@ static int virtio_mem_unplug_request(struct virtio_mem *vm, uint64_t diff) mutex_lock(&vm->hotplug_mutex); /* Try to unplug subblocks of partially plugged offline blocks. */ - virtio_mem_for_each_mb_state_rev(vm, mb_id, - VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL) { - rc = virtio_mem_mb_unplug_any_sb_offline(vm, mb_id, - &nb_sb); + virtio_mem_sbm_for_each_mb_rev(vm, mb_id, + VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL) { + rc = virtio_mem_sbm_unplug_any_sb_offline(vm, mb_id, &nb_sb); if (rc || !nb_sb) goto out_unlock; cond_resched(); } /* Try to unplug subblocks of plugged offline blocks. */ - virtio_mem_for_each_mb_state_rev(vm, mb_id, - VIRTIO_MEM_MB_STATE_OFFLINE) { - rc = virtio_mem_mb_unplug_any_sb_offline(vm, mb_id, - &nb_sb); + virtio_mem_sbm_for_each_mb_rev(vm, mb_id, VIRTIO_MEM_SBM_MB_OFFLINE) { + rc = virtio_mem_sbm_unplug_any_sb_offline(vm, mb_id, &nb_sb); if (rc || !nb_sb) goto out_unlock; cond_resched(); @@ -1443,10 +1979,9 @@ static int virtio_mem_unplug_request(struct virtio_mem *vm, uint64_t diff) } /* Try to unplug subblocks of partially plugged online blocks. */ - virtio_mem_for_each_mb_state_rev(vm, mb_id, - VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL) { - rc = virtio_mem_mb_unplug_any_sb_online(vm, mb_id, - &nb_sb); + virtio_mem_sbm_for_each_mb_rev(vm, mb_id, + VIRTIO_MEM_SBM_MB_ONLINE_PARTIAL) { + rc = virtio_mem_sbm_unplug_any_sb_online(vm, mb_id, &nb_sb); if (rc || !nb_sb) goto out_unlock; mutex_unlock(&vm->hotplug_mutex); @@ -1455,10 +1990,8 @@ static int virtio_mem_unplug_request(struct virtio_mem *vm, uint64_t diff) } /* Try to unplug subblocks of plugged online blocks. */ - virtio_mem_for_each_mb_state_rev(vm, mb_id, - VIRTIO_MEM_MB_STATE_ONLINE) { - rc = virtio_mem_mb_unplug_any_sb_online(vm, mb_id, - &nb_sb); + virtio_mem_sbm_for_each_mb_rev(vm, mb_id, VIRTIO_MEM_SBM_MB_ONLINE) { + rc = virtio_mem_sbm_unplug_any_sb_online(vm, mb_id, &nb_sb); if (rc || !nb_sb) goto out_unlock; mutex_unlock(&vm->hotplug_mutex); @@ -1473,20 +2006,212 @@ out_unlock: return rc; } +/* + * Try to offline and remove a big block from Linux and unplug it. Will fail + * with -EBUSY if some memory is busy and cannot get unplugged. + * + * Will modify the state of the memory block. Might temporarily drop the + * hotplug_mutex. + */ +static int virtio_mem_bbm_offline_remove_and_unplug_bb(struct virtio_mem *vm, + unsigned long bb_id) +{ + const unsigned long start_pfn = PFN_DOWN(virtio_mem_bb_id_to_phys(vm, bb_id)); + const unsigned long nr_pages = PFN_DOWN(vm->bbm.bb_size); + unsigned long end_pfn = start_pfn + nr_pages; + unsigned long pfn; + struct page *page; + int rc; + + if (WARN_ON_ONCE(virtio_mem_bbm_get_bb_state(vm, bb_id) != + VIRTIO_MEM_BBM_BB_ADDED)) + return -EINVAL; + + if (bbm_safe_unplug) { + /* + * Start by fake-offlining all memory. Once we marked the device + * block as fake-offline, all newly onlined memory will + * automatically be kept fake-offline. Protect from concurrent + * onlining/offlining until we have a consistent state. + */ + mutex_lock(&vm->hotplug_mutex); + virtio_mem_bbm_set_bb_state(vm, bb_id, + VIRTIO_MEM_BBM_BB_FAKE_OFFLINE); + + for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { + page = pfn_to_online_page(pfn); + if (!page) + continue; + + rc = virtio_mem_fake_offline(pfn, PAGES_PER_SECTION); + if (rc) { + end_pfn = pfn; + goto rollback_safe_unplug; + } + } + mutex_unlock(&vm->hotplug_mutex); + } + + rc = virtio_mem_bbm_offline_and_remove_bb(vm, bb_id); + if (rc) { + if (bbm_safe_unplug) { + mutex_lock(&vm->hotplug_mutex); + goto rollback_safe_unplug; + } + return rc; + } + + rc = virtio_mem_bbm_unplug_bb(vm, bb_id); + if (rc) + virtio_mem_bbm_set_bb_state(vm, bb_id, + VIRTIO_MEM_BBM_BB_PLUGGED); + else + virtio_mem_bbm_set_bb_state(vm, bb_id, + VIRTIO_MEM_BBM_BB_UNUSED); + return rc; + +rollback_safe_unplug: + for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { + page = pfn_to_online_page(pfn); + if (!page) + continue; + virtio_mem_fake_online(pfn, PAGES_PER_SECTION); + } + virtio_mem_bbm_set_bb_state(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED); + mutex_unlock(&vm->hotplug_mutex); + return rc; +} + +/* + * Try to remove a big block from Linux and unplug it. Will fail with + * -EBUSY if some memory is online. + * + * Will modify the state of the memory block. + */ +static int virtio_mem_bbm_remove_and_unplug_bb(struct virtio_mem *vm, + unsigned long bb_id) +{ + int rc; + + if (WARN_ON_ONCE(virtio_mem_bbm_get_bb_state(vm, bb_id) != + VIRTIO_MEM_BBM_BB_ADDED)) + return -EINVAL; + + rc = virtio_mem_bbm_remove_bb(vm, bb_id); + if (rc) + return -EBUSY; + + rc = virtio_mem_bbm_unplug_bb(vm, bb_id); + if (rc) + virtio_mem_bbm_set_bb_state(vm, bb_id, + VIRTIO_MEM_BBM_BB_PLUGGED); + else + virtio_mem_bbm_set_bb_state(vm, bb_id, + VIRTIO_MEM_BBM_BB_UNUSED); + return rc; +} + +/* + * Test if a big block is completely offline. + */ +static bool virtio_mem_bbm_bb_is_offline(struct virtio_mem *vm, + unsigned long bb_id) +{ + const unsigned long start_pfn = PFN_DOWN(virtio_mem_bb_id_to_phys(vm, bb_id)); + const unsigned long nr_pages = PFN_DOWN(vm->bbm.bb_size); + unsigned long pfn; + + for (pfn = start_pfn; pfn < start_pfn + nr_pages; + pfn += PAGES_PER_SECTION) { + if (pfn_to_online_page(pfn)) + return false; + } + + return true; +} + +static int virtio_mem_bbm_unplug_request(struct virtio_mem *vm, uint64_t diff) +{ + uint64_t nb_bb = diff / vm->bbm.bb_size; + uint64_t bb_id; + int rc; + + if (!nb_bb) + return 0; + + /* Try to unplug completely offline big blocks first. */ + virtio_mem_bbm_for_each_bb_rev(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED) { + cond_resched(); + /* + * As we're holding no locks, this check is racy as memory + * can get onlined in the meantime - but we'll fail gracefully. + */ + if (!virtio_mem_bbm_bb_is_offline(vm, bb_id)) + continue; + rc = virtio_mem_bbm_remove_and_unplug_bb(vm, bb_id); + if (rc == -EBUSY) + continue; + if (!rc) + nb_bb--; + if (rc || !nb_bb) + return rc; + } + + if (!unplug_online) + return 0; + + /* Try to unplug any big blocks. */ + virtio_mem_bbm_for_each_bb_rev(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED) { + cond_resched(); + rc = virtio_mem_bbm_offline_remove_and_unplug_bb(vm, bb_id); + if (rc == -EBUSY) + continue; + if (!rc) + nb_bb--; + if (rc || !nb_bb) + return rc; + } + + return nb_bb ? -EBUSY : 0; +} + +/* + * Try to unplug the requested amount of memory. + */ +static int virtio_mem_unplug_request(struct virtio_mem *vm, uint64_t diff) +{ + if (vm->in_sbm) + return virtio_mem_sbm_unplug_request(vm, diff); + return virtio_mem_bbm_unplug_request(vm, diff); +} + /* * Try to unplug all blocks that couldn't be unplugged before, for example, * because the hypervisor was busy. */ static int virtio_mem_unplug_pending_mb(struct virtio_mem *vm) { - unsigned long mb_id; + unsigned long id; int rc; - virtio_mem_for_each_mb_state(vm, mb_id, VIRTIO_MEM_MB_STATE_PLUGGED) { - rc = virtio_mem_mb_unplug(vm, mb_id); + if (!vm->in_sbm) { + virtio_mem_bbm_for_each_bb(vm, id, + VIRTIO_MEM_BBM_BB_PLUGGED) { + rc = virtio_mem_bbm_unplug_bb(vm, id); + if (rc) + return rc; + virtio_mem_bbm_set_bb_state(vm, id, + VIRTIO_MEM_BBM_BB_UNUSED); + } + return 0; + } + + virtio_mem_sbm_for_each_mb(vm, id, VIRTIO_MEM_SBM_MB_PLUGGED) { + rc = virtio_mem_sbm_unplug_mb(vm, id); if (rc) return rc; - virtio_mem_mb_set_state(vm, mb_id, VIRTIO_MEM_MB_STATE_UNUSED); + virtio_mem_sbm_set_mb_state(vm, id, + VIRTIO_MEM_SBM_MB_UNUSED); } return 0; @@ -1511,7 +2236,13 @@ static void virtio_mem_refresh_config(struct virtio_mem *vm) usable_region_size, &usable_region_size); end_addr = vm->addr + usable_region_size; end_addr = min(end_addr, phys_limit); - vm->last_usable_mb_id = virtio_mem_phys_to_mb_id(end_addr) - 1; + + if (vm->in_sbm) + vm->sbm.last_usable_mb_id = + virtio_mem_phys_to_mb_id(end_addr) - 1; + else + vm->bbm.last_usable_bb_id = + virtio_mem_phys_to_bb_id(vm, end_addr) - 1; /* see if there is a request to change the size */ virtio_cread_le(vm->vdev, struct virtio_mem_config, requested_size, @@ -1535,6 +2266,7 @@ static void virtio_mem_run_wq(struct work_struct *work) if (vm->broken) return; + atomic_set(&vm->wq_active, 1); retry: rc = 0; @@ -1595,6 +2327,8 @@ retry: "unknown error, marking device broken: %d\n", rc); vm->broken = true; } + + atomic_set(&vm->wq_active, 0); } static enum hrtimer_restart virtio_mem_timer_expired(struct hrtimer *timer) @@ -1631,6 +2365,7 @@ static int virtio_mem_init_vq(struct virtio_mem *vm) static int virtio_mem_init(struct virtio_mem *vm) { const uint64_t phys_limit = 1UL << MAX_PHYSMEM_BITS; + uint64_t sb_size, addr; uint16_t node_id; if (!vm->vdev->config->get) { @@ -1659,15 +2394,9 @@ static int virtio_mem_init(struct virtio_mem *vm) virtio_cread_le(vm->vdev, struct virtio_mem_config, region_size, &vm->region_size); - /* - * We always hotplug memory in memory block granularity. This way, - * we have to wait for exactly one memory block to online. - */ - if (vm->device_block_size > memory_block_size_bytes()) { - dev_err(&vm->vdev->dev, - "The block size is not supported (too big).\n"); - return -EINVAL; - } + /* Determine the nid for the device based on the lowest address. */ + if (vm->nid == NUMA_NO_NODE) + vm->nid = memory_add_physaddr_to_nid(vm->addr); /* bad device setup - warn only */ if (!IS_ALIGNED(vm->addr, memory_block_size_bytes())) @@ -1681,23 +2410,57 @@ static int virtio_mem_init(struct virtio_mem *vm) "Some memory is not addressable. This can make some memory unusable.\n"); /* - * Calculate the subblock size: - * - At least MAX_ORDER - 1 / pageblock_order. - * - At least the device block size. - * In the worst case, a single subblock per memory block. + * We want subblocks to span at least MAX_ORDER_NR_PAGES and + * pageblock_nr_pages pages. This: + * - Simplifies our page onlining code (virtio_mem_online_page_cb) + * and fake page onlining code (virtio_mem_fake_online). + * - Is required for now for alloc_contig_range() to work reliably - + * it doesn't properly handle smaller granularity on ZONE_NORMAL. */ - vm->subblock_size = PAGE_SIZE * 1ul << max_t(uint32_t, MAX_ORDER - 1, - pageblock_order); - vm->subblock_size = max_t(uint64_t, vm->device_block_size, - vm->subblock_size); - vm->nb_sb_per_mb = memory_block_size_bytes() / vm->subblock_size; - - /* Round up to the next full memory block */ - vm->first_mb_id = virtio_mem_phys_to_mb_id(vm->addr - 1 + - memory_block_size_bytes()); - vm->next_mb_id = vm->first_mb_id; - vm->last_mb_id = virtio_mem_phys_to_mb_id(vm->addr + - vm->region_size) - 1; + sb_size = max_t(uint64_t, MAX_ORDER_NR_PAGES, + pageblock_nr_pages) * PAGE_SIZE; + sb_size = max_t(uint64_t, vm->device_block_size, sb_size); + + if (sb_size < memory_block_size_bytes() && !force_bbm) { + /* SBM: At least two subblocks per Linux memory block. */ + vm->in_sbm = true; + vm->sbm.sb_size = sb_size; + vm->sbm.sbs_per_mb = memory_block_size_bytes() / + vm->sbm.sb_size; + + /* Round up to the next full memory block */ + addr = vm->addr + memory_block_size_bytes() - 1; + vm->sbm.first_mb_id = virtio_mem_phys_to_mb_id(addr); + vm->sbm.next_mb_id = vm->sbm.first_mb_id; + } else { + /* BBM: At least one Linux memory block. */ + vm->bbm.bb_size = max_t(uint64_t, vm->device_block_size, + memory_block_size_bytes()); + + if (bbm_block_size) { + if (!is_power_of_2(bbm_block_size)) { + dev_warn(&vm->vdev->dev, + "bbm_block_size is not a power of 2"); + } else if (bbm_block_size < vm->bbm.bb_size) { + dev_warn(&vm->vdev->dev, + "bbm_block_size is too small"); + } else { + vm->bbm.bb_size = bbm_block_size; + } + } + + /* Round up to the next aligned big block */ + addr = vm->addr + vm->bbm.bb_size - 1; + vm->bbm.first_bb_id = virtio_mem_phys_to_bb_id(vm, addr); + vm->bbm.next_bb_id = vm->bbm.first_bb_id; + } + + /* Prepare the offline threshold - make sure we can add two blocks. */ + vm->offline_threshold = max_t(uint64_t, 2 * memory_block_size_bytes(), + VIRTIO_MEM_DEFAULT_OFFLINE_THRESHOLD); + /* In BBM, we also want at least two big blocks. */ + vm->offline_threshold = max_t(uint64_t, 2 * vm->bbm.bb_size, + vm->offline_threshold); dev_info(&vm->vdev->dev, "start address: 0x%llx", vm->addr); dev_info(&vm->vdev->dev, "region size: 0x%llx", vm->region_size); @@ -1705,9 +2468,13 @@ static int virtio_mem_init(struct virtio_mem *vm) (unsigned long long)vm->device_block_size); dev_info(&vm->vdev->dev, "memory block size: 0x%lx", memory_block_size_bytes()); - dev_info(&vm->vdev->dev, "subblock size: 0x%llx", - (unsigned long long)vm->subblock_size); - if (vm->nid != NUMA_NO_NODE) + if (vm->in_sbm) + dev_info(&vm->vdev->dev, "subblock size: 0x%llx", + (unsigned long long)vm->sbm.sb_size); + else + dev_info(&vm->vdev->dev, "big block size: 0x%llx", + (unsigned long long)vm->bbm.bb_size); + if (vm->nid != NUMA_NO_NODE && IS_ENABLED(CONFIG_NUMA)) dev_info(&vm->vdev->dev, "nid: %d", vm->nid); return 0; @@ -1753,6 +2520,20 @@ static void virtio_mem_delete_resource(struct virtio_mem *vm) vm->parent_resource = NULL; } +static int virtio_mem_range_has_system_ram(struct resource *res, void *arg) +{ + return 1; +} + +static bool virtio_mem_has_memory_added(struct virtio_mem *vm) +{ + const unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; + + return walk_iomem_res_desc(IORES_DESC_NONE, flags, vm->addr, + vm->addr + vm->region_size, NULL, + virtio_mem_range_has_system_ram) == 1; +} + static int virtio_mem_probe(struct virtio_device *vdev) { struct virtio_mem *vm; @@ -1849,21 +2630,24 @@ static void virtio_mem_remove(struct virtio_device *vdev) cancel_work_sync(&vm->wq); hrtimer_cancel(&vm->retry_timer); - /* - * After we unregistered our callbacks, user space can online partially - * plugged offline blocks. Make sure to remove them. - */ - virtio_mem_for_each_mb_state(vm, mb_id, - VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL) { - rc = virtio_mem_mb_remove(vm, mb_id); - BUG_ON(rc); - virtio_mem_mb_set_state(vm, mb_id, VIRTIO_MEM_MB_STATE_UNUSED); + if (vm->in_sbm) { + /* + * After we unregistered our callbacks, user space can online + * partially plugged offline blocks. Make sure to remove them. + */ + virtio_mem_sbm_for_each_mb(vm, mb_id, + VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL) { + rc = virtio_mem_sbm_remove_mb(vm, mb_id); + BUG_ON(rc); + virtio_mem_sbm_set_mb_state(vm, mb_id, + VIRTIO_MEM_SBM_MB_UNUSED); + } + /* + * After we unregistered our callbacks, user space can no longer + * offline partially plugged online memory blocks. No need to + * worry about them. + */ } - /* - * After we unregistered our callbacks, user space can no longer - * offline partially plugged online memory blocks. No need to worry - * about them. - */ /* unregister callbacks */ unregister_virtio_mem_device(vm); @@ -1874,10 +2658,7 @@ static void virtio_mem_remove(struct virtio_device *vdev) * the system. And there is no way to stop the driver/device from going * away. Warn at least. */ - if (vm->nb_mb_state[VIRTIO_MEM_MB_STATE_OFFLINE] || - vm->nb_mb_state[VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL] || - vm->nb_mb_state[VIRTIO_MEM_MB_STATE_ONLINE] || - vm->nb_mb_state[VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL]) { + if (virtio_mem_has_memory_added(vm)) { dev_warn(&vdev->dev, "device still has system memory added\n"); } else { virtio_mem_delete_resource(vm); @@ -1885,8 +2666,12 @@ static void virtio_mem_remove(struct virtio_device *vdev) } /* remove all tracking data - no locking needed */ - vfree(vm->mb_state); - vfree(vm->sb_bitmap); + if (vm->in_sbm) { + vfree(vm->sbm.mb_states); + vfree(vm->sbm.sb_states); + } else { + vfree(vm->bbm.bb_states); + } /* reset the device and cleanup the queues */ vdev->config->reset(vdev); diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index becc77697960..71e16b53e9c1 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -1608,7 +1608,6 @@ static struct virtqueue *vring_create_virtqueue_packed( vq->num_added = 0; vq->packed_ring = true; vq->use_dma_api = vring_use_dma_api(vdev); - list_add_tail(&vq->vq.list, &vdev->vqs); #ifdef DEBUG vq->in_use = false; vq->last_add_time_valid = false; @@ -1669,6 +1668,7 @@ static struct virtqueue *vring_create_virtqueue_packed( cpu_to_le16(vq->packed.event_flags_shadow); } + list_add_tail(&vq->vq.list, &vdev->vqs); return &vq->vq; err_desc_extra: @@ -1676,9 +1676,9 @@ err_desc_extra: err_desc_state: kfree(vq); err_vq: - vring_free_queue(vdev, event_size_in_bytes, device, ring_dma_addr); + vring_free_queue(vdev, event_size_in_bytes, device, device_event_dma_addr); err_device: - vring_free_queue(vdev, event_size_in_bytes, driver, ring_dma_addr); + vring_free_queue(vdev, event_size_in_bytes, driver, driver_event_dma_addr); err_driver: vring_free_queue(vdev, ring_size_in_bytes, ring, ring_dma_addr); err_ring: @@ -2085,7 +2085,6 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index, vq->last_used_idx = 0; vq->num_added = 0; vq->use_dma_api = vring_use_dma_api(vdev); - list_add_tail(&vq->vq.list, &vdev->vqs); #ifdef DEBUG vq->in_use = false; vq->last_add_time_valid = false; @@ -2127,6 +2126,7 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index, memset(vq->split.desc_state, 0, vring.num * sizeof(struct vring_desc_state_split)); + list_add_tail(&vq->vq.list, &vdev->vqs); return &vq->vq; } EXPORT_SYMBOL_GPL(__vring_new_virtqueue); diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index 30bc7a7223bb..0fefeb976877 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -42,6 +42,7 @@ struct vdpa_vq_state { * @config: the configuration ops for this device. * @index: device index * @features_valid: were features initialized? for legacy guests + * @nvqs: maximum number of supported virtqueues */ struct vdpa_device { struct device dev; diff --git a/include/uapi/linux/virtio_ids.h b/include/uapi/linux/virtio_ids.h index b052355ac7a3..bc1c0621f5ed 100644 --- a/include/uapi/linux/virtio_ids.h +++ b/include/uapi/linux/virtio_ids.h @@ -29,24 +29,30 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ -#define VIRTIO_ID_NET 1 /* virtio net */ -#define VIRTIO_ID_BLOCK 2 /* virtio block */ -#define VIRTIO_ID_CONSOLE 3 /* virtio console */ -#define VIRTIO_ID_RNG 4 /* virtio rng */ -#define VIRTIO_ID_BALLOON 5 /* virtio balloon */ -#define VIRTIO_ID_RPMSG 7 /* virtio remote processor messaging */ -#define VIRTIO_ID_SCSI 8 /* virtio scsi */ -#define VIRTIO_ID_9P 9 /* 9p virtio console */ -#define VIRTIO_ID_RPROC_SERIAL 11 /* virtio remoteproc serial link */ -#define VIRTIO_ID_CAIF 12 /* Virtio caif */ -#define VIRTIO_ID_GPU 16 /* virtio GPU */ -#define VIRTIO_ID_INPUT 18 /* virtio input */ -#define VIRTIO_ID_VSOCK 19 /* virtio vsock transport */ -#define VIRTIO_ID_CRYPTO 20 /* virtio crypto */ -#define VIRTIO_ID_IOMMU 23 /* virtio IOMMU */ -#define VIRTIO_ID_MEM 24 /* virtio mem */ -#define VIRTIO_ID_FS 26 /* virtio filesystem */ -#define VIRTIO_ID_PMEM 27 /* virtio pmem */ -#define VIRTIO_ID_MAC80211_HWSIM 29 /* virtio mac80211-hwsim */ +#define VIRTIO_ID_NET 1 /* virtio net */ +#define VIRTIO_ID_BLOCK 2 /* virtio block */ +#define VIRTIO_ID_CONSOLE 3 /* virtio console */ +#define VIRTIO_ID_RNG 4 /* virtio rng */ +#define VIRTIO_ID_BALLOON 5 /* virtio balloon */ +#define VIRTIO_ID_IOMEM 6 /* virtio ioMemory */ +#define VIRTIO_ID_RPMSG 7 /* virtio remote processor messaging */ +#define VIRTIO_ID_SCSI 8 /* virtio scsi */ +#define VIRTIO_ID_9P 9 /* 9p virtio console */ +#define VIRTIO_ID_MAC80211_WLAN 10 /* virtio WLAN MAC */ +#define VIRTIO_ID_RPROC_SERIAL 11 /* virtio remoteproc serial link */ +#define VIRTIO_ID_CAIF 12 /* Virtio caif */ +#define VIRTIO_ID_MEMORY_BALLOON 13 /* virtio memory balloon */ +#define VIRTIO_ID_GPU 16 /* virtio GPU */ +#define VIRTIO_ID_CLOCK 17 /* virtio clock/timer */ +#define VIRTIO_ID_INPUT 18 /* virtio input */ +#define VIRTIO_ID_VSOCK 19 /* virtio vsock transport */ +#define VIRTIO_ID_CRYPTO 20 /* virtio crypto */ +#define VIRTIO_ID_SIGNAL_DIST 21 /* virtio signal distribution device */ +#define VIRTIO_ID_PSTORE 22 /* virtio pstore device */ +#define VIRTIO_ID_IOMMU 23 /* virtio IOMMU */ +#define VIRTIO_ID_MEM 24 /* virtio mem */ +#define VIRTIO_ID_FS 26 /* virtio filesystem */ +#define VIRTIO_ID_PMEM 27 /* virtio pmem */ +#define VIRTIO_ID_MAC80211_HWSIM 29 /* virtio mac80211-hwsim */ #endif /* _LINUX_VIRTIO_IDS_H */ diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index c01604224299..af41fb990820 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1784,39 +1784,112 @@ int remove_memory(int nid, u64 start, u64 size) } EXPORT_SYMBOL_GPL(remove_memory); +static int try_offline_memory_block(struct memory_block *mem, void *arg) +{ + uint8_t online_type = MMOP_ONLINE_KERNEL; + uint8_t **online_types = arg; + struct page *page; + int rc; + + /* + * Sense the online_type via the zone of the memory block. Offlining + * with multiple zones within one memory block will be rejected + * by offlining code ... so we don't care about that. + */ + page = pfn_to_online_page(section_nr_to_pfn(mem->start_section_nr)); + if (page && zone_idx(page_zone(page)) == ZONE_MOVABLE) + online_type = MMOP_ONLINE_MOVABLE; + + rc = device_offline(&mem->dev); + /* + * Default is MMOP_OFFLINE - change it only if offlining succeeded, + * so try_reonline_memory_block() can do the right thing. + */ + if (!rc) + **online_types = online_type; + + (*online_types)++; + /* Ignore if already offline. */ + return rc < 0 ? rc : 0; +} + +static int try_reonline_memory_block(struct memory_block *mem, void *arg) +{ + uint8_t **online_types = arg; + int rc; + + if (**online_types != MMOP_OFFLINE) { + mem->online_type = **online_types; + rc = device_online(&mem->dev); + if (rc < 0) + pr_warn("%s: Failed to re-online memory: %d", + __func__, rc); + } + + /* Continue processing all remaining memory blocks. */ + (*online_types)++; + return 0; +} + /* - * Try to offline and remove a memory block. Might take a long time to - * finish in case memory is still in use. Primarily useful for memory devices - * that logically unplugged all memory (so it's no longer in use) and want to - * offline + remove the memory block. + * Try to offline and remove memory. Might take a long time to finish in case + * memory is still in use. Primarily useful for memory devices that logically + * unplugged all memory (so it's no longer in use) and want to offline + remove + * that memory. */ int offline_and_remove_memory(int nid, u64 start, u64 size) { - struct memory_block *mem; - int rc = -EINVAL; + const unsigned long mb_count = size / memory_block_size_bytes(); + uint8_t *online_types, *tmp; + int rc; if (!IS_ALIGNED(start, memory_block_size_bytes()) || - size != memory_block_size_bytes()) - return rc; + !IS_ALIGNED(size, memory_block_size_bytes()) || !size) + return -EINVAL; + + /* + * We'll remember the old online type of each memory block, so we can + * try to revert whatever we did when offlining one memory block fails + * after offlining some others succeeded. + */ + online_types = kmalloc_array(mb_count, sizeof(*online_types), + GFP_KERNEL); + if (!online_types) + return -ENOMEM; + /* + * Initialize all states to MMOP_OFFLINE, so when we abort processing in + * try_offline_memory_block(), we'll skip all unprocessed blocks in + * try_reonline_memory_block(). + */ + memset(online_types, MMOP_OFFLINE, mb_count); lock_device_hotplug(); - mem = find_memory_block(__pfn_to_section(PFN_DOWN(start))); - if (mem) - rc = device_offline(&mem->dev); - /* Ignore if the device is already offline. */ - if (rc > 0) - rc = 0; + + tmp = online_types; + rc = walk_memory_blocks(start, size, &tmp, try_offline_memory_block); /* - * In case we succeeded to offline the memory block, remove it. + * In case we succeeded to offline all memory, remove it. * This cannot fail as it cannot get onlined in the meantime. */ if (!rc) { rc = try_remove_memory(nid, start, size); - WARN_ON_ONCE(rc); + if (rc) + pr_err("%s: Failed to remove memory: %d", __func__, rc); + } + + /* + * Rollback what we did. While memory onlining might theoretically fail + * (nacked by a notifier), it barely ever happens. + */ + if (rc) { + tmp = online_types; + walk_memory_blocks(start, size, &tmp, + try_reonline_memory_block); } unlock_device_hotplug(); + kfree(online_types); return rc; } EXPORT_SYMBOL_GPL(offline_and_remove_memory); diff --git a/tools/virtio/asm/barrier.h b/tools/virtio/asm/barrier.h index 04d563fc9b95..468435ed64e6 100644 --- a/tools/virtio/asm/barrier.h +++ b/tools/virtio/asm/barrier.h @@ -16,6 +16,16 @@ # define mb() abort() # define dma_rmb() abort() # define dma_wmb() abort() +#elif defined(__aarch64__) +#define dmb(opt) asm volatile("dmb " #opt : : : "memory") +#define virt_mb() __sync_synchronize() +#define virt_rmb() dmb(ishld) +#define virt_wmb() dmb(ishst) +#define virt_store_mb(var, value) do { WRITE_ONCE(var, value); dmb(ish); } while (0) +/* Weak barriers should be used. If not - it's a bug */ +# define mb() abort() +# define dma_rmb() abort() +# define dma_wmb() abort() #else #error Please fill in barrier macros #endif diff --git a/tools/virtio/linux/bug.h b/tools/virtio/linux/bug.h index b14c2c3b6b85..813baf13f62a 100644 --- a/tools/virtio/linux/bug.h +++ b/tools/virtio/linux/bug.h @@ -2,6 +2,8 @@ #ifndef BUG_H #define BUG_H +#include + #define BUG_ON(__BUG_ON_cond) assert(!(__BUG_ON_cond)) #define BUILD_BUG_ON(x) diff --git a/tools/virtio/linux/kernel.h b/tools/virtio/linux/kernel.h index 315e85cabeda..0b493542e61a 100644 --- a/tools/virtio/linux/kernel.h +++ b/tools/virtio/linux/kernel.h @@ -11,6 +11,7 @@ #include #include +#include #include #include #include @@ -117,6 +118,16 @@ static inline void free_page(unsigned long addr) # define unlikely(x) (__builtin_expect(!!(x), 0)) # endif +static inline void *krealloc_array(void *p, size_t new_n, size_t new_size, gfp_t gfp) +{ + size_t bytes; + + if (unlikely(check_mul_overflow(new_n, new_size, &bytes))) + return NULL; + + return krealloc(p, bytes, gfp); +} + #define pr_err(format, ...) fprintf (stderr, format, ## __VA_ARGS__) #ifdef DEBUG #define pr_debug(format, ...) fprintf (stderr, format, ## __VA_ARGS__) @@ -126,8 +137,6 @@ static inline void free_page(unsigned long addr) #define dev_err(dev, format, ...) fprintf (stderr, format, ## __VA_ARGS__) #define dev_warn(dev, format, ...) fprintf (stderr, format, ## __VA_ARGS__) -#define WARN_ON_ONCE(cond) (unlikely(cond) ? fprintf (stderr, "WARNING\n") : 0) - #define min(x, y) ({ \ typeof(x) _min1 = (x); \ typeof(y) _min2 = (y); \