// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
/* Copyright (c) 2020 Mellanox Technologies Ltd. */
+#include <linux/module.h>
#include <linux/vdpa.h>
+#include <linux/vringh.h>
+#include <uapi/linux/virtio_net.h>
#include <uapi/linux/virtio_ids.h>
#include <linux/virtio_config.h>
+#include <linux/auxiliary_bus.h>
+#include <linux/mlx5/cq.h>
#include <linux/mlx5/qp.h>
#include <linux/mlx5/device.h>
+#include <linux/mlx5/driver.h>
#include <linux/mlx5/vport.h>
#include <linux/mlx5/fs.h>
-#include <linux/mlx5/device.h>
-#include "mlx5_vnet.h"
-#include "mlx5_vdpa_ifc.h"
+#include <linux/mlx5/mlx5_ifc_vdpa.h>
#include "mlx5_vdpa.h"
+MODULE_AUTHOR("Eli Cohen <eli@mellanox.com>");
+MODULE_DESCRIPTION("Mellanox VDPA driver");
+MODULE_LICENSE("Dual BSD/GPL");
+
+#define to_mlx5_vdpa_ndev(__mvdev) \
+ container_of(__mvdev, struct mlx5_vdpa_net, mvdev)
#define to_mvdev(__vdev) container_of((__vdev), struct mlx5_vdpa_dev, vdev)
#define VALID_FEATURES_MASK \
mlx5_vdpa_info(mvdev, "%s\n", #_status); \
} while (0)
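+/* max_vqs counts individual RX/TX virtqueues; a virtio-net queue pair uses
+ * two of them, hence the halving below.
+ */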
+static inline u32 mlx5_vdpa_max_qps(int max_vqs)
+{
+ return max_vqs / 2;
+}
+
static void print_status(struct mlx5_vdpa_dev *mvdev, u8 status, bool set)
{
if (status & ~VALID_STATUS_MASK)
static void mlx5_vdpa_handle_completions(struct mlx5_vdpa_virtqueue *mvq, int num)
{
mlx5_cq_set_ci(&mvq->cq.mcq);
+
+ /* make sure CQ consumer update is visible to the hardware before updating
+ * RX doorbell record.
+ */
+ dma_wmb();
rx_post(&mvq->vqqp, num);
if (mvq->event_cb.callback)
mvq->event_cb.callback(mvq->event_cb.private);
}
}
-void *mlx5_vdpa_add_dev(struct mlx5_core_dev *mdev)
+static int mlx5v_probe(struct auxiliary_device *adev,
+ const struct auxiliary_device_id *id)
{
+ struct mlx5_adev *madev = container_of(adev, struct mlx5_adev, adev);
+ struct mlx5_core_dev *mdev = madev->mdev;
struct virtio_net_config *config;
struct mlx5_vdpa_dev *mvdev;
struct mlx5_vdpa_net *ndev;
ndev = vdpa_alloc_device(struct mlx5_vdpa_net, mvdev.vdev, mdev->device, &mlx5_vdpa_ops,
2 * mlx5_vdpa_max_qps(max_vqs));
if (IS_ERR(ndev))
- return ndev;
+ return PTR_ERR(ndev);
ndev->mvdev.max_vqs = max_vqs;
mvdev = &ndev->mvdev;
if (err)
goto err_reg;
- return ndev;
+ dev_set_drvdata(&adev->dev, ndev);
+ return 0;
err_reg:
free_resources(ndev);
err_mtu:
mutex_destroy(&ndev->reslock);
put_device(&mvdev->vdev.dev);
- return ERR_PTR(err);
+ return err;
}
-void mlx5_vdpa_remove_dev(struct mlx5_vdpa_dev *mvdev)
+static void mlx5v_remove(struct auxiliary_device *adev)
{
+ struct mlx5_vdpa_dev *mvdev = dev_get_drvdata(&adev->dev);
+
vdpa_unregister_device(&mvdev->vdev);
}
+
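+/* Match the vnet auxiliary device created by the mlx5 core driver
+ * (MLX5_ADEV_NAME ".vnet", i.e. "mlx5_core.vnet").
+ */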
+static const struct auxiliary_device_id mlx5v_id_table[] = {
+ { .name = MLX5_ADEV_NAME ".vnet", },
+ {},
+};
+
+MODULE_DEVICE_TABLE(auxiliary, mlx5v_id_table);
+
+static struct auxiliary_driver mlx5v_driver = {
+ .name = "vnet",
+ .probe = mlx5v_probe,
+ .remove = mlx5v_remove,
+ .id_table = mlx5v_id_table,
+};
+
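+/* module_auxiliary_driver() generates the module init/exit boilerplate that
+ * registers and unregisters mlx5v_driver on the auxiliary bus.
+ */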
+module_auxiliary_driver(mlx5v_driver);
* so we should map it first. This is better than introducing a special
* case in page freeing fast path.
*/
- if (debug_pagealloc_enabled_static())
- kernel_map_pages(page, 1 << order, 1);
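+ /* debug_pagealloc_map_pages() checks debug_pagealloc_enabled_static()
+ * internally, so no explicit check is needed at the call site.
+ */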
+ debug_pagealloc_map_pages(page, 1 << order);
__free_pages_core(page, order);
totalram_pages_add(1UL << order);
#ifdef CONFIG_HIGHMEM
if (WARN_ON(PageLRU(page)))
isolate_lru_page(page);
if (page_mapped(page))
- try_to_unmap(page, TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS);
+ try_to_unmap(page, TTU_IGNORE_MLOCK);
continue;
}
}
node = zone_to_nid(zone);
+ /*
+ * Disable pcplists so that page isolation cannot race with freeing
+ * in a way that pages from isolated pageblock are left on pcplists.
+ */
+ zone_pcp_disable(zone);
+
/* set above range as isolated */
ret = start_isolate_page_range(start_pfn, end_pfn,
MIGRATE_MOVABLE,
MEMORY_OFFLINE | REPORT_FAILURE);
if (ret) {
reason = "failure to isolate range";
- goto failed_removal;
+ goto failed_removal_pcplists_disabled;
}
arg.start_pfn = start_pfn;
goto failed_removal_isolated;
}
- /*
- * per-cpu pages are drained in start_isolate_page_range, but if
- * there are still pages that are not free, make sure that we
- * drain again, because when we isolated range we might
- * have raced with another thread that was adding pages to pcp
- * list.
- *
- * Forward progress should be still guaranteed because
- * pages on the pcp list can only belong to MOVABLE_ZONE
- * because has_unmovable_pages explicitly checks for
- * PageBuddy on freed pages on other zones.
- */
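+ /*
+ * With pcplists disabled above, racing frees go straight to the buddy
+ * allocator, so there is no need to drain per-cpu lists before
+ * re-checking whether the range is fully isolated.
+ */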
ret = test_pages_isolated(start_pfn, end_pfn, MEMORY_OFFLINE);
- if (ret)
- drain_all_pages(zone);
+
} while (ret);
/* Mark all sections offline and remove free pages from the buddy. */
__offline_isolated_pages(start_pfn, end_pfn);
- pr_info("Offlined Pages %ld\n", nr_pages);
+ pr_debug("Offlined Pages %ld\n", nr_pages);
/*
* The memory sections are marked offline, and the pageblock flags
zone->nr_isolate_pageblock -= nr_pages / pageblock_nr_pages;
spin_unlock_irqrestore(&zone->lock, flags);
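+ /* The range is offline; pcplists can operate normally again. */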
+ zone_pcp_enable(zone);
+
/* removal success */
adjust_managed_page_count(pfn_to_page(start_pfn), -nr_pages);
zone->present_pages -= nr_pages;
failed_removal_isolated:
undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
memory_notify(MEM_CANCEL_OFFLINE, &arg);
+failed_removal_pcplists_disabled:
+ zone_pcp_enable(zone);
failed_removal:
pr_debug("memory offlining [mem %#010llx-%#010llx] failed due to %s\n",
(unsigned long long) start_pfn << PAGE_SHIFT,
}
EXPORT_SYMBOL_GPL(remove_memory);
+static int try_offline_memory_block(struct memory_block *mem, void *arg)
+{
+ uint8_t online_type = MMOP_ONLINE_KERNEL;
+ uint8_t **online_types = arg;
+ struct page *page;
+ int rc;
+
+ /*
+ * Sense the online_type via the zone of the memory block. Offlining
+ * with multiple zones within one memory block will be rejected
+ * by offlining code ... so we don't care about that.
+ */
+ page = pfn_to_online_page(section_nr_to_pfn(mem->start_section_nr));
+ if (page && zone_idx(page_zone(page)) == ZONE_MOVABLE)
+ online_type = MMOP_ONLINE_MOVABLE;
+
+ rc = device_offline(&mem->dev);
+ /*
+ * Default is MMOP_OFFLINE - change it only if offlining succeeded,
+ * so try_reonline_memory_block() can do the right thing.
+ */
+ if (!rc)
+ **online_types = online_type;
+
+ (*online_types)++;
+ /* Ignore if already offline. */
+ return rc < 0 ? rc : 0;
+}
+
+static int try_reonline_memory_block(struct memory_block *mem, void *arg)
+{
+ uint8_t **online_types = arg;
+ int rc;
+
+ if (**online_types != MMOP_OFFLINE) {
+ mem->online_type = **online_types;
+ rc = device_online(&mem->dev);
+ if (rc < 0)
+ pr_warn("%s: Failed to re-online memory: %d",
+ __func__, rc);
+ }
+
+ /* Continue processing all remaining memory blocks. */
+ (*online_types)++;
+ return 0;
+}
+
/*
- * Try to offline and remove a memory block. Might take a long time to
- * finish in case memory is still in use. Primarily useful for memory devices
- * that logically unplugged all memory (so it's no longer in use) and want to
- * offline + remove the memory block.
+ * Try to offline and remove memory. Might take a long time to finish in case
+ * memory is still in use. Primarily useful for memory devices that logically
+ * unplugged all memory (so it's no longer in use) and want to offline + remove
+ * that memory.
*/
int offline_and_remove_memory(int nid, u64 start, u64 size)
{
- struct memory_block *mem;
- int rc = -EINVAL;
+ const unsigned long mb_count = size / memory_block_size_bytes();
+ uint8_t *online_types, *tmp;
+ int rc;
if (!IS_ALIGNED(start, memory_block_size_bytes()) ||
- size != memory_block_size_bytes())
- return rc;
+ !IS_ALIGNED(size, memory_block_size_bytes()) || !size)
+ return -EINVAL;
+
+ /*
+ * We'll remember the old online type of each memory block, so we can
+ * try to revert whatever we did when offlining one memory block fails
+ * after offlining some others succeeded.
+ */
+ online_types = kmalloc_array(mb_count, sizeof(*online_types),
+ GFP_KERNEL);
+ if (!online_types)
+ return -ENOMEM;
+ /*
+ * Initialize all states to MMOP_OFFLINE, so when we abort processing in
+ * try_offline_memory_block(), we'll skip all unprocessed blocks in
+ * try_reonline_memory_block().
+ */
+ memset(online_types, MMOP_OFFLINE, mb_count);
lock_device_hotplug();
- mem = find_memory_block(__pfn_to_section(PFN_DOWN(start)));
- if (mem)
- rc = device_offline(&mem->dev);
- /* Ignore if the device is already offline. */
- if (rc > 0)
- rc = 0;
+
+ tmp = online_types;
+ rc = walk_memory_blocks(start, size, &tmp, try_offline_memory_block);
/*
- * In case we succeeded to offline the memory block, remove it.
+ * In case we succeeded to offline all memory, remove it.
* This cannot fail as it cannot get onlined in the meantime.
*/
if (!rc) {
rc = try_remove_memory(nid, start, size);
- WARN_ON_ONCE(rc);
+ if (rc)
+ pr_err("%s: Failed to remove memory: %d", __func__, rc);
+ }
+
+ /*
+ * Rollback what we did. While memory onlining might theoretically fail
+ * (nacked by a notifier), it barely ever happens.
+ */
+ if (rc) {
+ tmp = online_types;
+ walk_memory_blocks(start, size, &tmp,
+ try_reonline_memory_block);
}
unlock_device_hotplug();
+ kfree(online_types);
return rc;
}
EXPORT_SYMBOL_GPL(offline_and_remove_memory);