net/mlx5e: RX, Break the wqe bulk refill in smaller chunks
author     Dragos Tatulea <dtatulea@nvidia.com>
           Tue, 21 Feb 2023 19:05:07 +0000 (21:05 +0200)
committer  Saeed Mahameed <saeedm@nvidia.com>
           Tue, 28 Mar 2023 20:43:59 +0000 (13:43 -0700)
To avoid overflowing the page pool's cache, don't release the
whole wqe bulk at once, as it is usually larger than the cache
refill size. Instead, group release+alloc into cache refill units
that allow releasing to the cache and then allocating from the cache.

A refill_unit variable is added as an iteration unit over the
wqe_bulk when doing release+alloc.
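
As a worked illustration (the numbers are assumed, not taken from the
code): with 4K pages and a PP_ALLOC_CACHE_REFILL of 64 pages, one cache
refill covers 256KB. If MAX_WQE_BULK_BYTES evaluated to 512KB, then
split_factor = DIV_ROUND_UP(512KB, 256KB) = 2, and a wqe_bulk of
16 WQEs would be refilled in steps of
refill_unit = DIV_ROUND_UP(16, 2) = 8 WQEs.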

For a single ring, single core, default MTU (1500) TCP stream
test, the percentage of pages allocated directly from the cache
(rx_pp_recycle_cached) increases from 0% to 52%:

+---------------------------------------------+
| Page Pool stats (/sec)  |  Before |   After |
+-------------------------+---------+---------+
|rx_pp_alloc_fast         | 2145422 | 2193802 |
|rx_pp_alloc_slow         |       2 |       0 |
|rx_pp_alloc_empty        |       2 |       0 |
|rx_pp_alloc_refill       |   34059 |   16634 |
|rx_pp_alloc_waive        |       0 |       0 |
|rx_pp_recycle_cached     |       0 | 1145818 |
|rx_pp_recycle_cache_full |       0 |       0 |
|rx_pp_recycle_ring       | 2179361 | 1064616 |
|rx_pp_recycle_ring_full  |     121 |       0 |
+---------------------------------------------+

With this patch, the performance for legacy rq for the above test is
back to baseline.
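
A simplified, self-contained sketch of the chunked release+alloc loop
(illustration only; the actual implementation is mlx5e_refill_rx_wqes()
in the en_rx.c hunk below, and the helpers here are hypothetical
stand-ins for mlx5e_free_rx_wqes()/mlx5e_alloc_rx_wqes()):

  /* Hypothetical stand-ins for the driver's free/alloc helpers. */
  void release_wqe_pages(int ix, int count); /* returns pages to the pool cache */
  int alloc_wqe_pages(int ix, int count);    /* ideally served from the cache */

  int refill_in_chunks(int head, int wqe_bulk, int refill_unit)
  {
          int remaining = wqe_bulk;
          int done = 0;

          do {
                  /* Cap each step at one cache refill's worth of WQEs. */
                  int chunk = remaining < refill_unit ? remaining : refill_unit;
                  int allocated;

                  release_wqe_pages(head + done, chunk);
                  allocated = alloc_wqe_pages(head + done, chunk);
                  done += allocated;
                  if (allocated != chunk)
                          break;

                  remaining -= chunk;
          } while (remaining);

          return done;
  }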

Signed-off-by: Dragos Tatulea <dtatulea@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
drivers/net/ethernet/mellanox/mlx5/core/en.h
drivers/net/ethernet/mellanox/mlx5/core/en/params.c
drivers/net/ethernet/mellanox/mlx5/core/en_rx.c

drivers/net/ethernet/mellanox/mlx5/core/en.h
index a087c43..ba615b7 100644
@@ -671,6 +671,7 @@ struct mlx5e_rq_frags_info {
        u8 num_frags;
        u8 log_num_frags;
        u16 wqe_bulk;
+       u16 refill_unit;
        u8 wqe_index_mask;
 };
 
drivers/net/ethernet/mellanox/mlx5/core/en/params.c
index 40218d7..31f3c6e 100644
@@ -674,6 +674,7 @@ static void mlx5e_rx_compute_wqe_bulk_params(struct mlx5e_params *params,
        u32 bulk_bound_rq_size_in_bytes;
        u32 sum_frag_strides = 0;
        u32 wqe_bulk_in_bytes;
+       u16 split_factor;
        u32 wqe_bulk;
        int i;
 
@@ -702,6 +703,10 @@ static void mlx5e_rx_compute_wqe_bulk_params(struct mlx5e_params *params,
         * by older WQEs.
         */
        info->wqe_bulk = max_t(u16, info->wqe_index_mask + 1, wqe_bulk);
+
+       split_factor = DIV_ROUND_UP(MAX_WQE_BULK_BYTES(params->xdp_prog),
+                                   PP_ALLOC_CACHE_REFILL * PAGE_SIZE);
+       info->refill_unit = DIV_ROUND_UP(info->wqe_bulk, split_factor);
 }
 
 #define DEFAULT_FRAG_SIZE (2048)
@@ -817,7 +822,8 @@ out:
         */
        mlx5e_rx_compute_wqe_bulk_params(params, info);
 
-       mlx5_core_dbg(mdev, "%s: wqe_bulk = %u\n", __func__, info->wqe_bulk);
+       mlx5_core_dbg(mdev, "%s: wqe_bulk = %u, wqe_bulk_refill_unit = %u\n",
+                     __func__, info->wqe_bulk, info->refill_unit);
 
        info->log_num_frags = order_base_2(info->num_frags);
 
drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 9c5270e..df5dbef 100644
@@ -449,6 +449,31 @@ static int mlx5e_alloc_rx_wqes(struct mlx5e_rq *rq, u16 ix, int wqe_bulk)
        return i;
 }
 
+static int mlx5e_refill_rx_wqes(struct mlx5e_rq *rq, u16 ix, int wqe_bulk)
+{
+       int remaining = wqe_bulk;
+       int i = 0;
+
+       /* The WQE bulk is split into smaller bulks that are sized
+        * according to the page pool cache refill size to avoid overflowing
+        * the page pool cache due to too many page releases at once.
+        */
+       do {
+               int refill = min_t(u16, rq->wqe.info.refill_unit, remaining);
+               int alloc_count;
+
+               mlx5e_free_rx_wqes(rq, ix + i, refill);
+               alloc_count = mlx5e_alloc_rx_wqes(rq, ix + i, refill);
+               i += alloc_count;
+               if (unlikely(alloc_count != refill))
+                       break;
+
+               remaining -= refill;
+       } while (remaining);
+
+       return i;
+}
+
 static inline void
 mlx5e_add_skb_frag(struct mlx5e_rq *rq, struct sk_buff *skb,
                   struct page *page, u32 frag_offset, u32 len,
@@ -837,8 +862,7 @@ INDIRECT_CALLABLE_SCOPE bool mlx5e_post_rx_wqes(struct mlx5e_rq *rq)
        wqe_bulk -= (head + wqe_bulk) & rq->wqe.info.wqe_index_mask;
 
        if (!rq->xsk_pool) {
-               mlx5e_free_rx_wqes(rq, head, wqe_bulk);
-               count = mlx5e_alloc_rx_wqes(rq, head, wqe_bulk);
+               count = mlx5e_refill_rx_wqes(rq, head, wqe_bulk);
        } else if (likely(!rq->xsk_pool->dma_need_sync)) {
                mlx5e_xsk_free_rx_wqes(rq, head, wqe_bulk);
                count = mlx5e_xsk_alloc_rx_wqes_batched(rq, head, wqe_bulk);