drivers/net/ethernet/google/gve/gve_tx_dqo.c

   1 // SPDX-License-Identifier: (GPL-2.0 OR MIT)
   2 /* Google virtual Ethernet (gve) driver
   3  *
   4  * Copyright (C) 2015-2021 Google, Inc.
   5  */
   6
   7 #include "gve.h"
   8 #include "gve_adminq.h"
   9 #include "gve_utils.h"
  10 #include "gve_dqo.h"
  11 #include <linux/tcp.h>
  12 #include <linux/slab.h>
  13 #include <linux/skbuff.h>
  14
  15 /* Returns true if a gve_tx_pending_packet_dqo object is available. */
  16 static bool gve_has_pending_packet(struct gve_tx_ring *tx)
  17 {
  18         /* Check TX path's list. */
  19         if (tx->dqo_tx.free_pending_packets != -1)
  20                 return true;
  21
  22         /* Check completion handler's list. */
  23         if (atomic_read_acquire(&tx->dqo_compl.free_pending_packets) != -1)
  24                 return true;
  25
  26         return false;
  27 }
  28
  29 static struct gve_tx_pending_packet_dqo *
  30 gve_alloc_pending_packet(struct gve_tx_ring *tx)
  31 {
  32         struct gve_tx_pending_packet_dqo *pending_packet;
  33         s16 index;
  34
  35         index = tx->dqo_tx.free_pending_packets;
  36
  37         /* No pending_packets available, try to steal the list from the
  38          * completion handler.
  39          */
  40         if (unlikely(index == -1)) {
  41                 tx->dqo_tx.free_pending_packets =
  42                         atomic_xchg(&tx->dqo_compl.free_pending_packets, -1);
  43                 index = tx->dqo_tx.free_pending_packets;
  44
  45                 if (unlikely(index == -1))
  46                         return NULL;
  47         }
  48
  49         pending_packet = &tx->dqo.pending_packets[index];
  50
  51         /* Remove pending_packet from free list */
  52         tx->dqo_tx.free_pending_packets = pending_packet->next;
  53         pending_packet->state = GVE_PACKET_STATE_PENDING_DATA_COMPL;
  54
  55         return pending_packet;
  56 }
  57
  58 static void
  59 gve_free_pending_packet(struct gve_tx_ring *tx,
  60                         struct gve_tx_pending_packet_dqo *pending_packet)
  61 {
  62         s16 index = pending_packet - tx->dqo.pending_packets;
  63
  64         pending_packet->state = GVE_PACKET_STATE_UNALLOCATED;
  65         while (true) {
  66                 s16 old_head = atomic_read_acquire(&tx->dqo_compl.free_pending_packets);
  67
  68                 pending_packet->next = old_head;
  69                 if (atomic_cmpxchg(&tx->dqo_compl.free_pending_packets,
  70                                    old_head, index) == old_head) {
  71                         break;
  72                 }
  73         }
  74 }
  75
  76 /* gve_tx_free_desc - Cleans up all pending tx requests and buffers.
  77  */
  78 static void gve_tx_clean_pending_packets(struct gve_tx_ring *tx)
  79 {
  80         int i;
  81
  82         for (i = 0; i < tx->dqo.num_pending_packets; i++) {
  83                 struct gve_tx_pending_packet_dqo *cur_state =
  84                         &tx->dqo.pending_packets[i];
  85                 int j;
  86
  87                 for (j = 0; j < cur_state->num_bufs; j++) {
  88                         if (j == 0) {
  89                                 dma_unmap_single(tx->dev,
  90                                         dma_unmap_addr(cur_state, dma[j]),
  91                                         dma_unmap_len(cur_state, len[j]),
  92                                         DMA_TO_DEVICE);
  93                         } else {
  94                                 dma_unmap_page(tx->dev,
  95                                         dma_unmap_addr(cur_state, dma[j]),
  96                                         dma_unmap_len(cur_state, len[j]),
  97                                         DMA_TO_DEVICE);
  98                         }
  99                 }
 100                 if (cur_state->skb) {
 101                         dev_consume_skb_any(cur_state->skb);
 102                         cur_state->skb = NULL;
 103                 }
 104         }
 105 }
 106
 107 static void gve_tx_free_ring_dqo(struct gve_priv *priv, int idx)
 108 {
 109         struct gve_tx_ring *tx = &priv->tx[idx];
 110         struct device *hdev = &priv->pdev->dev;
 111         size_t bytes;
 112
 113         gve_tx_remove_from_block(priv, idx);
 114
 115         if (tx->q_resources) {
 116                 dma_free_coherent(hdev, sizeof(*tx->q_resources),
 117                                   tx->q_resources, tx->q_resources_bus);
 118                 tx->q_resources = NULL;
 119         }
 120
 121         if (tx->dqo.compl_ring) {
 122                 bytes = sizeof(tx->dqo.compl_ring[0]) *
 123                         (tx->dqo.complq_mask + 1);
 124                 dma_free_coherent(hdev, bytes, tx->dqo.compl_ring,
 125                                   tx->complq_bus_dqo);
 126                 tx->dqo.compl_ring = NULL;
 127         }
 128
 129         if (tx->dqo.tx_ring) {
 130                 bytes = sizeof(tx->dqo.tx_ring[0]) * (tx->mask + 1);
 131                 dma_free_coherent(hdev, bytes, tx->dqo.tx_ring, tx->bus);
 132                 tx->dqo.tx_ring = NULL;
 133         }
 134
 135         kvfree(tx->dqo.pending_packets);
 136         tx->dqo.pending_packets = NULL;
 137
 138         netif_dbg(priv, drv, priv->dev, "freed tx queue %d\n", idx);
 139 }
 140
 141 static int gve_tx_alloc_ring_dqo(struct gve_priv *priv, int idx)
 142 {
 143         struct gve_tx_ring *tx = &priv->tx[idx];
 144         struct device *hdev = &priv->pdev->dev;
 145         int num_pending_packets;
 146         size_t bytes;
 147         int i;
 148
 149         memset(tx, 0, sizeof(*tx));
 150         tx->q_num = idx;
 151         tx->dev = &priv->pdev->dev;
 152         tx->netdev_txq = netdev_get_tx_queue(priv->dev, idx);
 153         atomic_set_release(&tx->dqo_compl.hw_tx_head, 0);
 154
 155         /* Queue sizes must be a power of 2 */
 156         tx->mask = priv->tx_desc_cnt - 1;
 157         tx->dqo.complq_mask = priv->options_dqo_rda.tx_comp_ring_entries - 1;
 158
 159         /* The max number of pending packets determines the maximum number of
 160          * descriptors which maybe written to the completion queue.
 161          *
 162          * We must set the number small enough to make sure we never overrun the
 163          * completion queue.
 164          */
 165         num_pending_packets = tx->dqo.complq_mask + 1;
 166
 167         /* Reserve space for descriptor completions, which will be reported at
 168          * most every GVE_TX_MIN_RE_INTERVAL packets.
 169          */
 170         num_pending_packets -=
 171                 (tx->dqo.complq_mask + 1) / GVE_TX_MIN_RE_INTERVAL;
 172
 173         /* Each packet may have at most 2 buffer completions if it receives both
 174          * a miss and reinjection completion.
 175          */
 176         num_pending_packets /= 2;
 177
 178         tx->dqo.num_pending_packets = min_t(int, num_pending_packets, S16_MAX);
 179         tx->dqo.pending_packets = kvcalloc(tx->dqo.num_pending_packets,
 180                                            sizeof(tx->dqo.pending_packets[0]),
 181                                            GFP_KERNEL);
 182         if (!tx->dqo.pending_packets)
 183                 goto err;
 184
 185         /* Set up linked list of pending packets */
 186         for (i = 0; i < tx->dqo.num_pending_packets - 1; i++)
 187                 tx->dqo.pending_packets[i].next = i + 1;
 188
 189         tx->dqo.pending_packets[tx->dqo.num_pending_packets - 1].next = -1;
 190         atomic_set_release(&tx->dqo_compl.free_pending_packets, -1);
 191         tx->dqo_compl.miss_completions.head = -1;
 192         tx->dqo_compl.miss_completions.tail = -1;
 193         tx->dqo_compl.timed_out_completions.head = -1;
 194         tx->dqo_compl.timed_out_completions.tail = -1;
 195
 196         bytes = sizeof(tx->dqo.tx_ring[0]) * (tx->mask + 1);
 197         tx->dqo.tx_ring = dma_alloc_coherent(hdev, bytes, &tx->bus, GFP_KERNEL);
 198         if (!tx->dqo.tx_ring)
 199                 goto err;
 200
 201         bytes = sizeof(tx->dqo.compl_ring[0]) * (tx->dqo.complq_mask + 1);
 202         tx->dqo.compl_ring = dma_alloc_coherent(hdev, bytes,
 203                                                 &tx->complq_bus_dqo,
 204                                                 GFP_KERNEL);
 205         if (!tx->dqo.compl_ring)
 206                 goto err;
 207
 208         tx->q_resources = dma_alloc_coherent(hdev, sizeof(*tx->q_resources),
 209                                              &tx->q_resources_bus, GFP_KERNEL);
 210         if (!tx->q_resources)
 211                 goto err;
 212
 213         gve_tx_add_to_block(priv, idx);
 214
 215         return 0;
 216
 217 err:
 218         gve_tx_free_ring_dqo(priv, idx);
 219         return -ENOMEM;
 220 }
 221
 222 int gve_tx_alloc_rings_dqo(struct gve_priv *priv)
 223 {
 224         int err = 0;
 225         int i;
 226
 227         for (i = 0; i < priv->tx_cfg.num_queues; i++) {
 228                 err = gve_tx_alloc_ring_dqo(priv, i);
 229                 if (err) {
 230                         netif_err(priv, drv, priv->dev,
 231                                   "Failed to alloc tx ring=%d: err=%d\n",
 232                                   i, err);
 233                         goto err;
 234                 }
 235         }
 236
 237         return 0;
 238
 239 err:
 240         for (i--; i >= 0; i--)
 241                 gve_tx_free_ring_dqo(priv, i);
 242
 243         return err;
 244 }
 245
 246 void gve_tx_free_rings_dqo(struct gve_priv *priv)
 247 {
 248         int i;
 249
 250         for (i = 0; i < priv->tx_cfg.num_queues; i++) {
 251                 struct gve_tx_ring *tx = &priv->tx[i];
 252
 253                 gve_clean_tx_done_dqo(priv, tx, /*napi=*/NULL);
 254                 netdev_tx_reset_queue(tx->netdev_txq);
 255                 gve_tx_clean_pending_packets(tx);
 256
 257                 gve_tx_free_ring_dqo(priv, i);
 258         }
 259 }
 260
 261 /* Returns the number of slots available in the ring */
 262 static u32 num_avail_tx_slots(const struct gve_tx_ring *tx)
 263 {
 264         u32 num_used = (tx->dqo_tx.tail - tx->dqo_tx.head) & tx->mask;
 265
 266         return tx->mask - num_used;
 267 }
 268
 269 /* Stops the queue if available descriptors is less than 'count'.
 270  * Return: 0 if stop is not required.
 271  */
 272 static int gve_maybe_stop_tx_dqo(struct gve_tx_ring *tx, int count)
 273 {
 274         if (likely(gve_has_pending_packet(tx) &&
 275                    num_avail_tx_slots(tx) >= count))
 276                 return 0;
 277
 278         /* Update cached TX head pointer */
 279         tx->dqo_tx.head = atomic_read_acquire(&tx->dqo_compl.hw_tx_head);
 280
 281         if (likely(gve_has_pending_packet(tx) &&
 282                    num_avail_tx_slots(tx) >= count))
 283                 return 0;
 284
 285         /* No space, so stop the queue */
 286         tx->stop_queue++;
 287         netif_tx_stop_queue(tx->netdev_txq);
 288
 289         /* Sync with restarting queue in `gve_tx_poll_dqo()` */
 290         mb();
 291
 292         /* After stopping queue, check if we can transmit again in order to
 293          * avoid TOCTOU bug.
 294          */
 295         tx->dqo_tx.head = atomic_read_acquire(&tx->dqo_compl.hw_tx_head);
 296
 297         if (likely(!gve_has_pending_packet(tx) ||
 298                    num_avail_tx_slots(tx) < count))
 299                 return -EBUSY;
 300
 301         netif_tx_start_queue(tx->netdev_txq);
 302         tx->wake_queue++;
 303         return 0;
 304 }
 305
 306 static void gve_extract_tx_metadata_dqo(const struct sk_buff *skb,
 307                                         struct gve_tx_metadata_dqo *metadata)
 308 {
 309         memset(metadata, 0, sizeof(*metadata));
 310         metadata->version = GVE_TX_METADATA_VERSION_DQO;
 311
 312         if (skb->l4_hash) {
 313                 u16 path_hash = skb->hash ^ (skb->hash >> 16);
 314
 315                 path_hash &= (1 << 15) - 1;
 316                 if (unlikely(path_hash == 0))
 317                         path_hash = ~path_hash;
 318
 319                 metadata->path_hash = path_hash;
 320         }
 321 }
 322
 323 static void gve_tx_fill_pkt_desc_dqo(struct gve_tx_ring *tx, u32 *desc_idx,
 324                                      struct sk_buff *skb, u32 len, u64 addr,
 325                                      s16 compl_tag, bool eop, bool is_gso)
 326 {
 327         const bool checksum_offload_en = skb->ip_summed == CHECKSUM_PARTIAL;
 328
 329         while (len > 0) {
 330                 struct gve_tx_pkt_desc_dqo *desc =
 331                         &tx->dqo.tx_ring[*desc_idx].pkt;
 332                 u32 cur_len = min_t(u32, len, GVE_TX_MAX_BUF_SIZE_DQO);
 333                 bool cur_eop = eop && cur_len == len;
 334
 335                 *desc = (struct gve_tx_pkt_desc_dqo){
 336                         .buf_addr = cpu_to_le64(addr),
 337                         .dtype = GVE_TX_PKT_DESC_DTYPE_DQO,
 338                         .end_of_packet = cur_eop,
 339                         .checksum_offload_enable = checksum_offload_en,
 340                         .compl_tag = cpu_to_le16(compl_tag),
 341                         .buf_size = cur_len,
 342                 };
 343
 344                 addr += cur_len;
 345                 len -= cur_len;
 346                 *desc_idx = (*desc_idx + 1) & tx->mask;
 347         }
 348 }
 349
 350 /* Validates and prepares `skb` for TSO.
 351  *
 352  * Returns header length, or < 0 if invalid.
 353  */
 354 static int gve_prep_tso(struct sk_buff *skb)
 355 {
 356         struct tcphdr *tcp;
 357         int header_len;
 358         u32 paylen;
 359         int err;
 360
 361         /* Note: HW requires MSS (gso_size) to be <= 9728 and the total length
 362          * of the TSO to be <= 262143.
 363          *
 364          * However, we don't validate these because:
 365          * - Hypervisor enforces a limit of 9K MTU
 366          * - Kernel will not produce a TSO larger than 64k
 367          */
 368
 369         if (unlikely(skb_shinfo(skb)->gso_size < GVE_TX_MIN_TSO_MSS_DQO))
 370                 return -1;
 371
 372         /* Needed because we will modify header. */
 373         err = skb_cow_head(skb, 0);
 374         if (err < 0)
 375                 return err;
 376
 377         tcp = tcp_hdr(skb);
 378
 379         /* Remove payload length from checksum. */
 380         paylen = skb->len - skb_transport_offset(skb);
 381
 382         switch (skb_shinfo(skb)->gso_type) {
 383         case SKB_GSO_TCPV4:
 384         case SKB_GSO_TCPV6:
 385                 csum_replace_by_diff(&tcp->check,
 386                                      (__force __wsum)htonl(paylen));
 387
 388                 /* Compute length of segmentation header. */
 389                 header_len = skb_transport_offset(skb) + tcp_hdrlen(skb);
 390                 break;
 391         default:
 392                 return -EINVAL;
 393         }
 394
 395         if (unlikely(header_len > GVE_TX_MAX_HDR_SIZE_DQO))
 396                 return -EINVAL;
 397
 398         return header_len;
 399 }
 400
 401 static void gve_tx_fill_tso_ctx_desc(struct gve_tx_tso_context_desc_dqo *desc,
 402                                      const struct sk_buff *skb,
 403                                      const struct gve_tx_metadata_dqo *metadata,
 404                                      int header_len)
 405 {
 406         *desc = (struct gve_tx_tso_context_desc_dqo){
 407                 .header_len = header_len,
 408                 .cmd_dtype = {
 409                         .dtype = GVE_TX_TSO_CTX_DESC_DTYPE_DQO,
 410                         .tso = 1,
 411                 },
 412                 .flex0 = metadata->bytes[0],
 413                 .flex5 = metadata->bytes[5],
 414                 .flex6 = metadata->bytes[6],
 415                 .flex7 = metadata->bytes[7],
 416                 .flex8 = metadata->bytes[8],
 417                 .flex9 = metadata->bytes[9],
 418                 .flex10 = metadata->bytes[10],
 419                 .flex11 = metadata->bytes[11],
 420         };
 421         desc->tso_total_len = skb->len - header_len;
 422         desc->mss = skb_shinfo(skb)->gso_size;
 423 }
 424
 425 static void
 426 gve_tx_fill_general_ctx_desc(struct gve_tx_general_context_desc_dqo *desc,
 427                              const struct gve_tx_metadata_dqo *metadata)
 428 {
 429         *desc = (struct gve_tx_general_context_desc_dqo){
 430                 .flex0 = metadata->bytes[0],
 431                 .flex1 = metadata->bytes[1],
 432                 .flex2 = metadata->bytes[2],
 433                 .flex3 = metadata->bytes[3],
 434                 .flex4 = metadata->bytes[4],
 435                 .flex5 = metadata->bytes[5],
 436                 .flex6 = metadata->bytes[6],
 437                 .flex7 = metadata->bytes[7],
 438                 .flex8 = metadata->bytes[8],
 439                 .flex9 = metadata->bytes[9],
 440                 .flex10 = metadata->bytes[10],
 441                 .flex11 = metadata->bytes[11],
 442                 .cmd_dtype = {.dtype = GVE_TX_GENERAL_CTX_DESC_DTYPE_DQO},
 443         };
 444 }
 445
 446 /* Returns 0 on success, or < 0 on error.
 447  *
 448  * Before this function is called, the caller must ensure
 449  * gve_has_pending_packet(tx) returns true.
 450  */
 451 static int gve_tx_add_skb_no_copy_dqo(struct gve_tx_ring *tx,
 452                                       struct sk_buff *skb)
 453 {
 454         const struct skb_shared_info *shinfo = skb_shinfo(skb);
 455         const bool is_gso = skb_is_gso(skb);
 456         u32 desc_idx = tx->dqo_tx.tail;
 457
 458         struct gve_tx_pending_packet_dqo *pkt;
 459         struct gve_tx_metadata_dqo metadata;
 460         s16 completion_tag;
 461         int i;
 462
 463         pkt = gve_alloc_pending_packet(tx);
 464         pkt->skb = skb;
 465         pkt->num_bufs = 0;
 466         completion_tag = pkt - tx->dqo.pending_packets;
 467
 468         gve_extract_tx_metadata_dqo(skb, &metadata);
 469         if (is_gso) {
 470                 int header_len = gve_prep_tso(skb);
 471
 472                 if (unlikely(header_len < 0))
 473                         goto err;
 474
 475                 gve_tx_fill_tso_ctx_desc(&tx->dqo.tx_ring[desc_idx].tso_ctx,
 476                                          skb, &metadata, header_len);
 477                 desc_idx = (desc_idx + 1) & tx->mask;
 478         }
 479
 480         gve_tx_fill_general_ctx_desc(&tx->dqo.tx_ring[desc_idx].general_ctx,
 481                                      &metadata);
 482         desc_idx = (desc_idx + 1) & tx->mask;
 483
 484         /* Note: HW requires that the size of a non-TSO packet be within the
 485          * range of [17, 9728].
 486          *
 487          * We don't double check because
 488          * - We limited `netdev->min_mtu` to ETH_MIN_MTU.
 489          * - Hypervisor won't allow MTU larger than 9216.
 490          */
 491
 492         /* Map the linear portion of skb */
 493         {
 494                 u32 len = skb_headlen(skb);
 495                 dma_addr_t addr;
 496
 497                 addr = dma_map_single(tx->dev, skb->data, len, DMA_TO_DEVICE);
 498                 if (unlikely(dma_mapping_error(tx->dev, addr)))
 499                         goto err;
 500
 501                 dma_unmap_len_set(pkt, len[pkt->num_bufs], len);
 502                 dma_unmap_addr_set(pkt, dma[pkt->num_bufs], addr);
 503                 ++pkt->num_bufs;
 504
 505                 gve_tx_fill_pkt_desc_dqo(tx, &desc_idx, skb, len, addr,
 506                                          completion_tag,
 507                                          /*eop=*/shinfo->nr_frags == 0, is_gso);
 508         }
 509
 510         for (i = 0; i < shinfo->nr_frags; i++) {
 511                 const skb_frag_t *frag = &shinfo->frags[i];
 512                 bool is_eop = i == (shinfo->nr_frags - 1);
 513                 u32 len = skb_frag_size(frag);
 514                 dma_addr_t addr;
 515
 516                 addr = skb_frag_dma_map(tx->dev, frag, 0, len, DMA_TO_DEVICE);
 517                 if (unlikely(dma_mapping_error(tx->dev, addr)))
 518                         goto err;
 519
 520                 dma_unmap_len_set(pkt, len[pkt->num_bufs], len);
 521                 dma_unmap_addr_set(pkt, dma[pkt->num_bufs], addr);
 522                 ++pkt->num_bufs;
 523
 524                 gve_tx_fill_pkt_desc_dqo(tx, &desc_idx, skb, len, addr,
 525                                          completion_tag, is_eop, is_gso);
 526         }
 527
 528         /* Commit the changes to our state */
 529         tx->dqo_tx.tail = desc_idx;
 530
 531         /* Request a descriptor completion on the last descriptor of the
 532          * packet if we are allowed to by the HW enforced interval.
 533          */
 534         {
 535                 u32 last_desc_idx = (desc_idx - 1) & tx->mask;
 536                 u32 last_report_event_interval =
 537                         (last_desc_idx - tx->dqo_tx.last_re_idx) & tx->mask;
 538
 539                 if (unlikely(last_report_event_interval >=
 540                              GVE_TX_MIN_RE_INTERVAL)) {
 541                         tx->dqo.tx_ring[last_desc_idx].pkt.report_event = true;
 542                         tx->dqo_tx.last_re_idx = last_desc_idx;
 543                 }
 544         }
 545
 546         return 0;
 547
 548 err:
 549         for (i = 0; i < pkt->num_bufs; i++) {
 550                 if (i == 0) {
 551                         dma_unmap_single(tx->dev,
 552                                          dma_unmap_addr(pkt, dma[i]),
 553                                          dma_unmap_len(pkt, len[i]),
 554                                          DMA_TO_DEVICE);
 555                 } else {
 556                         dma_unmap_page(tx->dev,
 557                                        dma_unmap_addr(pkt, dma[i]),
 558                                        dma_unmap_len(pkt, len[i]),
 559                                        DMA_TO_DEVICE);
 560                 }
 561         }
 562
 563         pkt->skb = NULL;
 564         pkt->num_bufs = 0;
 565         gve_free_pending_packet(tx, pkt);
 566
 567         return -1;
 568 }
 569
 570 static int gve_num_descs_per_buf(size_t size)
 571 {
 572         return DIV_ROUND_UP(size, GVE_TX_MAX_BUF_SIZE_DQO);
 573 }
 574
 575 static int gve_num_buffer_descs_needed(const struct sk_buff *skb)
 576 {
 577         const struct skb_shared_info *shinfo = skb_shinfo(skb);
 578         int num_descs;
 579         int i;
 580
 581         num_descs = gve_num_descs_per_buf(skb_headlen(skb));
 582
 583         for (i = 0; i < shinfo->nr_frags; i++) {
 584                 unsigned int frag_size = skb_frag_size(&shinfo->frags[i]);
 585
 586                 num_descs += gve_num_descs_per_buf(frag_size);
 587         }
 588
 589         return num_descs;
 590 }
 591
 592 /* Returns true if HW is capable of sending TSO represented by `skb`.
 593  *
 594  * Each segment must not span more than GVE_TX_MAX_DATA_DESCS buffers.
 595  * - The header is counted as one buffer for every single segment.
 596  * - A buffer which is split between two segments is counted for both.
 597  * - If a buffer contains both header and payload, it is counted as two buffers.
 598  */
 599 static bool gve_can_send_tso(const struct sk_buff *skb)
 600 {
 601         const int header_len = skb_checksum_start_offset(skb) + tcp_hdrlen(skb);
 602         const int max_bufs_per_seg = GVE_TX_MAX_DATA_DESCS - 1;
 603         const struct skb_shared_info *shinfo = skb_shinfo(skb);
 604         const int gso_size = shinfo->gso_size;
 605         int cur_seg_num_bufs;
 606         int cur_seg_size;
 607         int i;
 608
 609         cur_seg_size = skb_headlen(skb) - header_len;
 610         cur_seg_num_bufs = cur_seg_size > 0;
 611
 612         for (i = 0; i < shinfo->nr_frags; i++) {
 613                 if (cur_seg_size >= gso_size) {
 614                         cur_seg_size %= gso_size;
 615                         cur_seg_num_bufs = cur_seg_size > 0;
 616                 }
 617
 618                 if (unlikely(++cur_seg_num_bufs > max_bufs_per_seg))
 619                         return false;
 620
 621                 cur_seg_size += skb_frag_size(&shinfo->frags[i]);
 622         }
 623
 624         return true;
 625 }
 626
 627 /* Attempt to transmit specified SKB.
 628  *
 629  * Returns 0 if the SKB was transmitted or dropped.
 630  * Returns -1 if there is not currently enough space to transmit the SKB.
 631  */
 632 static int gve_try_tx_skb(struct gve_priv *priv, struct gve_tx_ring *tx,
 633                           struct sk_buff *skb)
 634 {
 635         int num_buffer_descs;
 636         int total_num_descs;
 637
 638         if (skb_is_gso(skb)) {
 639                 /* If TSO doesn't meet HW requirements, attempt to linearize the
 640                  * packet.
 641                  */
 642                 if (unlikely(!gve_can_send_tso(skb) &&
 643                              skb_linearize(skb) < 0)) {
 644                         net_err_ratelimited("%s: Failed to transmit TSO packet\n",
 645                                             priv->dev->name);
 646                         goto drop;
 647                 }
 648
 649                 num_buffer_descs = gve_num_buffer_descs_needed(skb);
 650         } else {
 651                 num_buffer_descs = gve_num_buffer_descs_needed(skb);
 652
 653                 if (unlikely(num_buffer_descs > GVE_TX_MAX_DATA_DESCS)) {
 654                         if (unlikely(skb_linearize(skb) < 0))
 655                                 goto drop;
 656
 657                         num_buffer_descs = 1;
 658                 }
 659         }
 660
 661         /* Metadata + (optional TSO) + data descriptors. */
 662         total_num_descs = 1 + skb_is_gso(skb) + num_buffer_descs;
 663         if (unlikely(gve_maybe_stop_tx_dqo(tx, total_num_descs +
 664                         GVE_TX_MIN_DESC_PREVENT_CACHE_OVERLAP))) {
 665                 return -1;
 666         }
 667
 668         if (unlikely(gve_tx_add_skb_no_copy_dqo(tx, skb) < 0))
 669                 goto drop;
 670
 671         netdev_tx_sent_queue(tx->netdev_txq, skb->len);
 672         skb_tx_timestamp(skb);
 673         return 0;
 674
 675 drop:
 676         tx->dropped_pkt++;
 677         dev_kfree_skb_any(skb);
 678         return 0;
 679 }
 680
 681 /* Transmit a given skb and ring the doorbell. */
 682 netdev_tx_t gve_tx_dqo(struct sk_buff *skb, struct net_device *dev)
 683 {
 684         struct gve_priv *priv = netdev_priv(dev);
 685         struct gve_tx_ring *tx;
 686
 687         tx = &priv->tx[skb_get_queue_mapping(skb)];
 688         if (unlikely(gve_try_tx_skb(priv, tx, skb) < 0)) {
 689                 /* We need to ring the txq doorbell -- we have stopped the Tx
 690                  * queue for want of resources, but prior calls to gve_tx()
 691                  * may have added descriptors without ringing the doorbell.
 692                  */
 693                 gve_tx_put_doorbell_dqo(priv, tx->q_resources, tx->dqo_tx.tail);
 694                 return NETDEV_TX_BUSY;
 695         }
 696
 697         if (!netif_xmit_stopped(tx->netdev_txq) && netdev_xmit_more())
 698                 return NETDEV_TX_OK;
 699
 700         gve_tx_put_doorbell_dqo(priv, tx->q_resources, tx->dqo_tx.tail);
 701         return NETDEV_TX_OK;
 702 }
 703
 704 static void add_to_list(struct gve_tx_ring *tx, struct gve_index_list *list,
 705                         struct gve_tx_pending_packet_dqo *pending_packet)
 706 {
 707         s16 old_tail, index;
 708
 709         index = pending_packet - tx->dqo.pending_packets;
 710         old_tail = list->tail;
 711         list->tail = index;
 712         if (old_tail == -1)
 713                 list->head = index;
 714         else
 715                 tx->dqo.pending_packets[old_tail].next = index;
 716
 717         pending_packet->next = -1;
 718         pending_packet->prev = old_tail;
 719 }
 720
 721 static void remove_from_list(struct gve_tx_ring *tx,
 722                              struct gve_index_list *list,
 723                              struct gve_tx_pending_packet_dqo *pkt)
 724 {
 725         s16 prev_index, next_index;
 726
 727         prev_index = pkt->prev;
 728         next_index = pkt->next;
 729
 730         if (prev_index == -1) {
 731                 /* Node is head */
 732                 list->head = next_index;
 733         } else {
 734                 tx->dqo.pending_packets[prev_index].next = next_index;
 735         }
 736         if (next_index == -1) {
 737                 /* Node is tail */
 738                 list->tail = prev_index;
 739         } else {
 740                 tx->dqo.pending_packets[next_index].prev = prev_index;
 741         }
 742 }
 743
 744 static void gve_unmap_packet(struct device *dev,
 745                              struct gve_tx_pending_packet_dqo *pkt)
 746 {
 747         int i;
 748
 749         /* SKB linear portion is guaranteed to be mapped */
 750         dma_unmap_single(dev, dma_unmap_addr(pkt, dma[0]),
 751                          dma_unmap_len(pkt, len[0]), DMA_TO_DEVICE);
 752         for (i = 1; i < pkt->num_bufs; i++) {
 753                 dma_unmap_page(dev, dma_unmap_addr(pkt, dma[i]),
 754                                dma_unmap_len(pkt, len[i]), DMA_TO_DEVICE);
 755         }
 756         pkt->num_bufs = 0;
 757 }
 758
 759 /* Completion types and expected behavior:
 760  * No Miss compl + Packet compl = Packet completed normally.
 761  * Miss compl + Re-inject compl = Packet completed normally.
 762  * No Miss compl + Re-inject compl = Skipped i.e. packet not completed.
 763  * Miss compl + Packet compl = Skipped i.e. packet not completed.
 764  */
 765 static void gve_handle_packet_completion(struct gve_priv *priv,
 766                                          struct gve_tx_ring *tx, bool is_napi,
 767                                          u16 compl_tag, u64 *bytes, u64 *pkts,
 768                                          bool is_reinjection)
 769 {
 770         struct gve_tx_pending_packet_dqo *pending_packet;
 771
 772         if (unlikely(compl_tag >= tx->dqo.num_pending_packets)) {
 773                 net_err_ratelimited("%s: Invalid TX completion tag: %d\n",
 774                                     priv->dev->name, (int)compl_tag);
 775                 return;
 776         }
 777
 778         pending_packet = &tx->dqo.pending_packets[compl_tag];
 779
 780         if (unlikely(is_reinjection)) {
 781                 if (unlikely(pending_packet->state ==
 782                              GVE_PACKET_STATE_TIMED_OUT_COMPL)) {
 783                         net_err_ratelimited("%s: Re-injection completion: %d received after timeout.\n",
 784                                             priv->dev->name, (int)compl_tag);
 785                         /* Packet was already completed as a result of timeout,
 786                          * so just remove from list and free pending packet.
 787                          */
 788                         remove_from_list(tx,
 789                                          &tx->dqo_compl.timed_out_completions,
 790                                          pending_packet);
 791                         gve_free_pending_packet(tx, pending_packet);
 792                         return;
 793                 }
 794                 if (unlikely(pending_packet->state !=
 795                              GVE_PACKET_STATE_PENDING_REINJECT_COMPL)) {
 796                         /* No outstanding miss completion but packet allocated
 797                          * implies packet receives a re-injection completion
 798                          * without a prior miss completion. Return without
 799                          * completing the packet.
 800                          */
 801                         net_err_ratelimited("%s: Re-injection completion received without corresponding miss completion: %d\n",
 802                                             priv->dev->name, (int)compl_tag);
 803                         return;
 804                 }
 805                 remove_from_list(tx, &tx->dqo_compl.miss_completions,
 806                                  pending_packet);
 807         } else {
 808                 /* Packet is allocated but not a pending data completion. */
 809                 if (unlikely(pending_packet->state !=
 810                              GVE_PACKET_STATE_PENDING_DATA_COMPL)) {
 811                         net_err_ratelimited("%s: No pending data completion: %d\n",
 812                                             priv->dev->name, (int)compl_tag);
 813                         return;
 814                 }
 815         }
 816         gve_unmap_packet(tx->dev, pending_packet);
 817
 818         *bytes += pending_packet->skb->len;
 819         (*pkts)++;
 820         napi_consume_skb(pending_packet->skb, is_napi);
 821         pending_packet->skb = NULL;
 822         gve_free_pending_packet(tx, pending_packet);
 823 }
 824
 825 static void gve_handle_miss_completion(struct gve_priv *priv,
 826                                        struct gve_tx_ring *tx, u16 compl_tag,
 827                                        u64 *bytes, u64 *pkts)
 828 {
 829         struct gve_tx_pending_packet_dqo *pending_packet;
 830
 831         if (unlikely(compl_tag >= tx->dqo.num_pending_packets)) {
 832                 net_err_ratelimited("%s: Invalid TX completion tag: %d\n",
 833                                     priv->dev->name, (int)compl_tag);
 834                 return;
 835         }
 836
 837         pending_packet = &tx->dqo.pending_packets[compl_tag];
 838         if (unlikely(pending_packet->state !=
 839                                 GVE_PACKET_STATE_PENDING_DATA_COMPL)) {
 840                 net_err_ratelimited("%s: Unexpected packet state: %d for completion tag : %d\n",
 841                                     priv->dev->name, (int)pending_packet->state,
 842                                     (int)compl_tag);
 843                 return;
 844         }
 845
 846         pending_packet->state = GVE_PACKET_STATE_PENDING_REINJECT_COMPL;
 847         /* jiffies can wraparound but time comparisons can handle overflows. */
 848         pending_packet->timeout_jiffies =
 849                         jiffies +
 850                         msecs_to_jiffies(GVE_REINJECT_COMPL_TIMEOUT *
 851                                          MSEC_PER_SEC);
 852         add_to_list(tx, &tx->dqo_compl.miss_completions, pending_packet);
 853
 854         *bytes += pending_packet->skb->len;
 855         (*pkts)++;
 856 }
 857
 858 static void remove_miss_completions(struct gve_priv *priv,
 859                                     struct gve_tx_ring *tx)
 860 {
 861         struct gve_tx_pending_packet_dqo *pending_packet;
 862         s16 next_index;
 863
 864         next_index = tx->dqo_compl.miss_completions.head;
 865         while (next_index != -1) {
 866                 pending_packet = &tx->dqo.pending_packets[next_index];
 867                 next_index = pending_packet->next;
 868                 /* Break early because packets should timeout in order. */
 869                 if (time_is_after_jiffies(pending_packet->timeout_jiffies))
 870                         break;
 871
 872                 remove_from_list(tx, &tx->dqo_compl.miss_completions,
 873                                  pending_packet);
 874                 /* Unmap buffers and free skb but do not unallocate packet i.e.
 875                  * the completion tag is not freed to ensure that the driver
 876                  * can take appropriate action if a corresponding valid
 877                  * completion is received later.
 878                  */
 879                 gve_unmap_packet(tx->dev, pending_packet);
 880                 /* This indicates the packet was dropped. */
 881                 dev_kfree_skb_any(pending_packet->skb);
 882                 pending_packet->skb = NULL;
 883                 tx->dropped_pkt++;
 884                 net_err_ratelimited("%s: No reinjection completion was received for: %d.\n",
 885                                     priv->dev->name,
 886                                     (int)(pending_packet - tx->dqo.pending_packets));
 887
 888                 pending_packet->state = GVE_PACKET_STATE_TIMED_OUT_COMPL;
 889                 pending_packet->timeout_jiffies =
 890                                 jiffies +
 891                                 msecs_to_jiffies(GVE_DEALLOCATE_COMPL_TIMEOUT *
 892                                                  MSEC_PER_SEC);
 893                 /* Maintain pending packet in another list so the packet can be
 894                  * unallocated at a later time.
 895                  */
 896                 add_to_list(tx, &tx->dqo_compl.timed_out_completions,
 897                             pending_packet);
 898         }
 899 }
 900
 901 static void remove_timed_out_completions(struct gve_priv *priv,
 902                                          struct gve_tx_ring *tx)
 903 {
 904         struct gve_tx_pending_packet_dqo *pending_packet;
 905         s16 next_index;
 906
 907         next_index = tx->dqo_compl.timed_out_completions.head;
 908         while (next_index != -1) {
 909                 pending_packet = &tx->dqo.pending_packets[next_index];
 910                 next_index = pending_packet->next;
 911                 /* Break early because packets should timeout in order. */
 912                 if (time_is_after_jiffies(pending_packet->timeout_jiffies))
 913                         break;
 914
 915                 remove_from_list(tx, &tx->dqo_compl.timed_out_completions,
 916                                  pending_packet);
 917                 gve_free_pending_packet(tx, pending_packet);
 918         }
 919 }
 920
 921 int gve_clean_tx_done_dqo(struct gve_priv *priv, struct gve_tx_ring *tx,
 922                           struct napi_struct *napi)
 923 {
 924         u64 reinject_compl_bytes = 0;
 925         u64 reinject_compl_pkts = 0;
 926         int num_descs_cleaned = 0;
 927         u64 miss_compl_bytes = 0;
 928         u64 miss_compl_pkts = 0;
 929         u64 pkt_compl_bytes = 0;
 930         u64 pkt_compl_pkts = 0;
 931
 932         /* Limit in order to avoid blocking for too long */
 933         while (!napi || pkt_compl_pkts < napi->weight) {
 934                 struct gve_tx_compl_desc *compl_desc =
 935                         &tx->dqo.compl_ring[tx->dqo_compl.head];
 936                 u16 type;
 937
 938                 if (compl_desc->generation == tx->dqo_compl.cur_gen_bit)
 939                         break;
 940
 941                 /* Prefetch the next descriptor. */
 942                 prefetch(&tx->dqo.compl_ring[(tx->dqo_compl.head + 1) &
 943                                 tx->dqo.complq_mask]);
 944
 945                 /* Do not read data until we own the descriptor */
 946                 dma_rmb();
 947                 type = compl_desc->type;
 948
 949                 if (type == GVE_COMPL_TYPE_DQO_DESC) {
 950                         /* This is the last descriptor fetched by HW plus one */
 951                         u16 tx_head = le16_to_cpu(compl_desc->tx_head);
 952
 953                         atomic_set_release(&tx->dqo_compl.hw_tx_head, tx_head);
 954                 } else if (type == GVE_COMPL_TYPE_DQO_PKT) {
 955                         u16 compl_tag = le16_to_cpu(compl_desc->completion_tag);
 956
 957                         gve_handle_packet_completion(priv, tx, !!napi,
 958                                                      compl_tag,
 959                                                      &pkt_compl_bytes,
 960                                                      &pkt_compl_pkts,
 961                                                      /*is_reinjection=*/false);
 962                 } else if (type == GVE_COMPL_TYPE_DQO_MISS) {
 963                         u16 compl_tag = le16_to_cpu(compl_desc->completion_tag);
 964
 965                         gve_handle_miss_completion(priv, tx, compl_tag,
 966                                                    &miss_compl_bytes,
 967                                                    &miss_compl_pkts);
 968                 } else if (type == GVE_COMPL_TYPE_DQO_REINJECTION) {
 969                         u16 compl_tag = le16_to_cpu(compl_desc->completion_tag);
 970
 971                         gve_handle_packet_completion(priv, tx, !!napi,
 972                                                      compl_tag,
 973                                                      &reinject_compl_bytes,
 974                                                      &reinject_compl_pkts,
 975                                                      /*is_reinjection=*/true);
 976                 }
 977
 978                 tx->dqo_compl.head =
 979                         (tx->dqo_compl.head + 1) & tx->dqo.complq_mask;
 980                 /* Flip the generation bit when we wrap around */
 981                 tx->dqo_compl.cur_gen_bit ^= tx->dqo_compl.head == 0;
 982                 num_descs_cleaned++;
 983         }
 984
 985         netdev_tx_completed_queue(tx->netdev_txq,
 986                                   pkt_compl_pkts + miss_compl_pkts,
 987                                   pkt_compl_bytes + miss_compl_bytes);
 988
 989         remove_miss_completions(priv, tx);
 990         remove_timed_out_completions(priv, tx);
 991
 992         u64_stats_update_begin(&tx->statss);
 993         tx->bytes_done += pkt_compl_bytes + reinject_compl_bytes;
 994         tx->pkt_done += pkt_compl_pkts + reinject_compl_pkts;
 995         u64_stats_update_end(&tx->statss);
 996         return num_descs_cleaned;
 997 }
 998
 999 bool gve_tx_poll_dqo(struct gve_notify_block *block, bool do_clean)
1000 {
1001         struct gve_tx_compl_desc *compl_desc;
1002         struct gve_tx_ring *tx = block->tx;
1003         struct gve_priv *priv = block->priv;
1004
1005         if (do_clean) {
1006                 int num_descs_cleaned = gve_clean_tx_done_dqo(priv, tx,
1007                                                               &block->napi);
1008
1009                 /* Sync with queue being stopped in `gve_maybe_stop_tx_dqo()` */
1010                 mb();
1011
1012                 if (netif_tx_queue_stopped(tx->netdev_txq) &&
1013                     num_descs_cleaned > 0) {
1014                         tx->wake_queue++;
1015                         netif_tx_wake_queue(tx->netdev_txq);
1016                 }
1017         }
1018
1019         /* Return true if we still have work. */
1020         compl_desc = &tx->dqo.compl_ring[tx->dqo_compl.head];
1021         return compl_desc->generation != tx->dqo_compl.cur_gen_bit;
1022 }