google/gve: fix repeated words in comments
drivers/net/ethernet/google/gve/gve_tx_dqo.c
1 // SPDX-License-Identifier: (GPL-2.0 OR MIT)
2 /* Google virtual Ethernet (gve) driver
3  *
4  * Copyright (C) 2015-2021 Google, Inc.
5  */
6
7 #include "gve.h"
8 #include "gve_adminq.h"
9 #include "gve_utils.h"
10 #include "gve_dqo.h"
11 #include <linux/tcp.h>
12 #include <linux/slab.h>
13 #include <linux/skbuff.h>
14
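/* Pending packets are tracked on two singly linked free lists indexed by s16,
 * with -1 marking an empty list: one owned by the TX path
 * (dqo_tx.free_pending_packets, no atomics needed) and one shared with the
 * completion path (dqo_compl.free_pending_packets, updated atomically). When
 * the TX path's list runs dry, it steals the completion path's entire list
 * with a single atomic exchange.
 */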
15 /* Returns true if a gve_tx_pending_packet_dqo object is available. */
16 static bool gve_has_pending_packet(struct gve_tx_ring *tx)
17 {
18         /* Check TX path's list. */
19         if (tx->dqo_tx.free_pending_packets != -1)
20                 return true;
21
22         /* Check completion handler's list. */
23         if (atomic_read_acquire(&tx->dqo_compl.free_pending_packets) != -1)
24                 return true;
25
26         return false;
27 }
28
29 static struct gve_tx_pending_packet_dqo *
30 gve_alloc_pending_packet(struct gve_tx_ring *tx)
31 {
32         struct gve_tx_pending_packet_dqo *pending_packet;
33         s16 index;
34
35         index = tx->dqo_tx.free_pending_packets;
36
37         /* No pending_packets available, try to steal the list from the
38          * completion handler.
39          */
40         if (unlikely(index == -1)) {
41                 tx->dqo_tx.free_pending_packets =
42                         atomic_xchg(&tx->dqo_compl.free_pending_packets, -1);
43                 index = tx->dqo_tx.free_pending_packets;
44
45                 if (unlikely(index == -1))
46                         return NULL;
47         }
48
49         pending_packet = &tx->dqo.pending_packets[index];
50
51         /* Remove pending_packet from free list */
52         tx->dqo_tx.free_pending_packets = pending_packet->next;
53         pending_packet->state = GVE_PACKET_STATE_PENDING_DATA_COMPL;
54
55         return pending_packet;
56 }
57
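/* Returns a pending packet to the shared free list with a lock-free push:
 * the new head is installed via cmpxchg, retrying if the list head changed
 * concurrently.
 */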
58 static void
59 gve_free_pending_packet(struct gve_tx_ring *tx,
60                         struct gve_tx_pending_packet_dqo *pending_packet)
61 {
62         s16 index = pending_packet - tx->dqo.pending_packets;
63
64         pending_packet->state = GVE_PACKET_STATE_UNALLOCATED;
65         while (true) {
66                 s16 old_head = atomic_read_acquire(&tx->dqo_compl.free_pending_packets);
67
68                 pending_packet->next = old_head;
69                 if (atomic_cmpxchg(&tx->dqo_compl.free_pending_packets,
70                                    old_head, index) == old_head) {
71                         break;
72                 }
73         }
74 }
75
76 /* gve_tx_clean_pending_packets - Cleans up all pending tx requests and buffers.
77  */
78 static void gve_tx_clean_pending_packets(struct gve_tx_ring *tx)
79 {
80         int i;
81
82         for (i = 0; i < tx->dqo.num_pending_packets; i++) {
83                 struct gve_tx_pending_packet_dqo *cur_state =
84                         &tx->dqo.pending_packets[i];
85                 int j;
86
87                 for (j = 0; j < cur_state->num_bufs; j++) {
88                         if (j == 0) {
89                                 dma_unmap_single(tx->dev,
90                                         dma_unmap_addr(cur_state, dma[j]),
91                                         dma_unmap_len(cur_state, len[j]),
92                                         DMA_TO_DEVICE);
93                         } else {
94                                 dma_unmap_page(tx->dev,
95                                         dma_unmap_addr(cur_state, dma[j]),
96                                         dma_unmap_len(cur_state, len[j]),
97                                         DMA_TO_DEVICE);
98                         }
99                 }
100                 if (cur_state->skb) {
101                         dev_consume_skb_any(cur_state->skb);
102                         cur_state->skb = NULL;
103                 }
104         }
105 }
106
107 static void gve_tx_free_ring_dqo(struct gve_priv *priv, int idx)
108 {
109         struct gve_tx_ring *tx = &priv->tx[idx];
110         struct device *hdev = &priv->pdev->dev;
111         size_t bytes;
112
113         gve_tx_remove_from_block(priv, idx);
114
115         if (tx->q_resources) {
116                 dma_free_coherent(hdev, sizeof(*tx->q_resources),
117                                   tx->q_resources, tx->q_resources_bus);
118                 tx->q_resources = NULL;
119         }
120
121         if (tx->dqo.compl_ring) {
122                 bytes = sizeof(tx->dqo.compl_ring[0]) *
123                         (tx->dqo.complq_mask + 1);
124                 dma_free_coherent(hdev, bytes, tx->dqo.compl_ring,
125                                   tx->complq_bus_dqo);
126                 tx->dqo.compl_ring = NULL;
127         }
128
129         if (tx->dqo.tx_ring) {
130                 bytes = sizeof(tx->dqo.tx_ring[0]) * (tx->mask + 1);
131                 dma_free_coherent(hdev, bytes, tx->dqo.tx_ring, tx->bus);
132                 tx->dqo.tx_ring = NULL;
133         }
134
135         kvfree(tx->dqo.pending_packets);
136         tx->dqo.pending_packets = NULL;
137
138         netif_dbg(priv, drv, priv->dev, "freed tx queue %d\n", idx);
139 }
140
141 static int gve_tx_alloc_ring_dqo(struct gve_priv *priv, int idx)
142 {
143         struct gve_tx_ring *tx = &priv->tx[idx];
144         struct device *hdev = &priv->pdev->dev;
145         int num_pending_packets;
146         size_t bytes;
147         int i;
148
149         memset(tx, 0, sizeof(*tx));
150         tx->q_num = idx;
151         tx->dev = &priv->pdev->dev;
152         tx->netdev_txq = netdev_get_tx_queue(priv->dev, idx);
153         atomic_set_release(&tx->dqo_compl.hw_tx_head, 0);
154
155         /* Queue sizes must be a power of 2 */
156         tx->mask = priv->tx_desc_cnt - 1;
157         tx->dqo.complq_mask = priv->options_dqo_rda.tx_comp_ring_entries - 1;
158
159         /* The max number of pending packets determines the maximum number of
160          * descriptors which may be written to the completion queue.
161          *
162          * We must set the number small enough to make sure we never overrun the
163          * completion queue.
164          */
165         num_pending_packets = tx->dqo.complq_mask + 1;
166
167         /* Reserve space for descriptor completions, which will be reported at
168          * most every GVE_TX_MIN_RE_INTERVAL packets.
169          */
170         num_pending_packets -=
171                 (tx->dqo.complq_mask + 1) / GVE_TX_MIN_RE_INTERVAL;
172
173         /* Each packet may have at most 2 buffer completions if it receives both
174          * a miss and reinjection completion.
175          */
176         num_pending_packets /= 2;
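        /* For example, assuming a 1024-entry completion queue and a
         * GVE_TX_MIN_RE_INTERVAL of 32, this leaves (1024 - 32) / 2 = 496
         * pending packets.
         */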
177
178         tx->dqo.num_pending_packets = min_t(int, num_pending_packets, S16_MAX);
179         tx->dqo.pending_packets = kvcalloc(tx->dqo.num_pending_packets,
180                                            sizeof(tx->dqo.pending_packets[0]),
181                                            GFP_KERNEL);
182         if (!tx->dqo.pending_packets)
183                 goto err;
184
185         /* Set up the free list of pending packets; -1 terminates the list */
186         for (i = 0; i < tx->dqo.num_pending_packets - 1; i++)
187                 tx->dqo.pending_packets[i].next = i + 1;
188
189         tx->dqo.pending_packets[tx->dqo.num_pending_packets - 1].next = -1;
190         atomic_set_release(&tx->dqo_compl.free_pending_packets, -1);
191         tx->dqo_compl.miss_completions.head = -1;
192         tx->dqo_compl.miss_completions.tail = -1;
193         tx->dqo_compl.timed_out_completions.head = -1;
194         tx->dqo_compl.timed_out_completions.tail = -1;
195
196         bytes = sizeof(tx->dqo.tx_ring[0]) * (tx->mask + 1);
197         tx->dqo.tx_ring = dma_alloc_coherent(hdev, bytes, &tx->bus, GFP_KERNEL);
198         if (!tx->dqo.tx_ring)
199                 goto err;
200
201         bytes = sizeof(tx->dqo.compl_ring[0]) * (tx->dqo.complq_mask + 1);
202         tx->dqo.compl_ring = dma_alloc_coherent(hdev, bytes,
203                                                 &tx->complq_bus_dqo,
204                                                 GFP_KERNEL);
205         if (!tx->dqo.compl_ring)
206                 goto err;
207
208         tx->q_resources = dma_alloc_coherent(hdev, sizeof(*tx->q_resources),
209                                              &tx->q_resources_bus, GFP_KERNEL);
210         if (!tx->q_resources)
211                 goto err;
212
213         gve_tx_add_to_block(priv, idx);
214
215         return 0;
216
217 err:
218         gve_tx_free_ring_dqo(priv, idx);
219         return -ENOMEM;
220 }
221
222 int gve_tx_alloc_rings_dqo(struct gve_priv *priv)
223 {
224         int err = 0;
225         int i;
226
227         for (i = 0; i < priv->tx_cfg.num_queues; i++) {
228                 err = gve_tx_alloc_ring_dqo(priv, i);
229                 if (err) {
230                         netif_err(priv, drv, priv->dev,
231                                   "Failed to alloc tx ring=%d: err=%d\n",
232                                   i, err);
233                         goto err;
234                 }
235         }
236
237         return 0;
238
239 err:
240         for (i--; i >= 0; i--)
241                 gve_tx_free_ring_dqo(priv, i);
242
243         return err;
244 }
245
246 void gve_tx_free_rings_dqo(struct gve_priv *priv)
247 {
248         int i;
249
250         for (i = 0; i < priv->tx_cfg.num_queues; i++) {
251                 struct gve_tx_ring *tx = &priv->tx[i];
252
253                 gve_clean_tx_done_dqo(priv, tx, /*napi=*/NULL);
254                 netdev_tx_reset_queue(tx->netdev_txq);
255                 gve_tx_clean_pending_packets(tx);
256
257                 gve_tx_free_ring_dqo(priv, i);
258         }
259 }
260
261 /* Returns the number of slots available in the ring */
262 static u32 num_avail_tx_slots(const struct gve_tx_ring *tx)
263 {
264         u32 num_used = (tx->dqo_tx.tail - tx->dqo_tx.head) & tx->mask;
265
266         return tx->mask - num_used;
267 }
268
269 /* Stops the queue if the number of available descriptors is less than 'count'.
270  * Return: 0 if stop is not required.
271  */
272 static int gve_maybe_stop_tx_dqo(struct gve_tx_ring *tx, int count)
273 {
274         if (likely(gve_has_pending_packet(tx) &&
275                    num_avail_tx_slots(tx) >= count))
276                 return 0;
277
278         /* Update cached TX head pointer */
279         tx->dqo_tx.head = atomic_read_acquire(&tx->dqo_compl.hw_tx_head);
280
281         if (likely(gve_has_pending_packet(tx) &&
282                    num_avail_tx_slots(tx) >= count))
283                 return 0;
284
285         /* No space, so stop the queue */
286         tx->stop_queue++;
287         netif_tx_stop_queue(tx->netdev_txq);
288
289         /* Sync with restarting queue in `gve_tx_poll_dqo()` */
290         mb();
291
292         /* After stopping the queue, check again whether we can transmit, in
293          * order to avoid a TOCTOU race.
294          */
295         tx->dqo_tx.head = atomic_read_acquire(&tx->dqo_compl.hw_tx_head);
296
297         if (likely(!gve_has_pending_packet(tx) ||
298                    num_avail_tx_slots(tx) < count))
299                 return -EBUSY;
300
301         netif_tx_start_queue(tx->netdev_txq);
302         tx->wake_queue++;
303         return 0;
304 }
305
306 static void gve_extract_tx_metadata_dqo(const struct sk_buff *skb,
307                                         struct gve_tx_metadata_dqo *metadata)
308 {
309         memset(metadata, 0, sizeof(*metadata));
310         metadata->version = GVE_TX_METADATA_VERSION_DQO;
311
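        /* Fold the 32-bit flow hash into 15 bits for the path_hash field;
         * zero is avoided, presumably because the device reads a zero
         * path_hash as "no hash supplied".
         */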
312         if (skb->l4_hash) {
313                 u16 path_hash = skb->hash ^ (skb->hash >> 16);
314
315                 path_hash &= (1 << 15) - 1;
316                 if (unlikely(path_hash == 0))
317                         path_hash = ~path_hash;
318
319                 metadata->path_hash = path_hash;
320         }
321 }
322
323 static void gve_tx_fill_pkt_desc_dqo(struct gve_tx_ring *tx, u32 *desc_idx,
324                                      struct sk_buff *skb, u32 len, u64 addr,
325                                      s16 compl_tag, bool eop, bool is_gso)
326 {
327         const bool checksum_offload_en = skb->ip_summed == CHECKSUM_PARTIAL;
328
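        /* A single buffer may exceed the device's maximum buffer size, so
         * split it across as many descriptors as needed. All chunks share the
         * same completion tag, and only the last chunk of the last buffer
         * carries end_of_packet.
         */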
329         while (len > 0) {
330                 struct gve_tx_pkt_desc_dqo *desc =
331                         &tx->dqo.tx_ring[*desc_idx].pkt;
332                 u32 cur_len = min_t(u32, len, GVE_TX_MAX_BUF_SIZE_DQO);
333                 bool cur_eop = eop && cur_len == len;
334
335                 *desc = (struct gve_tx_pkt_desc_dqo){
336                         .buf_addr = cpu_to_le64(addr),
337                         .dtype = GVE_TX_PKT_DESC_DTYPE_DQO,
338                         .end_of_packet = cur_eop,
339                         .checksum_offload_enable = checksum_offload_en,
340                         .compl_tag = cpu_to_le16(compl_tag),
341                         .buf_size = cur_len,
342                 };
343
344                 addr += cur_len;
345                 len -= cur_len;
346                 *desc_idx = (*desc_idx + 1) & tx->mask;
347         }
348 }
349
350 /* Validates and prepares `skb` for TSO.
351  *
352  * Returns header length, or < 0 if invalid.
353  */
354 static int gve_prep_tso(struct sk_buff *skb)
355 {
356         struct tcphdr *tcp;
357         int header_len;
358         u32 paylen;
359         int err;
360
361         /* Note: HW requires MSS (gso_size) to be <= 9728 and the total length
362          * of the TSO to be <= 262143.
363          *
364          * However, we don't validate these because:
365          * - Hypervisor enforces a limit of 9K MTU
366          * - Kernel will not produce a TSO larger than 64k
367          */
368
369         if (unlikely(skb_shinfo(skb)->gso_size < GVE_TX_MIN_TSO_MSS_DQO))
370                 return -1;
371
372         /* Needed because we will modify the header. */
373         err = skb_cow_head(skb, 0);
374         if (err < 0)
375                 return err;
376
377         tcp = tcp_hdr(skb);
378
379         /* Remove payload length from checksum. */
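        /* The stack seeds tcp->check with a pseudo-header checksum that
         * includes the total payload length; backing that length out leaves
         * a partial checksum the device can complete per segment.
         */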
380         paylen = skb->len - skb_transport_offset(skb);
381
382         switch (skb_shinfo(skb)->gso_type) {
383         case SKB_GSO_TCPV4:
384         case SKB_GSO_TCPV6:
385                 csum_replace_by_diff(&tcp->check,
386                                      (__force __wsum)htonl(paylen));
387
388                 /* Compute length of segmentation header. */
389                 header_len = skb_transport_offset(skb) + tcp_hdrlen(skb);
390                 break;
391         default:
392                 return -EINVAL;
393         }
394
395         if (unlikely(header_len > GVE_TX_MAX_HDR_SIZE_DQO))
396                 return -EINVAL;
397
398         return header_len;
399 }
400
401 static void gve_tx_fill_tso_ctx_desc(struct gve_tx_tso_context_desc_dqo *desc,
402                                      const struct sk_buff *skb,
403                                      const struct gve_tx_metadata_dqo *metadata,
404                                      int header_len)
405 {
406         *desc = (struct gve_tx_tso_context_desc_dqo){
407                 .header_len = header_len,
408                 .cmd_dtype = {
409                         .dtype = GVE_TX_TSO_CTX_DESC_DTYPE_DQO,
410                         .tso = 1,
411                 },
412                 .flex0 = metadata->bytes[0],
413                 .flex5 = metadata->bytes[5],
414                 .flex6 = metadata->bytes[6],
415                 .flex7 = metadata->bytes[7],
416                 .flex8 = metadata->bytes[8],
417                 .flex9 = metadata->bytes[9],
418                 .flex10 = metadata->bytes[10],
419                 .flex11 = metadata->bytes[11],
420         };
421         desc->tso_total_len = skb->len - header_len;
422         desc->mss = skb_shinfo(skb)->gso_size;
423 }
424
425 static void
426 gve_tx_fill_general_ctx_desc(struct gve_tx_general_context_desc_dqo *desc,
427                              const struct gve_tx_metadata_dqo *metadata)
428 {
429         *desc = (struct gve_tx_general_context_desc_dqo){
430                 .flex0 = metadata->bytes[0],
431                 .flex1 = metadata->bytes[1],
432                 .flex2 = metadata->bytes[2],
433                 .flex3 = metadata->bytes[3],
434                 .flex4 = metadata->bytes[4],
435                 .flex5 = metadata->bytes[5],
436                 .flex6 = metadata->bytes[6],
437                 .flex7 = metadata->bytes[7],
438                 .flex8 = metadata->bytes[8],
439                 .flex9 = metadata->bytes[9],
440                 .flex10 = metadata->bytes[10],
441                 .flex11 = metadata->bytes[11],
442                 .cmd_dtype = {.dtype = GVE_TX_GENERAL_CTX_DESC_DTYPE_DQO},
443         };
444 }
445
446 /* Returns 0 on success, or < 0 on error.
447  *
448  * Before this function is called, the caller must ensure
449  * gve_has_pending_packet(tx) returns true.
450  */
451 static int gve_tx_add_skb_no_copy_dqo(struct gve_tx_ring *tx,
452                                       struct sk_buff *skb)
453 {
454         const struct skb_shared_info *shinfo = skb_shinfo(skb);
455         const bool is_gso = skb_is_gso(skb);
456         u32 desc_idx = tx->dqo_tx.tail;
457
458         struct gve_tx_pending_packet_dqo *pkt;
459         struct gve_tx_metadata_dqo metadata;
460         s16 completion_tag;
461         int i;
462
463         pkt = gve_alloc_pending_packet(tx);
464         pkt->skb = skb;
465         pkt->num_bufs = 0;
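        /* The completion tag is simply the pending packet's index; the device
         * echoes it back in completion descriptors so the packet's state can
         * be looked up when the completion arrives.
         */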
466         completion_tag = pkt - tx->dqo.pending_packets;
467
468         gve_extract_tx_metadata_dqo(skb, &metadata);
469         if (is_gso) {
470                 int header_len = gve_prep_tso(skb);
471
472                 if (unlikely(header_len < 0))
473                         goto err;
474
475                 gve_tx_fill_tso_ctx_desc(&tx->dqo.tx_ring[desc_idx].tso_ctx,
476                                          skb, &metadata, header_len);
477                 desc_idx = (desc_idx + 1) & tx->mask;
478         }
479
480         gve_tx_fill_general_ctx_desc(&tx->dqo.tx_ring[desc_idx].general_ctx,
481                                      &metadata);
482         desc_idx = (desc_idx + 1) & tx->mask;
483
484         /* Note: HW requires that the size of a non-TSO packet be within the
485          * range of [17, 9728].
486          *
487          * We don't double-check because:
488          * - We limited `netdev->min_mtu` to ETH_MIN_MTU.
489          * - Hypervisor won't allow MTU larger than 9216.
490          */
491
492         /* Map the linear portion of skb */
493         {
494                 u32 len = skb_headlen(skb);
495                 dma_addr_t addr;
496
497                 addr = dma_map_single(tx->dev, skb->data, len, DMA_TO_DEVICE);
498                 if (unlikely(dma_mapping_error(tx->dev, addr)))
499                         goto err;
500
501                 dma_unmap_len_set(pkt, len[pkt->num_bufs], len);
502                 dma_unmap_addr_set(pkt, dma[pkt->num_bufs], addr);
503                 ++pkt->num_bufs;
504
505                 gve_tx_fill_pkt_desc_dqo(tx, &desc_idx, skb, len, addr,
506                                          completion_tag,
507                                          /*eop=*/shinfo->nr_frags == 0, is_gso);
508         }
509
510         for (i = 0; i < shinfo->nr_frags; i++) {
511                 const skb_frag_t *frag = &shinfo->frags[i];
512                 bool is_eop = i == (shinfo->nr_frags - 1);
513                 u32 len = skb_frag_size(frag);
514                 dma_addr_t addr;
515
516                 addr = skb_frag_dma_map(tx->dev, frag, 0, len, DMA_TO_DEVICE);
517                 if (unlikely(dma_mapping_error(tx->dev, addr)))
518                         goto err;
519
520                 dma_unmap_len_set(pkt, len[pkt->num_bufs], len);
521                 dma_unmap_addr_set(pkt, dma[pkt->num_bufs], addr);
522                 ++pkt->num_bufs;
523
524                 gve_tx_fill_pkt_desc_dqo(tx, &desc_idx, skb, len, addr,
525                                          completion_tag, is_eop, is_gso);
526         }
527
528         /* Commit the changes to our state */
529         tx->dqo_tx.tail = desc_idx;
530
531         /* Request a descriptor completion on the last descriptor of the
532          * packet, if the HW-enforced interval allows it.
533          */
534         {
535                 u32 last_desc_idx = (desc_idx - 1) & tx->mask;
536                 u32 last_report_event_interval =
537                         (last_desc_idx - tx->dqo_tx.last_re_idx) & tx->mask;
538
539                 if (unlikely(last_report_event_interval >=
540                              GVE_TX_MIN_RE_INTERVAL)) {
541                         tx->dqo.tx_ring[last_desc_idx].pkt.report_event = true;
542                         tx->dqo_tx.last_re_idx = last_desc_idx;
543                 }
544         }
545
546         return 0;
547
548 err:
549         for (i = 0; i < pkt->num_bufs; i++) {
550                 if (i == 0) {
551                         dma_unmap_single(tx->dev,
552                                          dma_unmap_addr(pkt, dma[i]),
553                                          dma_unmap_len(pkt, len[i]),
554                                          DMA_TO_DEVICE);
555                 } else {
556                         dma_unmap_page(tx->dev,
557                                        dma_unmap_addr(pkt, dma[i]),
558                                        dma_unmap_len(pkt, len[i]),
559                                        DMA_TO_DEVICE);
560                 }
561         }
562
563         pkt->skb = NULL;
564         pkt->num_bufs = 0;
565         gve_free_pending_packet(tx, pkt);
566
567         return -1;
568 }
569
570 static int gve_num_descs_per_buf(size_t size)
571 {
572         return DIV_ROUND_UP(size, GVE_TX_MAX_BUF_SIZE_DQO);
573 }
574
575 static int gve_num_buffer_descs_needed(const struct sk_buff *skb)
576 {
577         const struct skb_shared_info *shinfo = skb_shinfo(skb);
578         int num_descs;
579         int i;
580
581         num_descs = gve_num_descs_per_buf(skb_headlen(skb));
582
583         for (i = 0; i < shinfo->nr_frags; i++) {
584                 unsigned int frag_size = skb_frag_size(&shinfo->frags[i]);
585
586                 num_descs += gve_num_descs_per_buf(frag_size);
587         }
588
589         return num_descs;
590 }
591
592 /* Returns true if HW is capable of sending TSO represented by `skb`.
593  *
594  * Each segment must not span more than GVE_TX_MAX_DATA_DESCS buffers.
595  * - The header is counted as one buffer for every single segment.
596  * - A buffer which is split between two segments is counted for both.
597  * - If a buffer contains both header and payload, it is counted as two buffers.
598  */
599 static bool gve_can_send_tso(const struct sk_buff *skb)
600 {
601         const int header_len = skb_checksum_start_offset(skb) + tcp_hdrlen(skb);
602         const int max_bufs_per_seg = GVE_TX_MAX_DATA_DESCS - 1;
603         const struct skb_shared_info *shinfo = skb_shinfo(skb);
604         const int gso_size = shinfo->gso_size;
605         int cur_seg_num_bufs;
606         int cur_seg_size;
607         int i;
608
609         cur_seg_size = skb_headlen(skb) - header_len;
610         cur_seg_num_bufs = cur_seg_size > 0;
611
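        /* Walk the fragments, tracking how many buffers the current TSO
         * segment touches; a new segment begins each time gso_size bytes of
         * payload have been consumed.
         */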
612         for (i = 0; i < shinfo->nr_frags; i++) {
613                 if (cur_seg_size >= gso_size) {
614                         cur_seg_size %= gso_size;
615                         cur_seg_num_bufs = cur_seg_size > 0;
616                 }
617
618                 if (unlikely(++cur_seg_num_bufs > max_bufs_per_seg))
619                         return false;
620
621                 cur_seg_size += skb_frag_size(&shinfo->frags[i]);
622         }
623
624         return true;
625 }
626
627 /* Attempt to transmit the specified SKB.
628  *
629  * Returns 0 if the SKB was transmitted or dropped.
630  * Returns -1 if there is not currently enough space to transmit the SKB.
631  */
632 static int gve_try_tx_skb(struct gve_priv *priv, struct gve_tx_ring *tx,
633                           struct sk_buff *skb)
634 {
635         int num_buffer_descs;
636         int total_num_descs;
637
638         if (skb_is_gso(skb)) {
639                 /* If TSO doesn't meet HW requirements, attempt to linearize the
640                  * packet.
641                  */
642                 if (unlikely(!gve_can_send_tso(skb) &&
643                              skb_linearize(skb) < 0)) {
644                         net_err_ratelimited("%s: Failed to transmit TSO packet\n",
645                                             priv->dev->name);
646                         goto drop;
647                 }
648
649                 num_buffer_descs = gve_num_buffer_descs_needed(skb);
650         } else {
651                 num_buffer_descs = gve_num_buffer_descs_needed(skb);
652
653                 if (unlikely(num_buffer_descs > GVE_TX_MAX_DATA_DESCS)) {
654                         if (unlikely(skb_linearize(skb) < 0))
655                                 goto drop;
656
657                         num_buffer_descs = 1;
658                 }
659         }
660
661         /* Metadata + (optional TSO) + data descriptors. */
662         total_num_descs = 1 + skb_is_gso(skb) + num_buffer_descs;
663         if (unlikely(gve_maybe_stop_tx_dqo(tx, total_num_descs +
664                         GVE_TX_MIN_DESC_PREVENT_CACHE_OVERLAP))) {
665                 return -1;
666         }
667
668         if (unlikely(gve_tx_add_skb_no_copy_dqo(tx, skb) < 0))
669                 goto drop;
670
671         netdev_tx_sent_queue(tx->netdev_txq, skb->len);
672         skb_tx_timestamp(skb);
673         return 0;
674
675 drop:
676         tx->dropped_pkt++;
677         dev_kfree_skb_any(skb);
678         return 0;
679 }
680
681 /* Transmit a given skb and ring the doorbell. */
682 netdev_tx_t gve_tx_dqo(struct sk_buff *skb, struct net_device *dev)
683 {
684         struct gve_priv *priv = netdev_priv(dev);
685         struct gve_tx_ring *tx;
686
687         tx = &priv->tx[skb_get_queue_mapping(skb)];
688         if (unlikely(gve_try_tx_skb(priv, tx, skb) < 0)) {
689                 /* We need to ring the txq doorbell -- we have stopped the Tx
690                  * queue for want of resources, but prior calls to gve_tx_dqo()
691                  * may have added descriptors without ringing the doorbell.
692                  */
693                 gve_tx_put_doorbell_dqo(priv, tx->q_resources, tx->dqo_tx.tail);
694                 return NETDEV_TX_BUSY;
695         }
696
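        /* If the stack indicates more packets are queued for this queue and
         * it has not been stopped, defer the doorbell so descriptor writes
         * are batched.
         */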
697         if (!netif_xmit_stopped(tx->netdev_txq) && netdev_xmit_more())
698                 return NETDEV_TX_OK;
699
700         gve_tx_put_doorbell_dqo(priv, tx->q_resources, tx->dqo_tx.tail);
701         return NETDEV_TX_OK;
702 }
703
704 static void add_to_list(struct gve_tx_ring *tx, struct gve_index_list *list,
705                         struct gve_tx_pending_packet_dqo *pending_packet)
706 {
707         s16 old_tail, index;
708
709         index = pending_packet - tx->dqo.pending_packets;
710         old_tail = list->tail;
711         list->tail = index;
712         if (old_tail == -1)
713                 list->head = index;
714         else
715                 tx->dqo.pending_packets[old_tail].next = index;
716
717         pending_packet->next = -1;
718         pending_packet->prev = old_tail;
719 }
720
721 static void remove_from_list(struct gve_tx_ring *tx,
722                              struct gve_index_list *list,
723                              struct gve_tx_pending_packet_dqo *pkt)
724 {
725         s16 prev_index, next_index;
726
727         prev_index = pkt->prev;
728         next_index = pkt->next;
729
730         if (prev_index == -1) {
731                 /* Node is head */
732                 list->head = next_index;
733         } else {
734                 tx->dqo.pending_packets[prev_index].next = next_index;
735         }
736         if (next_index == -1) {
737                 /* Node is tail */
738                 list->tail = prev_index;
739         } else {
740                 tx->dqo.pending_packets[next_index].prev = prev_index;
741         }
742 }
743
744 static void gve_unmap_packet(struct device *dev,
745                              struct gve_tx_pending_packet_dqo *pkt)
746 {
747         int i;
748
749         /* SKB linear portion is guaranteed to be mapped */
750         dma_unmap_single(dev, dma_unmap_addr(pkt, dma[0]),
751                          dma_unmap_len(pkt, len[0]), DMA_TO_DEVICE);
752         for (i = 1; i < pkt->num_bufs; i++) {
753                 dma_unmap_page(dev, dma_unmap_addr(pkt, dma[i]),
754                                dma_unmap_len(pkt, len[i]), DMA_TO_DEVICE);
755         }
756         pkt->num_bufs = 0;
757 }
758
759 /* Completion types and expected behavior:
760  * No Miss compl + Packet compl = Packet completed normally.
761  * Miss compl + Re-inject compl = Packet completed normally.
762  * No Miss compl + Re-inject compl = Skipped i.e. packet not completed.
763  * Miss compl + Packet compl = Skipped i.e. packet not completed.
764  */
765 static void gve_handle_packet_completion(struct gve_priv *priv,
766                                          struct gve_tx_ring *tx, bool is_napi,
767                                          u16 compl_tag, u64 *bytes, u64 *pkts,
768                                          bool is_reinjection)
769 {
770         struct gve_tx_pending_packet_dqo *pending_packet;
771
772         if (unlikely(compl_tag >= tx->dqo.num_pending_packets)) {
773                 net_err_ratelimited("%s: Invalid TX completion tag: %d\n",
774                                     priv->dev->name, (int)compl_tag);
775                 return;
776         }
777
778         pending_packet = &tx->dqo.pending_packets[compl_tag];
779
780         if (unlikely(is_reinjection)) {
781                 if (unlikely(pending_packet->state ==
782                              GVE_PACKET_STATE_TIMED_OUT_COMPL)) {
783                         net_err_ratelimited("%s: Re-injection completion: %d received after timeout.\n",
784                                             priv->dev->name, (int)compl_tag);
785                         /* Packet was already completed as a result of timeout,
786                          * so just remove it from the list and free the pending packet.
787                          */
788                         remove_from_list(tx,
789                                          &tx->dqo_compl.timed_out_completions,
790                                          pending_packet);
791                         gve_free_pending_packet(tx, pending_packet);
792                         return;
793                 }
794                 if (unlikely(pending_packet->state !=
795                              GVE_PACKET_STATE_PENDING_REINJECT_COMPL)) {
796                         /* The packet is allocated but has no outstanding
797                          * miss completion, which means a re-injection
798                          * completion arrived without a prior miss
799                          * completion. Return without completing the packet.
800                          */
801                         net_err_ratelimited("%s: Re-injection completion received without corresponding miss completion: %d\n",
802                                             priv->dev->name, (int)compl_tag);
803                         return;
804                 }
805                 remove_from_list(tx, &tx->dqo_compl.miss_completions,
806                                  pending_packet);
807         } else {
808                 /* Packet is allocated but is not awaiting a data completion. */
809                 if (unlikely(pending_packet->state !=
810                              GVE_PACKET_STATE_PENDING_DATA_COMPL)) {
811                         net_err_ratelimited("%s: No pending data completion: %d\n",
812                                             priv->dev->name, (int)compl_tag);
813                         return;
814                 }
815         }
816         gve_unmap_packet(tx->dev, pending_packet);
817
818         *bytes += pending_packet->skb->len;
819         (*pkts)++;
820         napi_consume_skb(pending_packet->skb, is_napi);
821         pending_packet->skb = NULL;
822         gve_free_pending_packet(tx, pending_packet);
823 }
824
825 static void gve_handle_miss_completion(struct gve_priv *priv,
826                                        struct gve_tx_ring *tx, u16 compl_tag,
827                                        u64 *bytes, u64 *pkts)
828 {
829         struct gve_tx_pending_packet_dqo *pending_packet;
830
831         if (unlikely(compl_tag >= tx->dqo.num_pending_packets)) {
832                 net_err_ratelimited("%s: Invalid TX completion tag: %d\n",
833                                     priv->dev->name, (int)compl_tag);
834                 return;
835         }
836
837         pending_packet = &tx->dqo.pending_packets[compl_tag];
838         if (unlikely(pending_packet->state !=
839                                 GVE_PACKET_STATE_PENDING_DATA_COMPL)) {
840                 net_err_ratelimited("%s: Unexpected packet state: %d for completion tag: %d\n",
841                                     priv->dev->name, (int)pending_packet->state,
842                                     (int)compl_tag);
843                 return;
844         }
845
846         pending_packet->state = GVE_PACKET_STATE_PENDING_REINJECT_COMPL;
847         /* jiffies can wrap around, but the time comparisons handle overflow. */
848         pending_packet->timeout_jiffies =
849                         jiffies +
850                         msecs_to_jiffies(GVE_REINJECT_COMPL_TIMEOUT *
851                                          MSEC_PER_SEC);
852         add_to_list(tx, &tx->dqo_compl.miss_completions, pending_packet);
853
854         *bytes += pending_packet->skb->len;
855         (*pkts)++;
856 }
857
858 static void remove_miss_completions(struct gve_priv *priv,
859                                     struct gve_tx_ring *tx)
860 {
861         struct gve_tx_pending_packet_dqo *pending_packet;
862         s16 next_index;
863
864         next_index = tx->dqo_compl.miss_completions.head;
865         while (next_index != -1) {
866                 pending_packet = &tx->dqo.pending_packets[next_index];
867                 next_index = pending_packet->next;
868                 /* Break early because packets should time out in order. */
869                 if (time_is_after_jiffies(pending_packet->timeout_jiffies))
870                         break;
871
872                 remove_from_list(tx, &tx->dqo_compl.miss_completions,
873                                  pending_packet);
874                 /* Unmap the buffers and free the skb, but do not release the
875                  * pending packet, i.e. the completion tag is kept reserved so
876                  * that the driver can take appropriate action if a
877                  * corresponding valid completion is received later.
878                  */
879                 gve_unmap_packet(tx->dev, pending_packet);
880                 /* This indicates the packet was dropped. */
881                 dev_kfree_skb_any(pending_packet->skb);
882                 pending_packet->skb = NULL;
883                 tx->dropped_pkt++;
884                 net_err_ratelimited("%s: No reinjection completion was received for: %d.\n",
885                                     priv->dev->name,
886                                     (int)(pending_packet - tx->dqo.pending_packets));
887
888                 pending_packet->state = GVE_PACKET_STATE_TIMED_OUT_COMPL;
889                 pending_packet->timeout_jiffies =
890                                 jiffies +
891                                 msecs_to_jiffies(GVE_DEALLOCATE_COMPL_TIMEOUT *
892                                                  MSEC_PER_SEC);
893                 /* Keep the pending packet on another list so that it can be
894                  * released at a later time.
895                  */
896                 add_to_list(tx, &tx->dqo_compl.timed_out_completions,
897                             pending_packet);
898         }
899 }
900
901 static void remove_timed_out_completions(struct gve_priv *priv,
902                                          struct gve_tx_ring *tx)
903 {
904         struct gve_tx_pending_packet_dqo *pending_packet;
905         s16 next_index;
906
907         next_index = tx->dqo_compl.timed_out_completions.head;
908         while (next_index != -1) {
909                 pending_packet = &tx->dqo.pending_packets[next_index];
910                 next_index = pending_packet->next;
911                 /* Break early because packets should time out in order. */
912                 if (time_is_after_jiffies(pending_packet->timeout_jiffies))
913                         break;
914
915                 remove_from_list(tx, &tx->dqo_compl.timed_out_completions,
916                                  pending_packet);
917                 gve_free_pending_packet(tx, pending_packet);
918         }
919 }
920
921 int gve_clean_tx_done_dqo(struct gve_priv *priv, struct gve_tx_ring *tx,
922                           struct napi_struct *napi)
923 {
924         u64 reinject_compl_bytes = 0;
925         u64 reinject_compl_pkts = 0;
926         int num_descs_cleaned = 0;
927         u64 miss_compl_bytes = 0;
928         u64 miss_compl_pkts = 0;
929         u64 pkt_compl_bytes = 0;
930         u64 pkt_compl_pkts = 0;
931
932         /* Limit in order to avoid blocking for too long */
933         while (!napi || pkt_compl_pkts < napi->weight) {
934                 struct gve_tx_compl_desc *compl_desc =
935                         &tx->dqo.compl_ring[tx->dqo_compl.head];
936                 u16 type;
937
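                /* The generation bit toggles on every pass over the
                 * completion ring, so an entry whose bit still equals
                 * cur_gen_bit has not been written by the device since the
                 * driver's last pass.
                 */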
938                 if (compl_desc->generation == tx->dqo_compl.cur_gen_bit)
939                         break;
940
941                 /* Prefetch the next descriptor. */
942                 prefetch(&tx->dqo.compl_ring[(tx->dqo_compl.head + 1) &
943                                 tx->dqo.complq_mask]);
944
945                 /* Do not read data until we own the descriptor */
946                 dma_rmb();
947                 type = compl_desc->type;
948
949                 if (type == GVE_COMPL_TYPE_DQO_DESC) {
950                         /* This is the last descriptor fetched by HW plus one */
951                         u16 tx_head = le16_to_cpu(compl_desc->tx_head);
952
953                         atomic_set_release(&tx->dqo_compl.hw_tx_head, tx_head);
954                 } else if (type == GVE_COMPL_TYPE_DQO_PKT) {
955                         u16 compl_tag = le16_to_cpu(compl_desc->completion_tag);
956
957                         gve_handle_packet_completion(priv, tx, !!napi,
958                                                      compl_tag,
959                                                      &pkt_compl_bytes,
960                                                      &pkt_compl_pkts,
961                                                      /*is_reinjection=*/false);
962                 } else if (type == GVE_COMPL_TYPE_DQO_MISS) {
963                         u16 compl_tag = le16_to_cpu(compl_desc->completion_tag);
964
965                         gve_handle_miss_completion(priv, tx, compl_tag,
966                                                    &miss_compl_bytes,
967                                                    &miss_compl_pkts);
968                 } else if (type == GVE_COMPL_TYPE_DQO_REINJECTION) {
969                         u16 compl_tag = le16_to_cpu(compl_desc->completion_tag);
970
971                         gve_handle_packet_completion(priv, tx, !!napi,
972                                                      compl_tag,
973                                                      &reinject_compl_bytes,
974                                                      &reinject_compl_pkts,
975                                                      /*is_reinjection=*/true);
976                 }
977
978                 tx->dqo_compl.head =
979                         (tx->dqo_compl.head + 1) & tx->dqo.complq_mask;
980                 /* Flip the generation bit when we wrap around */
981                 tx->dqo_compl.cur_gen_bit ^= tx->dqo_compl.head == 0;
982                 num_descs_cleaned++;
983         }
984
985         netdev_tx_completed_queue(tx->netdev_txq,
986                                   pkt_compl_pkts + miss_compl_pkts,
987                                   pkt_compl_bytes + miss_compl_bytes);
988
989         remove_miss_completions(priv, tx);
990         remove_timed_out_completions(priv, tx);
991
992         u64_stats_update_begin(&tx->statss);
993         tx->bytes_done += pkt_compl_bytes + reinject_compl_bytes;
994         tx->pkt_done += pkt_compl_pkts + reinject_compl_pkts;
995         u64_stats_update_end(&tx->statss);
996         return num_descs_cleaned;
997 }
998
999 bool gve_tx_poll_dqo(struct gve_notify_block *block, bool do_clean)
1000 {
1001         struct gve_tx_compl_desc *compl_desc;
1002         struct gve_tx_ring *tx = block->tx;
1003         struct gve_priv *priv = block->priv;
1004
1005         if (do_clean) {
1006                 int num_descs_cleaned = gve_clean_tx_done_dqo(priv, tx,
1007                                                               &block->napi);
1008
1009                 /* Sync with queue being stopped in `gve_maybe_stop_tx_dqo()` */
1010                 mb();
1011
1012                 if (netif_tx_queue_stopped(tx->netdev_txq) &&
1013                     num_descs_cleaned > 0) {
1014                         tx->wake_queue++;
1015                         netif_tx_wake_queue(tx->netdev_txq);
1016                 }
1017         }
1018
1019         /* Return true if we still have work. */
1020         compl_desc = &tx->dqo.compl_ring[tx->dqo_compl.head];
1021         return compl_desc->generation != tx->dqo_compl.cur_gen_bit;
1022 }