1 // SPDX-License-Identifier: (GPL-2.0 OR MIT)
2 /* Google virtual Ethernet (gve) driver
4 * Copyright (C) 2015-2021 Google, Inc.
8 #include "gve_adminq.h"
11 #include <linux/tcp.h>
12 #include <linux/slab.h>
13 #include <linux/skbuff.h>
15 /* Returns true if a gve_tx_pending_packet_dqo object is available. */
16 static bool gve_has_pending_packet(struct gve_tx_ring *tx)
/* -1 is the empty-list sentinel for both free lists. The completion
 * handler's list head is read with acquire ordering to pair with the
 * release on the free path.
 */
18 /* Check TX path's list. */
19 if (tx->dqo_tx.free_pending_packets != -1)
22 /* Check completion handler's list. */
23 if (atomic_read_acquire(&tx->dqo_compl.free_pending_packets) != -1)
/* NOTE(review): the return statements fall in an extraction gap;
 * code above is kept verbatim.
 */
29 static struct gve_tx_pending_packet_dqo *
30 gve_alloc_pending_packet(struct gve_tx_ring *tx)
/* Pop one pending-packet slot off the TX path's private free list.
 * If that list is empty, steal the completion path's entire free list
 * with an atomic exchange (leaving -1, the empty sentinel, behind),
 * then retry. Returns NULL-like failure path is in an extraction gap.
 */
32 struct gve_tx_pending_packet_dqo *pending_packet;
35 index = tx->dqo_tx.free_pending_packets;
37 /* No pending_packets available, try to steal the list from the
40 if (unlikely(index == -1)) {
41 tx->dqo_tx.free_pending_packets =
42 atomic_xchg(&tx->dqo_compl.free_pending_packets, -1);
43 index = tx->dqo_tx.free_pending_packets;
45 if (unlikely(index == -1))
49 pending_packet = &tx->dqo.pending_packets[index];
51 /* Remove pending_packet from free list */
52 tx->dqo_tx.free_pending_packets = pending_packet->next;
/* Newly allocated slots start out waiting for a data completion. */
53 pending_packet->state = GVE_PACKET_STATE_PENDING_DATA_COMPL;
55 return pending_packet;
/* Return a slot to the completion path's lock-free free list by
 * pushing its index at the list head with compare-and-swap.
 */
59 gve_free_pending_packet(struct gve_tx_ring *tx,
60 struct gve_tx_pending_packet_dqo *pending_packet)
/* Index is recovered by pointer arithmetic into the slot array. */
62 s16 index = pending_packet - tx->dqo.pending_packets;
64 pending_packet->state = GVE_PACKET_STATE_UNALLOCATED;
66 s16 old_head = atomic_read_acquire(&tx->dqo_compl.free_pending_packets);
68 pending_packet->next = old_head;
/* cmpxchg succeeds only if the head did not move underneath us;
 * NOTE(review): the retry loop on failure falls in an extraction gap.
 */
69 if (atomic_cmpxchg(&tx->dqo_compl.free_pending_packets,
70 old_head, index) == old_head) {
76 /* gve_tx_free_desc - Cleans up all pending tx requests and buffers.
78 static void gve_tx_clean_pending_packets(struct gve_tx_ring *tx)
/* Walk every pending-packet slot, DMA-unmap each mapped buffer, and
 * release any skb still attached. Presumably buffer 0 is the skb
 * linear area (dma_unmap_single) and the rest are frag pages
 * (dma_unmap_page) -- the selecting branch falls in an extraction gap;
 * confirm against the full source.
 */
82 for (i = 0; i < tx->dqo.num_pending_packets; i++) {
83 struct gve_tx_pending_packet_dqo *cur_state =
84 &tx->dqo.pending_packets[i];
87 for (j = 0; j < cur_state->num_bufs; j++) {
89 dma_unmap_single(tx->dev,
90 dma_unmap_addr(cur_state, dma[j]),
91 dma_unmap_len(cur_state, len[j]),
94 dma_unmap_page(tx->dev,
95 dma_unmap_addr(cur_state, dma[j]),
96 dma_unmap_len(cur_state, len[j]),
/* dev_consume_skb_any: this is a clean teardown, not a drop. */
100 if (cur_state->skb) {
101 dev_consume_skb_any(cur_state->skb);
102 cur_state->skb = NULL;
/* Free all DMA-coherent memory and the pending-packet array for one
 * TX ring, after detaching the ring from its notify block. Each
 * pointer is NULLed after freeing so the function is idempotent.
 */
107 static void gve_tx_free_ring_dqo(struct gve_priv *priv, int idx)
109 struct gve_tx_ring *tx = &priv->tx[idx];
110 struct device *hdev = &priv->pdev->dev;
113 gve_tx_remove_from_block(priv, idx);
115 if (tx->q_resources) {
116 dma_free_coherent(hdev, sizeof(*tx->q_resources),
117 tx->q_resources, tx->q_resources_bus);
118 tx->q_resources = NULL;
121 if (tx->dqo.compl_ring) {
/* Completion ring length is complq_mask + 1 entries. */
122 bytes = sizeof(tx->dqo.compl_ring[0]) *
123 (tx->dqo.complq_mask + 1);
124 dma_free_coherent(hdev, bytes, tx->dqo.compl_ring,
126 tx->dqo.compl_ring = NULL;
129 if (tx->dqo.tx_ring) {
130 bytes = sizeof(tx->dqo.tx_ring[0]) * (tx->mask + 1);
131 dma_free_coherent(hdev, bytes, tx->dqo.tx_ring, tx->bus);
132 tx->dqo.tx_ring = NULL;
/* kvfree handles both kmalloc- and vmalloc-backed allocations. */
135 kvfree(tx->dqo.pending_packets);
136 tx->dqo.pending_packets = NULL;
138 netif_dbg(priv, drv, priv->dev, "freed tx queue %d\n", idx);
/* Allocate one DQO TX ring: the pending-packet tracking array, the
 * descriptor ring, the completion ring, and the queue-resources
 * block. On any allocation failure the error path (partially in an
 * extraction gap) frees everything via gve_tx_free_ring_dqo().
 */
141 static int gve_tx_alloc_ring_dqo(struct gve_priv *priv, int idx)
143 struct gve_tx_ring *tx = &priv->tx[idx];
144 struct device *hdev = &priv->pdev->dev;
145 int num_pending_packets;
149 memset(tx, 0, sizeof(*tx));
151 tx->dev = &priv->pdev->dev;
152 tx->netdev_txq = netdev_get_tx_queue(priv->dev, idx);
153 atomic_set_release(&tx->dqo_compl.hw_tx_head, 0);
155 /* Queue sizes must be a power of 2 */
156 tx->mask = priv->tx_desc_cnt - 1;
157 tx->dqo.complq_mask = priv->options_dqo_rda.tx_comp_ring_entries - 1;
159 /* The max number of pending packets determines the maximum number of
160 * descriptors which maybe written to the completion queue.
162 * We must set the number small enough to make sure we never overrun the
165 num_pending_packets = tx->dqo.complq_mask + 1;
167 /* Reserve space for descriptor completions, which will be reported at
168 * most every GVE_TX_MIN_RE_INTERVAL packets.
170 num_pending_packets -=
171 (tx->dqo.complq_mask + 1) / GVE_TX_MIN_RE_INTERVAL;
173 /* Each packet may have at most 2 buffer completions if it receives both
174 * a miss and reinjection completion.
176 num_pending_packets /= 2;
/* Completion tags are s16, so cap the count at S16_MAX. */
178 tx->dqo.num_pending_packets = min_t(int, num_pending_packets, S16_MAX);
179 tx->dqo.pending_packets = kvcalloc(tx->dqo.num_pending_packets,
180 sizeof(tx->dqo.pending_packets[0]),
182 if (!tx->dqo.pending_packets)
185 /* Set up linked list of pending packets */
186 for (i = 0; i < tx->dqo.num_pending_packets - 1; i++)
187 tx->dqo.pending_packets[i].next = i + 1;
/* Terminate the free list and mark the other lists empty (-1). */
189 tx->dqo.pending_packets[tx->dqo.num_pending_packets - 1].next = -1;
190 atomic_set_release(&tx->dqo_compl.free_pending_packets, -1);
191 tx->dqo_compl.miss_completions.head = -1;
192 tx->dqo_compl.miss_completions.tail = -1;
193 tx->dqo_compl.timed_out_completions.head = -1;
194 tx->dqo_compl.timed_out_completions.tail = -1;
196 bytes = sizeof(tx->dqo.tx_ring[0]) * (tx->mask + 1);
197 tx->dqo.tx_ring = dma_alloc_coherent(hdev, bytes, &tx->bus, GFP_KERNEL);
198 if (!tx->dqo.tx_ring)
201 bytes = sizeof(tx->dqo.compl_ring[0]) * (tx->dqo.complq_mask + 1);
202 tx->dqo.compl_ring = dma_alloc_coherent(hdev, bytes,
205 if (!tx->dqo.compl_ring)
208 tx->q_resources = dma_alloc_coherent(hdev, sizeof(*tx->q_resources),
209 &tx->q_resources_bus, GFP_KERNEL);
210 if (!tx->q_resources)
213 gve_tx_add_to_block(priv, idx);
/* Error path: tear down anything allocated above. */
218 gve_tx_free_ring_dqo(priv, idx);
/* Allocate every configured TX ring; on failure, unwind the rings
 * already allocated (in reverse order) before returning the error.
 */
222 int gve_tx_alloc_rings_dqo(struct gve_priv *priv)
227 for (i = 0; i < priv->tx_cfg.num_queues; i++) {
228 err = gve_tx_alloc_ring_dqo(priv, i);
230 netif_err(priv, drv, priv->dev,
231 "Failed to alloc tx ring=%d: err=%d\n",
/* Unwind: free rings [0, i) that were successfully allocated. */
240 for (i--; i >= 0; i--)
241 gve_tx_free_ring_dqo(priv, i);
/* Free every TX ring: first drain outstanding completions (no NAPI
 * context), reset BQL accounting, clean any still-pending packets,
 * then release the ring's memory.
 */
246 void gve_tx_free_rings_dqo(struct gve_priv *priv)
250 for (i = 0; i < priv->tx_cfg.num_queues; i++) {
251 struct gve_tx_ring *tx = &priv->tx[i];
253 gve_clean_tx_done_dqo(priv, tx, /*napi=*/NULL);
254 netdev_tx_reset_queue(tx->netdev_txq);
255 gve_tx_clean_pending_packets(tx);
257 gve_tx_free_ring_dqo(priv, i);
261 /* Returns the number of slots available in the ring */
262 static u32 num_avail_tx_slots(const struct gve_tx_ring *tx)
/* mask is ring_size - 1 (power of two), so masking the tail-head
 * difference handles index wraparound.
 */
264 u32 num_used = (tx->dqo_tx.tail - tx->dqo_tx.head) & tx->mask;
266 return tx->mask - num_used;
269 /* Stops the queue if available descriptors is less than 'count'.
270 * Return: 0 if stop is not required.
/* Double-checked stop: re-read the HW head (acquire) before stopping,
 * and re-check once more after stopping to close the race with the
 * completion path waking the queue.
 */
272 static int gve_maybe_stop_tx_dqo(struct gve_tx_ring *tx, int count)
274 if (likely(gve_has_pending_packet(tx) &&
275 num_avail_tx_slots(tx) >= count))
278 /* Update cached TX head pointer */
279 tx->dqo_tx.head = atomic_read_acquire(&tx->dqo_compl.hw_tx_head);
281 if (likely(gve_has_pending_packet(tx) &&
282 num_avail_tx_slots(tx) >= count))
285 /* No space, so stop the queue */
287 netif_tx_stop_queue(tx->netdev_txq);
289 /* Sync with restarting queue in `gve_tx_poll_dqo()` */
292 /* After stopping queue, check if we can transmit again in order to
295 tx->dqo_tx.head = atomic_read_acquire(&tx->dqo_compl.hw_tx_head);
297 if (likely(!gve_has_pending_packet(tx) ||
298 num_avail_tx_slots(tx) < count))
/* Space became available after all: restart the queue and proceed. */
301 netif_tx_start_queue(tx->netdev_txq);
/* Build the per-packet metadata block: zeroed, versioned, and (when a
 * hash is present -- the guarding condition is in an extraction gap)
 * carrying a 15-bit path hash folded from skb->hash. Zero is avoided
 * as a path-hash value by inverting it.
 */
306 static void gve_extract_tx_metadata_dqo(const struct sk_buff *skb,
307 struct gve_tx_metadata_dqo *metadata)
309 memset(metadata, 0, sizeof(*metadata));
310 metadata->version = GVE_TX_METADATA_VERSION_DQO;
/* Fold the 32-bit hash into 16 bits, then truncate to 15 bits. */
313 u16 path_hash = skb->hash ^ (skb->hash >> 16);
315 path_hash &= (1 << 15) - 1;
316 if (unlikely(path_hash == 0))
317 path_hash = ~path_hash;
319 metadata->path_hash = path_hash;
/* Write packet (data) descriptors for one mapped buffer, splitting it
 * into chunks of at most GVE_TX_MAX_BUF_SIZE_DQO bytes. Only the
 * chunk that ends the buffer may carry end_of_packet, and only when
 * the caller marked this buffer as the packet's last (eop).
 * NOTE(review): the enclosing loop header and the len/addr advance
 * fall in an extraction gap.
 */
323 static void gve_tx_fill_pkt_desc_dqo(struct gve_tx_ring *tx, u32 *desc_idx,
324 struct sk_buff *skb, u32 len, u64 addr,
325 s16 compl_tag, bool eop, bool is_gso)
327 const bool checksum_offload_en = skb->ip_summed == CHECKSUM_PARTIAL;
330 struct gve_tx_pkt_desc_dqo *desc =
331 &tx->dqo.tx_ring[*desc_idx].pkt;
332 u32 cur_len = min_t(u32, len, GVE_TX_MAX_BUF_SIZE_DQO);
333 bool cur_eop = eop && cur_len == len;
335 *desc = (struct gve_tx_pkt_desc_dqo){
336 .buf_addr = cpu_to_le64(addr),
337 .dtype = GVE_TX_PKT_DESC_DTYPE_DQO,
338 .end_of_packet = cur_eop,
339 .checksum_offload_enable = checksum_offload_en,
340 .compl_tag = cpu_to_le16(compl_tag),
/* Advance the caller's descriptor index, wrapping via the ring mask. */
346 *desc_idx = (*desc_idx + 1) & tx->mask;
350 /* Validates and prepares `skb` for TSO.
352 * Returns header length, or < 0 if invalid.
354 static int gve_prep_tso(struct sk_buff *skb)
361 /* Note: HW requires MSS (gso_size) to be <= 9728 and the total length
362 * of the TSO to be <= 262143.
364 * However, we don't validate these because:
365 * - Hypervisor enforces a limit of 9K MTU
366 * - Kernel will not produce a TSO larger than 64k
/* Reject MSS below the HW minimum. */
369 if (unlikely(skb_shinfo(skb)->gso_size < GVE_TX_MIN_TSO_MSS_DQO))
372 /* Needed because we will modify header. */
373 err = skb_cow_head(skb, 0);
379 /* Remove payload length from checksum. */
380 paylen = skb->len - skb_transport_offset(skb);
/* TCP case: adjust tcp->check so it covers only the pseudo-header.
 * NOTE(review): the case labels and tcp pointer setup fall in an
 * extraction gap.
 */
382 switch (skb_shinfo(skb)->gso_type) {
385 csum_replace_by_diff(&tcp->check,
386 (__force __wsum)htonl(paylen));
388 /* Compute length of segmentation header. */
389 header_len = skb_transport_offset(skb) + tcp_hdrlen(skb);
395 if (unlikely(header_len > GVE_TX_MAX_HDR_SIZE_DQO))
/* Fill the TSO context descriptor: segmentation header length, MSS,
 * total payload length, and the metadata flex bytes. flex1-flex4 are
 * presumably set in lines lost to an extraction gap.
 */
401 static void gve_tx_fill_tso_ctx_desc(struct gve_tx_tso_context_desc_dqo *desc,
402 const struct sk_buff *skb,
403 const struct gve_tx_metadata_dqo *metadata,
406 *desc = (struct gve_tx_tso_context_desc_dqo){
407 .header_len = header_len,
409 .dtype = GVE_TX_TSO_CTX_DESC_DTYPE_DQO,
412 .flex0 = metadata->bytes[0],
413 .flex5 = metadata->bytes[5],
414 .flex6 = metadata->bytes[6],
415 .flex7 = metadata->bytes[7],
416 .flex8 = metadata->bytes[8],
417 .flex9 = metadata->bytes[9],
418 .flex10 = metadata->bytes[10],
419 .flex11 = metadata->bytes[11],
/* Payload length excludes the segmentation header. */
421 desc->tso_total_len = skb->len - header_len;
422 desc->mss = skb_shinfo(skb)->gso_size;
/* Fill the general context descriptor: copy all 12 metadata flex
 * bytes and set the descriptor type.
 */
426 gve_tx_fill_general_ctx_desc(struct gve_tx_general_context_desc_dqo *desc,
427 const struct gve_tx_metadata_dqo *metadata)
429 *desc = (struct gve_tx_general_context_desc_dqo){
430 .flex0 = metadata->bytes[0],
431 .flex1 = metadata->bytes[1],
432 .flex2 = metadata->bytes[2],
433 .flex3 = metadata->bytes[3],
434 .flex4 = metadata->bytes[4],
435 .flex5 = metadata->bytes[5],
436 .flex6 = metadata->bytes[6],
437 .flex7 = metadata->bytes[7],
438 .flex8 = metadata->bytes[8],
439 .flex9 = metadata->bytes[9],
440 .flex10 = metadata->bytes[10],
441 .flex11 = metadata->bytes[11],
442 .cmd_dtype = {.dtype = GVE_TX_GENERAL_CTX_DESC_DTYPE_DQO},
446 /* Returns 0 on success, or < 0 on error.
448 * Before this function is called, the caller must ensure
449 * gve_has_pending_packet(tx) returns true.
/* Main descriptor-writing path for one skb (zero-copy): allocates a
 * pending-packet slot (whose index doubles as the completion tag),
 * writes the optional TSO context and the general context
 * descriptors, DMA-maps the linear area and each frag, emits data
 * descriptors, then commits the new tail and possibly requests a
 * report-event descriptor completion. The DMA error-unwind labels are
 * at the bottom; several lines fall in extraction gaps.
 */
451 static int gve_tx_add_skb_no_copy_dqo(struct gve_tx_ring *tx,
454 const struct skb_shared_info *shinfo = skb_shinfo(skb);
455 const bool is_gso = skb_is_gso(skb);
456 u32 desc_idx = tx->dqo_tx.tail;
458 struct gve_tx_pending_packet_dqo *pkt;
459 struct gve_tx_metadata_dqo metadata;
463 pkt = gve_alloc_pending_packet(tx);
/* The slot's array index is used as the HW completion tag. */
466 completion_tag = pkt - tx->dqo.pending_packets;
468 gve_extract_tx_metadata_dqo(skb, &metadata);
470 int header_len = gve_prep_tso(skb);
472 if (unlikely(header_len < 0))
475 gve_tx_fill_tso_ctx_desc(&tx->dqo.tx_ring[desc_idx].tso_ctx,
476 skb, &metadata, header_len);
477 desc_idx = (desc_idx + 1) & tx->mask;
480 gve_tx_fill_general_ctx_desc(&tx->dqo.tx_ring[desc_idx].general_ctx,
482 desc_idx = (desc_idx + 1) & tx->mask;
484 /* Note: HW requires that the size of a non-TSO packet be within the
485 * range of [17, 9728].
487 * We don't double check because
488 * - We limited `netdev->min_mtu` to ETH_MIN_MTU.
489 * - Hypervisor won't allow MTU larger than 9216.
492 /* Map the linear portion of skb */
494 u32 len = skb_headlen(skb);
497 addr = dma_map_single(tx->dev, skb->data, len, DMA_TO_DEVICE);
498 if (unlikely(dma_mapping_error(tx->dev, addr)))
/* Record mapping for later unmap (normal completion or error path). */
501 dma_unmap_len_set(pkt, len[pkt->num_bufs], len);
502 dma_unmap_addr_set(pkt, dma[pkt->num_bufs], addr);
505 gve_tx_fill_pkt_desc_dqo(tx, &desc_idx, skb, len, addr,
507 /*eop=*/shinfo->nr_frags == 0, is_gso);
510 for (i = 0; i < shinfo->nr_frags; i++) {
511 const skb_frag_t *frag = &shinfo->frags[i];
512 bool is_eop = i == (shinfo->nr_frags - 1);
513 u32 len = skb_frag_size(frag);
516 addr = skb_frag_dma_map(tx->dev, frag, 0, len, DMA_TO_DEVICE);
517 if (unlikely(dma_mapping_error(tx->dev, addr)))
520 dma_unmap_len_set(pkt, len[pkt->num_bufs], len);
521 dma_unmap_addr_set(pkt, dma[pkt->num_bufs], addr);
524 gve_tx_fill_pkt_desc_dqo(tx, &desc_idx, skb, len, addr,
525 completion_tag, is_eop, is_gso);
528 /* Commit the changes to our state */
529 tx->dqo_tx.tail = desc_idx;
531 /* Request a descriptor completion on the last descriptor of the
532 * packet if we are allowed to by the HW enforced interval.
535 u32 last_desc_idx = (desc_idx - 1) & tx->mask;
536 u32 last_report_event_interval =
537 (last_desc_idx - tx->dqo_tx.last_re_idx) & tx->mask;
539 if (unlikely(last_report_event_interval >=
540 GVE_TX_MIN_RE_INTERVAL)) {
541 tx->dqo.tx_ring[last_desc_idx].pkt.report_event = true;
542 tx->dqo_tx.last_re_idx = last_desc_idx;
/* Error path: unmap everything mapped so far, then free the slot.
 * Presumably buffer 0 uses dma_unmap_single and frags use
 * dma_unmap_page -- the selecting branch is in an extraction gap.
 */
549 for (i = 0; i < pkt->num_bufs; i++) {
551 dma_unmap_single(tx->dev,
552 dma_unmap_addr(pkt, dma[i]),
553 dma_unmap_len(pkt, len[i]),
556 dma_unmap_page(tx->dev,
557 dma_unmap_addr(pkt, dma[i]),
558 dma_unmap_len(pkt, len[i]),
565 gve_free_pending_packet(tx, pkt);
/* One data descriptor can address at most GVE_TX_MAX_BUF_SIZE_DQO
 * bytes, so a buffer needs ceil(size / max) descriptors.
 */
570 static int gve_num_descs_per_buf(size_t size)
572 return DIV_ROUND_UP(size, GVE_TX_MAX_BUF_SIZE_DQO);
/* Total data descriptors for an skb: the linear area plus every frag,
 * each rounded up to whole descriptors.
 */
575 static int gve_num_buffer_descs_needed(const struct sk_buff *skb)
577 const struct skb_shared_info *shinfo = skb_shinfo(skb);
581 num_descs = gve_num_descs_per_buf(skb_headlen(skb));
583 for (i = 0; i < shinfo->nr_frags; i++) {
584 unsigned int frag_size = skb_frag_size(&shinfo->frags[i]);
586 num_descs += gve_num_descs_per_buf(frag_size);
592 /* Returns true if HW is capable of sending TSO represented by `skb`.
594 * Each segment must not span more than GVE_TX_MAX_DATA_DESCS buffers.
595 * - The header is counted as one buffer for every single segment.
596 * - A buffer which is split between two segments is counted for both.
597 * - If a buffer contains both header and payload, it is counted as two buffers.
/* Walk the frags simulating segmentation: cur_seg_size is the payload
 * accumulated toward the current MSS-sized segment, cur_seg_num_bufs
 * the buffers it spans. One descriptor is reserved per segment for
 * the header (max_bufs_per_seg = GVE_TX_MAX_DATA_DESCS - 1).
 * NOTE(review): the "return false" inside the loop and final return
 * fall in extraction gaps.
 */
599 static bool gve_can_send_tso(const struct sk_buff *skb)
601 const int header_len = skb_checksum_start_offset(skb) + tcp_hdrlen(skb);
602 const int max_bufs_per_seg = GVE_TX_MAX_DATA_DESCS - 1;
603 const struct skb_shared_info *shinfo = skb_shinfo(skb);
604 const int gso_size = shinfo->gso_size;
605 int cur_seg_num_bufs;
/* Linear-area payload after the header starts the first segment. */
609 cur_seg_size = skb_headlen(skb) - header_len;
610 cur_seg_num_bufs = cur_seg_size > 0;
612 for (i = 0; i < shinfo->nr_frags; i++) {
613 if (cur_seg_size >= gso_size) {
/* A buffer split across a segment boundary counts for both. */
614 cur_seg_size %= gso_size;
615 cur_seg_num_bufs = cur_seg_size > 0;
618 if (unlikely(++cur_seg_num_bufs > max_bufs_per_seg))
621 cur_seg_size += skb_frag_size(&shinfo->frags[i]);
627 /* Attempt to transmit specified SKB.
629 * Returns 0 if the SKB was transmitted or dropped.
630 * Returns -1 if there is not currently enough space to transmit the SKB.
632 static int gve_try_tx_skb(struct gve_priv *priv, struct gve_tx_ring *tx,
635 int num_buffer_descs;
638 if (skb_is_gso(skb)) {
639 /* If TSO doesn't meet HW requirements, attempt to linearize the
/* Linearize failure means the packet is dropped (goto drop path,
 * in an extraction gap), not retried.
 */
642 if (unlikely(!gve_can_send_tso(skb) &&
643 skb_linearize(skb) < 0)) {
644 net_err_ratelimited("%s: Failed to transmit TSO packet\n",
649 num_buffer_descs = gve_num_buffer_descs_needed(skb);
651 num_buffer_descs = gve_num_buffer_descs_needed(skb);
/* Non-TSO: if the frags need too many descriptors, linearize so a
 * single descriptor suffices.
 */
653 if (unlikely(num_buffer_descs > GVE_TX_MAX_DATA_DESCS)) {
654 if (unlikely(skb_linearize(skb) < 0))
657 num_buffer_descs = 1;
661 /* Metadata + (optional TSO) + data descriptors. */
662 total_num_descs = 1 + skb_is_gso(skb) + num_buffer_descs;
663 if (unlikely(gve_maybe_stop_tx_dqo(tx, total_num_descs +
664 GVE_TX_MIN_DESC_PREVENT_CACHE_OVERLAP))) {
668 if (unlikely(gve_tx_add_skb_no_copy_dqo(tx, skb) < 0))
/* BQL accounting and timestamp only after descriptors are queued. */
671 netdev_tx_sent_queue(tx->netdev_txq, skb->len);
672 skb_tx_timestamp(skb);
/* Drop path: consume the skb and report success to the stack. */
677 dev_kfree_skb_any(skb);
681 /* Transmit a given skb and ring the doorbell. */
682 netdev_tx_t gve_tx_dqo(struct sk_buff *skb, struct net_device *dev)
684 struct gve_priv *priv = netdev_priv(dev);
685 struct gve_tx_ring *tx;
687 tx = &priv->tx[skb_get_queue_mapping(skb)];
688 if (unlikely(gve_try_tx_skb(priv, tx, skb) < 0)) {
689 /* We need to ring the txq doorbell -- we have stopped the Tx
690 * queue for want of resources, but prior calls to gve_tx()
691 * may have added descriptors without ringing the doorbell.
693 gve_tx_put_doorbell_dqo(priv, tx->q_resources, tx->dqo_tx.tail);
694 return NETDEV_TX_BUSY;
/* Defer the doorbell when more packets are coming (xmit_more) and the
 * queue is still running; the early return is in an extraction gap.
 */
697 if (!netif_xmit_stopped(tx->netdev_txq) && netdev_xmit_more())
700 gve_tx_put_doorbell_dqo(priv, tx->q_resources, tx->dqo_tx.tail);
/* Append a pending packet at the tail of an index-based doubly-linked
 * list (miss/timed-out tracking). The empty-list head update is in an
 * extraction gap.
 */
704 static void add_to_list(struct gve_tx_ring *tx, struct gve_index_list *list,
705 struct gve_tx_pending_packet_dqo *pending_packet)
709 index = pending_packet - tx->dqo.pending_packets;
710 old_tail = list->tail;
715 tx->dqo.pending_packets[old_tail].next = index;
/* New tail terminates the list (-1) and points back at old tail. */
717 pending_packet->next = -1;
718 pending_packet->prev = old_tail;
/* Unlink a pending packet from an index-based doubly-linked list,
 * fixing up head/tail when it was the first/last element.
 */
721 static void remove_from_list(struct gve_tx_ring *tx,
722 struct gve_index_list *list,
723 struct gve_tx_pending_packet_dqo *pkt)
725 s16 prev_index, next_index;
727 prev_index = pkt->prev;
728 next_index = pkt->next;
730 if (prev_index == -1) {
/* pkt was the head. */
732 list->head = next_index;
734 tx->dqo.pending_packets[prev_index].next = next_index;
736 if (next_index == -1) {
/* pkt was the tail. */
738 list->tail = prev_index;
740 tx->dqo.pending_packets[next_index].prev = prev_index;
/* DMA-unmap all buffers of a completed packet: buffer 0 is the skb
 * linear area (mapped with dma_map_single), buffers 1..n-1 are frag
 * pages.
 */
744 static void gve_unmap_packet(struct device *dev,
745 struct gve_tx_pending_packet_dqo *pkt)
749 /* SKB linear portion is guaranteed to be mapped */
750 dma_unmap_single(dev, dma_unmap_addr(pkt, dma[0]),
751 dma_unmap_len(pkt, len[0]), DMA_TO_DEVICE);
752 for (i = 1; i < pkt->num_bufs; i++) {
753 dma_unmap_page(dev, dma_unmap_addr(pkt, dma[i]),
754 dma_unmap_len(pkt, len[i]), DMA_TO_DEVICE);
759 /* Completion types and expected behavior:
760 * No Miss compl + Packet compl = Packet completed normally.
761 * Miss compl + Re-inject compl = Packet completed normally.
762 * No Miss compl + Re-inject compl = Skipped i.e. packet not completed.
763 * Miss compl + Packet compl = Skipped i.e. packet not completed.
/* Handle a packet or re-injection completion: validate the tag,
 * reconcile it with the pending packet's state machine, then unmap,
 * account bytes/pkts, consume the skb and free the slot. Invalid or
 * out-of-sequence completions are logged and ignored.
 */
765 static void gve_handle_packet_completion(struct gve_priv *priv,
766 struct gve_tx_ring *tx, bool is_napi,
767 u16 compl_tag, u64 *bytes, u64 *pkts,
770 struct gve_tx_pending_packet_dqo *pending_packet;
/* A tag outside the slot array is a HW/driver inconsistency. */
772 if (unlikely(compl_tag >= tx->dqo.num_pending_packets)) {
773 net_err_ratelimited("%s: Invalid TX completion tag: %d\n",
774 priv->dev->name, (int)compl_tag);
778 pending_packet = &tx->dqo.pending_packets[compl_tag];
780 if (unlikely(is_reinjection)) {
781 if (unlikely(pending_packet->state ==
782 GVE_PACKET_STATE_TIMED_OUT_COMPL)) {
783 net_err_ratelimited("%s: Re-injection completion: %d received after timeout.\n",
784 priv->dev->name, (int)compl_tag);
785 /* Packet was already completed as a result of timeout,
786 * so just remove from list and free pending packet.
789 &tx->dqo_compl.timed_out_completions,
791 gve_free_pending_packet(tx, pending_packet);
794 if (unlikely(pending_packet->state !=
795 GVE_PACKET_STATE_PENDING_REINJECT_COMPL)) {
796 /* No outstanding miss completion but packet allocated
797 * implies packet receives a re-injection completion
798 * without a prior miss completion. Return without
799 * completing the packet.
801 net_err_ratelimited("%s: Re-injection completion received without corresponding miss completion: %d\n",
802 priv->dev->name, (int)compl_tag);
/* Valid re-injection: drop it off the miss-tracking list. */
805 remove_from_list(tx, &tx->dqo_compl.miss_completions,
808 /* Packet is allocated but not a pending data completion. */
809 if (unlikely(pending_packet->state !=
810 GVE_PACKET_STATE_PENDING_DATA_COMPL)) {
811 net_err_ratelimited("%s: No pending data completion: %d\n",
812 priv->dev->name, (int)compl_tag);
/* Normal completion: unmap, account, consume skb, free the slot. */
816 gve_unmap_packet(tx->dev, pending_packet);
818 *bytes += pending_packet->skb->len;
820 napi_consume_skb(pending_packet->skb, is_napi);
821 pending_packet->skb = NULL;
822 gve_free_pending_packet(tx, pending_packet);
/* Handle a miss completion: account the packet as sent, move its slot
 * to PENDING_REINJECT_COMPL state, arm a timeout, and track it on the
 * miss_completions list until the re-injection completion arrives.
 */
825 static void gve_handle_miss_completion(struct gve_priv *priv,
826 struct gve_tx_ring *tx, u16 compl_tag,
827 u64 *bytes, u64 *pkts)
829 struct gve_tx_pending_packet_dqo *pending_packet;
831 if (unlikely(compl_tag >= tx->dqo.num_pending_packets)) {
832 net_err_ratelimited("%s: Invalid TX completion tag: %d\n",
833 priv->dev->name, (int)compl_tag);
837 pending_packet = &tx->dqo.pending_packets[compl_tag];
/* A miss completion is only valid while data completion is pending. */
838 if (unlikely(pending_packet->state !=
839 GVE_PACKET_STATE_PENDING_DATA_COMPL)) {
840 net_err_ratelimited("%s: Unexpected packet state: %d for completion tag : %d\n",
841 priv->dev->name, (int)pending_packet->state,
846 pending_packet->state = GVE_PACKET_STATE_PENDING_REINJECT_COMPL;
847 /* jiffies can wraparound but time comparisons can handle overflows. */
848 pending_packet->timeout_jiffies =
850 msecs_to_jiffies(GVE_REINJECT_COMPL_TIMEOUT *
852 add_to_list(tx, &tx->dqo_compl.miss_completions, pending_packet);
854 *bytes += pending_packet->skb->len;
/* Expire miss completions whose re-injection never arrived: free the
 * skb and DMA mappings but keep the slot allocated (so a late valid
 * completion with this tag can still be recognized), parking it on
 * the timed_out_completions list with a deallocation timeout.
 */
858 static void remove_miss_completions(struct gve_priv *priv,
859 struct gve_tx_ring *tx)
861 struct gve_tx_pending_packet_dqo *pending_packet;
864 next_index = tx->dqo_compl.miss_completions.head;
865 while (next_index != -1) {
866 pending_packet = &tx->dqo.pending_packets[next_index];
867 next_index = pending_packet->next;
868 /* Break early because packets should timeout in order. */
869 if (time_is_after_jiffies(pending_packet->timeout_jiffies))
872 remove_from_list(tx, &tx->dqo_compl.miss_completions,
874 /* Unmap buffers and free skb but do not unallocate packet i.e.
875 * the completion tag is not freed to ensure that the driver
876 * can take appropriate action if a corresponding valid
877 * completion is received later.
879 gve_unmap_packet(tx->dev, pending_packet);
880 /* This indicates the packet was dropped. */
881 dev_kfree_skb_any(pending_packet->skb);
882 pending_packet->skb = NULL;
884 net_err_ratelimited("%s: No reinjection completion was received for: %d.\n",
886 (int)(pending_packet - tx->dqo.pending_packets));
888 pending_packet->state = GVE_PACKET_STATE_TIMED_OUT_COMPL;
889 pending_packet->timeout_jiffies =
891 msecs_to_jiffies(GVE_DEALLOCATE_COMPL_TIMEOUT *
893 /* Maintain pending packet in another list so the packet can be
894 * unallocated at a later time.
896 add_to_list(tx, &tx->dqo_compl.timed_out_completions,
/* Finally release slots whose deallocation timeout has expired: by
 * now any late completion for the tag has had its chance, so the
 * completion tag can safely be recycled.
 */
901 static void remove_timed_out_completions(struct gve_priv *priv,
902 struct gve_tx_ring *tx)
904 struct gve_tx_pending_packet_dqo *pending_packet;
907 next_index = tx->dqo_compl.timed_out_completions.head;
908 while (next_index != -1) {
909 pending_packet = &tx->dqo.pending_packets[next_index];
910 next_index = pending_packet->next;
911 /* Break early because packets should timeout in order. */
912 if (time_is_after_jiffies(pending_packet->timeout_jiffies))
915 remove_from_list(tx, &tx->dqo_compl.timed_out_completions,
917 gve_free_pending_packet(tx, pending_packet);
/* Drain the TX completion ring (bounded by the NAPI weight when a
 * napi context is supplied; unbounded for teardown), dispatching each
 * descriptor by type, then update BQL and stats. Returns the number
 * of descriptors cleaned.
 */
921 int gve_clean_tx_done_dqo(struct gve_priv *priv, struct gve_tx_ring *tx,
922 struct napi_struct *napi)
924 u64 reinject_compl_bytes = 0;
925 u64 reinject_compl_pkts = 0;
926 int num_descs_cleaned = 0;
927 u64 miss_compl_bytes = 0;
928 u64 miss_compl_pkts = 0;
929 u64 pkt_compl_bytes = 0;
930 u64 pkt_compl_pkts = 0;
932 /* Limit in order to avoid blocking for too long */
933 while (!napi || pkt_compl_pkts < napi->weight) {
934 struct gve_tx_compl_desc *compl_desc =
935 &tx->dqo.compl_ring[tx->dqo_compl.head];
/* Generation bit mismatch means HW has not written this entry yet. */
938 if (compl_desc->generation == tx->dqo_compl.cur_gen_bit)
941 /* Prefetch the next descriptor. */
942 prefetch(&tx->dqo.compl_ring[(tx->dqo_compl.head + 1) &
943 tx->dqo.complq_mask]);
945 /* Do not read data until we own the descriptor */
947 type = compl_desc->type;
949 if (type == GVE_COMPL_TYPE_DQO_DESC) {
950 /* This is the last descriptor fetched by HW plus one */
951 u16 tx_head = le16_to_cpu(compl_desc->tx_head);
/* Release-publish so gve_maybe_stop_tx_dqo()'s acquire read pairs. */
953 atomic_set_release(&tx->dqo_compl.hw_tx_head, tx_head);
954 } else if (type == GVE_COMPL_TYPE_DQO_PKT) {
955 u16 compl_tag = le16_to_cpu(compl_desc->completion_tag);
957 gve_handle_packet_completion(priv, tx, !!napi,
961 /*is_reinjection=*/false);
962 } else if (type == GVE_COMPL_TYPE_DQO_MISS) {
963 u16 compl_tag = le16_to_cpu(compl_desc->completion_tag);
965 gve_handle_miss_completion(priv, tx, compl_tag,
968 } else if (type == GVE_COMPL_TYPE_DQO_REINJECTION) {
969 u16 compl_tag = le16_to_cpu(compl_desc->completion_tag);
971 gve_handle_packet_completion(priv, tx, !!napi,
973 &reinject_compl_bytes,
974 &reinject_compl_pkts,
975 /*is_reinjection=*/true);
979 (tx->dqo_compl.head + 1) & tx->dqo.complq_mask;
980 /* Flip the generation bit when we wrap around */
981 tx->dqo_compl.cur_gen_bit ^= tx->dqo_compl.head == 0;
/* BQL: packet + miss completions both count as "done" for the stack. */
985 netdev_tx_completed_queue(tx->netdev_txq,
986 pkt_compl_pkts + miss_compl_pkts,
987 pkt_compl_bytes + miss_compl_bytes);
989 remove_miss_completions(priv, tx);
990 remove_timed_out_completions(priv, tx);
992 u64_stats_update_begin(&tx->statss);
993 tx->bytes_done += pkt_compl_bytes + reinject_compl_bytes;
994 tx->pkt_done += pkt_compl_pkts + reinject_compl_pkts;
995 u64_stats_update_end(&tx->statss);
996 return num_descs_cleaned;
/* NAPI poll hook: optionally clean completions, wake the queue if it
 * was stopped and progress was made, and report whether unprocessed
 * completion descriptors remain (generation bit differs from ours).
 */
999 bool gve_tx_poll_dqo(struct gve_notify_block *block, bool do_clean)
1001 struct gve_tx_compl_desc *compl_desc;
1002 struct gve_tx_ring *tx = block->tx;
1003 struct gve_priv *priv = block->priv;
1006 int num_descs_cleaned = gve_clean_tx_done_dqo(priv, tx,
1009 /* Sync with queue being stopped in `gve_maybe_stop_tx_dqo()` */
1012 if (netif_tx_queue_stopped(tx->netdev_txq) &&
1013 num_descs_cleaned > 0) {
1015 netif_tx_wake_queue(tx->netdev_txq);
1019 /* Return true if we still have work. */
1020 compl_desc = &tx->dqo.compl_ring[tx->dqo_compl.head];
1021 return compl_desc->generation != tx->dqo_compl.cur_gen_bit;