// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
/* Copyright (c) 2020 Mellanox Technologies. */

#include <net/dst_metadata.h>
#include <linux/netdevice.h>
#include <linux/list.h>
#include <linux/rculist.h>
#include <linux/rtnetlink.h>
#include <linux/workqueue.h>
#include <linux/spinlock.h>
#include "tc.h"
#include "neigh.h"
#include "en_rep.h"
#include "eswitch.h"
#include "lib/fs_chains.h"
#include "en/tc_ct.h"
#include "en/mapping.h"
#include "en/tc_tun.h"
#include "lib/port_tun.h"
#include "esw/sample.h"

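/* Per-device state for indirect TC block offload (tunnel and VLAN devices
 * stacked on the uplink representor). Entries live on
 * uplink_priv.tc_indr_block_priv_list and are looked up under RTNL.
 */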
struct mlx5e_rep_indr_block_priv {
	struct net_device *netdev;
	struct mlx5e_rep_priv *rpriv;

	struct list_head list;
};

int mlx5e_rep_encap_entry_attach(struct mlx5e_priv *priv,
				 struct mlx5e_encap_entry *e,
				 struct mlx5e_neigh *m_neigh,
				 struct net_device *neigh_dev)
{
	struct mlx5e_rep_priv *rpriv = priv->ppriv;
	struct mlx5_rep_uplink_priv *uplink_priv = &rpriv->uplink_priv;
	struct mlx5_tun_entropy *tun_entropy = &uplink_priv->tun_entropy;
	struct mlx5e_neigh_hash_entry *nhe;
	int err;

	err = mlx5_tun_entropy_refcount_inc(tun_entropy, e->reformat_type);
	if (err)
		return err;

	mutex_lock(&rpriv->neigh_update.encap_lock);
	nhe = mlx5e_rep_neigh_entry_lookup(priv, m_neigh);
	if (!nhe) {
		err = mlx5e_rep_neigh_entry_create(priv, m_neigh, neigh_dev, &nhe);
		if (err) {
			mutex_unlock(&rpriv->neigh_update.encap_lock);
			mlx5_tun_entropy_refcount_dec(tun_entropy,
						      e->reformat_type);
			return err;
		}
	}

	e->nhe = nhe;
	spin_lock(&nhe->encap_list_lock);
	list_add_rcu(&e->encap_list, &nhe->encap_list);
	spin_unlock(&nhe->encap_list_lock);
	mutex_unlock(&rpriv->neigh_update.encap_lock);

	return 0;
}

void mlx5e_rep_encap_entry_detach(struct mlx5e_priv *priv,
				  struct mlx5e_encap_entry *e)
{
	struct mlx5e_rep_priv *rpriv = priv->ppriv;
	struct mlx5_rep_uplink_priv *uplink_priv = &rpriv->uplink_priv;
	struct mlx5_tun_entropy *tun_entropy = &uplink_priv->tun_entropy;

	if (!e->nhe)
		return;

	spin_lock(&e->nhe->encap_list_lock);
	list_del_rcu(&e->encap_list);
	spin_unlock(&e->nhe->encap_list_lock);

	mlx5e_rep_neigh_entry_release(e->nhe);
	e->nhe = NULL;
	mlx5_tun_entropy_refcount_dec(tun_entropy, e->reformat_type);
}

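/* Called when the neighbour backing an encap entry changes state: offload
 * the tunnel flows when the neighbour becomes valid (or its MAC changes),
 * and un-offload them when it goes away.
 */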
void mlx5e_rep_update_flows(struct mlx5e_priv *priv,
			    struct mlx5e_encap_entry *e,
			    bool neigh_connected,
			    unsigned char ha[ETH_ALEN])
{
	struct ethhdr *eth = (struct ethhdr *)e->encap_header;
	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
	bool encap_connected;
	LIST_HEAD(flow_list);

	ASSERT_RTNL();

	/* wait for encap to be fully initialized */
	wait_for_completion(&e->res_ready);

	mutex_lock(&esw->offloads.encap_tbl_lock);
	encap_connected = !!(e->flags & MLX5_ENCAP_ENTRY_VALID);
	if (e->compl_result < 0 || (encap_connected == neigh_connected &&
				    ether_addr_equal(e->h_dest, ha)))
		goto unlock;

	mlx5e_take_all_encap_flows(e, &flow_list);

	if ((e->flags & MLX5_ENCAP_ENTRY_VALID) &&
	    (!neigh_connected || !ether_addr_equal(e->h_dest, ha)))
		mlx5e_tc_encap_flows_del(priv, e, &flow_list);

	if (neigh_connected && !(e->flags & MLX5_ENCAP_ENTRY_VALID)) {
		struct net_device *route_dev;

		ether_addr_copy(e->h_dest, ha);
		ether_addr_copy(eth->h_dest, ha);

		/* Update the encap source mac, in case that we delete
		 * the flows when encap source mac changed.
		 */
		route_dev = __dev_get_by_index(dev_net(priv->netdev), e->route_dev_ifindex);
		if (route_dev)
			ether_addr_copy(eth->h_source, route_dev->dev_addr);

		mlx5e_tc_encap_flows_add(priv, e, &flow_list);
	}
unlock:
	mutex_unlock(&esw->offloads.encap_tbl_lock);
	mlx5e_put_flow_list(priv, &flow_list);
}

static int
mlx5e_rep_setup_tc_cls_flower(struct mlx5e_priv *priv,
			      struct flow_cls_offload *cls_flower, int flags)
{
	switch (cls_flower->command) {
	case FLOW_CLS_REPLACE:
		return mlx5e_configure_flower(priv->netdev, priv, cls_flower,
					      flags);
	case FLOW_CLS_DESTROY:
		return mlx5e_delete_flower(priv->netdev, priv, cls_flower,
					   flags);
	case FLOW_CLS_STATS:
		return mlx5e_stats_flower(priv->netdev, priv, cls_flower,
					  flags);
	default:
		return -EOPNOTSUPP;
	}
}

static
int mlx5e_rep_setup_tc_cls_matchall(struct mlx5e_priv *priv,
				    struct tc_cls_matchall_offload *ma)
{
	switch (ma->command) {
	case TC_CLSMATCHALL_REPLACE:
		return mlx5e_tc_configure_matchall(priv, ma);
	case TC_CLSMATCHALL_DESTROY:
		return mlx5e_tc_delete_matchall(priv, ma);
	case TC_CLSMATCHALL_STATS:
		mlx5e_tc_stats_matchall(priv, ma);
		return 0;
	default:
		return -EOPNOTSUPP;
	}
}

static int mlx5e_rep_setup_tc_cb(enum tc_setup_type type, void *type_data,
				 void *cb_priv)
{
	unsigned long flags = MLX5_TC_FLAG(INGRESS) | MLX5_TC_FLAG(ESW_OFFLOAD);
	struct mlx5e_priv *priv = cb_priv;

	if (!priv->netdev || !netif_device_present(priv->netdev))
		return -EOPNOTSUPP;

	switch (type) {
	case TC_SETUP_CLSFLOWER:
		return mlx5e_rep_setup_tc_cls_flower(priv, type_data, flags);
	case TC_SETUP_CLSMATCHALL:
		return mlx5e_rep_setup_tc_cls_matchall(priv, type_data);
	default:
		return -EOPNOTSUPP;
	}
}

static int mlx5e_rep_setup_ft_cb(enum tc_setup_type type, void *type_data,
				 void *cb_priv)
{
	struct flow_cls_offload tmp, *f = type_data;
	struct mlx5e_priv *priv = cb_priv;
	struct mlx5_eswitch *esw;
	unsigned long flags;
	int err;

	flags = MLX5_TC_FLAG(INGRESS) |
		MLX5_TC_FLAG(ESW_OFFLOAD) |
		MLX5_TC_FLAG(FT_OFFLOAD);
	esw = priv->mdev->priv.eswitch;

	switch (type) {
	case TC_SETUP_CLSFLOWER:
		memcpy(&tmp, f, sizeof(*f));

		if (!mlx5_chains_prios_supported(esw_chains(esw)))
			return -EOPNOTSUPP;

		/* Re-use tc offload path by moving the ft flow to the
		 * reserved ft chain.
		 *
		 * FT offload can use prio range [0, INT_MAX], so we normalize
		 * it to range [1, mlx5_esw_chains_get_prio_range(esw)]
		 * as with tc, where prio 0 isn't supported.
		 *
		 * We only support chain 0 of FT offload.
		 */
		if (tmp.common.prio >= mlx5_chains_get_prio_range(esw_chains(esw)))
			return -EOPNOTSUPP;
		if (tmp.common.chain_index != 0)
			return -EOPNOTSUPP;

		tmp.common.chain_index = mlx5_chains_get_nf_ft_chain(esw_chains(esw));
		tmp.common.prio++;
		err = mlx5e_rep_setup_tc_cls_flower(priv, &tmp, flags);
		memcpy(&f->stats, &tmp.stats, sizeof(f->stats));
		return err;
	default:
		return -EOPNOTSUPP;
	}
}

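/* Illustrative example of the normalization above: an FT rule submitted at
 * chain 0, prio 0 is replayed through the tc offload path as chain
 * mlx5_chains_get_nf_ft_chain(), prio 1, since tc reserves prio 0 as
 * invalid. Stats are memcpy()ed back because the handler operated on the
 * local copy 'tmp' rather than on the caller's request 'f'.
 */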
static LIST_HEAD(mlx5e_rep_block_tc_cb_list);
static LIST_HEAD(mlx5e_rep_block_ft_cb_list);

int mlx5e_rep_setup_tc(struct net_device *dev, enum tc_setup_type type,
		       void *type_data)
{
	struct mlx5e_priv *priv = netdev_priv(dev);
	struct flow_block_offload *f = type_data;

	f->unlocked_driver_cb = true;

	switch (type) {
	case TC_SETUP_BLOCK:
		return flow_block_cb_setup_simple(type_data,
						  &mlx5e_rep_block_tc_cb_list,
						  mlx5e_rep_setup_tc_cb,
						  priv, priv, true);
	case TC_SETUP_FT:
		return flow_block_cb_setup_simple(type_data,
						  &mlx5e_rep_block_ft_cb_list,
						  mlx5e_rep_setup_ft_cb,
						  priv, priv, true);
	default:
		return -EOPNOTSUPP;
	}
}

int mlx5e_rep_tc_init(struct mlx5e_rep_priv *rpriv)
{
	struct mlx5_rep_uplink_priv *uplink_priv = &rpriv->uplink_priv;
	int err;

	mutex_init(&uplink_priv->unready_flows_lock);
	INIT_LIST_HEAD(&uplink_priv->unready_flows);

	/* init shared tc flow table */
	err = mlx5e_tc_esw_init(&uplink_priv->tc_ht);
	return err;
}

void mlx5e_rep_tc_cleanup(struct mlx5e_rep_priv *rpriv)
{
	/* delete shared tc flow table */
	mlx5e_tc_esw_cleanup(&rpriv->uplink_priv.tc_ht);
	mutex_destroy(&rpriv->uplink_priv.unready_flows_lock);
}

void mlx5e_rep_tc_enable(struct mlx5e_priv *priv)
{
	struct mlx5e_rep_priv *rpriv = priv->ppriv;

	INIT_WORK(&rpriv->uplink_priv.reoffload_flows_work,
		  mlx5e_tc_reoffload_flows_work);
}

void mlx5e_rep_tc_disable(struct mlx5e_priv *priv)
{
	struct mlx5e_rep_priv *rpriv = priv->ppriv;

	cancel_work_sync(&rpriv->uplink_priv.reoffload_flows_work);
}

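/* Port-affinity events may require re-offloading all flows; the handler
 * below only queues reoffload_flows_work (set up in mlx5e_rep_tc_enable()
 * above), deferring the actual re-offload to workqueue context instead of
 * doing it inline in the event path.
 */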
int mlx5e_rep_tc_event_port_affinity(struct mlx5e_priv *priv)
{
	struct mlx5e_rep_priv *rpriv = priv->ppriv;

	queue_work(priv->wq, &rpriv->uplink_priv.reoffload_flows_work);

	return NOTIFY_OK;
}

static struct mlx5e_rep_indr_block_priv *
mlx5e_rep_indr_block_priv_lookup(struct mlx5e_rep_priv *rpriv,
				 struct net_device *netdev)
{
	struct mlx5e_rep_indr_block_priv *cb_priv;

	/* All callback list access should be protected by RTNL. */
	ASSERT_RTNL();

	list_for_each_entry(cb_priv,
			    &rpriv->uplink_priv.tc_indr_block_priv_list,
			    list)
		if (cb_priv->netdev == netdev)
			return cb_priv;

	return NULL;
}

static int
mlx5e_rep_indr_offload(struct net_device *netdev,
		       struct flow_cls_offload *flower,
		       struct mlx5e_rep_indr_block_priv *indr_priv,
		       unsigned long flags)
{
	struct mlx5e_priv *priv = netdev_priv(indr_priv->rpriv->netdev);
	int err = 0;

	if (!netif_device_present(indr_priv->rpriv->netdev))
		return -EOPNOTSUPP;

	switch (flower->command) {
	case FLOW_CLS_REPLACE:
		err = mlx5e_configure_flower(netdev, priv, flower, flags);
		break;
	case FLOW_CLS_DESTROY:
		err = mlx5e_delete_flower(netdev, priv, flower, flags);
		break;
	case FLOW_CLS_STATS:
		err = mlx5e_stats_flower(netdev, priv, flower, flags);
		break;
	default:
		err = -EOPNOTSUPP;
	}

	return err;
}

static int mlx5e_rep_indr_setup_tc_cb(enum tc_setup_type type,
				      void *type_data, void *indr_priv)
{
	unsigned long flags = MLX5_TC_FLAG(EGRESS) | MLX5_TC_FLAG(ESW_OFFLOAD);
	struct mlx5e_rep_indr_block_priv *priv = indr_priv;

	switch (type) {
	case TC_SETUP_CLSFLOWER:
		return mlx5e_rep_indr_offload(priv->netdev, type_data, priv,
					      flags);
	default:
		return -EOPNOTSUPP;
	}
}

static int mlx5e_rep_indr_setup_ft_cb(enum tc_setup_type type,
				      void *type_data, void *indr_priv)
{
	struct mlx5e_rep_indr_block_priv *priv = indr_priv;
	struct flow_cls_offload *f = type_data;
	struct flow_cls_offload tmp;
	struct mlx5e_priv *mpriv;
	struct mlx5_eswitch *esw;
	unsigned long flags;
	int err;

	mpriv = netdev_priv(priv->rpriv->netdev);
	esw = mpriv->mdev->priv.eswitch;

	flags = MLX5_TC_FLAG(EGRESS) |
		MLX5_TC_FLAG(ESW_OFFLOAD) |
		MLX5_TC_FLAG(FT_OFFLOAD);

	switch (type) {
	case TC_SETUP_CLSFLOWER:
		memcpy(&tmp, f, sizeof(*f));

		/* Re-use tc offload path by moving the ft flow to the
		 * reserved ft chain.
		 *
		 * FT offload can use prio range [0, INT_MAX], so we normalize
		 * it to range [1, mlx5_esw_chains_get_prio_range(esw)]
		 * as with tc, where prio 0 isn't supported.
		 *
		 * We only support chain 0 of FT offload.
		 */
		if (!mlx5_chains_prios_supported(esw_chains(esw)) ||
		    tmp.common.prio >= mlx5_chains_get_prio_range(esw_chains(esw)) ||
		    tmp.common.chain_index)
			return -EOPNOTSUPP;

		tmp.common.chain_index = mlx5_chains_get_nf_ft_chain(esw_chains(esw));
		tmp.common.prio++;
		err = mlx5e_rep_indr_offload(priv->netdev, &tmp, priv, flags);
		memcpy(&f->stats, &tmp.stats, sizeof(f->stats));
		return err;
	default:
		return -EOPNOTSUPP;
	}
}

static void mlx5e_rep_indr_block_unbind(void *cb_priv)
{
	struct mlx5e_rep_indr_block_priv *indr_priv = cb_priv;

	list_del(&indr_priv->list);
	kfree(indr_priv);
}

static LIST_HEAD(mlx5e_block_cb_list);

static int
mlx5e_rep_indr_setup_block(struct net_device *netdev, struct Qdisc *sch,
			   struct mlx5e_rep_priv *rpriv,
			   struct flow_block_offload *f,
			   flow_setup_cb_t *setup_cb,
			   void *data,
			   void (*cleanup)(struct flow_block_cb *block_cb))
{
	struct mlx5e_priv *priv = netdev_priv(rpriv->netdev);
	struct mlx5e_rep_indr_block_priv *indr_priv;
	struct flow_block_cb *block_cb;

	if (!mlx5e_tc_tun_device_to_offload(priv, netdev) &&
	    !(is_vlan_dev(netdev) && vlan_dev_real_dev(netdev) == rpriv->netdev))
		return -EOPNOTSUPP;

	if (f->binder_type != FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS)
		return -EOPNOTSUPP;

	f->unlocked_driver_cb = true;
	f->driver_block_list = &mlx5e_block_cb_list;

	switch (f->command) {
	case FLOW_BLOCK_BIND:
		indr_priv = mlx5e_rep_indr_block_priv_lookup(rpriv, netdev);
		if (indr_priv)
			return -EEXIST;

		indr_priv = kmalloc(sizeof(*indr_priv), GFP_KERNEL);
		if (!indr_priv)
			return -ENOMEM;

		indr_priv->netdev = netdev;
		indr_priv->rpriv = rpriv;
		list_add(&indr_priv->list,
			 &rpriv->uplink_priv.tc_indr_block_priv_list);

		block_cb = flow_indr_block_cb_alloc(setup_cb, indr_priv, indr_priv,
						    mlx5e_rep_indr_block_unbind,
						    f, netdev, sch, data, rpriv,
						    cleanup);
		if (IS_ERR(block_cb)) {
			list_del(&indr_priv->list);
			kfree(indr_priv);
			return PTR_ERR(block_cb);
		}
		flow_block_cb_add(block_cb, f);
		list_add_tail(&block_cb->driver_list, &mlx5e_block_cb_list);

		return 0;
	case FLOW_BLOCK_UNBIND:
		indr_priv = mlx5e_rep_indr_block_priv_lookup(rpriv, netdev);
		if (!indr_priv)
			return -ENOENT;

		block_cb = flow_block_cb_lookup(f->block, setup_cb, indr_priv);
		if (!block_cb)
			return -ENOENT;

		flow_indr_block_cb_remove(block_cb, f);
		list_del(&block_cb->driver_list);
		return 0;
	default:
		return -EOPNOTSUPP;
	}
}

int mlx5e_rep_indr_setup_cb(struct net_device *netdev, struct Qdisc *sch, void *cb_priv,
			    enum tc_setup_type type, void *type_data,
			    void *data,
			    void (*cleanup)(struct flow_block_cb *block_cb))
{
	switch (type) {
	case TC_SETUP_BLOCK:
		return mlx5e_rep_indr_setup_block(netdev, sch, cb_priv, type_data,
						  mlx5e_rep_indr_setup_tc_cb,
						  data, cleanup);
	case TC_SETUP_FT:
		return mlx5e_rep_indr_setup_block(netdev, sch, cb_priv, type_data,
						  mlx5e_rep_indr_setup_ft_cb,
						  data, cleanup);
	default:
		return -EOPNOTSUPP;
	}
}

int mlx5e_rep_tc_netdevice_event_register(struct mlx5e_rep_priv *rpriv)
{
	struct mlx5_rep_uplink_priv *uplink_priv = &rpriv->uplink_priv;

	/* init indirect block notifications */
	INIT_LIST_HEAD(&uplink_priv->tc_indr_block_priv_list);

	return flow_indr_dev_register(mlx5e_rep_indr_setup_cb, rpriv);
}

void mlx5e_rep_tc_netdevice_event_unregister(struct mlx5e_rep_priv *rpriv)
{
	flow_indr_dev_unregister(mlx5e_rep_indr_setup_cb, rpriv,
				 mlx5e_rep_indr_block_unbind);
}

#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
static bool mlx5e_restore_tunnel(struct mlx5e_priv *priv, struct sk_buff *skb,
				 struct mlx5e_tc_update_priv *tc_priv,
				 u32 tunnel_id)
{
	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
	struct tunnel_match_enc_opts enc_opts = {};
	struct mlx5_rep_uplink_priv *uplink_priv;
	struct mlx5e_rep_priv *uplink_rpriv;
	struct metadata_dst *tun_dst;
	struct tunnel_match_key key;
	u32 tun_id, enc_opts_id;
	struct net_device *dev;
	int err;

	enc_opts_id = tunnel_id & ENC_OPTS_BITS_MASK;
	tun_id = tunnel_id >> ENC_OPTS_BITS;
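
	/* tunnel_id packs two mapping-table ids; a sketch of the layout used
	 * here:
	 *
	 *	tunnel_id = (tun_id << ENC_OPTS_BITS) | enc_opts_id
	 *
	 * tun_id == 0 means there is no tunnel metadata to restore, and
	 * enc_opts_id == 0 means the tunnel was matched without options.
	 */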

	if (!tun_id)
		return true;

	uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH);
	uplink_priv = &uplink_rpriv->uplink_priv;

	err = mapping_find(uplink_priv->tunnel_mapping, tun_id, &key);
	if (err) {
		netdev_dbg(priv->netdev,
			   "Couldn't find tunnel for tun_id: %d, err: %d\n",
			   tun_id, err);
		return false;
	}

	if (enc_opts_id) {
		err = mapping_find(uplink_priv->tunnel_enc_opts_mapping,
				   enc_opts_id, &enc_opts);
		if (err) {
			netdev_dbg(priv->netdev,
				   "Couldn't find tunnel (opts) for tun_id: %d, err: %d\n",
				   enc_opts_id, err);
			return false;
		}
	}

	if (key.enc_control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
		tun_dst = __ip_tun_set_dst(key.enc_ipv4.src, key.enc_ipv4.dst,
					   key.enc_ip.tos, key.enc_ip.ttl,
					   key.enc_tp.dst, TUNNEL_KEY,
					   key32_to_tunnel_id(key.enc_key_id.keyid),
					   enc_opts.key.len);
	} else if (key.enc_control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
		tun_dst = __ipv6_tun_set_dst(&key.enc_ipv6.src, &key.enc_ipv6.dst,
					     key.enc_ip.tos, key.enc_ip.ttl,
					     key.enc_tp.dst, 0, TUNNEL_KEY,
					     key32_to_tunnel_id(key.enc_key_id.keyid),
					     enc_opts.key.len);
	} else {
		netdev_dbg(priv->netdev,
			   "Couldn't restore tunnel, unsupported addr_type: %d\n",
			   key.enc_control.addr_type);
		return false;
	}

	if (!tun_dst) {
		netdev_dbg(priv->netdev, "Couldn't restore tunnel, no tun_dst\n");
		return false;
	}

	tun_dst->u.tun_info.key.tp_src = key.enc_tp.src;

	if (enc_opts.key.len)
		ip_tunnel_info_opts_set(&tun_dst->u.tun_info,
					enc_opts.key.data,
					enc_opts.key.len,
					enc_opts.key.dst_opt_type);

	skb_dst_set(skb, (struct dst_entry *)tun_dst);
	dev = dev_get_by_index(&init_net, key.filter_ifindex);
	if (!dev) {
		netdev_dbg(priv->netdev,
			   "Couldn't find tunnel device with ifindex: %d\n",
			   key.filter_ifindex);
		return false;
	}

	/* Set tun_dev so we do dev_put() after datapath */
	tc_priv->tun_dev = dev;

	skb->dev = dev;

	return true;
}

static bool mlx5e_restore_skb(struct sk_buff *skb, u32 chain, u32 reg_c1,
			      struct mlx5e_tc_update_priv *tc_priv)
{
	struct mlx5e_priv *priv = netdev_priv(skb->dev);
	u32 tunnel_id = reg_c1 >> ESW_TUN_OFFSET;

	if (chain) {
		struct mlx5_rep_uplink_priv *uplink_priv;
		struct mlx5e_rep_priv *uplink_rpriv;
		struct tc_skb_ext *tc_skb_ext;
		struct mlx5_eswitch *esw;
		u32 zone_restore_id;

		tc_skb_ext = tc_skb_ext_alloc(skb);
		if (!tc_skb_ext) {
			WARN_ON(1);
			return false;
		}
		tc_skb_ext->chain = chain;
		zone_restore_id = reg_c1 & ESW_ZONE_ID_MASK;
		esw = priv->mdev->priv.eswitch;
		uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH);
		uplink_priv = &uplink_rpriv->uplink_priv;
		if (!mlx5e_tc_ct_restore_flow(uplink_priv->ct_priv, skb,
					      zone_restore_id))
			return false;
	}

	return mlx5e_restore_tunnel(priv, skb, tc_priv, tunnel_id);
}
#endif /* CONFIG_NET_TC_SKB_EXT */

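/* reg_c0 and reg_c1 are eswitch metadata registers that survive into the
 * CQE; a sketch of how they are consumed below:
 *
 *	reg_c0: id of a mapped object (tc chain or sample), resolved via
 *		esw->offloads.reg_c0_obj_pool
 *	reg_c1: tunnel mapping id in the bits above ESW_TUN_OFFSET, ct zone
 *		restore id in the ESW_ZONE_ID_MASK bits
 */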
bool mlx5e_rep_tc_update_skb(struct mlx5_cqe64 *cqe,
			     struct sk_buff *skb,
			     struct mlx5e_tc_update_priv *tc_priv)
{
	struct mlx5_mapped_obj mapped_obj;
	struct mlx5_eswitch *esw;
	struct mlx5e_priv *priv;
	u32 reg_c0, reg_c1;
	int err;

	reg_c0 = (be32_to_cpu(cqe->sop_drop_qpn) & MLX5E_TC_FLOW_ID_MASK);
	if (!reg_c0 || reg_c0 == MLX5_FS_DEFAULT_FLOW_TAG)
		return true;

	/* If reg_c0 is not equal to the default flow tag then skb->mark
	 * is not supported and must be reset back to 0.
	 */
	skb->mark = 0;

	reg_c1 = be32_to_cpu(cqe->ft_metadata);

	priv = netdev_priv(skb->dev);
	esw = priv->mdev->priv.eswitch;
	err = mapping_find(esw->offloads.reg_c0_obj_pool, reg_c0, &mapped_obj);
	if (err) {
		netdev_dbg(priv->netdev,
			   "Couldn't find mapped object for reg_c0: %d, err: %d\n",
			   reg_c0, err);
		return false;
	}

#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
	if (mapped_obj.type == MLX5_MAPPED_OBJ_CHAIN)
		return mlx5e_restore_skb(skb, mapped_obj.chain, reg_c1, tc_priv);
#endif /* CONFIG_NET_TC_SKB_EXT */
#if IS_ENABLED(CONFIG_MLX5_TC_SAMPLE)
	if (mapped_obj.type == MLX5_MAPPED_OBJ_SAMPLE) {
		mlx5_esw_sample_skb(skb, &mapped_obj);
		return false;
	}
#endif /* CONFIG_MLX5_TC_SAMPLE */
	if (mapped_obj.type != MLX5_MAPPED_OBJ_SAMPLE &&
	    mapped_obj.type != MLX5_MAPPED_OBJ_CHAIN) {
		netdev_dbg(priv->netdev, "Invalid mapped object type: %d\n", mapped_obj.type);
		return false;
	}

	return true;
}

void mlx5_rep_tc_post_napi_receive(struct mlx5e_tc_update_priv *tc_priv)
{
	if (tc_priv->tun_dev)
		dev_put(tc_priv->tun_dev);
}