// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
/* Copyright (c) 2020 Mellanox Technologies. */

#include <net/dst_metadata.h>
#include <linux/netdevice.h>
#include <linux/list.h>
#include <linux/rculist.h>
#include <linux/rtnetlink.h>
#include <linux/workqueue.h>
#include <linux/spinlock.h>
#include "tc.h"
#include "neigh.h"
#include "en_rep.h"
#include "eswitch.h"
#include "esw/chains.h"
#include "en/tc_ct.h"
#include "en/mapping.h"
#include "en/tc_tun.h"
#include "lib/port_tun.h"
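
/* Bookkeeping for a device whose tc blocks are offloaded indirectly
 * (e.g. a tunnel or vlan device stacked on top of the uplink rep).
 */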
struct mlx5e_rep_indr_block_priv {
	struct net_device *netdev;
	struct mlx5e_rep_priv *rpriv;

	struct list_head list;
};
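
/* Take a reference on the tunnel-entropy setting for the encap reformat type
 * and hook the encap entry onto its neighbour hash entry, creating the neigh
 * entry if this is the first encap towards that neighbour.
 */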
int mlx5e_rep_encap_entry_attach(struct mlx5e_priv *priv,
				 struct mlx5e_encap_entry *e)
{
	struct mlx5e_rep_priv *rpriv = priv->ppriv;
	struct mlx5_rep_uplink_priv *uplink_priv = &rpriv->uplink_priv;
	struct mlx5_tun_entropy *tun_entropy = &uplink_priv->tun_entropy;
	struct mlx5e_neigh_hash_entry *nhe;
	int err;

	err = mlx5_tun_entropy_refcount_inc(tun_entropy, e->reformat_type);
	if (err)
		return err;

	mutex_lock(&rpriv->neigh_update.encap_lock);
	nhe = mlx5e_rep_neigh_entry_lookup(priv, &e->m_neigh);
	if (!nhe) {
		err = mlx5e_rep_neigh_entry_create(priv, e, &nhe);
		if (err) {
			mutex_unlock(&rpriv->neigh_update.encap_lock);
			mlx5_tun_entropy_refcount_dec(tun_entropy,
						      e->reformat_type);
			return err;
		}
	}

	e->nhe = nhe;
	spin_lock(&nhe->encap_list_lock);
	list_add_rcu(&e->encap_list, &nhe->encap_list);
	spin_unlock(&nhe->encap_list_lock);
	mutex_unlock(&rpriv->neigh_update.encap_lock);

	return 0;
}
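
/* Unhook the encap entry from its neighbour entry and drop the references
 * taken at attach time.
 */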
void mlx5e_rep_encap_entry_detach(struct mlx5e_priv *priv,
				  struct mlx5e_encap_entry *e)
{
	struct mlx5e_rep_priv *rpriv = priv->ppriv;
	struct mlx5_rep_uplink_priv *uplink_priv = &rpriv->uplink_priv;
	struct mlx5_tun_entropy *tun_entropy = &uplink_priv->tun_entropy;

	if (!e->nhe)
		return;

	spin_lock(&e->nhe->encap_list_lock);
	list_del_rcu(&e->encap_list);
	spin_unlock(&e->nhe->encap_list_lock);

	mlx5e_rep_neigh_entry_release(e->nhe);
	e->nhe = NULL;
	mlx5_tun_entropy_refcount_dec(tun_entropy, e->reformat_type);
}
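
/* Called on neighbour updates: offload or unoffload all encap flows that use
 * this encap entry, according to whether the neighbour became valid or stale.
 */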
void mlx5e_rep_update_flows(struct mlx5e_priv *priv,
			    struct mlx5e_encap_entry *e,
			    bool neigh_connected,
			    unsigned char ha[ETH_ALEN])
{
	struct ethhdr *eth = (struct ethhdr *)e->encap_header;
	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
	bool encap_connected;
	LIST_HEAD(flow_list);

	ASSERT_RTNL();

	/* wait for encap to be fully initialized */
	wait_for_completion(&e->res_ready);

	mutex_lock(&esw->offloads.encap_tbl_lock);
	encap_connected = !!(e->flags & MLX5_ENCAP_ENTRY_VALID);
	if (e->compl_result < 0 || (encap_connected == neigh_connected &&
				    ether_addr_equal(e->h_dest, ha)))
		goto unlock;

	mlx5e_take_all_encap_flows(e, &flow_list);

	if ((e->flags & MLX5_ENCAP_ENTRY_VALID) &&
	    (!neigh_connected || !ether_addr_equal(e->h_dest, ha)))
		mlx5e_tc_encap_flows_del(priv, e, &flow_list);

	if (neigh_connected && !(e->flags & MLX5_ENCAP_ENTRY_VALID)) {
		ether_addr_copy(e->h_dest, ha);
		ether_addr_copy(eth->h_dest, ha);

		/* Update the encap source mac, in case that we delete
		 * the flows when encap source mac changed.
		 */
		ether_addr_copy(eth->h_source, e->route_dev->dev_addr);

		mlx5e_tc_encap_flows_add(priv, e, &flow_list);
	}
unlock:
	mutex_unlock(&esw->offloads.encap_tbl_lock);
	mlx5e_put_encap_flow_list(priv, &flow_list);
}

static int
mlx5e_rep_setup_tc_cls_flower(struct mlx5e_priv *priv,
			      struct flow_cls_offload *cls_flower, int flags)
{
	switch (cls_flower->command) {
	case FLOW_CLS_REPLACE:
		return mlx5e_configure_flower(priv->netdev, priv, cls_flower,
					      flags);
	case FLOW_CLS_DESTROY:
		return mlx5e_delete_flower(priv->netdev, priv, cls_flower,
					   flags);
	case FLOW_CLS_STATS:
		return mlx5e_stats_flower(priv->netdev, priv, cls_flower,
					  flags);
	default:
		return -EOPNOTSUPP;
	}
}

static
int mlx5e_rep_setup_tc_cls_matchall(struct mlx5e_priv *priv,
				    struct tc_cls_matchall_offload *ma)
{
	switch (ma->command) {
	case TC_CLSMATCHALL_REPLACE:
		return mlx5e_tc_configure_matchall(priv, ma);
	case TC_CLSMATCHALL_DESTROY:
		return mlx5e_tc_delete_matchall(priv, ma);
	case TC_CLSMATCHALL_STATS:
		mlx5e_tc_stats_matchall(priv, ma);
		return 0;
	default:
		return -EOPNOTSUPP;
	}
}

static int mlx5e_rep_setup_tc_cb(enum tc_setup_type type, void *type_data,
				 void *cb_priv)
{
	unsigned long flags = MLX5_TC_FLAG(INGRESS) | MLX5_TC_FLAG(ESW_OFFLOAD);
	struct mlx5e_priv *priv = cb_priv;

	switch (type) {
	case TC_SETUP_CLSFLOWER:
		return mlx5e_rep_setup_tc_cls_flower(priv, type_data, flags);
	case TC_SETUP_CLSMATCHALL:
		return mlx5e_rep_setup_tc_cls_matchall(priv, type_data);
	default:
		return -EOPNOTSUPP;
	}
}

static int mlx5e_rep_setup_ft_cb(enum tc_setup_type type, void *type_data,
				 void *cb_priv)
{
	struct flow_cls_offload tmp, *f = type_data;
	struct mlx5e_priv *priv = cb_priv;
	struct mlx5_eswitch *esw;
	unsigned long flags;
	int err;

	flags = MLX5_TC_FLAG(INGRESS) |
		MLX5_TC_FLAG(ESW_OFFLOAD) |
		MLX5_TC_FLAG(FT_OFFLOAD);
	esw = priv->mdev->priv.eswitch;

	switch (type) {
	case TC_SETUP_CLSFLOWER:
		memcpy(&tmp, f, sizeof(*f));

		if (!mlx5_esw_chains_prios_supported(esw))
			return -EOPNOTSUPP;

		/* Re-use tc offload path by moving the ft flow to the
		 * reserved ft chain.
		 *
		 * FT offload can use prio range [0, INT_MAX], so we normalize
		 * it to range [1, mlx5_esw_chains_get_prio_range(esw)]
		 * as with tc, where prio 0 isn't supported.
		 *
		 * We only support chain 0 of FT offload.
		 */
		if (tmp.common.prio >= mlx5_esw_chains_get_prio_range(esw))
			return -EOPNOTSUPP;
		if (tmp.common.chain_index != 0)
			return -EOPNOTSUPP;

		tmp.common.chain_index = mlx5_esw_chains_get_ft_chain(esw);
		tmp.common.prio++;
		err = mlx5e_rep_setup_tc_cls_flower(priv, &tmp, flags);
		memcpy(&f->stats, &tmp.stats, sizeof(f->stats));
		return err;
	default:
		return -EOPNOTSUPP;
	}
}

static LIST_HEAD(mlx5e_rep_block_tc_cb_list);
static LIST_HEAD(mlx5e_rep_block_ft_cb_list);
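
/* ndo_setup_tc entry point for the representor: binds tc and ft flow blocks
 * to the callbacks above using the simple (non-indirect) setup helper.
 */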
int mlx5e_rep_setup_tc(struct net_device *dev, enum tc_setup_type type,
		       void *type_data)
{
	struct mlx5e_priv *priv = netdev_priv(dev);
	struct flow_block_offload *f = type_data;

	f->unlocked_driver_cb = true;

	switch (type) {
	case TC_SETUP_BLOCK:
		return flow_block_cb_setup_simple(type_data,
						  &mlx5e_rep_block_tc_cb_list,
						  mlx5e_rep_setup_tc_cb,
						  priv, priv, true);
	case TC_SETUP_FT:
		return flow_block_cb_setup_simple(type_data,
						  &mlx5e_rep_block_ft_cb_list,
						  mlx5e_rep_setup_ft_cb,
						  priv, priv, true);
	default:
		return -EOPNOTSUPP;
	}
}

int mlx5e_rep_tc_init(struct mlx5e_rep_priv *rpriv)
{
	struct mlx5_rep_uplink_priv *uplink_priv = &rpriv->uplink_priv;
	int err;

	mutex_init(&uplink_priv->unready_flows_lock);
	INIT_LIST_HEAD(&uplink_priv->unready_flows);

	/* init shared tc flow table */
	err = mlx5e_tc_esw_init(&uplink_priv->tc_ht);
	return err;
}

void mlx5e_rep_tc_cleanup(struct mlx5e_rep_priv *rpriv)
{
	/* delete shared tc flow table */
	mlx5e_tc_esw_cleanup(&rpriv->uplink_priv.tc_ht);
	mutex_destroy(&rpriv->uplink_priv.unready_flows_lock);
}

void mlx5e_rep_tc_enable(struct mlx5e_priv *priv)
{
	struct mlx5e_rep_priv *rpriv = priv->ppriv;

	INIT_WORK(&rpriv->uplink_priv.reoffload_flows_work,
		  mlx5e_tc_reoffload_flows_work);
}

void mlx5e_rep_tc_disable(struct mlx5e_priv *priv)
{
	struct mlx5e_rep_priv *rpriv = priv->ppriv;

	cancel_work_sync(&rpriv->uplink_priv.reoffload_flows_work);
}

int mlx5e_rep_tc_event_port_affinity(struct mlx5e_priv *priv)
{
	struct mlx5e_rep_priv *rpriv = priv->ppriv;

	queue_work(priv->wq, &rpriv->uplink_priv.reoffload_flows_work);

	return NOTIFY_OK;
}
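
/* Indirect tc block offload: the uplink rep registers for tc blocks of other
 * devices (tunnel devices, or vlans on top of the uplink) so their filters
 * can be offloaded to the eswitch as well.
 */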
static struct mlx5e_rep_indr_block_priv *
mlx5e_rep_indr_block_priv_lookup(struct mlx5e_rep_priv *rpriv,
				 struct net_device *netdev)
{
	struct mlx5e_rep_indr_block_priv *cb_priv;

	/* All callback list access should be protected by RTNL. */
	ASSERT_RTNL();

	list_for_each_entry(cb_priv,
			    &rpriv->uplink_priv.tc_indr_block_priv_list,
			    list)
		if (cb_priv->netdev == netdev)
			return cb_priv;

	return NULL;
}

static int
mlx5e_rep_indr_offload(struct net_device *netdev,
		       struct flow_cls_offload *flower,
		       struct mlx5e_rep_indr_block_priv *indr_priv,
		       unsigned long flags)
{
	struct mlx5e_priv *priv = netdev_priv(indr_priv->rpriv->netdev);
	int err = 0;

	switch (flower->command) {
	case FLOW_CLS_REPLACE:
		err = mlx5e_configure_flower(netdev, priv, flower, flags);
		break;
	case FLOW_CLS_DESTROY:
		err = mlx5e_delete_flower(netdev, priv, flower, flags);
		break;
	case FLOW_CLS_STATS:
		err = mlx5e_stats_flower(netdev, priv, flower, flags);
		break;
	default:
		err = -EOPNOTSUPP;
	}

	return err;
}

static int mlx5e_rep_indr_setup_tc_cb(enum tc_setup_type type,
				      void *type_data, void *indr_priv)
{
	unsigned long flags = MLX5_TC_FLAG(EGRESS) | MLX5_TC_FLAG(ESW_OFFLOAD);
	struct mlx5e_rep_indr_block_priv *priv = indr_priv;

	switch (type) {
	case TC_SETUP_CLSFLOWER:
		return mlx5e_rep_indr_offload(priv->netdev, type_data, priv,
					      flags);
	default:
		return -EOPNOTSUPP;
	}
}

static int mlx5e_rep_indr_setup_ft_cb(enum tc_setup_type type,
				      void *type_data, void *indr_priv)
{
	struct mlx5e_rep_indr_block_priv *priv = indr_priv;
	struct flow_cls_offload *f = type_data;
	struct flow_cls_offload tmp;
	struct mlx5e_priv *mpriv;
	struct mlx5_eswitch *esw;
	unsigned long flags;
	int err;

	mpriv = netdev_priv(priv->rpriv->netdev);
	esw = mpriv->mdev->priv.eswitch;

	flags = MLX5_TC_FLAG(EGRESS) |
		MLX5_TC_FLAG(ESW_OFFLOAD) |
		MLX5_TC_FLAG(FT_OFFLOAD);

	switch (type) {
	case TC_SETUP_CLSFLOWER:
		memcpy(&tmp, f, sizeof(*f));

		/* Re-use tc offload path by moving the ft flow to the
		 * reserved ft chain.
		 *
		 * FT offload can use prio range [0, INT_MAX], so we normalize
		 * it to range [1, mlx5_esw_chains_get_prio_range(esw)]
		 * as with tc, where prio 0 isn't supported.
		 *
		 * We only support chain 0 of FT offload.
		 */
		if (!mlx5_esw_chains_prios_supported(esw) ||
		    tmp.common.prio >= mlx5_esw_chains_get_prio_range(esw) ||
		    tmp.common.chain_index)
			return -EOPNOTSUPP;

		tmp.common.chain_index = mlx5_esw_chains_get_ft_chain(esw);
		tmp.common.prio++;
		err = mlx5e_rep_indr_offload(priv->netdev, &tmp, priv, flags);
		memcpy(&f->stats, &tmp.stats, sizeof(f->stats));
		return err;
	default:
		return -EOPNOTSUPP;
	}
}

static void mlx5e_rep_indr_block_unbind(void *cb_priv)
{
	struct mlx5e_rep_indr_block_priv *indr_priv = cb_priv;

	list_del(&indr_priv->list);
	kfree(indr_priv);
}

static LIST_HEAD(mlx5e_block_cb_list);
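
/* Bind or unbind an indirect flow block for a supported device: allocate the
 * per-device priv on BIND and attach the block callback, tear both down on
 * UNBIND.
 */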
static int
mlx5e_rep_indr_setup_block(struct net_device *netdev,
			   struct mlx5e_rep_priv *rpriv,
			   struct flow_block_offload *f,
			   flow_setup_cb_t *setup_cb,
			   void *data,
			   void (*cleanup)(struct flow_block_cb *block_cb))
{
	struct mlx5e_priv *priv = netdev_priv(rpriv->netdev);
	struct mlx5e_rep_indr_block_priv *indr_priv;
	struct flow_block_cb *block_cb;

	if (!mlx5e_tc_tun_device_to_offload(priv, netdev) &&
	    !(is_vlan_dev(netdev) && vlan_dev_real_dev(netdev) == rpriv->netdev))
		return -EOPNOTSUPP;

	if (f->binder_type != FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS)
		return -EOPNOTSUPP;

	f->unlocked_driver_cb = true;
	f->driver_block_list = &mlx5e_block_cb_list;

	switch (f->command) {
	case FLOW_BLOCK_BIND:
		indr_priv = mlx5e_rep_indr_block_priv_lookup(rpriv, netdev);
		if (indr_priv)
			return -EEXIST;

		indr_priv = kmalloc(sizeof(*indr_priv), GFP_KERNEL);
		if (!indr_priv)
			return -ENOMEM;

		indr_priv->netdev = netdev;
		indr_priv->rpriv = rpriv;
		list_add(&indr_priv->list,
			 &rpriv->uplink_priv.tc_indr_block_priv_list);

		block_cb = flow_indr_block_cb_alloc(setup_cb, indr_priv, indr_priv,
						    mlx5e_rep_indr_block_unbind,
						    f, netdev, data, rpriv,
						    cleanup);
		if (IS_ERR(block_cb)) {
			list_del(&indr_priv->list);
			kfree(indr_priv);
			return PTR_ERR(block_cb);
		}
		flow_block_cb_add(block_cb, f);
		list_add_tail(&block_cb->driver_list, &mlx5e_block_cb_list);

		return 0;
	case FLOW_BLOCK_UNBIND:
		indr_priv = mlx5e_rep_indr_block_priv_lookup(rpriv, netdev);
		if (!indr_priv)
			return -ENOENT;

		block_cb = flow_block_cb_lookup(f->block, setup_cb, indr_priv);
		if (!block_cb)
			return -ENOENT;

		flow_indr_block_cb_remove(block_cb, f);
		list_del(&block_cb->driver_list);
		return 0;
	default:
		return -EOPNOTSUPP;
	}
	return 0;
}

int mlx5e_rep_indr_setup_cb(struct net_device *netdev, void *cb_priv,
			    enum tc_setup_type type, void *type_data,
			    void *data,
			    void (*cleanup)(struct flow_block_cb *block_cb))
{
	switch (type) {
	case TC_SETUP_BLOCK:
		return mlx5e_rep_indr_setup_block(netdev, cb_priv, type_data,
						  mlx5e_rep_indr_setup_tc_cb,
						  data, cleanup);
	case TC_SETUP_FT:
		return mlx5e_rep_indr_setup_block(netdev, cb_priv, type_data,
						  mlx5e_rep_indr_setup_ft_cb,
						  data, cleanup);
	default:
		return -EOPNOTSUPP;
	}
}

int mlx5e_rep_tc_netdevice_event_register(struct mlx5e_rep_priv *rpriv)
{
	struct mlx5_rep_uplink_priv *uplink_priv = &rpriv->uplink_priv;

	/* init indirect block notifications */
	INIT_LIST_HEAD(&uplink_priv->tc_indr_block_priv_list);

	return flow_indr_dev_register(mlx5e_rep_indr_setup_cb, rpriv);
}

void mlx5e_rep_tc_netdevice_event_unregister(struct mlx5e_rep_priv *rpriv)
{
	flow_indr_dev_unregister(mlx5e_rep_indr_setup_cb, rpriv,
				 mlx5e_rep_indr_block_unbind);
}
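
/* Restore tunnel metadata on the skb from the tunnel mapping id carried in
 * reg_c1: look up the stored tunnel match key and encap options, rebuild the
 * metadata dst and redirect the skb to the tunnel device.
 */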
#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
static bool mlx5e_restore_tunnel(struct mlx5e_priv *priv, struct sk_buff *skb,
				 struct mlx5e_tc_update_priv *tc_priv,
				 u32 tunnel_id)
{
	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
	struct tunnel_match_enc_opts enc_opts = {};
	struct mlx5_rep_uplink_priv *uplink_priv;
	struct mlx5e_rep_priv *uplink_rpriv;
	struct metadata_dst *tun_dst;
	struct tunnel_match_key key;
	u32 tun_id, enc_opts_id;
	struct net_device *dev;
	int err;

	enc_opts_id = tunnel_id & ENC_OPTS_BITS_MASK;
	tun_id = tunnel_id >> ENC_OPTS_BITS;

	if (!tun_id)
		return true;

	uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH);
	uplink_priv = &uplink_rpriv->uplink_priv;

	err = mapping_find(uplink_priv->tunnel_mapping, tun_id, &key);
	if (err) {
		WARN_ON_ONCE(true);
		netdev_dbg(priv->netdev,
			   "Couldn't find tunnel for tun_id: %d, err: %d\n",
			   tun_id, err);
		return false;
	}

	if (enc_opts_id) {
		err = mapping_find(uplink_priv->tunnel_enc_opts_mapping,
				   enc_opts_id, &enc_opts);
		if (err) {
			netdev_dbg(priv->netdev,
				   "Couldn't find tunnel (opts) for tun_id: %d, err: %d\n",
				   enc_opts_id, err);
			return false;
		}
	}

	tun_dst = tun_rx_dst(enc_opts.key.len);
	if (!tun_dst) {
		WARN_ON_ONCE(true);
		return false;
	}

	ip_tunnel_key_init(&tun_dst->u.tun_info.key,
			   key.enc_ipv4.src, key.enc_ipv4.dst,
			   key.enc_ip.tos, key.enc_ip.ttl,
			   0, /* label */
			   key.enc_tp.src, key.enc_tp.dst,
			   key32_to_tunnel_id(key.enc_key_id.keyid),
			   TUNNEL_KEY);

	if (enc_opts.key.len)
		ip_tunnel_info_opts_set(&tun_dst->u.tun_info,
					enc_opts.key.data,
					enc_opts.key.len,
					enc_opts.key.dst_opt_type);

	skb_dst_set(skb, (struct dst_entry *)tun_dst);
	dev = dev_get_by_index(&init_net, key.filter_ifindex);
	if (!dev) {
		netdev_dbg(priv->netdev,
			   "Couldn't find tunnel device with ifindex: %d\n",
			   key.filter_ifindex);
		return false;
	}

	/* Set tun_dev so we do dev_put() after datapath */
	tc_priv->tun_dev = dev;

	skb->dev = dev;

	return true;
}
#endif /* CONFIG_NET_TC_SKB_EXT */
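
/* On RX, recover the tc state that was stashed in the CQE metadata registers
 * before the packet missed to software: reg_c0 carries the chain tag and
 * reg_c1 the CT tuple id plus the tunnel mapping id.
 */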
bool mlx5e_rep_tc_update_skb(struct mlx5_cqe64 *cqe,
			     struct sk_buff *skb,
			     struct mlx5e_tc_update_priv *tc_priv)
{
#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
	u32 chain = 0, reg_c0, reg_c1, tunnel_id, tuple_id;
	struct mlx5_rep_uplink_priv *uplink_priv;
	struct mlx5e_rep_priv *uplink_rpriv;
	struct tc_skb_ext *tc_skb_ext;
	struct mlx5_eswitch *esw;
	struct mlx5e_priv *priv;
	int tunnel_moffset;
	int err;

	reg_c0 = (be32_to_cpu(cqe->sop_drop_qpn) & MLX5E_TC_FLOW_ID_MASK);
	if (reg_c0 == MLX5_FS_DEFAULT_FLOW_TAG)
		reg_c0 = 0;
	reg_c1 = be32_to_cpu(cqe->ft_metadata);

	if (!reg_c0)
		return true;

	priv = netdev_priv(skb->dev);
	esw = priv->mdev->priv.eswitch;

	err = mlx5_eswitch_get_chain_for_tag(esw, reg_c0, &chain);
	if (err) {
		netdev_dbg(priv->netdev,
			   "Couldn't find chain for chain tag: %d, err: %d\n",
			   reg_c0, err);
		return false;
	}

	if (chain) {
		tc_skb_ext = skb_ext_add(skb, TC_SKB_EXT);
		if (!tc_skb_ext) {
			WARN_ON(1);
			return false;
		}

		tc_skb_ext->chain = chain;

		tuple_id = reg_c1 & TUPLE_ID_MAX;

		uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH);
		uplink_priv = &uplink_rpriv->uplink_priv;
		if (!mlx5e_tc_ct_restore_flow(uplink_priv, skb, tuple_id))
			return false;
	}

	tunnel_moffset = mlx5e_tc_attr_to_reg_mappings[TUNNEL_TO_REG].moffset;
	tunnel_id = reg_c1 >> (8 * tunnel_moffset);
	return mlx5e_restore_tunnel(priv, skb, tc_priv, tunnel_id);
#endif /* CONFIG_NET_TC_SKB_EXT */

	return true;
}

void mlx5_rep_tc_post_napi_receive(struct mlx5e_tc_update_priv *tc_priv)
{
	if (tc_priv->tun_dev)
		dev_put(tc_priv->tun_dev);
}