1 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
2 // Copyright (c) 2020 Mellanox Technologies.
4 #include <linux/mlx5/driver.h>
5 #include <linux/mlx5/mlx5_ifc.h>
6 #include <linux/mlx5/fs.h>
8 #include "eswitch_offloads_chains.h"
/* Accessors for the per-eswitch chains state, which hangs off the
 * offloads FDB table in struct mlx5_eswitch.
 */
#define esw_chains_priv(esw) ((esw)->fdb_table.offloads.esw_chains_priv)
#define esw_chains_lock(esw) (esw_chains_priv(esw)->lock)
#define esw_chains_ht(esw) (esw_chains_priv(esw)->chains_ht)
#define esw_prios_ht(esw) (esw_chains_priv(esw)->prios_ht)
#define fdb_pool_left(esw) (esw_chains_priv(esw)->fdb_left)
#define tc_slow_fdb(esw) ((esw)->fdb_table.offloads.slow_fdb)
#define tc_end_fdb(esw) (esw_chains_priv(esw)->tc_end_fdb)
/* True when FW lets a rule forward to a table at any level (not just +1). */
#define fdb_ignore_flow_level_supported(esw) \
	(MLX5_CAP_ESW_FLOWTABLE_FDB((esw)->dev, ignore_flow_level))

/* Max autogroups per created FDB table (plus one reserved miss group). */
#define ESW_OFFLOADS_NUM_GROUPS  4
/* Firmware currently has 4 pool of 4 sizes that it supports (ESW_POOLS),
 * and a virtual memory region of 16M (ESW_SIZE), this region is duplicated
 * for each flow table pool. We can allocate up to 16M of each pool,
 * and we keep track of how much we used via get_next_avail_sz_from_pool.
 * Firmware doesn't report any of this for now.
 * ESW_POOL is expected to be sorted from large to small and match firmware
 * pools.
 */
#define ESW_SIZE (16 * 1024 * 1024)
static const unsigned int ESW_POOLS[] = { 4 * 1024 * 1024,
					  1 * 1024 * 1024,
					  64 * 1024,
					  128 };
40 struct mlx5_esw_chains_priv {
41 struct rhashtable chains_ht;
42 struct rhashtable prios_ht;
43 /* Protects above chains_ht and prios_ht */
46 struct mlx5_flow_table *tc_end_fdb;
48 int fdb_left[ARRAY_SIZE(ESW_POOLS)];
52 struct rhash_head node;
58 struct mlx5_eswitch *esw;
59 struct list_head prios_list;
69 struct rhash_head node;
70 struct list_head list;
72 struct fdb_prio_key key;
76 struct fdb_chain *fdb_chain;
77 struct mlx5_flow_table *fdb;
78 struct mlx5_flow_table *next_fdb;
79 struct mlx5_flow_group *miss_group;
80 struct mlx5_flow_handle *miss_rule;
83 static const struct rhashtable_params chain_params = {
84 .head_offset = offsetof(struct fdb_chain, node),
85 .key_offset = offsetof(struct fdb_chain, chain),
86 .key_len = sizeof_field(struct fdb_chain, chain),
87 .automatic_shrinking = true,
90 static const struct rhashtable_params prio_params = {
91 .head_offset = offsetof(struct fdb_prio, node),
92 .key_offset = offsetof(struct fdb_prio, key),
93 .key_len = sizeof_field(struct fdb_prio, key),
94 .automatic_shrinking = true,
97 bool mlx5_esw_chains_prios_supported(struct mlx5_eswitch *esw)
99 return esw->fdb_table.flags & ESW_FDB_CHAINS_AND_PRIOS_SUPPORTED;
102 u32 mlx5_esw_chains_get_chain_range(struct mlx5_eswitch *esw)
104 if (!mlx5_esw_chains_prios_supported(esw))
107 if (fdb_ignore_flow_level_supported(esw))
110 return FDB_TC_MAX_CHAIN;
113 u32 mlx5_esw_chains_get_ft_chain(struct mlx5_eswitch *esw)
115 return mlx5_esw_chains_get_chain_range(esw) + 1;
118 u32 mlx5_esw_chains_get_prio_range(struct mlx5_eswitch *esw)
120 if (!mlx5_esw_chains_prios_supported(esw))
123 if (fdb_ignore_flow_level_supported(esw))
126 return FDB_TC_MAX_PRIO;
129 static unsigned int mlx5_esw_chains_get_level_range(struct mlx5_eswitch *esw)
131 if (fdb_ignore_flow_level_supported(esw))
134 return FDB_TC_LEVELS_PER_PRIO;
137 #define POOL_NEXT_SIZE 0
139 mlx5_esw_chains_get_avail_sz_from_pool(struct mlx5_eswitch *esw,
144 for (i = ARRAY_SIZE(ESW_POOLS) - 1; i >= 0; i--) {
145 if (fdb_pool_left(esw)[i] && ESW_POOLS[i] > desired_size) {
147 if (desired_size != POOL_NEXT_SIZE)
153 --fdb_pool_left(esw)[found_i];
154 return ESW_POOLS[found_i];
161 mlx5_esw_chains_put_sz_to_pool(struct mlx5_eswitch *esw, int sz)
165 for (i = ARRAY_SIZE(ESW_POOLS) - 1; i >= 0; i--) {
166 if (sz == ESW_POOLS[i]) {
167 ++fdb_pool_left(esw)[i];
172 WARN_ONCE(1, "Couldn't find size %d in fdb size pool", sz);
176 mlx5_esw_chains_init_sz_pool(struct mlx5_eswitch *esw)
181 fdb_max = 1 << MLX5_CAP_ESW_FLOWTABLE_FDB(esw->dev, log_max_ft_size);
183 for (i = ARRAY_SIZE(ESW_POOLS) - 1; i >= 0; i--)
184 fdb_pool_left(esw)[i] =
185 ESW_POOLS[i] <= fdb_max ? ESW_SIZE / ESW_POOLS[i] : 0;
188 static struct mlx5_flow_table *
189 mlx5_esw_chains_create_fdb_table(struct mlx5_eswitch *esw,
190 u32 chain, u32 prio, u32 level)
192 struct mlx5_flow_table_attr ft_attr = {};
193 struct mlx5_flow_namespace *ns;
194 struct mlx5_flow_table *fdb;
197 if (esw->offloads.encap != DEVLINK_ESWITCH_ENCAP_MODE_NONE)
198 ft_attr.flags |= (MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT |
199 MLX5_FLOW_TABLE_TUNNEL_EN_DECAP);
201 sz = mlx5_esw_chains_get_avail_sz_from_pool(esw, POOL_NEXT_SIZE);
203 return ERR_PTR(-ENOSPC);
204 ft_attr.max_fte = sz;
206 /* We use tc_slow_fdb(esw) as the table's next_ft till
207 * ignore_flow_level is allowed on FT creation and not just for FTEs.
208 * Instead caller should add an explicit miss rule if needed.
210 ft_attr.next_ft = tc_slow_fdb(esw);
212 /* The root table(chain 0, prio 1, level 0) is required to be
213 * connected to the previous prio (FDB_BYPASS_PATH if exists).
214 * We always create it, as a managed table, in order to align with
217 if (!fdb_ignore_flow_level_supported(esw) ||
218 (chain == 0 && prio == 1 && level == 0)) {
219 ft_attr.level = level;
220 ft_attr.prio = prio - 1;
221 ns = mlx5_get_fdb_sub_ns(esw->dev, chain);
223 ft_attr.flags |= MLX5_FLOW_TABLE_UNMANAGED;
224 ft_attr.prio = FDB_TC_OFFLOAD;
225 /* Firmware doesn't allow us to create another level 0 table,
226 * so we create all unmanaged tables as level 1.
228 * To connect them, we use explicit miss rules with
229 * ignore_flow_level. Caller is responsible to create
230 * these rules (if needed).
233 ns = mlx5_get_flow_namespace(esw->dev, MLX5_FLOW_NAMESPACE_FDB);
236 ft_attr.autogroup.num_reserved_entries = 2;
237 ft_attr.autogroup.max_num_groups = ESW_OFFLOADS_NUM_GROUPS;
238 fdb = mlx5_create_auto_grouped_flow_table(ns, &ft_attr);
241 "Failed to create FDB table err %d (chain: %d, prio: %d, level: %d, size: %d)\n",
242 (int)PTR_ERR(fdb), chain, prio, level, sz);
243 mlx5_esw_chains_put_sz_to_pool(esw, sz);
251 mlx5_esw_chains_destroy_fdb_table(struct mlx5_eswitch *esw,
252 struct mlx5_flow_table *fdb)
254 mlx5_esw_chains_put_sz_to_pool(esw, fdb->max_fte);
255 mlx5_destroy_flow_table(fdb);
258 static struct fdb_chain *
259 mlx5_esw_chains_create_fdb_chain(struct mlx5_eswitch *esw, u32 chain)
261 struct fdb_chain *fdb_chain = NULL;
264 fdb_chain = kvzalloc(sizeof(*fdb_chain), GFP_KERNEL);
266 return ERR_PTR(-ENOMEM);
268 fdb_chain->esw = esw;
269 fdb_chain->chain = chain;
270 INIT_LIST_HEAD(&fdb_chain->prios_list);
272 err = rhashtable_insert_fast(&esw_chains_ht(esw), &fdb_chain->node,
285 mlx5_esw_chains_destroy_fdb_chain(struct fdb_chain *fdb_chain)
287 struct mlx5_eswitch *esw = fdb_chain->esw;
289 rhashtable_remove_fast(&esw_chains_ht(esw), &fdb_chain->node,
294 static struct fdb_chain *
295 mlx5_esw_chains_get_fdb_chain(struct mlx5_eswitch *esw, u32 chain)
297 struct fdb_chain *fdb_chain;
299 fdb_chain = rhashtable_lookup_fast(&esw_chains_ht(esw), &chain,
302 fdb_chain = mlx5_esw_chains_create_fdb_chain(esw, chain);
303 if (IS_ERR(fdb_chain))
312 static struct mlx5_flow_handle *
313 mlx5_esw_chains_add_miss_rule(struct mlx5_flow_table *fdb,
314 struct mlx5_flow_table *next_fdb)
316 static const struct mlx5_flow_spec spec = {};
317 struct mlx5_flow_destination dest = {};
318 struct mlx5_flow_act act = {};
320 act.flags = FLOW_ACT_IGNORE_FLOW_LEVEL | FLOW_ACT_NO_APPEND;
321 act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
322 dest.type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE;
325 return mlx5_add_flow_rules(fdb, &spec, &act, &dest, 1);
329 mlx5_esw_chains_update_prio_prevs(struct fdb_prio *fdb_prio,
330 struct mlx5_flow_table *next_fdb)
332 struct mlx5_flow_handle *miss_rules[FDB_TC_LEVELS_PER_PRIO + 1] = {};
333 struct fdb_chain *fdb_chain = fdb_prio->fdb_chain;
334 struct fdb_prio *pos;
337 if (fdb_prio->key.level)
340 /* Iterate in reverse order until reaching the level 0 rule of
341 * the previous priority, adding all the miss rules first, so we can
342 * revert them if any of them fails.
345 list_for_each_entry_continue_reverse(pos,
346 &fdb_chain->prios_list,
348 miss_rules[n] = mlx5_esw_chains_add_miss_rule(pos->fdb,
350 if (IS_ERR(miss_rules[n])) {
351 err = PTR_ERR(miss_rules[n]);
360 /* Success, delete old miss rules, and update the pointers. */
363 list_for_each_entry_continue_reverse(pos,
364 &fdb_chain->prios_list,
366 mlx5_del_flow_rules(pos->miss_rule);
368 pos->miss_rule = miss_rules[n];
369 pos->next_fdb = next_fdb;
380 mlx5_del_flow_rules(miss_rules[n]);
386 mlx5_esw_chains_put_fdb_chain(struct fdb_chain *fdb_chain)
388 if (--fdb_chain->ref == 0)
389 mlx5_esw_chains_destroy_fdb_chain(fdb_chain);
392 static struct fdb_prio *
393 mlx5_esw_chains_create_fdb_prio(struct mlx5_eswitch *esw,
394 u32 chain, u32 prio, u32 level)
396 int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in);
397 struct mlx5_flow_handle *miss_rule = NULL;
398 struct mlx5_flow_group *miss_group;
399 struct fdb_prio *fdb_prio = NULL;
400 struct mlx5_flow_table *next_fdb;
401 struct fdb_chain *fdb_chain;
402 struct mlx5_flow_table *fdb;
403 struct list_head *pos;
407 fdb_chain = mlx5_esw_chains_get_fdb_chain(esw, chain);
408 if (IS_ERR(fdb_chain))
409 return ERR_CAST(fdb_chain);
411 fdb_prio = kvzalloc(sizeof(*fdb_prio), GFP_KERNEL);
412 flow_group_in = kvzalloc(inlen, GFP_KERNEL);
413 if (!fdb_prio || !flow_group_in) {
418 /* Chain's prio list is sorted by prio and level.
419 * And all levels of some prio point to the next prio's level 0.
420 * Example list (prio, level):
421 * (3,0)->(3,1)->(5,0)->(5,1)->(6,1)->(7,0)
422 * In hardware, we will we have the following pointers:
423 * (3,0) -> (5,0) -> (7,0) -> Slow path
429 /* Default miss for each chain: */
430 next_fdb = (chain == mlx5_esw_chains_get_ft_chain(esw)) ?
433 list_for_each(pos, &fdb_chain->prios_list) {
434 struct fdb_prio *p = list_entry(pos, struct fdb_prio, list);
436 /* exit on first pos that is larger */
437 if (prio < p->key.prio || (prio == p->key.prio &&
438 level < p->key.level)) {
439 /* Get next level 0 table */
440 next_fdb = p->key.level == 0 ? p->fdb : p->next_fdb;
445 fdb = mlx5_esw_chains_create_fdb_table(esw, chain, prio, level);
451 MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index,
453 MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index,
455 miss_group = mlx5_create_flow_group(fdb, flow_group_in);
456 if (IS_ERR(miss_group)) {
457 err = PTR_ERR(miss_group);
461 /* Add miss rule to next_fdb */
462 miss_rule = mlx5_esw_chains_add_miss_rule(fdb, next_fdb);
463 if (IS_ERR(miss_rule)) {
464 err = PTR_ERR(miss_rule);
468 fdb_prio->miss_group = miss_group;
469 fdb_prio->miss_rule = miss_rule;
470 fdb_prio->next_fdb = next_fdb;
471 fdb_prio->fdb_chain = fdb_chain;
472 fdb_prio->key.chain = chain;
473 fdb_prio->key.prio = prio;
474 fdb_prio->key.level = level;
477 err = rhashtable_insert_fast(&esw_prios_ht(esw), &fdb_prio->node,
482 list_add(&fdb_prio->list, pos->prev);
484 /* Table is ready, connect it */
485 err = mlx5_esw_chains_update_prio_prevs(fdb_prio, fdb);
489 kvfree(flow_group_in);
493 list_del(&fdb_prio->list);
494 rhashtable_remove_fast(&esw_prios_ht(esw), &fdb_prio->node,
497 mlx5_del_flow_rules(miss_rule);
499 mlx5_destroy_flow_group(miss_group);
501 mlx5_esw_chains_destroy_fdb_table(esw, fdb);
505 kvfree(flow_group_in);
506 mlx5_esw_chains_put_fdb_chain(fdb_chain);
511 mlx5_esw_chains_destroy_fdb_prio(struct mlx5_eswitch *esw,
512 struct fdb_prio *fdb_prio)
514 struct fdb_chain *fdb_chain = fdb_prio->fdb_chain;
516 WARN_ON(mlx5_esw_chains_update_prio_prevs(fdb_prio,
517 fdb_prio->next_fdb));
519 list_del(&fdb_prio->list);
520 rhashtable_remove_fast(&esw_prios_ht(esw), &fdb_prio->node,
522 mlx5_del_flow_rules(fdb_prio->miss_rule);
523 mlx5_destroy_flow_group(fdb_prio->miss_group);
524 mlx5_esw_chains_destroy_fdb_table(esw, fdb_prio->fdb);
525 mlx5_esw_chains_put_fdb_chain(fdb_chain);
529 struct mlx5_flow_table *
530 mlx5_esw_chains_get_table(struct mlx5_eswitch *esw, u32 chain, u32 prio,
533 struct mlx5_flow_table *prev_fts;
534 struct fdb_prio *fdb_prio;
535 struct fdb_prio_key key;
538 if ((chain > mlx5_esw_chains_get_chain_range(esw) &&
539 chain != mlx5_esw_chains_get_ft_chain(esw)) ||
540 prio > mlx5_esw_chains_get_prio_range(esw) ||
541 level > mlx5_esw_chains_get_level_range(esw))
542 return ERR_PTR(-EOPNOTSUPP);
544 /* create earlier levels for correct fs_core lookup when
547 for (l = 0; l < level; l++) {
548 prev_fts = mlx5_esw_chains_get_table(esw, chain, prio, l);
549 if (IS_ERR(prev_fts)) {
550 fdb_prio = ERR_CAST(prev_fts);
559 mutex_lock(&esw_chains_lock(esw));
560 fdb_prio = rhashtable_lookup_fast(&esw_prios_ht(esw), &key,
563 fdb_prio = mlx5_esw_chains_create_fdb_prio(esw, chain,
565 if (IS_ERR(fdb_prio))
566 goto err_create_prio;
570 mutex_unlock(&esw_chains_lock(esw));
572 return fdb_prio->fdb;
575 mutex_unlock(&esw_chains_lock(esw));
578 mlx5_esw_chains_put_table(esw, chain, prio, l);
579 return ERR_CAST(fdb_prio);
583 mlx5_esw_chains_put_table(struct mlx5_eswitch *esw, u32 chain, u32 prio,
586 struct fdb_prio *fdb_prio;
587 struct fdb_prio_key key;
593 mutex_lock(&esw_chains_lock(esw));
594 fdb_prio = rhashtable_lookup_fast(&esw_prios_ht(esw), &key,
599 if (--fdb_prio->ref == 0)
600 mlx5_esw_chains_destroy_fdb_prio(esw, fdb_prio);
601 mutex_unlock(&esw_chains_lock(esw));
604 mlx5_esw_chains_put_table(esw, chain, prio, level);
609 mutex_unlock(&esw_chains_lock(esw));
611 "Couldn't find table: (chain: %d prio: %d level: %d)",
/* The terminating table all tc chains miss to (set up in open()). */
struct mlx5_flow_table *
mlx5_esw_chains_get_tc_end_ft(struct mlx5_eswitch *esw)
{
	return tc_end_fdb(esw);
}
622 mlx5_esw_chains_init(struct mlx5_eswitch *esw)
624 struct mlx5_esw_chains_priv *chains_priv;
625 struct mlx5_core_dev *dev = esw->dev;
626 u32 max_flow_counter, fdb_max;
629 chains_priv = kzalloc(sizeof(*chains_priv), GFP_KERNEL);
632 esw_chains_priv(esw) = chains_priv;
634 max_flow_counter = (MLX5_CAP_GEN(dev, max_flow_counter_31_16) << 16) |
635 MLX5_CAP_GEN(dev, max_flow_counter_15_0);
636 fdb_max = 1 << MLX5_CAP_ESW_FLOWTABLE_FDB(dev, log_max_ft_size);
639 "Init esw offloads chains, max counters(%d), groups(%d), max flow table size(%d)\n",
640 max_flow_counter, ESW_OFFLOADS_NUM_GROUPS, fdb_max);
642 mlx5_esw_chains_init_sz_pool(esw);
644 if (!MLX5_CAP_ESW_FLOWTABLE(esw->dev, multi_fdb_encap) &&
645 esw->offloads.encap != DEVLINK_ESWITCH_ENCAP_MODE_NONE) {
646 esw->fdb_table.flags &= ~ESW_FDB_CHAINS_AND_PRIOS_SUPPORTED;
647 esw_warn(dev, "Tc chains and priorities offload aren't supported, update firmware if needed\n");
649 esw->fdb_table.flags |= ESW_FDB_CHAINS_AND_PRIOS_SUPPORTED;
650 esw_info(dev, "Supported tc offload range - chains: %u, prios: %u\n",
651 mlx5_esw_chains_get_chain_range(esw),
652 mlx5_esw_chains_get_prio_range(esw));
655 err = rhashtable_init(&esw_chains_ht(esw), &chain_params);
657 goto init_chains_ht_err;
659 err = rhashtable_init(&esw_prios_ht(esw), &prio_params);
661 goto init_prios_ht_err;
663 mutex_init(&esw_chains_lock(esw));
668 rhashtable_destroy(&esw_chains_ht(esw));
675 mlx5_esw_chains_cleanup(struct mlx5_eswitch *esw)
677 mutex_destroy(&esw_chains_lock(esw));
678 rhashtable_destroy(&esw_prios_ht(esw));
679 rhashtable_destroy(&esw_chains_ht(esw));
681 kfree(esw_chains_priv(esw));
685 mlx5_esw_chains_open(struct mlx5_eswitch *esw)
687 struct mlx5_flow_table *ft;
690 /* Create tc_end_fdb(esw) which is the always created ft chain */
691 ft = mlx5_esw_chains_get_table(esw, mlx5_esw_chains_get_ft_chain(esw),
696 tc_end_fdb(esw) = ft;
698 /* Always open the root for fast path */
699 ft = mlx5_esw_chains_get_table(esw, 0, 1, 0);
705 /* Open level 1 for split rules now if prios isn't supported */
706 if (!mlx5_esw_chains_prios_supported(esw)) {
707 ft = mlx5_esw_chains_get_table(esw, 0, 1, 1);
718 mlx5_esw_chains_put_table(esw, 0, 1, 0);
720 mlx5_esw_chains_put_table(esw, mlx5_esw_chains_get_ft_chain(esw), 1, 0);
/* Release the tables opened by mlx5_esw_chains_open (reverse order). */
static void
mlx5_esw_chains_close(struct mlx5_eswitch *esw)
{
	if (!mlx5_esw_chains_prios_supported(esw))
		mlx5_esw_chains_put_table(esw, 0, 1, 1);
	mlx5_esw_chains_put_table(esw, 0, 1, 0);
	mlx5_esw_chains_put_table(esw, mlx5_esw_chains_get_ft_chain(esw), 1, 0);
}
/* Public entry point: init chains state, then open the default tables.
 * Returns 0 or a negative errno, with init undone on open failure.
 */
int
mlx5_esw_chains_create(struct mlx5_eswitch *esw)
{
	int err;

	err = mlx5_esw_chains_init(esw);
	if (err)
		return err;

	err = mlx5_esw_chains_open(esw);
	if (err)
		goto err_open;

	return 0;

err_open:
	mlx5_esw_chains_cleanup(esw);
	return err;
}
/* Public teardown: close the default tables, then free the chains state. */
void
mlx5_esw_chains_destroy(struct mlx5_eswitch *esw)
{
	mlx5_esw_chains_close(esw);
	mlx5_esw_chains_cleanup(esw);
}