X-Git-Url: http://git.monstr.eu/?a=blobdiff_plain;f=net%2Fipv4%2Fnexthop.c;h=4075230b14c63080c5e12c0f6aea8f650ae7d19e;hb=592fa9532d4e4a7590ca383fd537eb4d53fa585a;hp=743777bce179481ce46536e6b7ca866a2d7f7fbf;hpb=070a7252d21b8e8900ee8540f82f0f1a348f8816;p=linux-2.6-microblaze.git diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 743777bce179..4075230b14c6 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -16,6 +16,9 @@ #include #include +#define NH_RES_DEFAULT_IDLE_TIMER (120 * HZ) +#define NH_RES_DEFAULT_UNBALANCED_TIMER 0 /* No forced rebalancing. */ + static void remove_nexthop(struct net *net, struct nexthop *nh, struct nl_info *nlinfo); @@ -32,6 +35,7 @@ static const struct nla_policy rtm_nh_policy_new[] = { [NHA_ENCAP_TYPE] = { .type = NLA_U16 }, [NHA_ENCAP] = { .type = NLA_NESTED }, [NHA_FDB] = { .type = NLA_FLAG }, + [NHA_RES_GROUP] = { .type = NLA_NESTED }, }; static const struct nla_policy rtm_nh_policy_get[] = { @@ -45,6 +49,32 @@ static const struct nla_policy rtm_nh_policy_dump[] = { [NHA_FDB] = { .type = NLA_FLAG }, }; +static const struct nla_policy rtm_nh_res_policy_new[] = { + [NHA_RES_GROUP_BUCKETS] = { .type = NLA_U16 }, + [NHA_RES_GROUP_IDLE_TIMER] = { .type = NLA_U32 }, + [NHA_RES_GROUP_UNBALANCED_TIMER] = { .type = NLA_U32 }, +}; + +static const struct nla_policy rtm_nh_policy_dump_bucket[] = { + [NHA_ID] = { .type = NLA_U32 }, + [NHA_OIF] = { .type = NLA_U32 }, + [NHA_MASTER] = { .type = NLA_U32 }, + [NHA_RES_BUCKET] = { .type = NLA_NESTED }, +}; + +static const struct nla_policy rtm_nh_res_bucket_policy_dump[] = { + [NHA_RES_BUCKET_NH_ID] = { .type = NLA_U32 }, +}; + +static const struct nla_policy rtm_nh_policy_get_bucket[] = { + [NHA_ID] = { .type = NLA_U32 }, + [NHA_RES_BUCKET] = { .type = NLA_NESTED }, +}; + +static const struct nla_policy rtm_nh_res_bucket_policy_get[] = { + [NHA_RES_BUCKET_INDEX] = { .type = NLA_U16 }, +}; + static bool nexthop_notifiers_is_empty(struct net *net) { return !net->nexthop.notifier_chain.head; @@ -52,10 +82,8 @@ static bool nexthop_notifiers_is_empty(struct net *net) static void __nh_notifier_single_info_init(struct nh_notifier_single_info *nh_info, - const struct nexthop *nh) + const struct nh_info *nhi) { - struct nh_info *nhi = rtnl_dereference(nh->nh_info); - nh_info->dev = nhi->fib_nhc.nhc_dev; nh_info->gw_family = nhi->fib_nhc.nhc_gw_family; if (nh_info->gw_family == AF_INET) @@ -71,12 +99,14 @@ __nh_notifier_single_info_init(struct nh_notifier_single_info *nh_info, static int nh_notifier_single_info_init(struct nh_notifier_info *info, const struct nexthop *nh) { + struct nh_info *nhi = rtnl_dereference(nh->nh_info); + info->type = NH_NOTIFIER_INFO_TYPE_SINGLE; info->nh = kzalloc(sizeof(*info->nh), GFP_KERNEL); if (!info->nh) return -ENOMEM; - __nh_notifier_single_info_init(info->nh, nh); + __nh_notifier_single_info_init(info->nh, nhi); return 0; } @@ -86,8 +116,8 @@ static void nh_notifier_single_info_fini(struct nh_notifier_info *info) kfree(info->nh); } -static int nh_notifier_mp_info_init(struct nh_notifier_info *info, - struct nh_group *nhg) +static int nh_notifier_mpath_info_init(struct nh_notifier_info *info, + struct nh_group *nhg) { u16 num_nh = nhg->num_nh; int i; @@ -103,11 +133,44 @@ static int nh_notifier_mp_info_init(struct nh_notifier_info *info, for (i = 0; i < num_nh; i++) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; + struct nh_info *nhi; + nhi = rtnl_dereference(nhge->nh->nh_info); info->nh_grp->nh_entries[i].id = nhge->nh->id; info->nh_grp->nh_entries[i].weight = nhge->weight; __nh_notifier_single_info_init(&info->nh_grp->nh_entries[i].nh, - nhge->nh); + nhi); + } + + return 0; +} + +static int nh_notifier_res_table_info_init(struct nh_notifier_info *info, + struct nh_group *nhg) +{ + struct nh_res_table *res_table = rtnl_dereference(nhg->res_table); + u16 num_nh_buckets = res_table->num_nh_buckets; + unsigned long size; + u16 i; + + info->type = NH_NOTIFIER_INFO_TYPE_RES_TABLE; + size = struct_size(info->nh_res_table, nhs, num_nh_buckets); + info->nh_res_table = __vmalloc(size, GFP_KERNEL | __GFP_ZERO | + __GFP_NOWARN); + if (!info->nh_res_table) + return -ENOMEM; + + info->nh_res_table->num_nh_buckets = num_nh_buckets; + + for (i = 0; i < num_nh_buckets; i++) { + struct nh_res_bucket *bucket = &res_table->nh_buckets[i]; + struct nh_grp_entry *nhge; + struct nh_info *nhi; + + nhge = rtnl_dereference(bucket->nh_entry); + nhi = rtnl_dereference(nhge->nh->nh_info); + __nh_notifier_single_info_init(&info->nh_res_table->nhs[i], + nhi); } return 0; @@ -118,8 +181,10 @@ static int nh_notifier_grp_info_init(struct nh_notifier_info *info, { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); - if (nhg->mpath) - return nh_notifier_mp_info_init(info, nhg); + if (nhg->hash_threshold) + return nh_notifier_mpath_info_init(info, nhg); + else if (nhg->resilient) + return nh_notifier_res_table_info_init(info, nhg); return -EINVAL; } @@ -128,8 +193,10 @@ static void nh_notifier_grp_info_fini(struct nh_notifier_info *info, { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); - if (nhg->mpath) + if (nhg->hash_threshold) kfree(info->nh_grp); + else if (nhg->resilient) + vfree(info->nh_res_table); } static int nh_notifier_info_init(struct nh_notifier_info *info, @@ -181,6 +248,178 @@ static int call_nexthop_notifiers(struct net *net, return notifier_to_errno(err); } +static int +nh_notifier_res_bucket_idle_timer_get(const struct nh_notifier_info *info, + bool force, unsigned int *p_idle_timer_ms) +{ + struct nh_res_table *res_table; + struct nh_group *nhg; + struct nexthop *nh; + int err = 0; + + /* When 'force' is false, nexthop bucket replacement is performed + * because the bucket was deemed to be idle. In this case, capable + * listeners can choose to perform an atomic replacement: The bucket is + * only replaced if it is inactive. However, if the idle timer interval + * is smaller than the interval in which a listener is querying + * buckets' activity from the device, then atomic replacement should + * not be tried. Pass the idle timer value to listeners, so that they + * could determine which type of replacement to perform. + */ + if (force) { + *p_idle_timer_ms = 0; + return 0; + } + + rcu_read_lock(); + + nh = nexthop_find_by_id(info->net, info->id); + if (!nh) { + err = -EINVAL; + goto out; + } + + nhg = rcu_dereference(nh->nh_grp); + res_table = rcu_dereference(nhg->res_table); + *p_idle_timer_ms = jiffies_to_msecs(res_table->idle_timer); + +out: + rcu_read_unlock(); + + return err; +} + +static int nh_notifier_res_bucket_info_init(struct nh_notifier_info *info, + u16 bucket_index, bool force, + struct nh_info *oldi, + struct nh_info *newi) +{ + unsigned int idle_timer_ms; + int err; + + err = nh_notifier_res_bucket_idle_timer_get(info, force, + &idle_timer_ms); + if (err) + return err; + + info->type = NH_NOTIFIER_INFO_TYPE_RES_BUCKET; + info->nh_res_bucket = kzalloc(sizeof(*info->nh_res_bucket), + GFP_KERNEL); + if (!info->nh_res_bucket) + return -ENOMEM; + + info->nh_res_bucket->bucket_index = bucket_index; + info->nh_res_bucket->idle_timer_ms = idle_timer_ms; + info->nh_res_bucket->force = force; + __nh_notifier_single_info_init(&info->nh_res_bucket->old_nh, oldi); + __nh_notifier_single_info_init(&info->nh_res_bucket->new_nh, newi); + return 0; +} + +static void nh_notifier_res_bucket_info_fini(struct nh_notifier_info *info) +{ + kfree(info->nh_res_bucket); +} + +static int __call_nexthop_res_bucket_notifiers(struct net *net, u32 nhg_id, + u16 bucket_index, bool force, + struct nh_info *oldi, + struct nh_info *newi, + struct netlink_ext_ack *extack) +{ + struct nh_notifier_info info = { + .net = net, + .extack = extack, + .id = nhg_id, + }; + int err; + + if (nexthop_notifiers_is_empty(net)) + return 0; + + err = nh_notifier_res_bucket_info_init(&info, bucket_index, force, + oldi, newi); + if (err) + return err; + + err = blocking_notifier_call_chain(&net->nexthop.notifier_chain, + NEXTHOP_EVENT_BUCKET_REPLACE, &info); + nh_notifier_res_bucket_info_fini(&info); + + return notifier_to_errno(err); +} + +/* There are three users of RES_TABLE, and NHs etc. referenced from there: + * + * 1) a collection of callbacks for NH maintenance. This operates under + * RTNL, + * 2) the delayed work that gradually balances the resilient table, + * 3) and nexthop_select_path(), operating under RCU. + * + * Both the delayed work and the RTNL block are writers, and need to + * maintain mutual exclusion. Since there are only two and well-known + * writers for each table, the RTNL code can make sure it has exclusive + * access thus: + * + * - Have the DW operate without locking; + * - synchronously cancel the DW; + * - do the writing; + * - if the write was not actually a delete, call upkeep, which schedules + * DW again if necessary. + * + * The functions that are always called from the RTNL context use + * rtnl_dereference(). The functions that can also be called from the DW do + * a raw dereference and rely on the above mutual exclusion scheme. + */ +#define nh_res_dereference(p) (rcu_dereference_raw(p)) + +static int call_nexthop_res_bucket_notifiers(struct net *net, u32 nhg_id, + u16 bucket_index, bool force, + struct nexthop *old_nh, + struct nexthop *new_nh, + struct netlink_ext_ack *extack) +{ + struct nh_info *oldi = nh_res_dereference(old_nh->nh_info); + struct nh_info *newi = nh_res_dereference(new_nh->nh_info); + + return __call_nexthop_res_bucket_notifiers(net, nhg_id, bucket_index, + force, oldi, newi, extack); +} + +static int call_nexthop_res_table_notifiers(struct net *net, struct nexthop *nh, + struct netlink_ext_ack *extack) +{ + struct nh_notifier_info info = { + .net = net, + .extack = extack, + }; + struct nh_group *nhg; + int err; + + ASSERT_RTNL(); + + if (nexthop_notifiers_is_empty(net)) + return 0; + + /* At this point, the nexthop buckets are still not populated. Only + * emit a notification with the logical nexthops, so that a listener + * could potentially veto it in case of unsupported configuration. + */ + nhg = rtnl_dereference(nh->nh_grp); + err = nh_notifier_mpath_info_init(&info, nhg); + if (err) { + NL_SET_ERR_MSG(extack, "Failed to initialize nexthop notifier info"); + return err; + } + + err = blocking_notifier_call_chain(&net->nexthop.notifier_chain, + NEXTHOP_EVENT_RES_TABLE_PRE_REPLACE, + &info); + kfree(info.nh_grp); + + return notifier_to_errno(err); +} + static int call_nexthop_notifier(struct notifier_block *nb, struct net *net, enum nexthop_event_type event_type, struct nexthop *nh, @@ -239,6 +478,9 @@ static void nexthop_free_group(struct nexthop *nh) WARN_ON(nhg->spare == nhg); + if (nhg->resilient) + vfree(rcu_dereference_raw(nhg->res_table)); + kfree(nhg->spare); kfree(nhg); } @@ -297,6 +539,30 @@ static struct nh_group *nexthop_grp_alloc(u16 num_nh) return nhg; } +static void nh_res_table_upkeep_dw(struct work_struct *work); + +static struct nh_res_table * +nexthop_res_table_alloc(struct net *net, u32 nhg_id, struct nh_config *cfg) +{ + const u16 num_nh_buckets = cfg->nh_grp_res_num_buckets; + struct nh_res_table *res_table; + unsigned long size; + + size = struct_size(res_table, nh_buckets, num_nh_buckets); + res_table = __vmalloc(size, GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN); + if (!res_table) + return NULL; + + res_table->net = net; + res_table->nhg_id = nhg_id; + INIT_DELAYED_WORK(&res_table->upkeep_dw, &nh_res_table_upkeep_dw); + INIT_LIST_HEAD(&res_table->uw_nh_entries); + res_table->idle_timer = cfg->nh_grp_res_idle_timer; + res_table->unbalanced_timer = cfg->nh_grp_res_unbalanced_timer; + res_table->num_nh_buckets = num_nh_buckets; + return res_table; +} + static void nh_base_seq_inc(struct net *net) { while (++net->nexthop.seq == 0) @@ -345,6 +611,48 @@ static u32 nh_find_unused_id(struct net *net) return 0; } +static void nh_res_time_set_deadline(unsigned long next_time, + unsigned long *deadline) +{ + if (time_before(next_time, *deadline)) + *deadline = next_time; +} + +static clock_t nh_res_table_unbalanced_time(struct nh_res_table *res_table) +{ + if (list_empty(&res_table->uw_nh_entries)) + return 0; + return jiffies_delta_to_clock_t(jiffies - res_table->unbalanced_since); +} + +static int nla_put_nh_group_res(struct sk_buff *skb, struct nh_group *nhg) +{ + struct nh_res_table *res_table = rtnl_dereference(nhg->res_table); + struct nlattr *nest; + + nest = nla_nest_start(skb, NHA_RES_GROUP); + if (!nest) + return -EMSGSIZE; + + if (nla_put_u16(skb, NHA_RES_GROUP_BUCKETS, + res_table->num_nh_buckets) || + nla_put_u32(skb, NHA_RES_GROUP_IDLE_TIMER, + jiffies_to_clock_t(res_table->idle_timer)) || + nla_put_u32(skb, NHA_RES_GROUP_UNBALANCED_TIMER, + jiffies_to_clock_t(res_table->unbalanced_timer)) || + nla_put_u64_64bit(skb, NHA_RES_GROUP_UNBALANCED_TIME, + nh_res_table_unbalanced_time(res_table), + NHA_RES_GROUP_PAD)) + goto nla_put_failure; + + nla_nest_end(skb, nest); + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + static int nla_put_nh_group(struct sk_buff *skb, struct nh_group *nhg) { struct nexthop_grp *p; @@ -353,8 +661,10 @@ static int nla_put_nh_group(struct sk_buff *skb, struct nh_group *nhg) u16 group_type = 0; int i; - if (nhg->mpath) + if (nhg->hash_threshold) group_type = NEXTHOP_GRP_TYPE_MPATH; + else if (nhg->resilient) + group_type = NEXTHOP_GRP_TYPE_RES; if (nla_put_u16(skb, NHA_GROUP_TYPE, group_type)) goto nla_put_failure; @@ -370,6 +680,9 @@ static int nla_put_nh_group(struct sk_buff *skb, struct nh_group *nhg) p += 1; } + if (nhg->resilient && nla_put_nh_group_res(skb, nhg)) + goto nla_put_failure; + return 0; nla_put_failure: @@ -457,13 +770,26 @@ nla_put_failure: return -EMSGSIZE; } +static size_t nh_nlmsg_size_grp_res(struct nh_group *nhg) +{ + return nla_total_size(0) + /* NHA_RES_GROUP */ + nla_total_size(2) + /* NHA_RES_GROUP_BUCKETS */ + nla_total_size(4) + /* NHA_RES_GROUP_IDLE_TIMER */ + nla_total_size(4) + /* NHA_RES_GROUP_UNBALANCED_TIMER */ + nla_total_size_64bit(8);/* NHA_RES_GROUP_UNBALANCED_TIME */ +} + static size_t nh_nlmsg_size_grp(struct nexthop *nh) { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); size_t sz = sizeof(struct nexthop_grp) * nhg->num_nh; + size_t tot = nla_total_size(sz) + + nla_total_size(2); /* NHA_GROUP_TYPE */ + + if (nhg->resilient) + tot += nh_nlmsg_size_grp_res(nhg); - return nla_total_size(sz) + - nla_total_size(2); /* NHA_GROUP_TYPE */ + return tot; } static size_t nh_nlmsg_size_single(struct nexthop *nh) @@ -538,18 +864,142 @@ errout: rtnl_set_sk_err(info->nl_net, RTNLGRP_NEXTHOP, err); } +static unsigned long nh_res_bucket_used_time(const struct nh_res_bucket *bucket) +{ + return (unsigned long)atomic_long_read(&bucket->used_time); +} + +static unsigned long +nh_res_bucket_idle_point(const struct nh_res_table *res_table, + const struct nh_res_bucket *bucket, + unsigned long now) +{ + unsigned long time = nh_res_bucket_used_time(bucket); + + /* Bucket was not used since it was migrated. The idle time is now. */ + if (time == bucket->migrated_time) + return now; + + return time + res_table->idle_timer; +} + +static unsigned long +nh_res_table_unb_point(const struct nh_res_table *res_table) +{ + return res_table->unbalanced_since + res_table->unbalanced_timer; +} + +static void nh_res_bucket_set_idle(const struct nh_res_table *res_table, + struct nh_res_bucket *bucket) +{ + unsigned long now = jiffies; + + atomic_long_set(&bucket->used_time, (long)now); + bucket->migrated_time = now; +} + +static void nh_res_bucket_set_busy(struct nh_res_bucket *bucket) +{ + atomic_long_set(&bucket->used_time, (long)jiffies); +} + +static clock_t nh_res_bucket_idle_time(const struct nh_res_bucket *bucket) +{ + unsigned long used_time = nh_res_bucket_used_time(bucket); + + return jiffies_delta_to_clock_t(jiffies - used_time); +} + +static int nh_fill_res_bucket(struct sk_buff *skb, struct nexthop *nh, + struct nh_res_bucket *bucket, u16 bucket_index, + int event, u32 portid, u32 seq, + unsigned int nlflags, + struct netlink_ext_ack *extack) +{ + struct nh_grp_entry *nhge = nh_res_dereference(bucket->nh_entry); + struct nlmsghdr *nlh; + struct nlattr *nest; + struct nhmsg *nhm; + + nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nhm), nlflags); + if (!nlh) + return -EMSGSIZE; + + nhm = nlmsg_data(nlh); + nhm->nh_family = AF_UNSPEC; + nhm->nh_flags = bucket->nh_flags; + nhm->nh_protocol = nh->protocol; + nhm->nh_scope = 0; + nhm->resvd = 0; + + if (nla_put_u32(skb, NHA_ID, nh->id)) + goto nla_put_failure; + + nest = nla_nest_start(skb, NHA_RES_BUCKET); + if (!nest) + goto nla_put_failure; + + if (nla_put_u16(skb, NHA_RES_BUCKET_INDEX, bucket_index) || + nla_put_u32(skb, NHA_RES_BUCKET_NH_ID, nhge->nh->id) || + nla_put_u64_64bit(skb, NHA_RES_BUCKET_IDLE_TIME, + nh_res_bucket_idle_time(bucket), + NHA_RES_BUCKET_PAD)) + goto nla_put_failure_nest; + + nla_nest_end(skb, nest); + nlmsg_end(skb, nlh); + return 0; + +nla_put_failure_nest: + nla_nest_cancel(skb, nest); +nla_put_failure: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +static void nexthop_bucket_notify(struct nh_res_table *res_table, + u16 bucket_index) +{ + struct nh_res_bucket *bucket = &res_table->nh_buckets[bucket_index]; + struct nh_grp_entry *nhge = nh_res_dereference(bucket->nh_entry); + struct nexthop *nh = nhge->nh_parent; + struct sk_buff *skb; + int err = -ENOBUFS; + + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb) + goto errout; + + err = nh_fill_res_bucket(skb, nh, bucket, bucket_index, + RTM_NEWNEXTHOPBUCKET, 0, 0, NLM_F_REPLACE, + NULL); + if (err < 0) { + kfree_skb(skb); + goto errout; + } + + rtnl_notify(skb, nh->net, 0, RTNLGRP_NEXTHOP, NULL, GFP_KERNEL); + return; +errout: + if (err < 0) + rtnl_set_sk_err(nh->net, RTNLGRP_NEXTHOP, err); +} + static bool valid_group_nh(struct nexthop *nh, unsigned int npaths, bool *is_fdb, struct netlink_ext_ack *extack) { if (nh->is_group) { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); - /* nested multipath (group within a group) is not - * supported - */ - if (nhg->mpath) { + /* Nesting groups within groups is not supported. */ + if (nhg->hash_threshold) { + NL_SET_ERR_MSG(extack, + "Hash-threshold group can not be a nexthop within a group"); + return false; + } + if (nhg->resilient) { NL_SET_ERR_MSG(extack, - "Multipath group can not be a nexthop within a group"); + "Resilient group can not be a nexthop within a group"); return false; } *is_fdb = nhg->fdb_nh; @@ -591,7 +1041,7 @@ static int nh_check_attr_fdb_group(struct nexthop *nh, u8 *nh_family, static int nh_check_attr_group(struct net *net, struct nlattr *tb[], size_t tb_size, - struct netlink_ext_ack *extack) + u16 nh_grp_type, struct netlink_ext_ack *extack) { unsigned int len = nla_len(tb[NHA_GROUP]); u8 nh_family = AF_UNSPEC; @@ -652,8 +1102,14 @@ static int nh_check_attr_group(struct net *net, for (i = NHA_GROUP_TYPE + 1; i < tb_size; ++i) { if (!tb[i]) continue; - if (i == NHA_FDB) + switch (i) { + case NHA_FDB: continue; + case NHA_RES_GROUP: + if (nh_grp_type == NEXTHOP_GRP_TYPE_RES) + continue; + break; + } NL_SET_ERR_MSG(extack, "No other attributes can be set in nexthop groups"); return -EINVAL; @@ -695,7 +1151,7 @@ static bool ipv4_good_nh(const struct fib_nh *nh) return !!(state & NUD_VALID); } -static struct nexthop *nexthop_select_path_mp(struct nh_group *nhg, int hash) +static struct nexthop *nexthop_select_path_hthr(struct nh_group *nhg, int hash) { struct nexthop *rc = NULL; int i; @@ -704,7 +1160,7 @@ static struct nexthop *nexthop_select_path_mp(struct nh_group *nhg, int hash) struct nh_grp_entry *nhge = &nhg->nh_entries[i]; struct nh_info *nhi; - if (hash > atomic_read(&nhge->mpath.upper_bound)) + if (hash > atomic_read(&nhge->hthr.upper_bound)) continue; nhi = rcu_dereference(nhge->nh->nh_info); @@ -732,6 +1188,22 @@ static struct nexthop *nexthop_select_path_mp(struct nh_group *nhg, int hash) return rc; } +static struct nexthop *nexthop_select_path_res(struct nh_group *nhg, int hash) +{ + struct nh_res_table *res_table = rcu_dereference(nhg->res_table); + u16 bucket_index = hash % res_table->num_nh_buckets; + struct nh_res_bucket *bucket; + struct nh_grp_entry *nhge; + + /* nexthop_select_path() is expected to return a non-NULL value, so + * skip protocol validation and just hand out whatever there is. + */ + bucket = &res_table->nh_buckets[bucket_index]; + nh_res_bucket_set_busy(bucket); + nhge = rcu_dereference(bucket->nh_entry); + return nhge->nh; +} + struct nexthop *nexthop_select_path(struct nexthop *nh, int hash) { struct nh_group *nhg; @@ -740,8 +1212,10 @@ struct nexthop *nexthop_select_path(struct nexthop *nh, int hash) return nh; nhg = rcu_dereference(nh->nh_grp); - if (nhg->mpath) - return nexthop_select_path_mp(nhg, hash); + if (nhg->hash_threshold) + return nexthop_select_path_hthr(nhg, hash); + else if (nhg->resilient) + return nexthop_select_path_res(nhg, hash); /* Unreachable. */ return NULL; @@ -924,31 +1398,343 @@ static int fib_check_nh_list(struct nexthop *old, struct nexthop *new, return 0; } -static void nh_group_rebalance(struct nh_group *nhg) +static bool nh_res_nhge_is_balanced(const struct nh_grp_entry *nhge) { - int total = 0; - int w = 0; - int i; + return nhge->res.count_buckets == nhge->res.wants_buckets; +} - for (i = 0; i < nhg->num_nh; ++i) - total += nhg->nh_entries[i].weight; +static bool nh_res_nhge_is_ow(const struct nh_grp_entry *nhge) +{ + return nhge->res.count_buckets > nhge->res.wants_buckets; +} - for (i = 0; i < nhg->num_nh; ++i) { - struct nh_grp_entry *nhge = &nhg->nh_entries[i]; - int upper_bound; +static bool nh_res_nhge_is_uw(const struct nh_grp_entry *nhge) +{ + return nhge->res.count_buckets < nhge->res.wants_buckets; +} - w += nhge->weight; - upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31, total) - 1; - atomic_set(&nhge->mpath.upper_bound, upper_bound); - } +static bool nh_res_table_is_balanced(const struct nh_res_table *res_table) +{ + return list_empty(&res_table->uw_nh_entries); } -static void remove_nh_grp_entry(struct net *net, struct nh_grp_entry *nhge, - struct nl_info *nlinfo) +static void nh_res_bucket_unset_nh(struct nh_res_bucket *bucket) { - struct nh_grp_entry *nhges, *new_nhges; - struct nexthop *nhp = nhge->nh_parent; - struct netlink_ext_ack extack; + struct nh_grp_entry *nhge; + + if (bucket->occupied) { + nhge = nh_res_dereference(bucket->nh_entry); + nhge->res.count_buckets--; + bucket->occupied = false; + } +} + +static void nh_res_bucket_set_nh(struct nh_res_bucket *bucket, + struct nh_grp_entry *nhge) +{ + nh_res_bucket_unset_nh(bucket); + + bucket->occupied = true; + rcu_assign_pointer(bucket->nh_entry, nhge); + nhge->res.count_buckets++; +} + +static bool nh_res_bucket_should_migrate(struct nh_res_table *res_table, + struct nh_res_bucket *bucket, + unsigned long *deadline, bool *force) +{ + unsigned long now = jiffies; + struct nh_grp_entry *nhge; + unsigned long idle_point; + + if (!bucket->occupied) { + /* The bucket is not occupied, its NHGE pointer is either + * NULL or obsolete. We _have to_ migrate: set force. + */ + *force = true; + return true; + } + + nhge = nh_res_dereference(bucket->nh_entry); + + /* If the bucket is populated by an underweight or balanced + * nexthop, do not migrate. + */ + if (!nh_res_nhge_is_ow(nhge)) + return false; + + /* At this point we know that the bucket is populated with an + * overweight nexthop. It needs to be migrated to a new nexthop if + * the idle timer of unbalanced timer expired. + */ + + idle_point = nh_res_bucket_idle_point(res_table, bucket, now); + if (time_after_eq(now, idle_point)) { + /* The bucket is idle. We _can_ migrate: unset force. */ + *force = false; + return true; + } + + /* Unbalanced timer of 0 means "never force". */ + if (res_table->unbalanced_timer) { + unsigned long unb_point; + + unb_point = nh_res_table_unb_point(res_table); + if (time_after(now, unb_point)) { + /* The bucket is not idle, but the unbalanced timer + * expired. We _can_ migrate, but set force anyway, + * so that drivers know to ignore activity reports + * from the HW. + */ + *force = true; + return true; + } + + nh_res_time_set_deadline(unb_point, deadline); + } + + nh_res_time_set_deadline(idle_point, deadline); + return false; +} + +static bool nh_res_bucket_migrate(struct nh_res_table *res_table, + u16 bucket_index, bool notify, + bool notify_nl, bool force) +{ + struct nh_res_bucket *bucket = &res_table->nh_buckets[bucket_index]; + struct nh_grp_entry *new_nhge; + struct netlink_ext_ack extack; + int err; + + new_nhge = list_first_entry_or_null(&res_table->uw_nh_entries, + struct nh_grp_entry, + res.uw_nh_entry); + if (WARN_ON_ONCE(!new_nhge)) + /* If this function is called, "bucket" is either not + * occupied, or it belongs to a next hop that is + * overweight. In either case, there ought to be a + * corresponding underweight next hop. + */ + return false; + + if (notify) { + struct nh_grp_entry *old_nhge; + + old_nhge = nh_res_dereference(bucket->nh_entry); + err = call_nexthop_res_bucket_notifiers(res_table->net, + res_table->nhg_id, + bucket_index, force, + old_nhge->nh, + new_nhge->nh, &extack); + if (err) { + pr_err_ratelimited("%s\n", extack._msg); + if (!force) + return false; + /* It is not possible to veto a forced replacement, so + * just clear the hardware flags from the nexthop + * bucket to indicate to user space that this bucket is + * not correctly populated in hardware. + */ + bucket->nh_flags &= ~(RTNH_F_OFFLOAD | RTNH_F_TRAP); + } + } + + nh_res_bucket_set_nh(bucket, new_nhge); + nh_res_bucket_set_idle(res_table, bucket); + + if (notify_nl) + nexthop_bucket_notify(res_table, bucket_index); + + if (nh_res_nhge_is_balanced(new_nhge)) + list_del(&new_nhge->res.uw_nh_entry); + return true; +} + +#define NH_RES_UPKEEP_DW_MINIMUM_INTERVAL (HZ / 2) + +static void nh_res_table_upkeep(struct nh_res_table *res_table, + bool notify, bool notify_nl) +{ + unsigned long now = jiffies; + unsigned long deadline; + u16 i; + + /* Deadline is the next time that upkeep should be run. It is the + * earliest time at which one of the buckets might be migrated. + * Start at the most pessimistic estimate: either unbalanced_timer + * from now, or if there is none, idle_timer from now. For each + * encountered time point, call nh_res_time_set_deadline() to + * refine the estimate. + */ + if (res_table->unbalanced_timer) + deadline = now + res_table->unbalanced_timer; + else + deadline = now + res_table->idle_timer; + + for (i = 0; i < res_table->num_nh_buckets; i++) { + struct nh_res_bucket *bucket = &res_table->nh_buckets[i]; + bool force; + + if (nh_res_bucket_should_migrate(res_table, bucket, + &deadline, &force)) { + if (!nh_res_bucket_migrate(res_table, i, notify, + notify_nl, force)) { + unsigned long idle_point; + + /* A driver can override the migration + * decision if the HW reports that the + * bucket is actually not idle. Therefore + * remark the bucket as busy again and + * update the deadline. + */ + nh_res_bucket_set_busy(bucket); + idle_point = nh_res_bucket_idle_point(res_table, + bucket, + now); + nh_res_time_set_deadline(idle_point, &deadline); + } + } + } + + /* If the group is still unbalanced, schedule the next upkeep to + * either the deadline computed above, or the minimum deadline, + * whichever comes later. + */ + if (!nh_res_table_is_balanced(res_table)) { + unsigned long now = jiffies; + unsigned long min_deadline; + + min_deadline = now + NH_RES_UPKEEP_DW_MINIMUM_INTERVAL; + if (time_before(deadline, min_deadline)) + deadline = min_deadline; + + queue_delayed_work(system_power_efficient_wq, + &res_table->upkeep_dw, deadline - now); + } +} + +static void nh_res_table_upkeep_dw(struct work_struct *work) +{ + struct delayed_work *dw = to_delayed_work(work); + struct nh_res_table *res_table; + + res_table = container_of(dw, struct nh_res_table, upkeep_dw); + nh_res_table_upkeep(res_table, true, true); +} + +static void nh_res_table_cancel_upkeep(struct nh_res_table *res_table) +{ + cancel_delayed_work_sync(&res_table->upkeep_dw); +} + +static void nh_res_group_rebalance(struct nh_group *nhg, + struct nh_res_table *res_table) +{ + int prev_upper_bound = 0; + int total = 0; + int w = 0; + int i; + + INIT_LIST_HEAD(&res_table->uw_nh_entries); + + for (i = 0; i < nhg->num_nh; ++i) + total += nhg->nh_entries[i].weight; + + for (i = 0; i < nhg->num_nh; ++i) { + struct nh_grp_entry *nhge = &nhg->nh_entries[i]; + int upper_bound; + + w += nhge->weight; + upper_bound = DIV_ROUND_CLOSEST(res_table->num_nh_buckets * w, + total); + nhge->res.wants_buckets = upper_bound - prev_upper_bound; + prev_upper_bound = upper_bound; + + if (nh_res_nhge_is_uw(nhge)) { + if (list_empty(&res_table->uw_nh_entries)) + res_table->unbalanced_since = jiffies; + list_add(&nhge->res.uw_nh_entry, + &res_table->uw_nh_entries); + } + } +} + +/* Migrate buckets in res_table so that they reference NHGE's from NHG with + * the right NH ID. Set those buckets that do not have a corresponding NHGE + * entry in NHG as not occupied. + */ +static void nh_res_table_migrate_buckets(struct nh_res_table *res_table, + struct nh_group *nhg) +{ + u16 i; + + for (i = 0; i < res_table->num_nh_buckets; i++) { + struct nh_res_bucket *bucket = &res_table->nh_buckets[i]; + u32 id = rtnl_dereference(bucket->nh_entry)->nh->id; + bool found = false; + int j; + + for (j = 0; j < nhg->num_nh; j++) { + struct nh_grp_entry *nhge = &nhg->nh_entries[j]; + + if (nhge->nh->id == id) { + nh_res_bucket_set_nh(bucket, nhge); + found = true; + break; + } + } + + if (!found) + nh_res_bucket_unset_nh(bucket); + } +} + +static void replace_nexthop_grp_res(struct nh_group *oldg, + struct nh_group *newg) +{ + /* For NH group replacement, the new NHG might only have a stub + * hash table with 0 buckets, because the number of buckets was not + * specified. For NH removal, oldg and newg both reference the same + * res_table. So in any case, in the following, we want to work + * with oldg->res_table. + */ + struct nh_res_table *old_res_table = rtnl_dereference(oldg->res_table); + unsigned long prev_unbalanced_since = old_res_table->unbalanced_since; + bool prev_has_uw = !list_empty(&old_res_table->uw_nh_entries); + + nh_res_table_cancel_upkeep(old_res_table); + nh_res_table_migrate_buckets(old_res_table, newg); + nh_res_group_rebalance(newg, old_res_table); + if (prev_has_uw && !list_empty(&old_res_table->uw_nh_entries)) + old_res_table->unbalanced_since = prev_unbalanced_since; + nh_res_table_upkeep(old_res_table, true, false); +} + +static void nh_hthr_group_rebalance(struct nh_group *nhg) +{ + int total = 0; + int w = 0; + int i; + + for (i = 0; i < nhg->num_nh; ++i) + total += nhg->nh_entries[i].weight; + + for (i = 0; i < nhg->num_nh; ++i) { + struct nh_grp_entry *nhge = &nhg->nh_entries[i]; + int upper_bound; + + w += nhge->weight; + upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31, total) - 1; + atomic_set(&nhge->hthr.upper_bound, upper_bound); + } +} + +static void remove_nh_grp_entry(struct net *net, struct nh_grp_entry *nhge, + struct nl_info *nlinfo) +{ + struct nh_grp_entry *nhges, *new_nhges; + struct nexthop *nhp = nhge->nh_parent; + struct netlink_ext_ack extack; struct nexthop *nh = nhge->nh; struct nh_group *nhg, *newg; int i, j, err; @@ -965,7 +1751,9 @@ static void remove_nh_grp_entry(struct net *net, struct nh_grp_entry *nhge, } newg->has_v4 = false; - newg->mpath = nhg->mpath; + newg->is_multipath = nhg->is_multipath; + newg->hash_threshold = nhg->hash_threshold; + newg->resilient = nhg->resilient; newg->fdb_nh = nhg->fdb_nh; newg->num_nh = nhg->num_nh; @@ -993,15 +1781,25 @@ static void remove_nh_grp_entry(struct net *net, struct nh_grp_entry *nhge, j++; } - nh_group_rebalance(newg); + if (newg->hash_threshold) + nh_hthr_group_rebalance(newg); + else if (newg->resilient) + replace_nexthop_grp_res(nhg, newg); + rcu_assign_pointer(nhp->nh_grp, newg); list_del(&nhge->nh_list); nexthop_put(nhge->nh); - err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, nhp, &extack); - if (err) - pr_err("%s\n", extack._msg); + /* Removal of a NH from a resilient group is notified through + * bucket notifications. + */ + if (newg->hash_threshold) { + err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, nhp, + &extack); + if (err) + pr_err("%s\n", extack._msg); + } if (nlinfo) nexthop_notify(RTM_NEWNEXTHOP, nhp, nlinfo); @@ -1022,6 +1820,7 @@ static void remove_nexthop_from_groups(struct net *net, struct nexthop *nh, static void remove_nexthop_group(struct nexthop *nh, struct nl_info *nlinfo) { struct nh_group *nhg = rcu_dereference_rtnl(nh->nh_grp); + struct nh_res_table *res_table; int i, num_nh = nhg->num_nh; for (i = 0; i < num_nh; ++i) { @@ -1032,6 +1831,11 @@ static void remove_nexthop_group(struct nexthop *nh, struct nl_info *nlinfo) list_del_init(&nhge->nh_list); } + + if (nhg->resilient) { + res_table = rtnl_dereference(nhg->res_table); + nh_res_table_cancel_upkeep(res_table); + } } /* not called for nexthop replace */ @@ -1107,9 +1911,12 @@ static void nh_rt_cache_flush(struct net *net, struct nexthop *nh) } static int replace_nexthop_grp(struct net *net, struct nexthop *old, - struct nexthop *new, + struct nexthop *new, const struct nh_config *cfg, struct netlink_ext_ack *extack) { + struct nh_res_table *tmp_table = NULL; + struct nh_res_table *new_res_table; + struct nh_res_table *old_res_table; struct nh_group *oldg, *newg; int i, err; @@ -1118,19 +1925,67 @@ static int replace_nexthop_grp(struct net *net, struct nexthop *old, return -EINVAL; } - err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new, extack); - if (err) - return err; - oldg = rtnl_dereference(old->nh_grp); newg = rtnl_dereference(new->nh_grp); + if (newg->hash_threshold != oldg->hash_threshold) { + NL_SET_ERR_MSG(extack, "Can not replace a nexthop group with one of a different type."); + return -EINVAL; + } + + if (newg->hash_threshold) { + err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new, + extack); + if (err) + return err; + } else if (newg->resilient) { + new_res_table = rtnl_dereference(newg->res_table); + old_res_table = rtnl_dereference(oldg->res_table); + + /* Accept if num_nh_buckets was not given, but if it was + * given, demand that the value be correct. + */ + if (cfg->nh_grp_res_has_num_buckets && + cfg->nh_grp_res_num_buckets != + old_res_table->num_nh_buckets) { + NL_SET_ERR_MSG(extack, "Can not change number of buckets of a resilient nexthop group."); + return -EINVAL; + } + + /* Emit a pre-replace notification so that listeners could veto + * a potentially unsupported configuration. Otherwise, + * individual bucket replacement notifications would need to be + * vetoed, which is something that should only happen if the + * bucket is currently active. + */ + err = call_nexthop_res_table_notifiers(net, new, extack); + if (err) + return err; + + if (cfg->nh_grp_res_has_idle_timer) + old_res_table->idle_timer = cfg->nh_grp_res_idle_timer; + if (cfg->nh_grp_res_has_unbalanced_timer) + old_res_table->unbalanced_timer = + cfg->nh_grp_res_unbalanced_timer; + + replace_nexthop_grp_res(oldg, newg); + + tmp_table = new_res_table; + rcu_assign_pointer(newg->res_table, old_res_table); + rcu_assign_pointer(newg->spare->res_table, old_res_table); + } + /* update parents - used by nexthop code for cleanup */ for (i = 0; i < newg->num_nh; i++) newg->nh_entries[i].nh_parent = old; rcu_assign_pointer(old->nh_grp, newg); + if (newg->resilient) { + rcu_assign_pointer(oldg->res_table, tmp_table); + rcu_assign_pointer(oldg->spare->res_table, tmp_table); + } + for (i = 0; i < oldg->num_nh; i++) oldg->nh_entries[i].nh_parent = new; @@ -1156,6 +2011,71 @@ static void nh_group_v4_update(struct nh_group *nhg) nhg->has_v4 = has_v4; } +static int replace_nexthop_single_notify_res(struct net *net, + struct nh_res_table *res_table, + struct nexthop *old, + struct nh_info *oldi, + struct nh_info *newi, + struct netlink_ext_ack *extack) +{ + u32 nhg_id = res_table->nhg_id; + int err; + u16 i; + + for (i = 0; i < res_table->num_nh_buckets; i++) { + struct nh_res_bucket *bucket = &res_table->nh_buckets[i]; + struct nh_grp_entry *nhge; + + nhge = rtnl_dereference(bucket->nh_entry); + if (nhge->nh == old) { + err = __call_nexthop_res_bucket_notifiers(net, nhg_id, + i, true, + oldi, newi, + extack); + if (err) + goto err_notify; + } + } + + return 0; + +err_notify: + while (i-- > 0) { + struct nh_res_bucket *bucket = &res_table->nh_buckets[i]; + struct nh_grp_entry *nhge; + + nhge = rtnl_dereference(bucket->nh_entry); + if (nhge->nh == old) + __call_nexthop_res_bucket_notifiers(net, nhg_id, i, + true, newi, oldi, + extack); + } + return err; +} + +static int replace_nexthop_single_notify(struct net *net, + struct nexthop *group_nh, + struct nexthop *old, + struct nh_info *oldi, + struct nh_info *newi, + struct netlink_ext_ack *extack) +{ + struct nh_group *nhg = rtnl_dereference(group_nh->nh_grp); + struct nh_res_table *res_table; + + if (nhg->hash_threshold) { + return call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, + group_nh, extack); + } else if (nhg->resilient) { + res_table = rtnl_dereference(nhg->res_table); + return replace_nexthop_single_notify_res(net, res_table, + old, oldi, newi, + extack); + } + + return -EINVAL; +} + static int replace_nexthop_single(struct net *net, struct nexthop *old, struct nexthop *new, struct netlink_ext_ack *extack) @@ -1198,8 +2118,8 @@ static int replace_nexthop_single(struct net *net, struct nexthop *old, list_for_each_entry(nhge, &old->grp_list, nh_list) { struct nexthop *nhp = nhge->nh_parent; - err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, nhp, - extack); + err = replace_nexthop_single_notify(net, nhp, old, oldi, newi, + extack); if (err) goto err_notify; } @@ -1229,7 +2149,7 @@ err_notify: list_for_each_entry_continue_reverse(nhge, &old->grp_list, nh_list) { struct nexthop *nhp = nhge->nh_parent; - call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, nhp, extack); + replace_nexthop_single_notify(net, nhp, old, newi, oldi, NULL); } call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, old, extack); return err; @@ -1276,7 +2196,8 @@ static void nexthop_replace_notify(struct net *net, struct nexthop *nh, } static int replace_nexthop(struct net *net, struct nexthop *old, - struct nexthop *new, struct netlink_ext_ack *extack) + struct nexthop *new, const struct nh_config *cfg, + struct netlink_ext_ack *extack) { bool new_is_reject = false; struct nh_grp_entry *nhge; @@ -1319,7 +2240,7 @@ static int replace_nexthop(struct net *net, struct nexthop *old, } if (old->is_group) - err = replace_nexthop_grp(net, old, new, extack); + err = replace_nexthop_grp(net, old, new, cfg, extack); else err = replace_nexthop_single(net, old, new, extack); @@ -1361,7 +2282,7 @@ static int insert_nexthop(struct net *net, struct nexthop *new_nh, } else if (new_id > nh->id) { pp = &next->rb_right; } else if (replace) { - rc = replace_nexthop(net, nh, new_nh, extack); + rc = replace_nexthop(net, nh, new_nh, cfg, extack); if (!rc) { new_nh = nh; /* send notification with old nh */ replace_notify = 1; @@ -1379,9 +2300,37 @@ static int insert_nexthop(struct net *net, struct nexthop *new_nh, goto out; } + if (new_nh->is_group) { + struct nh_group *nhg = rtnl_dereference(new_nh->nh_grp); + struct nh_res_table *res_table; + + if (nhg->resilient) { + res_table = rtnl_dereference(nhg->res_table); + + /* Not passing the number of buckets is OK when + * replacing, but not when creating a new group. + */ + if (!cfg->nh_grp_res_has_num_buckets) { + NL_SET_ERR_MSG(extack, "Number of buckets not specified for nexthop group insertion"); + rc = -EINVAL; + goto out; + } + + nh_res_group_rebalance(nhg, res_table); + + /* Do not send bucket notifications, we do full + * notification below. + */ + nh_res_table_upkeep(res_table, false, false); + } + } + rb_link_node_rcu(&new_nh->rb_node, parent, pp); rb_insert_color(&new_nh->rb_node, root); + /* The initial insertion is a full notification for hash-threshold as + * well as resilient groups. + */ rc = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new_nh, extack); if (rc) rb_erase(&new_nh->rb_node, &net->nexthop.rb_root); @@ -1441,6 +2390,7 @@ static struct nexthop *nexthop_create_group(struct net *net, u16 num_nh = nla_len(grps_attr) / sizeof(*entry); struct nh_group *nhg; struct nexthop *nh; + int err; int i; if (WARN_ON(!num_nh)) @@ -1472,8 +2422,10 @@ static struct nexthop *nexthop_create_group(struct net *net, struct nh_info *nhi; nhe = nexthop_find_by_id(net, entry[i].id); - if (!nexthop_get(nhe)) + if (!nexthop_get(nhe)) { + err = -ENOENT; goto out_no_nh; + } nhi = rtnl_dereference(nhe->nh_info); if (nhi->family == AF_INET) @@ -1485,13 +2437,28 @@ static struct nexthop *nexthop_create_group(struct net *net, nhg->nh_entries[i].nh_parent = nh; } - if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_MPATH) - nhg->mpath = 1; + if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_MPATH) { + nhg->hash_threshold = 1; + nhg->is_multipath = true; + } else if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_RES) { + struct nh_res_table *res_table; - WARN_ON_ONCE(nhg->mpath != 1); + res_table = nexthop_res_table_alloc(net, cfg->nh_id, cfg); + if (!res_table) { + err = -ENOMEM; + goto out_no_nh; + } - if (nhg->mpath) - nh_group_rebalance(nhg); + rcu_assign_pointer(nhg->spare->res_table, res_table); + rcu_assign_pointer(nhg->res_table, res_table); + nhg->resilient = true; + nhg->is_multipath = true; + } + + WARN_ON_ONCE(nhg->hash_threshold + nhg->resilient != 1); + + if (nhg->hash_threshold) + nh_hthr_group_rebalance(nhg); if (cfg->nh_fdb) nhg->fdb_nh = 1; @@ -1510,7 +2477,7 @@ out_no_nh: kfree(nhg); kfree(nh); - return ERR_PTR(-ENOENT); + return ERR_PTR(err); } static int nh_create_ipv4(struct net *net, struct nexthop *nh, @@ -1680,6 +2647,70 @@ static struct nexthop *nexthop_add(struct net *net, struct nh_config *cfg, return nh; } +static int rtm_nh_get_timer(struct nlattr *attr, unsigned long fallback, + unsigned long *timer_p, bool *has_p, + struct netlink_ext_ack *extack) +{ + unsigned long timer; + u32 value; + + if (!attr) { + *timer_p = fallback; + *has_p = false; + return 0; + } + + value = nla_get_u32(attr); + timer = clock_t_to_jiffies(value); + if (timer == ~0UL) { + NL_SET_ERR_MSG(extack, "Timer value too large"); + return -EINVAL; + } + + *timer_p = timer; + *has_p = true; + return 0; +} + +static int rtm_to_nh_config_grp_res(struct nlattr *res, struct nh_config *cfg, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[ARRAY_SIZE(rtm_nh_res_policy_new)] = {}; + int err; + + if (res) { + err = nla_parse_nested(tb, + ARRAY_SIZE(rtm_nh_res_policy_new) - 1, + res, rtm_nh_res_policy_new, extack); + if (err < 0) + return err; + } + + if (tb[NHA_RES_GROUP_BUCKETS]) { + cfg->nh_grp_res_num_buckets = + nla_get_u16(tb[NHA_RES_GROUP_BUCKETS]); + cfg->nh_grp_res_has_num_buckets = true; + if (!cfg->nh_grp_res_num_buckets) { + NL_SET_ERR_MSG(extack, "Number of buckets needs to be non-0"); + return -EINVAL; + } + } + + err = rtm_nh_get_timer(tb[NHA_RES_GROUP_IDLE_TIMER], + NH_RES_DEFAULT_IDLE_TIMER, + &cfg->nh_grp_res_idle_timer, + &cfg->nh_grp_res_has_idle_timer, + extack); + if (err) + return err; + + return rtm_nh_get_timer(tb[NHA_RES_GROUP_UNBALANCED_TIMER], + NH_RES_DEFAULT_UNBALANCED_TIMER, + &cfg->nh_grp_res_unbalanced_timer, + &cfg->nh_grp_res_has_unbalanced_timer, + extack); +} + static int rtm_to_nh_config(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh, struct nh_config *cfg, struct netlink_ext_ack *extack) @@ -1758,7 +2789,14 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb, NL_SET_ERR_MSG(extack, "Invalid group type"); goto out; } - err = nh_check_attr_group(net, tb, ARRAY_SIZE(tb), extack); + err = nh_check_attr_group(net, tb, ARRAY_SIZE(tb), + cfg->nh_grp_type, extack); + if (err) + goto out; + + if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_RES) + err = rtm_to_nh_config_grp_res(tb[NHA_RES_GROUP], + cfg, extack); /* no other attributes should be set */ goto out; @@ -1983,10 +3021,12 @@ errout_free: } struct nh_dump_filter { + u32 nh_id; int dev_idx; int master_idx; bool group_filter; bool fdb_filter; + u32 res_bucket_nh_id; }; static bool nh_dump_filtered(struct nexthop *nh, @@ -2100,26 +3140,24 @@ static int rtm_dump_walk_nexthops(struct sk_buff *skb, void *data) { struct rb_node *node; - int idx = 0, s_idx; + int s_idx; int err; s_idx = ctx->idx; for (node = rb_first(root); node; node = rb_next(node)) { struct nexthop *nh; - if (idx < s_idx) - goto cont; - nh = rb_entry(node, struct nexthop, rb_node); - ctx->idx = idx; + if (nh->id < s_idx) + continue; + + ctx->idx = nh->id; err = nh_cb(skb, cb, nh, data); if (err) return err; -cont: - idx++; } - ctx->idx = idx; + ctx->idx++; return 0; } @@ -2166,6 +3204,318 @@ out_err: return err; } +static struct nexthop * +nexthop_find_group_resilient(struct net *net, u32 id, + struct netlink_ext_ack *extack) +{ + struct nh_group *nhg; + struct nexthop *nh; + + nh = nexthop_find_by_id(net, id); + if (!nh) + return ERR_PTR(-ENOENT); + + if (!nh->is_group) { + NL_SET_ERR_MSG(extack, "Not a nexthop group"); + return ERR_PTR(-EINVAL); + } + + nhg = rtnl_dereference(nh->nh_grp); + if (!nhg->resilient) { + NL_SET_ERR_MSG(extack, "Nexthop group not of type resilient"); + return ERR_PTR(-EINVAL); + } + + return nh; +} + +static int nh_valid_dump_nhid(struct nlattr *attr, u32 *nh_id_p, + struct netlink_ext_ack *extack) +{ + u32 idx; + + if (attr) { + idx = nla_get_u32(attr); + if (!idx) { + NL_SET_ERR_MSG(extack, "Invalid nexthop id"); + return -EINVAL; + } + *nh_id_p = idx; + } else { + *nh_id_p = 0; + } + + return 0; +} + +static int nh_valid_dump_bucket_req(const struct nlmsghdr *nlh, + struct nh_dump_filter *filter, + struct netlink_callback *cb) +{ + struct nlattr *res_tb[ARRAY_SIZE(rtm_nh_res_bucket_policy_dump)]; + struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_dump_bucket)]; + int err; + + err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb, + ARRAY_SIZE(rtm_nh_policy_dump_bucket) - 1, + rtm_nh_policy_dump_bucket, NULL); + if (err < 0) + return err; + + err = nh_valid_dump_nhid(tb[NHA_ID], &filter->nh_id, cb->extack); + if (err) + return err; + + if (tb[NHA_RES_BUCKET]) { + size_t max = ARRAY_SIZE(rtm_nh_res_bucket_policy_dump) - 1; + + err = nla_parse_nested(res_tb, max, + tb[NHA_RES_BUCKET], + rtm_nh_res_bucket_policy_dump, + cb->extack); + if (err < 0) + return err; + + err = nh_valid_dump_nhid(res_tb[NHA_RES_BUCKET_NH_ID], + &filter->res_bucket_nh_id, + cb->extack); + if (err) + return err; + } + + return __nh_valid_dump_req(nlh, tb, filter, cb->extack); +} + +struct rtm_dump_res_bucket_ctx { + struct rtm_dump_nh_ctx nh; + u16 bucket_index; + u32 done_nh_idx; /* 1 + the index of the last fully processed NH. */ +}; + +static struct rtm_dump_res_bucket_ctx * +rtm_dump_res_bucket_ctx(struct netlink_callback *cb) +{ + struct rtm_dump_res_bucket_ctx *ctx = (void *)cb->ctx; + + BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx)); + return ctx; +} + +struct rtm_dump_nexthop_bucket_data { + struct rtm_dump_res_bucket_ctx *ctx; + struct nh_dump_filter filter; +}; + +static int rtm_dump_nexthop_bucket_nh(struct sk_buff *skb, + struct netlink_callback *cb, + struct nexthop *nh, + struct rtm_dump_nexthop_bucket_data *dd) +{ + u32 portid = NETLINK_CB(cb->skb).portid; + struct nhmsg *nhm = nlmsg_data(cb->nlh); + struct nh_res_table *res_table; + struct nh_group *nhg; + u16 bucket_index; + int err; + + if (dd->ctx->nh.idx < dd->ctx->done_nh_idx) + return 0; + + nhg = rtnl_dereference(nh->nh_grp); + res_table = rtnl_dereference(nhg->res_table); + for (bucket_index = dd->ctx->bucket_index; + bucket_index < res_table->num_nh_buckets; + bucket_index++) { + struct nh_res_bucket *bucket; + struct nh_grp_entry *nhge; + + bucket = &res_table->nh_buckets[bucket_index]; + nhge = rtnl_dereference(bucket->nh_entry); + if (nh_dump_filtered(nhge->nh, &dd->filter, nhm->nh_family)) + continue; + + if (dd->filter.res_bucket_nh_id && + dd->filter.res_bucket_nh_id != nhge->nh->id) + continue; + + err = nh_fill_res_bucket(skb, nh, bucket, bucket_index, + RTM_NEWNEXTHOPBUCKET, portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + cb->extack); + if (err < 0) { + if (likely(skb->len)) + goto out; + goto out_err; + } + } + + dd->ctx->done_nh_idx = dd->ctx->nh.idx + 1; + bucket_index = 0; + +out: + err = skb->len; +out_err: + dd->ctx->bucket_index = bucket_index; + return err; +} + +static int rtm_dump_nexthop_bucket_cb(struct sk_buff *skb, + struct netlink_callback *cb, + struct nexthop *nh, void *data) +{ + struct rtm_dump_nexthop_bucket_data *dd = data; + struct nh_group *nhg; + + if (!nh->is_group) + return 0; + + nhg = rtnl_dereference(nh->nh_grp); + if (!nhg->resilient) + return 0; + + return rtm_dump_nexthop_bucket_nh(skb, cb, nh, dd); +} + +/* rtnl */ +static int rtm_dump_nexthop_bucket(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct rtm_dump_res_bucket_ctx *ctx = rtm_dump_res_bucket_ctx(cb); + struct rtm_dump_nexthop_bucket_data dd = { .ctx = ctx }; + struct net *net = sock_net(skb->sk); + struct nexthop *nh; + int err; + + err = nh_valid_dump_bucket_req(cb->nlh, &dd.filter, cb); + if (err) + return err; + + if (dd.filter.nh_id) { + nh = nexthop_find_group_resilient(net, dd.filter.nh_id, + cb->extack); + if (IS_ERR(nh)) + return PTR_ERR(nh); + err = rtm_dump_nexthop_bucket_nh(skb, cb, nh, &dd); + } else { + struct rb_root *root = &net->nexthop.rb_root; + + err = rtm_dump_walk_nexthops(skb, cb, root, &ctx->nh, + &rtm_dump_nexthop_bucket_cb, &dd); + } + + if (err < 0) { + if (likely(skb->len)) + goto out; + goto out_err; + } + +out: + err = skb->len; +out_err: + cb->seq = net->nexthop.seq; + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); + return err; +} + +static int nh_valid_get_bucket_req_res_bucket(struct nlattr *res, + u16 *bucket_index, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[ARRAY_SIZE(rtm_nh_res_bucket_policy_get)]; + int err; + + err = nla_parse_nested(tb, ARRAY_SIZE(rtm_nh_res_bucket_policy_get) - 1, + res, rtm_nh_res_bucket_policy_get, extack); + if (err < 0) + return err; + + if (!tb[NHA_RES_BUCKET_INDEX]) { + NL_SET_ERR_MSG(extack, "Bucket index is missing"); + return -EINVAL; + } + + *bucket_index = nla_get_u16(tb[NHA_RES_BUCKET_INDEX]); + return 0; +} + +static int nh_valid_get_bucket_req(const struct nlmsghdr *nlh, + u32 *id, u16 *bucket_index, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_get_bucket)]; + int err; + + err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb, + ARRAY_SIZE(rtm_nh_policy_get_bucket) - 1, + rtm_nh_policy_get_bucket, extack); + if (err < 0) + return err; + + err = __nh_valid_get_del_req(nlh, tb, id, extack); + if (err) + return err; + + if (!tb[NHA_RES_BUCKET]) { + NL_SET_ERR_MSG(extack, "Bucket information is missing"); + return -EINVAL; + } + + err = nh_valid_get_bucket_req_res_bucket(tb[NHA_RES_BUCKET], + bucket_index, extack); + if (err) + return err; + + return 0; +} + +/* rtnl */ +static int rtm_get_nexthop_bucket(struct sk_buff *in_skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct net *net = sock_net(in_skb->sk); + struct nh_res_table *res_table; + struct sk_buff *skb = NULL; + struct nh_group *nhg; + struct nexthop *nh; + u16 bucket_index; + int err; + u32 id; + + err = nh_valid_get_bucket_req(nlh, &id, &bucket_index, extack); + if (err) + return err; + + nh = nexthop_find_group_resilient(net, id, extack); + if (IS_ERR(nh)) + return PTR_ERR(nh); + + nhg = rtnl_dereference(nh->nh_grp); + res_table = rtnl_dereference(nhg->res_table); + if (bucket_index >= res_table->num_nh_buckets) { + NL_SET_ERR_MSG(extack, "Bucket index out of bounds"); + return -ENOENT; + } + + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb) + return -ENOBUFS; + + err = nh_fill_res_bucket(skb, nh, &res_table->nh_buckets[bucket_index], + bucket_index, RTM_NEWNEXTHOPBUCKET, + NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, + 0, extack); + if (err < 0) { + WARN_ON(err == -EMSGSIZE); + goto errout_free; + } + + return rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); + +errout_free: + kfree_skb(skb); + return err; +} + static void nexthop_sync_mtu(struct net_device *dev, u32 orig_mtu) { unsigned int hash = nh_dev_hashfn(dev->ifindex); @@ -2277,6 +3627,75 @@ out: } EXPORT_SYMBOL(nexthop_set_hw_flags); +void nexthop_bucket_set_hw_flags(struct net *net, u32 id, u16 bucket_index, + bool offload, bool trap) +{ + struct nh_res_table *res_table; + struct nh_res_bucket *bucket; + struct nexthop *nexthop; + struct nh_group *nhg; + + rcu_read_lock(); + + nexthop = nexthop_find_by_id(net, id); + if (!nexthop || !nexthop->is_group) + goto out; + + nhg = rcu_dereference(nexthop->nh_grp); + if (!nhg->resilient) + goto out; + + if (bucket_index >= nhg->res_table->num_nh_buckets) + goto out; + + res_table = rcu_dereference(nhg->res_table); + bucket = &res_table->nh_buckets[bucket_index]; + bucket->nh_flags &= ~(RTNH_F_OFFLOAD | RTNH_F_TRAP); + if (offload) + bucket->nh_flags |= RTNH_F_OFFLOAD; + if (trap) + bucket->nh_flags |= RTNH_F_TRAP; + +out: + rcu_read_unlock(); +} +EXPORT_SYMBOL(nexthop_bucket_set_hw_flags); + +void nexthop_res_grp_activity_update(struct net *net, u32 id, u16 num_buckets, + unsigned long *activity) +{ + struct nh_res_table *res_table; + struct nexthop *nexthop; + struct nh_group *nhg; + u16 i; + + rcu_read_lock(); + + nexthop = nexthop_find_by_id(net, id); + if (!nexthop || !nexthop->is_group) + goto out; + + nhg = rcu_dereference(nexthop->nh_grp); + if (!nhg->resilient) + goto out; + + /* Instead of silently ignoring some buckets, demand that the sizes + * be the same. + */ + res_table = rcu_dereference(nhg->res_table); + if (num_buckets != res_table->num_nh_buckets) + goto out; + + for (i = 0; i < num_buckets; i++) { + if (test_bit(i, activity)) + nh_res_bucket_set_busy(&res_table->nh_buckets[i]); + } + +out: + rcu_read_unlock(); +} +EXPORT_SYMBOL(nexthop_res_grp_activity_update); + static void __net_exit nexthop_net_exit(struct net *net) { rtnl_lock(); @@ -2320,6 +3739,9 @@ static int __init nexthop_init(void) rtnl_register(PF_INET6, RTM_NEWNEXTHOP, rtm_new_nexthop, NULL, 0); rtnl_register(PF_INET6, RTM_GETNEXTHOP, NULL, rtm_dump_nexthop, 0); + rtnl_register(PF_UNSPEC, RTM_GETNEXTHOPBUCKET, rtm_get_nexthop_bucket, + rtm_dump_nexthop_bucket, 0); + return 0; } subsys_initcall(nexthop_init);