// SPDX-License-Identifier: GPL-2.0-only
/* Flow Queue PIE discipline
 *
 * Copyright (C) 2019 Mohit P. Tahiliani <tahiliani@nitk.edu.in>
 * Copyright (C) 2019 Sachin D. Patil <sdp.sachin@gmail.com>
 * Copyright (C) 2019 V. Saicharan <vsaicharan1998@gmail.com>
 * Copyright (C) 2019 Mohit Bhasi <mohitbhasi1998@gmail.com>
 * Copyright (C) 2019 Leslie Monis <lesliemonis@gmail.com>
 * Copyright (C) 2019 Gautam Ramakrishnan <gautamramk@gmail.com>
 */

#include <linux/jhash.h>
#include <linux/sizes.h>
#include <linux/vmalloc.h>
#include <net/pkt_cls.h>
#include <net/pie.h>

/* Flow Queue PIE
 *
 * Principles:
 *   - Packets are classified into flows.
 *   - This is a stochastic model (as we use a hash, several flows might
 *                                 be hashed to the same slot)
 *   - Each flow has a PIE managed queue.
 *   - Flows are linked onto two (Round Robin) lists,
 *     so that new flows have priority over old ones.
 *   - For a given flow, packets are not reordered.
 *   - Drops during enqueue only.
 *   - ECN capability is off by default.
 *   - ECN threshold (if ECN is enabled) is at 10% by default.
 *   - Uses timestamps to calculate queue delay by default.
 */
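
/* Example of attaching the qdisc with tc(8) (illustrative only; see
 * tc-fq_pie(8) for the complete parameter list):
 *
 *   tc qdisc add dev eth0 root fq_pie flows 1024 target 15ms ecn
 */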

/**
 * struct fq_pie_flow - contains data for each flow
 * @vars:       pie vars associated with the flow
 * @deficit:    number of remaining byte credits
 * @backlog:    size of data in the flow
 * @qlen:       number of packets in the flow
 * @flowchain:  flowchain for the flow
 * @head:       first packet in the flow
 * @tail:       last packet in the flow
 */
struct fq_pie_flow {
        struct pie_vars vars;
        s32 deficit;
        u32 backlog;
        u32 qlen;
        struct list_head flowchain;
        struct sk_buff *head;
        struct sk_buff *tail;
};

struct fq_pie_sched_data {
        struct tcf_proto __rcu *filter_list; /* optional external classifier */
        struct tcf_block *block;
        struct fq_pie_flow *flows;
        struct Qdisc *sch;
        struct list_head old_flows;
        struct list_head new_flows;
        struct pie_params p_params;
        u32 ecn_prob;
        u32 flows_cnt;
        u32 quantum;
        u32 memory_limit;
        u32 new_flow_count;
        u32 memory_usage;
        u32 overmemory;
        struct pie_stats stats;
        struct timer_list adapt_timer;
};

static unsigned int fq_pie_hash(const struct fq_pie_sched_data *q,
                                struct sk_buff *skb)
{
        return reciprocal_scale(skb_get_hash(skb), q->flows_cnt);
}

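/* Map a packet to a flow index in [1..flows_cnt]: skb->priority selects a
 * flow directly when its major number matches the qdisc handle; otherwise
 * the attached classifier (if any) decides, falling back to the flow hash
 * when no classifier is configured.  A return value of 0 means "drop".
 */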
static unsigned int fq_pie_classify(struct sk_buff *skb, struct Qdisc *sch,
                                    int *qerr)
{
        struct fq_pie_sched_data *q = qdisc_priv(sch);
        struct tcf_proto *filter;
        struct tcf_result res;
        int result;

        if (TC_H_MAJ(skb->priority) == sch->handle &&
            TC_H_MIN(skb->priority) > 0 &&
            TC_H_MIN(skb->priority) <= q->flows_cnt)
                return TC_H_MIN(skb->priority);

        filter = rcu_dereference_bh(q->filter_list);
        if (!filter)
                return fq_pie_hash(q, skb) + 1;

        *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
        result = tcf_classify(skb, filter, &res, false);
        if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
                switch (result) {
                case TC_ACT_STOLEN:
                case TC_ACT_QUEUED:
                case TC_ACT_TRAP:
                        *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
                        fallthrough;
                case TC_ACT_SHOT:
                        return 0;
                }
#endif
                if (TC_H_MIN(res.classid) <= q->flows_cnt)
                        return TC_H_MIN(res.classid);
        }
        return 0;
}

/* add skb to flow queue (tail add) */
static inline void flow_queue_add(struct fq_pie_flow *flow,
                                  struct sk_buff *skb)
{
        if (!flow->head)
                flow->head = skb;
        else
                flow->tail->next = skb;
        flow->tail = skb;
        skb->next = NULL;
}

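/* Enqueue path: classify the packet, account for the qdisc and memory
 * limits, then let PIE decide whether to enqueue, ECN-mark or drop.  A flow
 * that was idle is (re)attached to the new_flows list with a fresh quantum
 * of credits.
 */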
static int fq_pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
                                struct sk_buff **to_free)
{
        struct fq_pie_sched_data *q = qdisc_priv(sch);
        struct fq_pie_flow *sel_flow;
        int ret;
        u8 memory_limited = false;
        u8 enqueue = false;
        u32 pkt_len;
        u32 idx;

        /* Classifies packet into corresponding flow */
        idx = fq_pie_classify(skb, sch, &ret);
        if (idx == 0) {
                if (ret & __NET_XMIT_BYPASS)
                        qdisc_qstats_drop(sch);
                __qdisc_drop(skb, to_free);
                return ret;
        }
        idx--;

        sel_flow = &q->flows[idx];
        /* Checks whether adding a new packet would exceed memory limit */
        get_pie_cb(skb)->mem_usage = skb->truesize;
        memory_limited = q->memory_usage > q->memory_limit + skb->truesize;

        /* Checks if the qdisc is full */
        if (unlikely(qdisc_qlen(sch) >= sch->limit)) {
                q->stats.overlimit++;
                goto out;
        } else if (unlikely(memory_limited)) {
                q->overmemory++;
        }

        if (!pie_drop_early(sch, &q->p_params, &sel_flow->vars,
                            sel_flow->backlog, skb->len)) {
                enqueue = true;
        } else if (q->p_params.ecn &&
                   sel_flow->vars.prob <= (MAX_PROB / 100) * q->ecn_prob &&
                   INET_ECN_set_ce(skb)) {
                /* If packet is ecn capable, mark it if drop probability
                 * is lower than the parameter ecn_prob, else drop it.
                 */
                q->stats.ecn_mark++;
                enqueue = true;
        }
        if (enqueue) {
                /* Set enqueue time only when dq_rate_estimator is disabled. */
                if (!q->p_params.dq_rate_estimator)
                        pie_set_enqueue_time(skb);

                pkt_len = qdisc_pkt_len(skb);
                q->stats.packets_in++;
                q->memory_usage += skb->truesize;
                sch->qstats.backlog += pkt_len;
                sch->q.qlen++;
                flow_queue_add(sel_flow, skb);
                if (list_empty(&sel_flow->flowchain)) {
                        list_add_tail(&sel_flow->flowchain, &q->new_flows);
                        q->new_flow_count++;
                        sel_flow->deficit = q->quantum;
                        sel_flow->qlen = 0;
                        sel_flow->backlog = 0;
                }
                sel_flow->qlen++;
                sel_flow->backlog += pkt_len;
                return NET_XMIT_SUCCESS;
        }
out:
        q->stats.dropped++;
        sel_flow->vars.accu_prob = 0;
        __qdisc_drop(skb, to_free);
        qdisc_qstats_drop(sch);
        return NET_XMIT_CN;
}

static const struct nla_policy fq_pie_policy[TCA_FQ_PIE_MAX + 1] = {
        [TCA_FQ_PIE_LIMIT]              = {.type = NLA_U32},
        [TCA_FQ_PIE_FLOWS]              = {.type = NLA_U32},
        [TCA_FQ_PIE_TARGET]             = {.type = NLA_U32},
        [TCA_FQ_PIE_TUPDATE]            = {.type = NLA_U32},
        [TCA_FQ_PIE_ALPHA]              = {.type = NLA_U32},
        [TCA_FQ_PIE_BETA]               = {.type = NLA_U32},
        [TCA_FQ_PIE_QUANTUM]            = {.type = NLA_U32},
        [TCA_FQ_PIE_MEMORY_LIMIT]       = {.type = NLA_U32},
        [TCA_FQ_PIE_ECN_PROB]           = {.type = NLA_U32},
        [TCA_FQ_PIE_ECN]                = {.type = NLA_U32},
        [TCA_FQ_PIE_BYTEMODE]           = {.type = NLA_U32},
        [TCA_FQ_PIE_DQ_RATE_ESTIMATOR]  = {.type = NLA_U32},
};

static inline struct sk_buff *dequeue_head(struct fq_pie_flow *flow)
{
        struct sk_buff *skb = flow->head;

        flow->head = skb->next;
        skb->next = NULL;
        return skb;
}

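/* Dequeue path: deficit round robin over the new_flows and old_flows lists.
 * A flow that has used up its deficit is refilled with one quantum and moved
 * to the tail of old_flows; an emptied new flow is cycled through old_flows
 * once so that it cannot starve older flows.
 */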
static struct sk_buff *fq_pie_qdisc_dequeue(struct Qdisc *sch)
{
        struct fq_pie_sched_data *q = qdisc_priv(sch);
        struct sk_buff *skb = NULL;
        struct fq_pie_flow *flow;
        struct list_head *head;
        u32 pkt_len;

begin:
        head = &q->new_flows;
        if (list_empty(head)) {
                head = &q->old_flows;
                if (list_empty(head))
                        return NULL;
        }

        flow = list_first_entry(head, struct fq_pie_flow, flowchain);
        /* Flow has exhausted all its credits */
        if (flow->deficit <= 0) {
                flow->deficit += q->quantum;
                list_move_tail(&flow->flowchain, &q->old_flows);
                goto begin;
        }

        if (flow->head) {
                skb = dequeue_head(flow);
                pkt_len = qdisc_pkt_len(skb);
                sch->qstats.backlog -= pkt_len;
                sch->q.qlen--;
                qdisc_bstats_update(sch, skb);
        }

        if (!skb) {
                /* force a pass through old_flows to prevent starvation */
                if (head == &q->new_flows && !list_empty(&q->old_flows))
                        list_move_tail(&flow->flowchain, &q->old_flows);
                else
                        list_del_init(&flow->flowchain);
                goto begin;
        }

        flow->qlen--;
        flow->deficit -= pkt_len;
        flow->backlog -= pkt_len;
        q->memory_usage -= get_pie_cb(skb)->mem_usage;
        pie_process_dequeue(skb, &q->p_params, &flow->vars, flow->backlog);
        return skb;
}

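/* Parse TCA_FQ_PIE_* attributes and update the configuration under the
 * qdisc tree lock.  The number of flows can only be set before the flow
 * array has been allocated; if the new limit is below the current queue
 * length, the excess packets are dequeued and dropped.
 */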
static int fq_pie_change(struct Qdisc *sch, struct nlattr *opt,
                         struct netlink_ext_ack *extack)
{
        struct fq_pie_sched_data *q = qdisc_priv(sch);
        struct nlattr *tb[TCA_FQ_PIE_MAX + 1];
        unsigned int len_dropped = 0;
        unsigned int num_dropped = 0;
        int err;

        if (!opt)
                return -EINVAL;

        err = nla_parse_nested(tb, TCA_FQ_PIE_MAX, opt, fq_pie_policy, extack);
        if (err < 0)
                return err;

        sch_tree_lock(sch);
        if (tb[TCA_FQ_PIE_LIMIT]) {
                u32 limit = nla_get_u32(tb[TCA_FQ_PIE_LIMIT]);

                q->p_params.limit = limit;
                sch->limit = limit;
        }
        if (tb[TCA_FQ_PIE_FLOWS]) {
                if (q->flows) {
                        NL_SET_ERR_MSG_MOD(extack,
                                           "Number of flows cannot be changed");
                        goto flow_error;
                }
                q->flows_cnt = nla_get_u32(tb[TCA_FQ_PIE_FLOWS]);
                if (!q->flows_cnt || q->flows_cnt > 65536) {
                        NL_SET_ERR_MSG_MOD(extack,
                                           "Number of flows must range in [1..65536]");
                        goto flow_error;
                }
        }

        /* convert from microseconds to pschedtime */
        if (tb[TCA_FQ_PIE_TARGET]) {
                /* target is in us */
                u32 target = nla_get_u32(tb[TCA_FQ_PIE_TARGET]);

                /* convert to pschedtime */
                q->p_params.target =
                        PSCHED_NS2TICKS((u64)target * NSEC_PER_USEC);
        }

        /* tupdate is in jiffies */
        if (tb[TCA_FQ_PIE_TUPDATE])
                q->p_params.tupdate =
                        usecs_to_jiffies(nla_get_u32(tb[TCA_FQ_PIE_TUPDATE]));

        if (tb[TCA_FQ_PIE_ALPHA])
                q->p_params.alpha = nla_get_u32(tb[TCA_FQ_PIE_ALPHA]);

        if (tb[TCA_FQ_PIE_BETA])
                q->p_params.beta = nla_get_u32(tb[TCA_FQ_PIE_BETA]);

        if (tb[TCA_FQ_PIE_QUANTUM])
                q->quantum = nla_get_u32(tb[TCA_FQ_PIE_QUANTUM]);

        if (tb[TCA_FQ_PIE_MEMORY_LIMIT])
                q->memory_limit = nla_get_u32(tb[TCA_FQ_PIE_MEMORY_LIMIT]);

        if (tb[TCA_FQ_PIE_ECN_PROB])
                q->ecn_prob = nla_get_u32(tb[TCA_FQ_PIE_ECN_PROB]);

        if (tb[TCA_FQ_PIE_ECN])
                q->p_params.ecn = nla_get_u32(tb[TCA_FQ_PIE_ECN]);

        if (tb[TCA_FQ_PIE_BYTEMODE])
                q->p_params.bytemode = nla_get_u32(tb[TCA_FQ_PIE_BYTEMODE]);

        if (tb[TCA_FQ_PIE_DQ_RATE_ESTIMATOR])
                q->p_params.dq_rate_estimator =
                        nla_get_u32(tb[TCA_FQ_PIE_DQ_RATE_ESTIMATOR]);

        /* Drop excess packets if new limit is lower */
        while (sch->q.qlen > sch->limit) {
                struct sk_buff *skb = fq_pie_qdisc_dequeue(sch);

                len_dropped += qdisc_pkt_len(skb);
                num_dropped += 1;
                rtnl_kfree_skbs(skb, skb);
        }
        qdisc_tree_reduce_backlog(sch, num_dropped, len_dropped);

        sch_tree_unlock(sch);
        return 0;

flow_error:
        sch_tree_unlock(sch);
        return -EINVAL;
}

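/* Periodic adaptation timer: recompute the PIE drop probability of every
 * flow and re-arm the timer after 'tupdate' jiffies.
 */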
static void fq_pie_timer(struct timer_list *t)
{
        struct fq_pie_sched_data *q = from_timer(q, t, adapt_timer);
        struct Qdisc *sch = q->sch;
        spinlock_t *root_lock; /* to lock qdisc for probability calculations */
        u32 idx;

        root_lock = qdisc_lock(qdisc_root_sleeping(sch));
        spin_lock(root_lock);

        for (idx = 0; idx < q->flows_cnt; idx++)
                pie_calculate_probability(&q->p_params, &q->flows[idx].vars,
                                          q->flows[idx].backlog);

        /* reset the timer to fire after 'tupdate' jiffies. */
        if (q->p_params.tupdate)
                mod_timer(&q->adapt_timer, jiffies + q->p_params.tupdate);

        spin_unlock(root_lock);
}

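/* Initialise the defaults (1024 flows, 10240 packet limit, 32 MB memory
 * limit, 10% ECN marking threshold), apply any netlink options, then
 * allocate the per-flow state and arm the adaptation timer.
 */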
static int fq_pie_init(struct Qdisc *sch, struct nlattr *opt,
                       struct netlink_ext_ack *extack)
{
        struct fq_pie_sched_data *q = qdisc_priv(sch);
        int err;
        u32 idx;

        pie_params_init(&q->p_params);
        sch->limit = 10 * 1024;
        q->p_params.limit = sch->limit;
        q->quantum = psched_mtu(qdisc_dev(sch));
        q->sch = sch;
        q->ecn_prob = 10;
        q->flows_cnt = 1024;
        q->memory_limit = SZ_32M;

        INIT_LIST_HEAD(&q->new_flows);
        INIT_LIST_HEAD(&q->old_flows);
        timer_setup(&q->adapt_timer, fq_pie_timer, 0);

        if (opt) {
                err = fq_pie_change(sch, opt, extack);

                if (err)
                        return err;
        }

        err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
        if (err)
                goto init_failure;

        q->flows = kvcalloc(q->flows_cnt, sizeof(struct fq_pie_flow),
                            GFP_KERNEL);
        if (!q->flows) {
                err = -ENOMEM;
                goto init_failure;
        }
        for (idx = 0; idx < q->flows_cnt; idx++) {
                struct fq_pie_flow *flow = q->flows + idx;

                INIT_LIST_HEAD(&flow->flowchain);
                pie_vars_init(&flow->vars);
        }

        mod_timer(&q->adapt_timer, jiffies + HZ / 2);

        return 0;

init_failure:
        q->flows_cnt = 0;

        return err;
}

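/* Report the current configuration to user space; target is converted back
 * from psched ticks to microseconds and tupdate from jiffies to microseconds.
 */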
static int fq_pie_dump(struct Qdisc *sch, struct sk_buff *skb)
{
        struct fq_pie_sched_data *q = qdisc_priv(sch);
        struct nlattr *opts;

        opts = nla_nest_start(skb, TCA_OPTIONS);
        if (!opts)
                return -EMSGSIZE;

        /* convert target from pschedtime to us */
        if (nla_put_u32(skb, TCA_FQ_PIE_LIMIT, sch->limit) ||
            nla_put_u32(skb, TCA_FQ_PIE_FLOWS, q->flows_cnt) ||
            nla_put_u32(skb, TCA_FQ_PIE_TARGET,
                        ((u32)PSCHED_TICKS2NS(q->p_params.target)) /
                        NSEC_PER_USEC) ||
            nla_put_u32(skb, TCA_FQ_PIE_TUPDATE,
                        jiffies_to_usecs(q->p_params.tupdate)) ||
            nla_put_u32(skb, TCA_FQ_PIE_ALPHA, q->p_params.alpha) ||
            nla_put_u32(skb, TCA_FQ_PIE_BETA, q->p_params.beta) ||
            nla_put_u32(skb, TCA_FQ_PIE_QUANTUM, q->quantum) ||
            nla_put_u32(skb, TCA_FQ_PIE_MEMORY_LIMIT, q->memory_limit) ||
            nla_put_u32(skb, TCA_FQ_PIE_ECN_PROB, q->ecn_prob) ||
            nla_put_u32(skb, TCA_FQ_PIE_ECN, q->p_params.ecn) ||
            nla_put_u32(skb, TCA_FQ_PIE_BYTEMODE, q->p_params.bytemode) ||
            nla_put_u32(skb, TCA_FQ_PIE_DQ_RATE_ESTIMATOR,
                        q->p_params.dq_rate_estimator))
                goto nla_put_failure;

        return nla_nest_end(skb, opts);

nla_put_failure:
        nla_nest_cancel(skb, opts);
        return -EMSGSIZE;
}

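/* Export extended statistics, including the instantaneous lengths of the
 * new_flows and old_flows lists.
 */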
static int fq_pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
        struct fq_pie_sched_data *q = qdisc_priv(sch);
        struct tc_fq_pie_xstats st = {
                .packets_in     = q->stats.packets_in,
                .overlimit      = q->stats.overlimit,
                .overmemory     = q->overmemory,
                .dropped        = q->stats.dropped,
                .ecn_mark       = q->stats.ecn_mark,
                .new_flow_count = q->new_flow_count,
                .memory_usage   = q->memory_usage,
        };
        struct list_head *pos;

        sch_tree_lock(sch);
        list_for_each(pos, &q->new_flows)
                st.new_flows_len++;

        list_for_each(pos, &q->old_flows)
                st.old_flows_len++;
        sch_tree_unlock(sch);

        return gnet_stats_copy_app(d, &st, sizeof(st));
}

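/* Drop all queued packets and reinitialise the per-flow PIE state. */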
static void fq_pie_reset(struct Qdisc *sch)
{
        struct fq_pie_sched_data *q = qdisc_priv(sch);
        u32 idx;

        INIT_LIST_HEAD(&q->new_flows);
        INIT_LIST_HEAD(&q->old_flows);
        for (idx = 0; idx < q->flows_cnt; idx++) {
                struct fq_pie_flow *flow = q->flows + idx;

                /* Removes all packets from flow */
                rtnl_kfree_skbs(flow->head, flow->tail);
                flow->head = NULL;

                INIT_LIST_HEAD(&flow->flowchain);
                pie_vars_init(&flow->vars);
        }

        sch->q.qlen = 0;
        sch->qstats.backlog = 0;
}

static void fq_pie_destroy(struct Qdisc *sch)
{
        struct fq_pie_sched_data *q = qdisc_priv(sch);

        tcf_block_put(q->block);
        del_timer_sync(&q->adapt_timer);
        kvfree(q->flows);
}

static struct Qdisc_ops fq_pie_qdisc_ops __read_mostly = {
        .id             = "fq_pie",
        .priv_size      = sizeof(struct fq_pie_sched_data),
        .enqueue        = fq_pie_qdisc_enqueue,
        .dequeue        = fq_pie_qdisc_dequeue,
        .peek           = qdisc_peek_dequeued,
        .init           = fq_pie_init,
        .destroy        = fq_pie_destroy,
        .reset          = fq_pie_reset,
        .change         = fq_pie_change,
        .dump           = fq_pie_dump,
        .dump_stats     = fq_pie_dump_stats,
        .owner          = THIS_MODULE,
};

static int __init fq_pie_module_init(void)
{
        return register_qdisc(&fq_pie_qdisc_ops);
}

static void __exit fq_pie_module_exit(void)
{
        unregister_qdisc(&fq_pie_qdisc_ops);
}

module_init(fq_pie_module_init);
module_exit(fq_pie_module_exit);

MODULE_DESCRIPTION("Flow Queue Proportional Integral controller Enhanced (FQ-PIE)");
MODULE_AUTHOR("Mohit P. Tahiliani");
MODULE_LICENSE("GPL");