GNU Linux-libre 4.19.264-gnu1
[releases.git] / net / sched / sch_api.c
1 /*
2  * net/sched/sch_api.c  Packet scheduler API.
3  *
4  *              This program is free software; you can redistribute it and/or
5  *              modify it under the terms of the GNU General Public License
6  *              as published by the Free Software Foundation; either version
7  *              2 of the License, or (at your option) any later version.
8  *
9  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10  *
11  * Fixes:
12  *
13  * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
14  * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
15  * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
16  */
17
18 #include <linux/module.h>
19 #include <linux/types.h>
20 #include <linux/kernel.h>
21 #include <linux/string.h>
22 #include <linux/errno.h>
23 #include <linux/skbuff.h>
24 #include <linux/init.h>
25 #include <linux/proc_fs.h>
26 #include <linux/seq_file.h>
27 #include <linux/kmod.h>
28 #include <linux/list.h>
29 #include <linux/hrtimer.h>
30 #include <linux/lockdep.h>
31 #include <linux/slab.h>
32 #include <linux/hashtable.h>
33
34 #include <net/net_namespace.h>
35 #include <net/sock.h>
36 #include <net/netlink.h>
37 #include <net/pkt_sched.h>
38 #include <net/pkt_cls.h>
39
40 /*
41
42    Short review.
43    -------------
44
45    This file consists of two interrelated parts:
46
47    1. queueing disciplines manager frontend.
48    2. traffic classes manager frontend.
49
50    Generally, queueing discipline ("qdisc") is a black box,
51    which is able to enqueue packets and to dequeue them (when
52    device is ready to send something) in order and at times
53    determined by algorithm hidden in it.
54
55    qdiscs are divided into two categories:
56    - "queues", which have no internal structure visible from outside.
57    - "schedulers", which split all the packets to "traffic classes",
58      using "packet classifiers" (look at cls_api.c)
59
60    In turn, classes may have child qdiscs (as a rule, queues)
61    attached to them etc. etc. etc.
62
63    The goal of the routines in this file is to translate
64    information supplied by user in the form of handles
65    to more intelligible for kernel form, to make some sanity
66    checks and part of work, which is common to all qdiscs
67    and to provide rtnetlink notifications.
68
69    All real intelligent work is done inside qdisc modules.
70
71
72
73    Every discipline has two major routines: enqueue and dequeue.
74
75    ---dequeue
76
77    dequeue usually returns a skb to send. It is allowed to return NULL,
78    but it does not mean that queue is empty, it just means that
79    discipline does not want to send anything this time.
80    Queue is really empty if q->q.qlen == 0.
81    For complicated disciplines with multiple queues q->q is not
82    real packet queue, but however q->q.qlen must be valid.
83
84    ---enqueue
85
86    enqueue returns 0, if packet was enqueued successfully.
87    If packet (this one or another one) was dropped, it returns
88    a non-zero error code.
89    NET_XMIT_DROP        - this packet dropped
90      Expected action: do not backoff, but wait until queue will clear.
91    NET_XMIT_CN          - probably this packet enqueued, but another one dropped.
92      Expected action: backoff or ignore
93
94    Auxiliary routines:
95
96    ---peek
97
98    like dequeue but without removing a packet from the queue
99
100    ---reset
101
102    returns qdisc to initial state: purge all buffers, clear all
103    timers, counters (except for statistics) etc.
104
105    ---init
106
107    initializes newly created qdisc.
108
109    ---destroy
110
111    destroys resources allocated by init and during lifetime of qdisc.
112
113    ---change
114
115    changes qdisc parameters.
116  */
117
118 /* Protects list of registered TC modules. It is pure SMP lock. */
119 static DEFINE_RWLOCK(qdisc_mod_lock);
120
121
122 /************************************************
123  *      Queueing disciplines manipulation.      *
124  ************************************************/
125
126
127 /* The list of all installed queueing disciplines. */
128
/* Head of a singly-linked list chained through Qdisc_ops::next;
 * all traversals and updates are guarded by qdisc_mod_lock above.
 */
129 static struct Qdisc_ops *qdisc_base;
130
131 /* Register/unregister queueing discipline */
132
133 int register_qdisc(struct Qdisc_ops *qops)
134 {
135         struct Qdisc_ops *q, **qp;
136         int rc = -EEXIST;
137
138         write_lock(&qdisc_mod_lock);
139         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
140                 if (!strcmp(qops->id, q->id))
141                         goto out;
142
143         if (qops->enqueue == NULL)
144                 qops->enqueue = noop_qdisc_ops.enqueue;
145         if (qops->peek == NULL) {
146                 if (qops->dequeue == NULL)
147                         qops->peek = noop_qdisc_ops.peek;
148                 else
149                         goto out_einval;
150         }
151         if (qops->dequeue == NULL)
152                 qops->dequeue = noop_qdisc_ops.dequeue;
153
154         if (qops->cl_ops) {
155                 const struct Qdisc_class_ops *cops = qops->cl_ops;
156
157                 if (!(cops->find && cops->walk && cops->leaf))
158                         goto out_einval;
159
160                 if (cops->tcf_block && !(cops->bind_tcf && cops->unbind_tcf))
161                         goto out_einval;
162         }
163
164         qops->next = NULL;
165         *qp = qops;
166         rc = 0;
167 out:
168         write_unlock(&qdisc_mod_lock);
169         return rc;
170
171 out_einval:
172         rc = -EINVAL;
173         goto out;
174 }
175 EXPORT_SYMBOL(register_qdisc);
176
177 int unregister_qdisc(struct Qdisc_ops *qops)
178 {
179         struct Qdisc_ops *q, **qp;
180         int err = -ENOENT;
181
182         write_lock(&qdisc_mod_lock);
183         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
184                 if (q == qops)
185                         break;
186         if (q) {
187                 *qp = q->next;
188                 q->next = NULL;
189                 err = 0;
190         }
191         write_unlock(&qdisc_mod_lock);
192         return err;
193 }
194 EXPORT_SYMBOL(unregister_qdisc);
195
196 /* Get default qdisc if not otherwise specified */
/* Copies the id string of default_qdisc_ops into @name (truncated to
 * @len bytes); the lock guards against a concurrent qdisc_set_default().
 */
197 void qdisc_get_default(char *name, size_t len)
198 {
199         read_lock(&qdisc_mod_lock);
200         strlcpy(name, default_qdisc_ops->id, len);
201         read_unlock(&qdisc_mod_lock);
202 }
203
204 static struct Qdisc_ops *qdisc_lookup_default(const char *name)
205 {
206         struct Qdisc_ops *q = NULL;
207
208         for (q = qdisc_base; q; q = q->next) {
209                 if (!strcmp(name, q->id)) {
210                         if (!try_module_get(q->owner))
211                                 q = NULL;
212                         break;
213                 }
214         }
215
216         return q;
217 }
218
219 /* Set new default qdisc to use */
220 int qdisc_set_default(const char *name)
221 {
222         const struct Qdisc_ops *ops;
223
224         if (!capable(CAP_NET_ADMIN))
225                 return -EPERM;
226
227         write_lock(&qdisc_mod_lock);
228         ops = qdisc_lookup_default(name);
229         if (!ops) {
230                 /* Not found, drop lock and try to load module */
231                 write_unlock(&qdisc_mod_lock);
232                 request_module("sch_%s", name);
233                 write_lock(&qdisc_mod_lock);
234
235                 ops = qdisc_lookup_default(name);
236         }
237
238         if (ops) {
239                 /* Set new default */
240                 module_put(default_qdisc_ops->owner);
241                 default_qdisc_ops = ops;
242         }
243         write_unlock(&qdisc_mod_lock);
244
245         return ops ? 0 : -ENOENT;
246 }
247
248 #ifdef CONFIG_NET_SCH_DEFAULT
249 /* Set default value from kernel config */
/* Late initcall: applies CONFIG_DEFAULT_NET_SCH as the default qdisc
 * once all built-in qdiscs have had a chance to register.
 */
250 static int __init sch_default_qdisc(void)
251 {
252         return qdisc_set_default(CONFIG_DEFAULT_NET_SCH);
253 }
254 late_initcall(sch_default_qdisc);
255 #endif
256
257 /* We know handle. Find qdisc among all qdisc's attached to device
258  * (root qdisc, all its children, children of children etc.)
259  * Note: caller either uses rtnl or rcu_read_lock()
260  */
261
262 static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
263 {
264         struct Qdisc *q;
265
        /* A qdisc without a backing device is not hashed anywhere;
         * only the root itself can match.
         */
266         if (!qdisc_dev(root))
267                 return (root->handle == handle ? root : NULL);
268
269         if (!(root->flags & TCQ_F_BUILTIN) &&
270             root->handle == handle)
271                 return root;
272
        /* Children are published in the per-device hash by
         * qdisc_hash_add(); walk the matching bucket under RCU.
         */
273         hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle) {
274                 if (q->handle == handle)
275                         return q;
276         }
277         return NULL;
278 }
279
280 void qdisc_hash_add(struct Qdisc *q, bool invisible)
281 {
282         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
283                 ASSERT_RTNL();
284                 hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle);
285                 if (invisible)
286                         q->flags |= TCQ_F_INVISIBLE;
287         }
288 }
289 EXPORT_SYMBOL(qdisc_hash_add);
290
291 void qdisc_hash_del(struct Qdisc *q)
292 {
293         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
294                 ASSERT_RTNL();
295                 hash_del_rcu(&q->hash);
296         }
297 }
298 EXPORT_SYMBOL(qdisc_hash_del);
299
300 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
301 {
302         struct Qdisc *q;
303
304         if (!handle)
305                 return NULL;
306         q = qdisc_match_from_root(dev->qdisc, handle);
307         if (q)
308                 goto out;
309
310         if (dev_ingress_queue(dev))
311                 q = qdisc_match_from_root(
312                         dev_ingress_queue(dev)->qdisc_sleeping,
313                         handle);
314 out:
315         return q;
316 }
317
318 struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle)
319 {
320         struct netdev_queue *nq;
321         struct Qdisc *q;
322
323         if (!handle)
324                 return NULL;
325         q = qdisc_match_from_root(dev->qdisc, handle);
326         if (q)
327                 goto out;
328
329         nq = dev_ingress_queue_rcu(dev);
330         if (nq)
331                 q = qdisc_match_from_root(nq->qdisc_sleeping, handle);
332 out:
333         return q;
334 }
335
336 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
337 {
338         unsigned long cl;
339         struct Qdisc *leaf;
340         const struct Qdisc_class_ops *cops = p->ops->cl_ops;
341
342         if (cops == NULL)
343                 return NULL;
344         cl = cops->find(p, classid);
345
346         if (cl == 0)
347                 return NULL;
348         leaf = cops->leaf(p, cl);
349         return leaf;
350 }
351
352 /* Find queueing discipline by name */
353
354 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
355 {
356         struct Qdisc_ops *q = NULL;
357
358         if (kind) {
359                 read_lock(&qdisc_mod_lock);
360                 for (q = qdisc_base; q; q = q->next) {
361                         if (nla_strcmp(kind, q->id) == 0) {
362                                 if (!try_module_get(q->owner))
363                                         q = NULL;
364                                 break;
365                         }
366                 }
367                 read_unlock(&qdisc_mod_lock);
368         }
369         return q;
370 }
371
372 /* The linklayer setting were not transferred from iproute2, in older
373  * versions, and the rate tables lookup systems have been dropped in
374  * the kernel. To keep backward compatible with older iproute2 tc
375  * utils, we detect the linklayer setting by detecting if the rate
376  * table were modified.
377  *
378  * For linklayer ATM table entries, the rate table will be aligned to
379  * 48 bytes, thus some table entries will contain the same value.  The
380  * mpu (min packet unit) is also encoded into the old rate table, thus
381  * starting from the mpu, we find low and high table entries for
382  * mapping this cell.  If these entries contain the same value, when
383  * the rate tables have been modified for linklayer ATM.
384  *
385  * This is done by rounding mpu to the nearest 48 bytes cell/entry,
386  * and then roundup to the next cell, calc the table entry one below,
387  * and compare.
388  */
389 static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
390 {
        /* Round the mpu up to an ATM cell boundary (48 bytes of payload
         * per cell) and pick the table slots just below two consecutive
         * boundaries; an ATM-aligned table stores the same value in
         * both (see the explanation in the comment block above).
         */
391         int low       = roundup(r->mpu, 48);
392         int high      = roundup(low+1, 48);
393         int cell_low  = low >> r->cell_log;
394         int cell_high = (high >> r->cell_log) - 1;
395
396         /* rtab is too inaccurate at rates > 100Mbit/s */
397         if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
398                 pr_debug("TC linklayer: Giving up ATM detection\n");
399                 return TC_LINKLAYER_ETHERNET;
400         }
401
        /* Equal entries across a cell boundary => table was built for
         * ATM framing; otherwise assume plain Ethernet.
         */
402         if ((cell_high > cell_low) && (cell_high < 256)
403             && (rtab[cell_low] == rtab[cell_high])) {
404                 pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
405                          cell_low, cell_high, rtab[cell_high]);
406                 return TC_LINKLAYER_ATM;
407         }
408         return TC_LINKLAYER_ETHERNET;
409 }
410
/* Refcounted cache of rate tables shared between qdiscs; entries are
 * chained through qdisc_rate_table::next.  NOTE(review): updates appear
 * to rely on an outer lock (presumably RTNL) — confirm before touching.
 */
411 static struct qdisc_rate_table *qdisc_rtab_list;
412
413 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
414                                         struct nlattr *tab,
415                                         struct netlink_ext_ack *extack)
416 {
417         struct qdisc_rate_table *rtab;
418
419         if (tab == NULL || r->rate == 0 ||
420             r->cell_log == 0 || r->cell_log >= 32 ||
421             nla_len(tab) != TC_RTAB_SIZE) {
422                 NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching");
423                 return NULL;
424         }
425
426         for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
427                 if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
428                     !memcmp(&rtab->data, nla_data(tab), 1024)) {
429                         rtab->refcnt++;
430                         return rtab;
431                 }
432         }
433
434         rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
435         if (rtab) {
436                 rtab->rate = *r;
437                 rtab->refcnt = 1;
438                 memcpy(rtab->data, nla_data(tab), 1024);
439                 if (r->linklayer == TC_LINKLAYER_UNAWARE)
440                         r->linklayer = __detect_linklayer(r, rtab->data);
441                 rtab->next = qdisc_rtab_list;
442                 qdisc_rtab_list = rtab;
443         } else {
444                 NL_SET_ERR_MSG(extack, "Failed to allocate new qdisc rate table");
445         }
446         return rtab;
447 }
448 EXPORT_SYMBOL(qdisc_get_rtab);
449
450 void qdisc_put_rtab(struct qdisc_rate_table *tab)
451 {
452         struct qdisc_rate_table *rtab, **rtabp;
453
454         if (!tab || --tab->refcnt)
455                 return;
456
457         for (rtabp = &qdisc_rtab_list;
458              (rtab = *rtabp) != NULL;
459              rtabp = &rtab->next) {
460                 if (rtab == tab) {
461                         *rtabp = rtab->next;
462                         kfree(rtab);
463                         return;
464                 }
465         }
466 }
467 EXPORT_SYMBOL(qdisc_put_rtab);
468
/* Refcounted cache of size tables, analogous to qdisc_rtab_list. */
469 static LIST_HEAD(qdisc_stab_list);
470
/* Netlink policy for the TCA_STAB_* attributes parsed by qdisc_get_stab(). */
471 static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
472         [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
473         [TCA_STAB_DATA] = { .type = NLA_BINARY },
474 };
475
/* Parse nested TCA_STAB_* attributes in @opt into a refcounted size
 * table.  An existing cached table with identical spec and data is
 * shared; otherwise a new one is allocated and appended to
 * qdisc_stab_list.  Returns the table or an ERR_PTR (with @extack set).
 */
476 static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt,
477                                                struct netlink_ext_ack *extack)
478 {
479         struct nlattr *tb[TCA_STAB_MAX + 1];
480         struct qdisc_size_table *stab;
481         struct tc_sizespec *s;
482         unsigned int tsize = 0;
483         u16 *tab = NULL;
484         int err;
485
486         err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy, extack);
487         if (err < 0)
488                 return ERR_PTR(err);
489         if (!tb[TCA_STAB_BASE]) {
490                 NL_SET_ERR_MSG(extack, "Size table base attribute is missing");
491                 return ERR_PTR(-EINVAL);
492         }
493
494         s = nla_data(tb[TCA_STAB_BASE]);
495
        /* When the spec advertises table entries, the data attribute
         * must be present and large enough to hold them.
         */
496         if (s->tsize > 0) {
497                 if (!tb[TCA_STAB_DATA]) {
498                         NL_SET_ERR_MSG(extack, "Size table data attribute is missing");
499                         return ERR_PTR(-EINVAL);
500                 }
501                 tab = nla_data(tb[TCA_STAB_DATA]);
502                 tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
503         }
504
505         if (tsize != s->tsize || (!tab && tsize > 0)) {
506                 NL_SET_ERR_MSG(extack, "Invalid size of size table");
507                 return ERR_PTR(-EINVAL);
508         }
509
        /* Share an already-cached table when spec and data both match. */
510         list_for_each_entry(stab, &qdisc_stab_list, list) {
511                 if (memcmp(&stab->szopts, s, sizeof(*s)))
512                         continue;
513                 if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
514                         continue;
515                 stab->refcnt++;
516                 return stab;
517         }
518
        /* Bound the shift amounts used by __qdisc_calculate_pkt_len(). */
519         if (s->size_log > STAB_SIZE_LOG_MAX ||
520             s->cell_log > STAB_SIZE_LOG_MAX) {
521                 NL_SET_ERR_MSG(extack, "Invalid logarithmic size of size table");
522                 return ERR_PTR(-EINVAL);
523         }
524
525         stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
526         if (!stab)
527                 return ERR_PTR(-ENOMEM);
528
529         stab->refcnt = 1;
530         stab->szopts = *s;
531         if (tsize > 0)
532                 memcpy(stab->data, tab, tsize * sizeof(u16));
533
534         list_add_tail(&stab->list, &qdisc_stab_list);
535
536         return stab;
537 }
538
539 static void stab_kfree_rcu(struct rcu_head *head)
540 {
541         kfree(container_of(head, struct qdisc_size_table, rcu));
542 }
543
544 void qdisc_put_stab(struct qdisc_size_table *tab)
545 {
546         if (!tab)
547                 return;
548
549         if (--tab->refcnt == 0) {
550                 list_del(&tab->list);
551                 call_rcu_bh(&tab->rcu, stab_kfree_rcu);
552         }
553 }
554 EXPORT_SYMBOL(qdisc_put_stab);
555
556 static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
557 {
558         struct nlattr *nest;
559
560         nest = nla_nest_start(skb, TCA_STAB);
561         if (nest == NULL)
562                 goto nla_put_failure;
563         if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
564                 goto nla_put_failure;
565         nla_nest_end(skb, nest);
566
567         return skb->len;
568
569 nla_put_failure:
570         return -1;
571 }
572
/* Recompute the skb's effective packet length from size table @stab
 * (overhead, cell alignment, per-slot lookup) and store it in
 * qdisc_skb_cb(skb)->pkt_len.  The result is clamped to at least 1.
 */
573 void __qdisc_calculate_pkt_len(struct sk_buff *skb,
574                                const struct qdisc_size_table *stab)
575 {
576         int pkt_len, slot;
577
578         pkt_len = skb->len + stab->szopts.overhead;
        /* No table entries: only the overhead adjustment applies. */
579         if (unlikely(!stab->szopts.tsize))
580                 goto out;
581
582         slot = pkt_len + stab->szopts.cell_align;
583         if (unlikely(slot < 0))
584                 slot = 0;
585
586         slot >>= stab->szopts.cell_log;
587         if (likely(slot < stab->szopts.tsize))
588                 pkt_len = stab->data[slot];
589         else
                /* Past the end of the table: extrapolate by tiling the
                 * last entry for each full table span, plus the entry
                 * for the remainder.
                 */
590                 pkt_len = stab->data[stab->szopts.tsize - 1] *
591                                 (slot / stab->szopts.tsize) +
592                                 stab->data[slot % stab->szopts.tsize];
593
594         pkt_len <<= stab->szopts.size_log;
595 out:
596         if (unlikely(pkt_len < 1))
597                 pkt_len = 1;
598         qdisc_skb_cb(skb)->pkt_len = pkt_len;
599 }
600 EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
601
602 void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
603 {
604         if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
605                 pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
606                         txt, qdisc->ops->id, qdisc->handle >> 16);
607                 qdisc->flags |= TCQ_F_WARN_NONWC;
608         }
609 }
610 EXPORT_SYMBOL(qdisc_warn_nonwc);
611
/* hrtimer callback: the throttle period expired, so reschedule the
 * root qdisc to give the deferred packets another dequeue attempt.
 */
612 static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
613 {
614         struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
615                                                  timer);
616
617         rcu_read_lock();
618         __netif_schedule(qdisc_root(wd->qdisc));
619         rcu_read_unlock();
620
621         return HRTIMER_NORESTART;
622 }
623
/* Prepare watchdog @wd to throttle @qdisc using hrtimer clock @clockid. */
624 void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc,
625                                  clockid_t clockid)
626 {
627         hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED);
628         wd->timer.function = qdisc_watchdog;
629         wd->qdisc = qdisc;
630 }
631 EXPORT_SYMBOL(qdisc_watchdog_init_clockid);
632
/* Watchdog initialisation with the default CLOCK_MONOTONIC time base. */
633 void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
634 {
635         qdisc_watchdog_init_clockid(wd, qdisc, CLOCK_MONOTONIC);
636 }
637 EXPORT_SYMBOL(qdisc_watchdog_init);
638
/* Arm the watchdog to fire at absolute time @expires (nanoseconds).
 * No-op while the root qdisc is deactivated, or when the timer is
 * already programmed for the same expiry.
 */
639 void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires)
640 {
641         if (test_bit(__QDISC_STATE_DEACTIVATED,
642                      &qdisc_root_sleeping(wd->qdisc)->state))
643                 return;
644
645         if (wd->last_expires == expires)
646                 return;
647
648         wd->last_expires = expires;
649         hrtimer_start(&wd->timer,
650                       ns_to_ktime(expires),
651                       HRTIMER_MODE_ABS_PINNED);
652 }
653 EXPORT_SYMBOL(qdisc_watchdog_schedule_ns);
654
/* Cancel a pending watchdog timer (hrtimer_cancel waits for a running
 * callback to finish).
 */
655 void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
656 {
657         hrtimer_cancel(&wd->timer);
658 }
659 EXPORT_SYMBOL(qdisc_watchdog_cancel);
660
661 static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
662 {
663         struct hlist_head *h;
664         unsigned int i;
665
666         h = kvmalloc_array(n, sizeof(struct hlist_head), GFP_KERNEL);
667
668         if (h != NULL) {
669                 for (i = 0; i < n; i++)
670                         INIT_HLIST_HEAD(&h[i]);
671         }
672         return h;
673 }
674
/* Double @clhash's bucket count once the load factor exceeds 0.75.
 * The new table is allocated outside the qdisc tree lock; allocation
 * failure is non-fatal (the old table is simply kept).  Entries are
 * rehashed and the table pointers swapped under sch_tree_lock().
 */
675 void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
676 {
677         struct Qdisc_class_common *cl;
678         struct hlist_node *next;
679         struct hlist_head *nhash, *ohash;
680         unsigned int nsize, nmask, osize;
681         unsigned int i, h;
682
683         /* Rehash when load factor exceeds 0.75 */
684         if (clhash->hashelems * 4 <= clhash->hashsize * 3)
685                 return;
686         nsize = clhash->hashsize * 2;
687         nmask = nsize - 1;
688         nhash = qdisc_class_hash_alloc(nsize);
689         if (nhash == NULL)
690                 return;
691
692         ohash = clhash->hash;
693         osize = clhash->hashsize;
694
695         sch_tree_lock(sch);
696         for (i = 0; i < osize; i++) {
697                 hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
698                         h = qdisc_class_hash(cl->classid, nmask);
699                         hlist_add_head(&cl->hnode, &nhash[h]);
700                 }
701         }
702         clhash->hash     = nhash;
703         clhash->hashsize = nsize;
704         clhash->hashmask = nmask;
705         sch_tree_unlock(sch);
706
707         kvfree(ohash);
708 }
709 EXPORT_SYMBOL(qdisc_class_hash_grow);
710
711 int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
712 {
713         unsigned int size = 4;
714
715         clhash->hash = qdisc_class_hash_alloc(size);
716         if (!clhash->hash)
717                 return -ENOMEM;
718         clhash->hashsize  = size;
719         clhash->hashmask  = size - 1;
720         clhash->hashelems = 0;
721         return 0;
722 }
723 EXPORT_SYMBOL(qdisc_class_hash_init);
724
/* Free the bucket array allocated by qdisc_class_hash_init()/_grow(). */
725 void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
726 {
727         kvfree(clhash->hash);
728 }
729 EXPORT_SYMBOL(qdisc_class_hash_destroy);
730
731 void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
732                              struct Qdisc_class_common *cl)
733 {
734         unsigned int h;
735
736         INIT_HLIST_NODE(&cl->hnode);
737         h = qdisc_class_hash(cl->classid, clhash->hashmask);
738         hlist_add_head(&cl->hnode, &clhash->hash[h]);
739         clhash->hashelems++;
740 }
741 EXPORT_SYMBOL(qdisc_class_hash_insert);
742
/* Unlink class @cl from @clhash and update the element count. */
743 void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
744                              struct Qdisc_class_common *cl)
745 {
746         hlist_del(&cl->hnode);
747         clhash->hashelems--;
748 }
749 EXPORT_SYMBOL(qdisc_class_hash_remove);
750
751 /* Allocate an unique handle from space managed by kernel
752  * Possible range is [8000-FFFF]:0000 (0x8000 values)
753  */
754 static u32 qdisc_alloc_handle(struct net_device *dev)
755 {
756         int i = 0x8000;
757         static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
758
759         do {
760                 autohandle += TC_H_MAKE(0x10000U, 0);
761                 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
762                         autohandle = TC_H_MAKE(0x80000000U, 0);
763                 if (!qdisc_lookup(dev, autohandle))
764                         return autohandle;
765                 cond_resched();
766         } while (--i > 0);
767
768         return 0;
769 }
770
/* Walk up from @sch and subtract @n packets / @len bytes from the
 * qlen/backlog of every ancestor qdisc, notifying classful parents
 * whose child just became empty so they can deactivate the class.
 * The walk stops at ingress, TCQ_F_NOPARENT, or the root.
 */
771 void qdisc_tree_reduce_backlog(struct Qdisc *sch, unsigned int n,
772                                unsigned int len)
773 {
774         bool qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED;
775         const struct Qdisc_class_ops *cops;
776         unsigned long cl;
777         u32 parentid;
778         bool notify;
779         int drops;
780
781         if (n == 0 && len == 0)
782                 return;
783         drops = max_t(int, n, 0);
784         rcu_read_lock();
785         while ((parentid = sch->parent)) {
786                 if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
787                         break;
788
789                 if (sch->flags & TCQ_F_NOPARENT)
790                         break;
791                 /* Notify parent qdisc only if child qdisc becomes empty.
792                  *
793                  * If child was empty even before update then backlog
794                  * counter is screwed and we skip notification because
795                  * parent class is already passive.
796                  *
797                  * If the original child was offloaded then it is allowed
798                  * to be seem as empty, so the parent is notified anyway.
799                  */
800                 notify = !sch->q.qlen && !WARN_ON_ONCE(!n &&
801                                                        !qdisc_is_offloaded);
802                 /* TODO: perform the search on a per txq basis */
803                 sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
804                 if (sch == NULL) {
805                         WARN_ON_ONCE(parentid != TC_H_ROOT);
806                         break;
807                 }
808                 cops = sch->ops->cl_ops;
809                 if (notify && cops->qlen_notify) {
810                         cl = cops->find(sch, parentid);
811                         cops->qlen_notify(sch, cl);
812                 }
                /* Every ancestor also accounts the packets as drops. */
813                 sch->q.qlen -= n;
814                 sch->qstats.backlog -= len;
815                 __qdisc_qstats_drop(sch, drops);
816         }
817         rcu_read_unlock();
818 }
819 EXPORT_SYMBOL(qdisc_tree_reduce_backlog);
820
/* Fill @skb with a complete netlink description of qdisc @q: tcmsg
 * header, kind, ingress/egress block indices, qdisc-specific options,
 * offload flag, size table, and statistics.  Returns skb->len on
 * success, or -1 when the message does not fit (the skb is trimmed
 * back to its initial tail @b).
 */
821 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
822                          u32 portid, u32 seq, u16 flags, int event)
823 {
824         struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL;
825         struct gnet_stats_queue __percpu *cpu_qstats = NULL;
826         struct tcmsg *tcm;
827         struct nlmsghdr  *nlh;
828         unsigned char *b = skb_tail_pointer(skb);
829         struct gnet_dump d;
830         struct qdisc_size_table *stab;
831         u32 block_index;
832         __u32 qlen;
833
834         cond_resched();
835         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
836         if (!nlh)
837                 goto out_nlmsg_trim;
838         tcm = nlmsg_data(nlh);
839         tcm->tcm_family = AF_UNSPEC;
840         tcm->tcm__pad1 = 0;
841         tcm->tcm__pad2 = 0;
842         tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
843         tcm->tcm_parent = clid;
844         tcm->tcm_handle = q->handle;
        /* tcm_info carries the current refcount, not a tc "info" word. */
845         tcm->tcm_info = refcount_read(&q->refcnt);
846         if (nla_put_string(skb, TCA_KIND, q->ops->id))
847                 goto nla_put_failure;
        /* Shared filter block indices are only dumped when non-zero. */
848         if (q->ops->ingress_block_get) {
849                 block_index = q->ops->ingress_block_get(q);
850                 if (block_index &&
851                     nla_put_u32(skb, TCA_INGRESS_BLOCK, block_index))
852                         goto nla_put_failure;
853         }
854         if (q->ops->egress_block_get) {
855                 block_index = q->ops->egress_block_get(q);
856                 if (block_index &&
857                     nla_put_u32(skb, TCA_EGRESS_BLOCK, block_index))
858                         goto nla_put_failure;
859         }
860         if (q->ops->dump && q->ops->dump(q, skb) < 0)
861                 goto nla_put_failure;
862         if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED)))
863                 goto nla_put_failure;
864         qlen = qdisc_qlen_sum(q);
865
866         stab = rtnl_dereference(q->stab);
867         if (stab && qdisc_dump_stab(skb, stab) < 0)
868                 goto nla_put_failure;
869
870         if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
871                                          NULL, &d, TCA_PAD) < 0)
872                 goto nla_put_failure;
873
874         if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
875                 goto nla_put_failure;
876
877         if (qdisc_is_percpu_stats(q)) {
878                 cpu_bstats = q->cpu_bstats;
879                 cpu_qstats = q->cpu_qstats;
880         }
881
882         if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q),
883                                   &d, cpu_bstats, &q->bstats) < 0 ||
884             gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
885             gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
886                 goto nla_put_failure;
887
888         if (gnet_stats_finish_copy(&d) < 0)
889                 goto nla_put_failure;
890
891         nlh->nlmsg_len = skb_tail_pointer(skb) - b;
892         return skb->len;
893
894 out_nlmsg_trim:
895 nla_put_failure:
896         nlmsg_trim(skb, b);
897         return -1;
898 }
899
900 static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible)
901 {
902         if (q->flags & TCQ_F_BUILTIN)
903                 return true;
904         if ((q->flags & TCQ_F_INVISIBLE) && !dump_invisible)
905                 return true;
906
907         return false;
908 }
909
/* Build and send an RTM_DELQDISC/RTM_NEWQDISC message pair describing
 * the transition @old -> @new to the RTNLGRP_TC multicast group (and
 * echo it to the requester when NLM_F_ECHO is set on @n).
 */
910 static int qdisc_notify(struct net *net, struct sk_buff *oskb,
911                         struct nlmsghdr *n, u32 clid,
912                         struct Qdisc *old, struct Qdisc *new)
913 {
914         struct sk_buff *skb;
915         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
916
917         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
918         if (!skb)
919                 return -ENOBUFS;
920
921         if (old && !tc_qdisc_dump_ignore(old, false)) {
922                 if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
923                                   0, RTM_DELQDISC) < 0)
924                         goto err_out;
925         }
926         if (new && !tc_qdisc_dump_ignore(new, false)) {
927                 if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
928                                   old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
929                         goto err_out;
930         }
931
        /* Nothing was filled in (both qdiscs ignored): treat as error. */
932         if (skb->len)
933                 return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
934                                       n->nlmsg_flags & NLM_F_ECHO);
935
936 err_out:
937         kfree_skb(skb);
938         return -EINVAL;
939 }
940
941 static void notify_and_destroy(struct net *net, struct sk_buff *skb,
942                                struct nlmsghdr *n, u32 clid,
943                                struct Qdisc *old, struct Qdisc *new)
944 {
945         if (new || old)
946                 qdisc_notify(net, skb, n, clid, old, new);
947
948         if (old)
949                 qdisc_put(old);
950 }
951
/* Graft qdisc "new" to class "classid" of qdisc "parent" or
 * to device "dev".
 *
 * When appropriate, send a netlink notification using "skb"
 * and "n".
 *
 * On success, destroy the old qdisc.
 */
960
static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
		       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
		       struct Qdisc *new, struct Qdisc *old,
		       struct netlink_ext_ack *extack)
{
	struct Qdisc *q = old;
	struct net *net = dev_net(dev);
	int err = 0;

	if (parent == NULL) {
		/* Grafting directly onto the device: either as root over
		 * all TX queues, or as the single ingress qdisc.
		 */
		unsigned int i, num_q, ingress;

		ingress = 0;
		num_q = dev->num_tx_queues;
		if ((q && q->flags & TCQ_F_INGRESS) ||
		    (new && new->flags & TCQ_F_INGRESS)) {
			num_q = 1;
			ingress = 1;
			if (!dev_ingress_queue(dev)) {
				NL_SET_ERR_MSG(extack, "Device does not have an ingress queue");
				return -ENOENT;
			}
		}

		/* Quiesce the device while its qdiscs are being swapped. */
		if (dev->flags & IFF_UP)
			dev_deactivate(dev);

		/* Qdiscs with an ->attach() hook distribute themselves over
		 * the TX queues; skip the manual per-queue graft loop.
		 */
		if (new && new->ops->attach)
			goto skip;

		for (i = 0; i < num_q; i++) {
			struct netdev_queue *dev_queue = dev_ingress_queue(dev);

			if (!ingress)
				dev_queue = netdev_get_tx_queue(dev, i);

			old = dev_graft_qdisc(dev_queue, new);
			/* Every queue beyond the first holds its own ref. */
			if (new && i > 0)
				qdisc_refcount_inc(new);

			if (!ingress)
				qdisc_put(old);
		}

skip:
		if (!ingress) {
			notify_and_destroy(net, skb, n, classid,
					   dev->qdisc, new);
			/* dev->qdisc keeps one reference of its own. */
			if (new && !new->ops->attach)
				qdisc_refcount_inc(new);
			dev->qdisc = new ? : &noop_qdisc;

			if (new && new->ops->attach)
				new->ops->attach(new);
		} else {
			notify_and_destroy(net, skb, n, classid, old, new);
		}

		if (dev->flags & IFF_UP)
			dev_activate(dev);
	} else {
		/* Grafting into a class of an existing (classful) qdisc. */
		const struct Qdisc_class_ops *cops = parent->ops->cl_ops;

		/* Only support running class lockless if parent is lockless */
		if (new && (new->flags & TCQ_F_NOLOCK) &&
		    parent && !(parent->flags & TCQ_F_NOLOCK))
			new->flags &= ~TCQ_F_NOLOCK;

		err = -EOPNOTSUPP;
		if (cops && cops->graft) {
			unsigned long cl = cops->find(parent, classid);

			if (cl) {
				err = cops->graft(parent, cl, new, &old,
						  extack);
			} else {
				NL_SET_ERR_MSG(extack, "Specified class not found");
				err = -ENOENT;
			}
		}
		if (!err)
			notify_and_destroy(net, skb, n, classid, old, new);
	}
	return err;
}
1046
1047 static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca,
1048                                    struct netlink_ext_ack *extack)
1049 {
1050         u32 block_index;
1051
1052         if (tca[TCA_INGRESS_BLOCK]) {
1053                 block_index = nla_get_u32(tca[TCA_INGRESS_BLOCK]);
1054
1055                 if (!block_index) {
1056                         NL_SET_ERR_MSG(extack, "Ingress block index cannot be 0");
1057                         return -EINVAL;
1058                 }
1059                 if (!sch->ops->ingress_block_set) {
1060                         NL_SET_ERR_MSG(extack, "Ingress block sharing is not supported");
1061                         return -EOPNOTSUPP;
1062                 }
1063                 sch->ops->ingress_block_set(sch, block_index);
1064         }
1065         if (tca[TCA_EGRESS_BLOCK]) {
1066                 block_index = nla_get_u32(tca[TCA_EGRESS_BLOCK]);
1067
1068                 if (!block_index) {
1069                         NL_SET_ERR_MSG(extack, "Egress block index cannot be 0");
1070                         return -EINVAL;
1071                 }
1072                 if (!sch->ops->egress_block_set) {
1073                         NL_SET_ERR_MSG(extack, "Egress block sharing is not supported");
1074                         return -EOPNOTSUPP;
1075                 }
1076                 sch->ops->egress_block_set(sch, block_index);
1077         }
1078         return 0;
1079 }
1080
1081 /* lockdep annotation is needed for ingress; egress gets it only for name */
1082 static struct lock_class_key qdisc_tx_lock;
1083 static struct lock_class_key qdisc_rx_lock;
1084
1085 /*
1086    Allocate and initialize new qdisc.
1087
1088    Parameters are passed via opt.
1089  */
1090
static struct Qdisc *qdisc_create(struct net_device *dev,
				  struct netdev_queue *dev_queue,
				  struct Qdisc *p, u32 parent, u32 handle,
				  struct nlattr **tca, int *errp,
				  struct netlink_ext_ack *extack)
{
	int err;
	struct nlattr *kind = tca[TCA_KIND];
	struct Qdisc *sch;
	struct Qdisc_ops *ops;
	struct qdisc_size_table *stab;

	ops = qdisc_lookup_ops(kind);
#ifdef CONFIG_MODULES
	/* The requested kind may live in a not-yet-loaded module. */
	if (ops == NULL && kind != NULL) {
		char name[IFNAMSIZ];
		if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
			/* We dropped the RTNL semaphore in order to
			 * perform the module load.  So, even if we
			 * succeeded in loading the module we have to
			 * tell the caller to replay the request.  We
			 * indicate this using -EAGAIN.
			 * We replay the request because the device may
			 * go away in the mean time.
			 */
			rtnl_unlock();
			request_module("sch_%s", name);
			rtnl_lock();
			ops = qdisc_lookup_ops(kind);
			if (ops != NULL) {
				/* We will try again qdisc_lookup_ops,
				 * so don't keep a reference.
				 */
				module_put(ops->owner);
				err = -EAGAIN;
				goto err_out;
			}
		}
	}
#endif

	err = -ENOENT;
	if (!ops) {
		NL_SET_ERR_MSG(extack, "Specified qdisc kind is unknown");
		goto err_out;
	}

	sch = qdisc_alloc(dev_queue, ops, extack);
	if (IS_ERR(sch)) {
		err = PTR_ERR(sch);
		goto err_out2;
	}

	sch->parent = parent;

	if (handle == TC_H_INGRESS) {
		/* Ingress qdiscs get a fixed handle and their own lockdep
		 * class, distinct from egress (TX) qdisc locks.
		 */
		sch->flags |= TCQ_F_INGRESS;
		handle = TC_H_MAKE(TC_H_INGRESS, 0);
		lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
	} else {
		/* Egress: auto-allocate a handle if the caller gave none. */
		if (handle == 0) {
			handle = qdisc_alloc_handle(dev);
			err = -ENOMEM;
			if (handle == 0)
				goto err_out3;
		}
		lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
		if (!netif_is_multiqueue(dev))
			sch->flags |= TCQ_F_ONETXQUEUE;
	}

	sch->handle = handle;

	/* This exist to keep backward compatible with a userspace
	 * loophole, what allowed userspace to get IFF_NO_QUEUE
	 * facility on older kernels by setting tx_queue_len=0 (prior
	 * to qdisc init), and then forgot to reinit tx_queue_len
	 * before again attaching a qdisc.
	 */
	if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) {
		dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
		netdev_info(dev, "Caught tx_queue_len zero misconfig\n");
	}

	err = qdisc_block_indexes_set(sch, tca, extack);
	if (err)
		goto err_out3;

	if (ops->init) {
		err = ops->init(sch, tca[TCA_OPTIONS], extack);
		if (err != 0)
			goto err_out5;
	}

	if (tca[TCA_STAB]) {
		stab = qdisc_get_stab(tca[TCA_STAB], extack);
		if (IS_ERR(stab)) {
			err = PTR_ERR(stab);
			goto err_out4;
		}
		rcu_assign_pointer(sch->stab, stab);
	}
	if (tca[TCA_RATE]) {
		seqcount_t *running;

		err = -EOPNOTSUPP;
		if (sch->flags & TCQ_F_MQROOT) {
			NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc");
			goto err_out4;
		}

		/* Pick the seqcount guarding bstats updates for this qdisc:
		 * the root's sleeping seqcount where possible, otherwise
		 * this qdisc's own.
		 */
		if (sch->parent != TC_H_ROOT &&
		    !(sch->flags & TCQ_F_INGRESS) &&
		    (!p || !(p->flags & TCQ_F_MQROOT)))
			running = qdisc_root_sleeping_running(sch);
		else
			running = &sch->running;

		err = gen_new_estimator(&sch->bstats,
					sch->cpu_bstats,
					&sch->rate_est,
					NULL,
					running,
					tca[TCA_RATE]);
		if (err) {
			NL_SET_ERR_MSG(extack, "Failed to generate new estimator");
			goto err_out4;
		}
	}

	qdisc_hash_add(sch, false);

	return sch;

err_out5:
	/* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */
	if (ops->destroy)
		ops->destroy(sch);
err_out3:
	dev_put(dev);
	qdisc_free(sch);
err_out2:
	module_put(ops->owner);
err_out:
	*errp = err;
	return NULL;

err_out4:
	/*
	 * Any broken qdiscs that would require a ops->reset() here?
	 * The qdisc was never in action so it shouldn't be necessary.
	 */
	qdisc_put_stab(rtnl_dereference(sch->stab));
	if (ops->destroy)
		ops->destroy(sch);
	goto err_out3;
}
1248
/* Apply a change request to an existing qdisc: new options (via the
 * qdisc's ->change op), a new size table, and/or a new rate estimator.
 * Block indexes cannot be changed after creation.
 */
static int qdisc_change(struct Qdisc *sch, struct nlattr **tca,
			struct netlink_ext_ack *extack)
{
	struct qdisc_size_table *ostab, *stab = NULL;
	int err = 0;

	if (tca[TCA_OPTIONS]) {
		if (!sch->ops->change) {
			NL_SET_ERR_MSG(extack, "Change operation not supported by specified qdisc");
			return -EINVAL;
		}
		if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
			NL_SET_ERR_MSG(extack, "Change of blocks is not supported");
			return -EOPNOTSUPP;
		}
		err = sch->ops->change(sch, tca[TCA_OPTIONS], extack);
		if (err)
			return err;
	}

	if (tca[TCA_STAB]) {
		stab = qdisc_get_stab(tca[TCA_STAB], extack);
		if (IS_ERR(stab))
			return PTR_ERR(stab);
	}

	/* Swap in the new size table (NULL when none was supplied) and
	 * drop the reference on the old one.
	 */
	ostab = rtnl_dereference(sch->stab);
	rcu_assign_pointer(sch->stab, stab);
	qdisc_put_stab(ostab);

	if (tca[TCA_RATE]) {
		/* NB: ignores errors from replace_estimator
		   because change can't be undone. */
		if (sch->flags & TCQ_F_MQROOT)
			goto out;
		gen_replace_estimator(&sch->bstats,
				      sch->cpu_bstats,
				      &sch->rate_est,
				      NULL,
				      qdisc_root_sleeping_running(sch),
				      tca[TCA_RATE]);
	}
out:
	return 0;
}
1294
/* Walker state used to detect cycles before grafting qdisc p. */
struct check_loop_arg {
	struct qdisc_walker	w;	/* must be first: cast back from *w */
	struct Qdisc		*p;	/* qdisc being grafted; seeing it again means a loop */
	int			depth;	/* current recursion depth */
};
1300
1301 static int check_loop_fn(struct Qdisc *q, unsigned long cl,
1302                          struct qdisc_walker *w);
1303
1304 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
1305 {
1306         struct check_loop_arg   arg;
1307
1308         if (q->ops->cl_ops == NULL)
1309                 return 0;
1310
1311         arg.w.stop = arg.w.skip = arg.w.count = 0;
1312         arg.w.fn = check_loop_fn;
1313         arg.depth = depth;
1314         arg.p = p;
1315         q->ops->cl_ops->walk(q, &arg.w);
1316         return arg.w.stop ? -ELOOP : 0;
1317 }
1318
1319 static int
1320 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
1321 {
1322         struct Qdisc *leaf;
1323         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1324         struct check_loop_arg *arg = (struct check_loop_arg *)w;
1325
1326         leaf = cops->leaf(q, cl);
1327         if (leaf) {
1328                 if (leaf == arg->p || arg->depth > 7)
1329                         return -ELOOP;
1330                 return check_loop(leaf, arg->p, arg->depth + 1);
1331         }
1332         return 0;
1333 }
1334
/* Netlink attribute policy shared by all RTM_*QDISC/*TCLASS requests. */
const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = {
	[TCA_KIND]		= { .type = NLA_NUL_STRING,
				    .len = IFNAMSIZ - 1 },
	[TCA_RATE]		= { .type = NLA_BINARY,
				    .len = sizeof(struct tc_estimator) },
	[TCA_STAB]		= { .type = NLA_NESTED },
	[TCA_DUMP_INVISIBLE]	= { .type = NLA_FLAG },
	[TCA_CHAIN]		= { .type = NLA_U32 },
	[TCA_INGRESS_BLOCK]	= { .type = NLA_U32 },
	[TCA_EGRESS_BLOCK]	= { .type = NLA_U32 },
};
1346
1347 /*
1348  * Delete/get qdisc.
1349  */
1350
/* Handle RTM_DELQDISC and RTM_GETQDISC requests.  The target qdisc is
 * located either via its parent classid (tcm_parent) or directly by
 * handle (tcm_handle).
 */
static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
			struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = nlmsg_data(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q = NULL;
	struct Qdisc *p = NULL;
	int err;

	/* GET is unprivileged; DEL requires CAP_NET_ADMIN in the netns. */
	if ((n->nlmsg_type != RTM_GETQDISC) &&
	    !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy,
			  extack);
	if (err < 0)
		return err;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;

	clid = tcm->tcm_parent;
	if (clid) {
		/* Locate the target through the parent class id. */
		if (clid != TC_H_ROOT) {
			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
				p = qdisc_lookup(dev, TC_H_MAJ(clid));
				if (!p) {
					NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid");
					return -ENOENT;
				}
				q = qdisc_leaf(p, clid);
			} else if (dev_ingress_queue(dev)) {
				q = dev_ingress_queue(dev)->qdisc_sleeping;
			}
		} else {
			q = dev->qdisc;
		}
		if (!q) {
			NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device");
			return -ENOENT;
		}

		/* If a handle was also supplied it must match what we found. */
		if (tcm->tcm_handle && q->handle != tcm->tcm_handle) {
			NL_SET_ERR_MSG(extack, "Invalid handle");
			return -EINVAL;
		}
	} else {
		q = qdisc_lookup(dev, tcm->tcm_handle);
		if (!q) {
			NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified handle");
			return -ENOENT;
		}
	}

	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
		NL_SET_ERR_MSG(extack, "Invalid qdisc name");
		return -EINVAL;
	}

	if (n->nlmsg_type == RTM_DELQDISC) {
		if (!clid) {
			NL_SET_ERR_MSG(extack, "Classid cannot be zero");
			return -EINVAL;
		}
		if (q->handle == 0) {
			NL_SET_ERR_MSG(extack, "Cannot delete qdisc with handle of zero");
			return -ENOENT;
		}
		/* Grafting NULL in place of q detaches and destroys it. */
		err = qdisc_graft(dev, p, skb, n, clid, NULL, q, extack);
		if (err != 0)
			return err;
	} else {
		qdisc_notify(net, skb, n, clid, NULL, q);
	}
	return 0;
}
1431
1432 /*
1433  * Create/change qdisc.
1434  */
1435
/* Handle RTM_NEWQDISC: create a new qdisc, replace/graft over an
 * existing one, or change an existing one in place, depending on the
 * netlink flags (NLM_F_CREATE/REPLACE/EXCL).
 */
static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
			   struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm;
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q, *p;
	int err;

	if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
		return -EPERM;

replay:
	/* Reinit, just in case something touches this. */
	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy,
			  extack);
	if (err < 0)
		return err;

	tcm = nlmsg_data(n);
	clid = tcm->tcm_parent;
	q = p = NULL;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;


	if (clid) {
		/* Locate the existing child qdisc (if any) of the given
		 * parent class, root, or ingress attachment point.
		 */
		if (clid != TC_H_ROOT) {
			if (clid != TC_H_INGRESS) {
				p = qdisc_lookup(dev, TC_H_MAJ(clid));
				if (!p) {
					NL_SET_ERR_MSG(extack, "Failed to find specified qdisc");
					return -ENOENT;
				}
				q = qdisc_leaf(p, clid);
			} else if (dev_ingress_queue_create(dev)) {
				q = dev_ingress_queue(dev)->qdisc_sleeping;
			}
		} else {
			q = dev->qdisc;
		}

		/* It may be default qdisc, ignore it */
		if (q && q->handle == 0)
			q = NULL;

		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
			if (tcm->tcm_handle) {
				if (q && !(n->nlmsg_flags & NLM_F_REPLACE)) {
					NL_SET_ERR_MSG(extack, "NLM_F_REPLACE needed to override");
					return -EEXIST;
				}
				if (TC_H_MIN(tcm->tcm_handle)) {
					NL_SET_ERR_MSG(extack, "Invalid minor handle");
					return -EINVAL;
				}
				q = qdisc_lookup(dev, tcm->tcm_handle);
				if (!q)
					goto create_n_graft;
				if (n->nlmsg_flags & NLM_F_EXCL) {
					NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override");
					return -EEXIST;
				}
				if (tca[TCA_KIND] &&
				    nla_strcmp(tca[TCA_KIND], q->ops->id)) {
					NL_SET_ERR_MSG(extack, "Invalid qdisc name");
					return -EINVAL;
				}
				/* Refuse to graft a qdisc under itself or
				 * under one of its own descendants.
				 */
				if (q == p ||
				    (p && check_loop(q, p, 0))) {
					NL_SET_ERR_MSG(extack, "Qdisc parent/child loop detected");
					return -ELOOP;
				}
				qdisc_refcount_inc(q);
				goto graft;
			} else {
				if (!q)
					goto create_n_graft;

				/* This magic test requires explanation.
				 *
				 *   We know, that some child q is already
				 *   attached to this parent and have choice:
				 *   either to change it or to create/graft new one.
				 *
				 *   1. We are allowed to create/graft only
				 *   if CREATE and REPLACE flags are set.
				 *
				 *   2. If EXCL is set, requestor wanted to say,
				 *   that qdisc tcm_handle is not expected
				 *   to exist, so that we choose create/graft too.
				 *
				 *   3. The last case is when no flags are set.
				 *   Alas, it is sort of hole in API, we
				 *   cannot decide what to do unambiguously.
				 *   For now we select create/graft, if
				 *   user gave KIND, which does not match existing.
				 */
				if ((n->nlmsg_flags & NLM_F_CREATE) &&
				    (n->nlmsg_flags & NLM_F_REPLACE) &&
				    ((n->nlmsg_flags & NLM_F_EXCL) ||
				     (tca[TCA_KIND] &&
				      nla_strcmp(tca[TCA_KIND], q->ops->id))))
					goto create_n_graft;
			}
		}
	} else {
		if (!tcm->tcm_handle) {
			NL_SET_ERR_MSG(extack, "Handle cannot be zero");
			return -EINVAL;
		}
		q = qdisc_lookup(dev, tcm->tcm_handle);
	}

	/* Change qdisc parameters */
	if (!q) {
		NL_SET_ERR_MSG(extack, "Specified qdisc not found");
		return -ENOENT;
	}
	if (n->nlmsg_flags & NLM_F_EXCL) {
		NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot modify");
		return -EEXIST;
	}
	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
		NL_SET_ERR_MSG(extack, "Invalid qdisc name");
		return -EINVAL;
	}
	err = qdisc_change(q, tca, extack);
	if (err == 0)
		qdisc_notify(net, skb, n, clid, NULL, q);
	return err;

create_n_graft:
	if (!(n->nlmsg_flags & NLM_F_CREATE)) {
		NL_SET_ERR_MSG(extack, "Qdisc not found. To create specify NLM_F_CREATE flag");
		return -ENOENT;
	}
	if (clid == TC_H_INGRESS) {
		if (dev_ingress_queue(dev)) {
			q = qdisc_create(dev, dev_ingress_queue(dev), p,
					 tcm->tcm_parent, tcm->tcm_parent,
					 tca, &err, extack);
		} else {
			NL_SET_ERR_MSG(extack, "Cannot find ingress queue for specified device");
			err = -ENOENT;
		}
	} else {
		struct netdev_queue *dev_queue;

		/* Let a classful parent pick the queue for the child. */
		if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
			dev_queue = p->ops->cl_ops->select_queue(p, tcm);
		else if (p)
			dev_queue = p->dev_queue;
		else
			dev_queue = netdev_get_tx_queue(dev, 0);

		q = qdisc_create(dev, dev_queue, p,
				 tcm->tcm_parent, tcm->tcm_handle,
				 tca, &err, extack);
	}
	if (q == NULL) {
		/* -EAGAIN means a module was loaded; replay the request. */
		if (err == -EAGAIN)
			goto replay;
		return err;
	}

graft:
	err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack);
	if (err) {
		if (q)
			qdisc_put(q);
		return err;
	}

	return 0;
}
1616
/* Dump @root and, when @recur is set, every qdisc hashed on its device.
 * *q_idx_p counts qdiscs considered so far; entries below @s_q_idx are
 * skipped so an interrupted dump can resume.  Returns 0 on completion,
 * -1 when the skb filled up (cursor left in *q_idx_p).
 */
static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
			      struct netlink_callback *cb,
			      int *q_idx_p, int s_q_idx, bool recur,
			      bool dump_invisible)
{
	int ret = 0, q_idx = *q_idx_p;
	struct Qdisc *q;
	int b;

	if (!root)
		return 0;

	q = root;
	if (q_idx < s_q_idx) {
		q_idx++;
	} else {
		if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI,
				  RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}

	/* If dumping singletons, there is no qdisc_dev(root) and the singleton
	 * itself has already been dumped.
	 *
	 * If we've already dumped the top-level (ingress) qdisc above and the global
	 * qdisc hashtable, we don't want to hit it again
	 */
	if (!qdisc_dev(root) || !recur)
		goto out;

	hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
		if (q_idx < s_q_idx) {
			q_idx++;
			continue;
		}
		if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI,
				  RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}

out:
	*q_idx_p = q_idx;
	return ret;
done:
	ret = -1;
	goto out;
}
1670
/* Netlink dump handler for RTM_GETQDISC: walk every device in the netns
 * and dump its egress and ingress qdisc trees.  cb->args[0]/[1] store
 * the device and qdisc cursors so the dump can resume across calls.
 */
static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct net *net = sock_net(skb->sk);
	int idx, q_idx;
	int s_idx, s_q_idx;
	struct net_device *dev;
	const struct nlmsghdr *nlh = cb->nlh;
	struct nlattr *tca[TCA_MAX + 1];
	int err;

	/* Resume where the previous dump call left off. */
	s_idx = cb->args[0];
	s_q_idx = q_idx = cb->args[1];

	idx = 0;
	ASSERT_RTNL();

	err = nlmsg_parse(nlh, sizeof(struct tcmsg), tca, TCA_MAX,
			  rtm_tca_policy, NULL);
	if (err < 0)
		return err;

	for_each_netdev(net, dev) {
		struct netdev_queue *dev_queue;

		if (idx < s_idx)
			goto cont;
		if (idx > s_idx)
			s_q_idx = 0;
		q_idx = 0;

		if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx,
				       true, tca[TCA_DUMP_INVISIBLE]) < 0)
			goto done;

		dev_queue = dev_ingress_queue(dev);
		if (dev_queue &&
		    tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
				       &q_idx, s_q_idx, false,
				       tca[TCA_DUMP_INVISIBLE]) < 0)
			goto done;

cont:
		idx++;
	}

done:
	/* Save the cursors for the next invocation. */
	cb->args[0] = idx;
	cb->args[1] = q_idx;

	return skb->len;
}
1722
1723
1724
1725 /************************************************
1726  *      Traffic classes manipulation.           *
1727  ************************************************/
1728
/* Build one RTM_{NEW,DEL}TCLASS message for class @cl of qdisc @q into
 * @skb: tcmsg header, TCA_KIND, the class's own attributes
 * (cl_ops->dump) and its statistics (cl_ops->dump_stats).
 * Returns skb->len on success, -1 if the skb ran out of room (any
 * partially built message is trimmed off).
 */
static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
			  unsigned long cl,
			  u32 portid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char *b = skb_tail_pointer(skb);	/* rollback point */
	struct gnet_dump d;
	const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;

	/* Class dumps can iterate many entries; yield the CPU if needed. */
	cond_resched();
	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
	if (!nlh)
		goto out_nlmsg_trim;
	tcm = nlmsg_data(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = q->handle;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = 0;
	if (nla_put_string(skb, TCA_KIND, q->ops->id))
		goto nla_put_failure;
	/* Let the qdisc's class ops fill class-specific attributes. */
	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 NULL, &d, TCA_PAD) < 0)
		goto nla_put_failure;

	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

out_nlmsg_trim:
nla_put_failure:
	/* Remove the incomplete message from the skb. */
	nlmsg_trim(skb, b);
	return -1;
}
1774
1775 static int tclass_notify(struct net *net, struct sk_buff *oskb,
1776                          struct nlmsghdr *n, struct Qdisc *q,
1777                          unsigned long cl, int event)
1778 {
1779         struct sk_buff *skb;
1780         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1781
1782         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1783         if (!skb)
1784                 return -ENOBUFS;
1785
1786         if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
1787                 kfree_skb(skb);
1788                 return -EINVAL;
1789         }
1790
1791         return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1792                               n->nlmsg_flags & NLM_F_ECHO);
1793 }
1794
1795 static int tclass_del_notify(struct net *net,
1796                              const struct Qdisc_class_ops *cops,
1797                              struct sk_buff *oskb, struct nlmsghdr *n,
1798                              struct Qdisc *q, unsigned long cl)
1799 {
1800         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1801         struct sk_buff *skb;
1802         int err = 0;
1803
1804         if (!cops->delete)
1805                 return -EOPNOTSUPP;
1806
1807         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1808         if (!skb)
1809                 return -ENOBUFS;
1810
1811         if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0,
1812                            RTM_DELTCLASS) < 0) {
1813                 kfree_skb(skb);
1814                 return -EINVAL;
1815         }
1816
1817         err = cops->delete(q, cl);
1818         if (err) {
1819                 kfree_skb(skb);
1820                 return err;
1821         }
1822
1823         return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1824                               n->nlmsg_flags & NLM_F_ECHO);
1825 }
1826
1827 #ifdef CONFIG_NET_CLS
1828
/* Argument bundle threaded through tp->ops->walk() so tcf_node_bind()
 * can rebind filter results from one class to another.
 */
struct tcf_bind_args {
	struct tcf_walker w;	/* generic walker header; must be first */
	unsigned long base;	/* internal handle of the class being walked */
	unsigned long cl;	/* new class to bind matching filters to (0 = unbind) */
	u32 classid;		/* user-visible classid the filters point at */
};
1835
1836 static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
1837 {
1838         struct tcf_bind_args *a = (void *)arg;
1839
1840         if (tp->ops->bind_class) {
1841                 struct Qdisc *q = tcf_block_q(tp->chain->block);
1842
1843                 sch_tree_lock(q);
1844                 tp->ops->bind_class(n, a->classid, a->cl, q, a->base);
1845                 sch_tree_unlock(q);
1846         }
1847         return 0;
1848 }
1849
/* Rebind every filter on the class's block whose result points at
 * classid @clid so it references @new_cl instead (new_cl == 0 unbinds,
 * used when the class is deleted).  Called under RTNL.
 * NOTE(review): the second parameter is named "portid" but actually
 * carries the parent classid passed down from tc_ctl_tclass().
 */
static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
			   unsigned long new_cl)
{
	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
	struct tcf_block *block;
	struct tcf_chain *chain;
	unsigned long cl;

	cl = cops->find(q, portid);
	if (!cl)
		return;
	if (!cops->tcf_block)
		return;
	block = cops->tcf_block(q, cl, NULL);
	if (!block)
		return;
	/* Walk every filter on every chain of the block and let
	 * tcf_node_bind() rebind results that match @clid.
	 */
	list_for_each_entry(chain, &block->chain_list, list) {
		struct tcf_proto *tp;

		for (tp = rtnl_dereference(chain->filter_chain);
		     tp; tp = rtnl_dereference(tp->next)) {
			struct tcf_bind_args arg = {};

			arg.w.fn = tcf_node_bind;
			arg.classid = clid;
			arg.base = cl;
			arg.cl = new_cl;
			tp->ops->walk(tp, &arg.w);
		}
	}
}
1881
1882 #else
1883
/* Without CONFIG_NET_CLS there are no filters to rebind. */
static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
			   unsigned long new_cl)
{
}
1888
1889 #endif
1890
/* Netlink handler for RTM_NEWTCLASS, RTM_DELTCLASS and RTM_GETTCLASS:
 * resolve the target qdisc and class from tcm_parent/tcm_handle, then
 * create/change, delete or report the class through the qdisc's class
 * ops.  Runs under the RTNL lock.
 */
static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
			 struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = nlmsg_data(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	struct Qdisc *q = NULL;
	const struct Qdisc_class_ops *cops;
	unsigned long cl = 0;
	unsigned long new_cl;
	u32 portid;
	u32 clid;
	u32 qid;
	int err;

	/* Only reads (GET) are permitted without CAP_NET_ADMIN. */
	if ((n->nlmsg_type != RTM_GETTCLASS) &&
	    !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy,
			  extack);
	if (err < 0)
		return err;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;

	/*
	   parent == TC_H_UNSPEC - unspecified parent.
	   parent == TC_H_ROOT   - class is root, which has no parent.
	   parent == X:0         - parent is root class.
	   parent == X:Y         - parent is a node in hierarchy.
	   parent == 0:Y         - parent is X:Y, where X:0 is qdisc.

	   handle == 0:0         - generate handle from kernel pool.
	   handle == 0:Y         - class is X:Y, where X:0 is qdisc.
	   handle == X:Y         - clear.
	   handle == X:0         - root class.
	 */

	/* Step 1. Determine qdisc handle X:0 */

	/* NOTE(review): "portid" below holds tcm_parent (a parent
	 * classid), not a netlink port id - the name is historical.
	 */
	portid = tcm->tcm_parent;
	clid = tcm->tcm_handle;
	qid = TC_H_MAJ(clid);

	if (portid != TC_H_ROOT) {
		u32 qid1 = TC_H_MAJ(portid);

		if (qid && qid1) {
			/* If both majors are known, they must be identical. */
			if (qid != qid1)
				return -EINVAL;
		} else if (qid1) {
			qid = qid1;
		} else if (qid == 0)
			qid = dev->qdisc->handle;

		/* Now qid is genuine qdisc handle consistent
		 * both with parent and child.
		 *
		 * TC_H_MAJ(portid) still may be unspecified, complete it now.
		 */
		if (portid)
			portid = TC_H_MAKE(qid, portid);
	} else {
		if (qid == 0)
			qid = dev->qdisc->handle;
	}

	/* OK. Locate qdisc */
	q = qdisc_lookup(dev, qid);
	if (!q)
		return -ENOENT;

	/* And check that it supports classes */
	cops = q->ops->cl_ops;
	if (cops == NULL)
		return -EINVAL;

	/* Now try to get class */
	if (clid == 0) {
		if (portid == TC_H_ROOT)
			clid = qid;
	} else
		clid = TC_H_MAKE(qid, clid);

	if (clid)
		cl = cops->find(q, clid);

	if (cl == 0) {
		/* Class does not exist: only RTM_NEWTCLASS with
		 * NLM_F_CREATE may fall through to create it below.
		 */
		err = -ENOENT;
		if (n->nlmsg_type != RTM_NEWTCLASS ||
		    !(n->nlmsg_flags & NLM_F_CREATE))
			goto out;
	} else {
		switch (n->nlmsg_type) {
		case RTM_NEWTCLASS:
			err = -EEXIST;
			if (n->nlmsg_flags & NLM_F_EXCL)
				goto out;
			break;
		case RTM_DELTCLASS:
			err = tclass_del_notify(net, cops, skb, n, q, cl);
			/* Unbind the class from filters by rebinding to 0 */
			tc_bind_tclass(q, portid, clid, 0);
			goto out;
		case RTM_GETTCLASS:
			err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
			goto out;
		default:
			err = -EINVAL;
			goto out;
		}
	}

	if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
		NL_SET_ERR_MSG(extack, "Shared blocks are not supported for classes");
		return -EOPNOTSUPP;
	}

	/* Create or change the class via the qdisc's class ops. */
	new_cl = cl;
	err = -EOPNOTSUPP;
	if (cops->change)
		err = cops->change(q, clid, portid, tca, &new_cl, extack);
	if (err == 0) {
		tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
		/* We just created a new class, need to do reverse binding. */
		if (cl != new_cl)
			tc_bind_tclass(q, portid, clid, new_cl);
	}
out:
	return err;
}
2027
/* Walker state used to dump all classes of one qdisc via netlink. */
struct qdisc_dump_args {
	struct qdisc_walker	w;	/* generic walker header; must be first */
	struct sk_buff		*skb;	/* dump skb being filled */
	struct netlink_callback *cb;	/* netlink dump context (portid/seq) */
};
2033
2034 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl,
2035                             struct qdisc_walker *arg)
2036 {
2037         struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
2038
2039         return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
2040                               a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
2041                               RTM_NEWTCLASS);
2042 }
2043
/* Dump all classes of one qdisc @q, honouring the dump resume state.
 * @t_p counts qdiscs visited so far; @s_t is the qdisc index to resume
 * from (cb->args[0]).  cb->args[1] carries how many classes were
 * already dumped for the qdisc being resumed.
 * Returns 0 to continue the dump, -1 when the skb is full.
 */
static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
				struct tcmsg *tcm, struct netlink_callback *cb,
				int *t_p, int s_t)
{
	struct qdisc_dump_args arg;

	/* Skip invisible qdiscs, already-dumped qdiscs, classless
	 * qdiscs, and qdiscs excluded by an explicit tcm_parent filter.
	 */
	if (tc_qdisc_dump_ignore(q, false) ||
	    *t_p < s_t || !q->ops->cl_ops ||
	    (tcm->tcm_parent &&
	     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
		(*t_p)++;
		return 0;
	}
	/* Past the resume point: clear per-qdisc resume state args[1..]. */
	if (*t_p > s_t)
		memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
	arg.w.fn = qdisc_class_dump;
	arg.skb = skb;
	arg.cb = cb;
	arg.w.stop  = 0;
	arg.w.skip = cb->args[1];	/* classes already sent for this qdisc */
	arg.w.count = 0;
	q->ops->cl_ops->walk(q, &arg.w);
	cb->args[1] = arg.w.count;	/* remember progress for next pass */
	if (arg.w.stop)
		return -1;
	(*t_p)++;
	return 0;
}
2072
/* Dump the classes of @root and - when @recur is set - of every qdisc
 * hashed on the owning device.  If the request pinned tcm_parent, only
 * the single matching qdisc's classes are dumped.
 * Returns 0 on completion, -1 when the skb filled up mid-dump.
 */
static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
			       struct tcmsg *tcm, struct netlink_callback *cb,
			       int *t_p, int s_t, bool recur)
{
	struct Qdisc *q;
	int b;

	if (!root)
		return 0;

	if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
		return -1;

	/* Singleton qdiscs (no qdisc_dev) and the non-recursive
	 * (ingress) path stop after the root itself.
	 */
	if (!qdisc_dev(root) || !recur)
		return 0;

	if (tcm->tcm_parent) {
		/* Caller asked for one specific qdisc: find and dump it. */
		q = qdisc_match_from_root(root, TC_H_MAJ(tcm->tcm_parent));
		if (q && q != root &&
		    tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
			return -1;
		return 0;
	}
	hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
		if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
			return -1;
	}

	return 0;
}
2103
/* Netlink dump handler for RTM_GETTCLASS: dump all classes of the
 * target device's qdisc hierarchy, then of its ingress qdisc.
 * cb->args[0] carries the qdisc resume index between invocations.
 */
static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct tcmsg *tcm = nlmsg_data(cb->nlh);
	struct net *net = sock_net(skb->sk);
	struct netdev_queue *dev_queue;
	struct net_device *dev;
	int t, s_t;

	/* Ignore requests whose header is too short to hold a tcmsg. */
	if (nlmsg_len(cb->nlh) < sizeof(*tcm))
		return 0;
	dev = dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return 0;

	s_t = cb->args[0];
	t = 0;

	if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t, true) < 0)
		goto done;

	/* Ingress qdisc is walked non-recursively to avoid duplicates. */
	dev_queue = dev_ingress_queue(dev);
	if (dev_queue &&
	    tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
				&t, s_t, false) < 0)
		goto done;

done:
	cb->args[0] = t;

	/* Drop the reference taken by dev_get_by_index(). */
	dev_put(dev);
	return skb->len;
}
2136
2137 #ifdef CONFIG_PROC_FS
/* seq_file show handler for /proc/net/psched: four hex words used by
 * userspace tc to calibrate its clock - ns per microsecond, ns per
 * PSCHED tick, a constant 1000000 (legacy field), and the hrtimer
 * resolution in ticks per second.
 * NOTE(review): exact field semantics are defined by iproute2's
 * consumer side - confirm against tc's clock setup before relying on
 * this description.
 */
static int psched_show(struct seq_file *seq, void *v)
{
	seq_printf(seq, "%08x %08x %08x %08x\n",
		   (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
		   1000000,
		   (u32)NSEC_PER_SEC / hrtimer_resolution);

	return 0;
}
2147
2148 static int __net_init psched_net_init(struct net *net)
2149 {
2150         struct proc_dir_entry *e;
2151
2152         e = proc_create_single("psched", 0, net->proc_net, psched_show);
2153         if (e == NULL)
2154                 return -ENOMEM;
2155
2156         return 0;
2157 }
2158
/* Remove the per-namespace /proc/net/psched entry. */
static void __net_exit psched_net_exit(struct net *net)
{
	remove_proc_entry("psched", net->proc_net);
}
2163 #else
/* No /proc support: nothing to create. */
static int __net_init psched_net_init(struct net *net)
{
	return 0;
}
2168
/* No /proc support: nothing to remove. */
static void __net_exit psched_net_exit(struct net *net)
{
}
2172 #endif
2173
/* Per-network-namespace setup/teardown of /proc/net/psched. */
static struct pernet_operations psched_net_ops = {
	.init = psched_net_init,
	.exit = psched_net_exit,
};
2178
/* Subsystem init: register the per-netns /proc entry, the built-in
 * qdiscs, and the rtnetlink handlers for qdisc and class messages.
 */
static int __init pktsched_init(void)
{
	int err;

	err = register_pernet_subsys(&psched_net_ops);
	if (err) {
		pr_err("pktsched_init: "
		       "cannot initialize per netns operations\n");
		return err;
	}

	/* Register the always-available built-in qdiscs; their return
	 * values are deliberately ignored here.
	 */
	register_qdisc(&pfifo_fast_ops);
	register_qdisc(&pfifo_qdisc_ops);
	register_qdisc(&bfifo_qdisc_ops);
	register_qdisc(&pfifo_head_drop_qdisc_ops);
	register_qdisc(&mq_qdisc_ops);
	register_qdisc(&noqueue_qdisc_ops);

	/* Wire the RTM_*QDISC / RTM_*TCLASS message types to their
	 * doit/dumpit handlers.
	 */
	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc,
		      0);
	rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass,
		      0);

	return 0;
}
2208
/* Initialize at subsys_initcall time, before device/module initcalls
 * that may attach qdiscs run.
 */
subsys_initcall(pktsched_init);