2 * net/sched/sch_api.c Packet scheduler API.
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
14 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
15 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
18 #include <linux/module.h>
19 #include <linux/types.h>
20 #include <linux/kernel.h>
21 #include <linux/string.h>
22 #include <linux/errno.h>
23 #include <linux/skbuff.h>
24 #include <linux/init.h>
25 #include <linux/proc_fs.h>
26 #include <linux/seq_file.h>
27 #include <linux/kmod.h>
28 #include <linux/list.h>
29 #include <linux/hrtimer.h>
30 #include <linux/lockdep.h>
31 #include <linux/slab.h>
32 #include <linux/hashtable.h>
34 #include <net/net_namespace.h>
36 #include <net/netlink.h>
37 #include <net/pkt_sched.h>
38 #include <net/pkt_cls.h>
45 This file consists of two interrelated parts:
47 1. queueing disciplines manager frontend.
48 2. traffic classes manager frontend.
50 Generally, queueing discipline ("qdisc") is a black box,
51 which is able to enqueue packets and to dequeue them (when
52 device is ready to send something) in order and at times
53 determined by algorithm hidden in it.
55 qdisc's are divided to two categories:
56 - "queues", which have no internal structure visible from outside.
57 - "schedulers", which split all the packets to "traffic classes",
58 using "packet classifiers" (look at cls_api.c)
60 In turn, classes may have child qdiscs (as rule, queues)
61 attached to them etc. etc. etc.
63 The goal of the routines in this file is to translate
64 information supplied by user in the form of handles
65 to more intelligible for kernel form, to make some sanity
66 checks and part of work, which is common to all qdiscs
67 and to provide rtnetlink notifications.
69 All real intelligent work is done inside qdisc modules.
73 Every discipline has two major routines: enqueue and dequeue.
77 dequeue usually returns a skb to send. It is allowed to return NULL,
78 but it does not mean that queue is empty, it just means that
79 discipline does not want to send anything this time.
80 Queue is really empty if q->q.qlen == 0.
81 For complicated disciplines with multiple queues q->q is not
82 real packet queue, but however q->q.qlen must be valid.
86 enqueue returns 0, if packet was enqueued successfully.
87 If packet (this one or another one) was dropped, it returns
89 NET_XMIT_DROP - this packet dropped
90 Expected action: do not backoff, but wait until queue will clear.
91 NET_XMIT_CN - probably this packet enqueued, but another one dropped.
92 Expected action: backoff or ignore
98 like dequeue but without removing a packet from the queue
102 returns qdisc to initial state: purge all buffers, clear all
103 timers, counters (except for statistics) etc.
107 initializes newly created qdisc.
111 destroys resources allocated by init and during lifetime of qdisc.
115 changes qdisc parameters.
118 /* Protects list of registered TC modules. It is pure SMP lock. */
119 static DEFINE_RWLOCK(qdisc_mod_lock);
122 /************************************************
123 * Queueing disciplines manipulation. *
124 ************************************************/
127 /* The list of all installed queueing disciplines. */
129 static struct Qdisc_ops *qdisc_base;
131 /* Register/unregister queueing discipline */
/*
 * register_qdisc - add a queueing-discipline ops struct to the global list.
 * Takes qdisc_mod_lock for writing, rejects duplicate ids, fills in default
 * enqueue/dequeue/peek handlers from noop_qdisc_ops, and validates the
 * classful callbacks.  NOTE(review): this excerpt drops several original
 * lines (duplicate-id error path, out labels, return value) -- verify
 * against the complete source before relying on the control flow shown.
 */
133 int register_qdisc(struct Qdisc_ops *qops)
135 struct Qdisc_ops *q, **qp;
138 write_lock(&qdisc_mod_lock);
/* Walk to the list tail; a matching id means the qdisc is already registered. */
139 for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
140 if (!strcmp(qops->id, q->id))
/* Substitute no-op handlers so callers never hit NULL function pointers. */
143 if (qops->enqueue == NULL)
144 qops->enqueue = noop_qdisc_ops.enqueue;
145 if (qops->peek == NULL) {
146 if (qops->dequeue == NULL)
147 qops->peek = noop_qdisc_ops.peek;
151 if (qops->dequeue == NULL)
152 qops->dequeue = noop_qdisc_ops.dequeue;
155 const struct Qdisc_class_ops *cops = qops->cl_ops;
/* Classful qdiscs must supply find/walk/leaf, and bind/unbind when
 * they expose a tcf_block. */
157 if (!(cops->find && cops->walk && cops->leaf))
160 if (cops->tcf_block && !(cops->bind_tcf && cops->unbind_tcf))
168 write_unlock(&qdisc_mod_lock);
175 EXPORT_SYMBOL(register_qdisc);
/*
 * unregister_qdisc - remove a previously registered Qdisc_ops from the
 * global list under qdisc_mod_lock.  NOTE(review): the unlink statement and
 * the -ENOENT return are missing from this excerpt.
 */
177 int unregister_qdisc(struct Qdisc_ops *qops)
179 struct Qdisc_ops *q, **qp;
182 write_lock(&qdisc_mod_lock);
183 for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
191 write_unlock(&qdisc_mod_lock);
194 EXPORT_SYMBOL(unregister_qdisc);
196 /* Get default qdisc if not otherwise specified */
/* Copies default_qdisc_ops->id into @name (bounded by @len) under the
 * module read lock so a concurrent qdisc_set_default() cannot race. */
197 void qdisc_get_default(char *name, size_t len)
199 read_lock(&qdisc_mod_lock);
200 strlcpy(name, default_qdisc_ops->id, len);
201 read_unlock(&qdisc_mod_lock);
/*
 * qdisc_lookup_default - find a registered Qdisc_ops by name and take a
 * module reference on it.  Caller must hold qdisc_mod_lock.
 * NOTE(review): the loop-exit/return lines are not visible in this excerpt.
 */
204 static struct Qdisc_ops *qdisc_lookup_default(const char *name)
206 struct Qdisc_ops *q = NULL;
208 for (q = qdisc_base; q; q = q->next) {
209 if (!strcmp(name, q->id)) {
210 if (!try_module_get(q->owner))
219 /* Set new default qdisc to use */
/*
 * Requires CAP_NET_ADMIN.  If the named qdisc is not registered yet, the
 * write lock is dropped so request_module("sch_<name>") can load it, then
 * the lookup is retried.  On success the old default's module ref is
 * released and default_qdisc_ops is switched.  Returns 0 or -ENOENT.
 */
220 int qdisc_set_default(const char *name)
222 const struct Qdisc_ops *ops;
224 if (!capable(CAP_NET_ADMIN))
227 write_lock(&qdisc_mod_lock);
228 ops = qdisc_lookup_default(name);
230 /* Not found, drop lock and try to load module */
231 write_unlock(&qdisc_mod_lock);
232 request_module("sch_%s", name);
233 write_lock(&qdisc_mod_lock);
235 ops = qdisc_lookup_default(name);
239 /* Set new default */
240 module_put(default_qdisc_ops->owner);
241 default_qdisc_ops = ops;
243 write_unlock(&qdisc_mod_lock);
245 return ops ? 0 : -ENOENT;
248 #ifdef CONFIG_NET_SCH_DEFAULT
249 /* Set default value from kernel config */
/* Late initcall: installs CONFIG_DEFAULT_NET_SCH as the default qdisc. */
250 static int __init sch_default_qdisc(void)
252 return qdisc_set_default(CONFIG_DEFAULT_NET_SCH);
254 late_initcall(sch_default_qdisc);
257 /* We know handle. Find qdisc among all qdisc's attached to device
258 * (root qdisc, all its children, children of children etc.)
259 * Note: caller either uses rtnl or rcu_read_lock()
262 static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
/* Device-less root (e.g. noop) can only match itself. */
266 if (!qdisc_dev(root))
267 return (root->handle == handle ? root : NULL);
268 if (!(root->flags & TCQ_F_BUILTIN) &&
270 root->handle == handle)
/* Otherwise search the per-device qdisc hash, keyed by handle (RCU-safe). */
273 hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle) {
274 if (q->handle == handle)
/*
 * qdisc_hash_add - insert a qdisc into its device's handle hash.
 * Root and ingress qdiscs are excluded; @invisible marks the qdisc so
 * dumps skip it unless explicitly requested (TCQ_F_INVISIBLE).
 */
280 void qdisc_hash_add(struct Qdisc *q, bool invisible)
282 if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
284 hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle);
286 q->flags |= TCQ_F_INVISIBLE;
289 EXPORT_SYMBOL(qdisc_hash_add);
/* Mirror of qdisc_hash_add(): unhash a non-root, non-ingress qdisc. */
291 void qdisc_hash_del(struct Qdisc *q)
293 if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
295 hash_del_rcu(&q->hash);
298 EXPORT_SYMBOL(qdisc_hash_del);
/*
 * qdisc_lookup - find a qdisc by handle on @dev: first under the egress
 * root, then (if present) under the ingress queue's sleeping qdisc.
 * NOTE(review): early-return and final-return lines are elided here.
 */
300 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
306 q = qdisc_match_from_root(dev->qdisc, handle);
310 if (dev_ingress_queue(dev))
311 q = qdisc_match_from_root(
312 dev_ingress_queue(dev)->qdisc_sleeping,
/*
 * qdisc_lookup_rcu - RCU-safe variant of qdisc_lookup(); uses
 * dev_ingress_queue_rcu() instead of the rtnl-protected accessor.
 */
318 struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle)
320 struct netdev_queue *nq;
325 q = qdisc_match_from_root(dev->qdisc, handle);
329 nq = dev_ingress_queue_rcu(dev);
331 q = qdisc_match_from_root(nq->qdisc_sleeping, handle);
/*
 * qdisc_leaf - resolve @classid inside parent @p to the child qdisc
 * attached to that class, via the parent's class ops (find then leaf).
 * NOTE(review): NULL checks/returns are missing from this excerpt.
 */
336 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
340 const struct Qdisc_class_ops *cops = p->ops->cl_ops;
344 cl = cops->find(p, classid);
348 leaf = cops->leaf(p, cl);
352 /* Find queueing discipline by name */
/* Matches a netlink TCA_KIND attribute against registered ops ids under
 * the read lock, taking a module reference on the match. */
354 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
356 struct Qdisc_ops *q = NULL;
359 read_lock(&qdisc_mod_lock);
360 for (q = qdisc_base; q; q = q->next) {
361 if (nla_strcmp(kind, q->id) == 0) {
362 if (!try_module_get(q->owner))
367 read_unlock(&qdisc_mod_lock);
372 /* The linklayer setting were not transferred from iproute2, in older
373 * versions, and the rate tables lookup systems have been dropped in
374 * the kernel. To keep backward compatible with older iproute2 tc
375 * utils, we detect the linklayer setting by detecting if the rate
376 * table were modified.
378 * For linklayer ATM table entries, the rate table will be aligned to
379 * 48 bytes, thus some table entries will contain the same value. The
380 * mpu (min packet unit) is also encoded into the old rate table, thus
381 * starting from the mpu, we find low and high table entries for
382 * mapping this cell. If these entries contain the same value, when
383 * the rate tables have been modified for linklayer ATM.
385 * This is done by rounding mpu to the nearest 48 bytes cell/entry,
386 * and then roundup to the next cell, calc the table entry one below,
389 static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
/* Two adjacent 48-byte ATM cells bracket the mpu. */
391 int low = roundup(r->mpu, 48);
392 int high = roundup(low+1, 48);
393 int cell_low = low >> r->cell_log;
394 int cell_high = (high >> r->cell_log) - 1;
396 /* rtab is too inaccurate at rates > 100Mbit/s */
397 if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
398 pr_debug("TC linklayer: Giving up ATM detection\n");
399 return TC_LINKLAYER_ETHERNET;
/* Equal entries across the cell boundary => table was ATM-aligned. */
402 if ((cell_high > cell_low) && (cell_high < 256)
403 && (rtab[cell_low] == rtab[cell_high])) {
404 pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
405 cell_low, cell_high, rtab[cell_high]);
406 return TC_LINKLAYER_ATM;
408 return TC_LINKLAYER_ETHERNET;
/* Singly-linked cache of shared rate tables; entries are refcounted. */
411 static struct qdisc_rate_table *qdisc_rtab_list;
/*
 * qdisc_get_rtab - look up (or allocate) a rate table matching @r and the
 * TCA_RATE table attribute @tab.  Validates the ratespec, reuses a cached
 * entry when both the spec and the 1024-byte data match, otherwise
 * allocates a new node, runs linklayer auto-detection, and links it in.
 * NOTE(review): refcount bumps and several returns are elided in this
 * excerpt.
 */
413 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
415 struct netlink_ext_ack *extack)
417 struct qdisc_rate_table *rtab;
419 if (tab == NULL || r->rate == 0 ||
420 r->cell_log == 0 || r->cell_log >= 32 ||
421 nla_len(tab) != TC_RTAB_SIZE) {
422 NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching");
426 for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
427 if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
428 !memcmp(&rtab->data, nla_data(tab), 1024)) {
434 rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
438 memcpy(rtab->data, nla_data(tab), 1024);
/* Old iproute2 never set linklayer; detect ATM from the table shape. */
439 if (r->linklayer == TC_LINKLAYER_UNAWARE)
440 r->linklayer = __detect_linklayer(r, rtab->data);
441 rtab->next = qdisc_rtab_list;
442 qdisc_rtab_list = rtab;
444 NL_SET_ERR_MSG(extack, "Failed to allocate new qdisc rate table");
448 EXPORT_SYMBOL(qdisc_get_rtab);
/*
 * qdisc_put_rtab - drop one reference on @tab; when it hits zero, unlink
 * it from qdisc_rtab_list and free it.  NOTE(review): the unlink/kfree
 * lines inside the loop are not visible in this excerpt.
 */
450 void qdisc_put_rtab(struct qdisc_rate_table *tab)
452 struct qdisc_rate_table *rtab, **rtabp;
454 if (!tab || --tab->refcnt)
457 for (rtabp = &qdisc_rtab_list;
458 (rtab = *rtabp) != NULL;
459 rtabp = &rtab->next) {
467 EXPORT_SYMBOL(qdisc_put_rtab);
/* Global list of shared size tables plus the netlink policy used to
 * parse nested TCA_STAB attributes. */
469 static LIST_HEAD(qdisc_stab_list);
471 static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
472 [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
473 [TCA_STAB_DATA] = { .type = NLA_BINARY },
/*
 * qdisc_get_stab - parse a nested TCA_STAB attribute into a (possibly
 * shared) qdisc_size_table.  Validates base/data presence, size
 * consistency, and log fields; reuses a cached table when the sizespec
 * and data match, otherwise allocates and appends a new one.
 * Returns ERR_PTR on failure.  NOTE(review): refcount handling and the
 * success returns are elided from this excerpt.
 */
476 static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt,
477 struct netlink_ext_ack *extack)
479 struct nlattr *tb[TCA_STAB_MAX + 1];
480 struct qdisc_size_table *stab;
481 struct tc_sizespec *s;
482 unsigned int tsize = 0;
486 err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy, extack);
489 if (!tb[TCA_STAB_BASE]) {
490 NL_SET_ERR_MSG(extack, "Size table base attribute is missing");
491 return ERR_PTR(-EINVAL);
494 s = nla_data(tb[TCA_STAB_BASE]);
497 if (!tb[TCA_STAB_DATA]) {
498 NL_SET_ERR_MSG(extack, "Size table data attribute is missing");
499 return ERR_PTR(-EINVAL);
501 tab = nla_data(tb[TCA_STAB_DATA]);
502 tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
/* Userspace-supplied tsize must agree with the actual data length. */
505 if (tsize != s->tsize || (!tab && tsize > 0)) {
506 NL_SET_ERR_MSG(extack, "Invalid size of size table");
507 return ERR_PTR(-EINVAL);
/* Reuse an identical cached table instead of duplicating it. */
510 list_for_each_entry(stab, &qdisc_stab_list, list) {
511 if (memcmp(&stab->szopts, s, sizeof(*s)))
513 if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
/* Clamp shift amounts to avoid undefined behavior in later lookups. */
519 if (s->size_log > STAB_SIZE_LOG_MAX ||
520 s->cell_log > STAB_SIZE_LOG_MAX) {
521 NL_SET_ERR_MSG(extack, "Invalid logarithmic size of size table");
522 return ERR_PTR(-EINVAL);
525 stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
527 return ERR_PTR(-ENOMEM);
532 memcpy(stab->data, tab, tsize * sizeof(u16));
534 list_add_tail(&stab->list, &qdisc_stab_list);
/* RCU callback: free the size table embedding this rcu_head. */
539 static void stab_kfree_rcu(struct rcu_head *head)
541 kfree(container_of(head, struct qdisc_size_table, rcu));
/*
 * qdisc_put_stab - drop a reference; on the last one, unlink from the
 * global list and free after an RCU-bh grace period (readers may still
 * be walking it from __qdisc_calculate_pkt_len()).
 */
544 void qdisc_put_stab(struct qdisc_size_table *tab)
549 if (--tab->refcnt == 0) {
550 list_del(&tab->list);
551 call_rcu_bh(&tab->rcu, stab_kfree_rcu);
554 EXPORT_SYMBOL(qdisc_put_stab);
/* Emit the size table's sizespec as a nested TCA_STAB attribute. */
556 static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
560 nest = nla_nest_start(skb, TCA_STAB);
562 goto nla_put_failure;
563 if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
564 goto nla_put_failure;
565 nla_nest_end(skb, nest);
/*
 * __qdisc_calculate_pkt_len - translate skb->len into an "accounting"
 * length using the size table: add per-packet overhead, align, index the
 * table by cell, and scale by size_log.  Slots past the table end are
 * extrapolated from the last entry.  Result is clamped to >= 1 and
 * stored in qdisc_skb_cb(skb)->pkt_len.
 */
573 void __qdisc_calculate_pkt_len(struct sk_buff *skb,
574 const struct qdisc_size_table *stab)
578 pkt_len = skb->len + stab->szopts.overhead;
579 if (unlikely(!stab->szopts.tsize))
582 slot = pkt_len + stab->szopts.cell_align;
583 if (unlikely(slot < 0))
586 slot >>= stab->szopts.cell_log;
587 if (likely(slot < stab->szopts.tsize))
588 pkt_len = stab->data[slot];
/* Out-of-range slot: extrapolate using whole-table multiples plus the
 * remainder entry. */
590 pkt_len = stab->data[stab->szopts.tsize - 1] *
591 (slot / stab->szopts.tsize) +
592 stab->data[slot % stab->szopts.tsize];
594 pkt_len <<= stab->szopts.size_log;
596 if (unlikely(pkt_len < 1))
598 qdisc_skb_cb(skb)->pkt_len = pkt_len;
600 EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
/* One-shot warning that a qdisc appears non-work-conserving; the
 * TCQ_F_WARN_NONWC flag suppresses repeats. */
602 void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
604 if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
605 pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
606 txt, qdisc->ops->id, qdisc->handle >> 16);
607 qdisc->flags |= TCQ_F_WARN_NONWC;
610 EXPORT_SYMBOL(qdisc_warn_nonwc);
/* hrtimer callback: kick the root qdisc's TX softirq when the watchdog
 * fires; the timer is never re-armed from here. */
612 static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
614 struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
618 __netif_schedule(qdisc_root(wd->qdisc));
621 return HRTIMER_NORESTART;
/* Initialise a qdisc watchdog on the given clock, pinned absolute-mode
 * hrtimer with qdisc_watchdog() as the expiry handler. */
624 void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc,
627 hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED);
628 wd->timer.function = qdisc_watchdog;
631 EXPORT_SYMBOL(qdisc_watchdog_init_clockid);
/* Convenience wrapper: watchdog on CLOCK_MONOTONIC. */
633 void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
635 qdisc_watchdog_init_clockid(wd, qdisc, CLOCK_MONOTONIC);
637 EXPORT_SYMBOL(qdisc_watchdog_init);
/*
 * qdisc_watchdog_schedule_ns - (re)arm the watchdog for an absolute
 * expiry time in ns.  Skips arming when the root qdisc is deactivated,
 * and avoids redundant hrtimer_start() calls when the expiry is
 * unchanged.
 */
639 void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires)
641 if (test_bit(__QDISC_STATE_DEACTIVATED,
642 &qdisc_root_sleeping(wd->qdisc)->state))
645 if (wd->last_expires == expires)
648 wd->last_expires = expires;
649 hrtimer_start(&wd->timer,
650 ns_to_ktime(expires),
651 HRTIMER_MODE_ABS_PINNED);
653 EXPORT_SYMBOL(qdisc_watchdog_schedule_ns);
/* Synchronously cancel a pending watchdog timer. */
655 void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
657 hrtimer_cancel(&wd->timer);
659 EXPORT_SYMBOL(qdisc_watchdog_cancel);
/* Allocate and zero-init an array of @n hlist heads for a class hash. */
661 static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
663 struct hlist_head *h;
666 h = kvmalloc_array(n, sizeof(struct hlist_head), GFP_KERNEL);
669 for (i = 0; i < n; i++)
670 INIT_HLIST_HEAD(&h[i]);
/*
 * qdisc_class_hash_grow - double the class hash when load factor exceeds
 * 0.75, rehashing every class into the new table under the qdisc tree
 * lock.  NOTE(review): the nmask computation, sch_tree_lock() call, and
 * the kvfree of the old table are elided from this excerpt.
 */
675 void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
677 struct Qdisc_class_common *cl;
678 struct hlist_node *next;
679 struct hlist_head *nhash, *ohash;
680 unsigned int nsize, nmask, osize;
683 /* Rehash when load factor exceeds 0.75 */
684 if (clhash->hashelems * 4 <= clhash->hashsize * 3)
686 nsize = clhash->hashsize * 2;
688 nhash = qdisc_class_hash_alloc(nsize);
692 ohash = clhash->hash;
693 osize = clhash->hashsize;
/* Move every class to its bucket under the new mask. */
696 for (i = 0; i < osize; i++) {
697 hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
698 h = qdisc_class_hash(cl->classid, nmask);
699 hlist_add_head(&cl->hnode, &nhash[h]);
702 clhash->hash = nhash;
703 clhash->hashsize = nsize;
704 clhash->hashmask = nmask;
705 sch_tree_unlock(sch);
709 EXPORT_SYMBOL(qdisc_class_hash_grow);
/* Initialise a class hash with 4 buckets (power of two, mask = size-1). */
711 int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
713 unsigned int size = 4;
715 clhash->hash = qdisc_class_hash_alloc(size);
718 clhash->hashsize = size;
719 clhash->hashmask = size - 1;
720 clhash->hashelems = 0;
723 EXPORT_SYMBOL(qdisc_class_hash_init);
/* Free the bucket array (kvmalloc'ed, so kvfree). */
725 void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
727 kvfree(clhash->hash);
729 EXPORT_SYMBOL(qdisc_class_hash_destroy);
/* Insert a class into its bucket, hashed by classid. */
731 void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
732 struct Qdisc_class_common *cl)
736 INIT_HLIST_NODE(&cl->hnode);
737 h = qdisc_class_hash(cl->classid, clhash->hashmask);
738 hlist_add_head(&cl->hnode, &clhash->hash[h]);
741 EXPORT_SYMBOL(qdisc_class_hash_insert);
/* Unlink a class from the hash.  NOTE(review): the hashelems decrement
 * is not visible in this excerpt. */
743 void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
744 struct Qdisc_class_common *cl)
746 hlist_del(&cl->hnode);
749 EXPORT_SYMBOL(qdisc_class_hash_remove);
751 /* Allocate an unique handle from space managed by kernel
752 * Possible range is [8000-FFFF]:0000 (0x8000 values)
/* Rotates a static cursor through the major-handle space, wrapping at
 * TC_H_ROOT, until an unused handle is found.  NOTE(review): the retry
 * bound and return statements are elided in this excerpt. */
754 static u32 qdisc_alloc_handle(struct net_device *dev)
757 static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
760 autohandle += TC_H_MAKE(0x10000U, 0);
761 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
762 autohandle = TC_H_MAKE(0x80000000U, 0);
763 if (!qdisc_lookup(dev, autohandle))
/*
 * qdisc_tree_reduce_backlog - propagate a decrease of @n packets /
 * @len bytes up the qdisc hierarchy, notifying each parent class via
 * qlen_notify() when the child became empty so it can be deactivated.
 * Stops at ingress parents and TCQ_F_NOPARENT qdiscs.
 * NOTE(review): qlen decrement and some loop braces are elided here.
 */
771 void qdisc_tree_reduce_backlog(struct Qdisc *sch, unsigned int n,
774 bool qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED;
775 const struct Qdisc_class_ops *cops;
781 if (n == 0 && len == 0)
783 drops = max_t(int, n, 0);
785 while ((parentid = sch->parent)) {
786 if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
789 if (sch->flags & TCQ_F_NOPARENT)
791 /* Notify parent qdisc only if child qdisc becomes empty.
793 * If child was empty even before update then backlog
794 * counter is screwed and we skip notification because
795 * parent class is already passive.
797 * If the original child was offloaded then it is allowed
798 * to be seem as empty, so the parent is notified anyway.
800 notify = !sch->q.qlen && !WARN_ON_ONCE(!n &&
801 !qdisc_is_offloaded);
802 /* TODO: perform the search on a per txq basis */
803 sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
805 WARN_ON_ONCE(parentid != TC_H_ROOT);
808 cops = sch->ops->cl_ops;
809 if (notify && cops->qlen_notify) {
810 cl = cops->find(sch, parentid);
811 cops->qlen_notify(sch, cl);
814 sch->qstats.backlog -= len;
815 __qdisc_qstats_drop(sch, drops);
819 EXPORT_SYMBOL(qdisc_tree_reduce_backlog);
/*
 * tc_fill_qdisc - serialise one qdisc into a netlink message: tcmsg
 * header, TCA_KIND, optional ingress/egress block indices, the qdisc's
 * own ->dump() options, TCA_HW_OFFLOAD, the size table, and the
 * stats2/compat statistics blocks.  Returns <0 and truncates via the
 * nla_put_failure path on overflow.  NOTE(review): several declarations,
 * braces and the error-path nlmsg_trim are elided in this excerpt.
 */
821 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
822 u32 portid, u32 seq, u16 flags, int event)
824 struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL;
825 struct gnet_stats_queue __percpu *cpu_qstats = NULL;
827 struct nlmsghdr *nlh;
828 unsigned char *b = skb_tail_pointer(skb);
830 struct qdisc_size_table *stab;
835 nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
838 tcm = nlmsg_data(nlh);
839 tcm->tcm_family = AF_UNSPEC;
842 tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
843 tcm->tcm_parent = clid;
844 tcm->tcm_handle = q->handle;
/* tcm_info carries the current refcount (historical ABI quirk). */
845 tcm->tcm_info = refcount_read(&q->refcnt);
846 if (nla_put_string(skb, TCA_KIND, q->ops->id))
847 goto nla_put_failure;
848 if (q->ops->ingress_block_get) {
849 block_index = q->ops->ingress_block_get(q);
851 nla_put_u32(skb, TCA_INGRESS_BLOCK, block_index))
852 goto nla_put_failure;
854 if (q->ops->egress_block_get) {
855 block_index = q->ops->egress_block_get(q);
857 nla_put_u32(skb, TCA_EGRESS_BLOCK, block_index))
858 goto nla_put_failure;
860 if (q->ops->dump && q->ops->dump(q, skb) < 0)
861 goto nla_put_failure;
862 if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED)))
863 goto nla_put_failure;
864 qlen = qdisc_qlen_sum(q);
866 stab = rtnl_dereference(q->stab);
867 if (stab && qdisc_dump_stab(skb, stab) < 0)
868 goto nla_put_failure;
870 if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
871 NULL, &d, TCA_PAD) < 0)
872 goto nla_put_failure;
874 if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
875 goto nla_put_failure;
/* Per-CPU stats qdiscs need the percpu pointers for aggregation. */
877 if (qdisc_is_percpu_stats(q)) {
878 cpu_bstats = q->cpu_bstats;
879 cpu_qstats = q->cpu_qstats;
882 if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q),
883 &d, cpu_bstats, &q->bstats) < 0 ||
884 gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
885 gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
886 goto nla_put_failure;
888 if (gnet_stats_finish_copy(&d) < 0)
889 goto nla_put_failure;
891 nlh->nlmsg_len = skb_tail_pointer(skb) - b;
/* True when a qdisc should be skipped in dumps: builtins always,
 * TCQ_F_INVISIBLE ones unless the caller asked for invisible qdiscs. */
900 static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible)
902 if (q->flags & TCQ_F_BUILTIN)
904 if ((q->flags & TCQ_F_INVISIBLE) && !dump_invisible)
/*
 * qdisc_notify - build and multicast an RTM_DELQDISC (for @old) and/or
 * RTM_NEWQDISC (for @new) message to RTNLGRP_TC.  NOTE(review): the
 * skb-empty check and err/free paths are elided in this excerpt.
 */
910 static int qdisc_notify(struct net *net, struct sk_buff *oskb,
911 struct nlmsghdr *n, u32 clid,
912 struct Qdisc *old, struct Qdisc *new)
915 u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
917 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
921 if (old && !tc_qdisc_dump_ignore(old, false)) {
922 if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
923 0, RTM_DELQDISC) < 0)
926 if (new && !tc_qdisc_dump_ignore(new, false)) {
927 if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
928 old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
933 return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
934 n->nlmsg_flags & NLM_F_ECHO);
/* Send the replace notification, then release the old qdisc.
 * NOTE(review): the qdisc_put/destroy call is elided in this excerpt. */
941 static void notify_and_destroy(struct net *net, struct sk_buff *skb,
942 struct nlmsghdr *n, u32 clid,
943 struct Qdisc *old, struct Qdisc *new)
946 qdisc_notify(net, skb, n, clid, old, new);
952 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
955 * When appropriate send a netlink notification using 'skb'
958 * On success, destroy old qdisc.
/*
 * Two cases: parent == NULL grafts at device root (looping over all TX
 * queues, or the ingress queue for ingress qdiscs, deactivating the
 * device around the swap); otherwise the graft is delegated to the
 * parent's class ops ->graft() for the class found by @classid.
 * NOTE(review): many lines (error returns, dev_deactivate/activate,
 * skip_noop handling) are elided in this excerpt.
 */
961 static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
962 struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
963 struct Qdisc *new, struct Qdisc *old,
964 struct netlink_ext_ack *extack)
966 struct Qdisc *q = old;
967 struct net *net = dev_net(dev);
970 if (parent == NULL) {
971 unsigned int i, num_q, ingress;
974 num_q = dev->num_tx_queues;
975 if ((q && q->flags & TCQ_F_INGRESS) ||
976 (new && new->flags & TCQ_F_INGRESS)) {
979 if (!dev_ingress_queue(dev)) {
980 NL_SET_ERR_MSG(extack, "Device does not have an ingress queue");
/* Quiesce the device while swapping root qdiscs. */
985 if (dev->flags & IFF_UP)
988 if (new && new->ops->attach)
991 for (i = 0; i < num_q; i++) {
992 struct netdev_queue *dev_queue = dev_ingress_queue(dev);
995 dev_queue = netdev_get_tx_queue(dev, i);
997 old = dev_graft_qdisc(dev_queue, new);
999 qdisc_refcount_inc(new);
1007 notify_and_destroy(net, skb, n, classid,
1009 if (new && !new->ops->attach)
1010 qdisc_refcount_inc(new);
1011 dev->qdisc = new ? : &noop_qdisc;
1013 if (new && new->ops->attach)
1014 new->ops->attach(new);
1016 notify_and_destroy(net, skb, n, classid, old, new);
1019 if (dev->flags & IFF_UP)
1022 const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
1024 /* Only support running class lockless if parent is lockless */
1025 if (new && (new->flags & TCQ_F_NOLOCK) &&
1026 parent && !(parent->flags & TCQ_F_NOLOCK))
1027 new->flags &= ~TCQ_F_NOLOCK;
1030 if (cops && cops->graft) {
1031 unsigned long cl = cops->find(parent, classid);
1034 err = cops->graft(parent, cl, new, &old,
1037 NL_SET_ERR_MSG(extack, "Specified class not found");
1042 notify_and_destroy(net, skb, n, classid, old, new);
/*
 * qdisc_block_indexes_set - apply TCA_INGRESS_BLOCK / TCA_EGRESS_BLOCK
 * attributes to a qdisc that supports shared blocks.  Index 0 and
 * unsupported ops are rejected with an extack message.
 * NOTE(review): error returns are elided in this excerpt.
 */
1047 static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca,
1048 struct netlink_ext_ack *extack)
1052 if (tca[TCA_INGRESS_BLOCK]) {
1053 block_index = nla_get_u32(tca[TCA_INGRESS_BLOCK]);
1056 NL_SET_ERR_MSG(extack, "Ingress block index cannot be 0");
1059 if (!sch->ops->ingress_block_set) {
1060 NL_SET_ERR_MSG(extack, "Ingress block sharing is not supported");
1063 sch->ops->ingress_block_set(sch, block_index);
1065 if (tca[TCA_EGRESS_BLOCK]) {
1066 block_index = nla_get_u32(tca[TCA_EGRESS_BLOCK]);
1069 NL_SET_ERR_MSG(extack, "Egress block index cannot be 0");
1072 if (!sch->ops->egress_block_set) {
1073 NL_SET_ERR_MSG(extack, "Egress block sharing is not supported");
1076 sch->ops->egress_block_set(sch, block_index);
1081 /* lockdep annotation is needed for ingress; egress gets it only for name */
1082 static struct lock_class_key qdisc_tx_lock;
1083 static struct lock_class_key qdisc_rx_lock;
1086 Allocate and initialize new qdisc.
1088 Parameters are passed via opt.
/*
 * qdisc_create - look up the ops by TCA_KIND (auto-loading the sch_*
 * module and returning -EAGAIN so the caller replays the request),
 * allocate the qdisc, assign/alloc a handle, apply block indices, run
 * ops->init(), attach stab/rate-estimator, and hash the qdisc in.
 * NOTE(review): substantial error handling (err_out labels, qdisc_put,
 * several returns) is elided from this excerpt.
 */
1091 static struct Qdisc *qdisc_create(struct net_device *dev,
1092 struct netdev_queue *dev_queue,
1093 struct Qdisc *p, u32 parent, u32 handle,
1094 struct nlattr **tca, int *errp,
1095 struct netlink_ext_ack *extack)
1098 struct nlattr *kind = tca[TCA_KIND];
1100 struct Qdisc_ops *ops;
1101 struct qdisc_size_table *stab;
1103 ops = qdisc_lookup_ops(kind);
1104 #ifdef CONFIG_MODULES
1105 if (ops == NULL && kind != NULL) {
1106 char name[IFNAMSIZ];
1107 if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
1108 /* We dropped the RTNL semaphore in order to
1109 * perform the module load. So, even if we
1110 * succeeded in loading the module we have to
1111 * tell the caller to replay the request. We
1112 * indicate this using -EAGAIN.
1113 * We replay the request because the device may
1114 * go away in the mean time.
1117 request_module("sch_%s", name);
1119 ops = qdisc_lookup_ops(kind);
1121 /* We will try again qdisc_lookup_ops,
1122 * so don't keep a reference.
1124 module_put(ops->owner);
1134 NL_SET_ERR_MSG(extack, "Specified qdisc kind is unknown");
1138 sch = qdisc_alloc(dev_queue, ops, extack);
1144 sch->parent = parent;
1146 if (handle == TC_H_INGRESS) {
1147 sch->flags |= TCQ_F_INGRESS;
1148 handle = TC_H_MAKE(TC_H_INGRESS, 0);
1149 lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
/* No handle supplied: allocate one from the kernel-managed range. */
1152 handle = qdisc_alloc_handle(dev);
1157 lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
1158 if (!netif_is_multiqueue(dev))
1159 sch->flags |= TCQ_F_ONETXQUEUE;
1162 sch->handle = handle;
1164 /* This exist to keep backward compatible with a userspace
1165 * loophole, what allowed userspace to get IFF_NO_QUEUE
1166 * facility on older kernels by setting tx_queue_len=0 (prior
1167 * to qdisc init), and then forgot to reinit tx_queue_len
1168 * before again attaching a qdisc.
1170 if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) {
1171 dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
1172 netdev_info(dev, "Caught tx_queue_len zero misconfig\n");
1175 err = qdisc_block_indexes_set(sch, tca, extack);
1180 err = ops->init(sch, tca[TCA_OPTIONS], extack);
1185 if (tca[TCA_STAB]) {
1186 stab = qdisc_get_stab(tca[TCA_STAB], extack);
1188 err = PTR_ERR(stab);
1191 rcu_assign_pointer(sch->stab, stab);
1193 if (tca[TCA_RATE]) {
1194 seqcount_t *running;
1197 if (sch->flags & TCQ_F_MQROOT) {
1198 NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc");
/* Pick the seqcount the estimator samples under: the root's for
 * ordinary children, the qdisc's own otherwise. */
1202 if (sch->parent != TC_H_ROOT &&
1203 !(sch->flags & TCQ_F_INGRESS) &&
1204 (!p || !(p->flags & TCQ_F_MQROOT)))
1205 running = qdisc_root_sleeping_running(sch);
1207 running = &sch->running;
1209 err = gen_new_estimator(&sch->bstats,
1216 NL_SET_ERR_MSG(extack, "Failed to generate new estimator");
1221 qdisc_hash_add(sch, false);
1226 /* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */
1233 module_put(ops->owner);
1240 * Any broken qdiscs that would require a ops->reset() here?
1241 * The qdisc was never in action so it shouldn't be necessary.
1243 qdisc_put_stab(rtnl_dereference(sch->stab));
/*
 * qdisc_change - modify an existing qdisc: forward TCA_OPTIONS to
 * ops->change() (block indices may not change), swap in a new size
 * table, and replace the rate estimator.  NOTE(review): error returns
 * and the gen_replace_estimator argument list are elided here.
 */
1249 static int qdisc_change(struct Qdisc *sch, struct nlattr **tca,
1250 struct netlink_ext_ack *extack)
1252 struct qdisc_size_table *ostab, *stab = NULL;
1255 if (tca[TCA_OPTIONS]) {
1256 if (!sch->ops->change) {
1257 NL_SET_ERR_MSG(extack, "Change operation not supported by specified qdisc");
1260 if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
1261 NL_SET_ERR_MSG(extack, "Change of blocks is not supported");
1264 err = sch->ops->change(sch, tca[TCA_OPTIONS], extack);
1269 if (tca[TCA_STAB]) {
1270 stab = qdisc_get_stab(tca[TCA_STAB], extack);
1272 return PTR_ERR(stab);
/* Publish the new stab before dropping the old one (RCU readers). */
1275 ostab = rtnl_dereference(sch->stab);
1276 rcu_assign_pointer(sch->stab, stab);
1277 qdisc_put_stab(ostab);
1279 if (tca[TCA_RATE]) {
1280 /* NB: ignores errors from replace_estimator
1281 because change can't be undone. */
1282 if (sch->flags & TCQ_F_MQROOT)
1284 gen_replace_estimator(&sch->bstats,
1288 qdisc_root_sleeping_running(sch),
/* Walker state for loop detection when re-parenting qdiscs: recursively
 * walks every class's leaf qdisc and stops (ELOOP) if the prospective
 * parent @p is reachable from @q, or depth exceeds 7. */
1295 struct check_loop_arg {
1296 struct qdisc_walker w;
1301 static int check_loop_fn(struct Qdisc *q, unsigned long cl,
1302 struct qdisc_walker *w);
1304 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
1306 struct check_loop_arg arg;
/* Classless qdiscs have no children, so no loop is possible. */
1308 if (q->ops->cl_ops == NULL)
1311 arg.w.stop = arg.w.skip = arg.w.count = 0;
1312 arg.w.fn = check_loop_fn;
1315 q->ops->cl_ops->walk(q, &arg.w);
1316 return arg.w.stop ? -ELOOP : 0;
1320 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
1323 const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1324 struct check_loop_arg *arg = (struct check_loop_arg *)w;
1326 leaf = cops->leaf(q, cl);
1328 if (leaf == arg->p || arg->depth > 7)
1330 return check_loop(leaf, arg->p, arg->depth + 1);
/* Netlink attribute policy shared by the RTM_*QDISC / RTM_*TCLASS
 * handlers in this file. */
1335 const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = {
1336 [TCA_KIND] = { .type = NLA_NUL_STRING,
1337 .len = IFNAMSIZ - 1 },
1338 [TCA_RATE] = { .type = NLA_BINARY,
1339 .len = sizeof(struct tc_estimator) },
1340 [TCA_STAB] = { .type = NLA_NESTED },
1341 [TCA_DUMP_INVISIBLE] = { .type = NLA_FLAG },
1342 [TCA_CHAIN] = { .type = NLA_U32 },
1343 [TCA_INGRESS_BLOCK] = { .type = NLA_U32 },
1344 [TCA_EGRESS_BLOCK] = { .type = NLA_U32 },
/*
 * tc_get_qdisc - RTM_GETQDISC / RTM_DELQDISC handler.  Resolves the
 * target qdisc from tcm_parent (classid) or tcm_handle, validates
 * TCA_KIND, then either grafts NULL over it (delete) or just sends a
 * notification (get).  Non-GET operations require CAP_NET_ADMIN in the
 * device's user namespace.  NOTE(review): error returns and some braces
 * are elided in this excerpt.
 */
1351 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1352 struct netlink_ext_ack *extack)
1354 struct net *net = sock_net(skb->sk);
1355 struct tcmsg *tcm = nlmsg_data(n);
1356 struct nlattr *tca[TCA_MAX + 1];
1357 struct net_device *dev;
1359 struct Qdisc *q = NULL;
1360 struct Qdisc *p = NULL;
1363 if ((n->nlmsg_type != RTM_GETQDISC) &&
1364 !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1367 err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy,
1372 dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1376 clid = tcm->tcm_parent;
/* Resolve via parent classid, falling back to the ingress qdisc. */
1378 if (clid != TC_H_ROOT) {
1379 if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
1380 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1382 NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid");
1385 q = qdisc_leaf(p, clid);
1386 } else if (dev_ingress_queue(dev)) {
1387 q = dev_ingress_queue(dev)->qdisc_sleeping;
1393 NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device");
1397 if (tcm->tcm_handle && q->handle != tcm->tcm_handle) {
1398 NL_SET_ERR_MSG(extack, "Invalid handle");
1402 q = qdisc_lookup(dev, tcm->tcm_handle);
1404 NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified handle");
1409 if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1410 NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1414 if (n->nlmsg_type == RTM_DELQDISC) {
1416 NL_SET_ERR_MSG(extack, "Classid cannot be zero");
1419 if (q->handle == 0) {
1420 NL_SET_ERR_MSG(extack, "Cannot delete qdisc with handle of zero")
;
1423 err = qdisc_graft(dev, p, skb, n, clid, NULL, q, extack);
1427 qdisc_notify(net, skb, n, clid, NULL, q);
1433 * Create/change qdisc.
/*
 * tc_modify_qdisc - RTM_NEWQDISC handler: create, graft or change a qdisc.
 * @skb:    netlink request skb
 * @n:      netlink message header; its NLM_F_CREATE/REPLACE/EXCL flags
 *          select between create/graft and in-place change
 * @extack: extended ack for human-readable error reporting
 *
 * Resolves the parent qdisc @p (or the ingress queue) from tcm_parent,
 * finds the current child @q, and then either changes @q in place
 * (qdisc_change) or builds a new qdisc and grafts it into the tree
 * (qdisc_create + qdisc_graft).  Returns 0 or a negative errno.
 *
 * NOTE(review): this extract is elided -- error returns, goto labels and
 * closing braces between the numbered lines are not shown here.
 */
1436 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1437 struct netlink_ext_ack *extack)
1439 struct net *net = sock_net(skb->sk);
1441 struct nlattr *tca[TCA_MAX + 1];
1442 struct net_device *dev;
1444 struct Qdisc *q, *p;
/* Modifying qdiscs requires CAP_NET_ADMIN in the owning netns. */
1447 if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1451 /* Reinit, just in case something touches this. */
1452 err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy,
1457 tcm = nlmsg_data(n);
1458 clid = tcm->tcm_parent;
1461 dev = __dev_get_by_index(net, tcm->tcm_ifindex);
/* Resolve parent p and current child q.  TC_H_INGRESS attaches to the
 * device ingress queue (created on demand) instead of a regular parent.
 */
1467 if (clid != TC_H_ROOT) {
1468 if (clid != TC_H_INGRESS) {
1469 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1471 NL_SET_ERR_MSG(extack, "Failed to find specified qdisc");
1474 q = qdisc_leaf(p, clid);
1475 } else if (dev_ingress_queue_create(dev)) {
1476 q = dev_ingress_queue(dev)->qdisc_sleeping;
1482 /* It may be default qdisc, ignore it */
1483 if (q && q->handle == 0)
/* No matching child, or the caller named a different handle: decide
 * whether a lookup by handle, an override, or create/graft is wanted.
 */
1486 if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
1487 if (tcm->tcm_handle) {
1488 if (q && !(n->nlmsg_flags & NLM_F_REPLACE)) {
1489 NL_SET_ERR_MSG(extack, "NLM_F_REPLACE needed to override");
/* Qdisc handles are major-only; a nonzero minor is malformed. */
1492 if (TC_H_MIN(tcm->tcm_handle)) {
1493 NL_SET_ERR_MSG(extack, "Invalid minor handle");
1496 q = qdisc_lookup(dev, tcm->tcm_handle);
1498 goto create_n_graft;
1499 if (n->nlmsg_flags & NLM_F_EXCL) {
1500 NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override");
1503 if (tca[TCA_KIND] &&
1504 nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1505 NL_SET_ERR_MSG(extack, "Invalid qdisc name");
/* Grafting q under p must not create a cycle in the hierarchy. */
1509 (p && check_loop(q, p, 0))) {
1510 NL_SET_ERR_MSG(extack, "Qdisc parent/child loop detected");
1513 qdisc_refcount_inc(q);
1517 goto create_n_graft;
1519 /* This magic test requires explanation.
1521 * We know, that some child q is already
1522 * attached to this parent and have choice:
1523 * either to change it or to create/graft new one.
1525 * 1. We are allowed to create/graft only
1526 * if CREATE and REPLACE flags are set.
1528 * 2. If EXCL is set, requestor wanted to say,
1529 * that qdisc tcm_handle is not expected
1530 * to exist, so that we choose create/graft too.
1532 * 3. The last case is when no flags are set.
1533 * Alas, it is sort of hole in API, we
1534 * cannot decide what to do unambiguously.
1535 * For now we select create/graft, if
1536 * user gave KIND, which does not match existing.
1538 if ((n->nlmsg_flags & NLM_F_CREATE) &&
1539 (n->nlmsg_flags & NLM_F_REPLACE) &&
1540 ((n->nlmsg_flags & NLM_F_EXCL) ||
1542 nla_strcmp(tca[TCA_KIND], q->ops->id))))
1543 goto create_n_graft;
1547 if (!tcm->tcm_handle) {
1548 NL_SET_ERR_MSG(extack, "Handle cannot be zero");
1551 q = qdisc_lookup(dev, tcm->tcm_handle);
1554 /* Change qdisc parameters */
1556 NL_SET_ERR_MSG(extack, "Specified qdisc not found");
1559 if (n->nlmsg_flags & NLM_F_EXCL) {
1560 NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot modify");
1563 if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1564 NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1567 err = qdisc_change(q, tca, extack);
/* Notify listeners on RTNLGRP_TC about the successful change. */
1569 qdisc_notify(net, skb, n, clid, NULL, q);
/* create_n_graft path: build a fresh qdisc and splice it in. */
1573 if (!(n->nlmsg_flags & NLM_F_CREATE)) {
1574 NL_SET_ERR_MSG(extack, "Qdisc not found. To create specify NLM_F_CREATE flag");
1577 if (clid == TC_H_INGRESS) {
1578 if (dev_ingress_queue(dev)) {
1579 q = qdisc_create(dev, dev_ingress_queue(dev), p,
1580 tcm->tcm_parent, tcm->tcm_parent,
1583 NL_SET_ERR_MSG(extack, "Cannot find ingress queue for specified device");
1587 struct netdev_queue *dev_queue;
/* Classful parents may steer the new child to a specific tx queue. */
1589 if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
1590 dev_queue = p->ops->cl_ops->select_queue(p, tcm);
1592 dev_queue = p->dev_queue;
1594 dev_queue = netdev_get_tx_queue(dev, 0);
1596 q = qdisc_create(dev, dev_queue, p,
1597 tcm->tcm_parent, tcm->tcm_handle,
1607 err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack);
/*
 * tc_dump_qdisc_root - dump one root qdisc and, optionally, the per-device
 * qdisc hash below it into a netlink dump skb.
 * @q_idx_p: in/out cursor of how many qdiscs were walked so far
 * @s_q_idx: resume point from a previous (partial) dump
 * @recur:   also walk qdisc_dev(root)->qdisc_hash (false for singletons
 *           such as the ingress qdisc, which has already been emitted)
 * @dump_invisible: include qdiscs normally hidden from dumps
 *
 * NOTE(review): elided extract -- the skip/advance and error-return lines
 * between the numbered lines are missing here.
 */
1617 static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1618 struct netlink_callback *cb,
1619 int *q_idx_p, int s_q_idx, bool recur,
1620 bool dump_invisible)
1622 int ret = 0, q_idx = *q_idx_p;
/* First, the root itself (skipped while resuming below s_q_idx). */
1630 if (q_idx < s_q_idx) {
1633 if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1634 tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1635 cb->nlh->nlmsg_seq, NLM_F_MULTI,
1641 /* If dumping singletons, there is no qdisc_dev(root) and the singleton
1642 * itself has already been dumped.
1644 * If we've already dumped the top-level (ingress) qdisc above and the global
1645 * qdisc hashtable, we don't want to hit it again
1647 if (!qdisc_dev(root) || !recur)
/* Then every qdisc hashed on the device, same resume/skip logic. */
1650 hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
1651 if (q_idx < s_q_idx) {
1655 if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1656 tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1657 cb->nlh->nlmsg_seq, NLM_F_MULTI,
/*
 * tc_dump_qdisc - RTM_GETQDISC dump handler: walk every netdev in the
 * namespace and emit its root qdisc tree plus its ingress qdisc.
 *
 * cb->args[0]/args[1] carry the device index and qdisc index reached so
 * far, so an interrupted dump resumes where it stopped.
 *
 * NOTE(review): elided extract -- loop bookkeeping and the final return
 * between the numbered lines are missing here.
 */
1671 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1673 struct net *net = sock_net(skb->sk);
1676 struct net_device *dev;
1677 const struct nlmsghdr *nlh = cb->nlh;
1678 struct nlattr *tca[TCA_MAX + 1];
/* Restore the resume cursors saved by the previous dump round. */
1681 s_idx = cb->args[0];
1682 s_q_idx = q_idx = cb->args[1];
/* Parsed only for TCA_DUMP_INVISIBLE; other attributes are unused here. */
1687 err = nlmsg_parse(nlh, sizeof(struct tcmsg), tca, TCA_MAX,
1688 rtm_tca_policy, NULL);
1692 for_each_netdev(net, dev) {
1693 struct netdev_queue *dev_queue;
/* Egress hierarchy: recurse through the device's qdisc hash. */
1701 if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx,
1702 true, tca[TCA_DUMP_INVISIBLE]) < 0)
/* Ingress qdisc is a singleton: dump it without recursing. */
1705 dev_queue = dev_ingress_queue(dev);
1707 tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
1708 &q_idx, s_q_idx, false,
1709 tca[TCA_DUMP_INVISIBLE]) < 0)
/* Save progress for the next netlink dump invocation. */
1718 cb->args[1] = q_idx;
1725 /************************************************
1726 * Traffic classes manipulation. *
1727 ************************************************/
/*
 * tc_fill_tclass - encode one traffic class as a netlink message.
 * @q:   owning (classful) qdisc
 * @cl:  opaque class handle as used by q->ops->cl_ops
 * @event: RTM_NEWTCLASS / RTM_DELTCLASS etc.
 *
 * Emits the tcmsg header, TCA_KIND, the class's own dump callback and
 * its statistics.  On any failure the partially written message is
 * trimmed back to @b and an error is returned (labels elided here).
 */
1729 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1731 u32 portid, u32 seq, u16 flags, int event)
1734 struct nlmsghdr *nlh;
/* Remember the tail so a failed fill can be rolled back cleanly. */
1735 unsigned char *b = skb_tail_pointer(skb);
1737 const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1740 nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
1742 goto out_nlmsg_trim;
1743 tcm = nlmsg_data(nlh);
1744 tcm->tcm_family = AF_UNSPEC;
1747 tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1748 tcm->tcm_parent = q->handle;
1749 tcm->tcm_handle = q->handle;
1751 if (nla_put_string(skb, TCA_KIND, q->ops->id))
1752 goto nla_put_failure;
/* Let the class implementation fill its own attributes. */
1753 if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1754 goto nla_put_failure;
1756 if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1757 NULL, &d, TCA_PAD) < 0)
1758 goto nla_put_failure;
1760 if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1761 goto nla_put_failure;
1763 if (gnet_stats_finish_copy(&d) < 0)
1764 goto nla_put_failure;
/* Patch the final message length now that all attributes are in. */
1766 nlh->nlmsg_len = skb_tail_pointer(skb) - b;
/*
 * tclass_notify - broadcast a class event (@event) to RTNLGRP_TC
 * listeners, echoing back to the requester when NLM_F_ECHO is set.
 * Returns the rtnetlink_send() result; allocation/fill failures return
 * a negative errno (elided lines free the skb on the failure path).
 */
1775 static int tclass_notify(struct net *net, struct sk_buff *oskb,
1776 struct nlmsghdr *n, struct Qdisc *q,
1777 unsigned long cl, int event)
1779 struct sk_buff *skb;
/* portid 0 (kernel) when there is no originating request skb. */
1780 u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1782 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1786 if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
1791 return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1792 n->nlmsg_flags & NLM_F_ECHO);
/*
 * tclass_del_notify - delete class @cl via @cops->delete() and notify
 * RTNLGRP_TC with an RTM_DELTCLASS message on success.
 *
 * The notification skb is built *before* the delete so the class can
 * still be dumped; if the delete fails the skb is dropped (elided).
 */
1795 static int tclass_del_notify(struct net *net,
1796 const struct Qdisc_class_ops *cops,
1797 struct sk_buff *oskb, struct nlmsghdr *n,
1798 struct Qdisc *q, unsigned long cl)
1800 u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1801 struct sk_buff *skb;
1807 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
/* Fill while the class still exists. */
1811 if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0,
1812 RTM_DELTCLASS) < 0) {
1817 err = cops->delete(q, cl);
1823 return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1824 n->nlmsg_flags & NLM_F_ECHO);
1827 #ifdef CONFIG_NET_CLS
/* Walker context for (re)binding filters to a class; the embedded
 * tcf_walker lets tcf_node_bind() recover it via container-of-style
 * casting.  Remaining members (base, classid, cl) are elided here.
 */
1829 struct tcf_bind_args {
1830 struct tcf_walker w;
/*
 * tcf_node_bind - tcf_walker callback: rebind one filter node @n to the
 * class carried in the surrounding tcf_bind_args, if the filter type
 * implements bind_class.  Return value semantics follow tcf_walker
 * (elided lines not shown).
 */
1836 static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
/* arg is really the tcf_bind_args whose first member is the walker. */
1838 struct tcf_bind_args *a = (void *)arg;
1840 if (tp->ops->bind_class) {
1841 struct Qdisc *q = tcf_block_q(tp->chain->block);
1844 tp->ops->bind_class(n, a->classid, a->cl, q, a->base);
/*
 * tc_bind_tclass - walk every filter on the qdisc's tcf_block and rebind
 * those pointing at class @clid to @new_cl (0 = unbind).  Used after a
 * class is created or deleted so stale filter->class bindings are fixed.
 * CONFIG_NET_CLS variant; a no-op stub exists otherwise.
 *
 * NOTE(review): elided extract -- early-return checks between the
 * numbered lines are missing here.
 */
1850 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
1851 unsigned long new_cl)
1853 const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1854 struct tcf_block *block;
1855 struct tcf_chain *chain;
1858 cl = cops->find(q, portid);
1861 if (!cops->tcf_block)
1863 block = cops->tcf_block(q, cl, NULL);
/* Visit every filter on every chain of the block. */
1866 list_for_each_entry(chain, &block->chain_list, list) {
1867 struct tcf_proto *tp;
1869 for (tp = rtnl_dereference(chain->filter_chain);
1870 tp; tp = rtnl_dereference(tp->next)) {
1871 struct tcf_bind_args arg = {};
1873 arg.w.fn = tcf_node_bind;
1877 tp->ops->walk(tp, &arg.w);
/* !CONFIG_NET_CLS stub: no classifiers, so rebinding is a no-op
 * (empty body elided from this extract).
 */
1884 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
1885 unsigned long new_cl)
/*
 * tc_ctl_tclass - handler for RTM_NEWTCLASS / RTM_DELTCLASS /
 * RTM_GETTCLASS: resolve the owning qdisc and class, then create,
 * change, delete or report the class.  Returns 0 or a negative errno.
 *
 * NOTE(review): elided extract -- error returns and closing braces
 * between the numbered lines are missing here.
 */
1891 static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
1892 struct netlink_ext_ack *extack)
1894 struct net *net = sock_net(skb->sk);
1895 struct tcmsg *tcm = nlmsg_data(n);
1896 struct nlattr *tca[TCA_MAX + 1];
1897 struct net_device *dev;
1898 struct Qdisc *q = NULL;
1899 const struct Qdisc_class_ops *cops;
1900 unsigned long cl = 0;
1901 unsigned long new_cl;
/* Only GET is unprivileged; NEW/DEL need CAP_NET_ADMIN in the netns. */
1907 if ((n->nlmsg_type != RTM_GETTCLASS) &&
1908 !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1911 err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy,
1916 dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1921 parent == TC_H_UNSPEC - unspecified parent.
1922 parent == TC_H_ROOT - class is root, which has no parent.
1923 parent == X:0 - parent is root class.
1924 parent == X:Y - parent is a node in hierarchy.
1925 parent == 0:Y - parent is X:Y, where X:0 is qdisc.
1927 handle == 0:0 - generate handle from kernel pool.
1928 handle == 0:Y - class is X:Y, where X:0 is qdisc.
1929 handle == X:Y - clear.
1930 handle == X:0 - root class.
1933 /* Step 1. Determine qdisc handle X:0 */
1935 portid = tcm->tcm_parent;
1936 clid = tcm->tcm_handle;
1937 qid = TC_H_MAJ(clid);
1939 if (portid != TC_H_ROOT) {
1940 u32 qid1 = TC_H_MAJ(portid);
1943 /* If both majors are known, they must be identical. */
1948 } else if (qid == 0)
1949 qid = dev->qdisc->handle;
1951 /* Now qid is genuine qdisc handle consistent
1952 * both with parent and child.
1954 * TC_H_MAJ(portid) still may be unspecified, complete it now.
1957 portid = TC_H_MAKE(qid, portid);
1960 qid = dev->qdisc->handle;
1963 /* OK. Locate qdisc */
1964 q = qdisc_lookup(dev, qid);
1968 /* And check that it supports classes */
1969 cops = q->ops->cl_ops;
1973 /* Now try to get class */
1975 if (portid == TC_H_ROOT)
1978 clid = TC_H_MAKE(qid, clid);
1981 cl = cops->find(q, clid);
/* Class does not exist: only NEW with NLM_F_CREATE may proceed. */
1985 if (n->nlmsg_type != RTM_NEWTCLASS ||
1986 !(n->nlmsg_flags & NLM_F_CREATE))
1989 switch (n->nlmsg_type) {
1992 if (n->nlmsg_flags & NLM_F_EXCL)
1996 err = tclass_del_notify(net, cops, skb, n, q, cl);
1997 /* Unbind the class from filters with 0 */
1998 tc_bind_tclass(q, portid, clid, 0);
2001 err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
2009 if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
2010 NL_SET_ERR_MSG(extack, "Shared blocks are not supported for classes");
/* Create or change via the qdisc's class ops. */
2017 err = cops->change(q, clid, portid, tca, &new_cl, extack);
2019 tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
2020 /* We just created a new class, need to do reverse binding. */
2022 tc_bind_tclass(q, portid, clid, new_cl);
/* Walker context for dumping classes: wraps the generic qdisc_walker
 * and carries the netlink dump skb/callback so qdisc_class_dump() can
 * fill messages per visited class.
 */
2028 struct qdisc_dump_args {
2029 struct qdisc_walker w;
2030 struct sk_buff *skb;
2031 struct netlink_callback *cb;
/*
 * qdisc_class_dump - qdisc_walker callback: serialize one class into the
 * dump skb via tc_fill_tclass().  A negative return stops the walk.
 */
2034 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl,
2035 struct qdisc_walker *arg)
/* arg is the embedded walker inside qdisc_dump_args. */
2037 struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
2039 return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
2040 a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
/*
 * tc_dump_tclass_qdisc - dump all classes of one qdisc, honoring the
 * resume index *t_p/s_t and an optional tcm_parent filter.  Skips
 * classless qdiscs and those ignored by tc_qdisc_dump_ignore().
 *
 * NOTE(review): elided extract -- increment/return lines between the
 * numbered lines are missing here.
 */
2044 static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
2045 struct tcmsg *tcm, struct netlink_callback *cb,
2048 struct qdisc_dump_args arg;
2050 if (tc_qdisc_dump_ignore(q, false) ||
2051 *t_p < s_t || !q->ops->cl_ops ||
2053 TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
/* Reset per-qdisc walk state when starting a new qdisc. */
2058 memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
2059 arg.w.fn = qdisc_class_dump;
2063 arg.w.skip = cb->args[1];
2065 q->ops->cl_ops->walk(q, &arg.w);
/* Save how far the walk got so an interrupted dump can resume. */
2066 cb->args[1] = arg.w.count;
/*
 * tc_dump_tclass_root - dump classes of @root and, when @recur, of the
 * qdiscs hashed on its device.  When tcm_parent names a specific qdisc,
 * only that qdisc's classes are dumped instead of the whole hash.
 *
 * NOTE(review): elided extract -- returns between the numbered lines
 * are missing here.
 */
2073 static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
2074 struct tcmsg *tcm, struct netlink_callback *cb,
2075 int *t_p, int s_t, bool recur)
2083 if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
2086 if (!qdisc_dev(root) || !recur)
/* Caller asked for one specific parent qdisc: dump only its classes. */
2089 if (tcm->tcm_parent) {
2090 q = qdisc_match_from_root(root, TC_H_MAJ(tcm->tcm_parent));
2091 if (q && q != root &&
2092 tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2096 hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
2097 if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
/*
 * tc_dump_tclass - RTM_GETTCLASS dump handler: dump all classes of the
 * given device's egress tree and its ingress qdisc.  Resume state lives
 * in cb->args; dev_get_by_index() takes a reference released on the
 * (elided) exit path.
 */
2104 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
2106 struct tcmsg *tcm = nlmsg_data(cb->nlh);
2107 struct net *net = sock_net(skb->sk);
2108 struct netdev_queue *dev_queue;
2109 struct net_device *dev;
/* Reject truncated requests before touching tcm fields. */
2112 if (nlmsg_len(cb->nlh) < sizeof(*tcm))
2114 dev = dev_get_by_index(net, tcm->tcm_ifindex);
2121 if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t, true) < 0)
2124 dev_queue = dev_ingress_queue(dev);
2126 tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
2127 &t, s_t, false) < 0)
2137 #ifdef CONFIG_PROC_FS
/*
 * psched_show - /proc/net/psched: four hex words describing the clock
 * resolution of the packet scheduler (tick<->ns ratios and the hrtimer
 * resolution in ticks per second).
 */
2138 static int psched_show(struct seq_file *seq, void *v)
2140 seq_printf(seq, "%08x %08x %08x %08x\n",
2141 (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
2143 (u32)NSEC_PER_SEC / hrtimer_resolution);
/* Per-netns init: create /proc/net/psched (failure handling elided). */
2148 static int __net_init psched_net_init(struct net *net)
2150 struct proc_dir_entry *e;
2152 e = proc_create_single("psched", 0, net->proc_net, psched_show);
/* Per-netns teardown: remove /proc/net/psched. */
2159 static void __net_exit psched_net_exit(struct net *net)
2161 remove_proc_entry("psched", net->proc_net);
/* !CONFIG_PROC_FS stubs: no procfs, so init/exit do nothing
 * (empty bodies elided from this extract).
 */
2164 static int __net_init psched_net_init(struct net *net)
2169 static void __net_exit psched_net_exit(struct net *net)
/* Hooks run for every network namespace created/destroyed. */
2174 static struct pernet_operations psched_net_ops = {
2175 .init = psched_net_init,
2176 .exit = psched_net_exit,
/*
 * pktsched_init - subsystem init: register the pernet /proc hooks, the
 * built-in qdiscs, and the rtnetlink handlers for qdisc and class
 * messages.  Returns 0 or the pernet registration error.
 */
2179 static int __init pktsched_init(void)
2183 err = register_pernet_subsys(&psched_net_ops);
2185 pr_err("pktsched_init: "
2186 "cannot initialize per netns operations\n");
/* Built-in qdiscs available without module loading. */
2190 register_qdisc(&pfifo_fast_ops);
2191 register_qdisc(&pfifo_qdisc_ops);
2192 register_qdisc(&bfifo_qdisc_ops);
2193 register_qdisc(&pfifo_head_drop_qdisc_ops);
2194 register_qdisc(&mq_qdisc_ops);
2195 register_qdisc(&noqueue_qdisc_ops);
/* rtnetlink entry points: doit handlers plus dump handlers for GET. */
2197 rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, 0);
2198 rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, 0);
2199 rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc,
2201 rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, 0);
2202 rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, 0);
2203 rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass,
2209 subsys_initcall(pktsched_init);