GNU Linux-libre 4.14.266-gnu1
[releases.git] / net / sched / cls_u32.c
1 /*
2  * net/sched/cls_u32.c  Ugly (or Universal) 32bit key Packet Classifier.
3  *
4  *              This program is free software; you can redistribute it and/or
5  *              modify it under the terms of the GNU General Public License
6  *              as published by the Free Software Foundation; either version
7  *              2 of the License, or (at your option) any later version.
8  *
9  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10  *
11  *      The filters are packed to hash tables of key nodes
12  *      with a set of 32bit key/mask pairs at every node.
13  *      Nodes reference next level hash tables etc.
14  *
15  *      This scheme is the best universal classifier I managed to
16  *      invent; it is not super-fast, but it is not slow (provided you
17  *      program it correctly), and general enough.  And its relative
18  *      speed grows as the number of rules becomes larger.
19  *
20  *      It seems that it represents the best middle point between
21  *      speed and manageability both by human and by machine.
22  *
23  *      It is especially useful for link sharing combined with QoS;
24  *      pure RSVP doesn't need such a general approach and can use
25  *      much simpler (and faster) schemes, sort of cls_rsvp.c.
26  *
27  *      JHS: We should remove the CONFIG_NET_CLS_IND from here
28  *      eventually when the meta match extension is made available
29  *
30  *      nfmark match added by Catalin(ux aka Dino) BOIE <catab at umbrella.ro>
31  */
32
33 #include <linux/module.h>
34 #include <linux/slab.h>
35 #include <linux/types.h>
36 #include <linux/kernel.h>
37 #include <linux/string.h>
38 #include <linux/errno.h>
39 #include <linux/percpu.h>
40 #include <linux/rtnetlink.h>
41 #include <linux/skbuff.h>
42 #include <linux/bitmap.h>
43 #include <linux/netdevice.h>
44 #include <linux/hash.h>
45 #include <net/netlink.h>
46 #include <net/act_api.h>
47 #include <net/pkt_cls.h>
48 #include <linux/netdevice.h>
49
/* A key node: one filter rule, holding a set of 32bit key/mask pairs. */
struct tc_u_knode {
	struct tc_u_knode __rcu	*next;		/* next knode in the same hash bucket */
	u32			handle;		/* full node handle (see TC_U32_* macros) */
	struct tc_u_hnode __rcu	*ht_up;		/* hash table this node is chained into */
	struct tcf_exts		exts;		/* actions run on a terminal match */
#ifdef CONFIG_NET_CLS_IND
	int			ifindex;	/* checked with tcf_match_indev() */
#endif
	u8			fshift;		/* right shift applied in u32_hash_fold() */
	struct tcf_result	res;		/* result copied out on a terminal match */
	struct tc_u_hnode __rcu	*ht_down;	/* linked hash table to descend into, or NULL */
#ifdef CONFIG_CLS_U32_PERF
	struct tc_u32_pcnt __percpu *pf;	/* per-CPU rcnt/rhit/kcnts counters */
#endif
	u32			flags;		/* TCA_CLS_FLAGS_* (skip_sw, in_hw, ...) */
#ifdef CONFIG_CLS_U32_MARK
	u32			val;		/* node matches if (skb->mark & mask) == val */
	u32			mask;
	u32 __percpu		*pcpu_success;	/* per-CPU count of mark matches */
#endif
	struct tcf_proto	*tp;		/* set by u32_init_knode(); passed to u32_destroy_key() */
	/* 'rcu' is used for call_rcu(); the RCU callbacks then reuse the same
	 * storage as 'work' to finish the destruction from a work item.
	 */
	union {
		struct work_struct	work;
		struct rcu_head		rcu;
	};
	/* The 'sel' field MUST be the last field in structure to allow for
	 * tc_u32_keys allocated at end of structure.
	 */
	struct tc_u32_sel	sel;
};
80
/* A hash table of key nodes. */
struct tc_u_hnode {
	struct tc_u_hnode __rcu	*next;		/* next table on tc_u_common::hlist */
	u32			handle;		/* table handle, matched by u32_lookup_ht() */
	u32			prio;		/* copied from tp->prio for the root table */
	struct tc_u_common	*tp_c;		/* shared state this table belongs to */
	int			refcnt;		/* references from the owning tp and linking knodes */
	unsigned int		divisor;	/* highest valid bucket index; also used as hash mask */
	struct rcu_head		rcu;		/* for kfree_rcu() */
	/* The 'ht' field MUST be the last field in structure to allow for
	 * more entries allocated at end of structure.
	 */
	struct tc_u_knode __rcu	*ht[1];
};
94
/* State shared by all u32 tcf_proto instances attached to the same qdisc. */
struct tc_u_common {
	struct tc_u_hnode __rcu	*hlist;		/* list of all hash tables */
	struct Qdisc		*q;		/* lookup key in tc_u_common_find() */
	int			refcnt;		/* one per u32_init() caller */
	u32			hgenerator;	/* counter used by gen_new_htid() */
	struct hlist_node	hnode;		/* linkage in tc_u_common_hash */
	struct rcu_head		rcu;		/* NOTE(review): not used in the code visible here */
};
103
104 static inline unsigned int u32_hash_fold(__be32 key,
105                                          const struct tc_u32_sel *sel,
106                                          u8 fshift)
107 {
108         unsigned int h = ntohl(key & sel->hmask) >> fshift;
109
110         return h;
111 }
112
113 static int u32_classify(struct sk_buff *skb, const struct tcf_proto *tp,
114                         struct tcf_result *res)
115 {
116         struct {
117                 struct tc_u_knode *knode;
118                 unsigned int      off;
119         } stack[TC_U32_MAXDEPTH];
120
121         struct tc_u_hnode *ht = rcu_dereference_bh(tp->root);
122         unsigned int off = skb_network_offset(skb);
123         struct tc_u_knode *n;
124         int sdepth = 0;
125         int off2 = 0;
126         int sel = 0;
127 #ifdef CONFIG_CLS_U32_PERF
128         int j;
129 #endif
130         int i, r;
131
132 next_ht:
133         n = rcu_dereference_bh(ht->ht[sel]);
134
135 next_knode:
136         if (n) {
137                 struct tc_u32_key *key = n->sel.keys;
138
139 #ifdef CONFIG_CLS_U32_PERF
140                 __this_cpu_inc(n->pf->rcnt);
141                 j = 0;
142 #endif
143
144                 if (tc_skip_sw(n->flags)) {
145                         n = rcu_dereference_bh(n->next);
146                         goto next_knode;
147                 }
148
149 #ifdef CONFIG_CLS_U32_MARK
150                 if ((skb->mark & n->mask) != n->val) {
151                         n = rcu_dereference_bh(n->next);
152                         goto next_knode;
153                 } else {
154                         __this_cpu_inc(*n->pcpu_success);
155                 }
156 #endif
157
158                 for (i = n->sel.nkeys; i > 0; i--, key++) {
159                         int toff = off + key->off + (off2 & key->offmask);
160                         __be32 *data, hdata;
161
162                         if (skb_headroom(skb) + toff > INT_MAX)
163                                 goto out;
164
165                         data = skb_header_pointer(skb, toff, 4, &hdata);
166                         if (!data)
167                                 goto out;
168                         if ((*data ^ key->val) & key->mask) {
169                                 n = rcu_dereference_bh(n->next);
170                                 goto next_knode;
171                         }
172 #ifdef CONFIG_CLS_U32_PERF
173                         __this_cpu_inc(n->pf->kcnts[j]);
174                         j++;
175 #endif
176                 }
177
178                 ht = rcu_dereference_bh(n->ht_down);
179                 if (!ht) {
180 check_terminal:
181                         if (n->sel.flags & TC_U32_TERMINAL) {
182
183                                 *res = n->res;
184 #ifdef CONFIG_NET_CLS_IND
185                                 if (!tcf_match_indev(skb, n->ifindex)) {
186                                         n = rcu_dereference_bh(n->next);
187                                         goto next_knode;
188                                 }
189 #endif
190 #ifdef CONFIG_CLS_U32_PERF
191                                 __this_cpu_inc(n->pf->rhit);
192 #endif
193                                 r = tcf_exts_exec(skb, &n->exts, res);
194                                 if (r < 0) {
195                                         n = rcu_dereference_bh(n->next);
196                                         goto next_knode;
197                                 }
198
199                                 return r;
200                         }
201                         n = rcu_dereference_bh(n->next);
202                         goto next_knode;
203                 }
204
205                 /* PUSH */
206                 if (sdepth >= TC_U32_MAXDEPTH)
207                         goto deadloop;
208                 stack[sdepth].knode = n;
209                 stack[sdepth].off = off;
210                 sdepth++;
211
212                 ht = rcu_dereference_bh(n->ht_down);
213                 sel = 0;
214                 if (ht->divisor) {
215                         __be32 *data, hdata;
216
217                         data = skb_header_pointer(skb, off + n->sel.hoff, 4,
218                                                   &hdata);
219                         if (!data)
220                                 goto out;
221                         sel = ht->divisor & u32_hash_fold(*data, &n->sel,
222                                                           n->fshift);
223                 }
224                 if (!(n->sel.flags & (TC_U32_VAROFFSET | TC_U32_OFFSET | TC_U32_EAT)))
225                         goto next_ht;
226
227                 if (n->sel.flags & (TC_U32_OFFSET | TC_U32_VAROFFSET)) {
228                         off2 = n->sel.off + 3;
229                         if (n->sel.flags & TC_U32_VAROFFSET) {
230                                 __be16 *data, hdata;
231
232                                 data = skb_header_pointer(skb,
233                                                           off + n->sel.offoff,
234                                                           2, &hdata);
235                                 if (!data)
236                                         goto out;
237                                 off2 += ntohs(n->sel.offmask & *data) >>
238                                         n->sel.offshift;
239                         }
240                         off2 &= ~3;
241                 }
242                 if (n->sel.flags & TC_U32_EAT) {
243                         off += off2;
244                         off2 = 0;
245                 }
246
247                 if (off < skb->len)
248                         goto next_ht;
249         }
250
251         /* POP */
252         if (sdepth--) {
253                 n = stack[sdepth].knode;
254                 ht = rcu_dereference_bh(n->ht_up);
255                 off = stack[sdepth].off;
256                 goto check_terminal;
257         }
258 out:
259         return -1;
260
261 deadloop:
262         net_warn_ratelimited("cls_u32: dead loop\n");
263         return -1;
264 }
265
266 static struct tc_u_hnode *u32_lookup_ht(struct tc_u_common *tp_c, u32 handle)
267 {
268         struct tc_u_hnode *ht;
269
270         for (ht = rtnl_dereference(tp_c->hlist);
271              ht;
272              ht = rtnl_dereference(ht->next))
273                 if (ht->handle == handle)
274                         break;
275
276         return ht;
277 }
278
279 static struct tc_u_knode *u32_lookup_key(struct tc_u_hnode *ht, u32 handle)
280 {
281         unsigned int sel;
282         struct tc_u_knode *n = NULL;
283
284         sel = TC_U32_HASH(handle);
285         if (sel > ht->divisor)
286                 goto out;
287
288         for (n = rtnl_dereference(ht->ht[sel]);
289              n;
290              n = rtnl_dereference(n->next))
291                 if (n->handle == handle)
292                         break;
293 out:
294         return n;
295 }
296
297
298 static void *u32_get(struct tcf_proto *tp, u32 handle)
299 {
300         struct tc_u_hnode *ht;
301         struct tc_u_common *tp_c = tp->data;
302
303         if (TC_U32_HTID(handle) == TC_U32_ROOT)
304                 ht = rtnl_dereference(tp->root);
305         else
306                 ht = u32_lookup_ht(tp_c, TC_U32_HTID(handle));
307
308         if (!ht)
309                 return NULL;
310
311         if (TC_U32_KEY(handle) == 0)
312                 return ht;
313
314         return u32_lookup_key(ht, handle);
315 }
316
317 static u32 gen_new_htid(struct tc_u_common *tp_c)
318 {
319         int i = 0x800;
320
321         /* hgenerator only used inside rtnl lock it is safe to increment
322          * without read _copy_ update semantics
323          */
324         do {
325                 if (++tp_c->hgenerator == 0x7FF)
326                         tp_c->hgenerator = 1;
327         } while (--i > 0 && u32_lookup_ht(tp_c, (tp_c->hgenerator|0x800)<<20));
328
329         return i > 0 ? (tp_c->hgenerator|0x800)<<20 : 0;
330 }
331
/* Table of tc_u_common instances, indexed by tc_u_hash().
 * NOTE(review): it is allocated outside the code visible here; presumably
 * with U32_HASH_SIZE buckets -- confirm against the module init code.
 */
static struct hlist_head *tc_u_common_hash;

/* tc_u_hash() produces U32_HASH_SHIFT-bit indices. */
#define U32_HASH_SHIFT 10
#define U32_HASH_SIZE (1 << U32_HASH_SHIFT)
336
337 static unsigned int tc_u_hash(const struct tcf_proto *tp)
338 {
339         struct net_device *dev = tp->q->dev_queue->dev;
340         u32 qhandle = tp->q->handle;
341         int ifindex = dev->ifindex;
342
343         return hash_64((u64)ifindex << 32 | qhandle, U32_HASH_SHIFT);
344 }
345
346 static struct tc_u_common *tc_u_common_find(const struct tcf_proto *tp)
347 {
348         struct tc_u_common *tc;
349         unsigned int h;
350
351         h = tc_u_hash(tp);
352         hlist_for_each_entry(tc, &tc_u_common_hash[h], hnode) {
353                 if (tc->q == tp->q)
354                         return tc;
355         }
356         return NULL;
357 }
358
359 static int u32_init(struct tcf_proto *tp)
360 {
361         struct tc_u_hnode *root_ht;
362         struct tc_u_common *tp_c;
363         unsigned int h;
364
365         tp_c = tc_u_common_find(tp);
366
367         root_ht = kzalloc(sizeof(*root_ht), GFP_KERNEL);
368         if (root_ht == NULL)
369                 return -ENOBUFS;
370
371         root_ht->refcnt++;
372         root_ht->handle = tp_c ? gen_new_htid(tp_c) : 0x80000000;
373         root_ht->prio = tp->prio;
374
375         if (tp_c == NULL) {
376                 tp_c = kzalloc(sizeof(*tp_c), GFP_KERNEL);
377                 if (tp_c == NULL) {
378                         kfree(root_ht);
379                         return -ENOBUFS;
380                 }
381                 tp_c->q = tp->q;
382                 INIT_HLIST_NODE(&tp_c->hnode);
383
384                 h = tc_u_hash(tp);
385                 hlist_add_head(&tp_c->hnode, &tc_u_common_hash[h]);
386         }
387
388         tp_c->refcnt++;
389         RCU_INIT_POINTER(root_ht->next, tp_c->hlist);
390         rcu_assign_pointer(tp_c->hlist, root_ht);
391         root_ht->tp_c = tp_c;
392
393         rcu_assign_pointer(tp->root, root_ht);
394         tp->data = tp_c;
395         return 0;
396 }
397
398 static int u32_destroy_key(struct tcf_proto *tp, struct tc_u_knode *n,
399                            bool free_pf)
400 {
401         struct tc_u_hnode *ht = rtnl_dereference(n->ht_down);
402
403         tcf_exts_destroy(&n->exts);
404         tcf_exts_put_net(&n->exts);
405         if (ht && --ht->refcnt == 0)
406                 kfree(ht);
407 #ifdef CONFIG_CLS_U32_PERF
408         if (free_pf)
409                 free_percpu(n->pf);
410 #endif
411 #ifdef CONFIG_CLS_U32_MARK
412         if (free_pf)
413                 free_percpu(n->pcpu_success);
414 #endif
415         kfree(n);
416         return 0;
417 }
418
419 /* u32_delete_key_rcu should be called when free'ing a copied
420  * version of a tc_u_knode obtained from u32_init_knode(). When
421  * copies are obtained from u32_init_knode() the statistics are
422  * shared between the old and new copies to allow readers to
423  * continue to update the statistics during the copy. To support
424  * this the u32_delete_key_rcu variant does not free the percpu
425  * statistics.
426  */
427 static void u32_delete_key_work(struct work_struct *work)
428 {
429         struct tc_u_knode *key = container_of(work, struct tc_u_knode, work);
430
431         rtnl_lock();
432         u32_destroy_key(key->tp, key, false);
433         rtnl_unlock();
434 }
435
436 static void u32_delete_key_rcu(struct rcu_head *rcu)
437 {
438         struct tc_u_knode *key = container_of(rcu, struct tc_u_knode, rcu);
439
440         INIT_WORK(&key->work, u32_delete_key_work);
441         tcf_queue_work(&key->work);
442 }
443
444 /* u32_delete_key_freepf_rcu is the rcu callback variant
445  * that free's the entire structure including the statistics
446  * percpu variables. Only use this if the key is not a copy
447  * returned by u32_init_knode(). See u32_delete_key_rcu()
448  * for the variant that should be used with keys return from
449  * u32_init_knode()
450  */
451 static void u32_delete_key_freepf_work(struct work_struct *work)
452 {
453         struct tc_u_knode *key = container_of(work, struct tc_u_knode, work);
454
455         rtnl_lock();
456         u32_destroy_key(key->tp, key, true);
457         rtnl_unlock();
458 }
459
460 static void u32_delete_key_freepf_rcu(struct rcu_head *rcu)
461 {
462         struct tc_u_knode *key = container_of(rcu, struct tc_u_knode, rcu);
463
464         INIT_WORK(&key->work, u32_delete_key_freepf_work);
465         tcf_queue_work(&key->work);
466 }
467
468 static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode *key)
469 {
470         struct tc_u_knode __rcu **kp;
471         struct tc_u_knode *pkp;
472         struct tc_u_hnode *ht = rtnl_dereference(key->ht_up);
473
474         if (ht) {
475                 kp = &ht->ht[TC_U32_HASH(key->handle)];
476                 for (pkp = rtnl_dereference(*kp); pkp;
477                      kp = &pkp->next, pkp = rtnl_dereference(*kp)) {
478                         if (pkp == key) {
479                                 RCU_INIT_POINTER(*kp, key->next);
480
481                                 tcf_unbind_filter(tp, &key->res);
482                                 tcf_exts_get_net(&key->exts);
483                                 call_rcu(&key->rcu, u32_delete_key_freepf_rcu);
484                                 return 0;
485                         }
486                 }
487         }
488         WARN_ON(1);
489         return 0;
490 }
491
492 static void u32_remove_hw_knode(struct tcf_proto *tp, u32 handle)
493 {
494         struct net_device *dev = tp->q->dev_queue->dev;
495         struct tc_cls_u32_offload cls_u32 = {};
496
497         if (!tc_should_offload(dev, 0))
498                 return;
499
500         tc_cls_common_offload_init(&cls_u32.common, tp);
501         cls_u32.command = TC_CLSU32_DELETE_KNODE;
502         cls_u32.knode.handle = handle;
503
504         dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSU32, &cls_u32);
505 }
506
507 static int u32_replace_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h,
508                                 u32 flags)
509 {
510         struct net_device *dev = tp->q->dev_queue->dev;
511         struct tc_cls_u32_offload cls_u32 = {};
512         int err;
513
514         if (!tc_should_offload(dev, flags))
515                 return tc_skip_sw(flags) ? -EINVAL : 0;
516
517         tc_cls_common_offload_init(&cls_u32.common, tp);
518         cls_u32.command = TC_CLSU32_NEW_HNODE;
519         cls_u32.hnode.divisor = h->divisor;
520         cls_u32.hnode.handle = h->handle;
521         cls_u32.hnode.prio = h->prio;
522
523         err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSU32, &cls_u32);
524         if (tc_skip_sw(flags))
525                 return err;
526
527         return 0;
528 }
529
530 static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h)
531 {
532         struct net_device *dev = tp->q->dev_queue->dev;
533         struct tc_cls_u32_offload cls_u32 = {};
534
535         if (!tc_should_offload(dev, 0))
536                 return;
537
538         tc_cls_common_offload_init(&cls_u32.common, tp);
539         cls_u32.command = TC_CLSU32_DELETE_HNODE;
540         cls_u32.hnode.divisor = h->divisor;
541         cls_u32.hnode.handle = h->handle;
542         cls_u32.hnode.prio = h->prio;
543
544         dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSU32, &cls_u32);
545 }
546
547 static int u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n,
548                                 u32 flags)
549 {
550         struct net_device *dev = tp->q->dev_queue->dev;
551         struct tc_cls_u32_offload cls_u32 = {};
552         int err;
553
554         if (!tc_should_offload(dev, flags))
555                 return tc_skip_sw(flags) ? -EINVAL : 0;
556
557         tc_cls_common_offload_init(&cls_u32.common, tp);
558         cls_u32.command = TC_CLSU32_REPLACE_KNODE;
559         cls_u32.knode.handle = n->handle;
560         cls_u32.knode.fshift = n->fshift;
561 #ifdef CONFIG_CLS_U32_MARK
562         cls_u32.knode.val = n->val;
563         cls_u32.knode.mask = n->mask;
564 #else
565         cls_u32.knode.val = 0;
566         cls_u32.knode.mask = 0;
567 #endif
568         cls_u32.knode.sel = &n->sel;
569         cls_u32.knode.exts = &n->exts;
570         if (n->ht_down)
571                 cls_u32.knode.link_handle = n->ht_down->handle;
572
573         err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSU32, &cls_u32);
574
575         if (!err)
576                 n->flags |= TCA_CLS_FLAGS_IN_HW;
577
578         if (tc_skip_sw(flags))
579                 return err;
580
581         return 0;
582 }
583
584 static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht)
585 {
586         struct tc_u_knode *n;
587         unsigned int h;
588
589         for (h = 0; h <= ht->divisor; h++) {
590                 while ((n = rtnl_dereference(ht->ht[h])) != NULL) {
591                         RCU_INIT_POINTER(ht->ht[h],
592                                          rtnl_dereference(n->next));
593                         tcf_unbind_filter(tp, &n->res);
594                         u32_remove_hw_knode(tp, n->handle);
595                         if (tcf_exts_get_net(&n->exts))
596                                 call_rcu(&n->rcu, u32_delete_key_freepf_rcu);
597                         else
598                                 u32_destroy_key(n->tp, n, true);
599                 }
600         }
601 }
602
603 static int u32_destroy_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht)
604 {
605         struct tc_u_common *tp_c = tp->data;
606         struct tc_u_hnode __rcu **hn;
607         struct tc_u_hnode *phn;
608
609         WARN_ON(ht->refcnt);
610
611         u32_clear_hnode(tp, ht);
612
613         hn = &tp_c->hlist;
614         for (phn = rtnl_dereference(*hn);
615              phn;
616              hn = &phn->next, phn = rtnl_dereference(*hn)) {
617                 if (phn == ht) {
618                         u32_clear_hw_hnode(tp, ht);
619                         RCU_INIT_POINTER(*hn, ht->next);
620                         kfree_rcu(ht, rcu);
621                         return 0;
622                 }
623         }
624
625         return -ENOENT;
626 }
627
628 static bool ht_empty(struct tc_u_hnode *ht)
629 {
630         unsigned int h;
631
632         for (h = 0; h <= ht->divisor; h++)
633                 if (rcu_access_pointer(ht->ht[h]))
634                         return false;
635
636         return true;
637 }
638
639 static void u32_destroy(struct tcf_proto *tp)
640 {
641         struct tc_u_common *tp_c = tp->data;
642         struct tc_u_hnode *root_ht = rtnl_dereference(tp->root);
643
644         WARN_ON(root_ht == NULL);
645
646         if (root_ht && --root_ht->refcnt == 0)
647                 u32_destroy_hnode(tp, root_ht);
648
649         if (--tp_c->refcnt == 0) {
650                 struct tc_u_hnode *ht;
651
652                 hlist_del(&tp_c->hnode);
653
654                 while ((ht = rtnl_dereference(tp_c->hlist)) != NULL) {
655                         u32_clear_hnode(tp, ht);
656                         RCU_INIT_POINTER(tp_c->hlist, ht->next);
657
658                         /* u32_destroy_key() will later free ht for us, if it's
659                          * still referenced by some knode
660                          */
661                         if (--ht->refcnt == 0)
662                                 kfree_rcu(ht, rcu);
663                 }
664
665                 kfree(tp_c);
666         }
667
668         tp->data = NULL;
669 }
670
671 static int u32_delete(struct tcf_proto *tp, void *arg, bool *last)
672 {
673         struct tc_u_hnode *ht = arg;
674         struct tc_u_hnode *root_ht = rtnl_dereference(tp->root);
675         struct tc_u_common *tp_c = tp->data;
676         int ret = 0;
677
678         if (ht == NULL)
679                 goto out;
680
681         if (TC_U32_KEY(ht->handle)) {
682                 u32_remove_hw_knode(tp, ht->handle);
683                 ret = u32_delete_key(tp, (struct tc_u_knode *)ht);
684                 goto out;
685         }
686
687         if (root_ht == ht)
688                 return -EINVAL;
689
690         if (ht->refcnt == 1) {
691                 ht->refcnt--;
692                 u32_destroy_hnode(tp, ht);
693         } else {
694                 return -EBUSY;
695         }
696
697 out:
698         *last = true;
699         if (root_ht) {
700                 if (root_ht->refcnt > 1) {
701                         *last = false;
702                         goto ret;
703                 }
704                 if (root_ht->refcnt == 1) {
705                         if (!ht_empty(root_ht)) {
706                                 *last = false;
707                                 goto ret;
708                         }
709                 }
710         }
711
712         if (tp_c->refcnt > 1) {
713                 *last = false;
714                 goto ret;
715         }
716
717         if (tp_c->refcnt == 1) {
718                 struct tc_u_hnode *ht;
719
720                 for (ht = rtnl_dereference(tp_c->hlist);
721                      ht;
722                      ht = rtnl_dereference(ht->next))
723                         if (!ht_empty(ht)) {
724                                 *last = false;
725                                 break;
726                         }
727         }
728
729 ret:
730         return ret;
731 }
732
733 #define NR_U32_NODE (1<<12)
734 static u32 gen_new_kid(struct tc_u_hnode *ht, u32 handle)
735 {
736         struct tc_u_knode *n;
737         unsigned long i;
738         unsigned long *bitmap = kzalloc(BITS_TO_LONGS(NR_U32_NODE) * sizeof(unsigned long),
739                                         GFP_KERNEL);
740         if (!bitmap)
741                 return handle | 0xFFF;
742
743         for (n = rtnl_dereference(ht->ht[TC_U32_HASH(handle)]);
744              n;
745              n = rtnl_dereference(n->next))
746                 set_bit(TC_U32_NODE(n->handle), bitmap);
747
748         i = find_next_zero_bit(bitmap, NR_U32_NODE, 0x800);
749         if (i >= NR_U32_NODE)
750                 i = find_next_zero_bit(bitmap, NR_U32_NODE, 1);
751
752         kfree(bitmap);
753         return handle | (i >= NR_U32_NODE ? 0xFFF : i);
754 }
755
756 static const struct nla_policy u32_policy[TCA_U32_MAX + 1] = {
757         [TCA_U32_CLASSID]       = { .type = NLA_U32 },
758         [TCA_U32_HASH]          = { .type = NLA_U32 },
759         [TCA_U32_LINK]          = { .type = NLA_U32 },
760         [TCA_U32_DIVISOR]       = { .type = NLA_U32 },
761         [TCA_U32_SEL]           = { .len = sizeof(struct tc_u32_sel) },
762         [TCA_U32_INDEV]         = { .type = NLA_STRING, .len = IFNAMSIZ },
763         [TCA_U32_MARK]          = { .len = sizeof(struct tc_u32_mark) },
764         [TCA_U32_FLAGS]         = { .type = NLA_U32 },
765 };
766
767 static int u32_set_parms(struct net *net, struct tcf_proto *tp,
768                          unsigned long base, struct tc_u_hnode *ht,
769                          struct tc_u_knode *n, struct nlattr **tb,
770                          struct nlattr *est, bool ovr)
771 {
772         int err;
773
774         err = tcf_exts_validate(net, tp, tb, est, &n->exts, ovr);
775         if (err < 0)
776                 return err;
777
778         if (tb[TCA_U32_LINK]) {
779                 u32 handle = nla_get_u32(tb[TCA_U32_LINK]);
780                 struct tc_u_hnode *ht_down = NULL, *ht_old;
781
782                 if (TC_U32_KEY(handle))
783                         return -EINVAL;
784
785                 if (handle) {
786                         ht_down = u32_lookup_ht(ht->tp_c, handle);
787
788                         if (ht_down == NULL)
789                                 return -EINVAL;
790                         ht_down->refcnt++;
791                 }
792
793                 ht_old = rtnl_dereference(n->ht_down);
794                 rcu_assign_pointer(n->ht_down, ht_down);
795
796                 if (ht_old)
797                         ht_old->refcnt--;
798         }
799         if (tb[TCA_U32_CLASSID]) {
800                 n->res.classid = nla_get_u32(tb[TCA_U32_CLASSID]);
801                 tcf_bind_filter(tp, &n->res, base);
802         }
803
804 #ifdef CONFIG_NET_CLS_IND
805         if (tb[TCA_U32_INDEV]) {
806                 int ret;
807                 ret = tcf_change_indev(net, tb[TCA_U32_INDEV]);
808                 if (ret < 0)
809                         return -EINVAL;
810                 n->ifindex = ret;
811         }
812 #endif
813         return 0;
814 }
815
816 static void u32_replace_knode(struct tcf_proto *tp, struct tc_u_common *tp_c,
817                               struct tc_u_knode *n)
818 {
819         struct tc_u_knode __rcu **ins;
820         struct tc_u_knode *pins;
821         struct tc_u_hnode *ht;
822
823         if (TC_U32_HTID(n->handle) == TC_U32_ROOT)
824                 ht = rtnl_dereference(tp->root);
825         else
826                 ht = u32_lookup_ht(tp_c, TC_U32_HTID(n->handle));
827
828         ins = &ht->ht[TC_U32_HASH(n->handle)];
829
830         /* The node must always exist for it to be replaced if this is not the
831          * case then something went very wrong elsewhere.
832          */
833         for (pins = rtnl_dereference(*ins); ;
834              ins = &pins->next, pins = rtnl_dereference(*ins))
835                 if (pins->handle == n->handle)
836                         break;
837
838         RCU_INIT_POINTER(n->next, pins->next);
839         rcu_assign_pointer(*ins, n);
840 }
841
842 static struct tc_u_knode *u32_init_knode(struct tcf_proto *tp,
843                                          struct tc_u_knode *n)
844 {
845         struct tc_u_knode *new;
846         struct tc_u32_sel *s = &n->sel;
847
848         new = kzalloc(sizeof(*n) + s->nkeys*sizeof(struct tc_u32_key),
849                       GFP_KERNEL);
850
851         if (!new)
852                 return NULL;
853
854         RCU_INIT_POINTER(new->next, n->next);
855         new->handle = n->handle;
856         RCU_INIT_POINTER(new->ht_up, n->ht_up);
857
858 #ifdef CONFIG_NET_CLS_IND
859         new->ifindex = n->ifindex;
860 #endif
861         new->fshift = n->fshift;
862         new->res = n->res;
863         new->flags = n->flags;
864         RCU_INIT_POINTER(new->ht_down, n->ht_down);
865
866         /* bump reference count as long as we hold pointer to structure */
867         if (new->ht_down)
868                 new->ht_down->refcnt++;
869
870 #ifdef CONFIG_CLS_U32_PERF
871         /* Statistics may be incremented by readers during update
872          * so we must keep them in tact. When the node is later destroyed
873          * a special destroy call must be made to not free the pf memory.
874          */
875         new->pf = n->pf;
876 #endif
877
878 #ifdef CONFIG_CLS_U32_MARK
879         new->val = n->val;
880         new->mask = n->mask;
881         /* Similarly success statistics must be moved as pointers */
882         new->pcpu_success = n->pcpu_success;
883 #endif
884         new->tp = tp;
885         memcpy(&new->sel, s, sizeof(*s) + s->nkeys*sizeof(struct tc_u32_key));
886
887         if (tcf_exts_init(&new->exts, TCA_U32_ACT, TCA_U32_POLICE)) {
888                 kfree(new);
889                 return NULL;
890         }
891
892         return new;
893 }
894
895 static int u32_change(struct net *net, struct sk_buff *in_skb,
896                       struct tcf_proto *tp, unsigned long base, u32 handle,
897                       struct nlattr **tca, void **arg, bool ovr)
898 {
899         struct tc_u_common *tp_c = tp->data;
900         struct tc_u_hnode *ht;
901         struct tc_u_knode *n;
902         struct tc_u32_sel *s;
903         struct nlattr *opt = tca[TCA_OPTIONS];
904         struct nlattr *tb[TCA_U32_MAX + 1];
905         u32 htid, flags = 0;
906         size_t sel_size;
907         int err;
908 #ifdef CONFIG_CLS_U32_PERF
909         size_t size;
910 #endif
911
912         if (opt == NULL)
913                 return handle ? -EINVAL : 0;
914
915         err = nla_parse_nested(tb, TCA_U32_MAX, opt, u32_policy, NULL);
916         if (err < 0)
917                 return err;
918
919         if (tb[TCA_U32_FLAGS]) {
920                 flags = nla_get_u32(tb[TCA_U32_FLAGS]);
921                 if (!tc_flags_valid(flags))
922                         return -EINVAL;
923         }
924
925         n = *arg;
926         if (n) {
927                 struct tc_u_knode *new;
928
929                 if (TC_U32_KEY(n->handle) == 0)
930                         return -EINVAL;
931
932                 if ((n->flags ^ flags) &
933                     ~(TCA_CLS_FLAGS_IN_HW | TCA_CLS_FLAGS_NOT_IN_HW))
934                         return -EINVAL;
935
936                 new = u32_init_knode(tp, n);
937                 if (!new)
938                         return -ENOMEM;
939
940                 err = u32_set_parms(net, tp, base,
941                                     rtnl_dereference(n->ht_up), new, tb,
942                                     tca[TCA_RATE], ovr);
943
944                 if (err) {
945                         u32_destroy_key(tp, new, false);
946                         return err;
947                 }
948
949                 err = u32_replace_hw_knode(tp, new, flags);
950                 if (err) {
951                         u32_destroy_key(tp, new, false);
952                         return err;
953                 }
954
955                 if (!tc_in_hw(new->flags))
956                         new->flags |= TCA_CLS_FLAGS_NOT_IN_HW;
957
958                 u32_replace_knode(tp, tp_c, new);
959                 tcf_unbind_filter(tp, &n->res);
960                 tcf_exts_get_net(&n->exts);
961                 call_rcu(&n->rcu, u32_delete_key_rcu);
962                 return 0;
963         }
964
965         if (tb[TCA_U32_DIVISOR]) {
966                 unsigned int divisor = nla_get_u32(tb[TCA_U32_DIVISOR]);
967
968                 if (--divisor > 0x100)
969                         return -EINVAL;
970                 if (TC_U32_KEY(handle))
971                         return -EINVAL;
972                 if (handle == 0) {
973                         handle = gen_new_htid(tp->data);
974                         if (handle == 0)
975                                 return -ENOMEM;
976                 }
977                 ht = kzalloc(sizeof(*ht) + divisor*sizeof(void *), GFP_KERNEL);
978                 if (ht == NULL)
979                         return -ENOBUFS;
980                 ht->tp_c = tp_c;
981                 ht->refcnt = 1;
982                 ht->divisor = divisor;
983                 ht->handle = handle;
984                 ht->prio = tp->prio;
985
986                 err = u32_replace_hw_hnode(tp, ht, flags);
987                 if (err) {
988                         kfree(ht);
989                         return err;
990                 }
991
992                 RCU_INIT_POINTER(ht->next, tp_c->hlist);
993                 rcu_assign_pointer(tp_c->hlist, ht);
994                 *arg = ht;
995
996                 return 0;
997         }
998
999         if (tb[TCA_U32_HASH]) {
1000                 htid = nla_get_u32(tb[TCA_U32_HASH]);
1001                 if (TC_U32_HTID(htid) == TC_U32_ROOT) {
1002                         ht = rtnl_dereference(tp->root);
1003                         htid = ht->handle;
1004                 } else {
1005                         ht = u32_lookup_ht(tp->data, TC_U32_HTID(htid));
1006                         if (ht == NULL)
1007                                 return -EINVAL;
1008                 }
1009         } else {
1010                 ht = rtnl_dereference(tp->root);
1011                 htid = ht->handle;
1012         }
1013
1014         if (ht->divisor < TC_U32_HASH(htid))
1015                 return -EINVAL;
1016
1017         if (handle) {
1018                 if (TC_U32_HTID(handle) && TC_U32_HTID(handle^htid))
1019                         return -EINVAL;
1020                 handle = htid | TC_U32_NODE(handle);
1021         } else
1022                 handle = gen_new_kid(ht, htid);
1023
1024         if (tb[TCA_U32_SEL] == NULL)
1025                 return -EINVAL;
1026
1027         s = nla_data(tb[TCA_U32_SEL]);
1028         sel_size = sizeof(*s) + sizeof(*s->keys) * s->nkeys;
1029         if (nla_len(tb[TCA_U32_SEL]) < sel_size)
1030                 return -EINVAL;
1031
1032         n = kzalloc(offsetof(typeof(*n), sel) + sel_size, GFP_KERNEL);
1033         if (n == NULL)
1034                 return -ENOBUFS;
1035
1036 #ifdef CONFIG_CLS_U32_PERF
1037         size = sizeof(struct tc_u32_pcnt) + s->nkeys * sizeof(u64);
1038         n->pf = __alloc_percpu(size, __alignof__(struct tc_u32_pcnt));
1039         if (!n->pf) {
1040                 kfree(n);
1041                 return -ENOBUFS;
1042         }
1043 #endif
1044
1045         memcpy(&n->sel, s, sel_size);
1046         RCU_INIT_POINTER(n->ht_up, ht);
1047         n->handle = handle;
1048         n->fshift = s->hmask ? ffs(ntohl(s->hmask)) - 1 : 0;
1049         n->flags = flags;
1050         n->tp = tp;
1051
1052         err = tcf_exts_init(&n->exts, TCA_U32_ACT, TCA_U32_POLICE);
1053         if (err < 0)
1054                 goto errout;
1055
1056 #ifdef CONFIG_CLS_U32_MARK
1057         n->pcpu_success = alloc_percpu(u32);
1058         if (!n->pcpu_success) {
1059                 err = -ENOMEM;
1060                 goto errout;
1061         }
1062
1063         if (tb[TCA_U32_MARK]) {
1064                 struct tc_u32_mark *mark;
1065
1066                 mark = nla_data(tb[TCA_U32_MARK]);
1067                 n->val = mark->val;
1068                 n->mask = mark->mask;
1069         }
1070 #endif
1071
1072         err = u32_set_parms(net, tp, base, ht, n, tb, tca[TCA_RATE], ovr);
1073         if (err == 0) {
1074                 struct tc_u_knode __rcu **ins;
1075                 struct tc_u_knode *pins;
1076
1077                 err = u32_replace_hw_knode(tp, n, flags);
1078                 if (err)
1079                         goto errhw;
1080
1081                 if (!tc_in_hw(n->flags))
1082                         n->flags |= TCA_CLS_FLAGS_NOT_IN_HW;
1083
1084                 ins = &ht->ht[TC_U32_HASH(handle)];
1085                 for (pins = rtnl_dereference(*ins); pins;
1086                      ins = &pins->next, pins = rtnl_dereference(*ins))
1087                         if (TC_U32_NODE(handle) < TC_U32_NODE(pins->handle))
1088                                 break;
1089
1090                 RCU_INIT_POINTER(n->next, pins);
1091                 rcu_assign_pointer(*ins, n);
1092                 *arg = n;
1093                 return 0;
1094         }
1095
1096 errhw:
1097 #ifdef CONFIG_CLS_U32_MARK
1098         free_percpu(n->pcpu_success);
1099 #endif
1100
1101 errout:
1102         tcf_exts_destroy(&n->exts);
1103 #ifdef CONFIG_CLS_U32_PERF
1104         free_percpu(n->pf);
1105 #endif
1106         kfree(n);
1107         return err;
1108 }
1109
1110 static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg)
1111 {
1112         struct tc_u_common *tp_c = tp->data;
1113         struct tc_u_hnode *ht;
1114         struct tc_u_knode *n;
1115         unsigned int h;
1116
1117         if (arg->stop)
1118                 return;
1119
1120         for (ht = rtnl_dereference(tp_c->hlist);
1121              ht;
1122              ht = rtnl_dereference(ht->next)) {
1123                 if (ht->prio != tp->prio)
1124                         continue;
1125                 if (arg->count >= arg->skip) {
1126                         if (arg->fn(tp, ht, arg) < 0) {
1127                                 arg->stop = 1;
1128                                 return;
1129                         }
1130                 }
1131                 arg->count++;
1132                 for (h = 0; h <= ht->divisor; h++) {
1133                         for (n = rtnl_dereference(ht->ht[h]);
1134                              n;
1135                              n = rtnl_dereference(n->next)) {
1136                                 if (arg->count < arg->skip) {
1137                                         arg->count++;
1138                                         continue;
1139                                 }
1140                                 if (arg->fn(tp, n, arg) < 0) {
1141                                         arg->stop = 1;
1142                                         return;
1143                                 }
1144                                 arg->count++;
1145                         }
1146                 }
1147         }
1148 }
1149
1150 static void u32_bind_class(void *fh, u32 classid, unsigned long cl)
1151 {
1152         struct tc_u_knode *n = fh;
1153
1154         if (n && n->res.classid == classid)
1155                 n->res.class = cl;
1156 }
1157
1158 static int u32_dump(struct net *net, struct tcf_proto *tp, void *fh,
1159                     struct sk_buff *skb, struct tcmsg *t)
1160 {
1161         struct tc_u_knode *n = fh;
1162         struct tc_u_hnode *ht_up, *ht_down;
1163         struct nlattr *nest;
1164
1165         if (n == NULL)
1166                 return skb->len;
1167
1168         t->tcm_handle = n->handle;
1169
1170         nest = nla_nest_start(skb, TCA_OPTIONS);
1171         if (nest == NULL)
1172                 goto nla_put_failure;
1173
1174         if (TC_U32_KEY(n->handle) == 0) {
1175                 struct tc_u_hnode *ht = fh;
1176                 u32 divisor = ht->divisor + 1;
1177
1178                 if (nla_put_u32(skb, TCA_U32_DIVISOR, divisor))
1179                         goto nla_put_failure;
1180         } else {
1181 #ifdef CONFIG_CLS_U32_PERF
1182                 struct tc_u32_pcnt *gpf;
1183                 int cpu;
1184 #endif
1185
1186                 if (nla_put(skb, TCA_U32_SEL,
1187                             sizeof(n->sel) + n->sel.nkeys*sizeof(struct tc_u32_key),
1188                             &n->sel))
1189                         goto nla_put_failure;
1190
1191                 ht_up = rtnl_dereference(n->ht_up);
1192                 if (ht_up) {
1193                         u32 htid = n->handle & 0xFFFFF000;
1194                         if (nla_put_u32(skb, TCA_U32_HASH, htid))
1195                                 goto nla_put_failure;
1196                 }
1197                 if (n->res.classid &&
1198                     nla_put_u32(skb, TCA_U32_CLASSID, n->res.classid))
1199                         goto nla_put_failure;
1200
1201                 ht_down = rtnl_dereference(n->ht_down);
1202                 if (ht_down &&
1203                     nla_put_u32(skb, TCA_U32_LINK, ht_down->handle))
1204                         goto nla_put_failure;
1205
1206                 if (n->flags && nla_put_u32(skb, TCA_U32_FLAGS, n->flags))
1207                         goto nla_put_failure;
1208
1209 #ifdef CONFIG_CLS_U32_MARK
1210                 if ((n->val || n->mask)) {
1211                         struct tc_u32_mark mark = {.val = n->val,
1212                                                    .mask = n->mask,
1213                                                    .success = 0};
1214                         int cpum;
1215
1216                         for_each_possible_cpu(cpum) {
1217                                 __u32 cnt = *per_cpu_ptr(n->pcpu_success, cpum);
1218
1219                                 mark.success += cnt;
1220                         }
1221
1222                         if (nla_put(skb, TCA_U32_MARK, sizeof(mark), &mark))
1223                                 goto nla_put_failure;
1224                 }
1225 #endif
1226
1227                 if (tcf_exts_dump(skb, &n->exts) < 0)
1228                         goto nla_put_failure;
1229
1230 #ifdef CONFIG_NET_CLS_IND
1231                 if (n->ifindex) {
1232                         struct net_device *dev;
1233                         dev = __dev_get_by_index(net, n->ifindex);
1234                         if (dev && nla_put_string(skb, TCA_U32_INDEV, dev->name))
1235                                 goto nla_put_failure;
1236                 }
1237 #endif
1238 #ifdef CONFIG_CLS_U32_PERF
1239                 gpf = kzalloc(sizeof(struct tc_u32_pcnt) +
1240                               n->sel.nkeys * sizeof(u64),
1241                               GFP_KERNEL);
1242                 if (!gpf)
1243                         goto nla_put_failure;
1244
1245                 for_each_possible_cpu(cpu) {
1246                         int i;
1247                         struct tc_u32_pcnt *pf = per_cpu_ptr(n->pf, cpu);
1248
1249                         gpf->rcnt += pf->rcnt;
1250                         gpf->rhit += pf->rhit;
1251                         for (i = 0; i < n->sel.nkeys; i++)
1252                                 gpf->kcnts[i] += pf->kcnts[i];
1253                 }
1254
1255                 if (nla_put_64bit(skb, TCA_U32_PCNT,
1256                                   sizeof(struct tc_u32_pcnt) +
1257                                   n->sel.nkeys * sizeof(u64),
1258                                   gpf, TCA_U32_PAD)) {
1259                         kfree(gpf);
1260                         goto nla_put_failure;
1261                 }
1262                 kfree(gpf);
1263 #endif
1264         }
1265
1266         nla_nest_end(skb, nest);
1267
1268         if (TC_U32_KEY(n->handle))
1269                 if (tcf_exts_dump_stats(skb, &n->exts) < 0)
1270                         goto nla_put_failure;
1271         return skb->len;
1272
1273 nla_put_failure:
1274         nla_nest_cancel(skb, nest);
1275         return -1;
1276 }
1277
1278 static struct tcf_proto_ops cls_u32_ops __read_mostly = {
1279         .kind           =       "u32",
1280         .classify       =       u32_classify,
1281         .init           =       u32_init,
1282         .destroy        =       u32_destroy,
1283         .get            =       u32_get,
1284         .change         =       u32_change,
1285         .delete         =       u32_delete,
1286         .walk           =       u32_walk,
1287         .dump           =       u32_dump,
1288         .bind_class     =       u32_bind_class,
1289         .owner          =       THIS_MODULE,
1290 };
1291
1292 static int __init init_u32(void)
1293 {
1294         int i, ret;
1295
1296         pr_info("u32 classifier\n");
1297 #ifdef CONFIG_CLS_U32_PERF
1298         pr_info("    Performance counters on\n");
1299 #endif
1300 #ifdef CONFIG_NET_CLS_IND
1301         pr_info("    input device check on\n");
1302 #endif
1303 #ifdef CONFIG_NET_CLS_ACT
1304         pr_info("    Actions configured\n");
1305 #endif
1306         tc_u_common_hash = kvmalloc_array(U32_HASH_SIZE,
1307                                           sizeof(struct hlist_head),
1308                                           GFP_KERNEL);
1309         if (!tc_u_common_hash)
1310                 return -ENOMEM;
1311
1312         for (i = 0; i < U32_HASH_SIZE; i++)
1313                 INIT_HLIST_HEAD(&tc_u_common_hash[i]);
1314
1315         ret = register_tcf_proto_ops(&cls_u32_ops);
1316         if (ret)
1317                 kvfree(tc_u_common_hash);
1318         return ret;
1319 }
1320
1321 static void __exit exit_u32(void)
1322 {
1323         unregister_tcf_proto_ops(&cls_u32_ops);
1324         kvfree(tc_u_common_hash);
1325 }
1326
1327 module_init(init_u32)
1328 module_exit(exit_u32)
1329 MODULE_LICENSE("GPL");