GNU Linux-libre 4.19.264-gnu1
[releases.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <linux/siphash.h>
49 #include <net/net_namespace.h>
50 #include <net/snmp.h>
51 #include <net/ipv6.h>
52 #include <net/ip6_fib.h>
53 #include <net/ip6_route.h>
54 #include <net/ndisc.h>
55 #include <net/addrconf.h>
56 #include <net/tcp.h>
57 #include <linux/rtnetlink.h>
58 #include <net/dst.h>
59 #include <net/dst_metadata.h>
60 #include <net/xfrm.h>
61 #include <net/netevent.h>
62 #include <net/netlink.h>
63 #include <net/nexthop.h>
64 #include <net/lwtunnel.h>
65 #include <net/ip_tunnels.h>
66 #include <net/l3mdev.h>
67 #include <net/ip.h>
68 #include <linux/uaccess.h>
69
70 #ifdef CONFIG_SYSCTL
71 #include <linux/sysctl.h>
72 #endif
73
74 static int ip6_rt_type_to_error(u8 fib6_type);
75
76 #define CREATE_TRACE_POINTS
77 #include <trace/events/fib6.h>
78 EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
79 #undef CREATE_TRACE_POINTS
80
/* Next-hop reachability classes returned by rt6_check_neigh() and fed into
 * route scoring: any negative value means the neighbour is not usable;
 * RT6_NUD_FAIL_DO_RR additionally makes find_match() trigger round-robin
 * router selection.
 */
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,
	RT6_NUD_FAIL_PROBE = -2,
	RT6_NUD_FAIL_DO_RR = -1,
	RT6_NUD_SUCCEED = 1
};
87
88 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
89 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
90 static unsigned int      ip6_mtu(const struct dst_entry *dst);
91 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
92 static void             ip6_dst_destroy(struct dst_entry *);
93 static void             ip6_dst_ifdown(struct dst_entry *,
94                                        struct net_device *dev, int how);
95 static int               ip6_dst_gc(struct dst_ops *ops);
96
97 static int              ip6_pkt_discard(struct sk_buff *skb);
98 static int              ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
99 static int              ip6_pkt_prohibit(struct sk_buff *skb);
100 static int              ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
101 static void             ip6_link_failure(struct sk_buff *skb);
102 static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
103                                            struct sk_buff *skb, u32 mtu,
104                                            bool confirm_neigh);
105 static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
106                                         struct sk_buff *skb);
107 static int rt6_score_route(struct fib6_info *rt, int oif, int strict);
108 static size_t rt6_nlmsg_size(struct fib6_info *rt);
109 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
110                          struct fib6_info *rt, struct dst_entry *dst,
111                          struct in6_addr *dest, struct in6_addr *src,
112                          int iif, int type, u32 portid, u32 seq,
113                          unsigned int flags);
114 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
115                                            const struct in6_addr *daddr,
116                                            const struct in6_addr *saddr);
117
118 #ifdef CONFIG_IPV6_ROUTE_INFO
119 static struct fib6_info *rt6_add_route_info(struct net *net,
120                                            const struct in6_addr *prefix, int prefixlen,
121                                            const struct in6_addr *gwaddr,
122                                            struct net_device *dev,
123                                            unsigned int pref);
124 static struct fib6_info *rt6_get_route_info(struct net *net,
125                                            const struct in6_addr *prefix, int prefixlen,
126                                            const struct in6_addr *gwaddr,
127                                            struct net_device *dev);
128 #endif
129
/* Per-CPU list of rt6_info entries that are not linked into a fib6 node
 * (uncached/ephemeral dsts), kept so rt6_uncached_list_flush_dev() can
 * find and retarget them when their device is unregistered.
 */
struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
136
/* Add @rt to this CPU's uncached list.  The list an entry joined is
 * remembered in rt6i_uncached_list because deletion may happen on a
 * different CPU.
 */
void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}
147
/* Remove @rt from the uncached list it was added to, if any, and drop the
 * per-netns uncached-route counter.  Safe to call on entries that were
 * never added (list_empty() check).
 */
void rt6_uncached_list_del(struct rt6_info *rt)
{
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;
		struct net *net = dev_net(rt->dst.dev);

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
		spin_unlock_bh(&ul->lock);
	}
}
160
/* Device-unregister helper: walk every CPU's uncached list and retarget
 * entries still pointing at @dev to the loopback device, migrating both
 * the inet6_dev and net_device references so @dev's refcount can reach
 * zero.  Loopback itself is never flushed.
 */
static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				/* take the loopback reference before dropping
				 * the old device's reference
				 */
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}
192
193 static inline const void *choose_neigh_daddr(const struct in6_addr *p,
194                                              struct sk_buff *skb,
195                                              const void *daddr)
196 {
197         if (!ipv6_addr_any(p))
198                 return (const void *) p;
199         else if (skb)
200                 return &ipv6_hdr(skb)->daddr;
201         return daddr;
202 }
203
/* Find or create the neighbour entry for a next hop on @dev.  The lookup
 * key comes from choose_neigh_daddr() (gateway, else packet destination,
 * else @daddr).  Returns NULL instead of an ERR_PTR when creation fails.
 */
struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
				   struct net_device *dev,
				   struct sk_buff *skb,
				   const void *daddr)
{
	struct neighbour *n;

	daddr = choose_neigh_daddr(gw, skb, daddr);
	n = __ipv6_neigh_lookup(dev, daddr);
	if (n)
		return n;

	n = neigh_create(&nd_tbl, daddr, dev);
	return IS_ERR(n) ? NULL : n;
}
219
220 static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
221                                               struct sk_buff *skb,
222                                               const void *daddr)
223 {
224         const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);
225
226         return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
227 }
228
/* dst_ops->confirm_neigh: mark the route's neighbour as recently confirmed
 * reachable.  Skips device types and destination classes that carry no
 * per-peer neighbour state.
 */
static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	/* resolve the real neighbour key (gateway if set, else @daddr) */
	daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
	if (!daddr)
		return;
	/* NOARP and loopback devices keep no neighbour entries */
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	/* multicast destinations have no unicast neighbour to confirm */
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}
243
/* dst_ops used for ordinary IPv6 dst entries; copied into each netns as
 * net->ipv6.ip6_dst_ops.
 */
static struct dst_ops ip6_dst_ops_template = {
	.family			=	AF_INET6,
	.gc			=	ip6_dst_gc,
	.gc_thresh		=	1024,
	.check			=	ip6_dst_check,
	.default_advmss		=	ip6_default_advmss,
	.mtu			=	ip6_mtu,
	.cow_metrics		=	dst_cow_metrics_generic,
	.destroy		=	ip6_dst_destroy,
	.ifdown			=	ip6_dst_ifdown,
	.negative_advice	=	ip6_negative_advice,
	.link_failure		=	ip6_link_failure,
	.update_pmtu		=	ip6_rt_update_pmtu,
	.redirect		=	rt6_do_redirect,
	.local_out		=	__ip6_local_out,
	.neigh_lookup		=	ip6_dst_neigh_lookup,
	.confirm_neigh		=	ip6_confirm_neigh,
};
262
263 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
264 {
265         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
266
267         return mtu ? : dst->dev->mtu;
268 }
269
/* Blackhole dsts deliberately ignore PMTU updates. */
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu,
					 bool confirm_neigh)
{
}
275
/* Blackhole dsts deliberately ignore redirects. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}
280
/* dst_ops for blackhole copies of routes (see the no-op update_pmtu and
 * redirect handlers above); no gc and no ifdown hook.
 */
static struct dst_ops ip6_dst_blackhole_ops = {
	.family			=	AF_INET6,
	.destroy		=	ip6_dst_destroy,
	.check			=	ip6_dst_check,
	.mtu			=	ip6_blackhole_mtu,
	.default_advmss		=	ip6_default_advmss,
	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
	.redirect		=	ip6_rt_blackhole_redirect,
	.cow_metrics		=	dst_cow_metrics_generic,
	.neigh_lookup		=	ip6_dst_neigh_lookup,
};
292
/* Metrics template for the special route entries below; hop limit 0 means
 * "use the default".
 */
static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};
296
/* Template for the per-netns fib6_null_entry: the "no route" FIB entry
 * returned by lookups that match nothing.
 */
static const struct fib6_info fib6_null_entry_template = {
	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.fib6_protocol  = RTPROT_KERNEL,
	.fib6_metric	= ~(u32)0,
	.fib6_ref	= ATOMIC_INIT(1),
	.fib6_type	= RTN_UNREACHABLE,
	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
};
305
/* Template for the per-netns ip6_null_entry dst: discards packets with
 * -ENETUNREACH (no route to host).
 */
static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};
317
318 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
319
/* Template for the prohibit dst (policy routing): rejects packets with
 * -EACCES and sends ICMPv6 errors via the prohibit handlers.
 */
static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};
331
/* Template for the blackhole dst (policy routing): silently discards
 * packets (-EINVAL, dst_discard) with no ICMPv6 error.
 */
static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};
343
344 #endif
345
/* Zero the rt6_info-specific part of a freshly allocated entry: everything
 * after the embedded dst_entry (which dst_alloc() already initialized),
 * then make the uncached list node self-linked so list_empty() works.
 */
static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}
353
/* Allocate an rt6_info dst with the per-netns ip6_dst_ops and account it
 * in the fib_rt_alloc statistic.  Returns NULL on allocation failure.
 */
struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
			       int flags)
{
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					1, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
	}

	return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);
369
/* dst_ops->destroy: release everything an rt6_info holds — the (possibly
 * shared) metrics block, its slot on the uncached list, the inet6_dev
 * reference, and the fib6_info it was cloned from.
 */
static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct fib6_info *from;
	struct inet6_dev *idev;

	/* metrics are refcounted unless they are the shared default block */
	if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt))
		kfree(p);

	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	/* atomically detach from the parent fib6_info before releasing it */
	from = xchg((__force struct fib6_info **)&rt->from, NULL);
	fib6_info_release(from);
}
391
/* dst_ops->ifdown: when @dev goes down, repoint the route's inet6_dev
 * reference at the loopback device so the original device can be released.
 */
static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}
408
409 static bool __rt6_check_expired(const struct rt6_info *rt)
410 {
411         if (rt->rt6i_flags & RTF_EXPIRES)
412                 return time_after(jiffies, rt->dst.expires);
413         else
414                 return false;
415 }
416
/* Full expiry check for a cached route: its own RTF_EXPIRES timestamp,
 * or — when it is a clone of a fib6_info — staleness of the dst or expiry
 * of the parent entry.  rcu_dereference() here relies on the caller
 * holding the RCU read lock.
 */
static bool rt6_check_expired(const struct rt6_info *rt)
{
	struct fib6_info *from;

	from = rcu_dereference(rt->from);

	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
			fib6_check_expired(from);
	}
	return false;
}
432
/* Pick among ECMP siblings of @match using the flow hash: each nexthop
 * owns a hash range bounded by nh_upper_bound, so the first sibling whose
 * bound covers fl6->mp_hash (and that scores as usable) is selected.
 * Falls back to @match if no sibling qualifies.
 */
struct fib6_info *fib6_multipath_select(const struct net *net,
					struct fib6_info *match,
					struct flowi6 *fl6, int oif,
					const struct sk_buff *skb,
					int strict)
{
	struct fib6_info *sibling, *next_sibling;

	/* We might have already computed the hash for ICMPv6 errors. In such
	 * case it will always be non-zero. Otherwise now is the time to do it.
	 */
	if (!fl6->mp_hash)
		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))
		return match;

	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
				 fib6_siblings) {
		int nh_upper_bound;

		nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
		if (fl6->mp_hash > nh_upper_bound)
			continue;
		/* hash range matches; reject unusable nexthops */
		if (rt6_score_route(sibling, oif, strict) < 0)
			break;
		match = sibling;
		break;
	}

	return match;
}
465
466 /*
467  *      Route lookup. rcu_read_lock() should be held.
468  */
469
/* Walk the fib6 node's route list starting at @rt and return the first
 * live entry matching the requested output interface (@oif) or source
 * address.  With RT6_LOOKUP_F_IFACE set, a failed interface match is a
 * hard failure (fib6_null_entry); otherwise @rt itself is the fallback
 * unless its nexthop is dead.
 */
static inline struct fib6_info *rt6_device_match(struct net *net,
						 struct fib6_info *rt,
						    const struct in6_addr *saddr,
						    int oif,
						    int flags)
{
	struct fib6_info *sprt;

	/* nothing to match on: accept @rt as long as its nexthop is alive */
	if (!oif && ipv6_addr_any(saddr) &&
	    !(rt->fib6_nh.nh_flags & RTNH_F_DEAD))
		return rt;

	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
		const struct net_device *dev = sprt->fib6_nh.nh_dev;

		if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD)
			continue;

		if (oif) {
			if (dev->ifindex == oif)
				return sprt;
		} else {
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	if (oif && flags & RT6_LOOKUP_F_IFACE)
		return net->ipv6.fib6_null_entry;

	return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
}
503
504 #ifdef CONFIG_IPV6_ROUTER_PREF
/* Deferred router-probe request: carries the target address and a held
 * reference on the outgoing device to rt6_probe_deferred().
 */
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};
510
/* Workqueue handler for rt6_probe(): send a Neighbour Solicitation to the
 * target's solicited-node multicast address, then drop the device
 * reference taken when the work was queued and free the request.
 */
static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}
522
/* Schedule a reachability probe (Neighbour Solicitation) for a gateway
 * route whose neighbour state is unknown or stale.  Rate limiting is done
 * per route via cmpxchg() on rt->last_probe, so concurrent callers queue
 * at most one probe per interval.  The actual send is deferred to a
 * workqueue (rt6_probe_deferred) because we are in a bh/RCU section here.
 */
static void rt6_probe(struct fib6_info *rt)
{
	struct __rt6_probe_work *work = NULL;
	const struct in6_addr *nh_gw;
	unsigned long last_probe;
	struct neighbour *neigh;
	struct net_device *dev;
	struct inet6_dev *idev;

	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->fib6_flags & RTF_GATEWAY))
		return;

	nh_gw = &rt->fib6_nh.nh_gw;
	dev = rt->fib6_nh.nh_dev;
	rcu_read_lock_bh();
	last_probe = READ_ONCE(rt->last_probe);
	/* NOTE(review): idev is dereferenced below without a NULL check —
	 * presumably the nexthop device still has its inet6_dev while the
	 * route is reachable; confirm against device-teardown ordering.
	 */
	idev = __in6_dev_get(dev);
	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
	if (neigh) {
		if (neigh->nud_state & NUD_VALID)
			goto out;

		write_lock(&neigh->lock);
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated + idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else if (time_after(jiffies, last_probe +
				       idev->cnf.rtr_probe_interval)) {
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	/* only the caller that wins the cmpxchg race queues the probe */
	if (!work || cmpxchg(&rt->last_probe,
			     last_probe, jiffies) != last_probe) {
		kfree(work);
	} else {
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = *nh_gw;
		dev_hold(dev);
		work->dev = dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
581 #else
/* Router Reachability Probing requires CONFIG_IPV6_ROUTER_PREF; no-op
 * otherwise.
 */
static inline void rt6_probe(struct fib6_info *rt)
{
}
585 #endif
586
587 /*
588  * Default Router Selection (RFC 2461 6.3.6)
589  */
590 static inline int rt6_check_dev(struct fib6_info *rt, int oif)
591 {
592         const struct net_device *dev = rt->fib6_nh.nh_dev;
593
594         if (!oif || dev->ifindex == oif)
595                 return 2;
596         return 0;
597 }
598
/* Neighbour component of the route score.  Routes without a gateway (or
 * with RTF_NONEXTHOP) trivially succeed.  Otherwise classify by neighbour
 * state; with router preferences enabled, an unresolved but not-FAILED
 * neighbour still succeeds (it will be probed), while a missing neighbour
 * entry triggers round-robin only when preferences are disabled.
 */
static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
{
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
	struct neighbour *neigh;

	if (rt->fib6_flags & RTF_NONEXTHOP ||
	    !(rt->fib6_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev,
					  &rt->fib6_nh.nh_gw);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}
630
/* Combined route score: interface match (rt6_check_dev), router preference
 * bits, and — when RT6_LOOKUP_F_REACHABLE is requested — the neighbour
 * state.  Returns a negative rt6_nud_state on failure, otherwise a
 * non-negative score (higher is better).
 */
static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
{
	int m;

	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	/* fold RFC 4191 preference into bits above the device score */
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
#endif
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);
		if (n < 0)
			return n;
	}
	return m;
}
648
649 /* called with rc_read_lock held */
650 static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i)
651 {
652         const struct net_device *dev = fib6_info_nh_dev(f6i);
653         bool rc = false;
654
655         if (dev) {
656                 const struct inet6_dev *idev = __in6_dev_get(dev);
657
658                 rc = !!idev->cnf.ignore_routes_with_linkdown;
659         }
660
661         return rc;
662 }
663
/* Score candidate @rt and return whichever of @rt/@match scores higher,
 * updating *mpri accordingly.  Dead, link-down (unless ignored) and
 * expired routes are skipped.  *do_rr is set when the best route so far
 * asked for round-robin (RT6_NUD_FAIL_DO_RR).  Probes are kicked off for
 * reachability-strict lookups.
 */
static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
				   int *mpri, struct fib6_info *match,
				   bool *do_rr)
{
	int m;
	bool match_do_rr = false;

	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
		goto out;

	if (fib6_ignore_linkdown(rt) &&
	    rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	if (fib6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}
702
/* Find the best route among entries with the given @metric, scanning in
 * round-robin order: from @rr_head to the end of the equal-metric run,
 * then from @leaf back up to @rr_head.  If nothing matched and routes
 * with a different metric exist (@cont), scan those as a last resort.
 */
static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
				     struct fib6_info *leaf,
				     struct fib6_info *rr_head,
				     u32 metric, int oif, int strict,
				     bool *do_rr)
{
	struct fib6_info *rt, *match, *cont;
	int mpri = -1;

	match = NULL;
	cont = NULL;
	for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	for (rt = leaf; rt && rt != rr_head;
	     rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

	/* fall back to the remaining (different-metric) routes */
	for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next))
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	return match;
}
741
/* Select the best route from fib6 node @fn, honoring the round-robin
 * pointer fn->rr_ptr and advancing it (under tb6_lock) when the chosen
 * route asked for round-robin.  Returns fib6_null_entry when the node
 * has no usable leaf.  Called under rcu_read_lock.
 */
static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
				   int oif, int strict)
{
	struct fib6_info *leaf = rcu_dereference(fn->leaf);
	struct fib6_info *match, *rt0;
	bool do_rr = false;
	int key_plen;

	if (!leaf || leaf == net->ipv6.fib6_null_entry)
		return net->ipv6.fib6_null_entry;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not points to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->fib6_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->fib6_src.plen)
		key_plen = rt0->fib6_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		return net->ipv6.fib6_null_entry;

	match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct fib6_info *next = rcu_dereference(rt0->fib6_next);

		/* no entries matched; do round-robin */
		if (!next || next->fib6_metric != rt0->fib6_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->fib6_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->fib6_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
		}
	}

	return match ? match : net->ipv6.fib6_null_entry;
}
791
792 static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
793 {
794         return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
795 }
796
797 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Process an RFC 4191 Route Information option received in a Router
 * Advertisement from @gwaddr on @dev: validate the option, then add,
 * refresh, or (lifetime == 0) delete the corresponding RTF_ROUTEINFO
 * route.  rinfo->length is in units of 8 octets; 3 means a full 16-byte
 * prefix is present.  Returns 0 on success or -EINVAL on a bad option.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct fib6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		/* prefixes longer than 64 bits need at least 16 option bytes */
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	/* a zero-length prefix refers to the default route via this router */
	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(net, gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	if (rt && !lifetime) {
		ip6_del_rt(net, rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->fib6_flags = RTF_ROUTEINFO |
				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			fib6_clean_expires(rt);
		else
			fib6_set_expires(rt, jiffies + HZ * lifetime);

		fib6_info_release(rt);
	}
	return 0;
}
871 #endif
872
873 /*
874  *      Misc support functions
875  */
876
877 /* called with rcu_lock held */
878 static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
879 {
880         struct net_device *dev = rt->fib6_nh.nh_dev;
881
882         if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
883                 /* for copies of local routes, dst->dev needs to be the
884                  * device if it is a master device, the master device if
885                  * device is enslaved, and the loopback as the default
886                  */
887                 if (netif_is_l3_slave(dev) &&
888                     !rt6_need_strict(&rt->fib6_dst.addr))
889                         dev = l3mdev_master_dev_rcu(dev);
890                 else if (!netif_is_l3_master(dev))
891                         dev = dev_net(dev)->loopback_dev;
892                 /* last case is netif_is_l3_master(dev) is true in which
893                  * case we want dev returned to be dev
894                  */
895         }
896
897         return dev;
898 }
899
/* Map a fib6 route type (RTN_*) to the dst.error value used for reject
 * routes; forwarding/local types map to 0 (no error).
 */
static const int fib6_prop[RTN_MAX + 1] = {
	[RTN_UNSPEC]	= 0,
	[RTN_UNICAST]	= 0,
	[RTN_LOCAL]	= 0,
	[RTN_BROADCAST]	= 0,
	[RTN_ANYCAST]	= 0,
	[RTN_MULTICAST]	= 0,
	[RTN_BLACKHOLE]	= -EINVAL,
	[RTN_UNREACHABLE] = -EHOSTUNREACH,
	[RTN_PROHIBIT]	= -EACCES,
	[RTN_THROW]	= -EAGAIN,
	[RTN_NAT]	= -EINVAL,
	[RTN_XRESOLVE]	= -EINVAL,
};
914
/* Look up the dst error code for @fib6_type in fib6_prop[].
 * NOTE(review): assumes fib6_type <= RTN_MAX — callers appear to pass
 * validated route types; confirm at call sites.
 */
static int ip6_rt_type_to_error(u8 fib6_type)
{
	return fib6_prop[fib6_type];
}
919
920 static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
921 {
922         unsigned short flags = 0;
923
924         if (rt->dst_nocount)
925                 flags |= DST_NOCOUNT;
926         if (rt->dst_nopolicy)
927                 flags |= DST_NOPOLICY;
928         if (rt->dst_host)
929                 flags |= DST_HOST;
930
931         return flags;
932 }
933
934 static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
935 {
936         rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);
937
938         switch (ort->fib6_type) {
939         case RTN_BLACKHOLE:
940                 rt->dst.output = dst_discard_out;
941                 rt->dst.input = dst_discard;
942                 break;
943         case RTN_PROHIBIT:
944                 rt->dst.output = ip6_pkt_prohibit_out;
945                 rt->dst.input = ip6_pkt_prohibit;
946                 break;
947         case RTN_THROW:
948         case RTN_UNREACHABLE:
949         default:
950                 rt->dst.output = ip6_pkt_discard_out;
951                 rt->dst.input = ip6_pkt_discard;
952                 break;
953         }
954 }
955
956 static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
957 {
958         if (ort->fib6_flags & RTF_REJECT) {
959                 ip6_rt_init_dst_reject(rt, ort);
960                 return;
961         }
962
963         rt->dst.error = 0;
964         rt->dst.output = ip6_output;
965
966         if (ort->fib6_type == RTN_LOCAL || ort->fib6_type == RTN_ANYCAST) {
967                 rt->dst.input = ip6_input;
968         } else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
969                 rt->dst.input = ip6_mc_input;
970         } else {
971                 rt->dst.input = ip6_forward;
972         }
973
974         if (ort->fib6_nh.nh_lwtstate) {
975                 rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
976                 lwtunnel_set_redirect(&rt->dst);
977         }
978
979         rt->dst.lastuse = jiffies;
980 }
981
/* Caller must already hold reference to @from */
static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
{
	/* Tie cached dst @rt to its originating fib6 entry @from and
	 * share @from's metrics block.
	 */
	rt->rt6i_flags &= ~RTF_EXPIRES;
	rcu_assign_pointer(rt->from, from);
	dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true);
	if (from->fib6_metrics != &dst_default_metrics) {
		/* shared metrics are refcounted: take a reference and
		 * mark the dst so it is dropped on dst teardown
		 */
		rt->dst._metrics |= DST_METRICS_REFCOUNTED;
		refcount_inc(&from->fib6_metrics->refcnt);
	}
}
993
/* Caller must already hold reference to @ort */
static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
{
	struct net_device *dev = fib6_info_nh_dev(ort);

	/* Populate a freshly allocated rt6_info from fib6 entry @ort:
	 * dst handlers, prefixes, gateway, flags and the back-pointer
	 * to @ort (the reference is donated by the caller).
	 */
	ip6_rt_init_dst(rt, ort);

	rt->rt6i_dst = ort->fib6_dst;
	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
	rt->rt6i_gateway = ort->fib6_nh.nh_gw;
	rt->rt6i_flags = ort->fib6_flags;
	rt6_set_from(rt, ort);
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = ort->fib6_src;
#endif
	rt->rt6i_prefsrc = ort->fib6_prefsrc;
}
1011
/* Walk back up the fib6 tree from @fn to the next node carrying route
 * info (RTN_RTINFO); when the parent has a source-address subtree,
 * descend into it using @saddr first.  Returns NULL once the tree root
 * (RTN_TL_ROOT) is reached.  Runs under RCU (uses rcu_dereference()).
 */
static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
			fn = fib6_node_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}
1029
1030 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
1031                           bool null_fallback)
1032 {
1033         struct rt6_info *rt = *prt;
1034
1035         if (dst_hold_safe(&rt->dst))
1036                 return true;
1037         if (null_fallback) {
1038                 rt = net->ipv6.ip6_null_entry;
1039                 dst_hold(&rt->dst);
1040         } else {
1041                 rt = NULL;
1042         }
1043         *prt = rt;
1044         return false;
1045 }
1046
1047 /* called with rcu_lock held */
1048 static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
1049 {
1050         unsigned short flags = fib6_info_dst_flags(rt);
1051         struct net_device *dev = rt->fib6_nh.nh_dev;
1052         struct rt6_info *nrt;
1053
1054         if (!fib6_info_hold_safe(rt))
1055                 goto fallback;
1056
1057         nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
1058         if (!nrt) {
1059                 fib6_info_release(rt);
1060                 goto fallback;
1061         }
1062
1063         ip6_rt_copy_init(nrt, rt);
1064         return nrt;
1065
1066 fallback:
1067         nrt = dev_net(dev)->ipv6.ip6_null_entry;
1068         dst_hold(&nrt->dst);
1069         return nrt;
1070 }
1071
/* Policy-routing lookup callback: find the best fib6 entry in @table
 * for flow @fl6, preferring a matching cached exception route.  The
 * returned rt6_info is dst-held (the null entry on a total miss).
 */
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct fib6_info *f6i;
	struct fib6_node *fn;
	struct rt6_info *rt;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		flags &= ~RT6_LOOKUP_F_IFACE;

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	f6i = rcu_dereference(fn->leaf);
	if (!f6i) {
		f6i = net->ipv6.fib6_null_entry;
	} else {
		f6i = rt6_device_match(net, f6i, &fl6->saddr,
				      fl6->flowi6_oif, flags);
		if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
			f6i = fib6_multipath_select(net, f6i, fl6,
						    fl6->flowi6_oif, skb,
						    flags);
	}
	/* no usable entry at this node: back up the tree and retry */
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

	trace_fib6_table_lookup(net, f6i, table, fl6);

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);
	} else if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = ip6_create_rt_rcu(f6i);
	}

	rcu_read_unlock();

	return rt;
}
1123
/* Public lookup entry point: dispatch through the policy routing
 * rules (when enabled) down to ip6_pol_route_lookup().
 */
struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   const struct sk_buff *skb, int flags)
{
	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);
1130
1131 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
1132                             const struct in6_addr *saddr, int oif,
1133                             const struct sk_buff *skb, int strict)
1134 {
1135         struct flowi6 fl6 = {
1136                 .flowi6_oif = oif,
1137                 .daddr = *daddr,
1138         };
1139         struct dst_entry *dst;
1140         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
1141
1142         if (saddr) {
1143                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
1144                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1145         }
1146
1147         dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
1148         if (dst->error == 0)
1149                 return (struct rt6_info *) dst;
1150
1151         dst_release(dst);
1152
1153         return NULL;
1154 }
1155 EXPORT_SYMBOL(rt6_lookup);
1156
1157 /* ip6_ins_rt is called with FREE table->tb6_lock.
1158  * It takes new route entry, the addition fails by any reason the
1159  * route is released.
1160  * Caller must hold dst before calling it.
1161  */
1162
1163 static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
1164                         struct netlink_ext_ack *extack)
1165 {
1166         int err;
1167         struct fib6_table *table;
1168
1169         table = rt->fib6_table;
1170         spin_lock_bh(&table->tb6_lock);
1171         err = fib6_add(&table->tb6_root, rt, info, extack);
1172         spin_unlock_bh(&table->tb6_lock);
1173
1174         return err;
1175 }
1176
/* Insert fib6 entry @rt into its table with default netlink info
 * (only the owning namespace set).  See __ip6_ins_rt() for locking.
 */
int ip6_ins_rt(struct net *net, struct fib6_info *rt)
{
	struct nl_info info = {	.nl_net = net, };

	return __ip6_ins_rt(rt, &info, NULL);
}
1183
/* Allocate an RTF_CACHE clone of fib6 entry @ort for the /128
 * destination @daddr (and, with subtrees, source @saddr).  Returns
 * NULL if @ort is going away or the dst allocation fails.
 */
static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	if (!fib6_info_hold_safe(ort))
		return NULL;

	dev = ip6_rt_get_dev_rcu(ort);
	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
	if (!rt) {
		fib6_info_release(ort);
		return NULL;
	}

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->dst.flags |= DST_HOST;
	/* cache entries are host routes: narrow the prefix to /128 */
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		if (ort->fib6_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}
1225
/* Allocate an RTF_PCPU copy of fib6 entry @rt for the per-cpu cache.
 * Returns NULL if @rt is going away or the dst allocation fails.
 */
static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	if (!fib6_info_hold_safe(rt))
		return NULL;

	/* ip6_rt_get_dev_rcu() must run under RCU */
	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(rt);
	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
	rcu_read_unlock();
	if (!pcpu_rt) {
		fib6_info_release(rt);
		return NULL;
	}
	ip6_rt_copy_init(pcpu_rt, rt);
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}
1247
1248 /* It should be called with rcu_read_lock() acquired */
1249 static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
1250 {
1251         struct rt6_info *pcpu_rt, **p;
1252
1253         p = this_cpu_ptr(rt->rt6i_pcpu);
1254         pcpu_rt = *p;
1255
1256         if (pcpu_rt)
1257                 ip6_hold_safe(NULL, &pcpu_rt, false);
1258
1259         return pcpu_rt;
1260 }
1261
/* Allocate a per-cpu copy of @rt and install it in this cpu's slot.
 * On allocation failure the dst-held null entry is returned instead.
 */
static struct rt6_info *rt6_make_pcpu_route(struct net *net,
					    struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(rt->rt6i_pcpu);
	/* the slot must have been empty (BUG otherwise) */
	prev = cmpxchg(p, NULL, pcpu_rt);
	BUG_ON(prev);

	if (rt->fib6_destroying) {
		struct fib6_info *from;

		/* @rt is being destroyed: drop the back-pointer and its
		 * reference so the pcpu copy does not pin a dying entry
		 */
		from = xchg((__force struct fib6_info **)&pcpu_rt->from, NULL);
		fib6_info_release(from);
	}

	return pcpu_rt;
}
1287
/* exception hash table implementation
 *
 * rt6_exception_lock serializes all modifications of the per-route
 * exception buckets; lookups on the read side use RCU.
 */
static DEFINE_SPINLOCK(rt6_exception_lock);
1291
/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct fib6_info *from;
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	net->ipv6.rt6_stats->fib_rt_cache--;

	/* purge completely the exception to allow releasing the held resources:
	 * some [sk] cache may keep the dst around for unlimited time
	 */
	from = xchg((__force struct fib6_info **)&rt6_ex->rt6i->from, NULL);
	fib6_info_release(from);
	dst_dev_put(&rt6_ex->rt6i->dst);

	hlist_del_rcu(&rt6_ex->hlist);
	dst_release(&rt6_ex->rt6i->dst);
	/* RCU readers may still walk the chain: free after a grace period */
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
}
1320
1321 /* Remove oldest rt6_ex in bucket and free the memory
1322  * Caller must hold rt6_exception_lock
1323  */
1324 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1325 {
1326         struct rt6_exception *rt6_ex, *oldest = NULL;
1327
1328         if (!bucket)
1329                 return;
1330
1331         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1332                 if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1333                         oldest = rt6_ex;
1334         }
1335         rt6_remove_exception(bucket, oldest);
1336 }
1337
/* Hash (daddr, saddr) into an exception-bucket index.  saddr only
 * contributes with CONFIG_IPV6_SUBTREES; siphash with a once-only
 * random key makes the bucket choice unpredictable to remote hosts.
 */
static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static siphash_key_t rt6_exception_key __read_mostly;
	struct {
		struct in6_addr dst;
		struct in6_addr src;
	} __aligned(SIPHASH_ALIGNMENT) combined = {
		.dst = *dst,
	};
	u64 val;

	net_get_random_once(&rt6_exception_key, sizeof(rt6_exception_key));

#ifdef CONFIG_IPV6_SUBTREES
	if (src)
		combined.src = *src;
#endif
	val = siphash(&combined, sizeof(combined), &rt6_exception_key);

	return hash_64(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}
1360
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 *
 * On return *bucket points at the selected hash bucket; returns the
 * matching exception entry or NULL.
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
1393
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 *
 * RCU counterpart of __rt6_find_exception_spinlock(): same matching
 * logic, but walks the chain with hlist_for_each_entry_rcu().
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
1428
1429 static unsigned int fib6_mtu(const struct fib6_info *rt)
1430 {
1431         unsigned int mtu;
1432
1433         if (rt->fib6_pmtu) {
1434                 mtu = rt->fib6_pmtu;
1435         } else {
1436                 struct net_device *dev = fib6_info_nh_dev(rt);
1437                 struct inet6_dev *idev;
1438
1439                 rcu_read_lock();
1440                 idev = __in6_dev_get(dev);
1441                 mtu = idev->cnf.mtu6;
1442                 rcu_read_unlock();
1443         }
1444
1445         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
1446
1447         return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
1448 }
1449
/* Insert cached route @nrt into the exception table of fib6 entry
 * @ort.  Returns 0 on success; -EINVAL when the bucket list was
 * already flushed or @nrt's MTU is not below @ort's; -ENOMEM on
 * allocation failure.
 */
static int rt6_insert_exception(struct rt6_info *nrt,
				struct fib6_info *ort)
{
	struct net *net = dev_net(nrt->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int max_depth;
	int err = 0;

	spin_lock_bh(&rt6_exception_lock);

	if (ort->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		/* first exception on @ort: allocate the bucket array */
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates ort is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (ort->fib6_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif

	/* Update rt6i_prefsrc as it could be changed
	 * in rt6_remove_prefsrc()
	 */
	nrt->rt6i_prefsrc = ort->fib6_prefsrc;
	/* rt6_mtu_change() might lower mtu on ort.
	 * Only insert this exception route if its mtu
	 * is less than ort's mtu value.
	 */
	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
		err = -EINVAL;
		goto out;
	}

	/* replace any existing exception for the same (dst, src) */
	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	/* Randomize max depth to avoid some side channels attacks. */
	max_depth = FIB6_MAX_DEPTH + prandom_u32_max(FIB6_MAX_DEPTH);
	while (bucket->depth > max_depth)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		spin_lock_bh(&ort->fib6_table->tb6_lock);
		fib6_update_sernum(net, ort);
		spin_unlock_bh(&ort->fib6_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}
1537
/* Remove every cached exception route of fib6 entry @rt and mark the
 * entry so no new bucket list can be created afterwards (used when
 * @rt is being torn down).
 */
void rt6_flush_exceptions(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() to recreate the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}
1564
/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 *
 * Returns the matching, non-expired cached route or NULL.  No
 * reference is taken here; callers take one (see ip6_hold_safe()
 * at the call sites).
 */
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	const struct in6_addr *src_key = NULL;
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct rt6_info *res = NULL;

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates rt is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * However, the src addr used to create the hash
	 * might not be exactly the passed in saddr which
	 * is a /128 addr from the flow.
	 * So we need to use f6i->fib6_src to redo lookup
	 * if the passed in saddr does not find anything.
	 * (See the logic in ip6_rt_cache_alloc() on how
	 * rt->rt6i_src is updated.)
	 */
	if (rt->fib6_src.plen)
		src_key = saddr;
find_ex:
#endif
	bucket = rcu_dereference(rt->rt6i_exception_bucket);
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		res = rt6_ex->rt6i;

#ifdef CONFIG_IPV6_SUBTREES
	/* Use fib6_src as src_key and redo lookup */
	if (!res && src_key && src_key != &rt->fib6_src.addr) {
		src_key = &rt->fib6_src.addr;
		goto find_ex;
	}
#endif

	return res;
}
1609
/* Remove the passed in cached rt from the hash table that contains it
 *
 * Returns 0 on success, -EINVAL for a non-cache route or one with no
 * originating fib6 entry, -ENOENT when no matching exception exists.
 * NOTE(review): rt->from is read with rcu_dereference() — presumably
 * callers run under rcu_read_lock(); confirm at call sites.
 */
static int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;
	int err;

	from = rcu_dereference(rt->from);
	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	if (!rcu_access_pointer(from->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}
1653
/* Find rt6_ex which contains the passed in rt cache and
 * refresh its stamp
 *
 * Refreshing the stamp protects the entry from eviction by
 * rt6_exception_remove_oldest().  Best effort: silently does nothing
 * for non-cache routes or when no matching exception is found.
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	if (!from || !(rt->rt6i_flags & RTF_CACHE))
		goto unlock;

	bucket = rcu_dereference(from->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket,
					  &rt->rt6i_dst.addr,
					  src_key);
	if (rt6_ex)
		rt6_ex->stamp = jiffies;

unlock:
	rcu_read_unlock();
}
1690
1691 static void rt6_exceptions_remove_prefsrc(struct fib6_info *rt)
1692 {
1693         struct rt6_exception_bucket *bucket;
1694         struct rt6_exception *rt6_ex;
1695         int i;
1696
1697         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1698                                         lockdep_is_held(&rt6_exception_lock));
1699
1700         if (bucket) {
1701                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1702                         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1703                                 rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
1704                         }
1705                         bucket++;
1706                 }
1707         }
1708 }
1709
1710 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
1711                                          struct rt6_info *rt, int mtu)
1712 {
1713         /* If the new MTU is lower than the route PMTU, this new MTU will be the
1714          * lowest MTU in the path: always allow updating the route PMTU to
1715          * reflect PMTU decreases.
1716          *
1717          * If the new MTU is higher, and the route PMTU is equal to the local
1718          * MTU, this means the old MTU is the lowest in the path, so allow
1719          * updating it: if other nodes now have lower MTUs, PMTU discovery will
1720          * handle this.
1721          */
1722
1723         if (dst_mtu(&rt->dst) >= mtu)
1724                 return true;
1725
1726         if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
1727                 return true;
1728
1729         return false;
1730 }
1731
/* Propagate a device MTU change to the cached exception routes of
 * fib6 entry @rt, subject to rt6_mtu_change_route_allowed().
 * Caller must hold rt6_exception_lock (rcu_dereference_protected).
 */
static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
				       struct fib6_info *rt, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (!bucket)
		return;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
			 * route), the metrics of its rt->from have already
			 * been updated.
			 */
			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
			    rt6_mtu_change_route_allowed(idev, entry, mtu))
				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
		}
		bucket++;
	}
}
1760
1761 #define RTF_CACHE_GATEWAY       (RTF_GATEWAY | RTF_CACHE)
1762
/* Remove from @rt's exception table every entry that is both a cache
 * and a gateway route (RTF_CACHE | RTF_GATEWAY) whose gateway address
 * equals @gateway.  Takes rt6_exception_lock itself; the _safe iterator
 * is required because entries are unlinked during the walk.
 */
static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
					struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	/* Cheap lockless check first: nothing to do without a bucket array */
	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				     lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}
1797
/* Examine one cached exception route during GC and remove it when aged
 * out, expired, or pointing at a gateway neighbour that no longer has
 * NTF_ROUTER set; otherwise bump gc_args->more.  Called from
 * rt6_age_exceptions() under rt6_exception_lock with rcu_read_lock_bh
 * held (required for the _noref neighbour lookup below).
 */
static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* we are pruning and obsoleting aged-out and non gateway exceptions
	 * even if others have still references to them, so that on next
	 * dst_check() such references can be dropped.
	 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
	 * expired, independently from their aging, as per RFC 8201 section 4
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (time_after(jiffies, rt->dst.expires)) {
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
		if (neigh)
			neigh_flags = neigh->flags;

		/* A missing neighbour entry counts as "not a router" too */
		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}

	gc_args->more++;
}
1841
/* Garbage-collect the exception cache of @rt: walk every bucket and let
 * rt6_age_examine_exception() prune stale entries.  Serializes against
 * writers with rt6_exception_lock; rcu_read_lock_bh additionally covers
 * the lockless neighbour lookup done while examining gateway entries.
 */
void rt6_age_exceptions(struct fib6_info *rt,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	/* Cheap lockless check first: nothing to do without a bucket array */
	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	rcu_read_lock_bh();
	spin_lock(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock(&rt6_exception_lock);
	rcu_read_unlock_bh();
}
1872
/* Look up the best fib6_info for @fl6 in @table.  Backtracks towards
 * less specific prefixes when the current node yields no usable route,
 * and, if RT6_LOOKUP_F_REACHABLE was set, retries the whole lookup once
 * without it before giving up.  May return net->ipv6.fib6_null_entry.
 * must be called with rcu lock held
 */
struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
				    int oif, struct flowi6 *fl6, int strict)
{
	struct fib6_node *fn, *saved_fn;
	struct fib6_info *f6i;

	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;	/* remembered for the relaxed retry below */

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	f6i = rt6_select(net, fn, oif, strict);
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	trace_fib6_table_lookup(net, f6i, table, fl6);

	return f6i;
}
1904
/* Policy routing lookup shared by input and output paths.  Resolves
 * @fl6 in @table and returns a rt6_info with a reference held.  The
 * result is, in order of preference:
 *  - a cached exception route (PMTU / redirect clone),
 *  - a freshly allocated uncached RTF_CACHE clone when the caller set
 *    FLOWI_FLAG_KNOWN_NH and the matched route has no gateway,
 *  - a per-cpu copy of the matched fib6_info,
 * or ip6_null_entry when nothing matches.
 */
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6,
			       const struct sk_buff *skb, int flags)
{
	struct fib6_info *f6i;
	struct rt6_info *rt;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	/* hosts (forwarding disabled) prefer (probably) reachable routers */
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	rcu_read_lock();

	f6i = fib6_table_lookup(net, table, oif, fl6, strict);
	if (f6i->fib6_nsiblings)
		f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);

	if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		rcu_read_unlock();
		dst_hold(&rt->dst);
		return rt;
	}

	/*Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);

		rcu_read_unlock();
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !(f6i->fib6_flags & RTF_GATEWAY))) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */
		struct rt6_info *uncached_rt;

		uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);

		rcu_read_unlock();

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
		} else {
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

		return uncached_rt;
	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		local_bh_disable();
		pcpu_rt = rt6_get_pcpu_route(f6i);

		if (!pcpu_rt)
			pcpu_rt = rt6_make_pcpu_route(net, f6i);

		local_bh_enable();
		rcu_read_unlock();

		return pcpu_rt;
	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);
1982
/* fib6 rule lookup callback for the input path: route using the
 * incoming interface (flowi6_iif) as the oif argument.
 */
static struct rt6_info *ip6_pol_route_input(struct net *net,
					    struct fib6_table *table,
					    struct flowi6 *fl6,
					    const struct sk_buff *skb,
					    int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
}
1991
/* Route lookup for received packets.  Forces strict interface matching
 * (RT6_LOOKUP_F_IFACE) for destinations that need it (rt6_need_strict),
 * except on PIM register pseudo-devices, then resolves via the fib
 * rules with the input-path callback.
 */
struct dst_entry *ip6_route_input_lookup(struct net *net,
					 struct net_device *dev,
					 struct flowi6 *fl6,
					 const struct sk_buff *skb,
					 int flags)
{
	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
		flags |= RT6_LOOKUP_F_IFACE;

	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
}
EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
2004
/* Fill @keys with the L3 flow keys used for multipath hashing of @skb.
 * For ICMPv6 error messages the keys are taken from the embedded
 * (inner) offending IPv6 header, so errors hash onto the same path as
 * the flow that triggered them; in that case any pre-dissected @flkeys
 * are ignored (they describe the outer headers).
 */
static void ip6_multipath_l3_keys(const struct sk_buff *skb,
				  struct flow_keys *keys,
				  struct flow_keys *flkeys)
{
	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
	const struct ipv6hdr *key_iph = outer_iph;
	struct flow_keys *_flkeys = flkeys;
	const struct ipv6hdr *inner_iph;
	const struct icmp6hdr *icmph;
	struct ipv6hdr _inner_iph;
	struct icmp6hdr _icmph;

	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
		goto out;

	icmph = skb_header_pointer(skb, skb_transport_offset(skb),
				   sizeof(_icmph), &_icmph);
	if (!icmph)
		goto out;

	/* only ICMPv6 error types carry an embedded offending packet */
	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
	    icmph->icmp6_type != ICMPV6_PARAMPROB)
		goto out;

	inner_iph = skb_header_pointer(skb,
				       skb_transport_offset(skb) + sizeof(*icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	key_iph = inner_iph;
	_flkeys = NULL;
out:
	if (_flkeys) {
		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
		keys->tags.flow_label = _flkeys->tags.flow_label;
		keys->basic.ip_proto = _flkeys->basic.ip_proto;
	} else {
		keys->addrs.v6addrs.src = key_iph->saddr;
		keys->addrs.v6addrs.dst = key_iph->daddr;
		keys->tags.flow_label = ip6_flowlabel(key_iph);
		keys->basic.ip_proto = key_iph->nexthdr;
	}
}
2052
/* Compute the multipath hash for a flow.  if skb is set it will be used
 * and fl6 can be NULL.  Policy 0 (default) hashes on L3 fields only;
 * policy 1 hashes on L3 plus L4 ports.  The result is shifted right by
 * one bit (as is the short-circuit skb hash below) — presumably so the
 * top bit stays clear for callers; confirm against mp_hash users.
 */
u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
		       const struct sk_buff *skb, struct flow_keys *flkeys)
{
	struct flow_keys hash_keys;
	u32 mhash;

	switch (ip6_multipath_hash_policy(net)) {
	case 0:
		/* L3 only: addresses, flow label, next header */
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
		if (skb) {
			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
		} else {
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	case 1:
		/* L3 + L4: addresses, ports, next header */
		if (skb) {
			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
			struct flow_keys keys;

			/* short-circuit if we already have L4 hash present */
			if (skb->l4_hash)
				return skb_get_hash_raw(skb) >> 1;

			memset(&hash_keys, 0, sizeof(hash_keys));

			if (!flkeys) {
				skb_flow_dissect_flow_keys(skb, &keys, flag);
				flkeys = &keys;
			}
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
			hash_keys.ports.src = flkeys->ports.src;
			hash_keys.ports.dst = flkeys->ports.dst;
			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
		} else {
			memset(&hash_keys, 0, sizeof(hash_keys));
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.ports.src = fl6->fl6_sport;
			hash_keys.ports.dst = fl6->fl6_dport;
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	}
	mhash = flow_hash_from_keys(&hash_keys);

	return mhash >> 1;
}
2109
/* Input-path routing entry point.  Builds a flowi6 from the packet
 * headers (including the tunnel collect-metadata tun_id when present),
 * computes a multipath hash for ICMPv6 so errors follow the original
 * flow, and attaches the looked-up dst to the skb.
 */
void ip6_route_input(struct sk_buff *skb)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
	};
	struct flow_keys *flkeys = NULL, _flkeys;

	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;

	/* reuse flow keys dissected earlier by the fib rules, if any */
	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
		flkeys = &_flkeys;

	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
	skb_dst_drop(skb);
	skb_dst_set(skb,
		    ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
}
2139
/* fib6 rule lookup callback for the output path: route using the
 * outgoing interface (flowi6_oif).
 */
static struct rt6_info *ip6_pol_route_output(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
}
2148
/* Output route lookup.  Destinations that need strict scope handling
 * are first tried through the l3mdev link-scope lookup; otherwise
 * lookup flags are derived from the socket and flow (strict oif
 * matching, source-address presence, source address preferences) and
 * the route is resolved via the fib rules.
 */
struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
					 struct flowi6 *fl6, int flags)
{
	bool any_src;

	if (rt6_need_strict(&fl6->daddr)) {
		struct dst_entry *dst;

		dst = l3mdev_link_scope_lookup(net, fl6);
		if (dst)
			return dst;
	}

	/* locally generated traffic: iif is loopback */
	fl6->flowi6_iif = LOOPBACK_IFINDEX;

	any_src = ipv6_addr_any(&fl6->saddr);
	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
	    (fl6->flowi6_oif && any_src))
		flags |= RT6_LOOKUP_F_IFACE;

	if (!any_src)
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	else if (sk)
		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);

	return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
}
EXPORT_SYMBOL_GPL(ip6_route_output_flags);
2177
/* Turn @dst_orig into a standalone blackhole route: allocate a new
 * rt6_info on the loopback device that copies the original's metrics,
 * gateway, flags and keys, but discards every packet sent through it.
 * Consumes the reference on @dst_orig; returns the new dst or
 * ERR_PTR(-ENOMEM).
 */
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
	struct net_device *loopback_dev = net->loopback_dev;
	struct dst_entry *new = NULL;

	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
		       DST_OBSOLETE_DEAD, 0);
	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);

		new = &rt->dst;
		new->__use = 1;
		/* both directions drop traffic */
		new->input = dst_discard;
		new->output = dst_discard_out;

		dst_copy_metrics(new, &ort->dst);

		rt->rt6i_idev = in6_dev_get(loopback_dev);
		rt->rt6i_gateway = ort->rt6i_gateway;
		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif
	}

	dst_release(dst_orig);
	return new ? new : ERR_PTR(-ENOMEM);
}
2210
2211 /*
2212  *      Destination cache support functions
2213  */
2214
2215 static bool fib6_check(struct fib6_info *f6i, u32 cookie)
2216 {
2217         u32 rt_cookie = 0;
2218
2219         if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
2220                 return false;
2221
2222         if (fib6_check_expired(f6i))
2223                 return false;
2224
2225         return true;
2226 }
2227
2228 static struct dst_entry *rt6_check(struct rt6_info *rt,
2229                                    struct fib6_info *from,
2230                                    u32 cookie)
2231 {
2232         u32 rt_cookie = 0;
2233
2234         if (!from || !fib6_get_cookie_safe(from, &rt_cookie) ||
2235             rt_cookie != cookie)
2236                 return NULL;
2237
2238         if (rt6_check_expired(rt))
2239                 return NULL;
2240
2241         return &rt->dst;
2242 }
2243
2244 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
2245                                             struct fib6_info *from,
2246                                             u32 cookie)
2247 {
2248         if (!__rt6_check_expired(rt) &&
2249             rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2250             fib6_check(from, cookie))
2251                 return &rt->dst;
2252         else
2253                 return NULL;
2254 }
2255
/* dst_ops->check: decide whether a cached dst is still usable.  Returns
 * the dst if valid, NULL to force the caller to perform a new lookup.
 * Per-cpu copies and uncached clones are validated against their
 * fib6_info origin via rt6_dst_from_check(); everything else via
 * rt6_check().
 */
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct dst_entry *dst_ret;
	struct fib6_info *from;
	struct rt6_info *rt;

	rt = container_of(dst, struct rt6_info, dst);

	rcu_read_lock();

	/* All IPV6 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 */

	from = rcu_dereference(rt->from);

	if (from && (rt->rt6i_flags & RTF_PCPU ||
	    unlikely(!list_empty(&rt->rt6i_uncached))))
		dst_ret = rt6_dst_from_check(rt, from, cookie);
	else
		dst_ret = rt6_check(rt, from, cookie);

	rcu_read_unlock();

	return dst_ret;
}
2283
/* dst_ops->negative_advice: a socket reports its cached dst misbehaves.
 * Expired cache exceptions are unlinked from the exception table and
 * dropped; non-cache dsts are released outright so the next send does a
 * fresh lookup.  An unexpired cache entry is kept (returned unchanged).
 */
static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *) dst;

	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			rcu_read_lock();
			if (rt6_check_expired(rt)) {
				rt6_remove_exception_rt(rt);
				dst = NULL;
			}
			rcu_read_unlock();
		} else {
			dst_release(dst);
			dst = NULL;
		}
	}
	return dst;
}
2303
/* dst_ops->link_failure: delivery through this route failed.  Report
 * address-unreachable to the sender via ICMPv6, then either drop the
 * cache exception the skb was using, or — for default routes — poison
 * the fib6 node's serial number so cached cookies are invalidated and
 * the route is re-evaluated on the next lookup.
 */
static void ip6_link_failure(struct sk_buff *skb)
{
	struct rt6_info *rt;

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);

	rt = (struct rt6_info *) skb_dst(skb);
	if (rt) {
		rcu_read_lock();
		if (rt->rt6i_flags & RTF_CACHE) {
			rt6_remove_exception_rt(rt);
		} else {
			struct fib6_info *from;
			struct fib6_node *fn;

			from = rcu_dereference(rt->from);
			if (from) {
				fn = rcu_dereference(from->fib6_node);
				if (fn && (rt->rt6i_flags & RTF_DEFAULT))
					WRITE_ONCE(fn->fn_sernum, -1);
			}
		}
		rcu_read_unlock();
	}
}
2329
/* Arm (or re-arm) expiry on @rt0: if the route was not already marked
 * RTF_EXPIRES, first seed dst.expires from its fib6_info origin, then
 * apply @timeout via dst_set_expires() and set RTF_EXPIRES.
 */
static void rt6_update_expires(struct rt6_info *rt0, int timeout)
{
	if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
		struct fib6_info *from;

		rcu_read_lock();
		from = rcu_dereference(rt0->from);
		if (from)
			rt0->dst.expires = from->expires;
		rcu_read_unlock();
	}

	dst_set_expires(&rt0->dst, timeout);
	rt0->rt6i_flags |= RTF_EXPIRES;
}
2345
/* Record a new path MTU @mtu on @rt, mark the route as modified, and
 * schedule it to expire after the per-netns ip6_rt_mtu_expires interval
 * so the learned PMTU is eventually aged out.
 */
static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
{
	struct net *net = dev_net(rt->dst.dev);

	dst_metric_set(&rt->dst, RTAX_MTU, mtu);
	rt->rt6i_flags |= RTF_MODIFIED;
	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
}
2354
2355 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2356 {
2357         bool from_set;
2358
2359         rcu_read_lock();
2360         from_set = !!rcu_dereference(rt->from);
2361         rcu_read_unlock();
2362
2363         return !(rt->rt6i_flags & RTF_CACHE) &&
2364                 (rt->rt6i_flags & RTF_PCPU || from_set);
2365 }
2366
/* Core PMTU update.  Destination/source addresses are taken from @iph
 * if given, else from @sk, else left unknown.  @mtu is clamped to
 * IPV6_MIN_MTU and only applied when it shrinks the current path MTU.
 * Routes that may hold a private PMTU are updated in place; otherwise a
 * RTF_CACHE exception clone is created to carry the new value.
 */
static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
				 const struct ipv6hdr *iph, u32 mtu,
				 bool confirm_neigh)
{
	const struct in6_addr *daddr, *saddr;
	struct rt6_info *rt6 = (struct rt6_info *)dst;

	/* Note: do *NOT* check dst_metric_locked(dst, RTAX_MTU)
	 * IPv6 pmtu discovery isn't optional, so 'mtu lock' cannot disable it.
	 * [see also comment in rt6_mtu_change_route()]
	 */

	if (iph) {
		daddr = &iph->daddr;
		saddr = &iph->saddr;
	} else if (sk) {
		daddr = &sk->sk_v6_daddr;
		saddr = &inet6_sk(sk)->saddr;
	} else {
		daddr = NULL;
		saddr = NULL;
	}

	if (confirm_neigh)
		dst_confirm_neigh(dst, daddr);

	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
	if (mtu >= dst_mtu(dst))
		return;

	if (!rt6_cache_allowed_for_pmtu(rt6)) {
		rt6_do_update_pmtu(rt6, mtu);
		/* update rt6_ex->stamp for cache */
		if (rt6->rt6i_flags & RTF_CACHE)
			rt6_update_exception_stamp_rt(rt6);
	} else if (daddr) {
		struct fib6_info *from;
		struct rt6_info *nrt6;

		rcu_read_lock();
		from = rcu_dereference(rt6->from);
		if (!from) {
			rcu_read_unlock();
			return;
		}
		nrt6 = ip6_rt_cache_alloc(from, daddr, saddr);
		if (nrt6) {
			rt6_do_update_pmtu(nrt6, mtu);
			/* drop the clone if it could not be inserted */
			if (rt6_insert_exception(nrt6, from))
				dst_release_immediate(&nrt6->dst);
		}
		rcu_read_unlock();
	}
}
2421
/* dst_ops->update_pmtu: extract the IPv6 header from @skb (if any) and
 * delegate to __ip6_rt_update_pmtu().
 */
static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			       struct sk_buff *skb, u32 mtu,
			       bool confirm_neigh)
{
	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu,
			     confirm_neigh);
}
2429
/* Update the path MTU towards the destination of the IPv6 header at
 * skb->data (NOTE(review): callers appear to pass the offending inner
 * header of an ICMPv6 packet-too-big — confirm at call sites).  Looks
 * up the route for that flow and applies the network-order @mtu.
 */
void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
		     int oif, u32 mark, kuid_t uid)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6;

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_oif = oif;
	fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
	fl6.daddr = iph->daddr;
	fl6.saddr = iph->saddr;
	fl6.flowlabel = ip6_flowinfo(iph);
	fl6.flowi6_uid = uid;

	dst = ip6_route_output(net, NULL, &fl6);
	if (!dst->error)
		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu), true);
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2451
/* Socket-scoped PMTU update: like ip6_update_pmtu() but takes bound
 * device, fwmark and uid from @sk.  Afterwards, if the socket's cached
 * dst fails its validity check, refresh it for connected (non-v4mapped)
 * datagram sockets.
 */
void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
{
	int oif = sk->sk_bound_dev_if;
	struct dst_entry *dst;

	if (!oif && skb->dev)
		oif = l3mdev_master_ifindex(skb->dev);

	ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid);

	dst = __sk_dst_get(sk);
	if (!dst || !dst->obsolete ||
	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
		return;

	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
		ip6_datagram_dst_update(sk, false);
	bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2473
/* Cache @dst on @sk via ip6_dst_store().  The destination (and, with
 * CONFIG_IPV6_SUBTREES, the source) address is only recorded when the
 * flow's address matches the socket's, so the cached dst is keyed only
 * on addresses it actually represents.
 */
void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
			   const struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_SUBTREES
	struct ipv6_pinfo *np = inet6_sk(sk);
#endif

	ip6_dst_store(sk, dst,
		      ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
		      &sk->sk_v6_daddr : NULL,
#ifdef CONFIG_IPV6_SUBTREES
		      ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
		      &np->saddr :
#endif
		      NULL);
}
2490
/* Handle redirects: a flowi6 extended with the address of the router
 * that announced the redirect, for matching in __ip6_route_redirect().
 */
struct ip6rd_flowi {
	struct flowi6 fl6;
	struct in6_addr gateway;	/* announcing router's address */
};
2496
/* Lookup used while processing an ICMPv6 redirect: find the route whose
 * next hop matches the announcing router (rdfl->gateway), scanning the
 * exception cache too.  Returns a held rt6_info; ip6_null_entry when
 * the best match is a reject route.
 */
static struct rt6_info *__ip6_route_redirect(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
	struct rt6_info *ret = NULL, *rt_cache;
	struct fib6_info *rt;
	struct fib6_node *fn;

	/* l3mdev_update_flow overrides oif if the device is enslaved; in
	 * this case we must match on the real ingress device, so reset it
	 */
	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		fl6->flowi6_oif = skb->dev->ifindex;

	/* Get the "current" route for this destination and
	 * check if the redirect has come from appropriate router.
	 *
	 * RFC 4861 specifies that redirects should only be
	 * accepted if they come from the nexthop to the target.
	 * Due to the way the routes are chosen, this notion
	 * is a bit fuzzy and one might need to check all possible
	 * routes.
	 */

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	for_each_fib6_node_rt_rcu(fn) {
		if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
			continue;
		if (fib6_check_expired(rt))
			continue;
		if (rt->fib6_flags & RTF_REJECT)
			break;
		if (!(rt->fib6_flags & RTF_GATEWAY))
			continue;
		/* redirect must arrive on the interface of the route */
		if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex)
			continue;
		/* rt_cache's gateway might be different from its 'parent'
		 * in the case of an ip redirect.
		 * So we keep searching in the exception table if the gateway
		 * is different.
		 */
		if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) {
			rt_cache = rt6_find_cached_rt(rt,
						      &fl6->daddr,
						      &fl6->saddr);
			if (rt_cache &&
			    ipv6_addr_equal(&rdfl->gateway,
					    &rt_cache->rt6i_gateway)) {
				ret = rt_cache;
				break;
			}
			continue;
		}
		break;
	}

	if (!rt)
		rt = net->ipv6.fib6_null_entry;
	else if (rt->fib6_flags & RTF_REJECT) {
		ret = net->ipv6.ip6_null_entry;
		goto out;
	}

	/* nothing in this node: back off to a less specific prefix */
	if (rt == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

out:
	if (ret)
		ip6_hold_safe(net, &ret, true);
	else
		ret = ip6_create_rt_rcu(rt);

	rcu_read_unlock();

	trace_fib6_table_lookup(net, rt, table, fl6);
	return ret;
};
2582
2583 static struct dst_entry *ip6_route_redirect(struct net *net,
2584                                             const struct flowi6 *fl6,
2585                                             const struct sk_buff *skb,
2586                                             const struct in6_addr *gateway)
2587 {
2588         int flags = RT6_LOOKUP_F_HAS_SADDR;
2589         struct ip6rd_flowi rdfl;
2590
2591         rdfl.fl6 = *fl6;
2592         rdfl.gateway = *gateway;
2593
2594         return fib6_rule_lookup(net, &rdfl.fl6, skb,
2595                                 flags, __ip6_route_redirect);
2596 }
2597
2598 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2599                   kuid_t uid)
2600 {
2601         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2602         struct dst_entry *dst;
2603         struct flowi6 fl6;
2604
2605         memset(&fl6, 0, sizeof(fl6));
2606         fl6.flowi6_iif = LOOPBACK_IFINDEX;
2607         fl6.flowi6_oif = oif;
2608         fl6.flowi6_mark = mark;
2609         fl6.daddr = iph->daddr;
2610         fl6.saddr = iph->saddr;
2611         fl6.flowlabel = ip6_flowinfo(iph);
2612         fl6.flowi6_uid = uid;
2613
2614         dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
2615         rt6_do_redirect(dst, NULL, skb);
2616         dst_release(dst);
2617 }
2618 EXPORT_SYMBOL_GPL(ip6_redirect);
2619
2620 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
2621                             u32 mark)
2622 {
2623         const struct ipv6hdr *iph = ipv6_hdr(skb);
2624         const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2625         struct dst_entry *dst;
2626         struct flowi6 fl6;
2627
2628         memset(&fl6, 0, sizeof(fl6));
2629         fl6.flowi6_iif = LOOPBACK_IFINDEX;
2630         fl6.flowi6_oif = oif;
2631         fl6.flowi6_mark = mark;
2632         fl6.daddr = msg->dest;
2633         fl6.saddr = iph->daddr;
2634         fl6.flowi6_uid = sock_net_uid(net, NULL);
2635
2636         dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
2637         rt6_do_redirect(dst, NULL, skb);
2638         dst_release(dst);
2639 }
2640
/* Socket-context redirect: process the redirect in @skb using the
 * socket's bound device, fwmark and uid for the route lookup.
 */
void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
		     sk->sk_uid);
}
EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2647
2648 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2649 {
2650         struct net_device *dev = dst->dev;
2651         unsigned int mtu = dst_mtu(dst);
2652         struct net *net = dev_net(dev);
2653
2654         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2655
2656         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2657                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2658
2659         /*
2660          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2661          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2662          * IPV6_MAXPLEN is also valid and means: "any MSS,
2663          * rely only on pmtu discovery"
2664          */
2665         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2666                 mtu = IPV6_MAXPLEN;
2667         return mtu;
2668 }
2669
2670 static unsigned int ip6_mtu(const struct dst_entry *dst)
2671 {
2672         struct inet6_dev *idev;
2673         unsigned int mtu;
2674
2675         mtu = dst_metric_raw(dst, RTAX_MTU);
2676         if (mtu)
2677                 goto out;
2678
2679         mtu = IPV6_MIN_MTU;
2680
2681         rcu_read_lock();
2682         idev = __in6_dev_get(dst->dev);
2683         if (idev)
2684                 mtu = idev->cnf.mtu6;
2685         rcu_read_unlock();
2686
2687 out:
2688         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2689
2690         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2691 }
2692
/* MTU selection:
 * 1. mtu on route is locked - use it
 * 2. mtu from nexthop exception
 * 3. mtu from egress device
 *
 * based on ip6_dst_mtu_forward and exception logic of
 * rt6_find_cached_rt; called with rcu_read_lock
 */
u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr,
		      struct in6_addr *saddr)
{
	struct inet6_dev *idev;
	struct rt6_info *rt;
	u32 mtu = 0;

	/* A locked metric wins outright; note the goto bypasses the
	 * IP6_MAX_MTU clamp below for this path.
	 */
	if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
		mtu = f6i->fib6_pmtu;
		if (mtu)
			goto out;
	}

	/* prefer a cached (exception) route's MTU, e.g. from PMTUD */
	rt = rt6_find_cached_rt(f6i, daddr, saddr);
	if (unlikely(rt)) {
		mtu = dst_metric_raw(&rt->dst, RTAX_MTU);
	} else {
		/* fall back to the nexthop device's IPv6 MTU */
		struct net_device *dev = fib6_info_nh_dev(f6i);

		mtu = IPV6_MIN_MTU;
		idev = __in6_dev_get(dev);
		if (idev && idev->cnf.mtu6 > mtu)
			mtu = idev->cnf.mtu6;
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
out:
	/* reserve room for any lwtunnel encapsulation headers */
	return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu);
}
2730
/* Allocate an ephemeral host route for sending an ICMPv6 packet to
 * fl6->daddr via @dev.  The route is never inserted into a fib table;
 * it is kept on the uncached list so device teardown can release it.
 * Returns the (possibly xfrm-transformed) dst, or an ERR_PTR.
 */
struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
				  struct flowi6 *fl6)
{
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct inet6_dev *idev = in6_dev_get(dev);
	struct net *net = dev_net(dev);

	if (unlikely(!idev))
		return ERR_PTR(-ENODEV);

	rt = ip6_dst_alloc(net, dev, 0);
	if (unlikely(!rt)) {
		/* drop the idev reference taken above */
		in6_dev_put(idev);
		dst = ERR_PTR(-ENOMEM);
		goto out;
	}

	rt->dst.flags |= DST_HOST;
	rt->dst.input = ip6_input;
	rt->dst.output  = ip6_output;
	rt->rt6i_gateway  = fl6->daddr;
	rt->rt6i_dst.addr = fl6->daddr;
	rt->rt6i_dst.plen = 128;
	/* the route takes over our idev reference */
	rt->rt6i_idev     = idev;
	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);

	/* Add this dst into uncached_list so that rt6_disable_ip() can
	 * do proper release of the net_device
	 */
	rt6_uncached_list_add(rt);
	atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);

	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);

out:
	return dst;
}
2769
/* dst_ops garbage collector.  Skips the run when we collected recently
 * and are still under ip6_rt_max_size; otherwise runs fib6_run_gc()
 * with an aging interval that grows on consecutive invocations and
 * decays via the elasticity sysctl.  Returns nonzero while the entry
 * count still exceeds rt_max_size.
 */
static int ip6_dst_gc(struct dst_ops *ops)
{
	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
	int entries;

	entries = dst_entries_get_fast(ops);
	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
	    entries <= rt_max_size)
		goto out;

	/* each back-to-back pass gets more aggressive */
	net->ipv6.ip6_rt_gc_expire++;
	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
	entries = dst_entries_get_slow(ops);
	if (entries < ops->gc_thresh)
		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
out:
	/* exponential decay of the aggressiveness counter */
	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
	return entries > rt_max_size;
}
2794
/* Attach a freshly allocated metrics block to @rt and fill it from the
 * RTA_METRICS attribute carried in @cfg.  No-op when the config has no
 * metrics.  Returns 0 or a negative errno.
 */
static int ip6_convert_metrics(struct net *net, struct fib6_info *rt,
			       struct fib6_config *cfg)
{
	struct dst_metrics *p;

	if (!cfg->fc_mx)
		return 0;

	p = kzalloc(sizeof(*rt->fib6_metrics), GFP_KERNEL);
	if (unlikely(!p))
		return -ENOMEM;

	/* rt owns the initial reference */
	refcount_set(&p->refcnt, 1);
	rt->fib6_metrics = p;

	return ip_metrics_convert(net, cfg->fc_mx, cfg->fc_mx_len, p->metrics);
}
2812
/* Look up the route towards @gw_addr in table @tbid only (bypassing
 * fib rules).  Used to validate a nexthop gateway when adding a route.
 * Returns a held rt6_info, or NULL when the table does not exist or
 * only the null entry matched.
 */
static struct rt6_info *ip6_nh_lookup_table(struct net *net,
					    struct fib6_config *cfg,
					    const struct in6_addr *gw_addr,
					    u32 tbid, int flags)
{
	struct flowi6 fl6 = {
		.flowi6_oif = cfg->fc_ifindex,
		.daddr = *gw_addr,
		.saddr = cfg->fc_prefsrc,
	};
	struct fib6_table *table;
	struct rt6_info *rt;

	table = fib6_get_table(net, tbid);
	if (!table)
		return NULL;

	if (!ipv6_addr_any(&cfg->fc_prefsrc))
		flags |= RT6_LOOKUP_F_HAS_SADDR;

	/* a down link must not invalidate the gateway lookup */
	flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
	rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);

	/* if table lookup failed, fall back to full lookup */
	if (rt == net->ipv6.ip6_null_entry) {
		ip6_rt_put(rt);
		rt = NULL;
	}

	return rt;
}
2844
/* Validate a nexthop configured with RTNH_F_ONLINK: the gateway must
 * not resolve (within the device's table) to a local, anycast or
 * reject route, nor to a different device than @dev.  A default-route
 * match is ignored.  Returns 0 when acceptable, -EINVAL otherwise.
 */
static int ip6_route_check_nh_onlink(struct net *net,
				     struct fib6_config *cfg,
				     const struct net_device *dev,
				     struct netlink_ext_ack *extack)
{
	u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
	struct fib6_info *from;
	struct rt6_info *grt;
	int err;

	err = 0;
	grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
	if (grt) {
		rcu_read_lock();
		from = rcu_dereference(grt->from);
		if (!grt->dst.error &&
		    /* ignore match if it is the default route */
		    from && !ipv6_addr_any(&from->fib6_dst.addr) &&
		    (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
			NL_SET_ERR_MSG(extack,
				       "Nexthop has invalid gateway or device mismatch");
			err = -EINVAL;
		}
		rcu_read_unlock();

		ip6_rt_put(grt);
	}

	return err;
}
2877
/* Resolve and validate a (non-onlink) nexthop gateway: look up
 * @gw_addr - first in cfg->fc_table when one was given, then via a
 * regular rt6_lookup() - and require that the result is a directly
 * connected (non-gateway) route.  When no egress device was supplied,
 * *_dev and *idev are filled from the lookup result with references
 * held.  Returns 0 on success, -EHOSTUNREACH otherwise.
 */
static int ip6_route_check_nh(struct net *net,
			      struct fib6_config *cfg,
			      struct net_device **_dev,
			      struct inet6_dev **idev)
{
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	struct net_device *dev = _dev ? *_dev : NULL;
	struct rt6_info *grt = NULL;
	int err = -EHOSTUNREACH;

	if (cfg->fc_table) {
		int flags = RT6_LOOKUP_F_IFACE;

		grt = ip6_nh_lookup_table(net, cfg, gw_addr,
					  cfg->fc_table, flags);
		if (grt) {
			/* discard recursive gateways and device mismatches */
			if (grt->rt6i_flags & RTF_GATEWAY ||
			    (dev && dev != grt->dst.dev)) {
				ip6_rt_put(grt);
				grt = NULL;
			}
		}
	}

	if (!grt)
		grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);

	if (!grt)
		goto out;

	if (dev) {
		if (dev != grt->dst.dev) {
			ip6_rt_put(grt);
			goto out;
		}
	} else {
		/* adopt device and idev from the lookup result */
		*_dev = dev = grt->dst.dev;
		*idev = grt->rt6i_idev;
		dev_hold(dev);
		in6_dev_hold(grt->rt6i_idev);
	}

	if (!(grt->rt6i_flags & RTF_GATEWAY))
		err = 0;

	ip6_rt_put(grt);

out:
	return err;
}
2928
/* Validate cfg->fc_gateway for a new route and, when no egress device
 * was supplied, resolve one via the nexthop checks.  Rejects gateways
 * that are local addresses, loopback egress devices and (with limited
 * exceptions) non-link-local gateway addresses.  May replace *_dev and
 * *idev.  Returns 0 or a negative errno.
 */
static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
			   struct net_device **_dev, struct inet6_dev **idev,
			   struct netlink_ext_ack *extack)
{
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	int gwa_type = ipv6_addr_type(gw_addr);
	bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
	const struct net_device *dev = *_dev;
	bool need_addr_check = !dev;
	int err = -EINVAL;

	/* if gw_addr is local we will fail to detect this in case
	 * address is still TENTATIVE (DAD in progress). rt6_lookup()
	 * will return already-added prefix route via interface that
	 * prefix route was assigned to, which might be non-loopback.
	 */
	if (dev &&
	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
		goto out;
	}

	if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
		/* IPv6 strictly inhibits using not link-local
		 * addresses as nexthop address.
		 * Otherwise, router will not able to send redirects.
		 * It is very good, but in some (rare!) circumstances
		 * (SIT, PtP, NBMA NOARP links) it is handy to allow
		 * some exceptions. --ANK
		 * We allow IPv4-mapped nexthops to support RFC4798-type
		 * addressing
		 */
		if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
			NL_SET_ERR_MSG(extack, "Invalid gateway address");
			goto out;
		}

		if (cfg->fc_flags & RTNH_F_ONLINK)
			err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
		else
			err = ip6_route_check_nh(net, cfg, _dev, idev);

		if (err)
			goto out;
	}

	/* reload in case device was changed */
	dev = *_dev;

	err = -EINVAL;
	if (!dev) {
		NL_SET_ERR_MSG(extack, "Egress device not specified");
		goto out;
	} else if (dev->flags & IFF_LOOPBACK) {
		NL_SET_ERR_MSG(extack,
			       "Egress device can not be loopback device for this route");
		goto out;
	}

	/* if we did not check gw_addr above, do so now that the
	 * egress device has been resolved.
	 */
	if (need_addr_check &&
	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
		goto out;
	}

	err = 0;
out:
	return err;
}
3001
/* Allocate and initialize a fib6_info from a route configuration
 * (netlink RTM_NEWROUTE or ioctl).  Validates flags, prefix lengths,
 * nexthop device, gateway and preferred source address.  The route is
 * NOT inserted into a table here - ip6_route_add() does that.
 * Returns the new fib6_info, or an ERR_PTR on failure.
 */
static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
					      gfp_t gfp_flags,
					      struct netlink_ext_ack *extack)
{
	struct net *net = cfg->fc_nlinfo.nl_net;
	struct fib6_info *rt = NULL;
	struct net_device *dev = NULL;
	struct inet6_dev *idev = NULL;
	struct fib6_table *table;
	int addr_type;
	int err = -EINVAL;

	/* RTF_PCPU is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_PCPU) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
		goto out;
	}

	/* RTF_CACHE is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_CACHE) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
		goto out;
	}

	if (cfg->fc_type > RTN_MAX) {
		NL_SET_ERR_MSG(extack, "Invalid route type");
		goto out;
	}

	if (cfg->fc_dst_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid prefix length");
		goto out;
	}
	if (cfg->fc_src_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid source address length");
		goto out;
	}
#ifndef CONFIG_IPV6_SUBTREES
	if (cfg->fc_src_len) {
		NL_SET_ERR_MSG(extack,
			       "Specifying source address requires IPV6_SUBTREES to be enabled");
		goto out;
	}
#endif
	/* resolve the nexthop device, holding dev and idev references */
	if (cfg->fc_ifindex) {
		err = -ENODEV;
		dev = dev_get_by_index(net, cfg->fc_ifindex);
		if (!dev)
			goto out;
		idev = in6_dev_get(dev);
		if (!idev)
			goto out;
	}

	if (cfg->fc_metric == 0)
		cfg->fc_metric = IP6_RT_PRIO_USER;

	/* onlink nexthops require an explicit device that is up */
	if (cfg->fc_flags & RTNH_F_ONLINK) {
		if (!dev) {
			NL_SET_ERR_MSG(extack,
				       "Nexthop device required for onlink");
			err = -ENODEV;
			goto out;
		}

		if (!(dev->flags & IFF_UP)) {
			NL_SET_ERR_MSG(extack, "Nexthop device is not up");
			err = -ENETDOWN;
			goto out;
		}
	}

	err = -ENOBUFS;
	if (cfg->fc_nlinfo.nlh &&
	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
		table = fib6_get_table(net, cfg->fc_table);
		if (!table) {
			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
			table = fib6_new_table(net, cfg->fc_table);
		}
	} else {
		table = fib6_new_table(net, cfg->fc_table);
	}

	if (!table)
		goto out;

	err = -ENOMEM;
	rt = fib6_info_alloc(gfp_flags);
	if (!rt)
		goto out;

#ifdef CONFIG_IPV6_ROUTER_PREF
	rt->last_probe = jiffies;
#endif
	if (cfg->fc_flags & RTF_ADDRCONF)
		rt->dst_nocount = true;

	err = ip6_convert_metrics(net, rt, cfg);
	if (err < 0)
		goto out;

	if (cfg->fc_flags & RTF_EXPIRES)
		fib6_set_expires(rt, jiffies +
				clock_t_to_jiffies(cfg->fc_expires));
	else
		fib6_clean_expires(rt);

	if (cfg->fc_protocol == RTPROT_UNSPEC)
		cfg->fc_protocol = RTPROT_BOOT;
	rt->fib6_protocol = cfg->fc_protocol;

	addr_type = ipv6_addr_type(&cfg->fc_dst);

	/* build lwtunnel encapsulation state when requested */
	if (cfg->fc_encap) {
		struct lwtunnel_state *lwtstate;

		err = lwtunnel_build_state(cfg->fc_encap_type,
					   cfg->fc_encap, AF_INET6, cfg,
					   &lwtstate, extack);
		if (err)
			goto out;
		rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate);
	}

	ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
	rt->fib6_dst.plen = cfg->fc_dst_len;
	if (rt->fib6_dst.plen == 128)
		rt->dst_host = true;

#ifdef CONFIG_IPV6_SUBTREES
	ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
	rt->fib6_src.plen = cfg->fc_src_len;
#endif

	rt->fib6_metric = cfg->fc_metric;
	rt->fib6_nh.nh_weight = 1;

	rt->fib6_type = cfg->fc_type ? : RTN_UNICAST;

	/* We cannot add true routes via loopback here,
	   they would result in kernel looping; promote them to reject routes
	 */
	if ((cfg->fc_flags & RTF_REJECT) ||
	    (dev && (dev->flags & IFF_LOOPBACK) &&
	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
	     !(cfg->fc_flags & RTF_LOCAL))) {
		/* hold loopback dev/idev if we haven't done so. */
		if (dev != net->loopback_dev) {
			if (dev) {
				dev_put(dev);
				in6_dev_put(idev);
			}
			dev = net->loopback_dev;
			dev_hold(dev);
			idev = in6_dev_get(dev);
			if (!idev) {
				err = -ENODEV;
				goto out;
			}
		}
		rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP;
		goto install_route;
	}

	if (cfg->fc_flags & RTF_GATEWAY) {
		err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
		if (err)
			goto out;

		rt->fib6_nh.nh_gw = cfg->fc_gateway;
	}

	err = -ENODEV;
	if (!dev)
		goto out;

	if (idev->cnf.disable_ipv6) {
		NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
		err = -EACCES;
		goto out;
	}

	if (!(dev->flags & IFF_UP)) {
		NL_SET_ERR_MSG(extack, "Nexthop device is not up");
		err = -ENETDOWN;
		goto out;
	}

	/* preferred source must be an address on the nexthop device */
	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
			NL_SET_ERR_MSG(extack, "Invalid source address");
			err = -EINVAL;
			goto out;
		}
		rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
		rt->fib6_prefsrc.plen = 128;
	} else
		rt->fib6_prefsrc.plen = 0;

	rt->fib6_flags = cfg->fc_flags;

install_route:
	if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
	    !netif_carrier_ok(dev))
		rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
	rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
	/* the fib6_info takes over the device reference */
	rt->fib6_nh.nh_dev = dev;
	rt->fib6_table = table;

	cfg->fc_nlinfo.nl_net = dev_net(dev);

	if (idev)
		in6_dev_put(idev);

	return rt;
out:
	if (dev)
		dev_put(dev);
	if (idev)
		in6_dev_put(idev);

	fib6_info_release(rt);
	return ERR_PTR(err);
}
3227
/* Create a fib6_info from @cfg and insert it into its fib table.
 * The insert path takes its own reference, so the local one is
 * released before returning.  Returns 0 or a negative errno.
 */
int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
		  struct netlink_ext_ack *extack)
{
	struct fib6_info *rt;
	int err;

	rt = ip6_route_info_create(cfg, gfp_flags, extack);
	if (IS_ERR(rt))
		return PTR_ERR(rt);

	err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
	fib6_info_release(rt);

	return err;
}
3243
/* Remove @rt from its fib table under the table lock and drop the
 * caller's reference.  The null entry itself can not be deleted
 * (-ENOENT).
 */
static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
{
	struct net *net = info->nl_net;
	struct fib6_table *table;
	int err;

	if (rt == net->ipv6.fib6_null_entry) {
		err = -ENOENT;
		goto out;
	}

	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_del(rt, info);
	spin_unlock_bh(&table->tb6_lock);

out:
	fib6_info_release(rt);
	return err;
}
3264
/* Delete @rt with default netlink info for @net; consumes the
 * caller's reference on @rt.
 */
int ip6_del_rt(struct net *net, struct fib6_info *rt)
{
	struct nl_info info = { .nl_net = net };

	return __ip6_del_rt(rt, &info);
}
3271
/* Delete @rt and - when fc_delete_all_nh is set - all of its ECMP
 * siblings in one operation, sending a single RTM_DELROUTE
 * notification covering every hop.  Drops the caller's reference on
 * @rt.  Returns 0 or a negative errno.
 */
static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
{
	struct nl_info *info = &cfg->fc_nlinfo;
	struct net *net = info->nl_net;
	struct sk_buff *skb = NULL;
	struct fib6_table *table;
	int err = -ENOENT;

	if (rt == net->ipv6.fib6_null_entry)
		goto out_put;
	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);

	if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
		struct fib6_info *sibling, *next_sibling;

		/* prefer to send a single notification with all hops */
		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
		if (skb) {
			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;

			if (rt6_fill_node(net, skb, rt, NULL,
					  NULL, NULL, 0, RTM_DELROUTE,
					  info->portid, seq, 0) < 0) {
				kfree_skb(skb);
				skb = NULL;
			} else
				/* suppress per-hop notifications in fib6_del() */
				info->skip_notify = 1;
		}

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->fib6_siblings,
					 fib6_siblings) {
			err = fib6_del(sibling, info);
			if (err)
				goto out_unlock;
		}
	}

	err = fib6_del(rt, info);
out_unlock:
	spin_unlock_bh(&table->tb6_lock);
out_put:
	fib6_info_release(rt);

	if (skb) {
		/* send the combined multi-hop notification */
		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
			    info->nlh, gfp_any());
	}
	return err;
}
3323
3324 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3325 {
3326         int rc = -ESRCH;
3327
3328         if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3329                 goto out;
3330
3331         if (cfg->fc_flags & RTF_GATEWAY &&
3332             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3333                 goto out;
3334
3335         rc = rt6_remove_exception_rt(rt);
3336 out:
3337         return rc;
3338 }
3339
/* Delete a route matching @cfg: dst/src prefix plus the optional
 * device, gateway, metric and protocol selectors.  With RTF_CACHE only
 * a cached exception route is removed.  Returns 0 on success, -ESRCH
 * when nothing matched.
 */
static int ip6_route_del(struct fib6_config *cfg,
			 struct netlink_ext_ack *extack)
{
	struct rt6_info *rt_cache;
	struct fib6_table *table;
	struct fib6_info *rt;
	struct fib6_node *fn;
	int err = -ESRCH;

	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
	if (!table) {
		NL_SET_ERR_MSG(extack, "FIB table does not exist");
		return err;
	}

	rcu_read_lock();

	fn = fib6_locate(&table->tb6_root,
			 &cfg->fc_dst, cfg->fc_dst_len,
			 &cfg->fc_src, cfg->fc_src_len,
			 !(cfg->fc_flags & RTF_CACHE));

	if (fn) {
		for_each_fib6_node_rt_rcu(fn) {
			if (cfg->fc_flags & RTF_CACHE) {
				int rc;

				rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
							      &cfg->fc_src);
				if (rt_cache) {
					rc = ip6_del_cached_rt(rt_cache, cfg);
					if (rc != -ESRCH) {
						rcu_read_unlock();
						return rc;
					}
				}
				continue;
			}
			/* apply the optional selectors from the request */
			if (cfg->fc_ifindex &&
			    (!rt->fib6_nh.nh_dev ||
			     rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex))
				continue;
			if (cfg->fc_flags & RTF_GATEWAY &&
			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw))
				continue;
			if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
				continue;
			if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
				continue;
			if (!fib6_info_hold_safe(rt))
				continue;
			rcu_read_unlock();

			/* if gateway was specified only delete the one hop */
			if (cfg->fc_flags & RTF_GATEWAY)
				return __ip6_del_rt(rt, &cfg->fc_nlinfo);

			return __ip6_del_rt_siblings(rt, cfg);
		}
	}
	rcu_read_unlock();

	return err;
}
3404
/* Process a received ICMPv6 Redirect (RFC 4861 section 8) for @dst.
 *
 * Validates the redirect message and its ND options, updates the
 * neighbour cache entry for the new first hop, and installs a cached
 * route (exception entry) steering msg->dest via the redirect target.
 * Invalid, multicast-destined or administratively-disabled redirects
 * are silently dropped (with a ratelimited debug message).
 */
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
	struct netevent_redirect netevent;
	struct rt6_info *rt, *nrt = NULL;
	struct ndisc_options ndopts;
	struct inet6_dev *in6_dev;
	struct neighbour *neigh;
	struct fib6_info *from;
	struct rd_msg *msg;
	int optlen, on_link;
	u8 *lladdr;

	/* Length of the ND options that follow the fixed redirect header. */
	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
	optlen -= sizeof(*msg);

	if (optlen < 0) {
		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
		return;
	}

	msg = (struct rd_msg *)icmp6_hdr(skb);

	if (ipv6_addr_is_multicast(&msg->dest)) {
		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
		return;
	}

	/* target == dest means the destination itself is on-link;
	 * otherwise the target must be a link-local unicast router.
	 */
	on_link = 0;
	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
		on_link = 1;
	} else if (ipv6_addr_type(&msg->target) !=
		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
		return;
	}

	in6_dev = __in6_dev_get(skb->dev);
	if (!in6_dev)
		return;
	/* Routers must not accept redirects, and the admin may have
	 * disabled them on this interface.
	 */
	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
		return;

	/* RFC2461 8.1:
	 *	The IP source address of the Redirect MUST be the same as the current
	 *	first-hop router for the specified ICMP Destination Address.
	 */

	if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
		return;
	}

	lladdr = NULL;
	if (ndopts.nd_opts_tgt_lladdr) {
		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
					     skb->dev);
		if (!lladdr) {
			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
			return;
		}
	}

	rt = (struct rt6_info *) dst;
	if (rt->rt6i_flags & RTF_REJECT) {
		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
		return;
	}

	/* Redirect received -> path was valid.
	 * Look, redirects are sent only in response to data packets,
	 * so that this nexthop apparently is reachable. --ANK
	 */
	dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);

	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
	if (!neigh)
		return;

	/*
	 *	We have finally decided to accept it.
	 */

	ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
		     NEIGH_UPDATE_F_OVERRIDE|
		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
				     NEIGH_UPDATE_F_ISROUTER)),
		     NDISC_REDIRECT, &ndopts);

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	if (!from)
		goto out;

	/* Clone the parent route into a cache entry pointing at the
	 * redirect target.
	 */
	nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL);
	if (!nrt)
		goto out;

	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
	if (on_link)
		nrt->rt6i_flags &= ~RTF_GATEWAY;

	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;

	/* rt6_insert_exception() will take care of duplicated exceptions */
	if (rt6_insert_exception(nrt, from)) {
		dst_release_immediate(&nrt->dst);
		goto out;
	}

	netevent.old = &rt->dst;
	netevent.new = &nrt->dst;
	netevent.daddr = &msg->dest;
	netevent.neigh = neigh;
	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);

out:
	rcu_read_unlock();
	neigh_release(neigh);
}
3525
3526 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Look up an RA-learned (RFC 4191 Route Information) route for
 * @prefix/@prefixlen via @gwaddr on @dev.
 *
 * Returns the matching fib6_info with a reference held, or NULL if no
 * such route exists (or its refcount could not be taken safely).
 */
static struct fib6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev)
{
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
	int ifindex = dev->ifindex;
	struct fib6_node *fn;
	struct fib6_info *rt = NULL;
	struct fib6_table *table;

	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
	if (!fn)
		goto out;

	for_each_fib6_node_rt_rcu(fn) {
		if (rt->fib6_nh.nh_dev->ifindex != ifindex)
			continue;
		/* Only entries installed by route-information processing. */
		if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
			continue;
		if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr))
			continue;
		/* Pin the entry before returning it outside the RCU section. */
		if (!fib6_info_hold_safe(rt))
			continue;
		break;
	}
out:
	rcu_read_unlock();
	return rt;
}
3562
3563 static struct fib6_info *rt6_add_route_info(struct net *net,
3564                                            const struct in6_addr *prefix, int prefixlen,
3565                                            const struct in6_addr *gwaddr,
3566                                            struct net_device *dev,
3567                                            unsigned int pref)
3568 {
3569         struct fib6_config cfg = {
3570                 .fc_metric      = IP6_RT_PRIO_USER,
3571                 .fc_ifindex     = dev->ifindex,
3572                 .fc_dst_len     = prefixlen,
3573                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3574                                   RTF_UP | RTF_PREF(pref),
3575                 .fc_protocol = RTPROT_RA,
3576                 .fc_type = RTN_UNICAST,
3577                 .fc_nlinfo.portid = 0,
3578                 .fc_nlinfo.nlh = NULL,
3579                 .fc_nlinfo.nl_net = net,
3580         };
3581
3582         cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3583         cfg.fc_dst = *prefix;
3584         cfg.fc_gateway = *gwaddr;
3585
3586         /* We should treat it as a default route if prefix length is 0. */
3587         if (!prefixlen)
3588                 cfg.fc_flags |= RTF_DEFAULT;
3589
3590         ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3591
3592         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3593 }
3594 #endif
3595
/* Find the RA-installed default route via gateway @addr on @dev.
 *
 * Scans the root node of the device's default-route table for an
 * entry carrying both RTF_ADDRCONF and RTF_DEFAULT.  Returns the
 * fib6_info with a reference held, or NULL.
 */
struct fib6_info *rt6_get_dflt_router(struct net *net,
				     const struct in6_addr *addr,
				     struct net_device *dev)
{
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
	struct fib6_info *rt;
	struct fib6_table *table;

	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		if (dev == rt->fib6_nh.nh_dev &&
		    ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
		    ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr))
			break;
	}
	/* rt is NULL when the loop ran to completion without a match. */
	if (rt && !fib6_info_hold_safe(rt))
		rt = NULL;
	rcu_read_unlock();
	return rt;
}
3620
/* Install a default route learned from a Router Advertisement via
 * gateway @gwaddr on @dev with router preference @pref.
 *
 * On successful insertion the table is flagged as holding an
 * RA default router so rt6_purge_dflt_routers() knows to scan it.
 * Returns the route via rt6_get_dflt_router() (reference held), or
 * NULL if it could not be added/found.
 */
struct fib6_info *rt6_add_dflt_router(struct net *net,
				     const struct in6_addr *gwaddr,
				     struct net_device *dev,
				     unsigned int pref)
{
	struct fib6_config cfg = {
		.fc_table	= l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
		.fc_metric	= IP6_RT_PRIO_USER,
		.fc_ifindex	= dev->ifindex,
		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
		.fc_protocol = RTPROT_RA,
		.fc_type = RTN_UNICAST,
		.fc_nlinfo.portid = 0,
		.fc_nlinfo.nlh = NULL,
		.fc_nlinfo.nl_net = net,
	};

	cfg.fc_gateway = *gwaddr;

	if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
		struct fib6_table *table;

		table = fib6_get_table(dev_net(dev), cfg.fc_table);
		if (table)
			table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
	}

	return rt6_get_dflt_router(net, gwaddr, dev);
}
3651
/* Remove all RA-learned default routes from @table.
 *
 * Interfaces with accept_ra == 2 keep their default routes even when
 * forwarding is enabled, so those entries are skipped.  Because
 * ip6_del_rt() cannot run under RCU, the walk restarts from the top
 * after each deletion.
 */
static void __rt6_purge_dflt_routers(struct net *net,
				     struct fib6_table *table)
{
	struct fib6_info *rt;

restart:
	rcu_read_lock();
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		struct net_device *dev = fib6_info_nh_dev(rt);
		struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;

		if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
		    (!idev || idev->cnf.accept_ra != 2) &&
		    fib6_info_hold_safe(rt)) {
			/* Drop RCU before the blocking delete, then rescan. */
			rcu_read_unlock();
			ip6_del_rt(net, rt);
			goto restart;
		}
	}
	rcu_read_unlock();

	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
}
3675
3676 void rt6_purge_dflt_routers(struct net *net)
3677 {
3678         struct fib6_table *table;
3679         struct hlist_head *head;
3680         unsigned int h;
3681
3682         rcu_read_lock();
3683
3684         for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3685                 head = &net->ipv6.fib_table_hash[h];
3686                 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3687                         if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3688                                 __rt6_purge_dflt_routers(net, table);
3689                 }
3690         }
3691
3692         rcu_read_unlock();
3693 }
3694
3695 static void rtmsg_to_fib6_config(struct net *net,
3696                                  struct in6_rtmsg *rtmsg,
3697                                  struct fib6_config *cfg)
3698 {
3699         memset(cfg, 0, sizeof(*cfg));
3700
3701         cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3702                          : RT6_TABLE_MAIN;
3703         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
3704         cfg->fc_metric = rtmsg->rtmsg_metric;
3705         cfg->fc_expires = rtmsg->rtmsg_info;
3706         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3707         cfg->fc_src_len = rtmsg->rtmsg_src_len;
3708         cfg->fc_flags = rtmsg->rtmsg_flags;
3709         cfg->fc_type = rtmsg->rtmsg_type;
3710
3711         cfg->fc_nlinfo.nl_net = net;
3712
3713         cfg->fc_dst = rtmsg->rtmsg_dst;
3714         cfg->fc_src = rtmsg->rtmsg_src;
3715         cfg->fc_gateway = rtmsg->rtmsg_gateway;
3716 }
3717
3718 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3719 {
3720         struct fib6_config cfg;
3721         struct in6_rtmsg rtmsg;
3722         int err;
3723
3724         switch (cmd) {
3725         case SIOCADDRT:         /* Add a route */
3726         case SIOCDELRT:         /* Delete a route */
3727                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3728                         return -EPERM;
3729                 err = copy_from_user(&rtmsg, arg,
3730                                      sizeof(struct in6_rtmsg));
3731                 if (err)
3732                         return -EFAULT;
3733
3734                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3735
3736                 rtnl_lock();
3737                 switch (cmd) {
3738                 case SIOCADDRT:
3739                         err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
3740                         break;
3741                 case SIOCDELRT:
3742                         err = ip6_route_del(&cfg, NULL);
3743                         break;
3744                 default:
3745                         err = -EINVAL;
3746                 }
3747                 rtnl_unlock();
3748
3749                 return err;
3750         }
3751
3752         return -EINVAL;
3753 }
3754
3755 /*
3756  *      Drop the packet on the floor
3757  */
3758
3759 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3760 {
3761         int type;
3762         struct dst_entry *dst = skb_dst(skb);
3763         switch (ipstats_mib_noroutes) {
3764         case IPSTATS_MIB_INNOROUTES:
3765                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3766                 if (type == IPV6_ADDR_ANY) {
3767                         IP6_INC_STATS(dev_net(dst->dev),
3768                                       __in6_dev_get_safely(skb->dev),
3769                                       IPSTATS_MIB_INADDRERRORS);
3770                         break;
3771                 }
3772                 /* FALLTHROUGH */
3773         case IPSTATS_MIB_OUTNOROUTES:
3774                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3775                               ipstats_mib_noroutes);
3776                 break;
3777         }
3778         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3779         kfree_skb(skb);
3780         return 0;
3781 }
3782
/* dst input handler for blackhole routes: drop with "no route". */
static int ip6_pkt_discard(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
}
3787
/* dst output handler for blackhole routes: drop with "no route". */
static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
}
3793
/* dst input handler for prohibit routes: drop with "admin prohibited". */
static int ip6_pkt_prohibit(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
}
3798
/* dst output handler for prohibit routes: drop with "admin prohibited". */
static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
}
3804
3805 /*
3806  *      Allocate a dst for local (unicast / anycast) address.
3807  */
3808
3809 struct fib6_info *addrconf_f6i_alloc(struct net *net,
3810                                      struct inet6_dev *idev,
3811                                      const struct in6_addr *addr,
3812                                      bool anycast, gfp_t gfp_flags)
3813 {
3814         u32 tb_id;
3815         struct net_device *dev = idev->dev;
3816         struct fib6_info *f6i;
3817
3818         f6i = fib6_info_alloc(gfp_flags);
3819         if (!f6i)
3820                 return ERR_PTR(-ENOMEM);
3821
3822         f6i->dst_nocount = true;
3823         f6i->dst_host = true;
3824         f6i->fib6_protocol = RTPROT_KERNEL;
3825         f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP;
3826         if (anycast) {
3827                 f6i->fib6_type = RTN_ANYCAST;
3828                 f6i->fib6_flags |= RTF_ANYCAST;
3829         } else {
3830                 f6i->fib6_type = RTN_LOCAL;
3831                 f6i->fib6_flags |= RTF_LOCAL;
3832         }
3833
3834         f6i->fib6_nh.nh_gw = *addr;
3835         dev_hold(dev);
3836         f6i->fib6_nh.nh_dev = dev;
3837         f6i->fib6_dst.addr = *addr;
3838         f6i->fib6_dst.plen = 128;
3839         tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3840         f6i->fib6_table = fib6_get_table(net, tb_id);
3841
3842         return f6i;
3843 }
3844
/* remove deleted ip from prefsrc entries */
struct arg_dev_net_ip {
	struct net_device *dev;	/* restrict to this device; NULL = any */
	struct net *net;	/* netns being walked */
	struct in6_addr *addr;	/* preferred source address going away */
};
3851
/* fib6_clean_all() callback: clear the prefsrc of @rt when it matches
 * the address (and optionally the device) in @arg, and scrub matching
 * cached exception routes as well.  Always returns 0 (keep walking).
 */
static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
{
	struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
	struct net *net = ((struct arg_dev_net_ip *)arg)->net;
	struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;

	if (((void *)rt->fib6_nh.nh_dev == dev || !dev) &&
	    rt != net->ipv6.fib6_null_entry &&
	    ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
		spin_lock_bh(&rt6_exception_lock);
		/* remove prefsrc entry */
		rt->fib6_prefsrc.plen = 0;
		/* need to update cache as well */
		rt6_exceptions_remove_prefsrc(rt);
		spin_unlock_bh(&rt6_exception_lock);
	}
	return 0;
}
3870
/* Strip @ifp's address from the prefsrc of every route on its device:
 * called when the address is deleted so stale source selection hints
 * don't linger in the FIB.
 */
void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
{
	struct net *net = dev_net(ifp->idev->dev);
	struct arg_dev_net_ip adni = {
		.dev = ifp->idev->dev,
		.net = net,
		.addr = &ifp->addr,
	};
	fib6_clean_all(net, fib6_remove_prefsrc, &adni);
}
3881
/* Flag combination identifying an RA-learned default-router route. */
#define RTF_RA_ROUTER		(RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)

/* Remove routers and update dst entries when gateway turn into host. */
static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
{
	struct in6_addr *gateway = (struct in6_addr *)arg;

	/* Returning -1 tells fib6_clean_all() to delete this entry. */
	if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
	    ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) {
		return -1;
	}

	/* Further clean up cached routes in exception table.
	 * This is needed because cached route may have a different
	 * gateway than its 'parent' in the case of an ip redirect.
	 */
	rt6_exceptions_clean_tohost(rt, gateway);

	return 0;
}
3902
/* Purge all routes (and cached exceptions) using @gateway as router
 * after the peer stopped advertising itself as one.
 */
void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
{
	fib6_clean_all(net, fib6_clean_tohost, gateway);
}
3907
/* Argument block for fib6 walkers driven by netdevice events; the
 * union carries either nexthop flags to manipulate (rt6_sync_up) or
 * the netdev event code (rt6_sync_down_dev).
 */
struct arg_netdev_event {
	const struct net_device *dev;
	union {
		unsigned int nh_flags;
		unsigned long event;
	};
};
3915
/* Find the first sibling of @rt's ECMP group in FIB order.
 *
 * Walks the fib6_node's route list from the leaf, looking for the
 * first entry with the same metric that qualifies for ECMP.  Must be
 * called with the table write lock held (enforced via lockdep).
 * Returns NULL if no qualifying entry is found.
 */
static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
{
	struct fib6_info *iter;
	struct fib6_node *fn;

	fn = rcu_dereference_protected(rt->fib6_node,
			lockdep_is_held(&rt->fib6_table->tb6_lock));
	iter = rcu_dereference_protected(fn->leaf,
			lockdep_is_held(&rt->fib6_table->tb6_lock));
	while (iter) {
		if (iter->fib6_metric == rt->fib6_metric &&
		    rt6_qualify_for_ecmp(iter))
			return iter;
		iter = rcu_dereference_protected(iter->fib6_next,
				lockdep_is_held(&rt->fib6_table->tb6_lock));
	}

	return NULL;
}
3935
3936 static bool rt6_is_dead(const struct fib6_info *rt)
3937 {
3938         if (rt->fib6_nh.nh_flags & RTNH_F_DEAD ||
3939             (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
3940              fib6_ignore_linkdown(rt)))
3941                 return true;
3942
3943         return false;
3944 }
3945
3946 static int rt6_multipath_total_weight(const struct fib6_info *rt)
3947 {
3948         struct fib6_info *iter;
3949         int total = 0;
3950
3951         if (!rt6_is_dead(rt))
3952                 total += rt->fib6_nh.nh_weight;
3953
3954         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
3955                 if (!rt6_is_dead(iter))
3956                         total += iter->fib6_nh.nh_weight;
3957         }
3958
3959         return total;
3960 }
3961
/* Assign @rt's hash upper bound for weighted ECMP selection.
 *
 * @weight accumulates the running weight of all live nexthops seen so
 * far; the upper bound is the cumulative fraction weight/total scaled
 * into the 31-bit hash space (rounded, minus one).  Dead nexthops get
 * -1 so no hash value ever selects them.
 */
static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
{
	int upper_bound = -1;

	if (!rt6_is_dead(rt)) {
		*weight += rt->fib6_nh.nh_weight;
		upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
						    total) - 1;
	}
	atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound);
}
3973
/* Recompute the hash upper bounds across @rt and all its siblings.
 * @rt must be the first sibling so the cumulative weights are assigned
 * in lookup order.
 */
static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
{
	struct fib6_info *iter;
	int weight = 0;

	rt6_upper_bound_set(rt, &weight, total);

	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
		rt6_upper_bound_set(iter, &weight, total);
}
3984
/* Rebalance the weighted-ECMP hash ranges of @rt's multipath group
 * after a nexthop changed state (came up, went down, was removed).
 * No-op for non-multipath routes or groups already marked for flush.
 */
void rt6_multipath_rebalance(struct fib6_info *rt)
{
	struct fib6_info *first;
	int total;

	/* In case the entire multipath route was marked for flushing,
	 * then there is no need to rebalance upon the removal of every
	 * sibling route.
	 */
	if (!rt->fib6_nsiblings || rt->should_flush)
		return;

	/* During lookup routes are evaluated in order, so we need to
	 * make sure upper bounds are assigned from the first sibling
	 * onwards.
	 */
	first = rt6_multipath_first_sibling(rt);
	if (WARN_ON_ONCE(!first))
		return;

	total = rt6_multipath_total_weight(first);
	rt6_multipath_upper_bound_set(first, total);
}
4008
/* fib6_clean_all() callback for device-up events: clear the requested
 * nexthop flags on routes using the device, bump the FIB serial number
 * so cached lookups revalidate, and rebalance the ECMP group.
 * Always returns 0 (never deletes entries).
 */
static int fib6_ifup(struct fib6_info *rt, void *p_arg)
{
	const struct arg_netdev_event *arg = p_arg;
	struct net *net = dev_net(arg->dev);

	if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) {
		rt->fib6_nh.nh_flags &= ~arg->nh_flags;
		fib6_update_sernum_upto_root(net, rt);
		rt6_multipath_rebalance(rt);
	}

	return 0;
}
4022
/* Propagate a device coming (back) up into the FIB: clear @nh_flags on
 * every route through @dev.  When the device is revived (RTNH_F_DEAD)
 * and carrier is present, LINKDOWN is cleared as well.
 */
void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
{
	struct arg_netdev_event arg = {
		.dev = dev,
		{
			.nh_flags = nh_flags,
		},
	};

	if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
		arg.nh_flags |= RTNH_F_LINKDOWN;

	fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
}
4037
4038 static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
4039                                    const struct net_device *dev)
4040 {
4041         struct fib6_info *iter;
4042
4043         if (rt->fib6_nh.nh_dev == dev)
4044                 return true;
4045         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4046                 if (iter->fib6_nh.nh_dev == dev)
4047                         return true;
4048
4049         return false;
4050 }
4051
/* Mark @rt and every sibling for deletion so the whole multipath
 * route is flushed as one unit.
 */
static void rt6_multipath_flush(struct fib6_info *rt)
{
	struct fib6_info *iter;

	rt->should_flush = 1;
	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
		iter->should_flush = 1;
}
4060
4061 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
4062                                              const struct net_device *down_dev)
4063 {
4064         struct fib6_info *iter;
4065         unsigned int dead = 0;
4066
4067         if (rt->fib6_nh.nh_dev == down_dev ||
4068             rt->fib6_nh.nh_flags & RTNH_F_DEAD)
4069                 dead++;
4070         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4071                 if (iter->fib6_nh.nh_dev == down_dev ||
4072                     iter->fib6_nh.nh_flags & RTNH_F_DEAD)
4073                         dead++;
4074
4075         return dead;
4076 }
4077
/* Set @nh_flags on every member of @rt's ECMP group whose nexthop
 * device is @dev.
 */
static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
				       const struct net_device *dev,
				       unsigned int nh_flags)
{
	struct fib6_info *iter;

	if (rt->fib6_nh.nh_dev == dev)
		rt->fib6_nh.nh_flags |= nh_flags;
	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
		if (iter->fib6_nh.nh_dev == dev)
			iter->fib6_nh.nh_flags |= nh_flags;
}
4090
/* called with write lock held for table with rt */

/* fib6_clean_all() callback for device-down/unregister events.
 *
 * Return values (fib6 walker convention):
 *   -1  delete this route,
 *   -2  delete this route but skip notifying its (still live) siblings,
 *    0  keep the route.
 *
 * NETDEV_UNREGISTER removes any route on the device.  NETDEV_DOWN
 * removes single-path routes on the device and flushes a multipath
 * group once all its nexthops are gone; otherwise the affected
 * nexthops are only marked DEAD|LINKDOWN and the group rebalanced.
 * NETDEV_CHANGE (carrier loss) just marks the nexthop LINKDOWN,
 * except for local/anycast routes.
 */
static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
{
	const struct arg_netdev_event *arg = p_arg;
	const struct net_device *dev = arg->dev;
	struct net *net = dev_net(dev);

	if (rt == net->ipv6.fib6_null_entry)
		return 0;

	switch (arg->event) {
	case NETDEV_UNREGISTER:
		return rt->fib6_nh.nh_dev == dev ? -1 : 0;
	case NETDEV_DOWN:
		if (rt->should_flush)
			return -1;
		if (!rt->fib6_nsiblings)
			return rt->fib6_nh.nh_dev == dev ? -1 : 0;
		if (rt6_multipath_uses_dev(rt, dev)) {
			unsigned int count;

			count = rt6_multipath_dead_count(rt, dev);
			/* Last usable nexthop going away: flush the group. */
			if (rt->fib6_nsiblings + 1 == count) {
				rt6_multipath_flush(rt);
				return -1;
			}
			rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
						   RTNH_F_LINKDOWN);
			fib6_update_sernum(net, rt);
			rt6_multipath_rebalance(rt);
		}
		return -2;
	case NETDEV_CHANGE:
		if (rt->fib6_nh.nh_dev != dev ||
		    rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
			break;
		rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
		rt6_multipath_rebalance(rt);
		break;
	}

	return 0;
}
4134
/* Propagate a netdev down/unregister @event into the FIB for all
 * routes using @dev (see fib6_ifdown() for per-event semantics).
 */
void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
{
	struct arg_netdev_event arg = {
		.dev = dev,
		{
			.event = event,
		},
	};

	fib6_clean_all(dev_net(dev), fib6_ifdown, &arg);
}
4146
/* Tear down all IPv6 routing state tied to @dev: FIB routes, the
 * uncached dst list, and neighbour cache entries.
 */
void rt6_disable_ip(struct net_device *dev, unsigned long event)
{
	rt6_sync_down_dev(dev, event);
	rt6_uncached_list_flush_dev(dev_net(dev), dev);
	neigh_ifdown(&nd_tbl, dev);
}
4153
/* Argument block for the MTU-change FIB walk. */
struct rt6_mtu_change_arg {
	struct net_device *dev;	/* device whose MTU changed */
	unsigned int mtu;	/* its new MTU */
};
4158
/* fib6_clean_all() callback: refresh the RTAX_MTU metric of routes on
 * the device whose MTU changed, and update any cached exception
 * routes' PMTU accordingly.  Always returns 0 (never deletes).
 */
static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
{
	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
	struct inet6_dev *idev;

	/* In IPv6 pmtu discovery is not optional,
	   so that RTAX_MTU lock cannot disable it.
	   We still use this lock to block changes
	   caused by addrconf/ndisc.
	*/

	idev = __in6_dev_get(arg->dev);
	if (!idev)
		return 0;

	/* For administrative MTU increase, there is no way to discover
	   IPv6 PMTU increase, so PMTU increase should be updated here.
	   Since RFC 1981 doesn't include administrative MTU increase
	   update PMTU increase is a MUST. (i.e. jumbo frame)
	 */
	if (rt->fib6_nh.nh_dev == arg->dev &&
	    !fib6_metric_locked(rt, RTAX_MTU)) {
		u32 mtu = rt->fib6_pmtu;

		/* Only adopt the new MTU when the route tracked the old
		 * device MTU, or when it would otherwise exceed it.
		 */
		if (mtu >= arg->mtu ||
		    (mtu < arg->mtu && mtu == idev->cnf.mtu6))
			fib6_metric_set(rt, RTAX_MTU, arg->mtu);

		spin_lock_bh(&rt6_exception_lock);
		rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
		spin_unlock_bh(&rt6_exception_lock);
	}
	return 0;
}
4193
/* Walk the FIB and update route MTU metrics after @dev's MTU changed
 * to @mtu (see rt6_mtu_change_route()).
 */
void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
{
	struct rt6_mtu_change_arg arg = {
		.dev = dev,
		.mtu = mtu,
	};

	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
}
4203
/* Netlink attribute validation policy for RTM_{NEW,DEL,GET}ROUTE
 * messages; attributes not listed here are accepted unvalidated.
 */
static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
	[RTA_PREFSRC]		= { .len = sizeof(struct in6_addr) },
	[RTA_OIF]               = { .type = NLA_U32 },
	[RTA_IIF]		= { .type = NLA_U32 },
	[RTA_PRIORITY]          = { .type = NLA_U32 },
	[RTA_METRICS]           = { .type = NLA_NESTED },
	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
	[RTA_PREF]              = { .type = NLA_U8 },
	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
	[RTA_ENCAP]		= { .type = NLA_NESTED },
	[RTA_EXPIRES]		= { .type = NLA_U32 },
	[RTA_UID]		= { .type = NLA_U32 },
	[RTA_MARK]		= { .type = NLA_U32 },
	[RTA_TABLE]		= { .type = NLA_U32 },
	[RTA_IP_PROTO]		= { .type = NLA_U8 },
	[RTA_SPORT]		= { .type = NLA_U16 },
	[RTA_DPORT]		= { .type = NLA_U16 },
};
4223
4224 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
4225                               struct fib6_config *cfg,
4226                               struct netlink_ext_ack *extack)
4227 {
4228         struct rtmsg *rtm;
4229         struct nlattr *tb[RTA_MAX+1];
4230         unsigned int pref;
4231         int err;
4232
4233         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4234                           NULL);
4235         if (err < 0)
4236                 goto errout;
4237
4238         err = -EINVAL;
4239         rtm = nlmsg_data(nlh);
4240         memset(cfg, 0, sizeof(*cfg));
4241
4242         cfg->fc_table = rtm->rtm_table;
4243         cfg->fc_dst_len = rtm->rtm_dst_len;
4244         cfg->fc_src_len = rtm->rtm_src_len;
4245         cfg->fc_flags = RTF_UP;
4246         cfg->fc_protocol = rtm->rtm_protocol;
4247         cfg->fc_type = rtm->rtm_type;
4248
4249         if (rtm->rtm_type == RTN_UNREACHABLE ||
4250             rtm->rtm_type == RTN_BLACKHOLE ||
4251             rtm->rtm_type == RTN_PROHIBIT ||
4252             rtm->rtm_type == RTN_THROW)
4253                 cfg->fc_flags |= RTF_REJECT;
4254
4255         if (rtm->rtm_type == RTN_LOCAL)
4256                 cfg->fc_flags |= RTF_LOCAL;
4257
4258         if (rtm->rtm_flags & RTM_F_CLONED)
4259                 cfg->fc_flags |= RTF_CACHE;
4260
4261         cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
4262
4263         cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
4264         cfg->fc_nlinfo.nlh = nlh;
4265         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
4266
4267         if (tb[RTA_GATEWAY]) {
4268                 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
4269                 cfg->fc_flags |= RTF_GATEWAY;
4270         }
4271         if (tb[RTA_VIA]) {
4272                 NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute");
4273                 goto errout;
4274         }
4275
4276         if (tb[RTA_DST]) {
4277                 int plen = (rtm->rtm_dst_len + 7) >> 3;
4278
4279                 if (nla_len(tb[RTA_DST]) < plen)
4280                         goto errout;
4281
4282                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
4283         }
4284
4285         if (tb[RTA_SRC]) {
4286                 int plen = (rtm->rtm_src_len + 7) >> 3;
4287
4288                 if (nla_len(tb[RTA_SRC]) < plen)
4289                         goto errout;
4290
4291                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
4292         }
4293
4294         if (tb[RTA_PREFSRC])
4295                 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
4296
4297         if (tb[RTA_OIF])
4298                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
4299
4300         if (tb[RTA_PRIORITY])
4301                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
4302
4303         if (tb[RTA_METRICS]) {
4304                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
4305                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
4306         }
4307
4308         if (tb[RTA_TABLE])
4309                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
4310
4311         if (tb[RTA_MULTIPATH]) {
4312                 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
4313                 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
4314
4315                 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
4316                                                      cfg->fc_mp_len, extack);
4317                 if (err < 0)
4318                         goto errout;
4319         }
4320
4321         if (tb[RTA_PREF]) {
4322                 pref = nla_get_u8(tb[RTA_PREF]);
4323                 if (pref != ICMPV6_ROUTER_PREF_LOW &&
4324                     pref != ICMPV6_ROUTER_PREF_HIGH)
4325                         pref = ICMPV6_ROUTER_PREF_MEDIUM;
4326                 cfg->fc_flags |= RTF_PREF(pref);
4327         }
4328
4329         if (tb[RTA_ENCAP])
4330                 cfg->fc_encap = tb[RTA_ENCAP];
4331
4332         if (tb[RTA_ENCAP_TYPE]) {
4333                 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
4334
4335                 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
4336                 if (err < 0)
4337                         goto errout;
4338         }
4339
4340         if (tb[RTA_EXPIRES]) {
4341                 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
4342
4343                 if (addrconf_finite_timeout(timeout)) {
4344                         cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
4345                         cfg->fc_flags |= RTF_EXPIRES;
4346                 }
4347         }
4348
4349         err = 0;
4350 errout:
4351         return err;
4352 }
4353
4354 struct rt6_nh {
4355         struct fib6_info *fib6_info;
4356         struct fib6_config r_cfg;
4357         struct list_head next;
4358 };
4359
4360 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
4361 {
4362         struct rt6_nh *nh;
4363
4364         list_for_each_entry(nh, rt6_nh_list, next) {
4365                 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
4366                         &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
4367                         nh->r_cfg.fc_ifindex);
4368         }
4369 }
4370
4371 static int ip6_route_info_append(struct net *net,
4372                                  struct list_head *rt6_nh_list,
4373                                  struct fib6_info *rt,
4374                                  struct fib6_config *r_cfg)
4375 {
4376         struct rt6_nh *nh;
4377         int err = -EEXIST;
4378
4379         list_for_each_entry(nh, rt6_nh_list, next) {
4380                 /* check if fib6_info already exists */
4381                 if (rt6_duplicate_nexthop(nh->fib6_info, rt))
4382                         return err;
4383         }
4384
4385         nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4386         if (!nh)
4387                 return -ENOMEM;
4388         nh->fib6_info = rt;
4389         memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4390         list_add_tail(&nh->next, rt6_nh_list);
4391
4392         return 0;
4393 }
4394
4395 static void ip6_route_mpath_notify(struct fib6_info *rt,
4396                                    struct fib6_info *rt_last,
4397                                    struct nl_info *info,
4398                                    __u16 nlflags)
4399 {
4400         /* if this is an APPEND route, then rt points to the first route
4401          * inserted and rt_last points to last route inserted. Userspace
4402          * wants a consistent dump of the route which starts at the first
4403          * nexthop. Since sibling routes are always added at the end of
4404          * the list, find the first sibling of the last route appended
4405          */
4406         if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
4407                 rt = list_first_entry(&rt_last->fib6_siblings,
4408                                       struct fib6_info,
4409                                       fib6_siblings);
4410         }
4411
4412         if (rt)
4413                 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4414 }
4415
4416 static int fib6_gw_from_attr(struct in6_addr *gw, struct nlattr *nla,
4417                              struct netlink_ext_ack *extack)
4418 {
4419         if (nla_len(nla) < sizeof(*gw)) {
4420                 NL_SET_ERR_MSG(extack, "Invalid IPv6 address in RTA_GATEWAY");
4421                 return -EINVAL;
4422         }
4423
4424         *gw = nla_get_in6_addr(nla);
4425
4426         return 0;
4427 }
4428
4429 static int ip6_route_multipath_add(struct fib6_config *cfg,
4430                                    struct netlink_ext_ack *extack)
4431 {
4432         struct fib6_info *rt_notif = NULL, *rt_last = NULL;
4433         struct nl_info *info = &cfg->fc_nlinfo;
4434         struct fib6_config r_cfg;
4435         struct rtnexthop *rtnh;
4436         struct fib6_info *rt;
4437         struct rt6_nh *err_nh;
4438         struct rt6_nh *nh, *nh_safe;
4439         __u16 nlflags;
4440         int remaining;
4441         int attrlen;
4442         int err = 1;
4443         int nhn = 0;
4444         int replace = (cfg->fc_nlinfo.nlh &&
4445                        (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
4446         LIST_HEAD(rt6_nh_list);
4447
4448         nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
4449         if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
4450                 nlflags |= NLM_F_APPEND;
4451
4452         remaining = cfg->fc_mp_len;
4453         rtnh = (struct rtnexthop *)cfg->fc_mp;
4454
4455         /* Parse a Multipath Entry and build a list (rt6_nh_list) of
4456          * fib6_info structs per nexthop
4457          */
4458         while (rtnh_ok(rtnh, remaining)) {
4459                 memcpy(&r_cfg, cfg, sizeof(*cfg));
4460                 if (rtnh->rtnh_ifindex)
4461                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4462
4463                 attrlen = rtnh_attrlen(rtnh);
4464                 if (attrlen > 0) {
4465                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4466
4467                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4468                         if (nla) {
4469                                 err = fib6_gw_from_attr(&r_cfg.fc_gateway, nla,
4470                                                         extack);
4471                                 if (err)
4472                                         goto cleanup;
4473
4474                                 r_cfg.fc_flags |= RTF_GATEWAY;
4475                         }
4476                         r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
4477                         nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
4478                         if (nla)
4479                                 r_cfg.fc_encap_type = nla_get_u16(nla);
4480                 }
4481
4482                 r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
4483                 rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
4484                 if (IS_ERR(rt)) {
4485                         err = PTR_ERR(rt);
4486                         rt = NULL;
4487                         goto cleanup;
4488                 }
4489                 if (!rt6_qualify_for_ecmp(rt)) {
4490                         err = -EINVAL;
4491                         NL_SET_ERR_MSG(extack,
4492                                        "Device only routes can not be added for IPv6 using the multipath API.");
4493                         fib6_info_release(rt);
4494                         goto cleanup;
4495                 }
4496
4497                 rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1;
4498
4499                 err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
4500                                             rt, &r_cfg);
4501                 if (err) {
4502                         fib6_info_release(rt);
4503                         goto cleanup;
4504                 }
4505
4506                 rtnh = rtnh_next(rtnh, &remaining);
4507         }
4508
4509         /* for add and replace send one notification with all nexthops.
4510          * Skip the notification in fib6_add_rt2node and send one with
4511          * the full route when done
4512          */
4513         info->skip_notify = 1;
4514
4515         err_nh = NULL;
4516         list_for_each_entry(nh, &rt6_nh_list, next) {
4517                 err = __ip6_ins_rt(nh->fib6_info, info, extack);
4518                 fib6_info_release(nh->fib6_info);
4519
4520                 if (!err) {
4521                         /* save reference to last route successfully inserted */
4522                         rt_last = nh->fib6_info;
4523
4524                         /* save reference to first route for notification */
4525                         if (!rt_notif)
4526                                 rt_notif = nh->fib6_info;
4527                 }
4528
4529                 /* nh->fib6_info is used or freed at this point, reset to NULL*/
4530                 nh->fib6_info = NULL;
4531                 if (err) {
4532                         if (replace && nhn)
4533                                 ip6_print_replace_route_err(&rt6_nh_list);
4534                         err_nh = nh;
4535                         goto add_errout;
4536                 }
4537
4538                 /* Because each route is added like a single route we remove
4539                  * these flags after the first nexthop: if there is a collision,
4540                  * we have already failed to add the first nexthop:
4541                  * fib6_add_rt2node() has rejected it; when replacing, old
4542                  * nexthops have been replaced by first new, the rest should
4543                  * be added to it.
4544                  */
4545                 if (cfg->fc_nlinfo.nlh) {
4546                         cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
4547                                                              NLM_F_REPLACE);
4548                         cfg->fc_nlinfo.nlh->nlmsg_flags |= NLM_F_CREATE;
4549                 }
4550                 nhn++;
4551         }
4552
4553         /* success ... tell user about new route */
4554         ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4555         goto cleanup;
4556
4557 add_errout:
4558         /* send notification for routes that were added so that
4559          * the delete notifications sent by ip6_route_del are
4560          * coherent
4561          */
4562         if (rt_notif)
4563                 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4564
4565         /* Delete routes that were already added */
4566         list_for_each_entry(nh, &rt6_nh_list, next) {
4567                 if (err_nh == nh)
4568                         break;
4569                 ip6_route_del(&nh->r_cfg, extack);
4570         }
4571
4572 cleanup:
4573         list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
4574                 if (nh->fib6_info)
4575                         fib6_info_release(nh->fib6_info);
4576                 list_del(&nh->next);
4577                 kfree(nh);
4578         }
4579
4580         return err;
4581 }
4582
4583 static int ip6_route_multipath_del(struct fib6_config *cfg,
4584                                    struct netlink_ext_ack *extack)
4585 {
4586         struct fib6_config r_cfg;
4587         struct rtnexthop *rtnh;
4588         int remaining;
4589         int attrlen;
4590         int err = 1, last_err = 0;
4591
4592         remaining = cfg->fc_mp_len;
4593         rtnh = (struct rtnexthop *)cfg->fc_mp;
4594
4595         /* Parse a Multipath Entry */
4596         while (rtnh_ok(rtnh, remaining)) {
4597                 memcpy(&r_cfg, cfg, sizeof(*cfg));
4598                 if (rtnh->rtnh_ifindex)
4599                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4600
4601                 attrlen = rtnh_attrlen(rtnh);
4602                 if (attrlen > 0) {
4603                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4604
4605                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4606                         if (nla) {
4607                                 err = fib6_gw_from_attr(&r_cfg.fc_gateway, nla,
4608                                                         extack);
4609                                 if (err) {
4610                                         last_err = err;
4611                                         goto next_rtnh;
4612                                 }
4613
4614                                 r_cfg.fc_flags |= RTF_GATEWAY;
4615                         }
4616                 }
4617                 err = ip6_route_del(&r_cfg, extack);
4618                 if (err)
4619                         last_err = err;
4620
4621 next_rtnh:
4622                 rtnh = rtnh_next(rtnh, &remaining);
4623         }
4624
4625         return last_err;
4626 }
4627
4628 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4629                               struct netlink_ext_ack *extack)
4630 {
4631         struct fib6_config cfg;
4632         int err;
4633
4634         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4635         if (err < 0)
4636                 return err;
4637
4638         if (cfg.fc_mp)
4639                 return ip6_route_multipath_del(&cfg, extack);
4640         else {
4641                 cfg.fc_delete_all_nh = 1;
4642                 return ip6_route_del(&cfg, extack);
4643         }
4644 }
4645
4646 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4647                               struct netlink_ext_ack *extack)
4648 {
4649         struct fib6_config cfg;
4650         int err;
4651
4652         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4653         if (err < 0)
4654                 return err;
4655
4656         if (cfg.fc_mp)
4657                 return ip6_route_multipath_add(&cfg, extack);
4658         else
4659                 return ip6_route_add(&cfg, GFP_KERNEL, extack);
4660 }
4661
4662 static size_t rt6_nlmsg_size(struct fib6_info *rt)
4663 {
4664         int nexthop_len = 0;
4665
4666         if (rt->fib6_nsiblings) {
4667                 nexthop_len = nla_total_size(0)  /* RTA_MULTIPATH */
4668                             + NLA_ALIGN(sizeof(struct rtnexthop))
4669                             + nla_total_size(16) /* RTA_GATEWAY */
4670                             + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate);
4671
4672                 nexthop_len *= rt->fib6_nsiblings;
4673         }
4674
4675         return NLMSG_ALIGN(sizeof(struct rtmsg))
4676                + nla_total_size(16) /* RTA_SRC */
4677                + nla_total_size(16) /* RTA_DST */
4678                + nla_total_size(16) /* RTA_GATEWAY */
4679                + nla_total_size(16) /* RTA_PREFSRC */
4680                + nla_total_size(4) /* RTA_TABLE */
4681                + nla_total_size(4) /* RTA_IIF */
4682                + nla_total_size(4) /* RTA_OIF */
4683                + nla_total_size(4) /* RTA_PRIORITY */
4684                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4685                + nla_total_size(sizeof(struct rta_cacheinfo))
4686                + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4687                + nla_total_size(1) /* RTA_PREF */
4688                + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate)
4689                + nexthop_len;
4690 }
4691
4692 static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt,
4693                             unsigned int *flags, bool skip_oif)
4694 {
4695         if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
4696                 *flags |= RTNH_F_DEAD;
4697
4698         if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) {
4699                 *flags |= RTNH_F_LINKDOWN;
4700
4701                 rcu_read_lock();
4702                 if (fib6_ignore_linkdown(rt))
4703                         *flags |= RTNH_F_DEAD;
4704                 rcu_read_unlock();
4705         }
4706
4707         if (rt->fib6_flags & RTF_GATEWAY) {
4708                 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0)
4709                         goto nla_put_failure;
4710         }
4711
4712         *flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK);
4713         if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD)
4714                 *flags |= RTNH_F_OFFLOAD;
4715
4716         /* not needed for multipath encoding b/c it has a rtnexthop struct */
4717         if (!skip_oif && rt->fib6_nh.nh_dev &&
4718             nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex))
4719                 goto nla_put_failure;
4720
4721         if (rt->fib6_nh.nh_lwtstate &&
4722             lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0)
4723                 goto nla_put_failure;
4724
4725         return 0;
4726
4727 nla_put_failure:
4728         return -EMSGSIZE;
4729 }
4730
4731 /* add multipath next hop */
4732 static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt)
4733 {
4734         const struct net_device *dev = rt->fib6_nh.nh_dev;
4735         struct rtnexthop *rtnh;
4736         unsigned int flags = 0;
4737
4738         rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
4739         if (!rtnh)
4740                 goto nla_put_failure;
4741
4742         rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1;
4743         rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;
4744
4745         if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
4746                 goto nla_put_failure;
4747
4748         rtnh->rtnh_flags = flags;
4749
4750         /* length of rtnetlink header + attributes */
4751         rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
4752
4753         return 0;
4754
4755 nla_put_failure:
4756         return -EMSGSIZE;
4757 }
4758
4759 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
4760                          struct fib6_info *rt, struct dst_entry *dst,
4761                          struct in6_addr *dest, struct in6_addr *src,
4762                          int iif, int type, u32 portid, u32 seq,
4763                          unsigned int flags)
4764 {
4765         struct rt6_info *rt6 = (struct rt6_info *)dst;
4766         struct rt6key *rt6_dst, *rt6_src;
4767         u32 *pmetrics, table, rt6_flags;
4768         struct nlmsghdr *nlh;
4769         struct rtmsg *rtm;
4770         long expires = 0;
4771
4772         nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
4773         if (!nlh)
4774                 return -EMSGSIZE;
4775
4776         if (rt6) {
4777                 rt6_dst = &rt6->rt6i_dst;
4778                 rt6_src = &rt6->rt6i_src;
4779                 rt6_flags = rt6->rt6i_flags;
4780         } else {
4781                 rt6_dst = &rt->fib6_dst;
4782                 rt6_src = &rt->fib6_src;
4783                 rt6_flags = rt->fib6_flags;
4784         }
4785
4786         rtm = nlmsg_data(nlh);
4787         rtm->rtm_family = AF_INET6;
4788         rtm->rtm_dst_len = rt6_dst->plen;
4789         rtm->rtm_src_len = rt6_src->plen;
4790         rtm->rtm_tos = 0;
4791         if (rt->fib6_table)
4792                 table = rt->fib6_table->tb6_id;
4793         else
4794                 table = RT6_TABLE_UNSPEC;
4795         rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT;
4796         if (nla_put_u32(skb, RTA_TABLE, table))
4797                 goto nla_put_failure;
4798
4799         rtm->rtm_type = rt->fib6_type;
4800         rtm->rtm_flags = 0;
4801         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4802         rtm->rtm_protocol = rt->fib6_protocol;
4803
4804         if (rt6_flags & RTF_CACHE)
4805                 rtm->rtm_flags |= RTM_F_CLONED;
4806
4807         if (dest) {
4808                 if (nla_put_in6_addr(skb, RTA_DST, dest))
4809                         goto nla_put_failure;
4810                 rtm->rtm_dst_len = 128;
4811         } else if (rtm->rtm_dst_len)
4812                 if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr))
4813                         goto nla_put_failure;
4814 #ifdef CONFIG_IPV6_SUBTREES
4815         if (src) {
4816                 if (nla_put_in6_addr(skb, RTA_SRC, src))
4817                         goto nla_put_failure;
4818                 rtm->rtm_src_len = 128;
4819         } else if (rtm->rtm_src_len &&
4820                    nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr))
4821                 goto nla_put_failure;
4822 #endif
4823         if (iif) {
4824 #ifdef CONFIG_IPV6_MROUTE
4825                 if (ipv6_addr_is_multicast(&rt6_dst->addr)) {
4826                         int err = ip6mr_get_route(net, skb, rtm, portid);
4827
4828                         if (err == 0)
4829                                 return 0;
4830                         if (err < 0)
4831                                 goto nla_put_failure;
4832                 } else
4833 #endif
4834                         if (nla_put_u32(skb, RTA_IIF, iif))
4835                                 goto nla_put_failure;
4836         } else if (dest) {
4837                 struct in6_addr saddr_buf;
4838                 if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
4839                     nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4840                         goto nla_put_failure;
4841         }
4842
4843         if (rt->fib6_prefsrc.plen) {
4844                 struct in6_addr saddr_buf;
4845                 saddr_buf = rt->fib6_prefsrc.addr;
4846                 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4847                         goto nla_put_failure;
4848         }
4849
4850         pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
4851         if (rtnetlink_put_metrics(skb, pmetrics) < 0)
4852                 goto nla_put_failure;
4853
4854         if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
4855                 goto nla_put_failure;
4856
4857         /* For multipath routes, walk the siblings list and add
4858          * each as a nexthop within RTA_MULTIPATH.
4859          */
4860         if (rt6) {
4861                 if (rt6_flags & RTF_GATEWAY &&
4862                     nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
4863                         goto nla_put_failure;
4864
4865                 if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
4866                         goto nla_put_failure;
4867         } else if (rt->fib6_nsiblings) {
4868                 struct fib6_info *sibling, *next_sibling;
4869                 struct nlattr *mp;
4870
4871                 mp = nla_nest_start(skb, RTA_MULTIPATH);
4872                 if (!mp)
4873                         goto nla_put_failure;
4874
4875                 if (rt6_add_nexthop(skb, rt) < 0)
4876                         goto nla_put_failure;
4877
4878                 list_for_each_entry_safe(sibling, next_sibling,
4879                                          &rt->fib6_siblings, fib6_siblings) {
4880                         if (rt6_add_nexthop(skb, sibling) < 0)
4881                                 goto nla_put_failure;
4882                 }
4883
4884                 nla_nest_end(skb, mp);
4885         } else {
4886                 if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
4887                         goto nla_put_failure;
4888         }
4889
4890         if (rt6_flags & RTF_EXPIRES) {
4891                 expires = dst ? dst->expires : rt->expires;
4892                 expires -= jiffies;
4893         }
4894
4895         if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
4896                 goto nla_put_failure;
4897
4898         if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags)))
4899                 goto nla_put_failure;
4900
4901
4902         nlmsg_end(skb, nlh);
4903         return 0;
4904
4905 nla_put_failure:
4906         nlmsg_cancel(skb, nlh);
4907         return -EMSGSIZE;
4908 }
4909
4910 int rt6_dump_route(struct fib6_info *rt, void *p_arg)
4911 {
4912         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4913         struct net *net = arg->net;
4914
4915         if (rt == net->ipv6.fib6_null_entry)
4916                 return 0;
4917
4918         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
4919                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
4920
4921                 /* user wants prefix routes only */
4922                 if (rtm->rtm_flags & RTM_F_PREFIX &&
4923                     !(rt->fib6_flags & RTF_PREFIX_RT)) {
4924                         /* success since this is not a prefix route */
4925                         return 1;
4926                 }
4927         }
4928
4929         return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
4930                              RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
4931                              arg->cb->nlh->nlmsg_seq, NLM_F_MULTI);
4932 }
4933
4934 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4935                               struct netlink_ext_ack *extack)
4936 {
4937         struct net *net = sock_net(in_skb->sk);
4938         struct nlattr *tb[RTA_MAX+1];
4939         int err, iif = 0, oif = 0;
4940         struct fib6_info *from;
4941         struct dst_entry *dst;
4942         struct rt6_info *rt;
4943         struct sk_buff *skb;
4944         struct rtmsg *rtm;
4945         struct flowi6 fl6;
4946         bool fibmatch;
4947
4948         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4949                           extack);
4950         if (err < 0)
4951                 goto errout;
4952
4953         err = -EINVAL;
4954         memset(&fl6, 0, sizeof(fl6));
4955         rtm = nlmsg_data(nlh);
4956         fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4957         fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4958
4959         if (tb[RTA_SRC]) {
4960                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4961                         goto errout;
4962
4963                 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4964         }
4965
4966         if (tb[RTA_DST]) {
4967                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4968                         goto errout;
4969
4970                 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4971         }
4972
4973         if (tb[RTA_IIF])
4974                 iif = nla_get_u32(tb[RTA_IIF]);
4975
4976         if (tb[RTA_OIF])
4977                 oif = nla_get_u32(tb[RTA_OIF]);
4978
4979         if (tb[RTA_MARK])
4980                 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4981
4982         if (tb[RTA_UID])
4983                 fl6.flowi6_uid = make_kuid(current_user_ns(),
4984                                            nla_get_u32(tb[RTA_UID]));
4985         else
4986                 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
4987
4988         if (tb[RTA_SPORT])
4989                 fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);
4990
4991         if (tb[RTA_DPORT])
4992                 fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);
4993
4994         if (tb[RTA_IP_PROTO]) {
4995                 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
4996                                                   &fl6.flowi6_proto, AF_INET6,
4997                                                   extack);
4998                 if (err)
4999                         goto errout;
5000         }
5001
5002         if (iif) {
5003                 struct net_device *dev;
5004                 int flags = 0;
5005
5006                 rcu_read_lock();
5007
5008                 dev = dev_get_by_index_rcu(net, iif);
5009                 if (!dev) {
5010                         rcu_read_unlock();
5011                         err = -ENODEV;
5012                         goto errout;
5013                 }
5014
5015                 fl6.flowi6_iif = iif;
5016
5017                 if (!ipv6_addr_any(&fl6.saddr))
5018                         flags |= RT6_LOOKUP_F_HAS_SADDR;
5019
5020                 dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);
5021
5022                 rcu_read_unlock();
5023         } else {
5024                 fl6.flowi6_oif = oif;
5025
5026                 dst = ip6_route_output(net, NULL, &fl6);
5027         }
5028
5029
5030         rt = container_of(dst, struct rt6_info, dst);
5031         if (rt->dst.error) {
5032                 err = rt->dst.error;
5033                 ip6_rt_put(rt);
5034                 goto errout;
5035         }
5036
5037         if (rt == net->ipv6.ip6_null_entry) {
5038                 err = rt->dst.error;
5039                 ip6_rt_put(rt);
5040                 goto errout;
5041         }
5042
5043         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
5044         if (!skb) {
5045                 ip6_rt_put(rt);
5046                 err = -ENOBUFS;
5047                 goto errout;
5048         }
5049
5050         skb_dst_set(skb, &rt->dst);
5051
5052         rcu_read_lock();
5053         from = rcu_dereference(rt->from);
5054         if (from) {
5055                 if (fibmatch)
5056                         err = rt6_fill_node(net, skb, from, NULL, NULL, NULL,
5057                                             iif, RTM_NEWROUTE,
5058                                             NETLINK_CB(in_skb).portid,
5059                                             nlh->nlmsg_seq, 0);
5060                 else
5061                         err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
5062                                             &fl6.saddr, iif, RTM_NEWROUTE,
5063                                             NETLINK_CB(in_skb).portid,
5064                                             nlh->nlmsg_seq, 0);
5065         } else {
5066                 err = -ENETUNREACH;
5067         }
5068         rcu_read_unlock();
5069
5070         if (err < 0) {
5071                 kfree_skb(skb);
5072                 goto errout;
5073         }
5074
5075         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
5076 errout:
5077         return err;
5078 }
5079
5080 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
5081                      unsigned int nlm_flags)
5082 {
5083         struct sk_buff *skb;
5084         struct net *net = info->nl_net;
5085         u32 seq;
5086         int err;
5087
5088         err = -ENOBUFS;
5089         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
5090
5091         skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
5092         if (!skb)
5093                 goto errout;
5094
5095         err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
5096                             event, info->portid, seq, nlm_flags);
5097         if (err < 0) {
5098                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
5099                 WARN_ON(err == -EMSGSIZE);
5100                 kfree_skb(skb);
5101                 goto errout;
5102         }
5103         rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
5104                     info->nlh, gfp_any());
5105         return;
5106 errout:
5107         if (err < 0)
5108                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
5109 }
5110
5111 static int ip6_route_dev_notify(struct notifier_block *this,
5112                                 unsigned long event, void *ptr)
5113 {
5114         struct net_device *dev = netdev_notifier_info_to_dev(ptr);
5115         struct net *net = dev_net(dev);
5116
5117         if (!(dev->flags & IFF_LOOPBACK))
5118                 return NOTIFY_OK;
5119
5120         if (event == NETDEV_REGISTER) {
5121                 net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev;
5122                 net->ipv6.ip6_null_entry->dst.dev = dev;
5123                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
5124 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5125                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
5126                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
5127                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
5128                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
5129 #endif
5130          } else if (event == NETDEV_UNREGISTER &&
5131                     dev->reg_state != NETREG_UNREGISTERED) {
5132                 /* NETDEV_UNREGISTER could be fired for multiple times by
5133                  * netdev_wait_allrefs(). Make sure we only call this once.
5134                  */
5135                 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
5136 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5137                 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
5138                 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
5139 #endif
5140         }
5141
5142         return NOTIFY_OK;
5143 }
5144
5145 /*
5146  *      /proc
5147  */
5148
5149 #ifdef CONFIG_PROC_FS
5150 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
5151 {
5152         struct net *net = (struct net *)seq->private;
5153         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
5154                    net->ipv6.rt6_stats->fib_nodes,
5155                    net->ipv6.rt6_stats->fib_route_nodes,
5156                    atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
5157                    net->ipv6.rt6_stats->fib_rt_entries,
5158                    net->ipv6.rt6_stats->fib_rt_cache,
5159                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
5160                    net->ipv6.rt6_stats->fib_discarded_routes);
5161
5162         return 0;
5163 }
5164 #endif  /* CONFIG_PROC_FS */
5165
5166 #ifdef CONFIG_SYSCTL
5167
5168 static
5169 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
5170                               void __user *buffer, size_t *lenp, loff_t *ppos)
5171 {
5172         struct net *net;
5173         int delay;
5174         if (!write)
5175                 return -EINVAL;
5176
5177         net = (struct net *)ctl->extra1;
5178         delay = net->ipv6.sysctl.flush_delay;
5179         proc_dointvec(ctl, write, buffer, lenp, ppos);
5180         fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
5181         return 0;
5182 }
5183
/* Template for the per-netns /proc/sys/net/ipv6/route/ table.
 * ipv6_route_sysctl_init() kmemdup()s this array and rebinds each
 * entry's ->data by index, so the entry order here must stay in sync
 * with that function.  Entries using proc_dointvec_jiffies (and the
 * _ms variant) store their values in jiffies internally.
 */
struct ctl_table ipv6_route_table_template[] = {
	{
		/* Write-only trigger; handled by ipv6_sysctl_rtcache_flush. */
		.procname	=	"flush",
		.data		=	&init_net.ipv6.sysctl.flush_delay,
		.maxlen		=	sizeof(int),
		.mode		=	0200,
		.proc_handler	=	ipv6_sysctl_rtcache_flush
	},
	{
		.procname	=	"gc_thresh",
		.data		=	&ip6_dst_ops_template.gc_thresh,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"max_size",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"gc_min_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_timeout",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_elasticity",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"mtu_expires",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"min_adv_mss",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		/* Millisecond view of the same gc_min_interval value. */
		.procname	=	"gc_min_interval_ms",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_ms_jiffies,
	},
	{ }
};
5257
/* Clone the sysctl template for a new netns and point each entry's
 * ->data at that netns' copy of the tunable.  The indices below must
 * match the entry order of ipv6_route_table_template[].
 * Returns the duplicated table (caller registers and later frees it),
 * or NULL on allocation failure.
 */
struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
{
	struct ctl_table *table;

	table = kmemdup(ipv6_route_table_template,
			sizeof(ipv6_route_table_template),
			GFP_KERNEL);

	if (table) {
		table[0].data = &net->ipv6.sysctl.flush_delay;
		/* "flush" handler recovers the netns from ->extra1. */
		table[0].extra1 = net;
		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;

		/* Don't export sysctls to unprivileged users */
		/* NOTE(review): a NULL procname acts as the table
		 * terminator, so this appears to hide the whole table,
		 * not just "flush" — confirm that is the intent.
		 */
		if (net->user_ns != &init_user_ns)
			table[0].procname = NULL;
	}

	return table;
}
5286 #endif
5287
/* Per-netns initialisation of the IPv6 routing layer: copies the
 * dst_ops template, allocates the special route entries and seeds the
 * GC/sysctl tunables.  Returns 0 on success or -ENOMEM, unwinding any
 * partial allocations via the goto ladder at the bottom.
 */
static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	/* Each netns gets its own dst_ops copy so entry accounting and
	 * gc_thresh are independent per namespace.
	 */
	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	/* FIB-level "no route" sentinel. */
	net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
					    sizeof(*net->ipv6.fib6_null_entry),
					    GFP_KERNEL);
	if (!net->ipv6.fib6_null_entry)
		goto out_ip6_dst_entries;

	/* dst-level "no route" sentinel returned by failed lookups. */
	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_fib6_null_entry;
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	/* Policy-routing sentinels for prohibit/blackhole rules. */
	net->ipv6.fib6_has_custom_rules = false;
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
#endif

	/* Default tunables; the jiffies-based ones are exposed through
	 * the route sysctl table.
	 */
	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;

	/* Error unwinding: free in strict reverse order of allocation. */
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_fib6_null_entry:
	kfree(net->ipv6.fib6_null_entry);
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}
5362
/* Per-netns teardown: free the special route entries allocated by
 * ip6_route_net_init() and drop the dst entry accounting last.
 */
static void __net_exit ip6_route_net_exit(struct net *net)
{
	kfree(net->ipv6.fib6_null_entry);
	kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	kfree(net->ipv6.ip6_prohibit_entry);
	kfree(net->ipv6.ip6_blk_hole_entry);
#endif
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}
5373
5374 static int __net_init ip6_route_net_init_late(struct net *net)
5375 {
5376 #ifdef CONFIG_PROC_FS
5377         proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
5378                         sizeof(struct ipv6_route_iter));
5379         proc_create_net_single("rt6_stats", 0444, net->proc_net,
5380                         rt6_stats_seq_show, NULL);
5381 #endif
5382         return 0;
5383 }
5384
/* Late per-netns teardown: remove the /proc entries created by
 * ip6_route_net_init_late().
 */
static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ipv6_route", net->proc_net);
	remove_proc_entry("rt6_stats", net->proc_net);
#endif
}
5392
/* Per-netns setup/teardown of the core IPv6 routing state. */
static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};
5397
5398 static int __net_init ipv6_inetpeer_init(struct net *net)
5399 {
5400         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
5401
5402         if (!bp)
5403                 return -ENOMEM;
5404         inet_peer_base_init(bp);
5405         net->ipv6.peers = bp;
5406         return 0;
5407 }
5408
5409 static void __net_exit ipv6_inetpeer_exit(struct net *net)
5410 {
5411         struct inet_peer_base *bp = net->ipv6.peers;
5412
5413         net->ipv6.peers = NULL;
5414         inetpeer_invalidate_tree(bp);
5415         kfree(bp);
5416 }
5417
/* Per-netns lifetime of the IPv6 inet_peer base. */
static struct pernet_operations ipv6_inetpeer_ops = {
	.init	=	ipv6_inetpeer_init,
	.exit	=	ipv6_inetpeer_exit,
};
5422
/* Per-netns lifetime of the route-related /proc entries (runs after
 * the core routing state is set up).
 */
static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};
5427
/* Netdevice notifier keeping the special route entries bound to
 * loopback.  Priority is below ADDRCONF_NOTIFY_PRIORITY, presumably so
 * addrconf processes the event first — confirm against addrconf's
 * notifier ordering.
 */
static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};
5432
/* Bind init_net's special route entries to the loopback device.
 * Called once at boot, after loopback registration but before the
 * notifier would have seen it (see comment below).
 */
void __init ip6_route_init_special_entries(void)
{
	/* Registering of the loopback is done before this portion of code,
	 * the loopback reference in rt6_info will not be taken, do it
	 * manually for init_net */
	init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #ifdef CONFIG_IPV6_MULTIPLE_TABLES
	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #endif
}
5448
/* Boot-time initialisation of the IPv6 routing subsystem.  The setup
 * steps are order-dependent and each failure unwinds everything done
 * so far via the goto ladder at the bottom (strict reverse order).
 * Returns 0 on success or a negative errno.
 */
int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	/* Slab cache shared by all rt6_info allocations. */
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	/* Blackhole dsts come from the same cache as regular ones. */
	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	/* All three rtnetlink handlers share one unwind target: on any
	 * failure rtnl_unregister_all() removes whatever registered.
	 */
	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
				   inet6_rtm_newroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
				   inet6_rtm_delroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
				   inet6_rtm_getroute, NULL,
				   RTNL_FLAG_DOIT_UNLOCKED);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

	/* Per-CPU lists for uncached (DST_NOCACHE) routes. */
	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;

out_register_late_subsys:
	rtnl_unregister_all(PF_INET6);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}
5540
/* Module teardown: undo ip6_route_init() in strict reverse order of
 * its successful setup steps.
 */
void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}