net/ipv4/route.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              ROUTE - implementation of the IP router.
   7  *
   8  * Authors:     Ross Biro
   9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  13  *
  14  * Fixes:
  15  *              Alan Cox        :       Verify area fixes.
  16  *              Alan Cox        :       cli() protects routing changes
  17  *              Rui Oliveira    :       ICMP routing table updates
  18  *              (rco@di.uminho.pt)      Routing table insertion and update
  19  *              Linus Torvalds  :       Rewrote bits to be sensible
  20  *              Alan Cox        :       Added BSD route gw semantics
  21  *              Alan Cox        :       Super /proc >4K
  22  *              Alan Cox        :       MTU in route table
  23  *              Alan Cox        :       MSS actually. Also added the window
  24  *                                      clamper.
  25  *              Sam Lantinga    :       Fixed route matching in rt_del()
  26  *              Alan Cox        :       Routing cache support.
  27  *              Alan Cox        :       Removed compatibility cruft.
  28  *              Alan Cox        :       RTF_REJECT support.
  29  *              Alan Cox        :       TCP irtt support.
  30  *              Jonathan Naylor :       Added Metric support.
  31  *      Miquel van Smoorenburg  :       BSD API fixes.
  32  *      Miquel van Smoorenburg  :       Metrics.
  33  *              Alan Cox        :       Use __u32 properly
  34  *              Alan Cox        :       Aligned routing errors more closely with BSD
  35  *                                      our system is still very different.
  36  *              Alan Cox        :       Faster /proc handling
  37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  38  *                                      routing caches and better behaviour.
  39  *
  40  *              Olaf Erb        :       irtt wasn't being copied right.
  41  *              Bjorn Ekwall    :       Kerneld route support.
  42  *              Alan Cox        :       Multicast fixed (I hope)
  43  *              Pavel Krauz     :       Limited broadcast fixed
  44  *              Mike McLagan    :       Routing by source
  45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  46  *                                      route.c and rewritten from scratch.
  47  *              Andi Kleen      :       Load-limit warning messages.
  48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  52  *              Marc Boucher    :       routing by fwmark
  53  *      Robert Olsson           :       Added rt_cache statistics
  54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
  58  *
  59  *              This program is free software; you can redistribute it and/or
  60  *              modify it under the terms of the GNU General Public License
  61  *              as published by the Free Software Foundation; either version
  62  *              2 of the License, or (at your option) any later version.
  63  */
  64
  65 #define pr_fmt(fmt) "IPv4: " fmt
  66
  67 #include <linux/module.h>
  68 #include <linux/uaccess.h>
  69 #include <linux/bitops.h>
  70 #include <linux/types.h>
  71 #include <linux/kernel.h>
  72 #include <linux/mm.h>
  73 #include <linux/bootmem.h>
  74 #include <linux/string.h>
  75 #include <linux/socket.h>
  76 #include <linux/sockios.h>
  77 #include <linux/errno.h>
  78 #include <linux/in.h>
  79 #include <linux/inet.h>
  80 #include <linux/netdevice.h>
  81 #include <linux/proc_fs.h>
  82 #include <linux/init.h>
  83 #include <linux/skbuff.h>
  84 #include <linux/inetdevice.h>
  85 #include <linux/igmp.h>
  86 #include <linux/pkt_sched.h>
  87 #include <linux/mroute.h>
  88 #include <linux/netfilter_ipv4.h>
  89 #include <linux/random.h>
  90 #include <linux/rcupdate.h>
  91 #include <linux/times.h>
  92 #include <linux/slab.h>
  93 #include <linux/jhash.h>
  94 #include <net/dst.h>
  95 #include <net/dst_metadata.h>
  96 #include <net/net_namespace.h>
  97 #include <net/protocol.h>
  98 #include <net/ip.h>
  99 #include <net/route.h>
 100 #include <net/inetpeer.h>
 101 #include <net/sock.h>
 102 #include <net/ip_fib.h>
 103 #include <net/arp.h>
 104 #include <net/tcp.h>
 105 #include <net/icmp.h>
 106 #include <net/xfrm.h>
 107 #include <net/lwtunnel.h>
 108 #include <net/netevent.h>
 109 #include <net/rtnetlink.h>
 110 #ifdef CONFIG_SYSCTL
 111 #include <linux/sysctl.h>
 112 #include <linux/kmemleak.h>
 113 #endif
 114 #include <net/secure_seq.h>
 115 #include <net/ip_tunnels.h>
 116 #include <net/l3mdev.h>
 117
 118 #include "fib_lookup.h"
 119
 120 #define RT_FL_TOS(oldflp4) \
 121         ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
 122
 123 #define RT_GC_TIMEOUT (300*HZ)
 124
 125 static int ip_rt_max_size;
 126 static int ip_rt_redirect_number __read_mostly  = 9;
 127 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
 128 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
 129 static int ip_rt_error_cost __read_mostly       = HZ;
 130 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
 131 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
 132 static u32 ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
 133 static int ip_rt_min_advmss __read_mostly       = 256;
 134
 135 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
 136
 137 /*
 138  *      Interface to generic destination cache.
 139  */
 140
 141 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 142 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
 143 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
 144 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 145 static void              ipv4_link_failure(struct sk_buff *skb);
 146 static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 147                                            struct sk_buff *skb, u32 mtu,
 148                                            bool confirm_neigh);
 149 static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
 150                                         struct sk_buff *skb);
 151 static void             ipv4_dst_destroy(struct dst_entry *dst);
 152
 153 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 154 {
 155         WARN_ON(1);
 156         return NULL;
 157 }
 158
 159 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 160                                            struct sk_buff *skb,
 161                                            const void *daddr);
 162 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
 163
 164 static struct dst_ops ipv4_dst_ops = {
 165         .family =               AF_INET,
 166         .check =                ipv4_dst_check,
 167         .default_advmss =       ipv4_default_advmss,
 168         .mtu =                  ipv4_mtu,
 169         .cow_metrics =          ipv4_cow_metrics,
 170         .destroy =              ipv4_dst_destroy,
 171         .negative_advice =      ipv4_negative_advice,
 172         .link_failure =         ipv4_link_failure,
 173         .update_pmtu =          ip_rt_update_pmtu,
 174         .redirect =             ip_do_redirect,
 175         .local_out =            __ip_local_out,
 176         .neigh_lookup =         ipv4_neigh_lookup,
 177         .confirm_neigh =        ipv4_confirm_neigh,
 178 };
 179
 180 #define ECN_OR_COST(class)      TC_PRIO_##class
 181
 182 const __u8 ip_tos2prio[16] = {
 183         TC_PRIO_BESTEFFORT,
 184         ECN_OR_COST(BESTEFFORT),
 185         TC_PRIO_BESTEFFORT,
 186         ECN_OR_COST(BESTEFFORT),
 187         TC_PRIO_BULK,
 188         ECN_OR_COST(BULK),
 189         TC_PRIO_BULK,
 190         ECN_OR_COST(BULK),
 191         TC_PRIO_INTERACTIVE,
 192         ECN_OR_COST(INTERACTIVE),
 193         TC_PRIO_INTERACTIVE,
 194         ECN_OR_COST(INTERACTIVE),
 195         TC_PRIO_INTERACTIVE_BULK,
 196         ECN_OR_COST(INTERACTIVE_BULK),
 197         TC_PRIO_INTERACTIVE_BULK,
 198         ECN_OR_COST(INTERACTIVE_BULK)
 199 };
 200 EXPORT_SYMBOL(ip_tos2prio);
 201
 202 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 203 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
 204
 205 #ifdef CONFIG_PROC_FS
 206 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 207 {
 208         if (*pos)
 209                 return NULL;
 210         return SEQ_START_TOKEN;
 211 }
 212
 213 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 214 {
 215         ++*pos;
 216         return NULL;
 217 }
 218
 219 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 220 {
 221 }
 222
 223 static int rt_cache_seq_show(struct seq_file *seq, void *v)
 224 {
 225         if (v == SEQ_START_TOKEN)
 226                 seq_printf(seq, "%-127s\n",
 227                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 228                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 229                            "HHUptod\tSpecDst");
 230         return 0;
 231 }
 232
 233 static const struct seq_operations rt_cache_seq_ops = {
 234         .start  = rt_cache_seq_start,
 235         .next   = rt_cache_seq_next,
 236         .stop   = rt_cache_seq_stop,
 237         .show   = rt_cache_seq_show,
 238 };
 239
 240 static int rt_cache_seq_open(struct inode *inode, struct file *file)
 241 {
 242         return seq_open(file, &rt_cache_seq_ops);
 243 }
 244
 245 static const struct file_operations rt_cache_seq_fops = {
 246         .owner   = THIS_MODULE,
 247         .open    = rt_cache_seq_open,
 248         .read    = seq_read,
 249         .llseek  = seq_lseek,
 250         .release = seq_release,
 251 };
 252
 253
 254 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 255 {
 256         int cpu;
 257
 258         if (*pos == 0)
 259                 return SEQ_START_TOKEN;
 260
 261         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 262                 if (!cpu_possible(cpu))
 263                         continue;
 264                 *pos = cpu+1;
 265                 return &per_cpu(rt_cache_stat, cpu);
 266         }
 267         return NULL;
 268 }
 269
 270 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 271 {
 272         int cpu;
 273
 274         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 275                 if (!cpu_possible(cpu))
 276                         continue;
 277                 *pos = cpu+1;
 278                 return &per_cpu(rt_cache_stat, cpu);
 279         }
 280         (*pos)++;
 281         return NULL;
 282
 283 }
 284
 285 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 286 {
 287
 288 }
 289
 290 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 291 {
 292         struct rt_cache_stat *st = v;
 293
 294         if (v == SEQ_START_TOKEN) {
 295                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 296                 return 0;
 297         }
 298
 299         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 300                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 301                    dst_entries_get_slow(&ipv4_dst_ops),
 302                    0, /* st->in_hit */
 303                    st->in_slow_tot,
 304                    st->in_slow_mc,
 305                    st->in_no_route,
 306                    st->in_brd,
 307                    st->in_martian_dst,
 308                    st->in_martian_src,
 309
 310                    0, /* st->out_hit */
 311                    st->out_slow_tot,
 312                    st->out_slow_mc,
 313
 314                    0, /* st->gc_total */
 315                    0, /* st->gc_ignored */
 316                    0, /* st->gc_goal_miss */
 317                    0, /* st->gc_dst_overflow */
 318                    0, /* st->in_hlist_search */
 319                    0  /* st->out_hlist_search */
 320                 );
 321         return 0;
 322 }
 323
 324 static const struct seq_operations rt_cpu_seq_ops = {
 325         .start  = rt_cpu_seq_start,
 326         .next   = rt_cpu_seq_next,
 327         .stop   = rt_cpu_seq_stop,
 328         .show   = rt_cpu_seq_show,
 329 };
 330
 331
 332 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 333 {
 334         return seq_open(file, &rt_cpu_seq_ops);
 335 }
 336
 337 static const struct file_operations rt_cpu_seq_fops = {
 338         .owner   = THIS_MODULE,
 339         .open    = rt_cpu_seq_open,
 340         .read    = seq_read,
 341         .llseek  = seq_lseek,
 342         .release = seq_release,
 343 };
 344
 345 #ifdef CONFIG_IP_ROUTE_CLASSID
 346 static int rt_acct_proc_show(struct seq_file *m, void *v)
 347 {
 348         struct ip_rt_acct *dst, *src;
 349         unsigned int i, j;
 350
 351         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 352         if (!dst)
 353                 return -ENOMEM;
 354
 355         for_each_possible_cpu(i) {
 356                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 357                 for (j = 0; j < 256; j++) {
 358                         dst[j].o_bytes   += src[j].o_bytes;
 359                         dst[j].o_packets += src[j].o_packets;
 360                         dst[j].i_bytes   += src[j].i_bytes;
 361                         dst[j].i_packets += src[j].i_packets;
 362                 }
 363         }
 364
 365         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 366         kfree(dst);
 367         return 0;
 368 }
 369
 370 static int rt_acct_proc_open(struct inode *inode, struct file *file)
 371 {
 372         return single_open(file, rt_acct_proc_show, NULL);
 373 }
 374
 375 static const struct file_operations rt_acct_proc_fops = {
 376         .owner          = THIS_MODULE,
 377         .open           = rt_acct_proc_open,
 378         .read           = seq_read,
 379         .llseek         = seq_lseek,
 380         .release        = single_release,
 381 };
 382 #endif
 383
 384 static int __net_init ip_rt_do_proc_init(struct net *net)
 385 {
 386         struct proc_dir_entry *pde;
 387
 388         pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
 389                           &rt_cache_seq_fops);
 390         if (!pde)
 391                 goto err1;
 392
 393         pde = proc_create("rt_cache", S_IRUGO,
 394                           net->proc_net_stat, &rt_cpu_seq_fops);
 395         if (!pde)
 396                 goto err2;
 397
 398 #ifdef CONFIG_IP_ROUTE_CLASSID
 399         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
 400         if (!pde)
 401                 goto err3;
 402 #endif
 403         return 0;
 404
 405 #ifdef CONFIG_IP_ROUTE_CLASSID
 406 err3:
 407         remove_proc_entry("rt_cache", net->proc_net_stat);
 408 #endif
 409 err2:
 410         remove_proc_entry("rt_cache", net->proc_net);
 411 err1:
 412         return -ENOMEM;
 413 }
 414
 415 static void __net_exit ip_rt_do_proc_exit(struct net *net)
 416 {
 417         remove_proc_entry("rt_cache", net->proc_net_stat);
 418         remove_proc_entry("rt_cache", net->proc_net);
 419 #ifdef CONFIG_IP_ROUTE_CLASSID
 420         remove_proc_entry("rt_acct", net->proc_net);
 421 #endif
 422 }
 423
 424 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 425         .init = ip_rt_do_proc_init,
 426         .exit = ip_rt_do_proc_exit,
 427 };
 428
 429 static int __init ip_rt_proc_init(void)
 430 {
 431         return register_pernet_subsys(&ip_rt_proc_ops);
 432 }
 433
 434 #else
 435 static inline int ip_rt_proc_init(void)
 436 {
 437         return 0;
 438 }
 439 #endif /* CONFIG_PROC_FS */
 440
 441 static inline bool rt_is_expired(const struct rtable *rth)
 442 {
 443         return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
 444 }
 445
 446 void rt_cache_flush(struct net *net)
 447 {
 448         rt_genid_bump_ipv4(net);
 449 }
 450
 451 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 452                                            struct sk_buff *skb,
 453                                            const void *daddr)
 454 {
 455         struct net_device *dev = dst->dev;
 456         const __be32 *pkey = daddr;
 457         const struct rtable *rt;
 458         struct neighbour *n;
 459
 460         rt = (const struct rtable *) dst;
 461         if (rt->rt_gateway)
 462                 pkey = (const __be32 *) &rt->rt_gateway;
 463         else if (skb)
 464                 pkey = &ip_hdr(skb)->daddr;
 465
 466         n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
 467         if (n)
 468                 return n;
 469         return neigh_create(&arp_tbl, pkey, dev);
 470 }
 471
 472 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
 473 {
 474         struct net_device *dev = dst->dev;
 475         const __be32 *pkey = daddr;
 476         const struct rtable *rt;
 477
 478         rt = (const struct rtable *)dst;
 479         if (rt->rt_gateway)
 480                 pkey = (const __be32 *)&rt->rt_gateway;
 481         else if (!daddr ||
 482                  (rt->rt_flags &
 483                   (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL)))
 484                 return;
 485
 486         __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
 487 }
 488
 489 /* Hash tables of size 2048..262144 depending on RAM size.
 490  * Each bucket uses 8 bytes.
 491  */
 492 static u32 ip_idents_mask __read_mostly;
 493 static atomic_t *ip_idents __read_mostly;
 494 static u32 *ip_tstamps __read_mostly;
 495
 496 /* In order to protect privacy, we add a perturbation to identifiers
 497  * if one generator is seldom used. This makes hard for an attacker
 498  * to infer how many packets were sent between two points in time.
 499  */
 500 u32 ip_idents_reserve(u32 hash, int segs)
 501 {
 502         u32 bucket, old, now = (u32)jiffies;
 503         atomic_t *p_id;
 504         u32 *p_tstamp;
 505         u32 delta = 0;
 506
 507         bucket = hash & ip_idents_mask;
 508         p_tstamp = ip_tstamps + bucket;
 509         p_id = ip_idents + bucket;
 510         old = ACCESS_ONCE(*p_tstamp);
 511
 512         if (old != now && cmpxchg(p_tstamp, old, now) == old)
 513                 delta = prandom_u32_max(now - old);
 514
 515         /* If UBSAN reports an error there, please make sure your compiler
 516          * supports -fno-strict-overflow before reporting it that was a bug
 517          * in UBSAN, and it has been fixed in GCC-8.
 518          */
 519         return atomic_add_return(segs + delta, p_id) - segs;
 520 }
 521 EXPORT_SYMBOL(ip_idents_reserve);
 522
 523 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
 524 {
 525         u32 hash, id;
 526
 527         /* Note the following code is not safe, but this is okay. */
 528         if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
 529                 get_random_bytes(&net->ipv4.ip_id_key,
 530                                  sizeof(net->ipv4.ip_id_key));
 531
 532         hash = siphash_3u32((__force u32)iph->daddr,
 533                             (__force u32)iph->saddr,
 534                             iph->protocol,
 535                             &net->ipv4.ip_id_key);
 536         id = ip_idents_reserve(hash, segs);
 537         iph->id = htons(id);
 538 }
 539 EXPORT_SYMBOL(__ip_select_ident);
 540
 541 static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
 542                              const struct sock *sk,
 543                              const struct iphdr *iph,
 544                              int oif, u8 tos,
 545                              u8 prot, u32 mark, int flow_flags)
 546 {
 547         if (sk) {
 548                 const struct inet_sock *inet = inet_sk(sk);
 549
 550                 oif = sk->sk_bound_dev_if;
 551                 mark = sk->sk_mark;
 552                 tos = RT_CONN_FLAGS(sk);
 553                 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
 554         }
 555         flowi4_init_output(fl4, oif, mark, tos,
 556                            RT_SCOPE_UNIVERSE, prot,
 557                            flow_flags,
 558                            iph->daddr, iph->saddr, 0, 0,
 559                            sock_net_uid(net, sk));
 560 }
 561
 562 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
 563                                const struct sock *sk)
 564 {
 565         const struct net *net = dev_net(skb->dev);
 566         const struct iphdr *iph = ip_hdr(skb);
 567         int oif = skb->dev->ifindex;
 568         u8 tos = RT_TOS(iph->tos);
 569         u8 prot = iph->protocol;
 570         u32 mark = skb->mark;
 571
 572         __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
 573 }
 574
 575 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
 576 {
 577         const struct inet_sock *inet = inet_sk(sk);
 578         const struct ip_options_rcu *inet_opt;
 579         __be32 daddr = inet->inet_daddr;
 580
 581         rcu_read_lock();
 582         inet_opt = rcu_dereference(inet->inet_opt);
 583         if (inet_opt && inet_opt->opt.srr)
 584                 daddr = inet_opt->opt.faddr;
 585         flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
 586                            RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
 587                            inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
 588                            inet_sk_flowi_flags(sk),
 589                            daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
 590         rcu_read_unlock();
 591 }
 592
 593 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
 594                                  const struct sk_buff *skb)
 595 {
 596         if (skb)
 597                 build_skb_flow_key(fl4, skb, sk);
 598         else
 599                 build_sk_flow_key(fl4, sk);
 600 }
 601
 602 static DEFINE_SPINLOCK(fnhe_lock);
 603
 604 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
 605 {
 606         struct rtable *rt;
 607
 608         rt = rcu_dereference(fnhe->fnhe_rth_input);
 609         if (rt) {
 610                 RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
 611                 dst_dev_put(&rt->dst);
 612                 dst_release(&rt->dst);
 613         }
 614         rt = rcu_dereference(fnhe->fnhe_rth_output);
 615         if (rt) {
 616                 RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
 617                 dst_dev_put(&rt->dst);
 618                 dst_release(&rt->dst);
 619         }
 620 }
 621
 622 static void fnhe_remove_oldest(struct fnhe_hash_bucket *hash)
 623 {
 624         struct fib_nh_exception __rcu **fnhe_p, **oldest_p;
 625         struct fib_nh_exception *fnhe, *oldest = NULL;
 626
 627         for (fnhe_p = &hash->chain; ; fnhe_p = &fnhe->fnhe_next) {
 628                 fnhe = rcu_dereference_protected(*fnhe_p,
 629                                                  lockdep_is_held(&fnhe_lock));
 630                 if (!fnhe)
 631                         break;
 632                 if (!oldest ||
 633                     time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) {
 634                         oldest = fnhe;
 635                         oldest_p = fnhe_p;
 636                 }
 637         }
 638         fnhe_flush_routes(oldest);
 639         *oldest_p = oldest->fnhe_next;
 640         kfree_rcu(oldest, rcu);
 641 }
 642
 643 static u32 fnhe_hashfun(__be32 daddr)
 644 {
 645         static siphash_key_t fnhe_hash_key __read_mostly;
 646         u64 hval;
 647
 648         net_get_random_once(&fnhe_hash_key, sizeof(fnhe_hash_key));
 649         hval = siphash_1u32((__force u32)daddr, &fnhe_hash_key);
 650         return hash_64(hval, FNHE_HASH_SHIFT);
 651 }
 652
 653 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
 654 {
 655         rt->rt_pmtu = fnhe->fnhe_pmtu;
 656         rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
 657         rt->dst.expires = fnhe->fnhe_expires;
 658
 659         if (fnhe->fnhe_gw) {
 660                 rt->rt_flags |= RTCF_REDIRECTED;
 661                 rt->rt_gateway = fnhe->fnhe_gw;
 662                 rt->rt_uses_gateway = 1;
 663         }
 664 }
 665
 666 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
 667                                   u32 pmtu, bool lock, unsigned long expires)
 668 {
 669         struct fnhe_hash_bucket *hash;
 670         struct fib_nh_exception *fnhe;
 671         struct rtable *rt;
 672         u32 genid, hval;
 673         unsigned int i;
 674         int depth;
 675
 676         genid = fnhe_genid(dev_net(nh->nh_dev));
 677         hval = fnhe_hashfun(daddr);
 678
 679         spin_lock_bh(&fnhe_lock);
 680
 681         hash = rcu_dereference(nh->nh_exceptions);
 682         if (!hash) {
 683                 hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
 684                 if (!hash)
 685                         goto out_unlock;
 686                 rcu_assign_pointer(nh->nh_exceptions, hash);
 687         }
 688
 689         hash += hval;
 690
 691         depth = 0;
 692         for (fnhe = rcu_dereference(hash->chain); fnhe;
 693              fnhe = rcu_dereference(fnhe->fnhe_next)) {
 694                 if (fnhe->fnhe_daddr == daddr)
 695                         break;
 696                 depth++;
 697         }
 698
 699         if (fnhe) {
 700                 if (fnhe->fnhe_genid != genid)
 701                         fnhe->fnhe_genid = genid;
 702                 if (gw)
 703                         fnhe->fnhe_gw = gw;
 704                 if (pmtu) {
 705                         fnhe->fnhe_pmtu = pmtu;
 706                         fnhe->fnhe_mtu_locked = lock;
 707                 }
 708                 fnhe->fnhe_expires = max(1UL, expires);
 709                 /* Update all cached dsts too */
 710                 rt = rcu_dereference(fnhe->fnhe_rth_input);
 711                 if (rt)
 712                         fill_route_from_fnhe(rt, fnhe);
 713                 rt = rcu_dereference(fnhe->fnhe_rth_output);
 714                 if (rt)
 715                         fill_route_from_fnhe(rt, fnhe);
 716         } else {
 717                 /* Randomize max depth to avoid some side channels attacks. */
 718                 int max_depth = FNHE_RECLAIM_DEPTH +
 719                                 prandom_u32_max(FNHE_RECLAIM_DEPTH);
 720
 721                 while (depth > max_depth) {
 722                         fnhe_remove_oldest(hash);
 723                         depth--;
 724                 }
 725
 726                 fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
 727                 if (!fnhe)
 728                         goto out_unlock;
 729
 730                 fnhe->fnhe_next = hash->chain;
 731
 732                 fnhe->fnhe_genid = genid;
 733                 fnhe->fnhe_daddr = daddr;
 734                 fnhe->fnhe_gw = gw;
 735                 fnhe->fnhe_pmtu = pmtu;
 736                 fnhe->fnhe_mtu_locked = lock;
 737                 fnhe->fnhe_expires = max(1UL, expires);
 738
 739                 rcu_assign_pointer(hash->chain, fnhe);
 740
 741                 /* Exception created; mark the cached routes for the nexthop
 742                  * stale, so anyone caching it rechecks if this exception
 743                  * applies to them.
 744                  */
 745                 rt = rcu_dereference(nh->nh_rth_input);
 746                 if (rt)
 747                         rt->dst.obsolete = DST_OBSOLETE_KILL;
 748
 749                 for_each_possible_cpu(i) {
 750                         struct rtable __rcu **prt;
 751                         prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
 752                         rt = rcu_dereference(*prt);
 753                         if (rt)
 754                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
 755                 }
 756         }
 757
 758         fnhe->fnhe_stamp = jiffies;
 759
 760 out_unlock:
 761         spin_unlock_bh(&fnhe_lock);
 762 }
 763
 764 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
 765                              bool kill_route)
 766 {
 767         __be32 new_gw = icmp_hdr(skb)->un.gateway;
 768         __be32 old_gw = ip_hdr(skb)->saddr;
 769         struct net_device *dev = skb->dev;
 770         struct in_device *in_dev;
 771         struct fib_result res;
 772         struct neighbour *n;
 773         struct net *net;
 774
 775         switch (icmp_hdr(skb)->code & 7) {
 776         case ICMP_REDIR_NET:
 777         case ICMP_REDIR_NETTOS:
 778         case ICMP_REDIR_HOST:
 779         case ICMP_REDIR_HOSTTOS:
 780                 break;
 781
 782         default:
 783                 return;
 784         }
 785
 786         if (rt->rt_gateway != old_gw)
 787                 return;
 788
 789         in_dev = __in_dev_get_rcu(dev);
 790         if (!in_dev)
 791                 return;
 792
 793         net = dev_net(dev);
 794         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
 795             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
 796             ipv4_is_zeronet(new_gw))
 797                 goto reject_redirect;
 798
 799         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
 800                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
 801                         goto reject_redirect;
 802                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
 803                         goto reject_redirect;
 804         } else {
 805                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
 806                         goto reject_redirect;
 807         }
 808
 809         n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
 810         if (!n)
 811                 n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
 812         if (!IS_ERR(n)) {
 813                 if (!(n->nud_state & NUD_VALID)) {
 814                         neigh_event_send(n, NULL);
 815                 } else {
 816                         if (fib_lookup(net, fl4, &res, 0) == 0) {
 817                                 struct fib_nh *nh = &FIB_RES_NH(res);
 818
 819                                 fib_select_path(net, &res, fl4, skb);
 820                                 nh = &FIB_RES_NH(res);
 821                                 update_or_create_fnhe(nh, fl4->daddr, new_gw,
 822                                                 0, false,
 823                                                 jiffies + ip_rt_gc_timeout);
 824                         }
 825                         if (kill_route)
 826                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
 827                         call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
 828                 }
 829                 neigh_release(n);
 830         }
 831         return;
 832
 833 reject_redirect:
 834 #ifdef CONFIG_IP_ROUTE_VERBOSE
 835         if (IN_DEV_LOG_MARTIANS(in_dev)) {
 836                 const struct iphdr *iph = (const struct iphdr *) skb->data;
 837                 __be32 daddr = iph->daddr;
 838                 __be32 saddr = iph->saddr;
 839
 840                 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
 841                                      "  Advised path = %pI4 -> %pI4\n",
 842                                      &old_gw, dev->name, &new_gw,
 843                                      &saddr, &daddr);
 844         }
 845 #endif
 846         ;
 847 }
 848
 849 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
 850 {
 851         struct rtable *rt;
 852         struct flowi4 fl4;
 853         const struct iphdr *iph = (const struct iphdr *) skb->data;
 854         struct net *net = dev_net(skb->dev);
 855         int oif = skb->dev->ifindex;
 856         u8 tos = RT_TOS(iph->tos);
 857         u8 prot = iph->protocol;
 858         u32 mark = skb->mark;
 859
 860         rt = (struct rtable *) dst;
 861
 862         __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
 863         __ip_do_redirect(rt, skb, &fl4, true);
 864 }
 865
 866 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 867 {
 868         struct rtable *rt = (struct rtable *)dst;
 869         struct dst_entry *ret = dst;
 870
 871         if (rt) {
 872                 if (dst->obsolete > 0) {
 873                         ip_rt_put(rt);
 874                         ret = NULL;
 875                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
 876                            rt->dst.expires) {
 877                         ip_rt_put(rt);
 878                         ret = NULL;
 879                 }
 880         }
 881         return ret;
 882 }
 883
 884 /*
 885  * Algorithm:
 886  *      1. The first ip_rt_redirect_number redirects are sent
 887  *         with exponential backoff, then we stop sending them at all,
 888  *         assuming that the host ignores our redirects.
 889  *      2. If we did not see packets requiring redirects
 890  *         during ip_rt_redirect_silence, we assume that the host
 891  *         forgot redirected route and start to send redirects again.
 892  *
 893  * This algorithm is much cheaper and more intelligent than dumb load limiting
 894  * in icmp.c.
 895  *
 896  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 897  * and "frag. need" (breaks PMTU discovery) in icmp.c.
 898  */
 899
 900 void ip_rt_send_redirect(struct sk_buff *skb)
 901 {
 902         struct rtable *rt = skb_rtable(skb);
 903         struct in_device *in_dev;
 904         struct inet_peer *peer;
 905         struct net *net;
 906         int log_martians;
 907         int vif;
 908
 909         rcu_read_lock();
 910         in_dev = __in_dev_get_rcu(rt->dst.dev);
 911         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
 912                 rcu_read_unlock();
 913                 return;
 914         }
 915         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
 916         vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
 917         rcu_read_unlock();
 918
 919         net = dev_net(rt->dst.dev);
 920         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
 921         if (!peer) {
 922                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
 923                           rt_nexthop(rt, ip_hdr(skb)->daddr));
 924                 return;
 925         }
 926
 927         /* No redirected packets during ip_rt_redirect_silence;
 928          * reset the algorithm.
 929          */
 930         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
 931                 peer->rate_tokens = 0;
 932                 peer->n_redirects = 0;
 933         }
 934
 935         /* Too many ignored redirects; do not send anything
 936          * set dst.rate_last to the last seen redirected packet.
 937          */
 938         if (peer->n_redirects >= ip_rt_redirect_number) {
 939                 peer->rate_last = jiffies;
 940                 goto out_put_peer;
 941         }
 942
 943         /* Check for load limit; set rate_last to the latest sent
 944          * redirect.
 945          */
 946         if (peer->n_redirects == 0 ||
 947             time_after(jiffies,
 948                        (peer->rate_last +
 949                         (ip_rt_redirect_load << peer->n_redirects)))) {
 950                 __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
 951
 952                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
 953                 peer->rate_last = jiffies;
 954                 ++peer->n_redirects;
 955 #ifdef CONFIG_IP_ROUTE_VERBOSE
 956                 if (log_martians &&
 957                     peer->n_redirects == ip_rt_redirect_number)
 958                         net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
 959                                              &ip_hdr(skb)->saddr, inet_iif(skb),
 960                                              &ip_hdr(skb)->daddr, &gw);
 961 #endif
 962         }
 963 out_put_peer:
 964         inet_putpeer(peer);
 965 }
 966
 967 static int ip_error(struct sk_buff *skb)
 968 {
 969         struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
 970         struct rtable *rt = skb_rtable(skb);
 971         struct inet_peer *peer;
 972         unsigned long now;
 973         struct net *net;
 974         bool send;
 975         int code;
 976
 977         /* IP on this device is disabled. */
 978         if (!in_dev)
 979                 goto out;
 980
 981         net = dev_net(rt->dst.dev);
 982         if (!IN_DEV_FORWARD(in_dev)) {
 983                 switch (rt->dst.error) {
 984                 case EHOSTUNREACH:
 985                         __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
 986                         break;
 987
 988                 case ENETUNREACH:
 989                         __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
 990                         break;
 991                 }
 992                 goto out;
 993         }
 994
 995         switch (rt->dst.error) {
 996         case EINVAL:
 997         default:
 998                 goto out;
 999         case EHOSTUNREACH:
1000                 code = ICMP_HOST_UNREACH;
1001                 break;
1002         case ENETUNREACH:
1003                 code = ICMP_NET_UNREACH;
1004                 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
1005                 break;
1006         case EACCES:
1007                 code = ICMP_PKT_FILTERED;
1008                 break;
1009         }
1010
1011         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
1012                                l3mdev_master_ifindex(skb->dev), 1);
1013
1014         send = true;
1015         if (peer) {
1016                 now = jiffies;
1017                 peer->rate_tokens += now - peer->rate_last;
1018                 if (peer->rate_tokens > ip_rt_error_burst)
1019                         peer->rate_tokens = ip_rt_error_burst;
1020                 peer->rate_last = now;
1021                 if (peer->rate_tokens >= ip_rt_error_cost)
1022                         peer->rate_tokens -= ip_rt_error_cost;
1023                 else
1024                         send = false;
1025                 inet_putpeer(peer);
1026         }
1027         if (send)
1028                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1029
1030 out:    kfree_skb(skb);
1031         return 0;
1032 }
1033
1034 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1035 {
1036         struct dst_entry *dst = &rt->dst;
1037         struct net *net = dev_net(dst->dev);
1038         u32 old_mtu = ipv4_mtu(dst);
1039         struct fib_result res;
1040         bool lock = false;
1041
1042         if (ip_mtu_locked(dst))
1043                 return;
1044
1045         if (old_mtu < mtu)
1046                 return;
1047
1048         if (mtu < ip_rt_min_pmtu) {
1049                 lock = true;
1050                 mtu = min(old_mtu, ip_rt_min_pmtu);
1051         }
1052
1053         if (rt->rt_pmtu == mtu && !lock &&
1054             time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1055                 return;
1056
1057         rcu_read_lock();
1058         if (fib_lookup(net, fl4, &res, 0) == 0) {
1059                 struct fib_nh *nh;
1060
1061                 fib_select_path(net, &res, fl4, NULL);
1062                 nh = &FIB_RES_NH(res);
1063                 update_or_create_fnhe(nh, fl4->daddr, 0, mtu, lock,
1064                                       jiffies + ip_rt_mtu_expires);
1065         }
1066         rcu_read_unlock();
1067 }
1068
1069 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1070                               struct sk_buff *skb, u32 mtu,
1071                               bool confirm_neigh)
1072 {
1073         struct rtable *rt = (struct rtable *) dst;
1074         struct flowi4 fl4;
1075
1076         ip_rt_build_flow_key(&fl4, sk, skb);
1077         __ip_rt_update_pmtu(rt, &fl4, mtu);
1078 }
1079
1080 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1081                       int oif, u32 mark, u8 protocol, int flow_flags)
1082 {
1083         const struct iphdr *iph = (const struct iphdr *) skb->data;
1084         struct flowi4 fl4;
1085         struct rtable *rt;
1086
1087         if (!mark)
1088                 mark = IP4_REPLY_MARK(net, skb->mark);
1089
1090         __build_flow_key(net, &fl4, NULL, iph, oif,
1091                          RT_TOS(iph->tos), protocol, mark, flow_flags);
1092         rt = __ip_route_output_key(net, &fl4);
1093         if (!IS_ERR(rt)) {
1094                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1095                 ip_rt_put(rt);
1096         }
1097 }
1098 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1099
1100 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1101 {
1102         const struct iphdr *iph = (const struct iphdr *) skb->data;
1103         struct flowi4 fl4;
1104         struct rtable *rt;
1105
1106         __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1107
1108         if (!fl4.flowi4_mark)
1109                 fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1110
1111         rt = __ip_route_output_key(sock_net(sk), &fl4);
1112         if (!IS_ERR(rt)) {
1113                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1114                 ip_rt_put(rt);
1115         }
1116 }
1117
1118 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1119 {
1120         const struct iphdr *iph = (const struct iphdr *) skb->data;
1121         struct flowi4 fl4;
1122         struct rtable *rt;
1123         struct dst_entry *odst = NULL;
1124         bool new = false;
1125         struct net *net = sock_net(sk);
1126
1127         bh_lock_sock(sk);
1128
1129         if (!ip_sk_accept_pmtu(sk))
1130                 goto out;
1131
1132         odst = sk_dst_get(sk);
1133
1134         if (sock_owned_by_user(sk) || !odst) {
1135                 __ipv4_sk_update_pmtu(skb, sk, mtu);
1136                 goto out;
1137         }
1138
1139         __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1140
1141         rt = (struct rtable *)odst;
1142         if (odst->obsolete && !odst->ops->check(odst, 0)) {
1143                 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1144                 if (IS_ERR(rt))
1145                         goto out;
1146
1147                 new = true;
1148         }
1149
1150         __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);
1151
1152         if (!dst_check(&rt->dst, 0)) {
1153                 if (new)
1154                         dst_release(&rt->dst);
1155
1156                 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1157                 if (IS_ERR(rt))
1158                         goto out;
1159
1160                 new = true;
1161         }
1162
1163         if (new)
1164                 sk_dst_set(sk, &rt->dst);
1165
1166 out:
1167         bh_unlock_sock(sk);
1168         dst_release(odst);
1169 }
1170 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1171
1172 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1173                    int oif, u32 mark, u8 protocol, int flow_flags)
1174 {
1175         const struct iphdr *iph = (const struct iphdr *) skb->data;
1176         struct flowi4 fl4;
1177         struct rtable *rt;
1178
1179         __build_flow_key(net, &fl4, NULL, iph, oif,
1180                          RT_TOS(iph->tos), protocol, mark, flow_flags);
1181         rt = __ip_route_output_key(net, &fl4);
1182         if (!IS_ERR(rt)) {
1183                 __ip_do_redirect(rt, skb, &fl4, false);
1184                 ip_rt_put(rt);
1185         }
1186 }
1187 EXPORT_SYMBOL_GPL(ipv4_redirect);
1188
1189 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1190 {
1191         const struct iphdr *iph = (const struct iphdr *) skb->data;
1192         struct flowi4 fl4;
1193         struct rtable *rt;
1194         struct net *net = sock_net(sk);
1195
1196         __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1197         rt = __ip_route_output_key(net, &fl4);
1198         if (!IS_ERR(rt)) {
1199                 __ip_do_redirect(rt, skb, &fl4, false);
1200                 ip_rt_put(rt);
1201         }
1202 }
1203 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1204
1205 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1206 {
1207         struct rtable *rt = (struct rtable *) dst;
1208
1209         /* All IPV4 dsts are created with ->obsolete set to the value
1210          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1211          * into this function always.
1212          *
1213          * When a PMTU/redirect information update invalidates a route,
1214          * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1215          * DST_OBSOLETE_DEAD by dst_free().
1216          */
1217         if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1218                 return NULL;
1219         return dst;
1220 }
1221
1222 static void ipv4_send_dest_unreach(struct sk_buff *skb)
1223 {
1224         struct ip_options opt;
1225         int res;
1226
1227         /* Recompile ip options since IPCB may not be valid anymore.
1228          * Also check we have a reasonable ipv4 header.
1229          */
1230         if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
1231             ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
1232                 return;
1233
1234         memset(&opt, 0, sizeof(opt));
1235         if (ip_hdr(skb)->ihl > 5) {
1236                 if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
1237                         return;
1238                 opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
1239
1240                 rcu_read_lock();
1241                 res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL);
1242                 rcu_read_unlock();
1243
1244                 if (res)
1245                         return;
1246         }
1247         __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
1248 }
1249
1250 static void ipv4_link_failure(struct sk_buff *skb)
1251 {
1252         struct rtable *rt;
1253
1254         ipv4_send_dest_unreach(skb);
1255
1256         rt = skb_rtable(skb);
1257         if (rt)
1258                 dst_set_expires(&rt->dst, 0);
1259 }
1260
1261 static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1262 {
1263         pr_debug("%s: %pI4 -> %pI4, %s\n",
1264                  __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1265                  skb->dev ? skb->dev->name : "?");
1266         kfree_skb(skb);
1267         WARN_ON(1);
1268         return 0;
1269 }
1270
1271 /*
1272    We do not cache source address of outgoing interface,
1273    because it is used only by IP RR, TS and SRR options,
1274    so that it out of fast path.
1275
1276    BTW remember: "addr" is allowed to be not aligned
1277    in IP options!
1278  */
1279
1280 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1281 {
1282         __be32 src;
1283
1284         if (rt_is_output_route(rt))
1285                 src = ip_hdr(skb)->saddr;
1286         else {
1287                 struct fib_result res;
1288                 struct flowi4 fl4;
1289                 struct iphdr *iph;
1290
1291                 iph = ip_hdr(skb);
1292
1293                 memset(&fl4, 0, sizeof(fl4));
1294                 fl4.daddr = iph->daddr;
1295                 fl4.saddr = iph->saddr;
1296                 fl4.flowi4_tos = RT_TOS(iph->tos);
1297                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1298                 fl4.flowi4_iif = skb->dev->ifindex;
1299                 fl4.flowi4_mark = skb->mark;
1300
1301                 rcu_read_lock();
1302                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1303                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1304                 else
1305                         src = inet_select_addr(rt->dst.dev,
1306                                                rt_nexthop(rt, iph->daddr),
1307                                                RT_SCOPE_UNIVERSE);
1308                 rcu_read_unlock();
1309         }
1310         memcpy(addr, &src, 4);
1311 }
1312
#ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Merge @tag into the route's tclassid, one 16-bit half at a time:
 * a half of tclassid is filled from @tag only if it is currently zero.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
        static const u32 halves[] = { 0xFFFF, 0xFFFF0000 };
        unsigned int i;

        for (i = 0; i < ARRAY_SIZE(halves); i++) {
                if (!(rt->dst.tclassid & halves[i]))
                        rt->dst.tclassid |= tag & halves[i];
        }
}
#endif
1322
1323 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1324 {
1325         unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1326         unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
1327                                     ip_rt_min_advmss);
1328
1329         return min(advmss, IPV4_MAX_PMTU - header_size);
1330 }
1331
1332 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1333 {
1334         const struct rtable *rt = (const struct rtable *) dst;
1335         unsigned int mtu = rt->rt_pmtu;
1336
1337         if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1338                 mtu = dst_metric_raw(dst, RTAX_MTU);
1339
1340         if (mtu)
1341                 return mtu;
1342
1343         mtu = READ_ONCE(dst->dev->mtu);
1344
1345         if (unlikely(ip_mtu_locked(dst))) {
1346                 if (rt->rt_uses_gateway && mtu > 576)
1347                         mtu = 576;
1348         }
1349
1350         mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1351
1352         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1353 }
1354
1355 static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
1356 {
1357         struct fnhe_hash_bucket *hash;
1358         struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1359         u32 hval = fnhe_hashfun(daddr);
1360
1361         spin_lock_bh(&fnhe_lock);
1362
1363         hash = rcu_dereference_protected(nh->nh_exceptions,
1364                                          lockdep_is_held(&fnhe_lock));
1365         hash += hval;
1366
1367         fnhe_p = &hash->chain;
1368         fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1369         while (fnhe) {
1370                 if (fnhe->fnhe_daddr == daddr) {
1371                         rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1372                                 fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1373                         /* set fnhe_daddr to 0 to ensure it won't bind with
1374                          * new dsts in rt_bind_exception().
1375                          */
1376                         fnhe->fnhe_daddr = 0;
1377                         fnhe_flush_routes(fnhe);
1378                         kfree_rcu(fnhe, rcu);
1379                         break;
1380                 }
1381                 fnhe_p = &fnhe->fnhe_next;
1382                 fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1383                                                  lockdep_is_held(&fnhe_lock));
1384         }
1385
1386         spin_unlock_bh(&fnhe_lock);
1387 }
1388
1389 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1390 {
1391         struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
1392         struct fib_nh_exception *fnhe;
1393         u32 hval;
1394
1395         if (!hash)
1396                 return NULL;
1397
1398         hval = fnhe_hashfun(daddr);
1399
1400         for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1401              fnhe = rcu_dereference(fnhe->fnhe_next)) {
1402                 if (fnhe->fnhe_daddr == daddr) {
1403                         if (fnhe->fnhe_expires &&
1404                             time_after(jiffies, fnhe->fnhe_expires)) {
1405                                 ip_del_fnhe(nh, daddr);
1406                                 break;
1407                         }
1408                         return fnhe;
1409                 }
1410         }
1411         return NULL;
1412 }
1413
1414 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1415                               __be32 daddr, const bool do_cache)
1416 {
1417         bool ret = false;
1418
1419         spin_lock_bh(&fnhe_lock);
1420
1421         if (daddr == fnhe->fnhe_daddr) {
1422                 struct rtable __rcu **porig;
1423                 struct rtable *orig;
1424                 int genid = fnhe_genid(dev_net(rt->dst.dev));
1425
1426                 if (rt_is_input_route(rt))
1427                         porig = &fnhe->fnhe_rth_input;
1428                 else
1429                         porig = &fnhe->fnhe_rth_output;
1430                 orig = rcu_dereference(*porig);
1431
1432                 if (fnhe->fnhe_genid != genid) {
1433                         fnhe->fnhe_genid = genid;
1434                         fnhe->fnhe_gw = 0;
1435                         fnhe->fnhe_pmtu = 0;
1436                         fnhe->fnhe_expires = 0;
1437                         fnhe_flush_routes(fnhe);
1438                         orig = NULL;
1439                 }
1440                 fill_route_from_fnhe(rt, fnhe);
1441                 if (!rt->rt_gateway)
1442                         rt->rt_gateway = daddr;
1443
1444                 if (do_cache) {
1445                         dst_hold(&rt->dst);
1446                         rcu_assign_pointer(*porig, rt);
1447                         if (orig) {
1448                                 dst_dev_put(&orig->dst);
1449                                 dst_release(&orig->dst);
1450                         }
1451                         ret = true;
1452                 }
1453
1454                 fnhe->fnhe_stamp = jiffies;
1455         }
1456         spin_unlock_bh(&fnhe_lock);
1457
1458         return ret;
1459 }
1460
/*
 * List of routes that are not held in a nexthop or nexthop-exception
 * cache slot. Routes are linked through rtable.rt_uncached (see
 * rt_add_uncached_list()), unlinked in ipv4_dst_destroy() and walked by
 * rt_flush_dev().
 *
 * @lock: taken (BH-disabling) around every add, delete and walk of @head.
 * @head: list of rtables.
 */
1461 struct uncached_list {
1462         spinlock_t              lock;
1463         struct list_head        head;
1464 };
1465
/* One uncached list per CPU. */
1466 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1467
1468 static void rt_add_uncached_list(struct rtable *rt)
1469 {
1470         struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1471
1472         rt->rt_uncached_list = ul;
1473
1474         spin_lock_bh(&ul->lock);
1475         list_add_tail(&rt->rt_uncached, &ul->head);
1476         spin_unlock_bh(&ul->lock);
1477 }
1478
1479 static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1480 {
1481         struct rtable *orig, *prev, **p;
1482         bool ret = true;
1483
1484         if (rt_is_input_route(rt)) {
1485                 p = (struct rtable **)&nh->nh_rth_input;
1486         } else {
1487                 p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
1488         }
1489         orig = *p;
1490
1491         /* hold dst before doing cmpxchg() to avoid race condition
1492          * on this dst
1493          */
1494         dst_hold(&rt->dst);
1495         prev = cmpxchg(p, orig, rt);
1496         if (prev == orig) {
1497                 if (orig) {
1498                         rt_add_uncached_list(orig);
1499                         dst_release(&orig->dst);
1500                 }
1501         } else {
1502                 dst_release(&rt->dst);
1503                 ret = false;
1504         }
1505
1506         return ret;
1507 }
1508
1509 static void ipv4_dst_destroy(struct dst_entry *dst)
1510 {
1511         struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
1512         struct rtable *rt = (struct rtable *) dst;
1513
1514         if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt))
1515                 kfree(p);
1516
1517         if (!list_empty(&rt->rt_uncached)) {
1518                 struct uncached_list *ul = rt->rt_uncached_list;
1519
1520                 spin_lock_bh(&ul->lock);
1521                 list_del(&rt->rt_uncached);
1522                 spin_unlock_bh(&ul->lock);
1523         }
1524 }
1525
1526 void rt_flush_dev(struct net_device *dev)
1527 {
1528         struct net *net = dev_net(dev);
1529         struct rtable *rt;
1530         int cpu;
1531
1532         for_each_possible_cpu(cpu) {
1533                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1534
1535                 spin_lock_bh(&ul->lock);
1536                 list_for_each_entry(rt, &ul->head, rt_uncached) {
1537                         if (rt->dst.dev != dev)
1538                                 continue;
1539                         rt->dst.dev = net->loopback_dev;
1540                         dev_hold(rt->dst.dev);
1541                         dev_put(dev);
1542                 }
1543                 spin_unlock_bh(&ul->lock);
1544         }
1545 }
1546
1547 static bool rt_cache_valid(const struct rtable *rt)
1548 {
1549         return  rt &&
1550                 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1551                 !rt_is_expired(rt);
1552 }
1553
1554 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1555                            const struct fib_result *res,
1556                            struct fib_nh_exception *fnhe,
1557                            struct fib_info *fi, u16 type, u32 itag,
1558                            const bool do_cache)
1559 {
1560         bool cached = false;
1561
1562         if (fi) {
1563                 struct fib_nh *nh = &FIB_RES_NH(*res);
1564
1565                 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
1566                         rt->rt_gateway = nh->nh_gw;
1567                         rt->rt_uses_gateway = 1;
1568                 }
1569                 dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true);
1570                 if (fi->fib_metrics != &dst_default_metrics) {
1571                         rt->dst._metrics |= DST_METRICS_REFCOUNTED;
1572                         refcount_inc(&fi->fib_metrics->refcnt);
1573                 }
1574 #ifdef CONFIG_IP_ROUTE_CLASSID
1575                 rt->dst.tclassid = nh->nh_tclassid;
1576 #endif
1577                 rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
1578                 if (unlikely(fnhe))
1579                         cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1580                 else if (do_cache)
1581                         cached = rt_cache_route(nh, rt);
1582                 if (unlikely(!cached)) {
1583                         /* Routes we intend to cache in nexthop exception or
1584                          * FIB nexthop have the DST_NOCACHE bit clear.
1585                          * However, if we are unsuccessful at storing this
1586                          * route into the cache we really need to set it.
1587                          */
1588                         if (!rt->rt_gateway)
1589                                 rt->rt_gateway = daddr;
1590                         rt_add_uncached_list(rt);
1591                 }
1592         } else
1593                 rt_add_uncached_list(rt);
1594
1595 #ifdef CONFIG_IP_ROUTE_CLASSID
1596 #ifdef CONFIG_IP_MULTIPLE_TABLES
1597         set_class_tag(rt, res->tclassid);
1598 #endif
1599         set_class_tag(rt, itag);
1600 #endif
1601 }
1602
1603 struct rtable *rt_dst_alloc(struct net_device *dev,
1604                             unsigned int flags, u16 type,
1605                             bool nopolicy, bool noxfrm, bool will_cache)
1606 {
1607         struct rtable *rt;
1608
1609         rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1610                        (will_cache ? 0 : DST_HOST) |
1611                        (nopolicy ? DST_NOPOLICY : 0) |
1612                        (noxfrm ? DST_NOXFRM : 0));
1613
1614         if (rt) {
1615                 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1616                 rt->rt_flags = flags;
1617                 rt->rt_type = type;
1618                 rt->rt_is_input = 0;
1619                 rt->rt_iif = 0;
1620                 rt->rt_pmtu = 0;
1621                 rt->rt_mtu_locked = 0;
1622                 rt->rt_gateway = 0;
1623                 rt->rt_uses_gateway = 0;
1624                 rt->rt_table_id = 0;
1625                 INIT_LIST_HEAD(&rt->rt_uncached);
1626
1627                 rt->dst.output = ip_output;
1628                 if (flags & RTCF_LOCAL)
1629                         rt->dst.input = ip_local_deliver;
1630         }
1631
1632         return rt;
1633 }
1634 EXPORT_SYMBOL(rt_dst_alloc);
1635
1636 /* called in rcu_read_lock() section */
1637 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1638                           u8 tos, struct net_device *dev,
1639                           struct in_device *in_dev, u32 *itag)
1640 {
1641         int err;
1642
1643         /* Primary sanity checks. */
1644         if (!in_dev)
1645                 return -EINVAL;
1646
1647         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1648             skb->protocol != htons(ETH_P_IP))
1649                 return -EINVAL;
1650
1651         if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1652                 return -EINVAL;
1653
1654         if (ipv4_is_zeronet(saddr)) {
1655                 if (!ipv4_is_local_multicast(daddr))
1656                         return -EINVAL;
1657         } else {
1658                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1659                                           in_dev, itag);
1660                 if (err < 0)
1661                         return err;
1662         }
1663         return 0;
1664 }
1665
1666 /* called in rcu_read_lock() section */
1667 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1668                              u8 tos, struct net_device *dev, int our)
1669 {
1670         struct in_device *in_dev = __in_dev_get_rcu(dev);
1671         unsigned int flags = RTCF_MULTICAST;
1672         struct rtable *rth;
1673         u32 itag = 0;
1674         int err;
1675
1676         err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1677         if (err)
1678                 return err;
1679
1680         if (our)
1681                 flags |= RTCF_LOCAL;
1682
1683         rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1684                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1685         if (!rth)
1686                 return -ENOBUFS;
1687
1688 #ifdef CONFIG_IP_ROUTE_CLASSID
1689         rth->dst.tclassid = itag;
1690 #endif
1691         rth->dst.output = ip_rt_bug;
1692         rth->rt_is_input= 1;
1693
1694 #ifdef CONFIG_IP_MROUTE
1695         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1696                 rth->dst.input = ip_mr_input;
1697 #endif
1698         RT_CACHE_STAT_INC(in_slow_mc);
1699
1700         skb_dst_set(skb, &rth->dst);
1701         return 0;
1702 }
1703
1704
1705 static void ip_handle_martian_source(struct net_device *dev,
1706                                      struct in_device *in_dev,
1707                                      struct sk_buff *skb,
1708                                      __be32 daddr,
1709                                      __be32 saddr)
1710 {
1711         RT_CACHE_STAT_INC(in_martian_src);
1712 #ifdef CONFIG_IP_ROUTE_VERBOSE
1713         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1714                 /*
1715                  *      RFC1812 recommendation, if source is martian,
1716                  *      the only hint is MAC header.
1717                  */
1718                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1719                         &daddr, &saddr, dev->name);
1720                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1721                         print_hex_dump(KERN_WARNING, "ll header: ",
1722                                        DUMP_PREFIX_OFFSET, 16, 1,
1723                                        skb_mac_header(skb),
1724                                        dev->hard_header_len, true);
1725                 }
1726         }
1727 #endif
1728 }
1729
1730 static void set_lwt_redirect(struct rtable *rth)
1731 {
1732         if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
1733                 rth->dst.lwtstate->orig_output = rth->dst.output;
1734                 rth->dst.output = lwtunnel_output;
1735         }
1736
1737         if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
1738                 rth->dst.lwtstate->orig_input = rth->dst.input;
1739                 rth->dst.input = lwtunnel_input;
1740         }
1741 }
1742
1743 /* called in rcu_read_lock() section */
1744 static int __mkroute_input(struct sk_buff *skb,
1745                            const struct fib_result *res,
1746                            struct in_device *in_dev,
1747                            __be32 daddr, __be32 saddr, u32 tos)
1748 {
1749         struct fib_nh_exception *fnhe;
1750         struct rtable *rth;
1751         int err;
1752         struct in_device *out_dev;
1753         bool do_cache;
1754         u32 itag = 0;
1755
1756         /* get a working reference to the output device */
1757         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1758         if (!out_dev) {
1759                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1760                 return -EINVAL;
1761         }
1762
1763         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1764                                   in_dev->dev, in_dev, &itag);
1765         if (err < 0) {
1766                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1767                                          saddr);
1768
1769                 goto cleanup;
1770         }
1771
1772         do_cache = res->fi && !itag;
1773         if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1774             skb->protocol == htons(ETH_P_IP) &&
1775             (IN_DEV_SHARED_MEDIA(out_dev) ||
1776              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1777                 IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1778
1779         if (skb->protocol != htons(ETH_P_IP)) {
1780                 /* Not IP (i.e. ARP). Do not create route, if it is
1781                  * invalid for proxy arp. DNAT routes are always valid.
1782                  *
1783                  * Proxy arp feature have been extended to allow, ARP
1784                  * replies back to the same interface, to support
1785                  * Private VLAN switch technologies. See arp.c.
1786                  */
1787                 if (out_dev == in_dev &&
1788                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1789                         err = -EINVAL;
1790                         goto cleanup;
1791                 }
1792         }
1793
1794         fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1795         if (do_cache) {
1796                 if (fnhe)
1797                         rth = rcu_dereference(fnhe->fnhe_rth_input);
1798                 else
1799                         rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1800                 if (rt_cache_valid(rth)) {
1801                         skb_dst_set_noref(skb, &rth->dst);
1802                         goto out;
1803                 }
1804         }
1805
1806         rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1807                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1808                            IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1809         if (!rth) {
1810                 err = -ENOBUFS;
1811                 goto cleanup;
1812         }
1813
1814         rth->rt_is_input = 1;
1815         if (res->table)
1816                 rth->rt_table_id = res->table->tb_id;
1817         RT_CACHE_STAT_INC(in_slow_tot);
1818
1819         rth->dst.input = ip_forward;
1820
1821         rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1822                        do_cache);
1823         set_lwt_redirect(rth);
1824         skb_dst_set(skb, &rth->dst);
1825 out:
1826         err = 0;
1827  cleanup:
1828         return err;
1829 }
1830
1831 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1832 /* To make ICMP packets follow the right flow, the multipath hash is
1833  * calculated from the inner IP addresses.
1834  */
1835 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1836                                  struct flow_keys *hash_keys)
1837 {
1838         const struct iphdr *outer_iph = ip_hdr(skb);
1839         const struct iphdr *inner_iph;
1840         const struct icmphdr *icmph;
1841         struct iphdr _inner_iph;
1842         struct icmphdr _icmph;
1843
1844         hash_keys->addrs.v4addrs.src = outer_iph->saddr;
1845         hash_keys->addrs.v4addrs.dst = outer_iph->daddr;
1846         if (likely(outer_iph->protocol != IPPROTO_ICMP))
1847                 return;
1848
1849         if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1850                 return;
1851
1852         icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1853                                    &_icmph);
1854         if (!icmph)
1855                 return;
1856
1857         if (icmph->type != ICMP_DEST_UNREACH &&
1858             icmph->type != ICMP_REDIRECT &&
1859             icmph->type != ICMP_TIME_EXCEEDED &&
1860             icmph->type != ICMP_PARAMETERPROB)
1861                 return;
1862
1863         inner_iph = skb_header_pointer(skb,
1864                                        outer_iph->ihl * 4 + sizeof(_icmph),
1865                                        sizeof(_inner_iph), &_inner_iph);
1866         if (!inner_iph)
1867                 return;
1868         hash_keys->addrs.v4addrs.src = inner_iph->saddr;
1869         hash_keys->addrs.v4addrs.dst = inner_iph->daddr;
1870 }
1871
1872 /* if skb is set it will be used and fl4 can be NULL */
1873 int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
1874                        const struct sk_buff *skb)
1875 {
1876         struct net *net = fi->fib_net;
1877         struct flow_keys hash_keys;
1878         u32 mhash;
1879
1880         switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1881         case 0:
1882                 memset(&hash_keys, 0, sizeof(hash_keys));
1883                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1884                 if (skb) {
1885                         ip_multipath_l3_keys(skb, &hash_keys);
1886                 } else {
1887                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1888                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1889                 }
1890                 break;
1891         case 1:
1892                 /* skb is currently provided only when forwarding */
1893                 if (skb) {
1894                         unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1895                         struct flow_keys keys;
1896
1897                         /* short-circuit if we already have L4 hash present */
1898                         if (skb->l4_hash)
1899                                 return skb_get_hash_raw(skb) >> 1;
1900                         memset(&hash_keys, 0, sizeof(hash_keys));
1901                         skb_flow_dissect_flow_keys(skb, &keys, flag);
1902
1903                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1904                         hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
1905                         hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
1906                         hash_keys.ports.src = keys.ports.src;
1907                         hash_keys.ports.dst = keys.ports.dst;
1908                         hash_keys.basic.ip_proto = keys.basic.ip_proto;
1909                 } else {
1910                         memset(&hash_keys, 0, sizeof(hash_keys));
1911                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1912                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1913                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1914                         hash_keys.ports.src = fl4->fl4_sport;
1915                         hash_keys.ports.dst = fl4->fl4_dport;
1916                         hash_keys.basic.ip_proto = fl4->flowi4_proto;
1917                 }
1918                 break;
1919         }
1920         mhash = flow_hash_from_keys(&hash_keys);
1921
1922         return mhash >> 1;
1923 }
1924 EXPORT_SYMBOL_GPL(fib_multipath_hash);
1925 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
1926
1927 static int ip_mkroute_input(struct sk_buff *skb,
1928                             struct fib_result *res,
1929                             struct in_device *in_dev,
1930                             __be32 daddr, __be32 saddr, u32 tos)
1931 {
1932 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1933         if (res->fi && res->fi->fib_nhs > 1) {
1934                 int h = fib_multipath_hash(res->fi, NULL, skb);
1935
1936                 fib_select_multipath(res, h);
1937         }
1938 #endif
1939
1940         /* create a routing cache entry */
1941         return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1942 }
1943
1944 /*
1945  *      NOTE. We drop all the packets that has local source
1946  *      addresses, because every properly looped back packet
1947  *      must have correct destination already attached by output routine.
1948  *
1949  *      Such approach solves two big problems:
1950  *      1. Not simplex devices are handled properly.
1951  *      2. IP spoofing attempts are filtered with 100% of guarantee.
1952  *      called with rcu_read_lock()
1953  */
/*
 *      ip_route_input_slow - resolve the input route for a received packet
 *      and attach the resulting dst to the skb.
 *
 *      Returns 0 once a dst has been attached, otherwise a negative errno:
 *        -EINVAL   no in_device on @dev, martian destination, or a
 *                  broadcast for a non-IP packet (e_inval);
 *        -ENOBUFS  route allocation failed;
 *        martian source: the value err holds at that point, i.e. the
 *                  initial -EINVAL or the negative result of
 *                  fib_validate_source();
 *        forwarded packets: whatever ip_mkroute_input() returns.
 *
 *      A failed fib_lookup(), or a route that needs forwarding while
 *      forwarding is disabled, is not reported as an error: the no_route
 *      path attaches an RTN_UNREACHABLE route whose input handler is
 *      ip_error and the function returns 0.
 *
 *      Label flow: brd_input falls through into local_input, and
 *      martian_destination falls through into e_inval.
 */
1954
1955 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1956                                u8 tos, struct net_device *dev,
1957                                struct fib_result *res)
1958 {
1959         struct in_device *in_dev = __in_dev_get_rcu(dev);
1960         struct ip_tunnel_info *tun_info;
1961         struct flowi4   fl4;
1962         unsigned int    flags = 0;
1963         u32             itag = 0;
1964         struct rtable   *rth;
1965         int             err = -EINVAL;
1966         struct net    *net = dev_net(dev);
1967         bool do_cache;
1968
1969         /* IP on this device is disabled. */
1970
1971         if (!in_dev)
1972                 goto out;
1973
1974         /* Check for the most weird martians, which can be not detected
1975            by fib_lookup.
1976          */
1977
        /* The tunnel id is copied into the flow key only for tunnel info
         * that is not marked IP_TUNNEL_INFO_TX; it is read before
         * skb_dst_drop() below.
         */
1978         tun_info = skb_tunnel_info(skb);
1979         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1980                 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
1981         else
1982                 fl4.flowi4_tun_key.tun_id = 0;
1983         skb_dst_drop(skb);
1984
1985         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1986                 goto martian_source;
1987
        /* Clear these before the first jump to brd_input: local_input
         * reads res->fi and res->table, and ip_route_input_noref() passes
         * in an uninitialised fib_result.
         */
1988         res->fi = NULL;
1989         res->table = NULL;
1990         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1991                 goto brd_input;
1992
1993         /* Accept zero addresses only to limited broadcast;
1994          * I even do not know to fix it or not. Waiting for complains :-)
1995          */
1996         if (ipv4_is_zeronet(saddr))
1997                 goto martian_source;
1998
1999         if (ipv4_is_zeronet(daddr))
2000                 goto martian_destination;
2001
2002         /* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
2003          * and call it once if daddr or/and saddr are loopback addresses
2004          */
2005         if (ipv4_is_loopback(daddr)) {
2006                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2007                         goto martian_destination;
2008         } else if (ipv4_is_loopback(saddr)) {
2009                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2010                         goto martian_source;
2011         }
2012
2013         /*
2014          *      Now we are ready to route packet.
2015          */
2016         fl4.flowi4_oif = 0;
2017         fl4.flowi4_iif = dev->ifindex;
2018         fl4.flowi4_mark = skb->mark;
2019         fl4.flowi4_tos = tos;
2020         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2021         fl4.flowi4_flags = 0;
2022         fl4.daddr = daddr;
2023         fl4.saddr = saddr;
2024         fl4.flowi4_uid = sock_net_uid(net, NULL);
2025         err = fib_lookup(net, &fl4, res, 0);
2026         if (err != 0) {
                /* err is kept for no_route, which stores -err in the
                 * route's dst.error; with forwarding disabled it is
                 * replaced by -EHOSTUNREACH.
                 */
2027                 if (!IN_DEV_FORWARD(in_dev))
2028                         err = -EHOSTUNREACH;
2029                 goto no_route;
2030         }
2031
2032         if (res->type == RTN_BROADCAST)
2033                 goto brd_input;
2034
2035         if (res->type == RTN_LOCAL) {
2036                 err = fib_validate_source(skb, saddr, daddr, tos,
2037                                           0, dev, in_dev, &itag);
2038                 if (err < 0)
2039                         goto martian_source;
2040                 goto local_input;
2041         }
2042
        /* Everything below this point needs forwarding. */
2043         if (!IN_DEV_FORWARD(in_dev)) {
2044                 err = -EHOSTUNREACH;
2045                 goto no_route;
2046         }
2047         if (res->type != RTN_UNICAST)
2048                 goto martian_destination;
2049
2050         err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos);
2051 out:    return err;
2052
2053 brd_input:
2054         if (skb->protocol != htons(ETH_P_IP))
2055                 goto e_inval;
2056
        /* A zeronet source is not validated against the FIB here. */
2057         if (!ipv4_is_zeronet(saddr)) {
2058                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2059                                           in_dev, &itag);
2060                 if (err < 0)
2061                         goto martian_source;
2062         }
2063         flags |= RTCF_BROADCAST;
2064         res->type = RTN_BROADCAST;
2065         RT_CACHE_STAT_INC(in_brd);
2066
        /* brd_input falls through: broadcasts are delivered locally. */
2067 local_input:
        /* Reuse the nexthop's cached input route when the result has a
         * fib_info and no itag was set; if that cache entry is not valid,
         * the route allocated below is cached instead.
         */
2068         do_cache = false;
2069         if (res->fi) {
2070                 if (!itag) {
2071                         rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
2072                         if (rt_cache_valid(rth)) {
2073                                 skb_dst_set_noref(skb, &rth->dst);
2074                                 err = 0;
2075                                 goto out;
2076                         }
2077                         do_cache = true;
2078                 }
2079         }
2080
2081         rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
2082                            flags | RTCF_LOCAL, res->type,
2083                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
2084         if (!rth)
2085                 goto e_nobufs;
2086
2087         rth->dst.output= ip_rt_bug;
2088 #ifdef CONFIG_IP_ROUTE_CLASSID
2089         rth->dst.tclassid = itag;
2090 #endif
2091         rth->rt_is_input = 1;
2092         if (res->table)
2093                 rth->rt_table_id = res->table->tb_id;
2094
2095         RT_CACHE_STAT_INC(in_slow_tot);
        /* Reached with RTN_UNREACHABLE via no_route: err still holds the
         * negative routing error, stored negated in dst.error, and the
         * RTCF_LOCAL flag requested at allocation is cleared again.
         */
2096         if (res->type == RTN_UNREACHABLE) {
2097                 rth->dst.input= ip_error;
2098                 rth->dst.error= -err;
2099                 rth->rt_flags   &= ~RTCF_LOCAL;
2100         }
2101
2102         if (do_cache) {
2103                 struct fib_nh *nh = &FIB_RES_NH(*res);
2104
2105                 rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
2106                 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2107                         WARN_ON(rth->dst.input == lwtunnel_input);
2108                         rth->dst.lwtstate->orig_input = rth->dst.input;
2109                         rth->dst.input = lwtunnel_input;
2110                 }
2111
                /* If the nexthop cache slot could not be taken, track the
                 * route on the uncached list instead.
                 */
2112                 if (unlikely(!rt_cache_route(nh, rth)))
2113                         rt_add_uncached_list(rth);
2114         }
2115         skb_dst_set(skb, &rth->dst);
2116         err = 0;
2117         goto out;
2118
2119 no_route:
        /* Dropping res->fi keeps the unreachable route out of the nexthop
         * cache, since local_input only caches when res->fi is set.
         */
2120         RT_CACHE_STAT_INC(in_no_route);
2121         res->type = RTN_UNREACHABLE;
2122         res->fi = NULL;
2123         res->table = NULL;
2124         goto local_input;
2125
2126         /*
2127          *      Do not cache martian addresses: they should be logged (RFC1812)
2128          */
2129 martian_destination:
2130         RT_CACHE_STAT_INC(in_martian_dst);
2131 #ifdef CONFIG_IP_ROUTE_VERBOSE
2132         if (IN_DEV_LOG_MARTIANS(in_dev))
2133                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2134                                      &daddr, &saddr, dev->name);
2135 #endif
2136
        /* martian_destination falls through to return -EINVAL. */
2137 e_inval:
2138         err = -EINVAL;
2139         goto out;
2140
2141 e_nobufs:
2142         err = -ENOBUFS;
2143         goto out;
2144
        /* err is returned unchanged from wherever the jump came from. */
2145 martian_source:
2146         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2147         goto out;
2148 }
2149
2150 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2151                          u8 tos, struct net_device *dev)
2152 {
2153         struct fib_result res;
2154         int err;
2155
2156         tos &= IPTOS_RT_MASK;
2157         rcu_read_lock();
2158         err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2159         rcu_read_unlock();
2160
2161         return err;
2162 }
2163 EXPORT_SYMBOL(ip_route_input_noref);
2164
2165 /* called with rcu_read_lock held */
2166 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2167                        u8 tos, struct net_device *dev, struct fib_result *res)
2168 {
2169         /* Multicast recognition logic is moved from route cache to here.
2170            The problem was that too many Ethernet cards have broken/missing
2171            hardware multicast filters :-( As result the host on multicasting
2172            network acquires a lot of useless route cache entries, sort of
2173            SDR messages from all the world. Now we try to get rid of them.
2174            Really, provided software IP multicast filter is organized
2175            reasonably (at least, hashed), it does not result in a slowdown
2176            comparing with route cache reject entries.
2177            Note, that multicast routers are not affected, because
2178            route cache entry is created eventually.
2179          */
2180         if (ipv4_is_multicast(daddr)) {
2181                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2182                 int our = 0;
2183                 int err = -EINVAL;
2184
2185                 if (!in_dev)
2186                         return err;
2187                 our = ip_check_mc_rcu(in_dev, daddr, saddr,
2188                                       ip_hdr(skb)->protocol);
2189
2190                 /* check l3 master if no match yet */
2191                 if (!our && netif_is_l3_slave(dev)) {
2192                         struct in_device *l3_in_dev;
2193
2194                         l3_in_dev = __in_dev_get_rcu(skb->dev);
2195                         if (l3_in_dev)
2196                                 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2197                                                       ip_hdr(skb)->protocol);
2198                 }
2199
2200                 if (our
2201 #ifdef CONFIG_IP_MROUTE
2202                         ||
2203                     (!ipv4_is_local_multicast(daddr) &&
2204                      IN_DEV_MFORWARD(in_dev))
2205 #endif
2206                    ) {
2207                         err = ip_route_input_mc(skb, daddr, saddr,
2208                                                 tos, dev, our);
2209                 }
2210                 return err;
2211         }
2212
2213         return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2214 }
2215
2216 /* called with rcu_read_lock() */
2217 static struct rtable *__mkroute_output(const struct fib_result *res,
2218                                        const struct flowi4 *fl4, int orig_oif,
2219                                        struct net_device *dev_out,
2220                                        unsigned int flags)
2221 {
2222         struct fib_info *fi = res->fi;
2223         struct fib_nh_exception *fnhe;
2224         struct in_device *in_dev;
2225         u16 type = res->type;
2226         struct rtable *rth;
2227         bool do_cache;
2228
2229         in_dev = __in_dev_get_rcu(dev_out);
2230         if (!in_dev)
2231                 return ERR_PTR(-EINVAL);
2232
2233         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2234                 if (ipv4_is_loopback(fl4->saddr) &&
2235                     !(dev_out->flags & IFF_LOOPBACK) &&
2236                     !netif_is_l3_master(dev_out))
2237                         return ERR_PTR(-EINVAL);
2238
2239         if (ipv4_is_lbcast(fl4->daddr))
2240                 type = RTN_BROADCAST;
2241         else if (ipv4_is_multicast(fl4->daddr))
2242                 type = RTN_MULTICAST;
2243         else if (ipv4_is_zeronet(fl4->daddr))
2244                 return ERR_PTR(-EINVAL);
2245
2246         if (dev_out->flags & IFF_LOOPBACK)
2247                 flags |= RTCF_LOCAL;
2248
2249         do_cache = true;
2250         if (type == RTN_BROADCAST) {
2251                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2252                 fi = NULL;
2253         } else if (type == RTN_MULTICAST) {
2254                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2255                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2256                                      fl4->flowi4_proto))
2257                         flags &= ~RTCF_LOCAL;
2258                 else
2259                         do_cache = false;
2260                 /* If multicast route do not exist use
2261                  * default one, but do not gateway in this case.
2262                  * Yes, it is hack.
2263                  */
2264                 if (fi && res->prefixlen < 4)
2265                         fi = NULL;
2266         } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2267                    (orig_oif != dev_out->ifindex)) {
2268                 /* For local routes that require a particular output interface
2269                  * we do not want to cache the result.  Caching the result
2270                  * causes incorrect behaviour when there are multiple source
2271                  * addresses on the interface, the end result being that if the
2272                  * intended recipient is waiting on that interface for the
2273                  * packet he won't receive it because it will be delivered on
2274                  * the loopback interface and the IP_PKTINFO ipi_ifindex will
2275                  * be set to the loopback interface as well.
2276                  */
2277                 do_cache = false;
2278         }
2279
2280         fnhe = NULL;
2281         do_cache &= fi != NULL;
2282         if (fi) {
2283                 struct rtable __rcu **prth;
2284                 struct fib_nh *nh = &FIB_RES_NH(*res);
2285
2286                 fnhe = find_exception(nh, fl4->daddr);
2287                 if (!do_cache)
2288                         goto add;
2289                 if (fnhe) {
2290                         prth = &fnhe->fnhe_rth_output;
2291                 } else {
2292                         if (unlikely(fl4->flowi4_flags &
2293                                      FLOWI_FLAG_KNOWN_NH &&
2294                                      !(nh->nh_gw &&
2295                                        nh->nh_scope == RT_SCOPE_LINK))) {
2296                                 do_cache = false;
2297                                 goto add;
2298                         }
2299                         prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
2300                 }
2301                 rth = rcu_dereference(*prth);
2302                 if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2303                         return rth;
2304         }
2305
2306 add:
2307         rth = rt_dst_alloc(dev_out, flags, type,
2308                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2309                            IN_DEV_CONF_GET(in_dev, NOXFRM),
2310                            do_cache);
2311         if (!rth)
2312                 return ERR_PTR(-ENOBUFS);
2313
2314         rth->rt_iif = orig_oif;
2315         if (res->table)
2316                 rth->rt_table_id = res->table->tb_id;
2317
2318         RT_CACHE_STAT_INC(out_slow_tot);
2319
2320         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2321                 if (flags & RTCF_LOCAL &&
2322                     !(dev_out->flags & IFF_LOOPBACK)) {
2323                         rth->dst.output = ip_mc_output;
2324                         RT_CACHE_STAT_INC(out_slow_mc);
2325                 }
2326 #ifdef CONFIG_IP_MROUTE
2327                 if (type == RTN_MULTICAST) {
2328                         if (IN_DEV_MFORWARD(in_dev) &&
2329                             !ipv4_is_local_multicast(fl4->daddr)) {
2330                                 rth->dst.input = ip_mr_input;
2331                                 rth->dst.output = ip_mc_output;
2332                         }
2333                 }
2334 #endif
2335         }
2336
2337         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2338         set_lwt_redirect(rth);
2339
2340         return rth;
2341 }
2342
2343 /*
2344  * Major route resolver routine.
2345  */
2346
2347 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2348                                         const struct sk_buff *skb)
2349 {
2350         __u8 tos = RT_FL_TOS(fl4);
2351         struct fib_result res = {
2352                 .type           = RTN_UNSPEC,
2353                 .fi             = NULL,
2354                 .table          = NULL,
2355                 .tclassid       = 0,
2356         };
2357         struct rtable *rth;
2358
2359         fl4->flowi4_iif = LOOPBACK_IFINDEX;
2360         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2361         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2362                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2363
2364         rcu_read_lock();
2365         rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2366         rcu_read_unlock();
2367
2368         return rth;
2369 }
2370 EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2371
2372 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2373                                             struct fib_result *res,
2374                                             const struct sk_buff *skb)
2375 {
2376         struct net_device *dev_out = NULL;
2377         int orig_oif = fl4->flowi4_oif;
2378         unsigned int flags = 0;
2379         struct rtable *rth;
2380         int err;
2381
2382         if (fl4->saddr) {
2383                 if (ipv4_is_multicast(fl4->saddr) ||
2384                     ipv4_is_lbcast(fl4->saddr) ||
2385                     ipv4_is_zeronet(fl4->saddr)) {
2386                         rth = ERR_PTR(-EINVAL);
2387                         goto out;
2388                 }
2389
2390                 rth = ERR_PTR(-ENETUNREACH);
2391
2392                 /* I removed check for oif == dev_out->oif here.
2393                    It was wrong for two reasons:
2394                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2395                       is assigned to multiple interfaces.
2396                    2. Moreover, we are allowed to send packets with saddr
2397                       of another iface. --ANK
2398                  */
2399
2400                 if (fl4->flowi4_oif == 0 &&
2401                     (ipv4_is_multicast(fl4->daddr) ||
2402                      ipv4_is_lbcast(fl4->daddr))) {
2403                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2404                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2405                         if (!dev_out)
2406                                 goto out;
2407
2408                         /* Special hack: user can direct multicasts
2409                            and limited broadcast via necessary interface
2410                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2411                            This hack is not just for fun, it allows
2412                            vic,vat and friends to work.
2413                            They bind socket to loopback, set ttl to zero
2414                            and expect that it will work.
2415                            From the viewpoint of routing cache they are broken,
2416                            because we are not allowed to build multicast path
2417                            with loopback source addr (look, routing cache
2418                            cannot know, that ttl is zero, so that packet
2419                            will not leave this host and route is valid).
2420                            Luckily, this hack is good workaround.
2421                          */
2422
2423                         fl4->flowi4_oif = dev_out->ifindex;
2424                         goto make_route;
2425                 }
2426
2427                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2428                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2429                         if (!__ip_dev_find(net, fl4->saddr, false))
2430                                 goto out;
2431                 }
2432         }
2433
2434
2435         if (fl4->flowi4_oif) {
2436                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2437                 rth = ERR_PTR(-ENODEV);
2438                 if (!dev_out)
2439                         goto out;
2440
2441                 /* RACE: Check return value of inet_select_addr instead. */
2442                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2443                         rth = ERR_PTR(-ENETUNREACH);
2444                         goto out;
2445                 }
2446                 if (ipv4_is_local_multicast(fl4->daddr) ||
2447                     ipv4_is_lbcast(fl4->daddr) ||
2448                     fl4->flowi4_proto == IPPROTO_IGMP) {
2449                         if (!fl4->saddr)
2450                                 fl4->saddr = inet_select_addr(dev_out, 0,
2451                                                               RT_SCOPE_LINK);
2452                         goto make_route;
2453                 }
2454                 if (!fl4->saddr) {
2455                         if (ipv4_is_multicast(fl4->daddr))
2456                                 fl4->saddr = inet_select_addr(dev_out, 0,
2457                                                               fl4->flowi4_scope);
2458                         else if (!fl4->daddr)
2459                                 fl4->saddr = inet_select_addr(dev_out, 0,
2460                                                               RT_SCOPE_HOST);
2461                 }
2462         }
2463
2464         if (!fl4->daddr) {
2465                 fl4->daddr = fl4->saddr;
2466                 if (!fl4->daddr)
2467                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2468                 dev_out = net->loopback_dev;
2469                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2470                 res->type = RTN_LOCAL;
2471                 flags |= RTCF_LOCAL;
2472                 goto make_route;
2473         }
2474
2475         err = fib_lookup(net, fl4, res, 0);
2476         if (err) {
2477                 res->fi = NULL;
2478                 res->table = NULL;
2479                 if (fl4->flowi4_oif &&
2480                     (ipv4_is_multicast(fl4->daddr) ||
2481                     !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2482                         /* Apparently, routing tables are wrong. Assume,
2483                            that the destination is on link.
2484
2485                            WHY? DW.
2486                            Because we are allowed to send to iface
2487                            even if it has NO routes and NO assigned
2488                            addresses. When oif is specified, routing
2489                            tables are looked up with only one purpose:
2490                            to catch if destination is gatewayed, rather than
2491                            direct. Moreover, if MSG_DONTROUTE is set,
2492                            we send packet, ignoring both routing tables
2493                            and ifaddr state. --ANK
2494
2495
2496                            We could make it even if oif is unknown,
2497                            likely IPv6, but we do not.
2498                          */
2499
2500                         if (fl4->saddr == 0)
2501                                 fl4->saddr = inet_select_addr(dev_out, 0,
2502                                                               RT_SCOPE_LINK);
2503                         res->type = RTN_UNICAST;
2504                         goto make_route;
2505                 }
2506                 rth = ERR_PTR(err);
2507                 goto out;
2508         }
2509
2510         if (res->type == RTN_LOCAL) {
2511                 if (!fl4->saddr) {
2512                         if (res->fi->fib_prefsrc)
2513                                 fl4->saddr = res->fi->fib_prefsrc;
2514                         else
2515                                 fl4->saddr = fl4->daddr;
2516                 }
2517
2518                 /* L3 master device is the loopback for that domain */
2519                 dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2520                         net->loopback_dev;
2521
2522                 /* make sure orig_oif points to fib result device even
2523                  * though packet rx/tx happens over loopback or l3mdev
2524                  */
2525                 orig_oif = FIB_RES_OIF(*res);
2526
2527                 fl4->flowi4_oif = dev_out->ifindex;
2528                 flags |= RTCF_LOCAL;
2529                 goto make_route;
2530         }
2531
2532         fib_select_path(net, res, fl4, skb);
2533
2534         dev_out = FIB_RES_DEV(*res);
2535
2536 make_route:
2537         rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2538
2539 out:
2540         return rth;
2541 }
2542
2543 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2544 {
2545         return NULL;
2546 }
2547
2548 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2549 {
2550         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2551
2552         return mtu ? : dst->dev->mtu;
2553 }
2554
2555 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2556                                           struct sk_buff *skb, u32 mtu,
2557                                           bool confirm_neigh)
2558 {
2559 }
2560
/* ->redirect() hook of ipv4_dst_blackhole_ops: deliberately a no-op. */
static void ipv4_rt_blackhole_redirect(struct dst_entry *dst,
                                       struct sock *sk,
                                       struct sk_buff *skb)
{
        /* nothing to do */
}
2565
2566 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2567                                           unsigned long old)
2568 {
2569         return NULL;
2570 }
2571
2572 static struct dst_ops ipv4_dst_blackhole_ops = {
2573         .family                 =       AF_INET,
2574         .check                  =       ipv4_blackhole_dst_check,
2575         .mtu                    =       ipv4_blackhole_mtu,
2576         .default_advmss         =       ipv4_default_advmss,
2577         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2578         .redirect               =       ipv4_rt_blackhole_redirect,
2579         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2580         .neigh_lookup           =       ipv4_neigh_lookup,
2581 };
2582
2583 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2584 {
2585         struct rtable *ort = (struct rtable *) dst_orig;
2586         struct rtable *rt;
2587
2588         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2589         if (rt) {
2590                 struct dst_entry *new = &rt->dst;
2591
2592                 new->__use = 1;
2593                 new->input = dst_discard;
2594                 new->output = dst_discard_out;
2595
2596                 new->dev = net->loopback_dev;
2597                 if (new->dev)
2598                         dev_hold(new->dev);
2599
2600                 rt->rt_is_input = ort->rt_is_input;
2601                 rt->rt_iif = ort->rt_iif;
2602                 rt->rt_pmtu = ort->rt_pmtu;
2603                 rt->rt_mtu_locked = ort->rt_mtu_locked;
2604
2605                 rt->rt_genid = rt_genid_ipv4(net);
2606                 rt->rt_flags = ort->rt_flags;
2607                 rt->rt_type = ort->rt_type;
2608                 rt->rt_gateway = ort->rt_gateway;
2609                 rt->rt_uses_gateway = ort->rt_uses_gateway;
2610
2611                 INIT_LIST_HEAD(&rt->rt_uncached);
2612         }
2613
2614         dst_release(dst_orig);
2615
2616         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2617 }
2618
2619 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2620                                     const struct sock *sk)
2621 {
2622         struct rtable *rt = __ip_route_output_key(net, flp4);
2623
2624         if (IS_ERR(rt))
2625                 return rt;
2626
2627         if (flp4->flowi4_proto) {
2628                 flp4->flowi4_oif = rt->dst.dev->ifindex;
2629                 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2630                                                         flowi4_to_flowi(flp4),
2631                                                         sk, 0);
2632         }
2633
2634         return rt;
2635 }
2636 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2637
/*
 * Append an RTM_NEWROUTE netlink message to @skb describing the route
 * attached to that same skb (skb_rtable(skb)).
 *
 * @net:      network namespace (used for the multicast forwarding check)
 * @dst:      destination address reported in RTA_DST
 * @src:      source address; reported in RTA_SRC only when non-zero
 * @table_id: table id reported in RTA_TABLE (and rtm_table when < 256)
 * @fl4:      flow used for the lookup; supplies tos, saddr, mark and uid
 * @skb:      message buffer, which also carries the route being described
 * @portid:   netlink port id for the message header
 * @seq:      netlink sequence number for the message header
 *
 * Returns 0 on success.  If the header or any attribute does not fit,
 * the partially built message is cancelled and -EMSGSIZE is returned.
 *
 * called with rcu_read_lock held
 */
static int rt_fill_info(struct net *net,  __be32 dst, __be32 src, u32 table_id,
                        struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
                        u32 seq)
{
        struct rtable *rt = skb_rtable(skb);
        struct rtmsg *r;
        struct nlmsghdr *nlh;
        unsigned long expires = 0;
        u32 error;
        u32 metrics[RTAX_MAX];

        nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0);
        if (!nlh)
                return -EMSGSIZE;

        r = nlmsg_data(nlh);
        r->rtm_family    = AF_INET;
        r->rtm_dst_len  = 32;
        r->rtm_src_len  = 0;
        r->rtm_tos      = fl4->flowi4_tos;
        /* Ids of 256 and above are reported as RT_TABLE_COMPAT in the
         * header; the full id always goes out in RTA_TABLE.
         */
        r->rtm_table    = table_id < 256 ? table_id : RT_TABLE_COMPAT;
        if (nla_put_u32(skb, RTA_TABLE, table_id))
                goto nla_put_failure;
        r->rtm_type     = rt->rt_type;
        r->rtm_scope    = RT_SCOPE_UNIVERSE;
        r->rtm_protocol = RTPROT_UNSPEC;
        /* Keep only the bits above the low 16 of rt_flags and mark the
         * entry as cloned.
         */
        r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
        if (rt->rt_flags & RTCF_NOTIFY)
                r->rtm_flags |= RTM_F_NOTIFY;
        if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
                r->rtm_flags |= RTCF_DOREDIRECT;

        if (nla_put_in_addr(skb, RTA_DST, dst))
                goto nla_put_failure;
        if (src) {
                r->rtm_src_len = 32;
                if (nla_put_in_addr(skb, RTA_SRC, src))
                        goto nla_put_failure;
        }
        if (rt->dst.dev &&
            nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
                goto nla_put_failure;
#ifdef CONFIG_IP_ROUTE_CLASSID
        if (rt->dst.tclassid &&
            nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
                goto nla_put_failure;
#endif
        /* For output routes, report the source address held in the flow
         * when it differs from the one the caller passed in.
         */
        if (!rt_is_input_route(rt) &&
            fl4->saddr != src) {
                if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
                        goto nla_put_failure;
        }
        if (rt->rt_uses_gateway &&
            nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
                goto nla_put_failure;

        /* Turn the absolute expiry time into the remaining time in
         * jiffies; a deadline that has already passed becomes 0.
         */
        expires = rt->dst.expires;
        if (expires) {
                unsigned long now = jiffies;

                if (time_before(now, expires))
                        expires -= now;
                else
                        expires = 0;
        }

        /* Work on a private copy of the metrics so the learned PMTU and
         * its lock bit can be overlaid while they are still unexpired.
         */
        memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
        if (rt->rt_pmtu && expires)
                metrics[RTAX_MTU - 1] = rt->rt_pmtu;
        if (rt->rt_mtu_locked && expires)
                metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
        if (rtnetlink_put_metrics(skb, metrics) < 0)
                goto nla_put_failure;

        if (fl4->flowi4_mark &&
            nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
                goto nla_put_failure;

        /* The uid is translated into the caller's user namespace. */
        if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
            nla_put_u32(skb, RTA_UID,
                        from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
                goto nla_put_failure;

        error = rt->dst.error;

        if (rt_is_input_route(rt)) {
#ifdef CONFIG_IP_MROUTE
                /* Non-link-local multicast with forwarding enabled is
                 * described by ipmr_get_route().  A zero return ends this
                 * function at once, skipping the cacheinfo attribute and
                 * nlmsg_end() below.
                 * NOTE(review): presumably ipmr_get_route() has then
                 * finished or taken over the message - confirm.
                 */
                if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
                    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
                        int err = ipmr_get_route(net, skb,
                                                 fl4->saddr, fl4->daddr,
                                                 r, portid);

                        if (err <= 0) {
                                if (err == 0)
                                        return 0;
                                goto nla_put_failure;
                        }
                } else
#endif
                        if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
                                goto nla_put_failure;
        }

        if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
                goto nla_put_failure;

        nlmsg_end(skb, nlh);
        return 0;

nla_put_failure:
        nlmsg_cancel(skb, nlh);
        return -EMSGSIZE;
}
2753
/*
 * RTM_GETROUTE handler for PF_INET (registered in ip_rt_init()).
 *
 * Parses the request in @nlh, performs either an input-path lookup (when
 * RTA_IIF is given) or an output-path lookup on a freshly allocated dummy
 * skb, and unicasts the answer back to the requesting port.  With
 * RTM_F_FIB_MATCH the matching FIB entry is dumped via fib_dump_info();
 * otherwise the resolved route is described by rt_fill_info().
 *
 * @in_skb: request skb; supplies the socket/namespace and the reply port
 * @nlh:    netlink header of the request
 * @extack: extended ack, passed to nlmsg_parse()
 *
 * Returns the result of rtnl_unicast() on success or a negative errno.
 * The reply skb is freed on every failure after its allocation.
 */
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
                             struct netlink_ext_ack *extack)
{
        struct net *net = sock_net(in_skb->sk);
        struct rtmsg *rtm;
        struct nlattr *tb[RTA_MAX+1];
        struct fib_result res = {};
        struct rtable *rt = NULL;
        struct flowi4 fl4;
        __be32 dst = 0;
        __be32 src = 0;
        u32 iif;
        int err;
        /* NOTE(review): filled from nla_get_u32() and stored into u32
         * fields, yet declared int - confirm whether u32 was intended.
         */
        int mark;
        struct sk_buff *skb;
        u32 table_id = RT_TABLE_MAIN;
        kuid_t uid;

        err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy,
                          extack);
        if (err < 0)
                goto errout;

        rtm = nlmsg_data(nlh);

        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
        if (!skb) {
                err = -ENOBUFS;
                goto errout;
        }

        /* Reserve room for dummy headers, this skb can pass
           through good chunk of routing engine.
         */
        skb_reset_mac_header(skb);
        skb_reset_network_header(skb);

        /* Absent attributes default to 0. */
        src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
        dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
        iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
        mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
        /* Without RTA_UID: input lookups carry no uid, output lookups
         * use the uid of the calling task.
         */
        if (tb[RTA_UID])
                uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
        else
                uid = (iif ? INVALID_UID : current_uid());

        /* Bugfix: need to give ip_route_input enough of an IP header to
         * not gag.
         */
        /* Only these three header fields are written here. */
        ip_hdr(skb)->protocol = IPPROTO_UDP;
        ip_hdr(skb)->saddr = src;
        ip_hdr(skb)->daddr = dst;

        skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

        memset(&fl4, 0, sizeof(fl4));
        fl4.daddr = dst;
        fl4.saddr = src;
        fl4.flowi4_tos = rtm->rtm_tos & IPTOS_RT_MASK;
        fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
        fl4.flowi4_mark = mark;
        fl4.flowi4_uid = uid;

        /* From here on every failure must leave through errout_free,
         * which drops the RCU read lock and frees the reply skb.
         */
        rcu_read_lock();

        if (iif) {
                struct net_device *dev;

                dev = dev_get_by_index_rcu(net, iif);
                if (!dev) {
                        err = -ENODEV;
                        goto errout_free;
                }

                /* Make the dummy skb look like an IPv4 packet received
                 * on @dev before running the input route lookup.
                 */
                skb->protocol   = htons(ETH_P_IP);
                skb->dev        = dev;
                skb->mark       = mark;
                err = ip_route_input_rcu(skb, dst, src,
                                         rtm->rtm_tos & IPTOS_RT_MASK, dev,
                                         &res);

                /* A successful lookup may still yield an error route;
                 * report its error as the result of the request.
                 */
                rt = skb_rtable(skb);
                if (err == 0 && rt->dst.error)
                        err = -rt->dst.error;
        } else {
                fl4.flowi4_iif = LOOPBACK_IFINDEX;
                rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
                err = 0;
                if (IS_ERR(rt))
                        err = PTR_ERR(rt);
                else
                        skb_dst_set(skb, &rt->dst);
        }

        if (err)
                goto errout_free;

        if (rtm->rtm_flags & RTM_F_NOTIFY)
                rt->rt_flags |= RTCF_NOTIFY;

        /* Report the table recorded in the route instead of the
         * RT_TABLE_MAIN default.
         */
        if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
                table_id = rt->rt_table_id;

        if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
                /* No fib_info to dump: use the error for the result
                 * type, falling back to -EHOSTUNREACH when it is zero.
                 */
                if (!res.fi) {
                        err = fib_props[res.type].error;
                        if (!err)
                                err = -EHOSTUNREACH;
                        goto errout_free;
                }
                err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
                                    nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
                                    rt->rt_type, res.prefix, res.prefixlen,
                                    fl4.flowi4_tos, res.fi, 0);
        } else {
                err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
                                   NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
        }
        if (err < 0)
                goto errout_free;

        rcu_read_unlock();

        /* The skb is handed to rtnl_unicast() and is not freed here. */
        err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
        return err;

errout_free:
        rcu_read_unlock();
        kfree_skb(skb);
        goto errout;
}
2886
2887 void ip_rt_multicast_event(struct in_device *in_dev)
2888 {
2889         rt_cache_flush(dev_net(in_dev->dev));
2890 }
2891
2892 #ifdef CONFIG_SYSCTL
2893 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
2894 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
2895 static int ip_rt_gc_elasticity __read_mostly    = 8;
2896 static int ip_min_valid_pmtu __read_mostly      = IPV4_MIN_MTU;
2897
2898 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2899                                         void __user *buffer,
2900                                         size_t *lenp, loff_t *ppos)
2901 {
2902         struct net *net = (struct net *)__ctl->extra1;
2903
2904         if (write) {
2905                 rt_cache_flush(net);
2906                 fnhe_genid_bump(net);
2907                 return 0;
2908         }
2909
2910         return -EINVAL;
2911 }
2912
2913 static struct ctl_table ipv4_route_table[] = {
2914         {
2915                 .procname       = "gc_thresh",
2916                 .data           = &ipv4_dst_ops.gc_thresh,
2917                 .maxlen         = sizeof(int),
2918                 .mode           = 0644,
2919                 .proc_handler   = proc_dointvec,
2920         },
2921         {
2922                 .procname       = "max_size",
2923                 .data           = &ip_rt_max_size,
2924                 .maxlen         = sizeof(int),
2925                 .mode           = 0644,
2926                 .proc_handler   = proc_dointvec,
2927         },
2928         {
2929                 /*  Deprecated. Use gc_min_interval_ms */
2930
2931                 .procname       = "gc_min_interval",
2932                 .data           = &ip_rt_gc_min_interval,
2933                 .maxlen         = sizeof(int),
2934                 .mode           = 0644,
2935                 .proc_handler   = proc_dointvec_jiffies,
2936         },
2937         {
2938                 .procname       = "gc_min_interval_ms",
2939                 .data           = &ip_rt_gc_min_interval,
2940                 .maxlen         = sizeof(int),
2941                 .mode           = 0644,
2942                 .proc_handler   = proc_dointvec_ms_jiffies,
2943         },
2944         {
2945                 .procname       = "gc_timeout",
2946                 .data           = &ip_rt_gc_timeout,
2947                 .maxlen         = sizeof(int),
2948                 .mode           = 0644,
2949                 .proc_handler   = proc_dointvec_jiffies,
2950         },
2951         {
2952                 .procname       = "gc_interval",
2953                 .data           = &ip_rt_gc_interval,
2954                 .maxlen         = sizeof(int),
2955                 .mode           = 0644,
2956                 .proc_handler   = proc_dointvec_jiffies,
2957         },
2958         {
2959                 .procname       = "redirect_load",
2960                 .data           = &ip_rt_redirect_load,
2961                 .maxlen         = sizeof(int),
2962                 .mode           = 0644,
2963                 .proc_handler   = proc_dointvec,
2964         },
2965         {
2966                 .procname       = "redirect_number",
2967                 .data           = &ip_rt_redirect_number,
2968                 .maxlen         = sizeof(int),
2969                 .mode           = 0644,
2970                 .proc_handler   = proc_dointvec,
2971         },
2972         {
2973                 .procname       = "redirect_silence",
2974                 .data           = &ip_rt_redirect_silence,
2975                 .maxlen         = sizeof(int),
2976                 .mode           = 0644,
2977                 .proc_handler   = proc_dointvec,
2978         },
2979         {
2980                 .procname       = "error_cost",
2981                 .data           = &ip_rt_error_cost,
2982                 .maxlen         = sizeof(int),
2983                 .mode           = 0644,
2984                 .proc_handler   = proc_dointvec,
2985         },
2986         {
2987                 .procname       = "error_burst",
2988                 .data           = &ip_rt_error_burst,
2989                 .maxlen         = sizeof(int),
2990                 .mode           = 0644,
2991                 .proc_handler   = proc_dointvec,
2992         },
2993         {
2994                 .procname       = "gc_elasticity",
2995                 .data           = &ip_rt_gc_elasticity,
2996                 .maxlen         = sizeof(int),
2997                 .mode           = 0644,
2998                 .proc_handler   = proc_dointvec,
2999         },
3000         {
3001                 .procname       = "mtu_expires",
3002                 .data           = &ip_rt_mtu_expires,
3003                 .maxlen         = sizeof(int),
3004                 .mode           = 0644,
3005                 .proc_handler   = proc_dointvec_jiffies,
3006         },
3007         {
3008                 .procname       = "min_pmtu",
3009                 .data           = &ip_rt_min_pmtu,
3010                 .maxlen         = sizeof(int),
3011                 .mode           = 0644,
3012                 .proc_handler   = proc_dointvec_minmax,
3013                 .extra1         = &ip_min_valid_pmtu,
3014         },
3015         {
3016                 .procname       = "min_adv_mss",
3017                 .data           = &ip_rt_min_advmss,
3018                 .maxlen         = sizeof(int),
3019                 .mode           = 0644,
3020                 .proc_handler   = proc_dointvec,
3021         },
3022         { }
3023 };
3024
3025 static struct ctl_table ipv4_route_flush_table[] = {
3026         {
3027                 .procname       = "flush",
3028                 .maxlen         = sizeof(int),
3029                 .mode           = 0200,
3030                 .proc_handler   = ipv4_sysctl_rtcache_flush,
3031         },
3032         { },
3033 };
3034
3035 static __net_init int sysctl_route_net_init(struct net *net)
3036 {
3037         struct ctl_table *tbl;
3038
3039         tbl = ipv4_route_flush_table;
3040         if (!net_eq(net, &init_net)) {
3041                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3042                 if (!tbl)
3043                         goto err_dup;
3044
3045                 /* Don't export sysctls to unprivileged users */
3046                 if (net->user_ns != &init_user_ns)
3047                         tbl[0].procname = NULL;
3048         }
3049         tbl[0].extra1 = net;
3050
3051         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3052         if (!net->ipv4.route_hdr)
3053                 goto err_reg;
3054         return 0;
3055
3056 err_reg:
3057         if (tbl != ipv4_route_flush_table)
3058                 kfree(tbl);
3059 err_dup:
3060         return -ENOMEM;
3061 }
3062
3063 static __net_exit void sysctl_route_net_exit(struct net *net)
3064 {
3065         struct ctl_table *tbl;
3066
3067         tbl = net->ipv4.route_hdr->ctl_table_arg;
3068         unregister_net_sysctl_table(net->ipv4.route_hdr);
3069         BUG_ON(tbl == ipv4_route_flush_table);
3070         kfree(tbl);
3071 }
3072
3073 static __net_initdata struct pernet_operations sysctl_route_ops = {
3074         .init = sysctl_route_net_init,
3075         .exit = sysctl_route_net_exit,
3076 };
3077 #endif
3078
3079 static __net_init int rt_genid_init(struct net *net)
3080 {
3081         atomic_set(&net->ipv4.rt_genid, 0);
3082         atomic_set(&net->fnhe_genid, 0);
3083         atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
3084         return 0;
3085 }
3086
3087 static __net_initdata struct pernet_operations rt_genid_ops = {
3088         .init = rt_genid_init,
3089 };
3090
3091 static int __net_init ipv4_inetpeer_init(struct net *net)
3092 {
3093         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3094
3095         if (!bp)
3096                 return -ENOMEM;
3097         inet_peer_base_init(bp);
3098         net->ipv4.peers = bp;
3099         return 0;
3100 }
3101
3102 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3103 {
3104         struct inet_peer_base *bp = net->ipv4.peers;
3105
3106         net->ipv4.peers = NULL;
3107         inetpeer_invalidate_tree(bp);
3108         kfree(bp);
3109 }
3110
3111 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3112         .init   =       ipv4_inetpeer_init,
3113         .exit   =       ipv4_inetpeer_exit,
3114 };
3115
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Per-cpu accounting area; ip_rt_init() allocates 256 entries for it. */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
3119
/*
 * Boot-time initialisation of the IPv4 routing layer.
 *
 * Allocates the IP ident/timestamp hash, initialises the per-cpu
 * uncached route lists, creates the dst slab cache and entry counters,
 * brings up devinet, the FIB, the proc files and (if configured) xfrm,
 * registers the RTM_GETROUTE handler and finally the pernet subsystems
 * defined in this file.
 *
 * Always returns 0; the allocation failures checked here panic instead.
 */
int __init ip_rt_init(void)
{
        void *idents_hash;
        int rc = 0;
        int cpu;

        /* For modern hosts, this will use 2 MB of memory */
        /* One allocation holds both arrays: each bucket is sized for one
         * ip_idents element plus one ip_tstamps element.
         */
        idents_hash = alloc_large_system_hash("IP idents",
                                              sizeof(*ip_idents) + sizeof(*ip_tstamps),
                                              0,
                                              16, /* one bucket per 64 KB */
                                              HASH_ZERO,
                                              NULL,
                                              &ip_idents_mask,
                                              2048,
                                              256*1024);

        /* ip_idents occupies the start of the allocation ... */
        ip_idents = idents_hash;

        /* ... and is seeded with pseudo-random bytes ... */
        prandom_bytes(ip_idents, (ip_idents_mask + 1) * sizeof(*ip_idents));

        /* ... while ip_tstamps starts right after the last ident. */
        ip_tstamps = idents_hash + (ip_idents_mask + 1) * sizeof(*ip_idents);

        for_each_possible_cpu(cpu) {
                struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

                INIT_LIST_HEAD(&ul->head);
                spin_lock_init(&ul->lock);
        }
#ifdef CONFIG_IP_ROUTE_CLASSID
        ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
        if (!ip_rt_acct)
                panic("IP: failed to allocate ip_rt_acct\n");
#endif

        /* SLAB_PANIC is passed, so the result is not checked here. */
        ipv4_dst_ops.kmem_cachep =
                kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
                                  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

        /* Blackhole routes share the slab cache of regular routes. */
        ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

        if (dst_entries_init(&ipv4_dst_ops) < 0)
                panic("IP: failed to allocate ipv4_dst_ops counter\n");

        if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
                panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

        /* Start with the largest representable threshold and size. */
        ipv4_dst_ops.gc_thresh = ~0;
        ip_rt_max_size = INT_MAX;

        devinet_init();
        ip_fib_init();

        /* Failure to create the proc files is logged but not fatal. */
        if (ip_rt_proc_init())
                pr_err("Unable to create route proc files\n");
#ifdef CONFIG_XFRM
        xfrm_init();
        xfrm4_init();
#endif
        rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
                      RTNL_FLAG_DOIT_UNLOCKED);

        /* NOTE(review): the return values of the registrations below are
         * ignored, so rc is still 0 at the return.
         */
#ifdef CONFIG_SYSCTL
        register_pernet_subsys(&sysctl_route_ops);
#endif
        register_pernet_subsys(&rt_genid_ops);
        register_pernet_subsys(&ipv4_inetpeer_ops);
        return rc;
}
3189
3190 #ifdef CONFIG_SYSCTL
3191 /*
3192  * We really need to sanitize the damn ipv4 init order, then all
3193  * this nonsense will go away.
3194  */
3195 void __init ip_static_sysctl_init(void)
3196 {
3197         register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3198 }
3199 #endif