GNU Linux-libre 4.4.284-gnu1
[releases.git] / net / ipv4 / ip_tunnel.c
1 /*
2  * Copyright (c) 2013 Nicira, Inc.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of version 2 of the GNU General Public
6  * License as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful, but
9  * WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public License
14  * along with this program; if not, write to the Free Software
15  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16  * 02110-1301, USA
17  */
18
19 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
20
21 #include <linux/capability.h>
22 #include <linux/module.h>
23 #include <linux/types.h>
24 #include <linux/kernel.h>
25 #include <linux/slab.h>
26 #include <linux/uaccess.h>
27 #include <linux/skbuff.h>
28 #include <linux/netdevice.h>
29 #include <linux/in.h>
30 #include <linux/tcp.h>
31 #include <linux/udp.h>
32 #include <linux/if_arp.h>
33 #include <linux/mroute.h>
34 #include <linux/init.h>
35 #include <linux/in6.h>
36 #include <linux/inetdevice.h>
37 #include <linux/igmp.h>
38 #include <linux/netfilter_ipv4.h>
39 #include <linux/etherdevice.h>
40 #include <linux/if_ether.h>
41 #include <linux/if_vlan.h>
42 #include <linux/rculist.h>
43 #include <linux/err.h>
44
45 #include <net/sock.h>
46 #include <net/ip.h>
47 #include <net/icmp.h>
48 #include <net/protocol.h>
49 #include <net/ip_tunnels.h>
50 #include <net/arp.h>
51 #include <net/checksum.h>
52 #include <net/dsfield.h>
53 #include <net/inet_ecn.h>
54 #include <net/xfrm.h>
55 #include <net/net_namespace.h>
56 #include <net/netns/generic.h>
57 #include <net/rtnetlink.h>
58 #include <net/udp.h>
59
60 #if IS_ENABLED(CONFIG_IPV6)
61 #include <net/ipv6.h>
62 #include <net/ip6_fib.h>
63 #include <net/ip6_route.h>
64 #endif
65
66 static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
67 {
68         return hash_32((__force u32)key ^ (__force u32)remote,
69                          IP_TNL_HASH_BITS);
70 }
71
72 static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
73                                 __be16 flags, __be32 key)
74 {
75         if (p->i_flags & TUNNEL_KEY) {
76                 if (flags & TUNNEL_KEY)
77                         return key == p->i_key;
78                 else
79                         /* key expected, none present */
80                         return false;
81         } else
82                 return !(flags & TUNNEL_KEY);
83 }
84
/* Fallback tunnel: no source, no destination, no key, no options

   Tunnel hash table:
   We require an exact key match, i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if it is not present,
   it will match only a keyless tunnel.

   All keyless packets, if they do not match a configured keyless tunnel,
   will match the fallback tunnel.
   Given src, dst and key, find the appropriate tunnel for input.
*/
/* Find the tunnel best matching an incoming packet, given its source,
 * destination, key and arrival link.  Preference order:
 *   1. exact (remote, local, key) match,
 *   2. (remote, key) match with a wildcard (zero) local address,
 *   3. (local, key) match with a wildcard remote, incl. multicast local,
 *   4. key-only match with both addresses wildcarded,
 * preferring within each class an exact link match over any other
 * candidate.  Falls back to the collect_md tunnel, then the fallback
 * device (if up); returns NULL if nothing matches.
 *
 * Runs under RCU: the caller must hold rcu_read_lock().
 */
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
				   int link, __be16 flags,
				   __be32 remote, __be32 local,
				   __be32 key)
{
	struct ip_tunnel *t, *cand = NULL;
	struct hlist_head *head;
	struct net_device *ndev;
	unsigned int hash;

	hash = ip_tunnel_hash(key, remote);
	head = &itn->tunnels[hash];

	/* Pass 1: fully specified tunnels (both addresses match). */
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else
			cand = t;
	}

	/* Pass 2: tunnels with a wildcard local address. */
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (remote != t->parms.iph.daddr ||
		    t->parms.iph.saddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	/* Remaining passes hash with a wildcard (zero) remote address. */
	hash = ip_tunnel_hash(key, 0);
	head = &itn->tunnels[hash];

	/* Pass 3: our local address matches the tunnel's saddr, or the
	 * packet was sent to a multicast address the tunnel points at.
	 */
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
		    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
			continue;

		if (!(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	/* Pass 4: fully wildcarded tunnels, matched on the key alone. */
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((!(flags & TUNNEL_NO_KEY) && t->parms.i_key != key) ||
		    t->parms.iph.saddr != 0 ||
		    t->parms.iph.daddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	if (cand)
		return cand;

	/* A collect_md (metadata mode) tunnel accepts any packet. */
	t = rcu_dereference(itn->collect_md_tun);
	if (t)
		return t;

	/* READ_ONCE() because the fallback device pointer can change
	 * concurrently — presumably cleared with WRITE_ONCE() during
	 * netns teardown (writer not visible in this file; confirm).
	 */
	ndev = READ_ONCE(itn->fb_tunnel_dev);
	if (ndev && ndev->flags & IFF_UP)
		return netdev_priv(ndev);

	return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
186
187 static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
188                                     struct ip_tunnel_parm *parms)
189 {
190         unsigned int h;
191         __be32 remote;
192         __be32 i_key = parms->i_key;
193
194         if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
195                 remote = parms->iph.daddr;
196         else
197                 remote = 0;
198
199         if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
200                 i_key = 0;
201
202         h = ip_tunnel_hash(i_key, remote);
203         return &itn->tunnels[h];
204 }
205
/* Insert tunnel @t into its hash bucket and, for metadata-mode tunnels,
 * publish it as the per-netns collect_md tunnel (RCU-visible).
 */
static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	struct hlist_head *head = ip_bucket(itn, &t->parms);

	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, t);
	hlist_add_head_rcu(&t->hash_node, head);
}
214
/* Remove tunnel @t from the hash table and retract its collect_md
 * publication if it had one.  RCU readers may still see @t until a
 * grace period elapses, so the caller must not free it immediately.
 */
static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, NULL);
	hlist_del_init_rcu(&t->hash_node);
}
221
222 static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
223                                         struct ip_tunnel_parm *parms,
224                                         int type)
225 {
226         __be32 remote = parms->iph.daddr;
227         __be32 local = parms->iph.saddr;
228         __be32 key = parms->i_key;
229         __be16 flags = parms->i_flags;
230         int link = parms->link;
231         struct ip_tunnel *t = NULL;
232         struct hlist_head *head = ip_bucket(itn, parms);
233
234         hlist_for_each_entry_rcu(t, head, hash_node) {
235                 if (local == t->parms.iph.saddr &&
236                     remote == t->parms.iph.daddr &&
237                     link == t->parms.link &&
238                     type == t->dev->type &&
239                     ip_tunnel_key_match(&t->parms, flags, key))
240                         break;
241         }
242         return t;
243 }
244
/* Allocate and register a new tunnel net_device in @net.
 *
 * The device name comes from @parms->name when set (validated first);
 * otherwise "<kind>%d" is used and the kernel picks a free index.
 * Returns the registered device, or an ERR_PTR() on failure.
 * The caller must hold the RTNL lock.
 */
static struct net_device *__ip_tunnel_create(struct net *net,
					     const struct rtnl_link_ops *ops,
					     struct ip_tunnel_parm *parms)
{
	int err;
	struct ip_tunnel *tunnel;
	struct net_device *dev;
	char name[IFNAMSIZ];

	err = -E2BIG;
	if (parms->name[0]) {
		if (!dev_valid_name(parms->name))
			goto failed;
		strlcpy(name, parms->name, IFNAMSIZ);
	} else {
		/* Leave room for "%d" plus the terminating NUL. */
		if (strlen(ops->kind) > (IFNAMSIZ - 3))
			goto failed;
		strcpy(name, ops->kind);
		strcat(name, "%d");
	}

	ASSERT_RTNL();
	dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
	if (!dev) {
		err = -ENOMEM;
		goto failed;
	}
	dev_net_set(dev, net);

	dev->rtnl_link_ops = ops;

	tunnel = netdev_priv(dev);
	tunnel->parms = *parms;
	tunnel->net = net;

	err = register_netdevice(dev);
	if (err)
		goto failed_free;

	return dev;

failed_free:
	free_netdev(dev);
failed:
	return ERR_PTR(err);
}
291
292 static inline void init_tunnel_flow(struct flowi4 *fl4,
293                                     int proto,
294                                     __be32 daddr, __be32 saddr,
295                                     __be32 key, __u8 tos, int oif)
296 {
297         memset(fl4, 0, sizeof(*fl4));
298         fl4->flowi4_oif = oif;
299         fl4->daddr = daddr;
300         fl4->saddr = saddr;
301         fl4->flowi4_tos = tos;
302         fl4->flowi4_proto = proto;
303         fl4->fl4_gre_key = key;
304 }
305
/* Guess the underlying output device for the tunnel and derive a
 * suitable MTU and needed_headroom from it.  Returns the MTU the
 * tunnel device should use (never below IPV4_MIN_MTU).
 */
static int ip_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);

	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */
	if (iph->daddr) {
		struct flowi4 fl4;
		struct rtable *rt;

		init_tunnel_flow(&fl4, iph->protocol, iph->daddr,
				 iph->saddr, tunnel->parms.o_key,
				 RT_TOS(iph->tos), tunnel->parms.link);
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			ip_rt_put(rt);
		}
		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;

		/* Parameters may have changed: drop any cached route. */
		dst_cache_reset(&tunnel->dst_cache);
	}

	/* No route found: fall back to the explicitly bound link, if any. */
	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = tdev->mtu;
	}

	dev->needed_headroom = t_hlen + hlen;
	mtu -= (dev->hard_header_len + t_hlen);

	if (mtu < IPV4_MIN_MTU)
		mtu = IPV4_MIN_MTU;

	return mtu;
}
353
354 static struct ip_tunnel *ip_tunnel_create(struct net *net,
355                                           struct ip_tunnel_net *itn,
356                                           struct ip_tunnel_parm *parms)
357 {
358         struct ip_tunnel *nt;
359         struct net_device *dev;
360
361         BUG_ON(!itn->fb_tunnel_dev);
362         dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
363         if (IS_ERR(dev))
364                 return ERR_CAST(dev);
365
366         dev->mtu = ip_tunnel_bind_dev(dev);
367
368         nt = netdev_priv(dev);
369         ip_tunnel_add(itn, nt);
370         return nt;
371 }
372
/* Common receive path for IPv4 tunnels: validate checksum/sequence
 * tunnel flags against the tunnel configuration, decapsulate ECN,
 * update per-CPU stats and hand the packet to the tunnel device's GRO
 * cell.  Always consumes @skb and returns 0.
 */
int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
		  const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
		  bool log_ecn_error)
{
	struct pcpu_sw_netstats *tstats;
	const struct iphdr *iph = ip_hdr(skb);
	int err;

#ifdef CONFIG_NET_IPGRE_BROADCAST
	if (ipv4_is_multicast(iph->daddr)) {
		tunnel->dev->stats.multicast++;
		skb->pkt_type = PACKET_BROADCAST;
	}
#endif

	/* Packet and tunnel must agree on whether a checksum is used. */
	if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
	     ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
		tunnel->dev->stats.rx_crc_errors++;
		tunnel->dev->stats.rx_errors++;
		goto drop;
	}

	/* When the tunnel enforces sequencing, drop packets that carry
	 * no sequence number or that arrive out of order (serial-number
	 * comparison via the signed 32-bit difference).
	 */
	if (tunnel->parms.i_flags&TUNNEL_SEQ) {
		if (!(tpi->flags&TUNNEL_SEQ) ||
		    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
			tunnel->dev->stats.rx_fifo_errors++;
			tunnel->dev->stats.rx_errors++;
			goto drop;
		}
		tunnel->i_seqno = ntohl(tpi->seq) + 1;
	}

	skb_reset_network_header(skb);

	/* err > 1 means the outer/inner ECN combination is invalid. */
	err = IP_ECN_decapsulate(iph, skb);
	if (unlikely(err)) {
		if (log_ecn_error)
			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
					&iph->saddr, iph->tos);
		if (err > 1) {
			++tunnel->dev->stats.rx_frame_errors;
			++tunnel->dev->stats.rx_errors;
			goto drop;
		}
	}

	tstats = this_cpu_ptr(tunnel->dev->tstats);
	u64_stats_update_begin(&tstats->syncp);
	tstats->rx_packets++;
	tstats->rx_bytes += skb->len;
	u64_stats_update_end(&tstats->syncp);

	/* Scrub packet state when crossing network namespaces. */
	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));

	if (tunnel->dev->type == ARPHRD_ETHER) {
		skb->protocol = eth_type_trans(skb, tunnel->dev);
		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
	} else {
		skb->dev = tunnel->dev;
	}

	if (tun_dst)
		skb_dst_set(skb, (struct dst_entry *)tun_dst);

	gro_cells_receive(&tunnel->gro_cells, skb);
	return 0;

drop:
	kfree_skb(skb);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
445
446 static int ip_encap_hlen(struct ip_tunnel_encap *e)
447 {
448         const struct ip_tunnel_encap_ops *ops;
449         int hlen = -EINVAL;
450
451         if (e->type == TUNNEL_ENCAP_NONE)
452                 return 0;
453
454         if (e->type >= MAX_IPTUN_ENCAP_OPS)
455                 return -EINVAL;
456
457         rcu_read_lock();
458         ops = rcu_dereference(iptun_encaps[e->type]);
459         if (likely(ops && ops->encap_hlen))
460                 hlen = ops->encap_hlen(e);
461         rcu_read_unlock();
462
463         return hlen;
464 }
465
/* Table of registered tunnel-encapsulation handlers, indexed by encap
 * type; entries are published/retired with cmpxchg() (see the add/del
 * ops below) and read under RCU.
 */
const struct ip_tunnel_encap_ops __rcu *
		iptun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly;
468
/* Register encapsulation handler @ops in slot @num.
 * Returns 0 on success, -ERANGE for an invalid slot, or -1 when the
 * slot is already occupied (cmpxchg() ensures only one writer wins).
 */
int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	if (num >= MAX_IPTUN_ENCAP_OPS)
		return -ERANGE;

	return !cmpxchg((const struct ip_tunnel_encap_ops **)
			&iptun_encaps[num],
			NULL, ops) ? 0 : -1;
}
EXPORT_SYMBOL(ip_tunnel_encap_add_ops);
480
/* Unregister encapsulation handler @ops from slot @num.
 * Returns 0 on success, -ERANGE for an invalid slot, or -1 when the
 * slot did not hold @ops.  synchronize_net() ensures no RCU reader
 * still uses the old ops before the caller may release them.
 */
int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	int ret;

	if (num >= MAX_IPTUN_ENCAP_OPS)
		return -ERANGE;

	ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
		       &iptun_encaps[num],
		       ops, NULL) == ops) ? 0 : -1;

	synchronize_net();

	return ret;
}
EXPORT_SYMBOL(ip_tunnel_encap_del_ops);
498
/* Copy encapsulation parameters @ipencap into tunnel @t and recompute
 * the cached encap/total header lengths.  Returns 0, or the negative
 * errno from ip_encap_hlen() for an invalid encapsulation type.
 */
int ip_tunnel_encap_setup(struct ip_tunnel *t,
			  struct ip_tunnel_encap *ipencap)
{
	int hlen;

	memset(&t->encap, 0, sizeof(t->encap));

	hlen = ip_encap_hlen(ipencap);
	if (hlen < 0)
		return hlen;

	t->encap.type = ipencap->type;
	t->encap.sport = ipencap->sport;
	t->encap.dport = ipencap->dport;
	t->encap.flags = ipencap->flags;

	t->encap_hlen = hlen;
	t->hlen = t->encap_hlen + t->tun_hlen;

	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
521
/* Let the configured encapsulation handler build its header on @skb,
 * possibly updating *@protocol and the outer flow @fl4.  Returns 0
 * when no encapsulation is configured, the handler's result, or
 * -EINVAL for an unknown/unhandled encapsulation type.
 */
int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel *t,
		    u8 *protocol, struct flowi4 *fl4)
{
	const struct ip_tunnel_encap_ops *ops;
	int ret = -EINVAL;

	if (t->encap.type == TUNNEL_ENCAP_NONE)
		return 0;

	if (t->encap.type >= MAX_IPTUN_ENCAP_OPS)
		return -EINVAL;

	rcu_read_lock();
	ops = rcu_dereference(iptun_encaps[t->encap.type]);
	if (likely(ops && ops->build_header))
		ret = ops->build_header(skb, &t->encap, protocol, fl4);
	rcu_read_unlock();

	return ret;
}
EXPORT_SYMBOL(ip_tunnel_encap);
543
/* Check the packet against the path MTU of the tunnel route and
 * propagate MTU changes to the inner flow: update the inner dst's PMTU
 * and send the appropriate ICMP (v4 frag-needed / v6 packet-too-big)
 * back when the packet does not fit.  Returns 0 if the packet may be
 * sent, or -E2BIG if it must be rejected.
 */
static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
			    struct rtable *rt, __be16 df,
			    const struct iphdr *inner_iph)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
	int mtu;

	/* With DF set, the usable MTU is the outer path MTU minus the
	 * tunnel overhead; otherwise the inner dst (or device) MTU.
	 */
	if (df)
		mtu = dst_mtu(&rt->dst) - dev->hard_header_len
					- sizeof(struct iphdr) - tunnel->hlen;
	else
		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

	if (skb_dst(skb))
		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		/* Only non-GSO packets with DF set elicit frag-needed. */
		if (!skb_is_gso(skb) &&
		    (inner_iph->frag_off & htons(IP_DF)) &&
		    mtu < pkt_size) {
			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			return -E2BIG;
		}
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);

		/* Record the lowered MTU on host routes, or when the
		 * tunnel has a fixed unicast destination.
		 */
		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
			   mtu >= IPV6_MIN_MTU) {
			if ((tunnel->parms.iph.daddr &&
			    !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
			}
		}

		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
					mtu < pkt_size) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			return -E2BIG;
		}
	}
#endif
	return 0;
}
593
/* Common transmit path for IPv4 tunnels: resolve the outer destination
 * (for NBMA tunnels it comes from the inner packet or neighbour entry),
 * route the outer flow, enforce path MTU, then emit the encapsulated
 * packet via iptunnel_xmit().  Always consumes @skb; errors are
 * accounted in the device stats rather than returned.
 */
void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		    const struct iphdr *tnl_params, u8 protocol)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	unsigned int inner_nhdr_len = 0;
	const struct iphdr *inner_iph;
	struct flowi4 fl4;
	u8     tos, ttl;
	__be16 df;
	struct rtable *rt;		/* Route to the other host */
	unsigned int max_headroom;	/* The extra header space needed */
	__be32 dst;
	int err;
	bool connected;

	/* ensure we can access the inner net header, for several users below */
	if (skb->protocol == htons(ETH_P_IP))
		inner_nhdr_len = sizeof(struct iphdr);
	else if (skb->protocol == htons(ETH_P_IPV6))
		inner_nhdr_len = sizeof(struct ipv6hdr);
	if (unlikely(!pskb_may_pull(skb, inner_nhdr_len)))
		goto tx_error;

	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	/* "connected" tunnels (fixed daddr) can use the cached route. */
	connected = (tunnel->parms.iph.daddr != 0);

	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

	dst = tnl_params->daddr;
	if (dst == 0) {
		/* NBMA tunnel: derive the outer destination per packet. */

		if (!skb_dst(skb)) {
			dev->stats.tx_fifo_errors++;
			goto tx_error;
		}

		if (skb->protocol == htons(ETH_P_IP)) {
			rt = skb_rtable(skb);
			dst = rt_nexthop(rt, inner_iph->daddr);
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			const struct in6_addr *addr6;
			struct neighbour *neigh;
			bool do_tx_error_icmp;
			int addr_type;

			neigh = dst_neigh_lookup(skb_dst(skb),
						 &ipv6_hdr(skb)->daddr);
			if (!neigh)
				goto tx_error;

			addr6 = (const struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			/* Only v4-compatible v6 addresses carry an
			 * embedded IPv4 destination we can tunnel to.
			 */
			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				do_tx_error_icmp = true;
			else {
				do_tx_error_icmp = false;
				dst = addr6->s6_addr32[3];
			}
			neigh_release(neigh);
			if (do_tx_error_icmp)
				goto tx_error_icmp;
		}
#endif
		else
			goto tx_error;

		connected = false;
	}

	/* The TOS LSB means "inherit TOS from the inner packet". */
	tos = tnl_params->tos;
	if (tos & 0x1) {
		tos &= ~0x1;
		if (skb->protocol == htons(ETH_P_IP)) {
			tos = inner_iph->tos;
			connected = false;
		} else if (skb->protocol == htons(ETH_P_IPV6)) {
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
			connected = false;
		}
	}

	init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
			 tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link);

	if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
		goto tx_error;

	/* Only connected tunnels may reuse the per-tunnel route cache. */
	rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache, &fl4.saddr) :
			 NULL;

	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (IS_ERR(rt)) {
			dev->stats.tx_carrier_errors++;
			goto tx_error;
		}
		if (connected)
			dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
					  fl4.saddr);
	}

	/* Routing back onto ourselves would recurse: drop. */
	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	df = tnl_params->frag_off;
	if (skb->protocol == htons(ETH_P_IP))
		df |= (inner_iph->frag_off & htons(IP_DF));

	if (tnl_update_pmtu(dev, skb, rt, df, inner_iph)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	/* After an error, report link failure for a grace period
	 * (IPTUNNEL_ERR_TIMEO) before trying the path again.
	 */
	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;

			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = tnl_params->ttl;
	if (ttl == 0) {
		/* TTL 0 means "inherit" from the inner packet/route. */
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
			+ rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
	if (max_headroom > dev->needed_headroom)
		dev->needed_headroom = max_headroom;

	if (skb_cow_head(skb, dev->needed_headroom)) {
		ip_rt_put(rt);
		dev->stats.tx_dropped++;
		kfree_skb(skb);
		return;
	}

	err = iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol,
			    tos, ttl, df, !net_eq(tunnel->net, dev_net(dev)));
	iptunnel_xmit_stats(err, &dev->stats, dev->tstats);

	return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
	dst_link_failure(skb);
#endif
tx_error:
	dev->stats.tx_errors++;
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
770
/* Re-configure an existing tunnel @t from userspace parameters @p.
 * The tunnel is removed and re-inserted because addresses/keys feed the
 * hash; non-Ethernet devices also refresh their link-layer addresses.
 * A link change triggers rebinding (optionally updating the MTU), and
 * the cached route is always invalidated.
 */
static void ip_tunnel_update(struct ip_tunnel_net *itn,
			     struct ip_tunnel *t,
			     struct net_device *dev,
			     struct ip_tunnel_parm *p,
			     bool set_mtu)
{
	ip_tunnel_del(itn, t);
	t->parms.iph.saddr = p->iph.saddr;
	t->parms.iph.daddr = p->iph.daddr;
	t->parms.i_key = p->i_key;
	t->parms.o_key = p->o_key;
	if (dev->type != ARPHRD_ETHER) {
		memcpy(dev->dev_addr, &p->iph.saddr, 4);
		memcpy(dev->broadcast, &p->iph.daddr, 4);
	}
	ip_tunnel_add(itn, t);

	t->parms.iph.ttl = p->iph.ttl;
	t->parms.iph.tos = p->iph.tos;
	t->parms.iph.frag_off = p->iph.frag_off;

	if (t->parms.link != p->link) {
		int mtu;

		t->parms.link = p->link;
		mtu = ip_tunnel_bind_dev(dev);
		if (set_mtu)
			dev->mtu = mtu;
	}
	dst_cache_reset(&t->dst_cache);
	netdev_state_change(dev);
}
803
/* Handle the classic tunnel ioctls (SIOC{GET,ADD,CHG,DEL}TUNNEL) for
 * tunnel device @dev, operating on the already-copied-in parameter
 * block @p.  Returns 0 or a negative errno.
 */
int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
{
	int err = 0;
	struct ip_tunnel *t = netdev_priv(dev);
	struct net *net = t->net;
	struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);

	BUG_ON(!itn->fb_tunnel_dev);
	switch (cmd) {
	case SIOCGETTUNNEL:
		/* On the fallback device, look up the tunnel matching @p;
		 * on any other device, report its own parameters.
		 */
		if (dev == itn->fb_tunnel_dev) {
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				t = netdev_priv(dev);
		}
		memcpy(p, &t->parms, sizeof(*p));
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;
		/* A fixed outer TTL implies DF on the outer header. */
		if (p->iph.ttl)
			p->iph.frag_off |= htons(IP_DF);
		if (!(p->i_flags & VTI_ISVTI)) {
			/* Drop stale key values when keying is disabled. */
			if (!(p->i_flags & TUNNEL_KEY))
				p->i_key = 0;
			if (!(p->o_flags & TUNNEL_KEY))
				p->o_key = 0;
		}

		t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);

		if (cmd == SIOCADDTUNNEL) {
			if (!t) {
				t = ip_tunnel_create(net, itn, p);
				err = PTR_ERR_OR_ZERO(t);
				break;
			}

			err = -EEXIST;
			break;
		}
		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t) {
				/* @p matches some other device: refuse. */
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				/* Changing addresses must not flip the
				 * device between broadcast and
				 * point-to-point mode.
				 */
				unsigned int nflags = 0;

				if (ipv4_is_multicast(p->iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p->iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}

				t = netdev_priv(dev);
			}
		}

		if (t) {
			err = 0;
			ip_tunnel_update(itn, t, dev, p, true);
		} else {
			err = -ENOENT;
		}
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;

		if (dev == itn->fb_tunnel_dev) {
			/* Deleting via the fallback device removes the
			 * tunnel matching @p — never the fallback itself.
			 */
			err = -ENOENT;
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				goto done;
			err = -EPERM;
			if (t == netdev_priv(itn->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
906
907 int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
908 {
909         struct ip_tunnel *tunnel = netdev_priv(dev);
910         int t_hlen = tunnel->hlen + sizeof(struct iphdr);
911         int max_mtu = 0xFFF8 - dev->hard_header_len - t_hlen;
912
913         if (new_mtu < 68)
914                 return -EINVAL;
915
916         if (new_mtu > max_mtu) {
917                 if (strict)
918                         return -EINVAL;
919
920                 new_mtu = max_mtu;
921         }
922
923         dev->mtu = new_mtu;
924         return 0;
925 }
926 EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);
927
928 int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
929 {
930         return __ip_tunnel_change_mtu(dev, new_mtu, true);
931 }
932 EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
933
/* Final destructor for a tunnel netdevice (installed as dev->destructor).
 * Tears down in reverse order of ip_tunnel_init(): GRO cells, the
 * per-tunnel dst cache, the per-CPU stats, and finally the netdev itself.
 */
static void ip_tunnel_dev_free(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	gro_cells_destroy(&tunnel->gro_cells);
	dst_cache_destroy(&tunnel->dst_cache);
	free_percpu(dev->tstats);
	free_netdev(dev);
}
943
944 void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
945 {
946         struct ip_tunnel *tunnel = netdev_priv(dev);
947         struct ip_tunnel_net *itn;
948
949         itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);
950
951         if (itn->fb_tunnel_dev != dev) {
952                 ip_tunnel_del(itn, netdev_priv(dev));
953                 unregister_netdevice_queue(dev, head);
954         }
955 }
956 EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
957
958 struct net *ip_tunnel_get_link_net(const struct net_device *dev)
959 {
960         struct ip_tunnel *tunnel = netdev_priv(dev);
961
962         return tunnel->net;
963 }
964 EXPORT_SYMBOL(ip_tunnel_get_link_net);
965
966 int ip_tunnel_get_iflink(const struct net_device *dev)
967 {
968         struct ip_tunnel *tunnel = netdev_priv(dev);
969
970         return tunnel->parms.link;
971 }
972 EXPORT_SYMBOL(ip_tunnel_get_iflink);
973
/* ip_tunnel_init_net - per-netns init shared by the tunnel drivers.
 * @net: the network namespace being set up
 * @ip_tnl_net_id: pernet id of the calling tunnel type
 * @ops: rtnl_link_ops of the type, or NULL if it has no fallback device
 * @devname: name for the fallback device (e.g. "gre0"), may be NULL
 *
 * Initializes the per-netns tunnel hash and, when @ops is given, creates
 * the type's single fallback device under RTNL.  Returns 0 or a negative
 * errno from fallback-device creation.
 */
int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
				  struct rtnl_link_ops *ops, char *devname)
{
	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
	struct ip_tunnel_parm parms;
	unsigned int i;

	/* Start with empty hash buckets; tunnels are added as created. */
	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
		INIT_HLIST_HEAD(&itn->tunnels[i]);

	/* No rtnl ops: this tunnel type has no fallback device. */
	if (!ops) {
		itn->fb_tunnel_dev = NULL;
		return 0;
	}

	/* Fallback device uses all-zero parameters except the name. */
	memset(&parms, 0, sizeof(parms));
	if (devname)
		strlcpy(parms.name, devname, IFNAMSIZ);

	rtnl_lock();
	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
	/* FB netdevice is special: we have one, and only one per netns.
	 * Allowing to move it to another netns is clearly unsafe.
	 */
	if (!IS_ERR(itn->fb_tunnel_dev)) {
		itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
		itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
		ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
	}
	rtnl_unlock();

	return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
}
EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
1008
/* Queue every device of this tunnel type for unregistration on @head.
 * Called under RTNL (see ip_tunnel_delete_net()); the caller flushes
 * @head with unregister_netdevice_many().
 */
static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
			      struct rtnl_link_ops *ops)
{
	struct net *net = dev_net(itn->fb_tunnel_dev);
	struct net_device *dev, *aux;
	int h;

	/* Pass 1: every device of this type that lives in this netns,
	 * including the fallback device (which is not in the hash).
	 */
	for_each_netdev_safe(net, dev, aux)
		if (dev->rtnl_link_ops == ops)
			unregister_netdevice_queue(dev, head);

	/* Pass 2: hashed tunnels whose device was moved to another netns. */
	for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
		struct ip_tunnel *t;
		struct hlist_node *n;
		struct hlist_head *thead = &itn->tunnels[h];

		hlist_for_each_entry_safe(t, n, thead, hash_node)
			/* If dev is in the same netns, it has already
			 * been added to the list by the previous loop.
			 */
			if (!net_eq(dev_net(t->dev), net))
				unregister_netdevice_queue(t->dev, head);
	}
}
1033
/* ip_tunnel_delete_net - per-netns exit shared by the tunnel drivers.
 * Collects all devices of the type owned by @ops into a local list and
 * unregisters them in one batch, all under a single RTNL section.
 */
void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops)
{
	LIST_HEAD(list);

	rtnl_lock();
	ip_tunnel_destroy(itn, &list, ops);
	unregister_netdevice_many(&list);
	rtnl_unlock();
}
EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);
1044
/* ip_tunnel_newlink - rtnl newlink backend shared by the tunnel drivers.
 * @dev: the freshly allocated (not yet registered) tunnel device
 * @tb: netlink IFLA_* attributes from the request
 * @p: tunnel parameters already parsed by the caller
 *
 * Rejects duplicates, registers the device, and inserts it into the
 * per-netns hash.  Returns 0 or a negative errno.
 */
int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
		      struct ip_tunnel_parm *p)
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ip_tunnel_net *itn;
	int mtu;
	int err;

	nt = netdev_priv(dev);
	itn = net_generic(net, nt->ip_tnl_net_id);

	if (nt->collect_md) {
		/* Only one metadata-collecting tunnel per netns and type. */
		if (rtnl_dereference(itn->collect_md_tun))
			return -EEXIST;
	} else {
		/* Refuse a second tunnel with identical parameters. */
		if (ip_tunnel_find(itn, p, dev->type))
			return -EEXIST;
	}

	nt->net = net;
	nt->parms = *p;
	err = register_netdevice(dev);
	if (err)
		goto out;

	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		eth_hw_addr_random(dev);

	/* Derive MTU from the bound underlay unless the user supplied
	 * an explicit IFLA_MTU.
	 */
	mtu = ip_tunnel_bind_dev(dev);
	if (!tb[IFLA_MTU])
		dev->mtu = mtu;

	ip_tunnel_add(itn, nt);
out:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
1083
1084 int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
1085                          struct ip_tunnel_parm *p)
1086 {
1087         struct ip_tunnel *t;
1088         struct ip_tunnel *tunnel = netdev_priv(dev);
1089         struct net *net = tunnel->net;
1090         struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);
1091
1092         if (dev == itn->fb_tunnel_dev)
1093                 return -EINVAL;
1094
1095         t = ip_tunnel_find(itn, p, dev->type);
1096
1097         if (t) {
1098                 if (t->dev != dev)
1099                         return -EEXIST;
1100         } else {
1101                 t = tunnel;
1102
1103                 if (dev->type != ARPHRD_ETHER) {
1104                         unsigned int nflags = 0;
1105
1106                         if (ipv4_is_multicast(p->iph.daddr))
1107                                 nflags = IFF_BROADCAST;
1108                         else if (p->iph.daddr)
1109                                 nflags = IFF_POINTOPOINT;
1110
1111                         if ((dev->flags ^ nflags) &
1112                             (IFF_POINTOPOINT | IFF_BROADCAST))
1113                                 return -EINVAL;
1114                 }
1115         }
1116
1117         ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU]);
1118         return 0;
1119 }
1120 EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
1121
1122 int ip_tunnel_init(struct net_device *dev)
1123 {
1124         struct ip_tunnel *tunnel = netdev_priv(dev);
1125         struct iphdr *iph = &tunnel->parms.iph;
1126         int err;
1127
1128         dev->destructor = ip_tunnel_dev_free;
1129         dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
1130         if (!dev->tstats)
1131                 return -ENOMEM;
1132
1133         err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
1134         if (err) {
1135                 free_percpu(dev->tstats);
1136                 return err;
1137         }
1138
1139         err = gro_cells_init(&tunnel->gro_cells, dev);
1140         if (err) {
1141                 dst_cache_destroy(&tunnel->dst_cache);
1142                 free_percpu(dev->tstats);
1143                 return err;
1144         }
1145
1146         tunnel->dev = dev;
1147         tunnel->net = dev_net(dev);
1148         strcpy(tunnel->parms.name, dev->name);
1149         iph->version            = 4;
1150         iph->ihl                = 5;
1151
1152         if (tunnel->collect_md)
1153                 netif_keep_dst(dev);
1154         return 0;
1155 }
1156 EXPORT_SYMBOL_GPL(ip_tunnel_init);
1157
/* ndo_uninit backend: remove the tunnel from the per-netns hash and drop
 * its cached route.  Runs under RTNL during device teardown.
 */
void ip_tunnel_uninit(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn;

	itn = net_generic(net, tunnel->ip_tnl_net_id);
	ip_tunnel_del(itn, netdev_priv(dev));
	/* NOTE(review): WRITE_ONCE presumably pairs with lockless readers
	 * of itn->fb_tunnel_dev elsewhere (e.g. the receive path) — the
	 * matching READ_ONCE is not visible in this chunk; confirm.
	 */
	if (itn->fb_tunnel_dev == dev)
		WRITE_ONCE(itn->fb_tunnel_dev, NULL);

	dst_cache_reset(&tunnel->dst_cache);
}
EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
1172
/* Do least required initialization, rest of init is done in tunnel_init call */
void ip_tunnel_setup(struct net_device *dev, int net_id)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	/* Record which pernet subsystem this device belongs to so later
	 * callbacks can find their per-netns state via net_generic().
	 */
	tunnel->ip_tnl_net_id = net_id;
}
EXPORT_SYMBOL_GPL(ip_tunnel_setup);
1180
1181 MODULE_LICENSE("GPL");