/*
 *      Linux NET3:     GRE over IP protocol decoder.
 *
 *      Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
 *
 *      This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 *
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ip_tunnels.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/gre.h>
#include <net/dst_metadata.h>
#include <net/erspan.h>
/*
   Problems & solutions
   --------------------

   1. The most important issue is detecting local dead loops.
   They would cause a complete host lockup in transmit, which
   would be "resolved" by a stack overflow or, if queueing is enabled,
   by infinite looping in net_bh.

   We cannot track such dead loops during route installation;
   it is an infeasible task. The most general solution would be
   to keep an skb->encapsulation counter (a sort of local ttl)
   and silently drop the packet when it expires. It is a good
   solution, but it requires maintaining a new variable in ALL
   skbs, even if no tunneling is used.

   Current solution: xmit_recursion breaks dead loops. This is a percpu
   counter, since cpu migration is forbidden once we enter the first
   ndo_xmit(). We force an exit if this counter reaches RECURSION_LIMIT.

   2. Networking dead loops would not kill routers, but they would really
   kill the network. The IP hop limit plays the role of "t->recursion" in
   this case, if we copy it from the packet being encapsulated to the
   upper header. It is a very good solution, but it introduces two
   problems:

   - Routing protocols that use packets with ttl=1 (OSPF, RIP2)
     do not work over tunnels.
   - traceroute does not work. I planned to relay ICMP from the tunnel,
     so that this problem would be solved and traceroute output
     would be even more informative. This idea turned out to be wrong:
     only Linux complies with rfc1812 now (yes, guys, Linux is the only
     true router now :-)); all other routers (at least, in my
     neighbourhood) return only 8 bytes of payload. That is the end of it.

   Hence, if we want OSPF to work or traceroute to say something
   reasonable, we must search for another solution.

   One of them is to parse the packet, trying to detect inner encapsulation
   made by our node. This is difficult or even impossible, especially
   taking fragmentation into account. In short, ttl is not a solution at all.

   Current solution: the solution was UNEXPECTEDLY SIMPLE.
   We force the DF flag on tunnels with a preconfigured hop limit,
   and that is ALL. :-) Well, it does not remove the problem completely,
   but exponential growth of network traffic is changed to linear
   (branches that exceed the pmtu are pruned) and the tunnel mtu
   rapidly degrades to a value <68, where looping stops.
   Yes, it is not good if there exists a router in the loop
   which does not force DF, even when encapsulated packets have DF set.
   But it is not our problem! Nobody could accuse us; we did
   all that we could do. Even if it was your gated that injected
   the fatal route into the network, even if it was you who configured
   the fatal static route: you are innocent. :-)

   Alexey Kuznetsov.
 */
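
/* Editorial illustration of the "preconfigured hop limit forces DF" rule
 * above (hypothetical names and addresses; iproute2 syntax): configuring a
 * fixed ttl arms the DF forcing, while "ttl inherit" (ttl 0) leaves DF to
 * the pmtudisc setting:
 *
 *   ip tunnel add gre1 mode gre local 192.0.2.1 remote 198.51.100.1 ttl 64
 */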

static bool log_ecn_error = true;
module_param(log_ecn_error, bool, 0644);
MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");

static struct rtnl_link_ops ipgre_link_ops __read_mostly;
static int ipgre_tunnel_init(struct net_device *dev);
static void erspan_build_header(struct sk_buff *skb,
                                __be32 id, u32 index, bool truncate);

static unsigned int ipgre_net_id __read_mostly;
static unsigned int gre_tap_net_id __read_mostly;
static unsigned int erspan_net_id __read_mostly;

static void ipgre_err(struct sk_buff *skb, u32 info,
                      const struct tnl_ptk_info *tpi)
{
        /* All the routers (except for Linux) return only
           8 bytes of packet payload. It means that precise relaying of
           ICMP in the real Internet is absolutely infeasible.

           Moreover, Cisco "wise men" put the GRE key in the third word
           of the GRE header. That makes it impossible to maintain even
           soft state for keyed GRE tunnels with checksums enabled. Tell
           them "thank you".

           Well, I wonder: rfc1812 was written by a Cisco employee,
           so why the hell do these idiots break standards established
           by themselves???
         */
        struct net *net = dev_net(skb->dev);
        struct ip_tunnel_net *itn;
        const struct iphdr *iph;
        const int type = icmp_hdr(skb)->type;
        const int code = icmp_hdr(skb)->code;
        unsigned int data_len = 0;
        struct ip_tunnel *t;

        switch (type) {
        default:
        case ICMP_PARAMETERPROB:
                return;

        case ICMP_DEST_UNREACH:
                switch (code) {
                case ICMP_SR_FAILED:
                case ICMP_PORT_UNREACH:
                        /* Impossible event. */
                        return;
                default:
                        /* All others are translated to HOST_UNREACH.
                           rfc2003 contains "deep thoughts" about NET_UNREACH;
                           I believe they are just ether pollution. --ANK
                         */
                        break;
                }
                break;

        case ICMP_TIME_EXCEEDED:
                if (code != ICMP_EXC_TTL)
                        return;
                data_len = icmp_hdr(skb)->un.reserved[1] * 4; /* RFC 4884 4.1 */
                break;

        case ICMP_REDIRECT:
                break;
        }

        if (tpi->proto == htons(ETH_P_TEB))
                itn = net_generic(net, gre_tap_net_id);
        else if (tpi->proto == htons(ETH_P_ERSPAN))
                itn = net_generic(net, erspan_net_id);
        else
                itn = net_generic(net, ipgre_net_id);

        iph = (const struct iphdr *)(icmp_hdr(skb) + 1);
        t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
                             iph->daddr, iph->saddr, tpi->key);

        if (!t)
                return;

#if IS_ENABLED(CONFIG_IPV6)
        if (tpi->proto == htons(ETH_P_IPV6) &&
            !ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4 + tpi->hdr_len,
                                        type, data_len))
                return;
#endif

        if (t->parms.iph.daddr == 0 ||
            ipv4_is_multicast(t->parms.iph.daddr))
                return;

        if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
                return;

        if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
                t->err_count++;
        else
                t->err_count = 1;
        t->err_time = jiffies;
}

static void gre_err(struct sk_buff *skb, u32 info)
{
        /* All the routers (except for Linux) return only
         * 8 bytes of packet payload. It means that precise relaying of
         * ICMP in the real Internet is absolutely infeasible.
         *
         * Moreover, Cisco "wise men" put the GRE key in the third word
         * of the GRE header. That makes it impossible to maintain even
         * soft state for keyed GRE tunnels with checksums enabled. Tell
         * them "thank you".
         *
         * Well, I wonder: rfc1812 was written by a Cisco employee,
         * so why the hell do these idiots break standards established
         * by themselves???
         */

        const struct iphdr *iph = (struct iphdr *)skb->data;
        const int type = icmp_hdr(skb)->type;
        const int code = icmp_hdr(skb)->code;
        struct tnl_ptk_info tpi;

        if (gre_parse_header(skb, &tpi, NULL, htons(ETH_P_IP),
                             iph->ihl * 4) < 0)
                return;

        if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
                ipv4_update_pmtu(skb, dev_net(skb->dev), info,
                                 skb->dev->ifindex, 0, IPPROTO_GRE, 0);
                return;
        }
        if (type == ICMP_REDIRECT) {
                ipv4_redirect(skb, dev_net(skb->dev), skb->dev->ifindex, 0,
                              IPPROTO_GRE, 0);
                return;
        }

        ipgre_err(skb, info, &tpi);
}

static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi,
                      int gre_hdr_len)
{
        struct net *net = dev_net(skb->dev);
        struct metadata_dst *tun_dst = NULL;
        struct ip_tunnel_net *itn;
        struct ip_tunnel *tunnel;
        struct erspanhdr *ershdr;
        const struct iphdr *iph;
        __be32 index;
        int len;

        itn = net_generic(net, erspan_net_id);
        len = gre_hdr_len + sizeof(*ershdr);

        if (unlikely(!pskb_may_pull(skb, len)))
                return -ENOMEM;

        iph = ip_hdr(skb);
        ershdr = (struct erspanhdr *)(skb->data + gre_hdr_len);

        /* The original GRE header does not have a key field;
         * use the ERSPAN 10-bit session ID as the key.
         */
        tpi->key = cpu_to_be32(ntohs(ershdr->session_id) & ID_MASK);
        index = ershdr->md.index;
        tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex,
                                  tpi->flags | TUNNEL_KEY,
                                  iph->saddr, iph->daddr, tpi->key);

        if (tunnel) {
                if (__iptunnel_pull_header(skb,
                                           gre_hdr_len + sizeof(*ershdr),
                                           htons(ETH_P_TEB),
                                           false, false) < 0)
                        goto drop;

                if (tunnel->collect_md) {
                        struct ip_tunnel_info *info;
                        struct erspan_metadata *md;
                        __be64 tun_id;
                        __be16 flags;

                        tpi->flags |= TUNNEL_KEY;
                        flags = tpi->flags;
                        tun_id = key32_to_tunnel_id(tpi->key);

                        tun_dst = ip_tun_rx_dst(skb, flags,
                                                tun_id, sizeof(*md));
                        if (!tun_dst)
                                return PACKET_REJECT;

                        md = ip_tunnel_info_opts(&tun_dst->u.tun_info);
                        if (!md) {
                                dst_release((struct dst_entry *)tun_dst);
                                return PACKET_REJECT;
                        }

                        md->index = index;
                        info = &tun_dst->u.tun_info;
                        info->key.tun_flags |= TUNNEL_ERSPAN_OPT;
                        info->options_len = sizeof(*md);
                } else {
                        tunnel->index = ntohl(index);
                }

                skb_reset_mac_header(skb);
                ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error);
                return PACKET_RCVD;
        }
        return PACKET_REJECT;

drop:
        kfree_skb(skb);
        return PACKET_RCVD;
}

static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
                       struct ip_tunnel_net *itn, int hdr_len, bool raw_proto)
{
        struct metadata_dst *tun_dst = NULL;
        const struct iphdr *iph;
        struct ip_tunnel *tunnel;

        iph = ip_hdr(skb);
        tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
                                  iph->saddr, iph->daddr, tpi->key);

        if (tunnel) {
                if (__iptunnel_pull_header(skb, hdr_len, tpi->proto,
                                           raw_proto, false) < 0)
                        goto drop;

                if (tunnel->dev->type != ARPHRD_NONE)
                        skb_pop_mac_header(skb);
                else
                        skb_reset_mac_header(skb);
                if (tunnel->collect_md) {
                        __be16 flags;
                        __be64 tun_id;

                        flags = tpi->flags & (TUNNEL_CSUM | TUNNEL_KEY);
                        tun_id = key32_to_tunnel_id(tpi->key);
                        tun_dst = ip_tun_rx_dst(skb, flags, tun_id, 0);
                        if (!tun_dst)
                                return PACKET_REJECT;
                }

                ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error);
                return PACKET_RCVD;
        }
        return PACKET_NEXT;

drop:
        kfree_skb(skb);
        return PACKET_RCVD;
}

static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
                     int hdr_len)
{
        struct net *net = dev_net(skb->dev);
        struct ip_tunnel_net *itn;
        int res;

        if (tpi->proto == htons(ETH_P_TEB))
                itn = net_generic(net, gre_tap_net_id);
        else
                itn = net_generic(net, ipgre_net_id);

        res = __ipgre_rcv(skb, tpi, itn, hdr_len, false);
        if (res == PACKET_NEXT && tpi->proto == htons(ETH_P_TEB)) {
                /* ipgre tunnels in collect-metadata mode should also
                 * receive ETH_P_TEB traffic.
                 */
                itn = net_generic(net, ipgre_net_id);
                res = __ipgre_rcv(skb, tpi, itn, hdr_len, true);
        }
        return res;
}

static int gre_rcv(struct sk_buff *skb)
{
        struct tnl_ptk_info tpi;
        bool csum_err = false;
        int hdr_len;

#ifdef CONFIG_NET_IPGRE_BROADCAST
        if (ipv4_is_multicast(ip_hdr(skb)->daddr)) {
                /* Looped back packet, drop it! */
                if (rt_is_output_route(skb_rtable(skb)))
                        goto drop;
        }
#endif

        hdr_len = gre_parse_header(skb, &tpi, &csum_err, htons(ETH_P_IP), 0);
        if (hdr_len < 0)
                goto drop;

        if (unlikely(tpi.proto == htons(ETH_P_ERSPAN))) {
                if (erspan_rcv(skb, &tpi, hdr_len) == PACKET_RCVD)
                        return 0;
                goto out;
        }

        if (ipgre_rcv(skb, &tpi, hdr_len) == PACKET_RCVD)
                return 0;

out:
        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
drop:
        kfree_skb(skb);
        return 0;
}

static void __gre_xmit(struct sk_buff *skb, struct net_device *dev,
                       const struct iphdr *tnl_params,
                       __be16 proto)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);

        if (tunnel->parms.o_flags & TUNNEL_SEQ)
                tunnel->o_seqno++;

        /* Push GRE header. */
        gre_build_header(skb, tunnel->tun_hlen,
                         tunnel->parms.o_flags, proto, tunnel->parms.o_key,
                         htonl(tunnel->o_seqno));

        ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol);
}

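/* Editorial note (inferred from the check below, not a statement from the
 * original author): checksum offload is refused when skb->csum_start lies
 * before skb->data, i.e. when the region to be checksummed begins in
 * headers that were already pulled, which the offload path cannot express.
 */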
static int gre_handle_offloads(struct sk_buff *skb, bool csum)
{
        if (csum && skb_checksum_start(skb) < skb->data)
                return -EINVAL;
        return iptunnel_handle_offloads(skb, csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE);
}

static struct rtable *gre_get_rt(struct sk_buff *skb,
                                 struct net_device *dev,
                                 struct flowi4 *fl,
                                 const struct ip_tunnel_key *key)
{
        struct net *net = dev_net(dev);

        memset(fl, 0, sizeof(*fl));
        fl->daddr = key->u.ipv4.dst;
        fl->saddr = key->u.ipv4.src;
        fl->flowi4_tos = RT_TOS(key->tos);
        fl->flowi4_mark = skb->mark;
        fl->flowi4_proto = IPPROTO_GRE;

        return ip_route_output_key(net, fl);
}

static struct rtable *prepare_fb_xmit(struct sk_buff *skb,
                                      struct net_device *dev,
                                      struct flowi4 *fl,
                                      int tunnel_hlen)
{
        struct ip_tunnel_info *tun_info;
        const struct ip_tunnel_key *key;
        struct rtable *rt = NULL;
        int min_headroom;
        bool use_cache;
        int err;

        tun_info = skb_tunnel_info(skb);
        key = &tun_info->key;
        use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);

        if (use_cache)
                rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl->saddr);
        if (!rt) {
                rt = gre_get_rt(skb, dev, fl, key);
                if (IS_ERR(rt))
                        goto err_free_skb;
                if (use_cache)
                        dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
                                          fl->saddr);
        }

        min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len
                        + tunnel_hlen + sizeof(struct iphdr);
        if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) {
                int head_delta = SKB_DATA_ALIGN(min_headroom -
                                                skb_headroom(skb) +
                                                16);
                err = pskb_expand_head(skb, max_t(int, head_delta, 0),
                                       0, GFP_ATOMIC);
                if (unlikely(err))
                        goto err_free_rt;
        }
        return rt;

err_free_rt:
        ip_rt_put(rt);
err_free_skb:
        kfree_skb(skb);
        dev->stats.tx_dropped++;
        return NULL;
}

static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev,
                        __be16 proto)
{
        struct ip_tunnel_info *tun_info;
        const struct ip_tunnel_key *key;
        struct rtable *rt = NULL;
        struct flowi4 fl;
        int tunnel_hlen;
        __be16 df, flags;

        tun_info = skb_tunnel_info(skb);
        if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
                     ip_tunnel_info_af(tun_info) != AF_INET))
                goto err_free_skb;

        key = &tun_info->key;
        tunnel_hlen = gre_calc_hlen(key->tun_flags);

        rt = prepare_fb_xmit(skb, dev, &fl, tunnel_hlen);
        if (!rt)
                return;

        /* Push Tunnel header. */
        if (gre_handle_offloads(skb, !!(tun_info->key.tun_flags & TUNNEL_CSUM)))
                goto err_free_rt;

        flags = tun_info->key.tun_flags & (TUNNEL_CSUM | TUNNEL_KEY);
        gre_build_header(skb, tunnel_hlen, flags, proto,
                         tunnel_id_to_key32(tun_info->key.tun_id), 0);

        df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0;

        iptunnel_xmit(skb->sk, rt, skb, fl.saddr, key->u.ipv4.dst, IPPROTO_GRE,
                      key->tos, key->ttl, df, false);
        return;

err_free_rt:
        ip_rt_put(rt);
err_free_skb:
        kfree_skb(skb);
        dev->stats.tx_dropped++;
}

static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev,
                           __be16 proto)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        struct ip_tunnel_info *tun_info;
        const struct ip_tunnel_key *key;
        struct erspan_metadata *md;
        struct rtable *rt = NULL;
        bool truncate = false;
        struct flowi4 fl;
        int tunnel_hlen;
        __be16 df;

        tun_info = skb_tunnel_info(skb);
        if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
                     ip_tunnel_info_af(tun_info) != AF_INET))
                goto err_free_skb;

        key = &tun_info->key;

        /* ERSPAN has a fixed 8-byte GRE header. */
        tunnel_hlen = 8 + sizeof(struct erspanhdr);

        rt = prepare_fb_xmit(skb, dev, &fl, tunnel_hlen);
        if (!rt)
                return;

        if (gre_handle_offloads(skb, false))
                goto err_free_rt;

        if (skb->len > dev->mtu + dev->hard_header_len) {
                pskb_trim(skb, dev->mtu + dev->hard_header_len);
                truncate = true;
        }

        if (tun_info->options_len < sizeof(*md))
                goto err_free_rt;

        md = ip_tunnel_info_opts(tun_info);
        if (!md)
                goto err_free_rt;

        erspan_build_header(skb, tunnel_id_to_key32(key->tun_id),
                            ntohl(md->index), truncate);

        gre_build_header(skb, 8, TUNNEL_SEQ,
                         htons(ETH_P_ERSPAN), 0, htonl(tunnel->o_seqno++));

        df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0;

        iptunnel_xmit(skb->sk, rt, skb, fl.saddr, key->u.ipv4.dst, IPPROTO_GRE,
                      key->tos, key->ttl, df, false);
        return;

err_free_rt:
        ip_rt_put(rt);
err_free_skb:
        kfree_skb(skb);
        dev->stats.tx_dropped++;
}

static int gre_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
{
        struct ip_tunnel_info *info = skb_tunnel_info(skb);
        struct rtable *rt;
        struct flowi4 fl4;

        if (ip_tunnel_info_af(info) != AF_INET)
                return -EINVAL;

        rt = gre_get_rt(skb, dev, &fl4, &info->key);
        if (IS_ERR(rt))
                return PTR_ERR(rt);

        ip_rt_put(rt);
        info->key.u.ipv4.src = fl4.saddr;
        return 0;
}

static netdev_tx_t ipgre_xmit(struct sk_buff *skb,
                              struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        const struct iphdr *tnl_params;

        if (tunnel->collect_md) {
                gre_fb_xmit(skb, dev, skb->protocol);
                return NETDEV_TX_OK;
        }

        if (dev->header_ops) {
                /* Need space for new headers */
                if (skb_cow_head(skb, dev->needed_headroom -
                                      (tunnel->hlen + sizeof(struct iphdr))))
                        goto free_skb;

                tnl_params = (const struct iphdr *)skb->data;

                /* Pull skb since ip_tunnel_xmit() needs skb->data pointing
                 * to gre header.
                 */
                skb_pull(skb, tunnel->hlen + sizeof(struct iphdr));
                skb_reset_mac_header(skb);
        } else {
                if (skb_cow_head(skb, dev->needed_headroom))
                        goto free_skb;

                tnl_params = &tunnel->parms.iph;
        }

        if (gre_handle_offloads(skb, !!(tunnel->parms.o_flags & TUNNEL_CSUM)))
                goto free_skb;

        __gre_xmit(skb, dev, tnl_params, skb->protocol);
        return NETDEV_TX_OK;

free_skb:
        kfree_skb(skb);
        dev->stats.tx_dropped++;
        return NETDEV_TX_OK;
}

static inline u8 tos_to_cos(u8 tos)
{
        u8 dscp, cos;

        dscp = tos >> 2;
        cos = dscp >> 3;
        return cos;
}
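
/* Worked example (editorial): for tos 0xb8 (DSCP 46, "EF"),
 * dscp = 0xb8 >> 2 = 46 and cos = 46 >> 3 = 5; the CoS is simply the
 * top three bits of the DSCP.
 */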

static void erspan_build_header(struct sk_buff *skb,
                                __be32 id, u32 index, bool truncate)
{
        struct iphdr *iphdr = ip_hdr(skb);
        struct ethhdr *eth = (struct ethhdr *)skb->data;
        enum erspan_encap_type enc_type;
        struct erspanhdr *ershdr;
        struct qtag_prefix {
                __be16 eth_type;
                __be16 tci;
        } *qp;
        u16 vlan_tci = 0;

        enc_type = ERSPAN_ENCAP_NOVLAN;

        /* If the mirrored packet has a VLAN tag, extract the TCI and
         * preserve the VLAN header in the mirrored frame.
         */
        if (eth->h_proto == htons(ETH_P_8021Q)) {
                qp = (struct qtag_prefix *)(skb->data + 2 * ETH_ALEN);
                vlan_tci = ntohs(qp->tci);
                enc_type = ERSPAN_ENCAP_INFRAME;
        }

        skb_push(skb, sizeof(*ershdr));
        ershdr = (struct erspanhdr *)skb->data;
        memset(ershdr, 0, sizeof(*ershdr));

        ershdr->ver_vlan = htons((vlan_tci & VLAN_MASK) |
                                 (ERSPAN_VERSION << VER_OFFSET));
        ershdr->session_id = htons((u16)(ntohl(id) & ID_MASK) |
                           ((tos_to_cos(iphdr->tos) << COS_OFFSET) & COS_MASK) |
                           (enc_type << EN_OFFSET & EN_MASK) |
                           ((truncate << T_OFFSET) & T_MASK));
        ershdr->md.index = htonl(index & INDEX_MASK);
}
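
/* Worked example (editorial, assuming the mask/offset values from
 * net/erspan.h): for id = htonl(123), tos = 0xb8, no VLAN tag and no
 * truncation, erspan_build_header() yields
 * ver_vlan = htons(1 << VER_OFFSET) = htons(0x1000) and
 * session_id = htons(123 | (5 << COS_OFFSET)) = htons(0xa07b).
 */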

static netdev_tx_t erspan_xmit(struct sk_buff *skb,
                               struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        bool truncate = false;

        if (tunnel->collect_md) {
                erspan_fb_xmit(skb, dev, skb->protocol);
                return NETDEV_TX_OK;
        }

        if (gre_handle_offloads(skb, false))
                goto free_skb;

        if (skb_cow_head(skb, dev->needed_headroom))
                goto free_skb;

        if (skb->len > dev->mtu + dev->hard_header_len) {
                pskb_trim(skb, dev->mtu + dev->hard_header_len);
                truncate = true;
        }

        /* Push ERSPAN header */
        erspan_build_header(skb, tunnel->parms.o_key, tunnel->index, truncate);
        tunnel->parms.o_flags &= ~TUNNEL_KEY;
        __gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_ERSPAN));
        return NETDEV_TX_OK;

free_skb:
        kfree_skb(skb);
        dev->stats.tx_dropped++;
        return NETDEV_TX_OK;
}

static netdev_tx_t gre_tap_xmit(struct sk_buff *skb,
                                struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);

        if (tunnel->collect_md) {
                gre_fb_xmit(skb, dev, htons(ETH_P_TEB));
                return NETDEV_TX_OK;
        }

        if (gre_handle_offloads(skb, !!(tunnel->parms.o_flags & TUNNEL_CSUM)))
                goto free_skb;

        if (skb_cow_head(skb, dev->needed_headroom))
                goto free_skb;

        __gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_TEB));
        return NETDEV_TX_OK;

free_skb:
        kfree_skb(skb);
        dev->stats.tx_dropped++;
        return NETDEV_TX_OK;
}

static int ipgre_tunnel_ioctl(struct net_device *dev,
                              struct ifreq *ifr, int cmd)
{
        int err;
        struct ip_tunnel_parm p;

        if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
                return -EFAULT;
        if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) {
                if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
                    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
                    ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
                        return -EINVAL;
        }
        p.i_flags = gre_flags_to_tnl_flags(p.i_flags);
        p.o_flags = gre_flags_to_tnl_flags(p.o_flags);

        err = ip_tunnel_ioctl(dev, &p, cmd);
        if (err)
                return err;

        p.i_flags = gre_tnl_flags_to_gre_flags(p.i_flags);
        p.o_flags = gre_tnl_flags_to_gre_flags(p.o_flags);

        if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
                return -EFAULT;
        return 0;
}

/* Nice toy. Unfortunately, useless in real life :-)
   It allows one to construct a virtual multiprotocol broadcast "LAN"
   over the Internet, provided multicast routing is tuned.

   I have no idea whether this bicycle was invented before me,
   so I had to set ARPHRD_IPGRE to a random value.
   I have an impression that Cisco could make something similar,
   but this feature is apparently missing in IOS<=11.2(8).

   I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
   with broadcast 224.66.66.66. If you have access to the mbone, play with me :-)

   ping -t 255 224.66.66.66

   If nobody answers, the mbone does not work.

   ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
   ip addr add 10.66.66.<somewhat>/24 dev Universe
   ifconfig Universe up
   ifconfig Universe add fe80::<Your_real_addr>/10
   ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
   ftp 10.66.66.66
   ...
   ftp fec0:6666:6666::193.233.7.65
   ...
 */
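
/* Editorial aside: with modern iproute2 the same multicast tunnel can also
 * be created through the rtnetlink path implemented below (addresses as in
 * the example above):
 *
 *   ip link add Universe type gre remote 224.66.66.66 local <Your_real_addr> ttl 255
 *   ip link set Universe up
 */
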
static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
                        unsigned short type,
                        const void *daddr, const void *saddr, unsigned int len)
{
        struct ip_tunnel *t = netdev_priv(dev);
        struct iphdr *iph;
        struct gre_base_hdr *greh;

        iph = skb_push(skb, t->hlen + sizeof(*iph));
        greh = (struct gre_base_hdr *)(iph+1);
        greh->flags = gre_tnl_flags_to_gre_flags(t->parms.o_flags);
        greh->protocol = htons(type);

        memcpy(iph, &t->parms.iph, sizeof(struct iphdr));

        /* Set the source hardware address. */
        if (saddr)
                memcpy(&iph->saddr, saddr, 4);
        if (daddr)
                memcpy(&iph->daddr, daddr, 4);
        if (iph->daddr)
                return t->hlen + sizeof(*iph);

        return -(t->hlen + sizeof(*iph));
}

static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
{
        const struct iphdr *iph = (const struct iphdr *) skb_mac_header(skb);

        memcpy(haddr, &iph->saddr, 4);
        return 4;
}

static const struct header_ops ipgre_header_ops = {
        .create = ipgre_header,
        .parse  = ipgre_header_parse,
};

#ifdef CONFIG_NET_IPGRE_BROADCAST
static int ipgre_open(struct net_device *dev)
{
        struct ip_tunnel *t = netdev_priv(dev);

        if (ipv4_is_multicast(t->parms.iph.daddr)) {
                struct flowi4 fl4;
                struct rtable *rt;

                rt = ip_route_output_gre(t->net, &fl4,
                                         t->parms.iph.daddr,
                                         t->parms.iph.saddr,
                                         t->parms.o_key,
                                         RT_TOS(t->parms.iph.tos),
                                         t->parms.link);
                if (IS_ERR(rt))
                        return -EADDRNOTAVAIL;
                dev = rt->dst.dev;
                ip_rt_put(rt);
                if (!__in_dev_get_rtnl(dev))
                        return -EADDRNOTAVAIL;
                t->mlink = dev->ifindex;
                ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
        }
        return 0;
}

static int ipgre_close(struct net_device *dev)
{
        struct ip_tunnel *t = netdev_priv(dev);

        if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
                struct in_device *in_dev;

                in_dev = inetdev_by_index(t->net, t->mlink);
                if (in_dev)
                        ip_mc_dec_group(in_dev, t->parms.iph.daddr);
        }
        return 0;
}
#endif

static const struct net_device_ops ipgre_netdev_ops = {
        .ndo_init               = ipgre_tunnel_init,
        .ndo_uninit             = ip_tunnel_uninit,
#ifdef CONFIG_NET_IPGRE_BROADCAST
        .ndo_open               = ipgre_open,
        .ndo_stop               = ipgre_close,
#endif
        .ndo_start_xmit         = ipgre_xmit,
        .ndo_do_ioctl           = ipgre_tunnel_ioctl,
        .ndo_change_mtu         = ip_tunnel_change_mtu,
        .ndo_get_stats64        = ip_tunnel_get_stats64,
        .ndo_get_iflink         = ip_tunnel_get_iflink,
};

#define GRE_FEATURES (NETIF_F_SG |              \
                      NETIF_F_FRAGLIST |        \
                      NETIF_F_HIGHDMA |         \
                      NETIF_F_HW_CSUM)

static void ipgre_tunnel_setup(struct net_device *dev)
{
        dev->netdev_ops         = &ipgre_netdev_ops;
        dev->type               = ARPHRD_IPGRE;
        ip_tunnel_setup(dev, ipgre_net_id);
}

static void __gre_tunnel_init(struct net_device *dev)
{
        struct ip_tunnel *tunnel;
        int t_hlen;

        tunnel = netdev_priv(dev);
        tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags);
        tunnel->parms.iph.protocol = IPPROTO_GRE;

        tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen;

        t_hlen = tunnel->hlen + sizeof(struct iphdr);

        dev->features           |= GRE_FEATURES;
        dev->hw_features        |= GRE_FEATURES;

        if (!(tunnel->parms.o_flags & TUNNEL_SEQ)) {
                /* TCP offload with GRE SEQ is not supported, nor
                 * can we support 2 levels of outer headers requiring
                 * an update.
                 */
                if (!(tunnel->parms.o_flags & TUNNEL_CSUM) ||
                    (tunnel->encap.type == TUNNEL_ENCAP_NONE)) {
                        dev->features    |= NETIF_F_GSO_SOFTWARE;
                        dev->hw_features |= NETIF_F_GSO_SOFTWARE;
                }

                /* Can use a lockless transmit, unless we generate
                 * output sequences
                 */
                dev->features |= NETIF_F_LLTX;
        }
}

static int ipgre_tunnel_init(struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        struct iphdr *iph = &tunnel->parms.iph;

        __gre_tunnel_init(dev);

        memcpy(dev->dev_addr, &iph->saddr, 4);
        memcpy(dev->broadcast, &iph->daddr, 4);

        dev->flags              = IFF_NOARP;
        netif_keep_dst(dev);
        dev->addr_len           = 4;

        if (iph->daddr && !tunnel->collect_md) {
#ifdef CONFIG_NET_IPGRE_BROADCAST
                if (ipv4_is_multicast(iph->daddr)) {
                        if (!iph->saddr)
                                return -EINVAL;
                        dev->flags = IFF_BROADCAST;
                        dev->header_ops = &ipgre_header_ops;
                }
#endif
        } else if (!tunnel->collect_md) {
                dev->header_ops = &ipgre_header_ops;
        }

        return ip_tunnel_init(dev);
}

static const struct gre_protocol ipgre_protocol = {
        .handler     = gre_rcv,
        .err_handler = gre_err,
};

static int __net_init ipgre_init_net(struct net *net)
{
        return ip_tunnel_init_net(net, ipgre_net_id, &ipgre_link_ops, NULL);
}

static void __net_exit ipgre_exit_net(struct net *net)
{
        struct ip_tunnel_net *itn = net_generic(net, ipgre_net_id);

        ip_tunnel_delete_net(itn, &ipgre_link_ops);
}

static struct pernet_operations ipgre_net_ops = {
        .init = ipgre_init_net,
        .exit = ipgre_exit_net,
        .id   = &ipgre_net_id,
        .size = sizeof(struct ip_tunnel_net),
};

static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[],
                                 struct netlink_ext_ack *extack)
{
        __be16 flags;

        if (!data)
                return 0;

        flags = 0;
        if (data[IFLA_GRE_IFLAGS])
                flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
        if (data[IFLA_GRE_OFLAGS])
                flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
        if (flags & (GRE_VERSION|GRE_ROUTING))
                return -EINVAL;

        if (data[IFLA_GRE_COLLECT_METADATA] &&
            data[IFLA_GRE_ENCAP_TYPE] &&
            nla_get_u16(data[IFLA_GRE_ENCAP_TYPE]) != TUNNEL_ENCAP_NONE)
                return -EINVAL;

        return 0;
}

static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[],
                              struct netlink_ext_ack *extack)
{
        __be32 daddr;

        if (tb[IFLA_ADDRESS]) {
                if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
                        return -EINVAL;
                if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
                        return -EADDRNOTAVAIL;
        }

        if (!data)
                goto out;

        if (data[IFLA_GRE_REMOTE]) {
                memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
                if (!daddr)
                        return -EINVAL;
        }

out:
        return ipgre_tunnel_validate(tb, data, extack);
}

static int erspan_validate(struct nlattr *tb[], struct nlattr *data[],
                           struct netlink_ext_ack *extack)
{
        __be16 flags = 0;
        int ret;

        if (!data)
                return 0;

        ret = ipgre_tap_validate(tb, data, extack);
        if (ret)
                return ret;

        /* ERSPAN should only have the GRE sequence and key flags set. */
        if (data[IFLA_GRE_OFLAGS])
                flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
        if (data[IFLA_GRE_IFLAGS])
                flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
        if (!data[IFLA_GRE_COLLECT_METADATA] &&
            flags != (GRE_SEQ | GRE_KEY))
                return -EINVAL;

        /* The ERSPAN session ID is only 10 bits wide. Since we reuse
         * the 32-bit key field as the ID, check its range.
         */
        if (data[IFLA_GRE_IKEY] &&
            (ntohl(nla_get_be32(data[IFLA_GRE_IKEY])) & ~ID_MASK))
                return -EINVAL;

        if (data[IFLA_GRE_OKEY] &&
            (ntohl(nla_get_be32(data[IFLA_GRE_OKEY])) & ~ID_MASK))
                return -EINVAL;

        return 0;
}
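
/* Editorial sketch of a configuration that passes this validation
 * (hypothetical addresses; iproute2 syntax for ERSPAN as merged alongside
 * this driver):
 *
 *   ip link add dev erspan1 type erspan seq key 100 \
 *           local 192.0.2.1 remote 198.51.100.1 erspan 123
 *
 * Omitting "seq key" (without collect metadata) fails the GRE_SEQ|GRE_KEY
 * check above, and a key wider than 10 bits fails the ID_MASK checks.
 */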

static int ipgre_netlink_parms(struct net_device *dev,
                                struct nlattr *data[],
                                struct nlattr *tb[],
                                struct ip_tunnel_parm *parms,
                                __u32 *fwmark)
{
        struct ip_tunnel *t = netdev_priv(dev);

        memset(parms, 0, sizeof(*parms));

        parms->iph.protocol = IPPROTO_GRE;

        if (!data)
                return 0;

        if (data[IFLA_GRE_LINK])
                parms->link = nla_get_u32(data[IFLA_GRE_LINK]);

        if (data[IFLA_GRE_IFLAGS])
                parms->i_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_IFLAGS]));

        if (data[IFLA_GRE_OFLAGS])
                parms->o_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_OFLAGS]));

        if (data[IFLA_GRE_IKEY])
                parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);

        if (data[IFLA_GRE_OKEY])
                parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);

        if (data[IFLA_GRE_LOCAL])
                parms->iph.saddr = nla_get_in_addr(data[IFLA_GRE_LOCAL]);

        if (data[IFLA_GRE_REMOTE])
                parms->iph.daddr = nla_get_in_addr(data[IFLA_GRE_REMOTE]);

        if (data[IFLA_GRE_TTL])
                parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);

        if (data[IFLA_GRE_TOS])
                parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);

        if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC])) {
                if (t->ignore_df)
                        return -EINVAL;
                parms->iph.frag_off = htons(IP_DF);
        }

        if (data[IFLA_GRE_COLLECT_METADATA]) {
                t->collect_md = true;
                if (dev->type == ARPHRD_IPGRE)
                        dev->type = ARPHRD_NONE;
        }

        if (data[IFLA_GRE_IGNORE_DF]) {
                if (nla_get_u8(data[IFLA_GRE_IGNORE_DF]) &&
                    (parms->iph.frag_off & htons(IP_DF)))
                        return -EINVAL;
                t->ignore_df = !!nla_get_u8(data[IFLA_GRE_IGNORE_DF]);
        }

        if (data[IFLA_GRE_FWMARK])
                *fwmark = nla_get_u32(data[IFLA_GRE_FWMARK]);

        if (data[IFLA_GRE_ERSPAN_INDEX]) {
                t->index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]);

                if (t->index & ~INDEX_MASK)
                        return -EINVAL;
        }

        return 0;
}

/* This function returns true when ENCAP attributes are present in the nl msg */
static bool ipgre_netlink_encap_parms(struct nlattr *data[],
                                      struct ip_tunnel_encap *ipencap)
{
        bool ret = false;

        memset(ipencap, 0, sizeof(*ipencap));

        if (!data)
                return ret;

        if (data[IFLA_GRE_ENCAP_TYPE]) {
                ret = true;
                ipencap->type = nla_get_u16(data[IFLA_GRE_ENCAP_TYPE]);
        }

        if (data[IFLA_GRE_ENCAP_FLAGS]) {
                ret = true;
                ipencap->flags = nla_get_u16(data[IFLA_GRE_ENCAP_FLAGS]);
        }

        if (data[IFLA_GRE_ENCAP_SPORT]) {
                ret = true;
                ipencap->sport = nla_get_be16(data[IFLA_GRE_ENCAP_SPORT]);
        }

        if (data[IFLA_GRE_ENCAP_DPORT]) {
                ret = true;
                ipencap->dport = nla_get_be16(data[IFLA_GRE_ENCAP_DPORT]);
        }

        return ret;
}

static int gre_tap_init(struct net_device *dev)
{
        __gre_tunnel_init(dev);
        dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
        netif_keep_dst(dev);

        return ip_tunnel_init(dev);
}

static const struct net_device_ops gre_tap_netdev_ops = {
        .ndo_init               = gre_tap_init,
        .ndo_uninit             = ip_tunnel_uninit,
        .ndo_start_xmit         = gre_tap_xmit,
        .ndo_set_mac_address    = eth_mac_addr,
        .ndo_validate_addr      = eth_validate_addr,
        .ndo_change_mtu         = ip_tunnel_change_mtu,
        .ndo_get_stats64        = ip_tunnel_get_stats64,
        .ndo_get_iflink         = ip_tunnel_get_iflink,
        .ndo_fill_metadata_dst  = gre_fill_metadata_dst,
};

static int erspan_tunnel_init(struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        int t_hlen;

        tunnel->tun_hlen = 8;
        tunnel->parms.iph.protocol = IPPROTO_GRE;
        tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen +
                       sizeof(struct erspanhdr);
        t_hlen = tunnel->hlen + sizeof(struct iphdr);

        dev->features           |= GRE_FEATURES;
        dev->hw_features        |= GRE_FEATURES;
        dev->priv_flags         |= IFF_LIVE_ADDR_CHANGE;
        netif_keep_dst(dev);

        return ip_tunnel_init(dev);
}

static const struct net_device_ops erspan_netdev_ops = {
        .ndo_init               = erspan_tunnel_init,
        .ndo_uninit             = ip_tunnel_uninit,
        .ndo_start_xmit         = erspan_xmit,
        .ndo_set_mac_address    = eth_mac_addr,
        .ndo_validate_addr      = eth_validate_addr,
        .ndo_change_mtu         = ip_tunnel_change_mtu,
        .ndo_get_stats64        = ip_tunnel_get_stats64,
        .ndo_get_iflink         = ip_tunnel_get_iflink,
        .ndo_fill_metadata_dst  = gre_fill_metadata_dst,
};

static void ipgre_tap_setup(struct net_device *dev)
{
        ether_setup(dev);
        dev->max_mtu = 0;
        dev->netdev_ops = &gre_tap_netdev_ops;
        dev->priv_flags &= ~IFF_TX_SKB_SHARING;
        dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
        ip_tunnel_setup(dev, gre_tap_net_id);
}
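
/* Editorial sketch (hypothetical names and addresses): a classic gretap
 * device versus its collect-metadata counterpart, which iproute2 requests
 * with the "external" keyword:
 *
 *   ip link add gretap1 type gretap local 192.0.2.1 remote 198.51.100.1
 *   ip link add gretap0 type gretap external
 */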

static int ipgre_newlink(struct net *src_net, struct net_device *dev,
                         struct nlattr *tb[], struct nlattr *data[],
                         struct netlink_ext_ack *extack)
{
        struct ip_tunnel_parm p;
        struct ip_tunnel_encap ipencap;
        __u32 fwmark = 0;
        int err;

        if (ipgre_netlink_encap_parms(data, &ipencap)) {
                struct ip_tunnel *t = netdev_priv(dev);

                err = ip_tunnel_encap_setup(t, &ipencap);
                if (err < 0)
                        return err;
        }

        err = ipgre_netlink_parms(dev, data, tb, &p, &fwmark);
        if (err < 0)
                return err;
        return ip_tunnel_newlink(dev, tb, &p, fwmark);
}

static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
                            struct nlattr *data[],
                            struct netlink_ext_ack *extack)
{
        struct ip_tunnel *t = netdev_priv(dev);
        struct ip_tunnel_parm p;
        struct ip_tunnel_encap ipencap;
        __u32 fwmark = t->fwmark;
        int err;

        if (ipgre_netlink_encap_parms(data, &ipencap)) {
                err = ip_tunnel_encap_setup(t, &ipencap);
                if (err < 0)
                        return err;
        }

        err = ipgre_netlink_parms(dev, data, tb, &p, &fwmark);
        if (err < 0)
                return err;
        return ip_tunnel_changelink(dev, tb, &p, fwmark);
}

static size_t ipgre_get_size(const struct net_device *dev)
{
        return
                /* IFLA_GRE_LINK */
                nla_total_size(4) +
                /* IFLA_GRE_IFLAGS */
                nla_total_size(2) +
                /* IFLA_GRE_OFLAGS */
                nla_total_size(2) +
                /* IFLA_GRE_IKEY */
                nla_total_size(4) +
                /* IFLA_GRE_OKEY */
                nla_total_size(4) +
                /* IFLA_GRE_LOCAL */
                nla_total_size(4) +
                /* IFLA_GRE_REMOTE */
                nla_total_size(4) +
                /* IFLA_GRE_TTL */
                nla_total_size(1) +
                /* IFLA_GRE_TOS */
                nla_total_size(1) +
                /* IFLA_GRE_PMTUDISC */
                nla_total_size(1) +
                /* IFLA_GRE_ENCAP_TYPE */
                nla_total_size(2) +
                /* IFLA_GRE_ENCAP_FLAGS */
                nla_total_size(2) +
                /* IFLA_GRE_ENCAP_SPORT */
                nla_total_size(2) +
                /* IFLA_GRE_ENCAP_DPORT */
                nla_total_size(2) +
                /* IFLA_GRE_COLLECT_METADATA */
                nla_total_size(0) +
                /* IFLA_GRE_IGNORE_DF */
                nla_total_size(1) +
                /* IFLA_GRE_FWMARK */
                nla_total_size(4) +
                /* IFLA_GRE_ERSPAN_INDEX */
                nla_total_size(4) +
                0;
}

static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
        struct ip_tunnel *t = netdev_priv(dev);
        struct ip_tunnel_parm *p = &t->parms;

        if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) ||
            nla_put_be16(skb, IFLA_GRE_IFLAGS,
                         gre_tnl_flags_to_gre_flags(p->i_flags)) ||
            nla_put_be16(skb, IFLA_GRE_OFLAGS,
                         gre_tnl_flags_to_gre_flags(p->o_flags)) ||
            nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) ||
            nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) ||
            nla_put_in_addr(skb, IFLA_GRE_LOCAL, p->iph.saddr) ||
            nla_put_in_addr(skb, IFLA_GRE_REMOTE, p->iph.daddr) ||
            nla_put_u8(skb, IFLA_GRE_TTL, p->iph.ttl) ||
            nla_put_u8(skb, IFLA_GRE_TOS, p->iph.tos) ||
            nla_put_u8(skb, IFLA_GRE_PMTUDISC,
                       !!(p->iph.frag_off & htons(IP_DF))) ||
            nla_put_u32(skb, IFLA_GRE_FWMARK, t->fwmark))
                goto nla_put_failure;

        if (nla_put_u16(skb, IFLA_GRE_ENCAP_TYPE,
                        t->encap.type) ||
            nla_put_be16(skb, IFLA_GRE_ENCAP_SPORT,
                         t->encap.sport) ||
            nla_put_be16(skb, IFLA_GRE_ENCAP_DPORT,
                         t->encap.dport) ||
            nla_put_u16(skb, IFLA_GRE_ENCAP_FLAGS,
                        t->encap.flags))
                goto nla_put_failure;

        if (nla_put_u8(skb, IFLA_GRE_IGNORE_DF, t->ignore_df))
                goto nla_put_failure;

        if (t->collect_md) {
                if (nla_put_flag(skb, IFLA_GRE_COLLECT_METADATA))
                        goto nla_put_failure;
        }

        if (t->index)
                if (nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, t->index))
                        goto nla_put_failure;

        return 0;

nla_put_failure:
        return -EMSGSIZE;
}

static void erspan_setup(struct net_device *dev)
{
        ether_setup(dev);
        dev->max_mtu = 0;
        dev->netdev_ops = &erspan_netdev_ops;
        dev->priv_flags &= ~IFF_TX_SKB_SHARING;
        dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
        ip_tunnel_setup(dev, erspan_net_id);
}

static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
        [IFLA_GRE_LINK]         = { .type = NLA_U32 },
        [IFLA_GRE_IFLAGS]       = { .type = NLA_U16 },
        [IFLA_GRE_OFLAGS]       = { .type = NLA_U16 },
        [IFLA_GRE_IKEY]         = { .type = NLA_U32 },
        [IFLA_GRE_OKEY]         = { .type = NLA_U32 },
        [IFLA_GRE_LOCAL]        = { .len = FIELD_SIZEOF(struct iphdr, saddr) },
        [IFLA_GRE_REMOTE]       = { .len = FIELD_SIZEOF(struct iphdr, daddr) },
        [IFLA_GRE_TTL]          = { .type = NLA_U8 },
        [IFLA_GRE_TOS]          = { .type = NLA_U8 },
        [IFLA_GRE_PMTUDISC]     = { .type = NLA_U8 },
        [IFLA_GRE_ENCAP_TYPE]   = { .type = NLA_U16 },
        [IFLA_GRE_ENCAP_FLAGS]  = { .type = NLA_U16 },
        [IFLA_GRE_ENCAP_SPORT]  = { .type = NLA_U16 },
        [IFLA_GRE_ENCAP_DPORT]  = { .type = NLA_U16 },
        [IFLA_GRE_COLLECT_METADATA]     = { .type = NLA_FLAG },
        [IFLA_GRE_IGNORE_DF]    = { .type = NLA_U8 },
        [IFLA_GRE_FWMARK]       = { .type = NLA_U32 },
        [IFLA_GRE_ERSPAN_INDEX] = { .type = NLA_U32 },
};

static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
        .kind           = "gre",
        .maxtype        = IFLA_GRE_MAX,
        .policy         = ipgre_policy,
        .priv_size      = sizeof(struct ip_tunnel),
        .setup          = ipgre_tunnel_setup,
        .validate       = ipgre_tunnel_validate,
        .newlink        = ipgre_newlink,
        .changelink     = ipgre_changelink,
        .dellink        = ip_tunnel_dellink,
        .get_size       = ipgre_get_size,
        .fill_info      = ipgre_fill_info,
        .get_link_net   = ip_tunnel_get_link_net,
};

static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
        .kind           = "gretap",
        .maxtype        = IFLA_GRE_MAX,
        .policy         = ipgre_policy,
        .priv_size      = sizeof(struct ip_tunnel),
        .setup          = ipgre_tap_setup,
        .validate       = ipgre_tap_validate,
        .newlink        = ipgre_newlink,
        .changelink     = ipgre_changelink,
        .dellink        = ip_tunnel_dellink,
        .get_size       = ipgre_get_size,
        .fill_info      = ipgre_fill_info,
        .get_link_net   = ip_tunnel_get_link_net,
};

static struct rtnl_link_ops erspan_link_ops __read_mostly = {
        .kind           = "erspan",
        .maxtype        = IFLA_GRE_MAX,
        .policy         = ipgre_policy,
        .priv_size      = sizeof(struct ip_tunnel),
        .setup          = erspan_setup,
        .validate       = erspan_validate,
        .newlink        = ipgre_newlink,
        .changelink     = ipgre_changelink,
        .dellink        = ip_tunnel_dellink,
        .get_size       = ipgre_get_size,
        .fill_info      = ipgre_fill_info,
        .get_link_net   = ip_tunnel_get_link_net,
};

struct net_device *gretap_fb_dev_create(struct net *net, const char *name,
                                        u8 name_assign_type)
{
        struct nlattr *tb[IFLA_MAX + 1];
        struct net_device *dev;
        LIST_HEAD(list_kill);
        struct ip_tunnel *t;
        int err;

        memset(&tb, 0, sizeof(tb));

        dev = rtnl_create_link(net, name, name_assign_type,
                               &ipgre_tap_ops, tb);
        if (IS_ERR(dev))
                return dev;

        /* Configure flow based GRE device. */
        t = netdev_priv(dev);
        t->collect_md = true;

        err = ipgre_newlink(net, dev, tb, NULL, NULL);
        if (err < 0) {
                free_netdev(dev);
                return ERR_PTR(err);
        }

        /* openvswitch users expect packet sizes to be unrestricted,
         * so set the largest MTU we can.
         */
        err = __ip_tunnel_change_mtu(dev, IP_MAX_MTU, false);
        if (err)
                goto out;

        err = rtnl_configure_link(dev, NULL);
        if (err < 0)
                goto out;

        return dev;
out:
        ip_tunnel_dellink(dev, &list_kill);
        unregister_netdevice_many(&list_kill);
        return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(gretap_fb_dev_create);
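
/* Editorial sketch of a hypothetical in-kernel caller (openvswitch-style);
 * gretap_fb_dev_create() hands back either a live device or an ERR_PTR():
 *
 *      struct net_device *dev;
 *
 *      dev = gretap_fb_dev_create(net, "gretap%d", NET_NAME_USER);
 *      if (IS_ERR(dev))
 *              return PTR_ERR(dev);
 */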

static int __net_init ipgre_tap_init_net(struct net *net)
{
        return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, "gretap0");
}

static void __net_exit ipgre_tap_exit_net(struct net *net)
{
        struct ip_tunnel_net *itn = net_generic(net, gre_tap_net_id);

        ip_tunnel_delete_net(itn, &ipgre_tap_ops);
}

static struct pernet_operations ipgre_tap_net_ops = {
        .init = ipgre_tap_init_net,
        .exit = ipgre_tap_exit_net,
        .id   = &gre_tap_net_id,
        .size = sizeof(struct ip_tunnel_net),
};

static int __net_init erspan_init_net(struct net *net)
{
        return ip_tunnel_init_net(net, erspan_net_id,
                                  &erspan_link_ops, "erspan0");
}

static void __net_exit erspan_exit_net(struct net *net)
{
        struct ip_tunnel_net *itn = net_generic(net, erspan_net_id);

        ip_tunnel_delete_net(itn, &erspan_link_ops);
}

static struct pernet_operations erspan_net_ops = {
        .init = erspan_init_net,
        .exit = erspan_exit_net,
        .id   = &erspan_net_id,
        .size = sizeof(struct ip_tunnel_net),
};

static int __init ipgre_init(void)
{
        int err;

        pr_info("GRE over IPv4 tunneling driver\n");

        err = register_pernet_device(&ipgre_net_ops);
        if (err < 0)
                return err;

        err = register_pernet_device(&ipgre_tap_net_ops);
        if (err < 0)
                goto pnet_tap_failed;

        err = register_pernet_device(&erspan_net_ops);
        if (err < 0)
                goto pnet_erspan_failed;

        err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
        if (err < 0) {
                pr_info("%s: can't add protocol\n", __func__);
                goto add_proto_failed;
        }

        err = rtnl_link_register(&ipgre_link_ops);
        if (err < 0)
                goto rtnl_link_failed;

        err = rtnl_link_register(&ipgre_tap_ops);
        if (err < 0)
                goto tap_ops_failed;

        err = rtnl_link_register(&erspan_link_ops);
        if (err < 0)
                goto erspan_link_failed;

        return 0;

erspan_link_failed:
        rtnl_link_unregister(&ipgre_tap_ops);
tap_ops_failed:
        rtnl_link_unregister(&ipgre_link_ops);
rtnl_link_failed:
        gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
add_proto_failed:
        unregister_pernet_device(&erspan_net_ops);
pnet_erspan_failed:
        unregister_pernet_device(&ipgre_tap_net_ops);
pnet_tap_failed:
        unregister_pernet_device(&ipgre_net_ops);
        return err;
}

static void __exit ipgre_fini(void)
{
        rtnl_link_unregister(&ipgre_tap_ops);
        rtnl_link_unregister(&ipgre_link_ops);
        rtnl_link_unregister(&erspan_link_ops);
        gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
        unregister_pernet_device(&ipgre_tap_net_ops);
        unregister_pernet_device(&ipgre_net_ops);
        unregister_pernet_device(&erspan_net_ops);
}

module_init(ipgre_init);
module_exit(ipgre_fini);
MODULE_LICENSE("GPL");
MODULE_ALIAS_RTNL_LINK("gre");
MODULE_ALIAS_RTNL_LINK("gretap");
MODULE_ALIAS_RTNL_LINK("erspan");
MODULE_ALIAS_NETDEV("gre0");
MODULE_ALIAS_NETDEV("gretap0");
MODULE_ALIAS_NETDEV("erspan0");