2 * Linux NET3: GRE over IP protocol decoder.
4 * Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
13 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
15 #include <linux/capability.h>
16 #include <linux/module.h>
17 #include <linux/types.h>
18 #include <linux/kernel.h>
19 #include <linux/slab.h>
20 #include <asm/uaccess.h>
21 #include <linux/skbuff.h>
22 #include <linux/netdevice.h>
24 #include <linux/tcp.h>
25 #include <linux/udp.h>
26 #include <linux/if_arp.h>
27 #include <linux/mroute.h>
28 #include <linux/if_vlan.h>
29 #include <linux/init.h>
30 #include <linux/in6.h>
31 #include <linux/inetdevice.h>
32 #include <linux/igmp.h>
33 #include <linux/netfilter_ipv4.h>
34 #include <linux/etherdevice.h>
35 #include <linux/if_ether.h>
40 #include <net/protocol.h>
41 #include <net/ip_tunnels.h>
43 #include <net/checksum.h>
44 #include <net/dsfield.h>
45 #include <net/inet_ecn.h>
47 #include <net/net_namespace.h>
48 #include <net/netns/generic.h>
49 #include <net/rtnetlink.h>
51 #include <net/dst_metadata.h>
53 #if IS_ENABLED(CONFIG_IPV6)
55 #include <net/ip6_fib.h>
56 #include <net/ip6_route.h>
63 1. The most important issue is detecting local dead loops.
64 They would cause complete host lockup in transmit, which
65 would be "resolved" by stack overflow or, if queueing is enabled,
66 with infinite looping in net_bh.
68 We cannot track such dead loops during route installation,
69 it is infeasible task. The most general solutions would be
70 to keep skb->encapsulation counter (sort of local ttl),
71 and silently drop packet when it expires. It is a good
72 solution, but it supposes maintaining new variable in ALL
73 skb, even if no tunneling is used.
75 Current solution: xmit_recursion breaks dead loops. This is a percpu
76 counter, since when we enter the first ndo_xmit(), cpu migration is
77 forbidden. We force an exit if this counter reaches RECURSION_LIMIT
79 2. Networking dead loops would not kill routers, but would really
80 kill network. IP hop limit plays role of "t->recursion" in this case,
81 if we copy it from packet being encapsulated to upper header.
82 It is very good solution, but it introduces two problems:
84 - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
85 do not work over tunnels.
86 - traceroute does not work. I planned to relay ICMP from tunnel,
87 so that this problem would be solved and traceroute output
88 would be even more informative. This idea appeared to be wrong:
89 only Linux complies to rfc1812 now (yes, guys, Linux is the only
90 true router now :-)), all routers (at least, in neighbourhood of mine)
91 return only 8 bytes of payload. It is the end.
93 Hence, if we want that OSPF worked or traceroute said something reasonable,
94 we should search for another solution.
96 One of them is to parse packet trying to detect inner encapsulation
97 made by our node. It is difficult or even impossible, especially,
98 taking into account fragmentation. To be short, ttl is not a solution at all.
100 Current solution: The solution was UNEXPECTEDLY SIMPLE.
101 We force DF flag on tunnels with preconfigured hop limit,
102 that is ALL. :-) Well, it does not remove the problem completely,
103 but exponential growth of network traffic is changed to linear
104 (branches, that exceed pmtu are pruned) and tunnel mtu
105 rapidly degrades to value <68, where looping stops.
106 Yes, it is not good if there exists a router in the loop,
107 which does not force DF, even when encapsulating packets have DF set.
108 But it is not our problem! Nobody could accuse us, we made
109 all that we could make. Even if it is your gated who injected
110 fatal route to network, even if it were you who configured
111 fatal static route: you are innocent. :-)
/* Module parameter: when true (default), received packets whose ECN bits
 * were corrupted in transit are logged (rate-limited by the tunnel core). */
116 static bool log_ecn_error = true;
117 module_param(log_ecn_error, bool, 0644);
118 MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
/* Forward declarations plus per-network-namespace ids: one for plain
 * "gre" devices, one for Ethernet-over-GRE "gretap" devices. */
120 static struct rtnl_link_ops ipgre_link_ops __read_mostly;
121 static int ipgre_tunnel_init(struct net_device *dev);
123 static int ipgre_net_id __read_mostly;
124 static int gre_tap_net_id __read_mostly;
/* Compute the GRE header length implied by tunnel output flags: the base
 * header grows by one 32-bit word for each of CSUM, KEY and SEQ.
 * NOTE(review): the accumulator initialisation and return statement are
 * elided from this excerpt — do not infer exact arithmetic from here. */
126 static int ip_gre_calc_hlen(__be16 o_flags)
130 	if (o_flags & TUNNEL_CSUM)
132 	if (o_flags & TUNNEL_KEY)
134 	if (o_flags & TUNNEL_SEQ)
/* Translate on-the-wire GRE_* flag bits into the kernel's internal
 * TUNNEL_* flag representation.
 * NOTE(review): the GRE_KEY/GRE_SEQ/GRE_REC condition lines and the
 * final return are elided from this excerpt. */
139 static __be16 gre_flags_to_tnl_flags(__be16 flags)
143 	if (flags & GRE_CSUM)
144 		tflags |= TUNNEL_CSUM;
145 	if (flags & GRE_ROUTING)
146 		tflags |= TUNNEL_ROUTING;
148 		tflags |= TUNNEL_KEY;
150 		tflags |= TUNNEL_SEQ;
151 	if (flags & GRE_STRICT)
152 		tflags |= TUNNEL_STRICT;
154 		tflags |= TUNNEL_REC;
155 	if (flags & GRE_VERSION)
156 		tflags |= TUNNEL_VERSION;
/* Inverse of gre_flags_to_tnl_flags(): map internal TUNNEL_* bits back to
 * wire-format GRE_* bits for header construction and netlink dumps.
 * NOTE(review): several "flags |= ..." lines and the return are elided. */
161 static __be16 tnl_flags_to_gre_flags(__be16 tflags)
165 	if (tflags & TUNNEL_CSUM)
167 	if (tflags & TUNNEL_ROUTING)
168 		flags |= GRE_ROUTING;
169 	if (tflags & TUNNEL_KEY)
171 	if (tflags & TUNNEL_SEQ)
173 	if (tflags & TUNNEL_STRICT)
175 	if (tflags & TUNNEL_REC)
177 	if (tflags & TUNNEL_VERSION)
178 		flags |= GRE_VERSION;
183 /* Fills in tpi and returns header length to be pulled.
 *
 * Validates the base GRE header (rejecting GRE_VERSION/GRE_ROUTING
 * packets), pulls the flag-dependent optional words, verifies the
 * checksum when GRE_CSUM is set, and extracts key/seq.  Also decodes
 * WCCP v1/v2 framing (ETH_P_WCCP with no flags) into plain ETH_P_IP.
 * NOTE(review): error-return lines, the csum_err out-parameter writes
 * and several closing braces are elided from this excerpt. */
184 static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
187 	const struct gre_base_hdr *greh;
191 	if (unlikely(!pskb_may_pull(skb, sizeof(struct gre_base_hdr))))
194 	greh = (struct gre_base_hdr *)skb_transport_header(skb);
195 	if (unlikely(greh->flags & (GRE_VERSION | GRE_ROUTING)))
198 	tpi->flags = gre_flags_to_tnl_flags(greh->flags);
199 	hdr_len = ip_gre_calc_hlen(tpi->flags);
201 	if (!pskb_may_pull(skb, hdr_len))
	/* Re-load: pskb_may_pull() may have reallocated the header. */
204 	greh = (struct gre_base_hdr *)skb_transport_header(skb);
205 	tpi->proto = greh->protocol;
207 	options = (__be32 *)(greh + 1);
208 	if (greh->flags & GRE_CSUM) {
209 		if (skb_checksum_simple_validate(skb)) {
214 		skb_checksum_try_convert(skb, IPPROTO_GRE, 0,
215 					 null_compute_pseudo);
219 	if (greh->flags & GRE_KEY) {
225 	if (unlikely(greh->flags & GRE_SEQ)) {
231 	/* WCCP version 1 and 2 protocol decoding.
232 	 * - Change protocol to IP
233 	 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
235 	if (greh->flags == 0 && tpi->proto == htons(ETH_P_WCCP)) {
236 		tpi->proto = htons(ETH_P_IP);
		/* Inner byte not starting with IP version 4 => WCCPv2,
		 * which carries an extra 4-byte redirect header. */
237 		if ((*(u8 *)options & 0xF0) != 0x40) {
239 			if (!pskb_may_pull(skb, hdr_len))
/* ICMP error handler for a parsed GRE packet: look up the tunnel that
 * originated the inner packet and record the error (rate-limited via
 * err_time) so the tunnel can relay it.  Multicast/unset-daddr tunnels
 * and stale/ignorable ICMP types are skipped.
 * NOTE(review): several switch arms, the NULL-tunnel check and the
 * err_count update are elided from this excerpt. */
246 static void ipgre_err(struct sk_buff *skb, u32 info,
247 		      const struct tnl_ptk_info *tpi)
250 	/* All the routers (except for Linux) return only
251 	   8 bytes of packet payload. It means, that precise relaying of
252 	   ICMP in the real Internet is absolutely infeasible.
254 	   Moreover, Cisco "wise men" put GRE key to the third word
255 	   in GRE header. It makes impossible maintaining even soft
256 	   state for keyed GRE tunnels with enabled checksum. Tell
259 	   Well, I wonder, rfc1812 was written by Cisco employee,
260 	   what the hell these idiots break standards established
263 	struct net *net = dev_net(skb->dev);
264 	struct ip_tunnel_net *itn;
265 	const struct iphdr *iph;
266 	const int type = icmp_hdr(skb)->type;
267 	const int code = icmp_hdr(skb)->code;
272 	case ICMP_PARAMETERPROB:
275 	case ICMP_DEST_UNREACH:
278 		case ICMP_PORT_UNREACH:
279 			/* Impossible event. */
282 			/* All others are translated to HOST_UNREACH.
283 			   rfc2003 contains "deep thoughts" about NET_UNREACH,
284 			   I believe they are just ether pollution. --ANK
290 	case ICMP_TIME_EXCEEDED:
291 		if (code != ICMP_EXC_TTL)
	/* TEB (Ethernet bridging) errors belong to the gretap namespace
	 * registry; everything else to the plain gre registry. */
299 	if (tpi->proto == htons(ETH_P_TEB))
300 		itn = net_generic(net, gre_tap_net_id);
302 		itn = net_generic(net, ipgre_net_id);
	/* The offending inner IP header follows the ICMP header. */
304 	iph = (const struct iphdr *)(icmp_hdr(skb) + 1);
305 	t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
306 			     iph->daddr, iph->saddr, tpi->key);
311 	if (t->parms.iph.daddr == 0 ||
312 	    ipv4_is_multicast(t->parms.iph.daddr))
315 	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
	/* Rate-limit: ignore errors arriving within IPTUNNEL_ERR_TIMEO
	 * of the previous one. */
318 	if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
322 	t->err_time = jiffies;
/* Top-level ICMP error entry point registered for the GRE protocol:
 * re-parse the GRE header from the quoted packet, handle PMTU updates
 * and redirects directly, and hand everything else to ipgre_err().
 * NOTE(review): early-return lines after the PMTU/redirect branches are
 * elided from this excerpt. */
325 static void gre_err(struct sk_buff *skb, u32 info)
327 	/* All the routers (except for Linux) return only
328 	 * 8 bytes of packet payload. It means, that precise relaying of
329 	 * ICMP in the real Internet is absolutely infeasible.
331 	 * Moreover, Cisco "wise men" put GRE key to the third word
332 	 * in GRE header. It makes impossible maintaining even soft
334 	 * GRE tunnels with enabled checksum. Tell them "thank you".
336 	 * Well, I wonder, rfc1812 was written by Cisco employee,
337 	 * what the hell these idiots break standards established
341 	const int type = icmp_hdr(skb)->type;
342 	const int code = icmp_hdr(skb)->code;
343 	struct tnl_ptk_info tpi;
344 	bool csum_err = false;
346 	if (parse_gre_header(skb, &tpi, &csum_err) < 0) {
347 		if (!csum_err)		/* ignore csum errors. */
351 	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
352 		ipv4_update_pmtu(skb, dev_net(skb->dev), info,
353 				 skb->dev->ifindex, 0, IPPROTO_GRE, 0);
356 	if (type == ICMP_REDIRECT) {
357 		ipv4_redirect(skb, dev_net(skb->dev), skb->dev->ifindex, 0,
362 	ipgre_err(skb, info, &tpi);
/* Widen a 32-bit GRE key into a 64-bit tunnel id; the two return forms
 * correspond to the (elided) big- vs little-endian preprocessor arms. */
365 static __be64 key_to_tunnel_id(__be32 key)
368 	return (__force __be64)((__force u32)key);
370 	return (__force __be64)((__force u64)key << 32);
374 /* Returns the least-significant 32 bits of a __be64.
 * Inverse of key_to_tunnel_id(); the two return forms correspond to the
 * (elided) big- vs little-endian preprocessor arms. */
375 static __be32 tunnel_id_to_key(__be64 x)
378 	return (__force __be32)x;
380 	return (__force __be32)((__force u64)x >> 32);
/* Deliver a parsed GRE packet to its tunnel device.  Picks the gretap or
 * gre per-netns registry by inner protocol, looks up the tunnel by
 * addresses/flags/key, builds a metadata dst for collect_md tunnels, and
 * returns PACKET_RCVD / PACKET_REJECT.
 * NOTE(review): the iph assignment, the NULL-tunnel path and some braces
 * are elided from this excerpt. */
384 static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi)
386 	struct net *net = dev_net(skb->dev);
387 	struct metadata_dst *tun_dst = NULL;
388 	struct ip_tunnel_net *itn;
389 	const struct iphdr *iph;
390 	struct ip_tunnel *tunnel;
392 	if (tpi->proto == htons(ETH_P_TEB))
393 		itn = net_generic(net, gre_tap_net_id);
395 		itn = net_generic(net, ipgre_net_id);
398 	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
399 				  iph->saddr, iph->daddr, tpi->key);
	/* ARPHRD_NONE devices (collect_md) keep no MAC header. */
402 	if (tunnel->dev->type != ARPHRD_NONE)
403 		skb_pop_mac_header(skb);
405 		skb_reset_mac_header(skb);
406 	if (tunnel->collect_md) {
410 		flags = tpi->flags & (TUNNEL_CSUM | TUNNEL_KEY);
411 		tun_id = key_to_tunnel_id(tpi->key);
412 		tun_dst = ip_tun_rx_dst(skb, flags, tun_id, 0);
414 			return PACKET_REJECT;
417 	ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error);
420 	return PACKET_REJECT;
/* Protocol receive hook: drop looped-back multicast, parse the GRE
 * header, strip it, and hand the payload to ipgre_rcv(); unknown
 * destinations get an ICMP port-unreachable.
 * NOTE(review): drop/return paths and the hdr_len error check are
 * elided from this excerpt. */
423 static int gre_rcv(struct sk_buff *skb)
425 	struct tnl_ptk_info tpi;
426 	bool csum_err = false;
429 #ifdef CONFIG_NET_IPGRE_BROADCAST
430 	if (ipv4_is_multicast(ip_hdr(skb)->daddr)) {
431 		/* Looped back packet, drop it! */
432 		if (rt_is_output_route(skb_rtable(skb)))
437 	hdr_len = parse_gre_header(skb, &tpi, &csum_err);
440 	if (iptunnel_pull_header(skb, hdr_len, tpi.proto) < 0)
443 	if (ipgre_rcv(skb, &tpi) == PACKET_RCVD)
446 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
/* Push a GRE header of hdr_len bytes onto skb and fill it in: flags,
 * protocol, then — walking backwards from the end — optional SEQ, KEY
 * and CSUM words.  The checksum word is computed in software only when
 * GSO will not do it (no SKB_GSO_GRE/SKB_GSO_GRE_CSUM).
 * NOTE(review): the seq/key stores and pointer decrements are elided
 * from this excerpt. */
452 static void build_header(struct sk_buff *skb, int hdr_len, __be16 flags,
453 			 __be16 proto, __be32 key, __be32 seq)
455 	struct gre_base_hdr *greh;
457 	skb_push(skb, hdr_len);
459 	skb_reset_transport_header(skb);
460 	greh = (struct gre_base_hdr *)skb->data;
461 	greh->flags = tnl_flags_to_gre_flags(flags);
462 	greh->protocol = proto;
464 	if (flags & (TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_SEQ)) {
		/* ptr starts at the last optional 32-bit word. */
465 		__be32 *ptr = (__be32 *)(((u8 *)greh) + hdr_len - 4);
467 		if (flags & TUNNEL_SEQ) {
471 		if (flags & TUNNEL_KEY) {
475 		if (flags & TUNNEL_CSUM &&
476 		    !(skb_shinfo(skb)->gso_type &
477 		      (SKB_GSO_GRE | SKB_GSO_GRE_CSUM))) {
479 			*(__sum16 *)ptr = csum_fold(skb_checksum(skb, 0,
/* Common transmit path for configured (non-collect_md) tunnels: bump the
 * output sequence number if TUNNEL_SEQ is set, push the GRE header from
 * the tunnel's parms, then hand off to the generic IP tunnel xmit.
 * NOTE(review): the o_seqno increment line is elided from this excerpt. */
485 static void __gre_xmit(struct sk_buff *skb, struct net_device *dev,
486 		       const struct iphdr *tnl_params,
489 	struct ip_tunnel *tunnel = netdev_priv(dev);
491 	if (tunnel->parms.o_flags & TUNNEL_SEQ)
494 	/* Push GRE header. */
495 	build_header(skb, tunnel->tun_hlen, tunnel->parms.o_flags,
496 		     proto, tunnel->parms.o_key, htonl(tunnel->o_seqno));
498 	skb_set_inner_protocol(skb, proto);
499 	ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol);
/* Thin wrapper: prepare skb for GRE segmentation offload, selecting the
 * checksum-capable GSO type when the tunnel wants checksums. */
502 static struct sk_buff *gre_handle_offloads(struct sk_buff *skb,
505 	return iptunnel_handle_offloads(skb, csum,
506 					csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE);
/* Build an IPv4 flow from the (collect_md) tunnel key — dst/src address,
 * TOS, skb mark, GRE protocol — and route it.  Returns the route or an
 * ERR_PTR from ip_route_output_key(). */
509 static struct rtable *gre_get_rt(struct sk_buff *skb,
510 				 struct net_device *dev,
512 				 const struct ip_tunnel_key *key)
514 	struct net *net = dev_net(dev);
516 	memset(fl, 0, sizeof(*fl));
517 	fl->daddr = key->u.ipv4.dst;
518 	fl->saddr = key->u.ipv4.src;
519 	fl->flowi4_tos = RT_TOS(key->tos);
520 	fl->flowi4_mark = skb->mark;
521 	fl->flowi4_proto = IPPROTO_GRE;
523 	return ip_route_output_key(net, fl);
/* Flow-based ("external"/collect_md) transmit: take per-packet tunnel
 * metadata from the skb, route it, make headroom, apply GSO offload
 * setup, push the GRE header and emit via iptunnel_xmit().  Packets
 * without valid IPv4 TX metadata are dropped (tx_dropped).
 * NOTE(review): several local declarations, error-goto lines and the
 * tail of the error path are elided from this excerpt. */
526 static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev,
529 	struct ip_tunnel_info *tun_info;
530 	const struct ip_tunnel_key *key;
538 	tun_info = skb_tunnel_info(skb);
539 	if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
540 		     ip_tunnel_info_af(tun_info) != AF_INET))
543 	key = &tun_info->key;
544 	rt = gre_get_rt(skb, dev, &fl, key);
548 	tunnel_hlen = ip_gre_calc_hlen(key->tun_flags);
550 	min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len
551 			+ tunnel_hlen + sizeof(struct iphdr);
552 	if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) {
553 		int head_delta = SKB_DATA_ALIGN(min_headroom -
556 		err = pskb_expand_head(skb, max_t(int, head_delta, 0),
562 	/* Push Tunnel header. */
563 	skb = gre_handle_offloads(skb, !!(tun_info->key.tun_flags & TUNNEL_CSUM));
569 	flags = tun_info->key.tun_flags & (TUNNEL_CSUM | TUNNEL_KEY);
570 	build_header(skb, tunnel_hlen, flags, proto,
571 		     tunnel_id_to_key(tun_info->key.tun_id), 0);
573 	df = key->tun_flags & TUNNEL_DONT_FRAGMENT ?  htons(IP_DF) : 0;
574 	err = iptunnel_xmit(skb->sk, rt, skb, fl.saddr,
575 			    key->u.ipv4.dst, IPPROTO_GRE,
576 			    key->tos, key->ttl, df, false);
577 	iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
584 	dev->stats.tx_dropped++;
/* ndo_fill_metadata_dst: resolve the route a flow-based GRE packet would
 * take and record the chosen source address back into the tunnel key
 * (used by e.g. OVS to pre-populate metadata).
 * NOTE(review): error returns and route release are elided from this
 * excerpt. */
587 static int gre_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
589 	struct ip_tunnel_info *info = skb_tunnel_info(skb);
593 	if (ip_tunnel_info_af(info) != AF_INET)
596 	rt = gre_get_rt(skb, dev, &fl4, &info->key);
601 	info->key.u.ipv4.src = fl4.saddr;
/* ndo_start_xmit for plain "gre" devices.  collect_md tunnels go through
 * the flow-based path; header_ops devices (broadcast GRE) already carry
 * a prebuilt outer IP header at skb->data, which is used as tnl_params;
 * otherwise the tunnel's configured iph is used.
 * NOTE(review): NETDEV_TX_OK returns, free_skb error path tail and some
 * braces are elided from this excerpt. */
605 static netdev_tx_t ipgre_xmit(struct sk_buff *skb,
606 			      struct net_device *dev)
608 	struct ip_tunnel *tunnel = netdev_priv(dev);
609 	const struct iphdr *tnl_params;
611 	if (tunnel->collect_md) {
612 		gre_fb_xmit(skb, dev, skb->protocol);
616 	if (dev->header_ops) {
617 		/* Need space for new headers */
618 		if (skb_cow_head(skb, dev->needed_headroom -
619 				      (tunnel->hlen + sizeof(struct iphdr))))
622 		tnl_params = (const struct iphdr *)skb->data;
624 		/* Pull skb since ip_tunnel_xmit() needs skb->data pointing
627 		skb_pull(skb, tunnel->hlen + sizeof(struct iphdr));
628 		skb_reset_mac_header(skb);
630 		if (skb_cow_head(skb, dev->needed_headroom))
633 		tnl_params = &tunnel->parms.iph;
636 	skb = gre_handle_offloads(skb, !!(tunnel->parms.o_flags&TUNNEL_CSUM));
640 	__gre_xmit(skb, dev, tnl_params, skb->protocol);
646 	dev->stats.tx_dropped++;
/* ndo_start_xmit for "gretap" (Ethernet-over-GRE) devices: inner
 * protocol is always ETH_P_TEB; collect_md goes through the flow-based
 * path, otherwise offloads + headroom are prepared and __gre_xmit()
 * is used.
 * NOTE(review): NETDEV_TX_OK returns and the free_skb error path tail
 * are elided from this excerpt. */
650 static netdev_tx_t gre_tap_xmit(struct sk_buff *skb,
651 				struct net_device *dev)
653 	struct ip_tunnel *tunnel = netdev_priv(dev);
655 	if (tunnel->collect_md) {
656 		gre_fb_xmit(skb, dev, htons(ETH_P_TEB));
660 	skb = gre_handle_offloads(skb, !!(tunnel->parms.o_flags&TUNNEL_CSUM));
664 	if (skb_cow_head(skb, dev->needed_headroom))
667 	__gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_TEB));
673 	dev->stats.tx_dropped++;
/* Legacy SIOC{ADD,CHG,DEL,GET}TUNNEL ioctl handler: copy parms from
 * userspace, validate ADD/CHG parameters, convert GRE_* wire flags to
 * internal TUNNEL_* flags for the core, then convert back before copying
 * the result out.
 * NOTE(review): -EFAULT/-EINVAL returns and the err propagation are
 * elided from this excerpt. */
677 static int ipgre_tunnel_ioctl(struct net_device *dev,
678 			      struct ifreq *ifr, int cmd)
681 	struct ip_tunnel_parm p;
683 	if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
685 	if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) {
686 		if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
687 		    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
688 		    ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
691 	p.i_flags = gre_flags_to_tnl_flags(p.i_flags);
692 	p.o_flags = gre_flags_to_tnl_flags(p.o_flags);
694 	err = ip_tunnel_ioctl(dev, &p, cmd);
698 	p.i_flags = tnl_flags_to_gre_flags(p.i_flags);
699 	p.o_flags = tnl_flags_to_gre_flags(p.o_flags);
701 	if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
706 /* Nice toy. Unfortunately, useless in real life :-)
707 It allows to construct virtual multiprotocol broadcast "LAN"
708 over the Internet, provided multicast routing is tuned.
711 I have no idea whether this bicycle was invented before me,
712 so that I had to set ARPHRD_IPGRE to a random value.
713 I have an impression, that Cisco could make something similar,
714 but this feature is apparently missing in IOS<=11.2(8).
716 I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
717 with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
719 ping -t 255 224.66.66.66
721 If nobody answers, mbone does not work.
723 ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
724 ip addr add 10.66.66.<somewhat>/24 dev Universe
726 ifconfig Universe add fe80::<Your_real_addr>/10
727 ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
730 ftp fec0:6666:6666::193.233.7.65
/* header_ops->create for broadcast GRE: push outer IP header + GRE base
 * header, seeding the IP header from the tunnel parms and overriding
 * saddr/daddr from the caller-supplied "hardware" addresses (which are
 * IPv4 addresses here).  Returns total pushed length, or its negative
 * when the header is incomplete (per header_ops convention).
 * NOTE(review): the NULL checks guarding the saddr/daddr memcpys are
 * elided from this excerpt. */
733 static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
735 			const void *daddr, const void *saddr, unsigned int len)
737 	struct ip_tunnel *t = netdev_priv(dev);
739 	struct gre_base_hdr *greh;
741 	iph = (struct iphdr *)skb_push(skb, t->hlen + sizeof(*iph));
742 	greh = (struct gre_base_hdr *)(iph+1);
743 	greh->flags = tnl_flags_to_gre_flags(t->parms.o_flags);
744 	greh->protocol = htons(type);
746 	memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
748 	/* Set the source hardware address. */
750 		memcpy(&iph->saddr, saddr, 4);
752 		memcpy(&iph->daddr, daddr, 4);
754 		return t->hlen + sizeof(*iph);
756 	return -(t->hlen + sizeof(*iph));
/* header_ops->parse: the "hardware address" of a GRE peer is simply the
 * outer IPv4 source address (4 bytes).  Return of that length is elided
 * from this excerpt. */
759 static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
761 	const struct iphdr *iph = (const struct iphdr *) skb_mac_header(skb);
762 	memcpy(haddr, &iph->saddr, 4);
/* header_ops used by broadcast/point-to-point GRE devices. */
766 static const struct header_ops ipgre_header_ops = {
767 	.create	= ipgre_header,
768 	.parse	= ipgre_header_parse,
771 #ifdef CONFIG_NET_IPGRE_BROADCAST
772 static int ipgre_open(struct net_device *dev)
774 struct ip_tunnel *t = netdev_priv(dev);
776 if (ipv4_is_multicast(t->parms.iph.daddr)) {
780 rt = ip_route_output_gre(t->net, &fl4,
784 RT_TOS(t->parms.iph.tos),
787 return -EADDRNOTAVAIL;
790 if (!__in_dev_get_rtnl(dev))
791 return -EADDRNOTAVAIL;
792 t->mlink = dev->ifindex;
793 ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
/* ndo_stop counterpart of ipgre_open(): leave the multicast group on the
 * remembered link, if one was joined.
 * NOTE(review): the in_dev NULL check and return are elided from this
 * excerpt. */
798 static int ipgre_close(struct net_device *dev)
800 	struct ip_tunnel *t = netdev_priv(dev);
802 	if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
803 		struct in_device *in_dev;
804 		in_dev = inetdev_by_index(t->net, t->mlink);
806 			ip_mc_dec_group(in_dev, t->parms.iph.daddr);
/* netdev ops for plain "gre" devices; open/stop are only needed for the
 * multicast-broadcast variant. */
812 static const struct net_device_ops ipgre_netdev_ops = {
813 	.ndo_init		= ipgre_tunnel_init,
814 	.ndo_uninit		= ip_tunnel_uninit,
815 #ifdef CONFIG_NET_IPGRE_BROADCAST
816 	.ndo_open		= ipgre_open,
817 	.ndo_stop		= ipgre_close,
819 	.ndo_start_xmit		= ipgre_xmit,
820 	.ndo_do_ioctl		= ipgre_tunnel_ioctl,
821 	.ndo_change_mtu		= ip_tunnel_change_mtu,
822 	.ndo_get_stats64	= ip_tunnel_get_stats64,
823 	.ndo_get_iflink		= ip_tunnel_get_iflink,
/* Feature set shared by gre/gretap devices; continuation lines of the
 * macro (HW_CSUM/HIGHDMA/etc.) are elided from this excerpt. */
826 #define GRE_FEATURES	(NETIF_F_SG |		\
/* rtnl setup callback for plain "gre": install ops, mark the link type
 * ARPHRD_IPGRE and register with the gre per-netns registry. */
831 static void ipgre_tunnel_setup(struct net_device *dev)
833 	dev->netdev_ops		= &ipgre_netdev_ops;
834 	dev->type		= ARPHRD_IPGRE;
835 	ip_tunnel_setup(dev, ipgre_net_id);
/* Shared init for gre and gretap: compute header lengths from o_flags,
 * derive needed_headroom/MTU (the extra 4 accounts for a possible key),
 * and enable GSO/LLTX features where the flag combination allows it.
 * NOTE(review): some closing braces and condition tails are elided. */
838 static void __gre_tunnel_init(struct net_device *dev)
840 	struct ip_tunnel *tunnel;
843 	tunnel = netdev_priv(dev);
844 	tunnel->tun_hlen = ip_gre_calc_hlen(tunnel->parms.o_flags);
845 	tunnel->parms.iph.protocol = IPPROTO_GRE;
847 	tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen;
849 	t_hlen = tunnel->hlen + sizeof(struct iphdr);
851 	dev->needed_headroom	= LL_MAX_HEADER + t_hlen + 4;
852 	dev->mtu		= ETH_DATA_LEN - t_hlen - 4;
854 	dev->features		|= GRE_FEATURES;
855 	dev->hw_features	|= GRE_FEATURES;
857 	if (!(tunnel->parms.o_flags & TUNNEL_SEQ)) {
858 		/* TCP offload with GRE SEQ is not supported, nor
859 		 * can we support 2 levels of outer headers requiring
862 		if (!(tunnel->parms.o_flags & TUNNEL_CSUM) ||
863 		    (tunnel->encap.type == TUNNEL_ENCAP_NONE)) {
864 			dev->features    |= NETIF_F_GSO_SOFTWARE;
865 			dev->hw_features |= NETIF_F_GSO_SOFTWARE;
868 		/* Can use a lockless transmit, unless we generate
871 		dev->features |= NETIF_F_LLTX;
/* ndo_init for plain "gre": after shared init, seed dev_addr/broadcast
 * from the tunnel endpoints; configured (non-collect_md) tunnels with a
 * remote get header_ops installed, multicast remotes additionally get
 * IFF_BROADCAST (requires local addr + TTL per the elided checks).
 * NOTE(review): several condition lines/braces are elided. */
875 static int ipgre_tunnel_init(struct net_device *dev)
877 	struct ip_tunnel *tunnel = netdev_priv(dev);
878 	struct iphdr *iph = &tunnel->parms.iph;
880 	__gre_tunnel_init(dev);
882 	memcpy(dev->dev_addr, &iph->saddr, 4);
883 	memcpy(dev->broadcast, &iph->daddr, 4);
885 	dev->flags		= IFF_NOARP;
889 	if (iph->daddr && !tunnel->collect_md) {
890 #ifdef CONFIG_NET_IPGRE_BROADCAST
891 		if (ipv4_is_multicast(iph->daddr)) {
894 			dev->flags = IFF_BROADCAST;
895 			dev->header_ops = &ipgre_header_ops;
898 	} else if (!tunnel->collect_md) {
899 		dev->header_ops = &ipgre_header_ops;
902 	return ip_tunnel_init(dev);
/* GRE demux registration: error handler shown; the .handler (gre_rcv)
 * line is elided from this excerpt. */
905 static const struct gre_protocol ipgre_protocol = {
907 	.err_handler = gre_err,
/* Per-netns init for "gre": NULL name means no default fallback device
 * beyond what the core creates. */
910 static int __net_init ipgre_init_net(struct net *net)
912 	return ip_tunnel_init_net(net, ipgre_net_id, &ipgre_link_ops, NULL);
/* Per-netns teardown for "gre": delete all tunnels in this namespace. */
915 static void __net_exit ipgre_exit_net(struct net *net)
917 	struct ip_tunnel_net *itn = net_generic(net, ipgre_net_id);
918 	ip_tunnel_delete_net(itn, &ipgre_link_ops);
/* pernet registration for plain "gre" (the .id line is elided here). */
921 static struct pernet_operations ipgre_net_ops = {
922 	.init = ipgre_init_net,
923 	.exit = ipgre_exit_net,
925 	.size = sizeof(struct ip_tunnel_net),
/* rtnl validate for "gre": reject GRE_VERSION/GRE_ROUTING flag bits and
 * the unsupported combination of collect_md with a non-NONE encap type.
 * NOTE(review): the flags declaration, early return for !data, and the
 * -EINVAL/0 returns are elided from this excerpt. */
928 static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
936 	if (data[IFLA_GRE_IFLAGS])
937 		flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
938 	if (data[IFLA_GRE_OFLAGS])
939 		flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
940 	if (flags & (GRE_VERSION|GRE_ROUTING))
943 	if (data[IFLA_GRE_COLLECT_METADATA] &&
944 	    data[IFLA_GRE_ENCAP_TYPE] &&
945 	    nla_get_u16(data[IFLA_GRE_ENCAP_TYPE]) != TUNNEL_ENCAP_NONE)
/* rtnl validate for "gretap": additionally check the Ethernet address
 * attribute and (per the elided lines) reject multicast remotes, then
 * defer to the common GRE validation.
 * NOTE(review): the daddr declaration, multicast check and -EINVAL
 * returns are elided from this excerpt. */
951 static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
955 	if (tb[IFLA_ADDRESS]) {
956 		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
958 		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
959 			return -EADDRNOTAVAIL;
965 	if (data[IFLA_GRE_REMOTE]) {
966 		memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
972 	return ipgre_tunnel_validate(tb, data);
/* Translate netlink IFLA_GRE_* attributes into ip_tunnel_parm: flags are
 * converted wire->internal, PMTU discovery defaults to on (DF set), and
 * IFLA_GRE_COLLECT_METADATA switches the device into flow-based mode
 * (ARPHRD_NONE so no MAC header is expected).
 * NOTE(review): the early "if (!data) return" and closing braces are
 * elided from this excerpt. */
975 static void ipgre_netlink_parms(struct net_device *dev,
976 				struct nlattr *data[],
978 				struct ip_tunnel_parm *parms)
980 	memset(parms, 0, sizeof(*parms));
982 	parms->iph.protocol = IPPROTO_GRE;
987 	if (data[IFLA_GRE_LINK])
988 		parms->link = nla_get_u32(data[IFLA_GRE_LINK]);
990 	if (data[IFLA_GRE_IFLAGS])
991 		parms->i_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_IFLAGS]));
993 	if (data[IFLA_GRE_OFLAGS])
994 		parms->o_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_OFLAGS]));
996 	if (data[IFLA_GRE_IKEY])
997 		parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);
999 	if (data[IFLA_GRE_OKEY])
1000 		parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);
1002 	if (data[IFLA_GRE_LOCAL])
1003 		parms->iph.saddr = nla_get_in_addr(data[IFLA_GRE_LOCAL]);
1005 	if (data[IFLA_GRE_REMOTE])
1006 		parms->iph.daddr = nla_get_in_addr(data[IFLA_GRE_REMOTE]);
1008 	if (data[IFLA_GRE_TTL])
1009 		parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);
1011 	if (data[IFLA_GRE_TOS])
1012 		parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);
	/* PMTU discovery on by default: absent attribute means DF set. */
1014 	if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
1015 		parms->iph.frag_off = htons(IP_DF);
1017 	if (data[IFLA_GRE_COLLECT_METADATA]) {
1018 		struct ip_tunnel *t = netdev_priv(dev);
1020 		t->collect_md = true;
1021 		if (dev->type == ARPHRD_IPGRE)
1022 			dev->type = ARPHRD_NONE;
1026 /* This function returns true when ENCAP attributes are present in the nl msg
 * Extracts IFLA_GRE_ENCAP_{TYPE,FLAGS,SPORT,DPORT} into ipencap.
 * NOTE(review): the "ret = true" lines, the !data early return and the
 * final return are elided from this excerpt. */
1027 static bool ipgre_netlink_encap_parms(struct nlattr *data[],
1028 				      struct ip_tunnel_encap *ipencap)
1032 	memset(ipencap, 0, sizeof(*ipencap));
1037 	if (data[IFLA_GRE_ENCAP_TYPE]) {
1039 		ipencap->type = nla_get_u16(data[IFLA_GRE_ENCAP_TYPE]);
1042 	if (data[IFLA_GRE_ENCAP_FLAGS]) {
1044 		ipencap->flags = nla_get_u16(data[IFLA_GRE_ENCAP_FLAGS]);
1047 	if (data[IFLA_GRE_ENCAP_SPORT]) {
1049 		ipencap->sport = nla_get_be16(data[IFLA_GRE_ENCAP_SPORT]);
1052 	if (data[IFLA_GRE_ENCAP_DPORT]) {
1054 		ipencap->dport = nla_get_be16(data[IFLA_GRE_ENCAP_DPORT]);
/* ndo_init for "gretap": shared GRE init plus live MAC address changes,
 * then generic tunnel init. */
1060 static int gre_tap_init(struct net_device *dev)
1062 	__gre_tunnel_init(dev);
1063 	dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
1065 	return ip_tunnel_init(dev);
/* netdev ops for "gretap": Ethernet-style address handling plus the
 * flow-based metadata hook. */
1068 static const struct net_device_ops gre_tap_netdev_ops = {
1069 	.ndo_init		= gre_tap_init,
1070 	.ndo_uninit		= ip_tunnel_uninit,
1071 	.ndo_start_xmit		= gre_tap_xmit,
1072 	.ndo_set_mac_address 	= eth_mac_addr,
1073 	.ndo_validate_addr	= eth_validate_addr,
1074 	.ndo_change_mtu		= ip_tunnel_change_mtu,
1075 	.ndo_get_stats64	= ip_tunnel_get_stats64,
1076 	.ndo_get_iflink		= ip_tunnel_get_iflink,
1077 	.ndo_fill_metadata_dst	= gre_fill_metadata_dst,
/* rtnl setup callback for "gretap"; the ether_setup() call is elided
 * from this excerpt. */
1080 static void ipgre_tap_setup(struct net_device *dev)
1083 	dev->netdev_ops		= &gre_tap_netdev_ops;
1084 	dev->priv_flags 	|= IFF_LIVE_ADDR_CHANGE;
1085 	ip_tunnel_setup(dev, gre_tap_net_id);
/* rtnl newlink: apply optional encapsulation settings, parse tunnel
 * parameters and hand creation to the generic tunnel core.
 * NOTE(review): err propagation after ip_tunnel_encap_setup() is elided
 * from this excerpt. */
1088 static int ipgre_newlink(struct net *src_net, struct net_device *dev,
1089 			 struct nlattr *tb[], struct nlattr *data[])
1091 	struct ip_tunnel_parm p;
1092 	struct ip_tunnel_encap ipencap;
1094 	if (ipgre_netlink_encap_parms(data, &ipencap)) {
1095 		struct ip_tunnel *t = netdev_priv(dev);
1096 		int err = ip_tunnel_encap_setup(t, &ipencap);
1102 	ipgre_netlink_parms(dev, data, tb, &p);
1103 	return ip_tunnel_newlink(dev, tb, &p);
/* rtnl changelink: identical flow to ipgre_newlink() but modifies an
 * existing device via ip_tunnel_changelink().
 * NOTE(review): err propagation after ip_tunnel_encap_setup() is elided
 * from this excerpt. */
1106 static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
1107 			    struct nlattr *data[])
1109 	struct ip_tunnel_parm p;
1110 	struct ip_tunnel_encap ipencap;
1112 	if (ipgre_netlink_encap_parms(data, &ipencap)) {
1113 		struct ip_tunnel *t = netdev_priv(dev);
1114 		int err = ip_tunnel_encap_setup(t, &ipencap);
1120 	ipgre_netlink_parms(dev, data, tb, &p);
1121 	return ip_tunnel_changelink(dev, tb, &p);
/* Upper bound on the netlink dump size for a GRE device; one
 * nla_total_size() term per attribute emitted by ipgre_fill_info().
 * NOTE(review): the actual size expressions between the attribute
 * comments are elided from this excerpt. */
1124 static size_t ipgre_get_size(const struct net_device *dev)
1129 		/* IFLA_GRE_IFLAGS */
1131 		/* IFLA_GRE_OFLAGS */
1137 		/* IFLA_GRE_LOCAL */
1139 		/* IFLA_GRE_REMOTE */
1145 		/* IFLA_GRE_PMTUDISC */
1147 		/* IFLA_GRE_ENCAP_TYPE */
1149 		/* IFLA_GRE_ENCAP_FLAGS */
1151 		/* IFLA_GRE_ENCAP_SPORT */
1153 		/* IFLA_GRE_ENCAP_DPORT */
1155 		/* IFLA_GRE_COLLECT_METADATA */
/* Dump device configuration to netlink: tunnel parms (flags converted
 * back to wire format), encapsulation settings, and the collect_md flag.
 * NOTE(review): the success return, the nla_put_failure label/-EMSGSIZE
 * return, and some encap value expressions are elided from this
 * excerpt. */
1160 static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
1162 	struct ip_tunnel *t = netdev_priv(dev);
1163 	struct ip_tunnel_parm *p = &t->parms;
1165 	if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) ||
1166 	    nla_put_be16(skb, IFLA_GRE_IFLAGS, tnl_flags_to_gre_flags(p->i_flags)) ||
1167 	    nla_put_be16(skb, IFLA_GRE_OFLAGS, tnl_flags_to_gre_flags(p->o_flags)) ||
1168 	    nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) ||
1169 	    nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) ||
1170 	    nla_put_in_addr(skb, IFLA_GRE_LOCAL, p->iph.saddr) ||
1171 	    nla_put_in_addr(skb, IFLA_GRE_REMOTE, p->iph.daddr) ||
1172 	    nla_put_u8(skb, IFLA_GRE_TTL, p->iph.ttl) ||
1173 	    nla_put_u8(skb, IFLA_GRE_TOS, p->iph.tos) ||
1174 	    nla_put_u8(skb, IFLA_GRE_PMTUDISC,
1175 		       !!(p->iph.frag_off & htons(IP_DF))))
1176 		goto nla_put_failure;
1178 	if (nla_put_u16(skb, IFLA_GRE_ENCAP_TYPE,
1180 	    nla_put_be16(skb, IFLA_GRE_ENCAP_SPORT,
1182 	    nla_put_be16(skb, IFLA_GRE_ENCAP_DPORT,
1184 	    nla_put_u16(skb, IFLA_GRE_ENCAP_FLAGS,
1186 		goto nla_put_failure;
1188 	if (t->collect_md) {
1189 		if (nla_put_flag(skb, IFLA_GRE_COLLECT_METADATA))
1190 			goto nla_put_failure;
/* Netlink attribute policy shared by gre and gretap link ops. */
1199 static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
1200 	[IFLA_GRE_LINK]		= { .type = NLA_U32 },
1201 	[IFLA_GRE_IFLAGS]	= { .type = NLA_U16 },
1202 	[IFLA_GRE_OFLAGS]	= { .type = NLA_U16 },
1203 	[IFLA_GRE_IKEY]		= { .type = NLA_U32 },
1204 	[IFLA_GRE_OKEY]		= { .type = NLA_U32 },
1205 	[IFLA_GRE_LOCAL]	= { .len = FIELD_SIZEOF(struct iphdr, saddr) },
1206 	[IFLA_GRE_REMOTE]	= { .len = FIELD_SIZEOF(struct iphdr, daddr) },
1207 	[IFLA_GRE_TTL]		= { .type = NLA_U8 },
1208 	[IFLA_GRE_TOS]		= { .type = NLA_U8 },
1209 	[IFLA_GRE_PMTUDISC]	= { .type = NLA_U8 },
1210 	[IFLA_GRE_ENCAP_TYPE]	= { .type = NLA_U16 },
1211 	[IFLA_GRE_ENCAP_FLAGS]	= { .type = NLA_U16 },
1212 	[IFLA_GRE_ENCAP_SPORT]	= { .type = NLA_U16 },
1213 	[IFLA_GRE_ENCAP_DPORT]	= { .type = NLA_U16 },
1214 	[IFLA_GRE_COLLECT_METADATA]	= { .type = NLA_FLAG },
/* rtnl_link_ops for "ip link ... type gre" (the .kind string line is
 * elided from this excerpt). */
1217 static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
1219 	.maxtype	= IFLA_GRE_MAX,
1220 	.policy		= ipgre_policy,
1221 	.priv_size	= sizeof(struct ip_tunnel),
1222 	.setup		= ipgre_tunnel_setup,
1223 	.validate	= ipgre_tunnel_validate,
1224 	.newlink	= ipgre_newlink,
1225 	.changelink	= ipgre_changelink,
1226 	.dellink	= ip_tunnel_dellink,
1227 	.get_size	= ipgre_get_size,
1228 	.fill_info	= ipgre_fill_info,
1229 	.get_link_net	= ip_tunnel_get_link_net,
/* rtnl_link_ops for "ip link ... type gretap" (the .kind string line is
 * elided from this excerpt). */
1232 static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
1234 	.maxtype	= IFLA_GRE_MAX,
1235 	.policy		= ipgre_policy,
1236 	.priv_size	= sizeof(struct ip_tunnel),
1237 	.setup		= ipgre_tap_setup,
1238 	.validate	= ipgre_tap_validate,
1239 	.newlink	= ipgre_newlink,
1240 	.changelink	= ipgre_changelink,
1241 	.dellink	= ip_tunnel_dellink,
1242 	.get_size	= ipgre_get_size,
1243 	.fill_info	= ipgre_fill_info,
1244 	.get_link_net	= ip_tunnel_get_link_net,
/* Exported helper (used by openvswitch): create a flow-based (collect_md)
 * gretap device programmatically — build an empty attribute table, create
 * the link, raise the MTU to the maximum and bring the link up; on any
 * later failure the device is torn down again via the elided error path.
 * NOTE(review): IS_ERR checks, err goto lines and the success return are
 * elided from this excerpt. */
1247 struct net_device *gretap_fb_dev_create(struct net *net, const char *name,
1248 					u8 name_assign_type)
1250 	struct nlattr *tb[IFLA_MAX + 1];
1251 	struct net_device *dev;
1252 	LIST_HEAD(list_kill);
1253 	struct ip_tunnel *t;
1256 	memset(&tb, 0, sizeof(tb));
1258 	dev = rtnl_create_link(net, name, name_assign_type,
1259 			       &ipgre_tap_ops, tb);
1263 	/* Configure flow based GRE device. */
1264 	t = netdev_priv(dev);
1265 	t->collect_md = true;
1267 	err = ipgre_newlink(net, dev, tb, NULL);
1270 		return ERR_PTR(err);
1273 	/* openvswitch users expect packet sizes to be unrestricted,
1274 	 * so set the largest MTU we can.
1276 	err = __ip_tunnel_change_mtu(dev, IP_MAX_MTU, false);
1280 	err = rtnl_configure_link(dev, NULL);
1286 	ip_tunnel_dellink(dev, &list_kill);
1287 	unregister_netdevice_many(&list_kill);
1288 	return ERR_PTR(err);
1290 EXPORT_SYMBOL_GPL(gretap_fb_dev_create);
/* Per-netns init for "gretap", creating the default "gretap0" device. */
1292 static int __net_init ipgre_tap_init_net(struct net *net)
1294 	return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, "gretap0");
/* Per-netns teardown for "gretap". */
1297 static void __net_exit ipgre_tap_exit_net(struct net *net)
1299 	struct ip_tunnel_net *itn = net_generic(net, gre_tap_net_id);
1300 	ip_tunnel_delete_net(itn, &ipgre_tap_ops);
/* pernet registration for "gretap". */
1303 static struct pernet_operations ipgre_tap_net_ops = {
1304 	.init = ipgre_tap_init_net,
1305 	.exit = ipgre_tap_exit_net,
1306 	.id   = &gre_tap_net_id,
1307 	.size = sizeof(struct ip_tunnel_net),
/* Module init: register pernet ops, the GRE protocol handler and both
 * rtnl link ops, unwinding in reverse order on failure.
 * NOTE(review): err checks between the steps and the success return are
 * elided from this excerpt.  The label "pnet_tap_faied" is a historical
 * misspelling of "failed"; its definition is not visible here, so the
 * goto must stay in sync with it and is left untouched. */
1310 static int __init ipgre_init(void)
1314 	pr_info("GRE over IPv4 tunneling driver\n");
1316 	err = register_pernet_device(&ipgre_net_ops);
1320 	err = register_pernet_device(&ipgre_tap_net_ops);
1322 		goto pnet_tap_faied;
1324 	err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
1326 		pr_info("%s: can't add protocol\n", __func__);
1327 		goto add_proto_failed;
1330 	err = rtnl_link_register(&ipgre_link_ops);
1332 		goto rtnl_link_failed;
1334 	err = rtnl_link_register(&ipgre_tap_ops);
1336 		goto tap_ops_failed;
1341 	rtnl_link_unregister(&ipgre_link_ops);
1343 	gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
1345 	unregister_pernet_device(&ipgre_tap_net_ops);
1347 	unregister_pernet_device(&ipgre_net_ops);
/* Module exit: exact reverse of the registrations done in ipgre_init(). */
1351 static void __exit ipgre_fini(void)
1353 	rtnl_link_unregister(&ipgre_tap_ops);
1354 	rtnl_link_unregister(&ipgre_link_ops);
1355 	gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
1356 	unregister_pernet_device(&ipgre_tap_net_ops);
1357 	unregister_pernet_device(&ipgre_net_ops);
/* Module boilerplate: entry/exit points, license and auto-load aliases
 * for the rtnl link kinds and default device names. */
1360 module_init(ipgre_init);
1361 module_exit(ipgre_fini);
1362 MODULE_LICENSE("GPL");
1363 MODULE_ALIAS_RTNL_LINK("gre");
1364 MODULE_ALIAS_RTNL_LINK("gretap");
1365 MODULE_ALIAS_NETDEV("gre0");
1366 MODULE_ALIAS_NETDEV("gretap0");