2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * PACKET - implements raw packet sockets.
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
13 * Alan Cox : verify_area() now used correctly
14 * Alan Cox : new skbuff lists, look ma no backlogs!
15 * Alan Cox : tidied skbuff lists.
16 * Alan Cox : Now uses generic datagram routines I
17 * added. Also fixed the peek/read crash
18 * from all old Linux datagram code.
19 * Alan Cox : Uses the improved datagram code.
20 * Alan Cox : Added NULL's for socket options.
21 * Alan Cox : Re-commented the code.
22 * Alan Cox : Use new kernel side addressing
23 * Rob Janssen : Correct MTU usage.
24 * Dave Platt : Counter leaks caused by incorrect
25 * interrupt locking and some slightly
26 * dubious gcc output. Can you read
27 * compiler: it said _VOLATILE_
28 * Richard Kooijman : Timestamp fixes.
29 * Alan Cox : New buffers. Use sk->mac.raw.
30 * Alan Cox : sendmsg/recvmsg support.
31 * Alan Cox : Protocol setting support
32 * Alexey Kuznetsov : Untied from IPv4 stack.
33 * Cyrus Durgin : Fixed kerneld for kmod.
34 * Michal Ostrowski : Module initialization cleanup.
35 * Ulises Alonso : Frame number limit removal and
36 * packet_set_ring memory leak.
37 * Eric Biederman : Allow for > 8 byte hardware addresses.
38 * The convention is that longer addresses
39 * will simply extend the hardware address
40 * byte arrays at the end of sockaddr_ll
42 * Johann Baudy : Added TX RING.
43 * Chetan Loke : Implemented TPACKET_V3 block abstraction
45 * Copyright (C) 2011, <lokec@ccs.neu.edu>
48 * This program is free software; you can redistribute it and/or
49 * modify it under the terms of the GNU General Public License
50 * as published by the Free Software Foundation; either version
51 * 2 of the License, or (at your option) any later version.
55 #include <linux/types.h>
57 #include <linux/capability.h>
58 #include <linux/fcntl.h>
59 #include <linux/socket.h>
61 #include <linux/inet.h>
62 #include <linux/netdevice.h>
63 #include <linux/if_packet.h>
64 #include <linux/wireless.h>
65 #include <linux/kernel.h>
66 #include <linux/kmod.h>
67 #include <linux/slab.h>
68 #include <linux/vmalloc.h>
69 #include <net/net_namespace.h>
71 #include <net/protocol.h>
72 #include <linux/skbuff.h>
74 #include <linux/errno.h>
75 #include <linux/timer.h>
76 #include <linux/uaccess.h>
77 #include <asm/ioctls.h>
79 #include <asm/cacheflush.h>
81 #include <linux/proc_fs.h>
82 #include <linux/seq_file.h>
83 #include <linux/poll.h>
84 #include <linux/module.h>
85 #include <linux/init.h>
86 #include <linux/mutex.h>
87 #include <linux/if_vlan.h>
88 #include <linux/virtio_net.h>
89 #include <linux/errqueue.h>
90 #include <linux/net_tstamp.h>
91 #include <linux/percpu.h>
93 #include <net/inet_common.h>
95 #include <linux/bpf.h>
96 #include <net/compat.h>
102 - if the device has no dev->hard_header routine, it adds and removes the ll
103 header inside itself. In this case the ll header is invisible outside of the
104 device, but higher levels must still reserve dev->hard_header_len.
105 Some devices are clever enough to reallocate the skb when the header
106 does not fit into the reserved space (tunnels); others are not.
108 - a packet socket receives packets with the ll header already pulled,
109 so SOCK_RAW should push it back.
114 Incoming, dev->hard_header!=NULL
115 mac_header -> ll header
118 Outgoing, dev->hard_header!=NULL
119 mac_header -> ll header
122 Incoming, dev->hard_header==NULL
123 mac_header -> UNKNOWN position. It is very likely that it points to the ll
124 header. PPP does this, which is wrong, because it introduces
125 asymmetry between the rx and tx paths.
128 Outgoing, dev->hard_header==NULL
129 mac_header -> data. ll header is still not built!
133 If dev->hard_header==NULL we are unlikely to restore sensible ll header.
139 dev->hard_header != NULL
140 mac_header -> ll header
143 dev->hard_header == NULL (ll header is added by device, we cannot control it)
147 We should set nh.raw on output to the correct position;
148 the packet classifier depends on it.
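   Example (userspace, illustrative only; assumes CAP_NET_RAW and omits
   error handling) - the two delivery modes described above correspond to
   the socket type:

	int raw_fd   = socket(AF_PACKET, SOCK_RAW,   htons(ETH_P_ALL));
	int dgram_fd = socket(AF_PACKET, SOCK_DGRAM, htons(ETH_P_ALL));

   SOCK_RAW frames carry the link-layer header on both rx and tx, while for
   SOCK_DGRAM the ll header is removed on rx and built by the device on tx
   from the sockaddr_ll supplied by the caller.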
151 /* Private packet socket structures. */
153 /* identical to struct packet_mreq except it has
154 * a longer address field.
156 struct packet_mreq_max {
158 unsigned short mr_type;
159 unsigned short mr_alen;
160 unsigned char mr_address[MAX_ADDR_LEN];
164 struct tpacket_hdr *h1;
165 struct tpacket2_hdr *h2;
166 struct tpacket3_hdr *h3;
170 static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
171 int closing, int tx_ring);
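/* struct packet_mreq_max above mirrors the userspace struct packet_mreq
 * used with PACKET_ADD_MEMBERSHIP, just with a larger address field.
 * A hedged userspace sketch of enabling promiscuous mode through it
 * ('ifindex' is a placeholder for the interface index):
 *
 *	struct packet_mreq mr = {
 *		.mr_ifindex = ifindex,
 *		.mr_type    = PACKET_MR_PROMISC,
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP, &mr, sizeof(mr));
 */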
173 #define V3_ALIGNMENT (8)
175 #define BLK_HDR_LEN (ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))
177 #define BLK_PLUS_PRIV(sz_of_priv) \
178 (BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))
180 #define BLOCK_STATUS(x) ((x)->hdr.bh1.block_status)
181 #define BLOCK_NUM_PKTS(x) ((x)->hdr.bh1.num_pkts)
182 #define BLOCK_O2FP(x) ((x)->hdr.bh1.offset_to_first_pkt)
183 #define BLOCK_LEN(x) ((x)->hdr.bh1.blk_len)
184 #define BLOCK_SNUM(x) ((x)->hdr.bh1.seq_num)
185 #define BLOCK_O2PRIV(x) ((x)->offset_to_priv)
186 #define BLOCK_PRIV(x) ((void *)((char *)(x) + BLOCK_O2PRIV(x)))
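/* Worked example of the block layout implied by the macros above,
 * assuming the usual ABI where sizeof(struct tpacket_block_desc) is 48
 * bytes (already 8-byte aligned): with tp_sizeof_priv == 13,
 * ALIGN(13, V3_ALIGNMENT) == 16, so BLK_PLUS_PRIV(13) == 48 + 16 == 64,
 * i.e. the first packet of a freshly opened block starts 64 bytes into
 * the block (this is the value written to BLOCK_O2FP in prb_open_block()).
 */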
189 static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
190 struct packet_type *pt, struct net_device *orig_dev);
192 static void *packet_previous_frame(struct packet_sock *po,
193 struct packet_ring_buffer *rb,
195 static void packet_increment_head(struct packet_ring_buffer *buff);
196 static int prb_curr_blk_in_use(struct tpacket_block_desc *);
197 static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
198 struct packet_sock *);
199 static void prb_retire_current_block(struct tpacket_kbdq_core *,
200 struct packet_sock *, unsigned int status);
201 static int prb_queue_frozen(struct tpacket_kbdq_core *);
202 static void prb_open_block(struct tpacket_kbdq_core *,
203 struct tpacket_block_desc *);
204 static void prb_retire_rx_blk_timer_expired(unsigned long);
205 static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
206 static void prb_init_blk_timer(struct packet_sock *,
207 struct tpacket_kbdq_core *,
208 void (*func) (unsigned long));
209 static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
210 static void prb_clear_rxhash(struct tpacket_kbdq_core *,
211 struct tpacket3_hdr *);
212 static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
213 struct tpacket3_hdr *);
214 static void packet_flush_mclist(struct sock *sk);
215 static void packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb);
217 struct packet_skb_cb {
219 struct sockaddr_pkt pkt;
221 /* Trick: alias skb original length with
222 * ll.sll_family and ll.protocol in order
225 unsigned int origlen;
226 struct sockaddr_ll ll;
231 #define vio_le() virtio_legacy_is_little_endian()
233 #define PACKET_SKB_CB(__skb) ((struct packet_skb_cb *)((__skb)->cb))
235 #define GET_PBDQC_FROM_RB(x) ((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
236 #define GET_PBLOCK_DESC(x, bid) \
237 ((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
238 #define GET_CURR_PBLOCK_DESC_FROM_CORE(x) \
239 ((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
240 #define GET_NEXT_PRB_BLK_NUM(x) \
241 (((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
242 ((x)->kactive_blk_num+1) : 0)
244 static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
245 static void __fanout_link(struct sock *sk, struct packet_sock *po);
247 static int packet_direct_xmit(struct sk_buff *skb)
249 struct net_device *dev = skb->dev;
250 struct sk_buff *orig_skb = skb;
251 struct netdev_queue *txq;
252 int ret = NETDEV_TX_BUSY;
254 if (unlikely(!netif_running(dev) ||
255 !netif_carrier_ok(dev)))
258 skb = validate_xmit_skb_list(skb, dev);
262 packet_pick_tx_queue(dev, skb);
263 txq = skb_get_tx_queue(dev, skb);
267 HARD_TX_LOCK(dev, txq, smp_processor_id());
268 if (!netif_xmit_frozen_or_drv_stopped(txq))
269 ret = netdev_start_xmit(skb, dev, txq, false);
270 HARD_TX_UNLOCK(dev, txq);
274 if (!dev_xmit_complete(ret))
279 atomic_long_inc(&dev->tx_dropped);
281 return NET_XMIT_DROP;
284 static struct net_device *packet_cached_dev_get(struct packet_sock *po)
286 struct net_device *dev;
289 dev = rcu_dereference(po->cached_dev);
297 static void packet_cached_dev_assign(struct packet_sock *po,
298 struct net_device *dev)
300 rcu_assign_pointer(po->cached_dev, dev);
303 static void packet_cached_dev_reset(struct packet_sock *po)
305 RCU_INIT_POINTER(po->cached_dev, NULL);
308 static bool packet_use_direct_xmit(const struct packet_sock *po)
310 return po->xmit == packet_direct_xmit;
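/* For reference, a socket ends up with po->xmit == packet_direct_xmit
 * when userspace enables qdisc bypass; a minimal hedged sketch:
 *
 *	int one = 1;
 *	setsockopt(fd, SOL_PACKET, PACKET_QDISC_BYPASS, &one, sizeof(one));
 */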
313 static u16 __packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
315 return (u16) raw_smp_processor_id() % dev->real_num_tx_queues;
318 static void packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
320 const struct net_device_ops *ops = dev->netdev_ops;
323 if (ops->ndo_select_queue) {
324 queue_index = ops->ndo_select_queue(dev, skb, NULL,
325 __packet_pick_tx_queue);
326 queue_index = netdev_cap_txqueue(dev, queue_index);
328 queue_index = __packet_pick_tx_queue(dev, skb);
331 skb_set_queue_mapping(skb, queue_index);
334 /* __register_prot_hook must be invoked through register_prot_hook
335 * or from a context in which asynchronous accesses to the packet
336 * socket is not possible (packet_create()).
338 static void __register_prot_hook(struct sock *sk)
340 struct packet_sock *po = pkt_sk(sk);
344 __fanout_link(sk, po);
346 dev_add_pack(&po->prot_hook);
353 static void register_prot_hook(struct sock *sk)
355 lockdep_assert_held_once(&pkt_sk(sk)->bind_lock);
356 __register_prot_hook(sk);
359 /* If the sync parameter is true, we will temporarily drop
360 * the po->bind_lock and do a synchronize_net to make sure no
361 * asynchronous packet processing paths still refer to the elements
362 * of po->prot_hook. If the sync parameter is false, it is the
363 * caller's responsibility to take care of this.
365 static void __unregister_prot_hook(struct sock *sk, bool sync)
367 struct packet_sock *po = pkt_sk(sk);
369 lockdep_assert_held_once(&po->bind_lock);
374 __fanout_unlink(sk, po);
376 __dev_remove_pack(&po->prot_hook);
381 spin_unlock(&po->bind_lock);
383 spin_lock(&po->bind_lock);
387 static void unregister_prot_hook(struct sock *sk, bool sync)
389 struct packet_sock *po = pkt_sk(sk);
392 __unregister_prot_hook(sk, sync);
395 static inline struct page * __pure pgv_to_page(void *addr)
397 if (is_vmalloc_addr(addr))
398 return vmalloc_to_page(addr);
399 return virt_to_page(addr);
402 static void __packet_set_status(struct packet_sock *po, void *frame, int status)
404 union tpacket_uhdr h;
407 switch (po->tp_version) {
409 h.h1->tp_status = status;
410 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
413 h.h2->tp_status = status;
414 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
417 h.h3->tp_status = status;
418 flush_dcache_page(pgv_to_page(&h.h3->tp_status));
421 WARN(1, "TPACKET version not supported.\n");
428 static int __packet_get_status(struct packet_sock *po, void *frame)
430 union tpacket_uhdr h;
435 switch (po->tp_version) {
437 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
438 return h.h1->tp_status;
440 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
441 return h.h2->tp_status;
443 flush_dcache_page(pgv_to_page(&h.h3->tp_status));
444 return h.h3->tp_status;
446 WARN(1, "TPACKET version not supported.\n");
452 static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts,
455 struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
458 (flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
459 ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts))
460 return TP_STATUS_TS_RAW_HARDWARE;
462 if (ktime_to_timespec_cond(skb->tstamp, ts))
463 return TP_STATUS_TS_SOFTWARE;
468 static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
471 union tpacket_uhdr h;
475 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
479 switch (po->tp_version) {
481 h.h1->tp_sec = ts.tv_sec;
482 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
485 h.h2->tp_sec = ts.tv_sec;
486 h.h2->tp_nsec = ts.tv_nsec;
489 h.h3->tp_sec = ts.tv_sec;
490 h.h3->tp_nsec = ts.tv_nsec;
493 WARN(1, "TPACKET version not supported.\n");
497 /* one flush is safe, as both fields always lie on the same cacheline */
498 flush_dcache_page(pgv_to_page(&h.h1->tp_sec));
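/* The po->tp_tstamp flags consulted above are set from userspace via
 * PACKET_TIMESTAMP; a hedged sketch (hardware timestamps additionally
 * require NIC/driver support):
 *
 *	int req = SOF_TIMESTAMPING_SOFTWARE | SOF_TIMESTAMPING_RAW_HARDWARE;
 *	setsockopt(fd, SOL_PACKET, PACKET_TIMESTAMP, &req, sizeof(req));
 */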
504 static void *packet_lookup_frame(struct packet_sock *po,
505 struct packet_ring_buffer *rb,
506 unsigned int position,
509 unsigned int pg_vec_pos, frame_offset;
510 union tpacket_uhdr h;
512 pg_vec_pos = position / rb->frames_per_block;
513 frame_offset = position % rb->frames_per_block;
515 h.raw = rb->pg_vec[pg_vec_pos].buffer +
516 (frame_offset * rb->frame_size);
518 if (status != __packet_get_status(po, h.raw))
524 static void *packet_current_frame(struct packet_sock *po,
525 struct packet_ring_buffer *rb,
528 return packet_lookup_frame(po, rb, rb->head, status);
531 static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
533 del_timer_sync(&pkc->retire_blk_timer);
536 static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
537 struct sk_buff_head *rb_queue)
539 struct tpacket_kbdq_core *pkc;
541 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
543 spin_lock_bh(&rb_queue->lock);
544 pkc->delete_blk_timer = 1;
545 spin_unlock_bh(&rb_queue->lock);
547 prb_del_retire_blk_timer(pkc);
550 static void prb_init_blk_timer(struct packet_sock *po,
551 struct tpacket_kbdq_core *pkc,
552 void (*func) (unsigned long))
554 init_timer(&pkc->retire_blk_timer);
555 pkc->retire_blk_timer.data = (long)po;
556 pkc->retire_blk_timer.function = func;
557 pkc->retire_blk_timer.expires = jiffies;
560 static void prb_setup_retire_blk_timer(struct packet_sock *po)
562 struct tpacket_kbdq_core *pkc;
564 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
565 prb_init_blk_timer(po, pkc, prb_retire_rx_blk_timer_expired);
568 static int prb_calc_retire_blk_tmo(struct packet_sock *po,
569 int blk_size_in_bytes)
571 struct net_device *dev;
572 unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
573 struct ethtool_link_ksettings ecmd;
577 dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
578 if (unlikely(!dev)) {
580 return DEFAULT_PRB_RETIRE_TOV;
582 err = __ethtool_get_link_ksettings(dev, &ecmd);
586 * If the link speed is this slow, there is no real
587 * need to worry about performance anyway.
589 if (ecmd.base.speed < SPEED_1000 ||
590 ecmd.base.speed == SPEED_UNKNOWN) {
591 return DEFAULT_PRB_RETIRE_TOV;
594 div = ecmd.base.speed / 1000;
597 return DEFAULT_PRB_RETIRE_TOV;
599 mbits = (blk_size_in_bytes * 8) / (1024 * 1024);
611 static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
612 union tpacket_req_u *req_u)
614 p1->feature_req_word = req_u->req3.tp_feature_req_word;
617 static void init_prb_bdqc(struct packet_sock *po,
618 struct packet_ring_buffer *rb,
620 union tpacket_req_u *req_u)
622 struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb);
623 struct tpacket_block_desc *pbd;
625 memset(p1, 0x0, sizeof(*p1));
627 p1->knxt_seq_num = 1;
629 pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
630 p1->pkblk_start = pg_vec[0].buffer;
631 p1->kblk_size = req_u->req3.tp_block_size;
632 p1->knum_blocks = req_u->req3.tp_block_nr;
633 p1->hdrlen = po->tp_hdrlen;
634 p1->version = po->tp_version;
635 p1->last_kactive_blk_num = 0;
636 po->stats.stats3.tp_freeze_q_cnt = 0;
637 if (req_u->req3.tp_retire_blk_tov)
638 p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
640 p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
641 req_u->req3.tp_block_size);
642 p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
643 p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
645 p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
646 prb_init_ft_ops(p1, req_u);
647 prb_setup_retire_blk_timer(po);
648 prb_open_block(p1, pbd);
651 /* Do NOT update the last_blk_num first.
652 * Assumes sk_buff_head lock is held.
654 static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
656 mod_timer(&pkc->retire_blk_timer,
657 jiffies + pkc->tov_in_jiffies);
658 pkc->last_kactive_blk_num = pkc->kactive_blk_num;
663 * 1) We refresh the timer only when we open a block.
664 * By doing this we don't waste cycles refreshing the timer
665 * on a packet-by-packet basis.
667 * With a 1MB block-size, on a 1Gbps line, it will take
668 * i) ~8 ms to fill a block + ii) memcpy etc.
669 * In this cut we are not accounting for the memcpy time.
671 * So, if the user sets the 'tmo' to 10ms then the timer
672 * will never fire while the block is still getting filled
673 * (which is what we want). However, the user could choose
674 * to close a block early and that's fine.
676 * But when the timer does fire, we check whether or not to refresh it.
677 * Since the tmo granularity is in msecs, it is not too expensive
678 * to refresh the timer, let's say every '8' msecs.
679 * Either the user can set the 'tmo' or we can derive it based on
680 * a) line-speed and b) block-size.
681 * prb_calc_retire_blk_tmo() calculates the tmo.
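 *
 * For context, a hedged userspace sketch of how that 'tmo' is supplied
 * when the TPACKET_V3 ring is created (values are illustrative only;
 * tp_retire_blk_tov == 0 asks the kernel to derive it from link speed
 * and block size):
 *
 *	struct tpacket_req3 req = {
 *		.tp_block_size	   = 1 << 20,		// 1 MiB blocks
 *		.tp_block_nr	   = 8,
 *		.tp_frame_size	   = 1 << 11,
 *		.tp_frame_nr	   = ((1 << 20) / (1 << 11)) * 8,
 *		.tp_retire_blk_tov = 10,		// in msecs, 0 = auto
 *	};
 *	int ver = TPACKET_V3;
 *	setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));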
684 static void prb_retire_rx_blk_timer_expired(unsigned long data)
686 struct packet_sock *po = (struct packet_sock *)data;
687 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
689 struct tpacket_block_desc *pbd;
691 spin_lock(&po->sk.sk_receive_queue.lock);
693 frozen = prb_queue_frozen(pkc);
694 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
696 if (unlikely(pkc->delete_blk_timer))
699 /* We only need to plug the race when the block is partially filled.
701 * lock(); increment BLOCK_NUM_PKTS; unlock()
702 * copy_bits() is in progress ...
703 * timer fires on other cpu:
704 * we can't retire the current block because copy_bits
708 if (BLOCK_NUM_PKTS(pbd)) {
709 while (atomic_read(&pkc->blk_fill_in_prog)) {
710 /* Waiting for skb_copy_bits to finish... */
715 if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
717 if (!BLOCK_NUM_PKTS(pbd)) {
718 /* An empty block. Just refresh the timer. */
721 prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
722 if (!prb_dispatch_next_block(pkc, po))
727 /* Case 1. Queue was frozen because user-space was
730 if (prb_curr_blk_in_use(pbd)) {
732 * Ok, user-space is still behind.
733 * So just refresh the timer.
737 /* Case 2. The queue was frozen, user-space caught up,
738 * now the link went idle and the timer fired.
739 * We don't have a block to close, so we open this
740 * block and restart the timer.
741 * Opening a block thaws the queue and restarts the timer;
742 * thawing/timer-refresh is a side effect.
744 prb_open_block(pkc, pbd);
751 _prb_refresh_rx_retire_blk_timer(pkc);
754 spin_unlock(&po->sk.sk_receive_queue.lock);
757 static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
758 struct tpacket_block_desc *pbd1, __u32 status)
760 /* Flush everything minus the block header */
762 #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
767 /* Skip the block header (we know the header WILL fit in 4K) */
770 end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
771 for (; start < end; start += PAGE_SIZE)
772 flush_dcache_page(pgv_to_page(start));
777 /* Now update the block status. */
779 BLOCK_STATUS(pbd1) = status;
781 /* Flush the block header */
783 #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
785 flush_dcache_page(pgv_to_page(start));
795 * 2) Increment active_blk_num
797 * Note:We DONT refresh the timer on purpose.
798 * Because almost always the next block will be opened.
800 static void prb_close_block(struct tpacket_kbdq_core *pkc1,
801 struct tpacket_block_desc *pbd1,
802 struct packet_sock *po, unsigned int stat)
804 __u32 status = TP_STATUS_USER | stat;
806 struct tpacket3_hdr *last_pkt;
807 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
808 struct sock *sk = &po->sk;
810 if (po->stats.stats3.tp_drops)
811 status |= TP_STATUS_LOSING;
813 last_pkt = (struct tpacket3_hdr *)pkc1->prev;
814 last_pkt->tp_next_offset = 0;
816 /* Get the ts of the last pkt */
817 if (BLOCK_NUM_PKTS(pbd1)) {
818 h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
819 h1->ts_last_pkt.ts_nsec = last_pkt->tp_nsec;
821 /* Ok, we tmo'd - so get the current time.
823 * It shouldn't really happen as we don't close empty
824 * blocks. See prb_retire_rx_blk_timer_expired().
828 h1->ts_last_pkt.ts_sec = ts.tv_sec;
829 h1->ts_last_pkt.ts_nsec = ts.tv_nsec;
834 /* Flush the block */
835 prb_flush_block(pkc1, pbd1, status);
837 sk->sk_data_ready(sk);
839 pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
842 static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
844 pkc->reset_pending_on_curr_blk = 0;
848 * Side effect of opening a block:
850 * 1) prb_queue is thawed.
851 * 2) retire_blk_timer is refreshed.
854 static void prb_open_block(struct tpacket_kbdq_core *pkc1,
855 struct tpacket_block_desc *pbd1)
858 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
862 /* We could have just memset this, but then we would lose the
863 * flexibility of making the priv area sticky
866 BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
867 BLOCK_NUM_PKTS(pbd1) = 0;
868 BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
872 h1->ts_first_pkt.ts_sec = ts.tv_sec;
873 h1->ts_first_pkt.ts_nsec = ts.tv_nsec;
875 pkc1->pkblk_start = (char *)pbd1;
876 pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
878 BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
879 BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;
881 pbd1->version = pkc1->version;
882 pkc1->prev = pkc1->nxt_offset;
883 pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;
885 prb_thaw_queue(pkc1);
886 _prb_refresh_rx_retire_blk_timer(pkc1);
892 * Queue freeze logic:
893 * 1) Assume tp_block_nr = 8 blocks.
894 * 2) At time 't0', user opens Rx ring.
895 * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
896 * 4) user-space is either sleeping or processing block '0'.
897 * 5) tpacket_rcv is currently filling block '7', since there is no space left,
898 * it will close block-7, loop around and try to fill block '0'.
900 * __packet_lookup_frame_in_block
901 * prb_retire_current_block()
902 * prb_dispatch_next_block()
903 * |->(BLOCK_STATUS == USER) evaluates to true
904 * 5.1) Since block-0 is currently in-use, we just freeze the queue.
905 * 6) Now there are two cases:
906 * 6.1) Link goes idle right after the queue is frozen.
907 * But remember, the last open_block() refreshed the timer.
908 * When this timer expires, it will refresh itself so that we can
909 * re-open block-0 in near future.
910 * 6.2) Link is busy and keeps on receiving packets. This is a simple
911 * case and __packet_lookup_frame_in_block will check if block-0
912 * is free and can now be re-used.
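 *
 * The user-space half of this protocol, as a rough hedged sketch
 * (mmap()ed ring assumed; 'pfd', 'walk_packets', 'blk_size' and
 * 'blk_nr' are placeholders, barriers and error handling omitted):
 *
 *	struct tpacket_block_desc *bd;
 *
 *	bd = (void *)(ring + blk_num * blk_size);
 *	while (!(bd->hdr.bh1.block_status & TP_STATUS_USER))
 *		poll(&pfd, 1, -1);		// wait for a closed block
 *	walk_packets(bd);			// bd->hdr.bh1.num_pkts entries
 *	bd->hdr.bh1.block_status = TP_STATUS_KERNEL;	// hand block back
 *	blk_num = (blk_num + 1) % blk_nr;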
914 static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
915 struct packet_sock *po)
917 pkc->reset_pending_on_curr_blk = 1;
918 po->stats.stats3.tp_freeze_q_cnt++;
921 #define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))
924 * If the next block is free then we will dispatch it
925 * and return a good offset.
926 * Else, we will freeze the queue.
927 * So, caller must check the return value.
929 static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
930 struct packet_sock *po)
932 struct tpacket_block_desc *pbd;
936 /* 1. Get current block num */
937 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
939 /* 2. If this block is currently in_use then freeze the queue */
940 if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
941 prb_freeze_queue(pkc, po);
947 * open this block and return the offset where the first packet
948 * needs to get stored.
950 prb_open_block(pkc, pbd);
951 return (void *)pkc->nxt_offset;
954 static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
955 struct packet_sock *po, unsigned int status)
957 struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
959 /* retire/close the current block */
960 if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
962 * Plug the case where copy_bits() is in progress on
963 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
964 * have space to copy the pkt in the current block and
965 * called prb_retire_current_block()
967 * We don't need to worry about the TMO case because
968 * the timer-handler already handled this case.
970 if (!(status & TP_STATUS_BLK_TMO)) {
971 while (atomic_read(&pkc->blk_fill_in_prog)) {
972 /* Waiting for skb_copy_bits to finish... */
976 prb_close_block(pkc, pbd, po, status);
981 static int prb_curr_blk_in_use(struct tpacket_block_desc *pbd)
983 return TP_STATUS_USER & BLOCK_STATUS(pbd);
986 static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
988 return pkc->reset_pending_on_curr_blk;
991 static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
992 __releases(&pkc->blk_fill_in_prog_lock)
994 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
995 atomic_dec(&pkc->blk_fill_in_prog);
998 static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
999 struct tpacket3_hdr *ppd)
1001 ppd->hv1.tp_rxhash = skb_get_hash(pkc->skb);
1004 static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
1005 struct tpacket3_hdr *ppd)
1007 ppd->hv1.tp_rxhash = 0;
1010 static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
1011 struct tpacket3_hdr *ppd)
1013 if (skb_vlan_tag_present(pkc->skb)) {
1014 ppd->hv1.tp_vlan_tci = skb_vlan_tag_get(pkc->skb);
1015 ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto);
1016 ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
1018 ppd->hv1.tp_vlan_tci = 0;
1019 ppd->hv1.tp_vlan_tpid = 0;
1020 ppd->tp_status = TP_STATUS_AVAILABLE;
1024 static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
1025 struct tpacket3_hdr *ppd)
1027 ppd->hv1.tp_padding = 0;
1028 prb_fill_vlan_info(pkc, ppd);
1030 if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
1031 prb_fill_rxhash(pkc, ppd);
1033 prb_clear_rxhash(pkc, ppd);
1036 static void prb_fill_curr_block(char *curr,
1037 struct tpacket_kbdq_core *pkc,
1038 struct tpacket_block_desc *pbd,
1040 __acquires(&pkc->blk_fill_in_prog_lock)
1042 struct tpacket3_hdr *ppd;
1044 ppd = (struct tpacket3_hdr *)curr;
1045 ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
1047 pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
1048 BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
1049 BLOCK_NUM_PKTS(pbd) += 1;
1050 atomic_inc(&pkc->blk_fill_in_prog);
1051 prb_run_all_ft_ops(pkc, ppd);
1054 /* Assumes caller has the sk->rx_queue.lock */
1055 static void *__packet_lookup_frame_in_block(struct packet_sock *po,
1056 struct sk_buff *skb,
1061 struct tpacket_kbdq_core *pkc;
1062 struct tpacket_block_desc *pbd;
1065 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
1066 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1068 /* Queue is frozen when user space is lagging behind */
1069 if (prb_queue_frozen(pkc)) {
1071 * Check if the last block, which caused the queue to freeze,
1072 * is still in_use by user-space.
1074 if (prb_curr_blk_in_use(pbd)) {
1075 /* Can't record this packet */
1079 * Ok, the block was released by user-space.
1080 * Now let's open that block.
1081 * opening a block also thaws the queue.
1082 * Thawing is a side effect.
1084 prb_open_block(pkc, pbd);
1089 curr = pkc->nxt_offset;
1091 end = (char *)pbd + pkc->kblk_size;
1093 /* first try the current block */
1094 if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
1095 prb_fill_curr_block(curr, pkc, pbd, len);
1096 return (void *)curr;
1099 /* Ok, close the current block */
1100 prb_retire_current_block(pkc, po, 0);
1102 /* Now, try to dispatch the next block */
1103 curr = (char *)prb_dispatch_next_block(pkc, po);
1105 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1106 prb_fill_curr_block(curr, pkc, pbd, len);
1107 return (void *)curr;
1111 * No free blocks are available. user_space hasn't caught up yet.
1112 * The queue was just frozen and now this packet will get dropped.
1117 static void *packet_current_rx_frame(struct packet_sock *po,
1118 struct sk_buff *skb,
1119 int status, unsigned int len)
1122 switch (po->tp_version) {
1125 curr = packet_lookup_frame(po, &po->rx_ring,
1126 po->rx_ring.head, status);
1129 return __packet_lookup_frame_in_block(po, skb, status, len);
1131 WARN(1, "TPACKET version not supported\n");
1137 static void *prb_lookup_block(struct packet_sock *po,
1138 struct packet_ring_buffer *rb,
1142 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
1143 struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);
1145 if (status != BLOCK_STATUS(pbd))
1150 static int prb_previous_blk_num(struct packet_ring_buffer *rb)
1153 if (rb->prb_bdqc.kactive_blk_num)
1154 prev = rb->prb_bdqc.kactive_blk_num-1;
1156 prev = rb->prb_bdqc.knum_blocks-1;
1160 /* Assumes caller has held the rx_queue.lock */
1161 static void *__prb_previous_block(struct packet_sock *po,
1162 struct packet_ring_buffer *rb,
1165 unsigned int previous = prb_previous_blk_num(rb);
1166 return prb_lookup_block(po, rb, previous, status);
1169 static void *packet_previous_rx_frame(struct packet_sock *po,
1170 struct packet_ring_buffer *rb,
1173 if (po->tp_version <= TPACKET_V2)
1174 return packet_previous_frame(po, rb, status);
1176 return __prb_previous_block(po, rb, status);
1179 static void packet_increment_rx_head(struct packet_sock *po,
1180 struct packet_ring_buffer *rb)
1182 switch (po->tp_version) {
1185 return packet_increment_head(rb);
1188 WARN(1, "TPACKET version not supported.\n");
1194 static void *packet_previous_frame(struct packet_sock *po,
1195 struct packet_ring_buffer *rb,
1198 unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
1199 return packet_lookup_frame(po, rb, previous, status);
1202 static void packet_increment_head(struct packet_ring_buffer *buff)
1204 buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
1207 static void packet_inc_pending(struct packet_ring_buffer *rb)
1209 this_cpu_inc(*rb->pending_refcnt);
1212 static void packet_dec_pending(struct packet_ring_buffer *rb)
1214 this_cpu_dec(*rb->pending_refcnt);
1217 static unsigned int packet_read_pending(const struct packet_ring_buffer *rb)
1219 unsigned int refcnt = 0;
1222 /* We don't use pending refcount in rx_ring. */
1223 if (rb->pending_refcnt == NULL)
1226 for_each_possible_cpu(cpu)
1227 refcnt += *per_cpu_ptr(rb->pending_refcnt, cpu);
1232 static int packet_alloc_pending(struct packet_sock *po)
1234 po->rx_ring.pending_refcnt = NULL;
1236 po->tx_ring.pending_refcnt = alloc_percpu(unsigned int);
1237 if (unlikely(po->tx_ring.pending_refcnt == NULL))
1243 static void packet_free_pending(struct packet_sock *po)
1245 free_percpu(po->tx_ring.pending_refcnt);
1248 #define ROOM_POW_OFF 2
1249 #define ROOM_NONE 0x0
1250 #define ROOM_LOW 0x1
1251 #define ROOM_NORMAL 0x2
1253 static bool __tpacket_has_room(struct packet_sock *po, int pow_off)
1257 len = po->rx_ring.frame_max + 1;
1258 idx = po->rx_ring.head;
1260 idx += len >> pow_off;
1263 return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
1266 static bool __tpacket_v3_has_room(struct packet_sock *po, int pow_off)
1270 len = po->rx_ring.prb_bdqc.knum_blocks;
1271 idx = po->rx_ring.prb_bdqc.kactive_blk_num;
1273 idx += len >> pow_off;
1276 return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
1279 static int __packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
1281 struct sock *sk = &po->sk;
1282 int ret = ROOM_NONE;
1284 if (po->prot_hook.func != tpacket_rcv) {
1285 int avail = sk->sk_rcvbuf - atomic_read(&sk->sk_rmem_alloc)
1286 - (skb ? skb->truesize : 0);
1287 if (avail > (sk->sk_rcvbuf >> ROOM_POW_OFF))
1295 if (po->tp_version == TPACKET_V3) {
1296 if (__tpacket_v3_has_room(po, ROOM_POW_OFF))
1298 else if (__tpacket_v3_has_room(po, 0))
1301 if (__tpacket_has_room(po, ROOM_POW_OFF))
1303 else if (__tpacket_has_room(po, 0))
1310 static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
1315 spin_lock_bh(&po->sk.sk_receive_queue.lock);
1316 ret = __packet_rcv_has_room(po, skb);
1317 has_room = ret == ROOM_NORMAL;
1318 if (po->pressure == has_room)
1319 po->pressure = !has_room;
1320 spin_unlock_bh(&po->sk.sk_receive_queue.lock);
1325 static void packet_sock_destruct(struct sock *sk)
1327 skb_queue_purge(&sk->sk_error_queue);
1329 WARN_ON(atomic_read(&sk->sk_rmem_alloc));
1330 WARN_ON(refcount_read(&sk->sk_wmem_alloc));
1332 if (!sock_flag(sk, SOCK_DEAD)) {
1333 pr_err("Attempt to release alive packet socket: %p\n", sk);
1337 sk_refcnt_debug_dec(sk);
1340 static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb)
1342 u32 *history = po->rollover->history;
1346 rxhash = skb_get_hash(skb);
1347 for (i = 0; i < ROLLOVER_HLEN; i++)
1348 if (READ_ONCE(history[i]) == rxhash)
1351 victim = prandom_u32() % ROLLOVER_HLEN;
1353 /* Avoid dirtying the cache line if possible */
1354 if (READ_ONCE(history[victim]) != rxhash)
1355 WRITE_ONCE(history[victim], rxhash);
1357 return count > (ROLLOVER_HLEN >> 1);
1360 static unsigned int fanout_demux_hash(struct packet_fanout *f,
1361 struct sk_buff *skb,
1364 return reciprocal_scale(__skb_get_hash_symmetric(skb), num);
1367 static unsigned int fanout_demux_lb(struct packet_fanout *f,
1368 struct sk_buff *skb,
1371 unsigned int val = atomic_inc_return(&f->rr_cur);
1376 static unsigned int fanout_demux_cpu(struct packet_fanout *f,
1377 struct sk_buff *skb,
1380 return smp_processor_id() % num;
1383 static unsigned int fanout_demux_rnd(struct packet_fanout *f,
1384 struct sk_buff *skb,
1387 return prandom_u32_max(num);
1390 static unsigned int fanout_demux_rollover(struct packet_fanout *f,
1391 struct sk_buff *skb,
1392 unsigned int idx, bool try_self,
1395 struct packet_sock *po, *po_next, *po_skip = NULL;
1396 unsigned int i, j, room = ROOM_NONE;
1398 po = pkt_sk(f->arr[idx]);
1401 room = packet_rcv_has_room(po, skb);
1402 if (room == ROOM_NORMAL ||
1403 (room == ROOM_LOW && !fanout_flow_is_huge(po, skb)))
1408 i = j = min_t(int, po->rollover->sock, num - 1);
1410 po_next = pkt_sk(f->arr[i]);
1411 if (po_next != po_skip && !po_next->pressure &&
1412 packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) {
1414 po->rollover->sock = i;
1415 atomic_long_inc(&po->rollover->num);
1416 if (room == ROOM_LOW)
1417 atomic_long_inc(&po->rollover->num_huge);
1425 atomic_long_inc(&po->rollover->num_failed);
1429 static unsigned int fanout_demux_qm(struct packet_fanout *f,
1430 struct sk_buff *skb,
1433 return skb_get_queue_mapping(skb) % num;
1436 static unsigned int fanout_demux_bpf(struct packet_fanout *f,
1437 struct sk_buff *skb,
1440 struct bpf_prog *prog;
1441 unsigned int ret = 0;
1444 prog = rcu_dereference(f->bpf_prog);
1446 ret = bpf_prog_run_clear_cb(prog, skb) % num;
1452 static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
1454 return f->flags & (flag >> 8);
1457 static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
1458 struct packet_type *pt, struct net_device *orig_dev)
1460 struct packet_fanout *f = pt->af_packet_priv;
1461 unsigned int num = READ_ONCE(f->num_members);
1462 struct net *net = read_pnet(&f->net);
1463 struct packet_sock *po;
1466 if (!net_eq(dev_net(dev), net) || !num) {
1471 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) {
1472 skb = ip_check_defrag(net, skb, IP_DEFRAG_AF_PACKET);
1477 case PACKET_FANOUT_HASH:
1479 idx = fanout_demux_hash(f, skb, num);
1481 case PACKET_FANOUT_LB:
1482 idx = fanout_demux_lb(f, skb, num);
1484 case PACKET_FANOUT_CPU:
1485 idx = fanout_demux_cpu(f, skb, num);
1487 case PACKET_FANOUT_RND:
1488 idx = fanout_demux_rnd(f, skb, num);
1490 case PACKET_FANOUT_QM:
1491 idx = fanout_demux_qm(f, skb, num);
1493 case PACKET_FANOUT_ROLLOVER:
1494 idx = fanout_demux_rollover(f, skb, 0, false, num);
1496 case PACKET_FANOUT_CBPF:
1497 case PACKET_FANOUT_EBPF:
1498 idx = fanout_demux_bpf(f, skb, num);
1502 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER))
1503 idx = fanout_demux_rollover(f, skb, idx, true, num);
1505 po = pkt_sk(f->arr[idx]);
1506 return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
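/* Userspace joins a fanout group with PACKET_FANOUT; a minimal hedged
 * sketch (sockets in the same namespace that use the same group id and
 * bind to the same protocol/device share one group):
 *
 *	unsigned int group_id = 42;
 *	int arg = group_id | (PACKET_FANOUT_LB << 16);
 *	setsockopt(fd, SOL_PACKET, PACKET_FANOUT, &arg, sizeof(arg));
 */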
1509 DEFINE_MUTEX(fanout_mutex);
1510 EXPORT_SYMBOL_GPL(fanout_mutex);
1511 static LIST_HEAD(fanout_list);
1512 static u16 fanout_next_id;
1514 static void __fanout_link(struct sock *sk, struct packet_sock *po)
1516 struct packet_fanout *f = po->fanout;
1518 spin_lock(&f->lock);
1519 f->arr[f->num_members] = sk;
1522 if (f->num_members == 1)
1523 dev_add_pack(&f->prot_hook);
1524 spin_unlock(&f->lock);
1527 static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
1529 struct packet_fanout *f = po->fanout;
1532 spin_lock(&f->lock);
1533 for (i = 0; i < f->num_members; i++) {
1534 if (f->arr[i] == sk)
1537 BUG_ON(i >= f->num_members);
1538 f->arr[i] = f->arr[f->num_members - 1];
1540 if (f->num_members == 0)
1541 __dev_remove_pack(&f->prot_hook);
1542 spin_unlock(&f->lock);
1545 static bool match_fanout_group(struct packet_type *ptype, struct sock *sk)
1547 if (sk->sk_family != PF_PACKET)
1550 return ptype->af_packet_priv == pkt_sk(sk)->fanout;
1553 static void fanout_init_data(struct packet_fanout *f)
1556 case PACKET_FANOUT_LB:
1557 atomic_set(&f->rr_cur, 0);
1559 case PACKET_FANOUT_CBPF:
1560 case PACKET_FANOUT_EBPF:
1561 RCU_INIT_POINTER(f->bpf_prog, NULL);
1566 static void __fanout_set_data_bpf(struct packet_fanout *f, struct bpf_prog *new)
1568 struct bpf_prog *old;
1570 spin_lock(&f->lock);
1571 old = rcu_dereference_protected(f->bpf_prog, lockdep_is_held(&f->lock));
1572 rcu_assign_pointer(f->bpf_prog, new);
1573 spin_unlock(&f->lock);
1577 bpf_prog_destroy(old);
1581 static int fanout_set_data_cbpf(struct packet_sock *po, char __user *data,
1584 struct bpf_prog *new;
1585 struct sock_fprog fprog;
1588 if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
1590 if (len != sizeof(fprog))
1592 if (copy_from_user(&fprog, data, len))
1595 ret = bpf_prog_create_from_user(&new, &fprog, NULL, false);
1599 __fanout_set_data_bpf(po->fanout, new);
1603 static int fanout_set_data_ebpf(struct packet_sock *po, char __user *data,
1606 struct bpf_prog *new;
1609 if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
1611 if (len != sizeof(fd))
1613 if (copy_from_user(&fd, data, len))
1616 new = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
1618 return PTR_ERR(new);
1620 __fanout_set_data_bpf(po->fanout, new);
1624 static int fanout_set_data(struct packet_sock *po, char __user *data,
1627 switch (po->fanout->type) {
1628 case PACKET_FANOUT_CBPF:
1629 return fanout_set_data_cbpf(po, data, len);
1630 case PACKET_FANOUT_EBPF:
1631 return fanout_set_data_ebpf(po, data, len);
1637 static void fanout_release_data(struct packet_fanout *f)
1640 case PACKET_FANOUT_CBPF:
1641 case PACKET_FANOUT_EBPF:
1642 __fanout_set_data_bpf(f, NULL);
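/* A hedged sketch of loading a classic BPF demux program from userspace
 * once the socket is in a PACKET_FANOUT_CBPF group (this trivial program
 * steers every packet to group member 0; purely illustrative):
 *
 *	struct sock_filter code[] = {
 *		BPF_STMT(BPF_RET | BPF_K, 0),
 *	};
 *	struct sock_fprog prog = { .len = 1, .filter = code };
 *	setsockopt(fd, SOL_PACKET, PACKET_FANOUT_DATA, &prog, sizeof(prog));
 */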
1646 static bool __fanout_id_is_free(struct sock *sk, u16 candidate_id)
1648 struct packet_fanout *f;
1650 list_for_each_entry(f, &fanout_list, list) {
1651 if (f->id == candidate_id &&
1652 read_pnet(&f->net) == sock_net(sk)) {
1659 static bool fanout_find_new_id(struct sock *sk, u16 *new_id)
1661 u16 id = fanout_next_id;
1664 if (__fanout_id_is_free(sk, id)) {
1666 fanout_next_id = id + 1;
1671 } while (id != fanout_next_id);
1676 static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
1678 struct packet_rollover *rollover = NULL;
1679 struct packet_sock *po = pkt_sk(sk);
1680 struct packet_fanout *f, *match;
1681 u8 type = type_flags & 0xff;
1682 u8 flags = type_flags >> 8;
1686 case PACKET_FANOUT_ROLLOVER:
1687 if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)
1689 case PACKET_FANOUT_HASH:
1690 case PACKET_FANOUT_LB:
1691 case PACKET_FANOUT_CPU:
1692 case PACKET_FANOUT_RND:
1693 case PACKET_FANOUT_QM:
1694 case PACKET_FANOUT_CBPF:
1695 case PACKET_FANOUT_EBPF:
1701 mutex_lock(&fanout_mutex);
1707 if (type == PACKET_FANOUT_ROLLOVER ||
1708 (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)) {
1710 rollover = kzalloc(sizeof(*rollover), GFP_KERNEL);
1713 atomic_long_set(&rollover->num, 0);
1714 atomic_long_set(&rollover->num_huge, 0);
1715 atomic_long_set(&rollover->num_failed, 0);
1718 if (type_flags & PACKET_FANOUT_FLAG_UNIQUEID) {
1723 if (!fanout_find_new_id(sk, &id)) {
1727 /* ephemeral flag for the first socket in the group: drop it */
1728 flags &= ~(PACKET_FANOUT_FLAG_UNIQUEID >> 8);
1732 list_for_each_entry(f, &fanout_list, list) {
1734 read_pnet(&f->net) == sock_net(sk)) {
1740 if (match && match->flags != flags)
1744 match = kzalloc(sizeof(*match), GFP_KERNEL);
1747 write_pnet(&match->net, sock_net(sk));
1750 match->flags = flags;
1751 INIT_LIST_HEAD(&match->list);
1752 spin_lock_init(&match->lock);
1753 refcount_set(&match->sk_ref, 0);
1754 fanout_init_data(match);
1755 match->prot_hook.type = po->prot_hook.type;
1756 match->prot_hook.dev = po->prot_hook.dev;
1757 match->prot_hook.func = packet_rcv_fanout;
1758 match->prot_hook.af_packet_priv = match;
1759 match->prot_hook.af_packet_net = read_pnet(&match->net);
1760 match->prot_hook.id_match = match_fanout_group;
1761 list_add(&match->list, &fanout_list);
1765 spin_lock(&po->bind_lock);
1767 match->type == type &&
1768 match->prot_hook.type == po->prot_hook.type &&
1769 match->prot_hook.dev == po->prot_hook.dev) {
1771 if (refcount_read(&match->sk_ref) < PACKET_FANOUT_MAX) {
1772 __dev_remove_pack(&po->prot_hook);
1774 /* Paired with packet_setsockopt(PACKET_FANOUT_DATA) */
1775 WRITE_ONCE(po->fanout, match);
1777 po->rollover = rollover;
1779 refcount_set(&match->sk_ref, refcount_read(&match->sk_ref) + 1);
1780 __fanout_link(sk, po);
1784 spin_unlock(&po->bind_lock);
1786 if (err && !refcount_read(&match->sk_ref)) {
1787 list_del(&match->list);
1793 mutex_unlock(&fanout_mutex);
1797 /* If pkt_sk(sk)->fanout->sk_ref is zero, this function removes
1798 * pkt_sk(sk)->fanout from fanout_list and returns pkt_sk(sk)->fanout.
1799 * It is the responsibility of the caller to call fanout_release_data() and
1800 * free the returned packet_fanout (after synchronize_net())
1802 static struct packet_fanout *fanout_release(struct sock *sk)
1804 struct packet_sock *po = pkt_sk(sk);
1805 struct packet_fanout *f;
1807 mutex_lock(&fanout_mutex);
1812 if (refcount_dec_and_test(&f->sk_ref))
1817 mutex_unlock(&fanout_mutex);
1822 static bool packet_extra_vlan_len_allowed(const struct net_device *dev,
1823 struct sk_buff *skb)
1825 /* Earlier code assumed this would be a VLAN pkt, double-check
1826 * this now that we have the actual packet in hand. We can only
1827 * do this check on Ethernet devices.
1829 if (unlikely(dev->type != ARPHRD_ETHER))
1832 skb_reset_mac_header(skb);
1833 return likely(eth_hdr(skb)->h_proto == htons(ETH_P_8021Q));
1836 static const struct proto_ops packet_ops;
1838 static const struct proto_ops packet_ops_spkt;
1840 static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
1841 struct packet_type *pt, struct net_device *orig_dev)
1844 struct sockaddr_pkt *spkt;
1847 * When we registered the protocol we saved the socket in the data
1848 * field for just this event.
1851 sk = pt->af_packet_priv;
1854 * Yank back the headers [hope the device set this
1855 * right or kerboom...]
1857 * Incoming packets have ll header pulled,
1860 * For outgoing ones skb->data == skb_mac_header(skb),
1861 * so this procedure is a no-op.
1864 if (skb->pkt_type == PACKET_LOOPBACK)
1867 if (!net_eq(dev_net(dev), sock_net(sk)))
1870 skb = skb_share_check(skb, GFP_ATOMIC);
1874 /* drop any routing info */
1877 /* drop conntrack reference */
1880 spkt = &PACKET_SKB_CB(skb)->sa.pkt;
1882 skb_push(skb, skb->data - skb_mac_header(skb));
1885 * The SOCK_PACKET socket receives _all_ frames.
1888 spkt->spkt_family = dev->type;
1889 strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
1890 spkt->spkt_protocol = skb->protocol;
1893 * Charge the memory to the socket. This is done specifically
1894 * to prevent sockets from using up all the memory.
1897 if (sock_queue_rcv_skb(sk, skb) == 0)
1908 * Output a raw packet to a device layer. This bypasses all the other
1909 * protocol layers and you must therefore supply it with a complete frame
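 *
 * A hedged sketch of the (legacy) userspace usage this serves, where
 * 'frame'/'frame_len' stand for a fully built Ethernet frame:
 *
 *	int fd = socket(AF_PACKET, SOCK_PACKET, htons(ETH_P_ALL));
 *	struct sockaddr_pkt sp = { .spkt_family = AF_PACKET };
 *	strncpy((char *)sp.spkt_device, "eth0", sizeof(sp.spkt_device));
 *	sp.spkt_protocol = htons(ETH_P_IP);
 *	sendto(fd, frame, frame_len, 0, (struct sockaddr *)&sp, sizeof(sp));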
1912 static int packet_sendmsg_spkt(struct socket *sock, struct msghdr *msg,
1915 struct sock *sk = sock->sk;
1916 DECLARE_SOCKADDR(struct sockaddr_pkt *, saddr, msg->msg_name);
1917 struct sk_buff *skb = NULL;
1918 struct net_device *dev;
1919 struct sockcm_cookie sockc;
1925 * Get and verify the address.
1929 if (msg->msg_namelen < sizeof(struct sockaddr))
1931 if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
1932 proto = saddr->spkt_protocol;
1934 return -ENOTCONN; /* SOCK_PACKET must be sent giving an address */
1937 * Find the device first to size check it
1940 saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;
1943 dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
1949 if (!(dev->flags & IFF_UP))
1953 * You may not queue a frame bigger than the mtu. This is the lowest level
1954 * raw protocol and you must do your own fragmentation at this level.
1957 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
1958 if (!netif_supports_nofcs(dev)) {
1959 err = -EPROTONOSUPPORT;
1962 extra_len = 4; /* We're doing our own CRC */
1966 if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len)
1970 size_t reserved = LL_RESERVED_SPACE(dev);
1971 int tlen = dev->needed_tailroom;
1972 unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;
1975 skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL);
1978 /* FIXME: Save some space for broken drivers that write a hard
1979 * header at transmission time by themselves. PPP is the notable
1980 * one here. This should really be fixed at the driver level.
1982 skb_reserve(skb, reserved);
1983 skb_reset_network_header(skb);
1985 /* Try to align data part correctly */
1990 skb_reset_network_header(skb);
1992 err = memcpy_from_msg(skb_put(skb, len), msg, len);
1998 if (!dev_validate_header(dev, skb->data, len)) {
2002 if (len > (dev->mtu + dev->hard_header_len + extra_len) &&
2003 !packet_extra_vlan_len_allowed(dev, skb)) {
2008 sockc.tsflags = sk->sk_tsflags;
2009 if (msg->msg_controllen) {
2010 err = sock_cmsg_send(sk, msg, &sockc);
2015 skb->protocol = proto;
2017 skb->priority = sk->sk_priority;
2018 skb->mark = sk->sk_mark;
2020 sock_tx_timestamp(sk, sockc.tsflags, &skb_shinfo(skb)->tx_flags);
2022 if (unlikely(extra_len == 4))
2025 skb_probe_transport_header(skb, 0);
2027 dev_queue_xmit(skb);
2038 static unsigned int run_filter(struct sk_buff *skb,
2039 const struct sock *sk,
2042 struct sk_filter *filter;
2045 filter = rcu_dereference(sk->sk_filter);
2047 res = bpf_prog_run_clear_cb(filter->prog, skb);
2053 static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb,
2056 struct virtio_net_hdr vnet_hdr;
2058 if (*len < sizeof(vnet_hdr))
2060 *len -= sizeof(vnet_hdr);
2062 if (virtio_net_hdr_from_skb(skb, &vnet_hdr, vio_le(), true, 0))
2065 return memcpy_to_msg(msg, (void *)&vnet_hdr, sizeof(vnet_hdr));
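/* The virtio_net_hdr copied out above is only present when userspace has
 * opted in on a SOCK_RAW packet socket; hedged sketch:
 *
 *	int on = 1;
 *	setsockopt(fd, SOL_PACKET, PACKET_VNET_HDR, &on, sizeof(on));
 *	// after this, every recvmsg() payload is prefixed with a
 *	// struct virtio_net_hdr and every sendmsg() must supply one.
 */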
2069 * This function performs lazy skb cloning in the hope that most packets
2070 * are discarded by BPF.
2072 * Note the tricky part: we DO mangle the shared skb! skb->data, skb->len
2073 * and skb->cb are mangled. It works because (and until) packets
2074 * falling here are owned by the current CPU. Output packets are cloned
2075 * by dev_queue_xmit_nit(), input packets are processed by net_bh
2076 * sequentially, so if we return the skb to its original state on exit,
2077 * we will not harm anyone.
2080 static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
2081 struct packet_type *pt, struct net_device *orig_dev)
2084 struct sockaddr_ll *sll;
2085 struct packet_sock *po;
2086 u8 *skb_head = skb->data;
2087 int skb_len = skb->len;
2088 unsigned int snaplen, res;
2089 bool is_drop_n_account = false;
2091 if (skb->pkt_type == PACKET_LOOPBACK)
2094 sk = pt->af_packet_priv;
2097 if (!net_eq(dev_net(dev), sock_net(sk)))
2102 if (dev->header_ops) {
2103 /* The device has an explicit notion of ll header,
2104 * exported to higher levels.
2106 * Otherwise, the device hides details of its frame
2107 * structure, so that the corresponding packet header is
2108 * never delivered to the user.
2110 if (sk->sk_type != SOCK_DGRAM)
2111 skb_push(skb, skb->data - skb_mac_header(skb));
2112 else if (skb->pkt_type == PACKET_OUTGOING) {
2113 /* Special case: outgoing packets have ll header at head */
2114 skb_pull(skb, skb_network_offset(skb));
2120 res = run_filter(skb, sk, snaplen);
2122 goto drop_n_restore;
2126 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
2129 if (skb_shared(skb)) {
2130 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
2134 if (skb_head != skb->data) {
2135 skb->data = skb_head;
2142 sock_skb_cb_check_size(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8);
2144 sll = &PACKET_SKB_CB(skb)->sa.ll;
2145 sll->sll_hatype = dev->type;
2146 sll->sll_pkttype = skb->pkt_type;
2147 if (unlikely(po->origdev))
2148 sll->sll_ifindex = orig_dev->ifindex;
2150 sll->sll_ifindex = dev->ifindex;
2152 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
2154 /* sll->sll_family and sll->sll_protocol are set in packet_recvmsg().
2155 * Use their space for storing the original skb length.
2157 PACKET_SKB_CB(skb)->sa.origlen = skb->len;
2159 if (pskb_trim(skb, snaplen))
2162 skb_set_owner_r(skb, sk);
2166 /* drop conntrack reference */
2169 spin_lock(&sk->sk_receive_queue.lock);
2170 po->stats.stats1.tp_packets++;
2171 sock_skb_set_dropcount(sk, skb);
2172 __skb_queue_tail(&sk->sk_receive_queue, skb);
2173 spin_unlock(&sk->sk_receive_queue.lock);
2174 sk->sk_data_ready(sk);
2178 is_drop_n_account = true;
2179 spin_lock(&sk->sk_receive_queue.lock);
2180 po->stats.stats1.tp_drops++;
2181 atomic_inc(&sk->sk_drops);
2182 spin_unlock(&sk->sk_receive_queue.lock);
2185 if (skb_head != skb->data && skb_shared(skb)) {
2186 skb->data = skb_head;
2190 if (!is_drop_n_account)
2197 static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
2198 struct packet_type *pt, struct net_device *orig_dev)
2201 struct packet_sock *po;
2202 struct sockaddr_ll *sll;
2203 union tpacket_uhdr h;
2204 u8 *skb_head = skb->data;
2205 int skb_len = skb->len;
2206 unsigned int snaplen, res;
2207 unsigned long status = TP_STATUS_USER;
2208 unsigned short macoff, hdrlen;
2209 unsigned int netoff;
2210 struct sk_buff *copy_skb = NULL;
2213 bool is_drop_n_account = false;
2214 unsigned int slot_id = 0;
2215 bool do_vnet = false;
2217 /* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
2218 * We may add members to them until current aligned size without forcing
2219 * userspace to call getsockopt(..., PACKET_HDRLEN, ...).
2221 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h2)) != 32);
2222 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h3)) != 48);
2224 if (skb->pkt_type == PACKET_LOOPBACK)
2227 sk = pt->af_packet_priv;
2230 if (!net_eq(dev_net(dev), sock_net(sk)))
2233 if (dev->header_ops) {
2234 if (sk->sk_type != SOCK_DGRAM)
2235 skb_push(skb, skb->data - skb_mac_header(skb));
2236 else if (skb->pkt_type == PACKET_OUTGOING) {
2237 /* Special case: outgoing packets have ll header at head */
2238 skb_pull(skb, skb_network_offset(skb));
2244 res = run_filter(skb, sk, snaplen);
2246 goto drop_n_restore;
2248 if (skb->ip_summed == CHECKSUM_PARTIAL)
2249 status |= TP_STATUS_CSUMNOTREADY;
2250 else if (skb->pkt_type != PACKET_OUTGOING &&
2251 (skb->ip_summed == CHECKSUM_COMPLETE ||
2252 skb_csum_unnecessary(skb)))
2253 status |= TP_STATUS_CSUM_VALID;
2258 if (sk->sk_type == SOCK_DGRAM) {
2259 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
2262 unsigned int maclen = skb_network_offset(skb);
2263 netoff = TPACKET_ALIGN(po->tp_hdrlen +
2264 (maclen < 16 ? 16 : maclen)) +
2266 if (po->has_vnet_hdr) {
2267 netoff += sizeof(struct virtio_net_hdr);
2270 macoff = netoff - maclen;
2272 if (netoff > USHRT_MAX) {
2273 spin_lock(&sk->sk_receive_queue.lock);
2274 po->stats.stats1.tp_drops++;
2275 spin_unlock(&sk->sk_receive_queue.lock);
2276 goto drop_n_restore;
2278 if (po->tp_version <= TPACKET_V2) {
2279 if (macoff + snaplen > po->rx_ring.frame_size) {
2280 if (po->copy_thresh &&
2281 atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
2282 if (skb_shared(skb)) {
2283 copy_skb = skb_clone(skb, GFP_ATOMIC);
2285 copy_skb = skb_get(skb);
2286 skb_head = skb->data;
2289 skb_set_owner_r(copy_skb, sk);
2291 snaplen = po->rx_ring.frame_size - macoff;
2292 if ((int)snaplen < 0) {
2297 } else if (unlikely(macoff + snaplen >
2298 GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) {
2301 nval = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len - macoff;
2302 pr_err_once("tpacket_rcv: packet too big, clamped from %u to %u. macoff=%u\n",
2303 snaplen, nval, macoff);
2305 if (unlikely((int)snaplen < 0)) {
2307 macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len;
2311 spin_lock(&sk->sk_receive_queue.lock);
2312 h.raw = packet_current_rx_frame(po, skb,
2313 TP_STATUS_KERNEL, (macoff+snaplen));
2315 goto drop_n_account;
2317 if (po->tp_version <= TPACKET_V2) {
2318 slot_id = po->rx_ring.head;
2319 if (test_bit(slot_id, po->rx_ring.rx_owner_map))
2320 goto drop_n_account;
2321 __set_bit(slot_id, po->rx_ring.rx_owner_map);
2325 virtio_net_hdr_from_skb(skb, h.raw + macoff -
2326 sizeof(struct virtio_net_hdr),
2327 vio_le(), true, 0)) {
2328 if (po->tp_version == TPACKET_V3)
2329 prb_clear_blk_fill_status(&po->rx_ring);
2330 goto drop_n_account;
2333 if (po->tp_version <= TPACKET_V2) {
2334 packet_increment_rx_head(po, &po->rx_ring);
2336 * LOSING will be reported until you read the stats,
2337 * because it's COR - Clear On Read.
2338 * Anyway, this is done for V1/V2 only as V3 doesn't need it.
2341 if (po->stats.stats1.tp_drops)
2342 status |= TP_STATUS_LOSING;
2345 po->stats.stats1.tp_packets++;
2347 status |= TP_STATUS_COPY;
2348 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
2350 spin_unlock(&sk->sk_receive_queue.lock);
2352 skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
2354 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
2355 getnstimeofday(&ts);
2357 status |= ts_status;
2359 switch (po->tp_version) {
2361 h.h1->tp_len = skb->len;
2362 h.h1->tp_snaplen = snaplen;
2363 h.h1->tp_mac = macoff;
2364 h.h1->tp_net = netoff;
2365 h.h1->tp_sec = ts.tv_sec;
2366 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
2367 hdrlen = sizeof(*h.h1);
2370 h.h2->tp_len = skb->len;
2371 h.h2->tp_snaplen = snaplen;
2372 h.h2->tp_mac = macoff;
2373 h.h2->tp_net = netoff;
2374 h.h2->tp_sec = ts.tv_sec;
2375 h.h2->tp_nsec = ts.tv_nsec;
2376 if (skb_vlan_tag_present(skb)) {
2377 h.h2->tp_vlan_tci = skb_vlan_tag_get(skb);
2378 h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto);
2379 status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
2381 h.h2->tp_vlan_tci = 0;
2382 h.h2->tp_vlan_tpid = 0;
2384 memset(h.h2->tp_padding, 0, sizeof(h.h2->tp_padding));
2385 hdrlen = sizeof(*h.h2);
2388 /* tp_nxt_offset and vlan are already populated above,
2389 * so DON'T clear those fields here.
2391 h.h3->tp_status |= status;
2392 h.h3->tp_len = skb->len;
2393 h.h3->tp_snaplen = snaplen;
2394 h.h3->tp_mac = macoff;
2395 h.h3->tp_net = netoff;
2396 h.h3->tp_sec = ts.tv_sec;
2397 h.h3->tp_nsec = ts.tv_nsec;
2398 memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding));
2399 hdrlen = sizeof(*h.h3);
2405 sll = h.raw + TPACKET_ALIGN(hdrlen);
2406 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
2407 sll->sll_family = AF_PACKET;
2408 sll->sll_hatype = dev->type;
2409 sll->sll_protocol = skb->protocol;
2410 sll->sll_pkttype = skb->pkt_type;
2411 if (unlikely(po->origdev))
2412 sll->sll_ifindex = orig_dev->ifindex;
2414 sll->sll_ifindex = dev->ifindex;
2418 #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
2419 if (po->tp_version <= TPACKET_V2) {
2422 end = (u8 *) PAGE_ALIGN((unsigned long) h.raw +
2425 for (start = h.raw; start < end; start += PAGE_SIZE)
2426 flush_dcache_page(pgv_to_page(start));
2431 if (po->tp_version <= TPACKET_V2) {
2432 spin_lock(&sk->sk_receive_queue.lock);
2433 __packet_set_status(po, h.raw, status);
2434 __clear_bit(slot_id, po->rx_ring.rx_owner_map);
2435 spin_unlock(&sk->sk_receive_queue.lock);
2436 sk->sk_data_ready(sk);
2437 } else if (po->tp_version == TPACKET_V3) {
2438 prb_clear_blk_fill_status(&po->rx_ring);
2442 if (skb_head != skb->data && skb_shared(skb)) {
2443 skb->data = skb_head;
2447 if (!is_drop_n_account)
2454 is_drop_n_account = true;
2455 po->stats.stats1.tp_drops++;
2456 spin_unlock(&sk->sk_receive_queue.lock);
2458 sk->sk_data_ready(sk);
2459 kfree_skb(copy_skb);
2460 goto drop_n_restore;
2463 static void tpacket_destruct_skb(struct sk_buff *skb)
2465 struct packet_sock *po = pkt_sk(skb->sk);
2467 if (likely(po->tx_ring.pg_vec)) {
2471 ph = skb_zcopy_get_nouarg(skb);
2472 packet_dec_pending(&po->tx_ring);
2474 ts = __packet_set_timestamp(po, ph, skb);
2475 __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
2477 if (!packet_read_pending(&po->tx_ring))
2478 complete(&po->skb_completion);
2484 static void tpacket_set_protocol(const struct net_device *dev,
2485 struct sk_buff *skb)
2487 if (dev->type == ARPHRD_ETHER) {
2488 skb_reset_mac_header(skb);
2489 skb->protocol = eth_hdr(skb)->h_proto;
2493 static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len)
2495 if ((vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
2496 (__virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
2497 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2 >
2498 __virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len)))
2499 vnet_hdr->hdr_len = __cpu_to_virtio16(vio_le(),
2500 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
2501 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2);
2503 if (__virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len) > len)
2509 static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len,
2510 struct virtio_net_hdr *vnet_hdr)
2512 if (*len < sizeof(*vnet_hdr))
2514 *len -= sizeof(*vnet_hdr);
2516 if (!copy_from_iter_full(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter))
2519 return __packet_snd_vnet_parse(vnet_hdr, *len);
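/* Illustrative userspace sketch (not part of this file): with PACKET_VNET_HDR
 * enabled on a SOCK_RAW packet socket, packet_snd_vnet_parse() above expects
 * every sendmsg() payload to start with a struct virtio_net_hdr followed by
 * the frame itself.  "fd", "frame" and "frame_len" are assumed to be set up
 * by the application; error handling is omitted.
 *
 *	int on = 1;
 *	struct virtio_net_hdr vh = { 0 };
 *	struct iovec iov[2] = {
 *		{ .iov_base = &vh,   .iov_len = sizeof(vh) },
 *		{ .iov_base = frame, .iov_len = frame_len  },
 *	};
 *	struct msghdr mh = { .msg_iov = iov, .msg_iovlen = 2 };
 *
 *	setsockopt(fd, SOL_PACKET, PACKET_VNET_HDR, &on, sizeof(on));
 *	sendmsg(fd, &mh, 0);
 *
 * An all-zero header requests no checksum or GSO offload, so the frame is
 * transmitted as-is.
 */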
2522 static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
2523 void *frame, struct net_device *dev, void *data, int tp_len,
2524 __be16 proto, unsigned char *addr, int hlen, int copylen,
2525 const struct sockcm_cookie *sockc)
2527 union tpacket_uhdr ph;
2528 int to_write, offset, len, nr_frags, len_max;
2529 struct socket *sock = po->sk.sk_socket;
2535 skb->protocol = proto;
2537 skb->priority = po->sk.sk_priority;
2538 skb->mark = po->sk.sk_mark;
2539 sock_tx_timestamp(&po->sk, sockc->tsflags, &skb_shinfo(skb)->tx_flags);
2540 skb_zcopy_set_nouarg(skb, ph.raw);
2542 skb_reserve(skb, hlen);
2543 skb_reset_network_header(skb);
2547 if (sock->type == SOCK_DGRAM) {
2548 err = dev_hard_header(skb, dev, ntohs(proto), addr,
2550 if (unlikely(err < 0))
2552 } else if (copylen) {
2553 int hdrlen = min_t(int, copylen, tp_len);
2555 skb_push(skb, dev->hard_header_len);
2556 skb_put(skb, copylen - dev->hard_header_len);
2557 err = skb_store_bits(skb, 0, data, hdrlen);
2560 if (!dev_validate_header(dev, skb->data, hdrlen))
2563 tpacket_set_protocol(dev, skb);
2569 offset = offset_in_page(data);
2570 len_max = PAGE_SIZE - offset;
2571 len = ((to_write > len_max) ? len_max : to_write);
2573 skb->data_len = to_write;
2574 skb->len += to_write;
2575 skb->truesize += to_write;
2576 refcount_add(to_write, &po->sk.sk_wmem_alloc);
2578 while (likely(to_write)) {
2579 nr_frags = skb_shinfo(skb)->nr_frags;
2581 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
2582 pr_err("Packet exceeds the number of skb frags (%lu)\n",
2587 page = pgv_to_page(data);
2589 flush_dcache_page(page);
2591 skb_fill_page_desc(skb, nr_frags, page, offset, len);
2594 len_max = PAGE_SIZE;
2595 len = ((to_write > len_max) ? len_max : to_write);
2598 skb_probe_transport_header(skb, 0);
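/* Illustrative userspace sketch (not part of this file): what a TPACKET_V2
 * transmitter typically does before the send() that drives tpacket_snd().
 * The data offset matches the default (no PACKET_TX_HAS_OFF) case resolved in
 * tpacket_parse_header() below; "frame" points at the current TX ring slot
 * and "pkt"/"pkt_len" hold the frame to transmit.
 *
 *	struct tpacket2_hdr *hdr = frame;
 *	unsigned char *data = (unsigned char *)frame +
 *			      TPACKET2_HDRLEN - sizeof(struct sockaddr_ll);
 *
 *	memcpy(data, pkt, pkt_len);
 *	hdr->tp_len = pkt_len;
 *	hdr->tp_status = TP_STATUS_SEND_REQUEST;
 *	send(fd, NULL, 0, 0);
 */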
2603 static int tpacket_parse_header(struct packet_sock *po, void *frame,
2604 int size_max, void **data)
2606 union tpacket_uhdr ph;
2611 switch (po->tp_version) {
2613 if (ph.h3->tp_next_offset != 0) {
2614 pr_warn_once("variable-sized slot not supported");
2617 tp_len = ph.h3->tp_len;
2620 tp_len = ph.h2->tp_len;
2623 tp_len = ph.h1->tp_len;
2626 if (unlikely(tp_len > size_max)) {
2627 pr_err("packet size too large (%d > %d)\n", tp_len, size_max);
2631 if (unlikely(po->tp_tx_has_off)) {
2632 int off_min, off_max;
2634 off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2635 off_max = po->tx_ring.frame_size - tp_len;
2636 if (po->sk.sk_type == SOCK_DGRAM) {
2637 switch (po->tp_version) {
2639 off = ph.h3->tp_net;
2642 off = ph.h2->tp_net;
2645 off = ph.h1->tp_net;
2649 switch (po->tp_version) {
2651 off = ph.h3->tp_mac;
2654 off = ph.h2->tp_mac;
2657 off = ph.h1->tp_mac;
2661 if (unlikely((off < off_min) || (off_max < off)))
2664 off = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2667 *data = frame + off;
2671 static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
2673 struct sk_buff *skb = NULL;
2674 struct net_device *dev;
2675 struct virtio_net_hdr *vnet_hdr = NULL;
2676 struct sockcm_cookie sockc;
2678 int err, reserve = 0;
2680 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
2681 bool need_wait = !(msg->msg_flags & MSG_DONTWAIT);
2682 unsigned char *addr = NULL;
2683 int tp_len, size_max;
2686 int status = TP_STATUS_AVAILABLE;
2687 int hlen, tlen, copylen = 0;
2690 mutex_lock(&po->pg_vec_lock);
2692 /* The packet_sendmsg() check on tx_ring.pg_vec was lockless,
2693 * so re-confirm it here under the protection of pg_vec_lock.
2695 if (unlikely(!po->tx_ring.pg_vec)) {
2699 if (likely(saddr == NULL)) {
2700 dev = packet_cached_dev_get(po);
2701 proto = READ_ONCE(po->num);
2704 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2706 if (msg->msg_namelen < (saddr->sll_halen
2707 + offsetof(struct sockaddr_ll,
2710 proto = saddr->sll_protocol;
2711 dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
2712 if (po->sk.sk_socket->type == SOCK_DGRAM) {
2713 if (dev && msg->msg_namelen < dev->addr_len +
2714 offsetof(struct sockaddr_ll, sll_addr))
2716 addr = saddr->sll_addr;
2721 if (unlikely(dev == NULL))
2724 if (unlikely(!(dev->flags & IFF_UP)))
2727 sockc.tsflags = po->sk.sk_tsflags;
2728 if (msg->msg_controllen) {
2729 err = sock_cmsg_send(&po->sk, msg, &sockc);
2734 if (po->sk.sk_socket->type == SOCK_RAW)
2735 reserve = dev->hard_header_len;
2736 size_max = po->tx_ring.frame_size
2737 - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
2739 if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !po->has_vnet_hdr)
2740 size_max = dev->mtu + reserve + VLAN_HLEN;
2742 reinit_completion(&po->skb_completion);
2745 ph = packet_current_frame(po, &po->tx_ring,
2746 TP_STATUS_SEND_REQUEST);
2747 if (unlikely(ph == NULL)) {
2748 if (need_wait && skb) {
2749 timeo = sock_sndtimeo(&po->sk, msg->msg_flags & MSG_DONTWAIT);
2750 timeo = wait_for_completion_interruptible_timeout(&po->skb_completion, timeo);
2752 err = !timeo ? -ETIMEDOUT : -ERESTARTSYS;
2756 /* check for additional frames */
2761 tp_len = tpacket_parse_header(po, ph, size_max, &data);
2765 status = TP_STATUS_SEND_REQUEST;
2766 hlen = LL_RESERVED_SPACE(dev);
2767 tlen = dev->needed_tailroom;
2768 if (po->has_vnet_hdr) {
2770 data += sizeof(*vnet_hdr);
2771 tp_len -= sizeof(*vnet_hdr);
2773 __packet_snd_vnet_parse(vnet_hdr, tp_len)) {
2777 copylen = __virtio16_to_cpu(vio_le(),
2780 copylen = max_t(int, copylen, dev->hard_header_len);
2781 skb = sock_alloc_send_skb(&po->sk,
2782 hlen + tlen + sizeof(struct sockaddr_ll) +
2783 (copylen - dev->hard_header_len),
2786 if (unlikely(skb == NULL)) {
2787 /* we assume the socket was initially writeable ... */
2788 if (likely(len_sum > 0))
2792 tp_len = tpacket_fill_skb(po, skb, ph, dev, data, tp_len, proto,
2793 addr, hlen, copylen, &sockc);
2794 if (likely(tp_len >= 0) &&
2795 tp_len > dev->mtu + reserve &&
2796 !po->has_vnet_hdr &&
2797 !packet_extra_vlan_len_allowed(dev, skb))
2800 if (unlikely(tp_len < 0)) {
2803 __packet_set_status(po, ph,
2804 TP_STATUS_AVAILABLE);
2805 packet_increment_head(&po->tx_ring);
2809 status = TP_STATUS_WRONG_FORMAT;
2815 if (po->has_vnet_hdr) {
2816 if (virtio_net_hdr_to_skb(skb, vnet_hdr, vio_le())) {
2820 virtio_net_hdr_set_proto(skb, vnet_hdr);
2823 skb->destructor = tpacket_destruct_skb;
2824 __packet_set_status(po, ph, TP_STATUS_SENDING);
2825 packet_inc_pending(&po->tx_ring);
2827 status = TP_STATUS_SEND_REQUEST;
2828 err = po->xmit(skb);
2829 if (unlikely(err > 0)) {
2830 err = net_xmit_errno(err);
2831 if (err && __packet_get_status(po, ph) ==
2832 TP_STATUS_AVAILABLE) {
2833 /* skb was destructed already */
2838 * skb was dropped but not destructed yet;
2839 * let's treat it like congestion or err < 0
2843 packet_increment_head(&po->tx_ring);
2845 } while (likely((ph != NULL) ||
2846 /* Note: packet_read_pending() might be slow if we have
2847 * to call it, as it's a per-CPU variable, but in the fast path
2848 * we already short-circuit the loop with the first
2849 * condition, and luckily don't have to take that path.
2852 (need_wait && packet_read_pending(&po->tx_ring))));
2858 __packet_set_status(po, ph, status);
2863 mutex_unlock(&po->pg_vec_lock);
2867 static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
2868 size_t reserve, size_t len,
2869 size_t linear, int noblock,
2872 struct sk_buff *skb;
2874 /* Under a page? Don't bother with paged skb. */
2875 if (prepad + len < PAGE_SIZE || !linear)
2878 skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
2883 skb_reserve(skb, reserve);
2884 skb_put(skb, linear);
2885 skb->data_len = len - linear;
2886 skb->len += len - linear;
2891 static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
2893 struct sock *sk = sock->sk;
2894 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
2895 struct sk_buff *skb;
2896 struct net_device *dev;
2898 unsigned char *addr = NULL;
2899 int err, reserve = 0;
2900 struct sockcm_cookie sockc;
2901 struct virtio_net_hdr vnet_hdr = { 0 };
2903 struct packet_sock *po = pkt_sk(sk);
2904 bool has_vnet_hdr = false;
2905 int hlen, tlen, linear;
2909 * Get and verify the address.
2912 if (likely(saddr == NULL)) {
2913 dev = packet_cached_dev_get(po);
2914 proto = READ_ONCE(po->num);
2917 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2919 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
2921 proto = saddr->sll_protocol;
2922 dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
2923 if (sock->type == SOCK_DGRAM) {
2924 if (dev && msg->msg_namelen < dev->addr_len +
2925 offsetof(struct sockaddr_ll, sll_addr))
2927 addr = saddr->sll_addr;
2932 if (unlikely(dev == NULL))
2935 if (unlikely(!(dev->flags & IFF_UP)))
2938 sockc.tsflags = sk->sk_tsflags;
2939 sockc.mark = sk->sk_mark;
2940 if (msg->msg_controllen) {
2941 err = sock_cmsg_send(sk, msg, &sockc);
2946 if (sock->type == SOCK_RAW)
2947 reserve = dev->hard_header_len;
2948 if (po->has_vnet_hdr) {
2949 err = packet_snd_vnet_parse(msg, &len, &vnet_hdr);
2952 has_vnet_hdr = true;
2955 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
2956 if (!netif_supports_nofcs(dev)) {
2957 err = -EPROTONOSUPPORT;
2960 extra_len = 4; /* We're doing our own CRC */
2964 if (!vnet_hdr.gso_type &&
2965 (len > dev->mtu + reserve + VLAN_HLEN + extra_len))
2969 hlen = LL_RESERVED_SPACE(dev);
2970 tlen = dev->needed_tailroom;
2971 linear = __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len);
2972 linear = max(linear, min_t(int, len, dev->hard_header_len));
2973 skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, linear,
2974 msg->msg_flags & MSG_DONTWAIT, &err);
2978 skb_reset_network_header(skb);
2981 if (sock->type == SOCK_DGRAM) {
2982 offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len);
2983 if (unlikely(offset < 0))
2985 } else if (reserve) {
2986 skb_reserve(skb, -reserve);
2988 skb_reset_network_header(skb);
2991 /* Returns -EFAULT on error */
2992 err = skb_copy_datagram_from_iter(skb, offset, &msg->msg_iter, len);
2996 if (sock->type == SOCK_RAW &&
2997 !dev_validate_header(dev, skb->data, len)) {
3002 sock_tx_timestamp(sk, sockc.tsflags, &skb_shinfo(skb)->tx_flags);
3004 if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + extra_len) &&
3005 !packet_extra_vlan_len_allowed(dev, skb)) {
3010 skb->protocol = proto;
3012 skb->priority = sk->sk_priority;
3013 skb->mark = sockc.mark;
3016 err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le());
3019 len += sizeof(vnet_hdr);
3020 virtio_net_hdr_set_proto(skb, &vnet_hdr);
3023 skb_probe_transport_header(skb, reserve);
3025 if (unlikely(extra_len == 4))
3028 err = po->xmit(skb);
3029 if (err > 0 && (err = net_xmit_errno(err)) != 0)
3045 static int packet_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
3047 struct sock *sk = sock->sk;
3048 struct packet_sock *po = pkt_sk(sk);
3050 if (po->tx_ring.pg_vec)
3051 return tpacket_snd(po, msg);
3053 return packet_snd(sock, msg, len);
3057 * Close a PACKET socket. This is fairly simple. We immediately go
3058 * to 'closed' state and remove our protocol entry in the device list.
3061 static int packet_release(struct socket *sock)
3063 struct sock *sk = sock->sk;
3064 struct packet_sock *po;
3065 struct packet_fanout *f;
3067 union tpacket_req_u req_u;
3075 mutex_lock(&net->packet.sklist_lock);
3076 sk_del_node_init_rcu(sk);
3077 mutex_unlock(&net->packet.sklist_lock);
3080 sock_prot_inuse_add(net, sk->sk_prot, -1);
3083 spin_lock(&po->bind_lock);
3084 unregister_prot_hook(sk, false);
3085 packet_cached_dev_reset(po);
3087 if (po->prot_hook.dev) {
3088 dev_put(po->prot_hook.dev);
3089 po->prot_hook.dev = NULL;
3091 spin_unlock(&po->bind_lock);
3093 packet_flush_mclist(sk);
3096 if (po->rx_ring.pg_vec) {
3097 memset(&req_u, 0, sizeof(req_u));
3098 packet_set_ring(sk, &req_u, 1, 0);
3101 if (po->tx_ring.pg_vec) {
3102 memset(&req_u, 0, sizeof(req_u));
3103 packet_set_ring(sk, &req_u, 1, 1);
3107 f = fanout_release(sk);
3112 kfree(po->rollover);
3113 fanout_release_data(f);
3117 * Now the socket is dead. No more input will appear.
3124 skb_queue_purge(&sk->sk_receive_queue);
3125 packet_free_pending(po);
3126 sk_refcnt_debug_release(sk);
3133 * Attach a packet hook.
3136 static int packet_do_bind(struct sock *sk, const char *name, int ifindex,
3139 struct packet_sock *po = pkt_sk(sk);
3140 struct net_device *dev_curr;
3143 struct net_device *dev = NULL;
3145 bool unlisted = false;
3148 spin_lock(&po->bind_lock);
3157 dev = dev_get_by_name_rcu(sock_net(sk), name);
3162 } else if (ifindex) {
3163 dev = dev_get_by_index_rcu(sock_net(sk), ifindex);
3173 proto_curr = po->prot_hook.type;
3174 dev_curr = po->prot_hook.dev;
3176 need_rehook = proto_curr != proto || dev_curr != dev;
3181 /* prevents packet_notifier() from calling
3182 * register_prot_hook()
3184 WRITE_ONCE(po->num, 0);
3185 __unregister_prot_hook(sk, true);
3187 dev_curr = po->prot_hook.dev;
3189 unlisted = !dev_get_by_index_rcu(sock_net(sk),
3193 BUG_ON(po->running);
3194 WRITE_ONCE(po->num, proto);
3195 po->prot_hook.type = proto;
3197 if (unlikely(unlisted)) {
3199 po->prot_hook.dev = NULL;
3200 WRITE_ONCE(po->ifindex, -1);
3201 packet_cached_dev_reset(po);
3203 po->prot_hook.dev = dev;
3204 WRITE_ONCE(po->ifindex, dev ? dev->ifindex : 0);
3205 packet_cached_dev_assign(po, dev);
3211 if (proto == 0 || !need_rehook)
3214 if (!unlisted && (!dev || (dev->flags & IFF_UP))) {
3215 register_prot_hook(sk);
3217 sk->sk_err = ENETDOWN;
3218 if (!sock_flag(sk, SOCK_DEAD))
3219 sk->sk_error_report(sk);
3224 spin_unlock(&po->bind_lock);
3230 * Bind a packet socket to a device
3233 static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
3236 struct sock *sk = sock->sk;
3237 char name[sizeof(uaddr->sa_data) + 1];
3243 if (addr_len != sizeof(struct sockaddr))
3245 /* uaddr->sa_data comes from userspace; it's not guaranteed to be
3246 * zero-terminated.
3248 memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data));
3249 name[sizeof(uaddr->sa_data)] = 0;
3251 return packet_do_bind(sk, name, 0, pkt_sk(sk)->num);
3254 static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
3256 struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
3257 struct sock *sk = sock->sk;
3263 if (addr_len < sizeof(struct sockaddr_ll))
3265 if (sll->sll_family != AF_PACKET)
3268 return packet_do_bind(sk, NULL, sll->sll_ifindex,
3269 sll->sll_protocol ? : pkt_sk(sk)->num);
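/* Illustrative userspace sketch (not part of this file): the sockaddr_ll
 * bind that ends up in packet_do_bind() above, assuming the usual
 * <sys/socket.h>, <linux/if_packet.h>, <linux/if_ether.h> and <net/if.h>
 * headers and a placeholder interface name "eth0".
 *
 *	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *	struct sockaddr_ll sll = {
 *		.sll_family   = AF_PACKET,
 *		.sll_protocol = htons(ETH_P_ALL),
 *		.sll_ifindex  = if_nametoindex("eth0"),
 *	};
 *
 *	bind(fd, (struct sockaddr *)&sll, sizeof(sll));
 */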
3272 static struct proto packet_proto = {
3274 .owner = THIS_MODULE,
3275 .obj_size = sizeof(struct packet_sock),
3279 * Create a packet of type SOCK_PACKET.
3282 static int packet_create(struct net *net, struct socket *sock, int protocol,
3286 struct packet_sock *po;
3287 __be16 proto = (__force __be16)protocol; /* weird, but documented */
3290 if (!ns_capable(net->user_ns, CAP_NET_RAW))
3292 if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
3293 sock->type != SOCK_PACKET)
3294 return -ESOCKTNOSUPPORT;
3296 sock->state = SS_UNCONNECTED;
3299 sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto, kern);
3303 sock->ops = &packet_ops;
3304 if (sock->type == SOCK_PACKET)
3305 sock->ops = &packet_ops_spkt;
3307 sock_init_data(sock, sk);
3310 init_completion(&po->skb_completion);
3311 sk->sk_family = PF_PACKET;
3313 po->xmit = dev_queue_xmit;
3315 err = packet_alloc_pending(po);
3319 packet_cached_dev_reset(po);
3321 sk->sk_destruct = packet_sock_destruct;
3322 sk_refcnt_debug_inc(sk);
3325 * Attach a protocol block
3328 spin_lock_init(&po->bind_lock);
3329 mutex_init(&po->pg_vec_lock);
3330 po->rollover = NULL;
3331 po->prot_hook.func = packet_rcv;
3333 if (sock->type == SOCK_PACKET)
3334 po->prot_hook.func = packet_rcv_spkt;
3336 po->prot_hook.af_packet_priv = sk;
3337 po->prot_hook.af_packet_net = sock_net(sk);
3340 po->prot_hook.type = proto;
3341 __register_prot_hook(sk);
3344 mutex_lock(&net->packet.sklist_lock);
3345 sk_add_node_tail_rcu(sk, &net->packet.sklist);
3346 mutex_unlock(&net->packet.sklist_lock);
3349 sock_prot_inuse_add(net, &packet_proto, 1);
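/* Illustrative userspace sketch (not part of this file): the three socket
 * types accepted by packet_create() above.  CAP_NET_RAW is required, and the
 * protocol argument is passed in network byte order ("weird, but documented",
 * as noted above).
 *
 *	int raw   = socket(AF_PACKET, SOCK_RAW,    htons(ETH_P_ALL));
 *	int dgram = socket(AF_PACKET, SOCK_DGRAM,  htons(ETH_P_IP));
 *	int spkt  = socket(AF_PACKET, SOCK_PACKET, htons(ETH_P_ALL));
 */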
3360 * Pull a packet from our receive queue and hand it to the user.
3361 * If necessary we block.
3364 static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
3367 struct sock *sk = sock->sk;
3368 struct sk_buff *skb;
3370 int vnet_hdr_len = 0;
3371 unsigned int origlen = 0;
3374 if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
3378 /* What error should we return now? EUNATTACH? */
3379 if (pkt_sk(sk)->ifindex < 0)
3383 if (flags & MSG_ERRQUEUE) {
3384 err = sock_recv_errqueue(sk, msg, len,
3385 SOL_PACKET, PACKET_TX_TIMESTAMP);
3390 * Call the generic datagram receiver. This handles all sorts
3391 * of horrible races and re-entrancy so we can forget about it
3392 * in the protocol layers.
3394 * Now it will return ENETDOWN if the device has just gone down,
3395 * but then it will block.
3398 skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
3401 * An error occurred, so return it. Because skb_recv_datagram()
3402 * handles the blocking, we don't need to worry about blocking retries.
3409 if (pkt_sk(sk)->pressure)
3410 packet_rcv_has_room(pkt_sk(sk), NULL);
3412 if (pkt_sk(sk)->has_vnet_hdr) {
3413 err = packet_rcv_vnet(msg, skb, &len);
3416 vnet_hdr_len = sizeof(struct virtio_net_hdr);
3419 /* You lose any data beyond the buffer you gave. If this worries
3420 * a user program, it can ask the device for its MTU anyway.
3426 msg->msg_flags |= MSG_TRUNC;
3429 err = skb_copy_datagram_msg(skb, 0, msg, copied);
3433 if (sock->type != SOCK_PACKET) {
3434 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
3436 /* Original length was stored in sockaddr_ll fields */
3437 origlen = PACKET_SKB_CB(skb)->sa.origlen;
3438 sll->sll_family = AF_PACKET;
3439 sll->sll_protocol = skb->protocol;
3442 sock_recv_ts_and_drops(msg, sk, skb);
3444 if (msg->msg_name) {
3447 /* If the address length field is there to be filled
3448 * in, we fill it in now.
3450 if (sock->type == SOCK_PACKET) {
3451 __sockaddr_check_size(sizeof(struct sockaddr_pkt));
3452 msg->msg_namelen = sizeof(struct sockaddr_pkt);
3453 copy_len = msg->msg_namelen;
3455 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
3457 msg->msg_namelen = sll->sll_halen +
3458 offsetof(struct sockaddr_ll, sll_addr);
3459 copy_len = msg->msg_namelen;
3460 if (msg->msg_namelen < sizeof(struct sockaddr_ll)) {
3461 memset(msg->msg_name +
3462 offsetof(struct sockaddr_ll, sll_addr),
3463 0, sizeof(sll->sll_addr));
3464 msg->msg_namelen = sizeof(struct sockaddr_ll);
3467 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa, copy_len);
3470 if (pkt_sk(sk)->auxdata) {
3471 struct tpacket_auxdata aux;
3473 aux.tp_status = TP_STATUS_USER;
3474 if (skb->ip_summed == CHECKSUM_PARTIAL)
3475 aux.tp_status |= TP_STATUS_CSUMNOTREADY;
3476 else if (skb->pkt_type != PACKET_OUTGOING &&
3477 (skb->ip_summed == CHECKSUM_COMPLETE ||
3478 skb_csum_unnecessary(skb)))
3479 aux.tp_status |= TP_STATUS_CSUM_VALID;
3481 aux.tp_len = origlen;
3482 aux.tp_snaplen = skb->len;
3484 aux.tp_net = skb_network_offset(skb);
3485 if (skb_vlan_tag_present(skb)) {
3486 aux.tp_vlan_tci = skb_vlan_tag_get(skb);
3487 aux.tp_vlan_tpid = ntohs(skb->vlan_proto);
3488 aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
3490 aux.tp_vlan_tci = 0;
3491 aux.tp_vlan_tpid = 0;
3493 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
3497 * Free or return the buffer as appropriate. Again this
3498 * hides all the races and re-entrancy issues from us.
3500 err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
3503 skb_free_datagram(sk, skb);
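/* Illustrative userspace sketch (not part of this file): reading the
 * tpacket_auxdata control message that packet_recvmsg() emits once
 * PACKET_AUXDATA is enabled.  "fd" is an AF_PACKET socket, "buf"/"cbuf" are
 * application-owned arrays, and use_aux() is a hypothetical consumer.
 *
 *	int on = 1;
 *	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
 *	struct msghdr mh = {
 *		.msg_iov = &iov, .msg_iovlen = 1,
 *		.msg_control = cbuf, .msg_controllen = sizeof(cbuf),
 *	};
 *	struct cmsghdr *cmsg;
 *
 *	setsockopt(fd, SOL_PACKET, PACKET_AUXDATA, &on, sizeof(on));
 *	recvmsg(fd, &mh, 0);
 *	for (cmsg = CMSG_FIRSTHDR(&mh); cmsg; cmsg = CMSG_NXTHDR(&mh, cmsg)) {
 *		if (cmsg->cmsg_level == SOL_PACKET &&
 *		    cmsg->cmsg_type == PACKET_AUXDATA)
 *			use_aux((struct tpacket_auxdata *)CMSG_DATA(cmsg));
 *	}
 */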
3508 static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
3509 int *uaddr_len, int peer)
3511 struct net_device *dev;
3512 struct sock *sk = sock->sk;
3517 uaddr->sa_family = AF_PACKET;
3518 memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data));
3520 dev = dev_get_by_index_rcu(sock_net(sk), READ_ONCE(pkt_sk(sk)->ifindex));
3522 strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data));
3524 *uaddr_len = sizeof(*uaddr);
3529 static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
3530 int *uaddr_len, int peer)
3532 struct net_device *dev;
3533 struct sock *sk = sock->sk;
3534 struct packet_sock *po = pkt_sk(sk);
3535 DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
3541 ifindex = READ_ONCE(po->ifindex);
3542 sll->sll_family = AF_PACKET;
3543 sll->sll_ifindex = ifindex;
3544 sll->sll_protocol = READ_ONCE(po->num);
3545 sll->sll_pkttype = 0;
3547 dev = dev_get_by_index_rcu(sock_net(sk), ifindex);
3549 sll->sll_hatype = dev->type;
3550 sll->sll_halen = dev->addr_len;
3551 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
3553 sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */
3557 *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
3562 static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
3566 case PACKET_MR_MULTICAST:
3567 if (i->alen != dev->addr_len)
3570 return dev_mc_add(dev, i->addr);
3572 return dev_mc_del(dev, i->addr);
3574 case PACKET_MR_PROMISC:
3575 return dev_set_promiscuity(dev, what);
3576 case PACKET_MR_ALLMULTI:
3577 return dev_set_allmulti(dev, what);
3578 case PACKET_MR_UNICAST:
3579 if (i->alen != dev->addr_len)
3582 return dev_uc_add(dev, i->addr);
3584 return dev_uc_del(dev, i->addr);
3592 static void packet_dev_mclist_delete(struct net_device *dev,
3593 struct packet_mclist **mlp)
3595 struct packet_mclist *ml;
3597 while ((ml = *mlp) != NULL) {
3598 if (ml->ifindex == dev->ifindex) {
3599 packet_dev_mc(dev, ml, -1);
3607 static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
3609 struct packet_sock *po = pkt_sk(sk);
3610 struct packet_mclist *ml, *i;
3611 struct net_device *dev;
3617 dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
3622 if (mreq->mr_alen > dev->addr_len)
3626 i = kmalloc(sizeof(*i), GFP_KERNEL);
3631 for (ml = po->mclist; ml; ml = ml->next) {
3632 if (ml->ifindex == mreq->mr_ifindex &&
3633 ml->type == mreq->mr_type &&
3634 ml->alen == mreq->mr_alen &&
3635 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3637 /* Free the new element ... */
3643 i->type = mreq->mr_type;
3644 i->ifindex = mreq->mr_ifindex;
3645 i->alen = mreq->mr_alen;
3646 memcpy(i->addr, mreq->mr_address, i->alen);
3647 memset(i->addr + i->alen, 0, sizeof(i->addr) - i->alen);
3649 i->next = po->mclist;
3651 err = packet_dev_mc(dev, i, 1);
3653 po->mclist = i->next;
3662 static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
3664 struct packet_mclist *ml, **mlp;
3668 for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
3669 if (ml->ifindex == mreq->mr_ifindex &&
3670 ml->type == mreq->mr_type &&
3671 ml->alen == mreq->mr_alen &&
3672 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3673 if (--ml->count == 0) {
3674 struct net_device *dev;
3676 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3678 packet_dev_mc(dev, ml, -1);
3688 static void packet_flush_mclist(struct sock *sk)
3690 struct packet_sock *po = pkt_sk(sk);
3691 struct packet_mclist *ml;
3697 while ((ml = po->mclist) != NULL) {
3698 struct net_device *dev;
3700 po->mclist = ml->next;
3701 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3703 packet_dev_mc(dev, ml, -1);
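/* Illustrative userspace sketch (not part of this file): the PACKET_MR_PROMISC
 * membership handled by packet_mc_add()/packet_dev_mc() above, using a
 * placeholder interface name "eth0".
 *
 *	struct packet_mreq mreq = {
 *		.mr_ifindex = if_nametoindex("eth0"),
 *		.mr_type    = PACKET_MR_PROMISC,
 *	};
 *
 *	setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP, &mreq, sizeof(mreq));
 */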
3710 packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
3712 struct sock *sk = sock->sk;
3713 struct packet_sock *po = pkt_sk(sk);
3716 if (level != SOL_PACKET)
3717 return -ENOPROTOOPT;
3720 case PACKET_ADD_MEMBERSHIP:
3721 case PACKET_DROP_MEMBERSHIP:
3723 struct packet_mreq_max mreq;
3725 memset(&mreq, 0, sizeof(mreq));
3726 if (len < sizeof(struct packet_mreq))
3728 if (len > sizeof(mreq))
3730 if (copy_from_user(&mreq, optval, len))
3732 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
3734 if (optname == PACKET_ADD_MEMBERSHIP)
3735 ret = packet_mc_add(sk, &mreq);
3737 ret = packet_mc_drop(sk, &mreq);
3741 case PACKET_RX_RING:
3742 case PACKET_TX_RING:
3744 union tpacket_req_u req_u;
3748 switch (po->tp_version) {
3751 len = sizeof(req_u.req);
3755 len = sizeof(req_u.req3);
3761 if (copy_from_user(&req_u.req, optval, len))
3764 ret = packet_set_ring(sk, &req_u, 0,
3765 optname == PACKET_TX_RING);
3770 case PACKET_COPY_THRESH:
3774 if (optlen != sizeof(val))
3776 if (copy_from_user(&val, optval, sizeof(val)))
3779 pkt_sk(sk)->copy_thresh = val;
3782 case PACKET_VERSION:
3786 if (optlen != sizeof(val))
3788 if (copy_from_user(&val, optval, sizeof(val)))
3799 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3802 po->tp_version = val;
3808 case PACKET_RESERVE:
3812 if (optlen != sizeof(val))
3814 if (copy_from_user(&val, optval, sizeof(val)))
3819 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3822 po->tp_reserve = val;
3832 if (optlen != sizeof(val))
3834 if (copy_from_user(&val, optval, sizeof(val)))
3838 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3841 po->tp_loss = !!val;
3847 case PACKET_AUXDATA:
3851 if (optlen < sizeof(val))
3853 if (copy_from_user(&val, optval, sizeof(val)))
3857 po->auxdata = !!val;
3861 case PACKET_ORIGDEV:
3865 if (optlen < sizeof(val))
3867 if (copy_from_user(&val, optval, sizeof(val)))
3871 po->origdev = !!val;
3875 case PACKET_VNET_HDR:
3879 if (sock->type != SOCK_RAW)
3881 if (optlen < sizeof(val))
3883 if (copy_from_user(&val, optval, sizeof(val)))
3887 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3890 po->has_vnet_hdr = !!val;
3896 case PACKET_TIMESTAMP:
3900 if (optlen != sizeof(val))
3902 if (copy_from_user(&val, optval, sizeof(val)))
3905 po->tp_tstamp = val;
3912 if (optlen != sizeof(val))
3914 if (copy_from_user(&val, optval, sizeof(val)))
3917 return fanout_add(sk, val & 0xffff, val >> 16);
3919 case PACKET_FANOUT_DATA:
3921 /* Paired with the WRITE_ONCE() in fanout_add() */
3922 if (!READ_ONCE(po->fanout))
3925 return fanout_set_data(po, optval, optlen);
3927 case PACKET_TX_HAS_OFF:
3931 if (optlen != sizeof(val))
3933 if (copy_from_user(&val, optval, sizeof(val)))
3937 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3940 po->tp_tx_has_off = !!val;
3946 case PACKET_QDISC_BYPASS:
3950 if (optlen != sizeof(val))
3952 if (copy_from_user(&val, optval, sizeof(val)))
3955 po->xmit = val ? packet_direct_xmit : dev_queue_xmit;
3959 return -ENOPROTOOPT;
3963 static int packet_getsockopt(struct socket *sock, int level, int optname,
3964 char __user *optval, int __user *optlen)
3967 int val, lv = sizeof(val);
3968 struct sock *sk = sock->sk;
3969 struct packet_sock *po = pkt_sk(sk);
3971 union tpacket_stats_u st;
3972 struct tpacket_rollover_stats rstats;
3974 if (level != SOL_PACKET)
3975 return -ENOPROTOOPT;
3977 if (get_user(len, optlen))
3984 case PACKET_STATISTICS:
3985 spin_lock_bh(&sk->sk_receive_queue.lock);
3986 memcpy(&st, &po->stats, sizeof(st));
3987 memset(&po->stats, 0, sizeof(po->stats));
3988 spin_unlock_bh(&sk->sk_receive_queue.lock);
3990 if (po->tp_version == TPACKET_V3) {
3991 lv = sizeof(struct tpacket_stats_v3);
3992 st.stats3.tp_packets += st.stats3.tp_drops;
3995 lv = sizeof(struct tpacket_stats);
3996 st.stats1.tp_packets += st.stats1.tp_drops;
4001 case PACKET_AUXDATA:
4004 case PACKET_ORIGDEV:
4007 case PACKET_VNET_HDR:
4008 val = po->has_vnet_hdr;
4010 case PACKET_VERSION:
4011 val = po->tp_version;
4014 if (len > sizeof(int))
4016 if (len < sizeof(int))
4018 if (copy_from_user(&val, optval, len))
4022 val = sizeof(struct tpacket_hdr);
4025 val = sizeof(struct tpacket2_hdr);
4028 val = sizeof(struct tpacket3_hdr);
4034 case PACKET_RESERVE:
4035 val = po->tp_reserve;
4040 case PACKET_TIMESTAMP:
4041 val = po->tp_tstamp;
4045 ((u32)po->fanout->id |
4046 ((u32)po->fanout->type << 16) |
4047 ((u32)po->fanout->flags << 24)) :
4050 case PACKET_ROLLOVER_STATS:
4053 rstats.tp_all = atomic_long_read(&po->rollover->num);
4054 rstats.tp_huge = atomic_long_read(&po->rollover->num_huge);
4055 rstats.tp_failed = atomic_long_read(&po->rollover->num_failed);
4057 lv = sizeof(rstats);
4059 case PACKET_TX_HAS_OFF:
4060 val = po->tp_tx_has_off;
4062 case PACKET_QDISC_BYPASS:
4063 val = packet_use_direct_xmit(po);
4066 return -ENOPROTOOPT;
4071 if (put_user(len, optlen))
4073 if (copy_to_user(optval, data, len))
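/* Illustrative userspace sketch (not part of this file): the PACKET_FANOUT
 * value packs a 16-bit group id into the low half of the int and the fanout
 * type (plus optional flags) into the upper half, mirroring the
 * "val & 0xffff" / "val >> 16" split in packet_setsockopt() above.
 *
 *	int id  = 42;
 *	int val = id | (PACKET_FANOUT_HASH << 16);
 *
 *	setsockopt(fd, SOL_PACKET, PACKET_FANOUT, &val, sizeof(val));
 */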
4079 #ifdef CONFIG_COMPAT
4080 static int compat_packet_setsockopt(struct socket *sock, int level, int optname,
4081 char __user *optval, unsigned int optlen)
4083 struct packet_sock *po = pkt_sk(sock->sk);
4085 if (level != SOL_PACKET)
4086 return -ENOPROTOOPT;
4088 if (optname == PACKET_FANOUT_DATA &&
4089 po->fanout && po->fanout->type == PACKET_FANOUT_CBPF) {
4090 optval = (char __user *)get_compat_bpf_fprog(optval);
4093 optlen = sizeof(struct sock_fprog);
4096 return packet_setsockopt(sock, level, optname, optval, optlen);
4100 static int packet_notifier(struct notifier_block *this,
4101 unsigned long msg, void *ptr)
4104 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
4105 struct net *net = dev_net(dev);
4108 sk_for_each_rcu(sk, &net->packet.sklist) {
4109 struct packet_sock *po = pkt_sk(sk);
4112 case NETDEV_UNREGISTER:
4114 packet_dev_mclist_delete(dev, &po->mclist);
4118 if (dev->ifindex == po->ifindex) {
4119 spin_lock(&po->bind_lock);
4121 __unregister_prot_hook(sk, false);
4122 sk->sk_err = ENETDOWN;
4123 if (!sock_flag(sk, SOCK_DEAD))
4124 sk->sk_error_report(sk);
4126 if (msg == NETDEV_UNREGISTER) {
4127 packet_cached_dev_reset(po);
4128 WRITE_ONCE(po->ifindex, -1);
4129 if (po->prot_hook.dev)
4130 dev_put(po->prot_hook.dev);
4131 po->prot_hook.dev = NULL;
4133 spin_unlock(&po->bind_lock);
4137 if (dev->ifindex == po->ifindex) {
4138 spin_lock(&po->bind_lock);
4140 register_prot_hook(sk);
4141 spin_unlock(&po->bind_lock);
4151 static int packet_ioctl(struct socket *sock, unsigned int cmd,
4154 struct sock *sk = sock->sk;
4159 int amount = sk_wmem_alloc_get(sk);
4161 return put_user(amount, (int __user *)arg);
4165 struct sk_buff *skb;
4168 spin_lock_bh(&sk->sk_receive_queue.lock);
4169 skb = skb_peek(&sk->sk_receive_queue);
4172 spin_unlock_bh(&sk->sk_receive_queue.lock);
4173 return put_user(amount, (int __user *)arg);
4176 return sock_get_timestamp(sk, (struct timeval __user *)arg);
4178 return sock_get_timestampns(sk, (struct timespec __user *)arg);
4188 case SIOCGIFBRDADDR:
4189 case SIOCSIFBRDADDR:
4190 case SIOCGIFNETMASK:
4191 case SIOCSIFNETMASK:
4192 case SIOCGIFDSTADDR:
4193 case SIOCSIFDSTADDR:
4195 return inet_dgram_ops.ioctl(sock, cmd, arg);
4199 return -ENOIOCTLCMD;
4204 static unsigned int packet_poll(struct file *file, struct socket *sock,
4207 struct sock *sk = sock->sk;
4208 struct packet_sock *po = pkt_sk(sk);
4209 unsigned int mask = datagram_poll(file, sock, wait);
4211 spin_lock_bh(&sk->sk_receive_queue.lock);
4212 if (po->rx_ring.pg_vec) {
4213 if (!packet_previous_rx_frame(po, &po->rx_ring,
4215 mask |= POLLIN | POLLRDNORM;
4217 if (po->pressure && __packet_rcv_has_room(po, NULL) == ROOM_NORMAL)
4219 spin_unlock_bh(&sk->sk_receive_queue.lock);
4220 spin_lock_bh(&sk->sk_write_queue.lock);
4221 if (po->tx_ring.pg_vec) {
4222 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
4223 mask |= POLLOUT | POLLWRNORM;
4225 spin_unlock_bh(&sk->sk_write_queue.lock);
4230 /* Dirty? Well, I still have not learned a better way to account
4231 * for user mmaps.
4234 static void packet_mm_open(struct vm_area_struct *vma)
4236 struct file *file = vma->vm_file;
4237 struct socket *sock = file->private_data;
4238 struct sock *sk = sock->sk;
4241 atomic_inc(&pkt_sk(sk)->mapped);
4244 static void packet_mm_close(struct vm_area_struct *vma)
4246 struct file *file = vma->vm_file;
4247 struct socket *sock = file->private_data;
4248 struct sock *sk = sock->sk;
4251 atomic_dec(&pkt_sk(sk)->mapped);
4254 static const struct vm_operations_struct packet_mmap_ops = {
4255 .open = packet_mm_open,
4256 .close = packet_mm_close,
4259 static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
4264 for (i = 0; i < len; i++) {
4265 if (likely(pg_vec[i].buffer)) {
4266 if (is_vmalloc_addr(pg_vec[i].buffer))
4267 vfree(pg_vec[i].buffer);
4269 free_pages((unsigned long)pg_vec[i].buffer,
4271 pg_vec[i].buffer = NULL;
4277 static char *alloc_one_pg_vec_page(unsigned long order)
4280 gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
4281 __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
4283 buffer = (char *) __get_free_pages(gfp_flags, order);
4287 /* __get_free_pages failed, fall back to vmalloc */
4288 buffer = vzalloc((1 << order) * PAGE_SIZE);
4292 /* vmalloc failed, let's dig into swap here */
4293 gfp_flags &= ~__GFP_NORETRY;
4294 buffer = (char *) __get_free_pages(gfp_flags, order);
4298 /* complete and utter failure */
4302 static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
4304 unsigned int block_nr = req->tp_block_nr;
4308 pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL | __GFP_NOWARN);
4309 if (unlikely(!pg_vec))
4312 for (i = 0; i < block_nr; i++) {
4313 pg_vec[i].buffer = alloc_one_pg_vec_page(order);
4314 if (unlikely(!pg_vec[i].buffer))
4315 goto out_free_pgvec;
4322 free_pg_vec(pg_vec, order, block_nr);
4327 static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
4328 int closing, int tx_ring)
4330 struct pgv *pg_vec = NULL;
4331 struct packet_sock *po = pkt_sk(sk);
4332 unsigned long *rx_owner_map = NULL;
4333 int was_running, order = 0;
4334 struct packet_ring_buffer *rb;
4335 struct sk_buff_head *rb_queue;
4338 /* Local alias added to keep code churn minimal */
4339 struct tpacket_req *req = &req_u->req;
4341 rb = tx_ring ? &po->tx_ring : &po->rx_ring;
4342 rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
4346 if (atomic_read(&po->mapped))
4348 if (packet_read_pending(rb))
4352 if (req->tp_block_nr) {
4353 unsigned int min_frame_size;
4355 /* Sanity tests and some calculations */
4357 if (unlikely(rb->pg_vec))
4360 switch (po->tp_version) {
4362 po->tp_hdrlen = TPACKET_HDRLEN;
4365 po->tp_hdrlen = TPACKET2_HDRLEN;
4368 po->tp_hdrlen = TPACKET3_HDRLEN;
4373 if (unlikely((int)req->tp_block_size <= 0))
4375 if (unlikely(!PAGE_ALIGNED(req->tp_block_size)))
4377 min_frame_size = po->tp_hdrlen + po->tp_reserve;
4378 if (po->tp_version >= TPACKET_V3 &&
4379 req->tp_block_size <
4380 BLK_PLUS_PRIV((u64)req_u->req3.tp_sizeof_priv) + min_frame_size)
4382 if (unlikely(req->tp_frame_size < min_frame_size))
4384 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
4387 rb->frames_per_block = req->tp_block_size / req->tp_frame_size;
4388 if (unlikely(rb->frames_per_block == 0))
4390 if (unlikely(rb->frames_per_block > UINT_MAX / req->tp_block_nr))
4392 if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
4397 order = get_order(req->tp_block_size);
4398 pg_vec = alloc_pg_vec(req, order);
4399 if (unlikely(!pg_vec))
4401 switch (po->tp_version) {
4403 /* Block transmit is not supported yet */
4405 init_prb_bdqc(po, rb, pg_vec, req_u);
4407 struct tpacket_req3 *req3 = &req_u->req3;
4409 if (req3->tp_retire_blk_tov ||
4410 req3->tp_sizeof_priv ||
4411 req3->tp_feature_req_word) {
4413 goto out_free_pg_vec;
4419 rx_owner_map = bitmap_alloc(req->tp_frame_nr,
4420 GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO);
4422 goto out_free_pg_vec;
4430 if (unlikely(req->tp_frame_nr))
4435 /* Detach socket from network */
4436 spin_lock(&po->bind_lock);
4437 was_running = po->running;
4440 WRITE_ONCE(po->num, 0);
4441 __unregister_prot_hook(sk, false);
4443 spin_unlock(&po->bind_lock);
4448 mutex_lock(&po->pg_vec_lock);
4449 if (closing || atomic_read(&po->mapped) == 0) {
4451 spin_lock_bh(&rb_queue->lock);
4452 swap(rb->pg_vec, pg_vec);
4453 if (po->tp_version <= TPACKET_V2)
4454 swap(rb->rx_owner_map, rx_owner_map);
4455 rb->frame_max = (req->tp_frame_nr - 1);
4457 rb->frame_size = req->tp_frame_size;
4458 spin_unlock_bh(&rb_queue->lock);
4460 swap(rb->pg_vec_order, order);
4461 swap(rb->pg_vec_len, req->tp_block_nr);
4463 rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
4464 po->prot_hook.func = (po->rx_ring.pg_vec) ?
4465 tpacket_rcv : packet_rcv;
4466 skb_queue_purge(rb_queue);
4467 if (atomic_read(&po->mapped))
4468 pr_err("packet_mmap: vma is busy: %d\n",
4469 atomic_read(&po->mapped));
4471 mutex_unlock(&po->pg_vec_lock);
4473 spin_lock(&po->bind_lock);
4475 WRITE_ONCE(po->num, num);
4476 register_prot_hook(sk);
4478 spin_unlock(&po->bind_lock);
4479 if (pg_vec && (po->tp_version > TPACKET_V2)) {
4480 /* Because we don't support block-based V3 on tx-ring */
4482 prb_shutdown_retire_blk_timer(po, rb_queue);
4487 bitmap_free(rx_owner_map);
4488 free_pg_vec(pg_vec, order, req->tp_block_nr);
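/* Illustrative userspace sketch (not part of this file): the setup sequence
 * that lands in packet_set_ring() above and packet_mmap() below, assuming
 * TPACKET_V2, 4 KiB pages and an already created AF_PACKET socket "fd".
 * The sizes are arbitrary but satisfy the block/frame checks above.
 *
 *	int ver = TPACKET_V2;
 *	struct tpacket_req req = {
 *		.tp_block_size = 1 << 12,
 *		.tp_block_nr   = 64,
 *		.tp_frame_size = 1 << 11,
 *		.tp_frame_nr   = 128,
 *	};
 *	void *ring;
 *
 *	setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 *	ring = mmap(NULL, (size_t)req.tp_block_size * req.tp_block_nr,
 *		    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 */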
4494 static int packet_mmap(struct file *file, struct socket *sock,
4495 struct vm_area_struct *vma)
4497 struct sock *sk = sock->sk;
4498 struct packet_sock *po = pkt_sk(sk);
4499 unsigned long size, expected_size;
4500 struct packet_ring_buffer *rb;
4501 unsigned long start;
4508 mutex_lock(&po->pg_vec_lock);
4511 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
4513 expected_size += rb->pg_vec_len
4519 if (expected_size == 0)
4522 size = vma->vm_end - vma->vm_start;
4523 if (size != expected_size)
4526 start = vma->vm_start;
4527 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
4528 if (rb->pg_vec == NULL)
4531 for (i = 0; i < rb->pg_vec_len; i++) {
4533 void *kaddr = rb->pg_vec[i].buffer;
4536 for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
4537 page = pgv_to_page(kaddr);
4538 err = vm_insert_page(vma, start, page);
4547 atomic_inc(&po->mapped);
4548 vma->vm_ops = &packet_mmap_ops;
4552 mutex_unlock(&po->pg_vec_lock);
4556 static const struct proto_ops packet_ops_spkt = {
4557 .family = PF_PACKET,
4558 .owner = THIS_MODULE,
4559 .release = packet_release,
4560 .bind = packet_bind_spkt,
4561 .connect = sock_no_connect,
4562 .socketpair = sock_no_socketpair,
4563 .accept = sock_no_accept,
4564 .getname = packet_getname_spkt,
4565 .poll = datagram_poll,
4566 .ioctl = packet_ioctl,
4567 .listen = sock_no_listen,
4568 .shutdown = sock_no_shutdown,
4569 .setsockopt = sock_no_setsockopt,
4570 .getsockopt = sock_no_getsockopt,
4571 .sendmsg = packet_sendmsg_spkt,
4572 .recvmsg = packet_recvmsg,
4573 .mmap = sock_no_mmap,
4574 .sendpage = sock_no_sendpage,
4577 static const struct proto_ops packet_ops = {
4578 .family = PF_PACKET,
4579 .owner = THIS_MODULE,
4580 .release = packet_release,
4581 .bind = packet_bind,
4582 .connect = sock_no_connect,
4583 .socketpair = sock_no_socketpair,
4584 .accept = sock_no_accept,
4585 .getname = packet_getname,
4586 .poll = packet_poll,
4587 .ioctl = packet_ioctl,
4588 .listen = sock_no_listen,
4589 .shutdown = sock_no_shutdown,
4590 .setsockopt = packet_setsockopt,
4591 .getsockopt = packet_getsockopt,
4592 #ifdef CONFIG_COMPAT
4593 .compat_setsockopt = compat_packet_setsockopt,
4595 .sendmsg = packet_sendmsg,
4596 .recvmsg = packet_recvmsg,
4597 .mmap = packet_mmap,
4598 .sendpage = sock_no_sendpage,
4601 static const struct net_proto_family packet_family_ops = {
4602 .family = PF_PACKET,
4603 .create = packet_create,
4604 .owner = THIS_MODULE,
4607 static struct notifier_block packet_netdev_notifier = {
4608 .notifier_call = packet_notifier,
4611 #ifdef CONFIG_PROC_FS
4613 static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
4616 struct net *net = seq_file_net(seq);
4619 return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
4622 static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4624 struct net *net = seq_file_net(seq);
4625 return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
4628 static void packet_seq_stop(struct seq_file *seq, void *v)
4634 static int packet_seq_show(struct seq_file *seq, void *v)
4636 if (v == SEQ_START_TOKEN)
4637 seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n");
4639 struct sock *s = sk_entry(v);
4640 const struct packet_sock *po = pkt_sk(s);
4643 "%pK %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n",
4645 refcount_read(&s->sk_refcnt),
4647 ntohs(READ_ONCE(po->num)),
4648 READ_ONCE(po->ifindex),
4650 atomic_read(&s->sk_rmem_alloc),
4651 from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),
4658 static const struct seq_operations packet_seq_ops = {
4659 .start = packet_seq_start,
4660 .next = packet_seq_next,
4661 .stop = packet_seq_stop,
4662 .show = packet_seq_show,
4665 static int packet_seq_open(struct inode *inode, struct file *file)
4667 return seq_open_net(inode, file, &packet_seq_ops,
4668 sizeof(struct seq_net_private));
4671 static const struct file_operations packet_seq_fops = {
4672 .owner = THIS_MODULE,
4673 .open = packet_seq_open,
4675 .llseek = seq_lseek,
4676 .release = seq_release_net,
4681 static int __net_init packet_net_init(struct net *net)
4683 mutex_init(&net->packet.sklist_lock);
4684 INIT_HLIST_HEAD(&net->packet.sklist);
4686 if (!proc_create("packet", 0, net->proc_net, &packet_seq_fops))
4692 static void __net_exit packet_net_exit(struct net *net)
4694 remove_proc_entry("packet", net->proc_net);
4697 static struct pernet_operations packet_net_ops = {
4698 .init = packet_net_init,
4699 .exit = packet_net_exit,
4703 static void __exit packet_exit(void)
4705 unregister_netdevice_notifier(&packet_netdev_notifier);
4706 unregister_pernet_subsys(&packet_net_ops);
4707 sock_unregister(PF_PACKET);
4708 proto_unregister(&packet_proto);
4711 static int __init packet_init(void)
4715 rc = proto_register(&packet_proto, 0);
4718 rc = sock_register(&packet_family_ops);
4721 rc = register_pernet_subsys(&packet_net_ops);
4724 rc = register_netdevice_notifier(&packet_netdev_notifier);
4731 unregister_pernet_subsys(&packet_net_ops);
4733 sock_unregister(PF_PACKET);
4735 proto_unregister(&packet_proto);
4740 module_init(packet_init);
4741 module_exit(packet_exit);
4742 MODULE_LICENSE("GPL");
4743 MODULE_ALIAS_NETPROTO(PF_PACKET);