/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		PACKET - implements raw packet sockets.
 *
 * Authors:	Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *
 * Fixes:
 *		Alan Cox	:	verify_area() now used correctly
 *		Alan Cox	:	new skbuff lists, look ma no backlogs!
 *		Alan Cox	:	tidied skbuff lists.
 *		Alan Cox	:	Now uses generic datagram routines I
 *					added. Also fixed the peek/read crash
 *					from all old Linux datagram code.
 *		Alan Cox	:	Uses the improved datagram code.
 *		Alan Cox	:	Added NULL's for socket options.
 *		Alan Cox	:	Re-commented the code.
 *		Alan Cox	:	Use new kernel side addressing
 *		Rob Janssen	:	Correct MTU usage.
 *		Dave Platt	:	Counter leaks caused by incorrect
 *					interrupt locking and some slightly
 *					dubious gcc output. Can you read
 *					compiler: it said _VOLATILE_
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	New buffers. Use sk->mac.raw.
 *		Alan Cox	:	sendmsg/recvmsg support.
 *		Alan Cox	:	Protocol setting support
 *		Alexey Kuznetsov:	Untied from IPv4 stack.
 *		Cyrus Durgin	:	Fixed kerneld for kmod.
 *		Michal Ostrowski:	Module initialization cleanup.
 *		Ulises Alonso	:	Frame number limit removal and
 *					packet_set_ring memory leak.
 *		Eric Biederman	:	Allow for > 8 byte hardware addresses.
 *					The convention is that longer addresses
 *					will simply extend the hardware address
 *					byte arrays at the end of sockaddr_ll
 *					and packet_mreq.
 *		Johann Baudy	:	Added TX RING.
 *		Chetan Loke	:	Implemented TPACKET_V3 block abstraction
 *					layer.
 *					Copyright (C) 2011, <lokec@ccs.neu.edu>
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 */
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/capability.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/wireless.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <linux/uaccess.h>
#include <asm/ioctls.h>
#include <asm/page.h>
#include <asm/cacheflush.h>
#include <asm/io.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/mutex.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/errqueue.h>
#include <linux/net_tstamp.h>
#include <linux/percpu.h>
#ifdef CONFIG_INET
#include <net/inet_common.h>
#endif
#include <linux/bpf.h>
#include <net/compat.h>
/*
   Assumptions:
   - if device has no dev->hard_header routine, it adds and removes ll header
     inside itself. In this case ll header is invisible outside of device,
     but higher levels still should reserve dev->hard_header_len.
     Some devices are clever enough to reallocate the skb when the header
     will not fit into the reserved space (tunnels); others are not (PPP).
   - packet socket receives packets with pulled ll header,
     so that SOCK_RAW should push it back.

On receive:
-----------

Incoming, dev->hard_header!=NULL
   mac_header -> ll header
   data       -> data

Outgoing, dev->hard_header!=NULL
   mac_header -> ll header
   data       -> ll header

Incoming, dev->hard_header==NULL
   mac_header -> UNKNOWN position. It is very likely that it points to the ll
		 header. PPP does this, which is wrong, because it introduces
		 asymmetry between the rx and tx paths.
   data       -> data

Outgoing, dev->hard_header==NULL
   mac_header -> data. ll header is still not built!
   data       -> data

Resume
  If dev->hard_header==NULL we are unlikely to restore a sensible ll header.


On transmit:
------------

dev->hard_header != NULL
   mac_header -> ll header
   data       -> ll header

dev->hard_header == NULL (ll header is added by device, we cannot control it)
   mac_header -> data
   data       -> data

   We should set nh.raw on output to the correct position;
   the packet classifier depends on it.
 */
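/* Illustrative user-space sketch (not part of the original file): the
 * ll-header rules above are what a packet socket user observes.  With
 * SOCK_RAW the link-layer header is part of the buffer; with SOCK_DGRAM
 * it is stripped and reported via sockaddr_ll instead.  Assumes
 * <sys/socket.h>, <linux/if_packet.h>, <linux/if_ether.h> and
 * <arpa/inet.h>; error handling elided.
 *
 *	int raw   = socket(AF_PACKET, SOCK_RAW,   htons(ETH_P_ALL));
 *	int dgram = socket(AF_PACKET, SOCK_DGRAM, htons(ETH_P_ALL));
 *	char buf[2048];
 *	struct sockaddr_ll from;
 *	socklen_t fromlen = sizeof(from);
 *
 *	recv(raw, buf, sizeof(buf), 0);            // starts at Ethernet header
 *	recvfrom(dgram, buf, sizeof(buf), 0,       // starts at network header
 *		 (struct sockaddr *)&from, &fromlen);
 */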
/* Private packet socket structures. */

/* identical to struct packet_mreq except it has
 * a longer address field.
 */
struct packet_mreq_max {
	int		mr_ifindex;
	unsigned short	mr_type;
	unsigned short	mr_alen;
	unsigned char	mr_address[MAX_ADDR_LEN];
};

union tpacket_uhdr {
	struct tpacket_hdr  *h1;
	struct tpacket2_hdr *h2;
	struct tpacket3_hdr *h3;
	void *raw;
};
static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
		int closing, int tx_ring);

#define V3_ALIGNMENT	(8)

#define BLK_HDR_LEN	(ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))

#define BLK_PLUS_PRIV(sz_of_priv) \
	(BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))

#define BLOCK_STATUS(x)		((x)->hdr.bh1.block_status)
#define BLOCK_NUM_PKTS(x)	((x)->hdr.bh1.num_pkts)
#define BLOCK_O2FP(x)		((x)->hdr.bh1.offset_to_first_pkt)
#define BLOCK_LEN(x)		((x)->hdr.bh1.blk_len)
#define BLOCK_SNUM(x)		((x)->hdr.bh1.seq_num)
#define BLOCK_O2PRIV(x)		((x)->offset_to_priv)
#define BLOCK_PRIV(x)		((void *)((char *)(x) + BLOCK_O2PRIV(x)))
struct packet_sock;
static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
		       struct packet_type *pt, struct net_device *orig_dev);

static void *packet_previous_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status);
static void packet_increment_head(struct packet_ring_buffer *buff);
static int prb_curr_blk_in_use(struct tpacket_block_desc *);
static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
			struct packet_sock *);
static void prb_retire_current_block(struct tpacket_kbdq_core *,
		struct packet_sock *, unsigned int status);
static int prb_queue_frozen(struct tpacket_kbdq_core *);
static void prb_open_block(struct tpacket_kbdq_core *,
		struct tpacket_block_desc *);
static void prb_retire_rx_blk_timer_expired(struct timer_list *);
static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
static void prb_clear_rxhash(struct tpacket_kbdq_core *,
		struct tpacket3_hdr *);
static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
		struct tpacket3_hdr *);
static void packet_flush_mclist(struct sock *sk);
static void packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb);
struct packet_skb_cb {
	union {
		struct sockaddr_pkt pkt;
		union {
			/* Trick: alias skb original length with
			 * ll.sll_family and ll.protocol in order
			 * to save room.
			 */
			unsigned int origlen;
			struct sockaddr_ll ll;
		};
	} sa;
};

#define vio_le() virtio_legacy_is_little_endian()

#define PACKET_SKB_CB(__skb)	((struct packet_skb_cb *)((__skb)->cb))
#define GET_PBDQC_FROM_RB(x)	((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
#define GET_PBLOCK_DESC(x, bid)	\
	((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
#define GET_CURR_PBLOCK_DESC_FROM_CORE(x)	\
	((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
#define GET_NEXT_PRB_BLK_NUM(x) \
	(((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
	((x)->kactive_blk_num+1) : 0)
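/* Illustrative user-space sketch (not part of the original file): the
 * mirror image of the block-descriptor macros above, walking one
 * TPACKET_V3 block via offset_to_first_pkt and tp_next_offset.
 * handle_frame() is a hypothetical consumer; assumes <stdint.h> and
 * <linux/if_packet.h>.
 *
 *	struct tpacket_block_desc *bd = block;
 *	struct tpacket3_hdr *ppd;
 *	uint32_t i;
 *
 *	ppd = (struct tpacket3_hdr *)((uint8_t *)bd +
 *				      bd->hdr.bh1.offset_to_first_pkt);
 *	for (i = 0; i < bd->hdr.bh1.num_pkts; i++) {
 *		handle_frame(ppd);
 *		ppd = (struct tpacket3_hdr *)((uint8_t *)ppd +
 *					      ppd->tp_next_offset);
 *	}
 */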
static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
static void __fanout_link(struct sock *sk, struct packet_sock *po);

static int packet_direct_xmit(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	struct sk_buff *orig_skb = skb;
	struct netdev_queue *txq;
	int ret = NETDEV_TX_BUSY;

	if (unlikely(!netif_running(dev) ||
		     !netif_carrier_ok(dev)))
		goto drop;

	skb = validate_xmit_skb_list(skb, dev);
	if (skb != orig_skb)
		goto drop;

	packet_pick_tx_queue(dev, skb);
	txq = skb_get_tx_queue(dev, skb);

	local_bh_disable();

	HARD_TX_LOCK(dev, txq, smp_processor_id());
	if (!netif_xmit_frozen_or_drv_stopped(txq))
		ret = netdev_start_xmit(skb, dev, txq, false);
	HARD_TX_UNLOCK(dev, txq);

	local_bh_enable();

	if (!dev_xmit_complete(ret))
		kfree_skb(skb);

	return ret;
drop:
	atomic_long_inc(&dev->tx_dropped);
	kfree_skb_list(skb);
	return NET_XMIT_DROP;
}
static struct net_device *packet_cached_dev_get(struct packet_sock *po)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = rcu_dereference(po->cached_dev);
	if (likely(dev))
		dev_hold(dev);
	rcu_read_unlock();

	return dev;
}

static void packet_cached_dev_assign(struct packet_sock *po,
				     struct net_device *dev)
{
	rcu_assign_pointer(po->cached_dev, dev);
}

static void packet_cached_dev_reset(struct packet_sock *po)
{
	RCU_INIT_POINTER(po->cached_dev, NULL);
}

static bool packet_use_direct_xmit(const struct packet_sock *po)
{
	return po->xmit == packet_direct_xmit;
}
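/* Illustrative sketch (not part of the original file): po->xmit points at
 * packet_direct_xmit() once the user enables qdisc bypass, e.g.:
 *
 *	int one = 1;
 *
 *	setsockopt(fd, SOL_PACKET, PACKET_QDISC_BYPASS, &one, sizeof(one));
 *
 * Frames sent on such a socket skip the traffic-control layer and go
 * straight to the driver queue picked by packet_pick_tx_queue() below.
 */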
static u16 __packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
{
	return (u16) raw_smp_processor_id() % dev->real_num_tx_queues;
}

static void packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	u16 queue_index;

	if (ops->ndo_select_queue) {
		queue_index = ops->ndo_select_queue(dev, skb, NULL,
						    __packet_pick_tx_queue);
		queue_index = netdev_cap_txqueue(dev, queue_index);
	} else {
		queue_index = __packet_pick_tx_queue(dev, skb);
	}

	skb_set_queue_mapping(skb, queue_index);
}
/* register_prot_hook must be invoked with the po->bind_lock held,
 * or from a context in which asynchronous accesses to the packet
 * socket are not possible (packet_create()).
 */
static void register_prot_hook(struct sock *sk)
{
	struct packet_sock *po = pkt_sk(sk);

	if (!po->running) {
		if (po->fanout)
			__fanout_link(sk, po);
		else
			dev_add_pack(&po->prot_hook);

		sock_hold(sk);
		po->running = 1;
	}
}

/* {,__}unregister_prot_hook() must be invoked with the po->bind_lock
 * held.  If the sync parameter is true, we will temporarily drop
 * the po->bind_lock and do a synchronize_net to make sure no
 * asynchronous packet processing paths still refer to the elements
 * of po->prot_hook.  If the sync parameter is false, it is the
 * caller's responsibility to take care of this.
 */
static void __unregister_prot_hook(struct sock *sk, bool sync)
{
	struct packet_sock *po = pkt_sk(sk);

	po->running = 0;

	if (po->fanout)
		__fanout_unlink(sk, po);
	else
		__dev_remove_pack(&po->prot_hook);

	__sock_put(sk);

	if (sync) {
		spin_unlock(&po->bind_lock);
		synchronize_net();
		spin_lock(&po->bind_lock);
	}
}

static void unregister_prot_hook(struct sock *sk, bool sync)
{
	struct packet_sock *po = pkt_sk(sk);

	if (po->running)
		__unregister_prot_hook(sk, sync);
}
static inline struct page * __pure pgv_to_page(void *addr)
{
	if (is_vmalloc_addr(addr))
		return vmalloc_to_page(addr);
	return virt_to_page(addr);
}

static void __packet_set_status(struct packet_sock *po, void *frame, int status)
{
	union tpacket_uhdr h;

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		h.h1->tp_status = status;
		flush_dcache_page(pgv_to_page(&h.h1->tp_status));
		break;
	case TPACKET_V2:
		h.h2->tp_status = status;
		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
		break;
	case TPACKET_V3:
		h.h3->tp_status = status;
		flush_dcache_page(pgv_to_page(&h.h3->tp_status));
		break;
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
	}

	smp_wmb();
}

static int __packet_get_status(struct packet_sock *po, void *frame)
{
	union tpacket_uhdr h;

	smp_rmb();

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		flush_dcache_page(pgv_to_page(&h.h1->tp_status));
		return h.h1->tp_status;
	case TPACKET_V2:
		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
		return h.h2->tp_status;
	case TPACKET_V3:
		flush_dcache_page(pgv_to_page(&h.h3->tp_status));
		return h.h3->tp_status;
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
		return 0;
	}
}
static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts,
				   unsigned int flags)
{
	struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);

	if (shhwtstamps &&
	    (flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
	    ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts))
		return TP_STATUS_TS_RAW_HARDWARE;

	if (ktime_to_timespec_cond(skb->tstamp, ts))
		return TP_STATUS_TS_SOFTWARE;

	return 0;
}

static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
				    struct sk_buff *skb)
{
	union tpacket_uhdr h;
	struct timespec ts;
	__u32 ts_status;

	if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
		return 0;

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		h.h1->tp_sec = ts.tv_sec;
		h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
		break;
	case TPACKET_V2:
		h.h2->tp_sec = ts.tv_sec;
		h.h2->tp_nsec = ts.tv_nsec;
		break;
	case TPACKET_V3:
		h.h3->tp_sec = ts.tv_sec;
		h.h3->tp_nsec = ts.tv_nsec;
		break;
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
	}

	/* one flush is safe, as both fields always lie on the same cacheline */
	flush_dcache_page(pgv_to_page(&h.h1->tp_sec));
	smp_wmb();

	return ts_status;
}
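/* Illustrative sketch (not part of the original file): the po->tp_tstamp
 * flags used above are configured from user space via PACKET_TIMESTAMP,
 * e.g. to request raw hardware timestamps with a software fallback:
 *
 *	int req = SOF_TIMESTAMPING_RAW_HARDWARE;
 *
 *	setsockopt(fd, SOL_PACKET, PACKET_TIMESTAMP, &req, sizeof(req));
 *
 * The source actually used is reported in tp_status as
 * TP_STATUS_TS_RAW_HARDWARE or TP_STATUS_TS_SOFTWARE.
 */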
static void *packet_lookup_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		unsigned int position,
		int status)
{
	unsigned int pg_vec_pos, frame_offset;
	union tpacket_uhdr h;

	pg_vec_pos = position / rb->frames_per_block;
	frame_offset = position % rb->frames_per_block;

	h.raw = rb->pg_vec[pg_vec_pos].buffer +
		(frame_offset * rb->frame_size);

	if (status != __packet_get_status(po, h.raw))
		return NULL;

	return h.raw;
}

static void *packet_current_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status)
{
	return packet_lookup_frame(po, rb, rb->head, status);
}
static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
{
	del_timer_sync(&pkc->retire_blk_timer);
}

static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
		struct sk_buff_head *rb_queue)
{
	struct tpacket_kbdq_core *pkc;

	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);

	spin_lock_bh(&rb_queue->lock);
	pkc->delete_blk_timer = 1;
	spin_unlock_bh(&rb_queue->lock);

	prb_del_retire_blk_timer(pkc);
}

static void prb_setup_retire_blk_timer(struct packet_sock *po)
{
	struct tpacket_kbdq_core *pkc;

	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
	timer_setup(&pkc->retire_blk_timer, prb_retire_rx_blk_timer_expired,
		    0);
	pkc->retire_blk_timer.expires = jiffies;
}

static int prb_calc_retire_blk_tmo(struct packet_sock *po,
				int blk_size_in_bytes)
{
	struct net_device *dev;
	unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
	struct ethtool_link_ksettings ecmd;
	int err;

	rtnl_lock();
	dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
	if (unlikely(!dev)) {
		rtnl_unlock();
		return DEFAULT_PRB_RETIRE_TOV;
	}
	err = __ethtool_get_link_ksettings(dev, &ecmd);
	rtnl_unlock();
	if (!err) {
		/*
		 * If the link speed is so slow you don't really
		 * need to worry about perf anyways
		 */
		if (ecmd.base.speed < SPEED_1000 ||
		    ecmd.base.speed == SPEED_UNKNOWN) {
			return DEFAULT_PRB_RETIRE_TOV;
		} else {
			msec = 1;
			div = ecmd.base.speed / 1000;
		}
	}

	mbits = (blk_size_in_bytes * 8) / (1024 * 1024);

	if (div)
		mbits /= div;

	tmo = mbits * msec;

	if (div)
		return tmo + 1;
	return tmo;
}
static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
			union tpacket_req_u *req_u)
{
	p1->feature_req_word = req_u->req3.tp_feature_req_word;
}

static void init_prb_bdqc(struct packet_sock *po,
			struct packet_ring_buffer *rb,
			struct pgv *pg_vec,
			union tpacket_req_u *req_u)
{
	struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb);
	struct tpacket_block_desc *pbd;

	memset(p1, 0x0, sizeof(*p1));

	p1->knxt_seq_num = 1;
	p1->pkbdq = pg_vec;
	pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
	p1->pkblk_start	= pg_vec[0].buffer;
	p1->kblk_size = req_u->req3.tp_block_size;
	p1->knum_blocks	= req_u->req3.tp_block_nr;
	p1->hdrlen = po->tp_hdrlen;
	p1->version = po->tp_version;
	p1->last_kactive_blk_num = 0;
	po->stats.stats3.tp_freeze_q_cnt = 0;
	if (req_u->req3.tp_retire_blk_tov)
		p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
	else
		p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
						req_u->req3.tp_block_size);
	p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
	p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;

	p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
	prb_init_ft_ops(p1, req_u);
	prb_setup_retire_blk_timer(po);
	prb_open_block(p1, pbd);
}
/* Do NOT update the last_blk_num first.
 * Assumes sk_buff_head lock is held.
 */
static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
{
	mod_timer(&pkc->retire_blk_timer,
			jiffies + pkc->tov_in_jiffies);
	pkc->last_kactive_blk_num = pkc->kactive_blk_num;
}
/*
 * Timer logic:
 * 1) We refresh the timer only when we open a block.
 *    By doing this we don't waste cycles refreshing the timer
 *    on packet-by-packet basis.
 *
 * With a 1MB block-size, on a 1Gbps line, it will take
 * i) ~8 ms to fill a block + ii) memcpy etc.
 * In this cut we are not accounting for the memcpy time.
 *
 * So, if the user sets the 'tmo' to 10ms then the timer
 * will never fire while the block is still getting filled
 * (which is what we want). However, the user could choose
 * to close a block early and that's fine.
 *
 * But when the timer does fire, we check whether or not to refresh it.
 * Since the tmo granularity is in msecs, it is not too expensive
 * to refresh the timer, let's say every '8' msecs.
 * Either the user can set the 'tmo' or we can derive it based on
 * a) line-speed and b) block-size.
 * prb_calc_retire_blk_tmo() calculates the tmo.
 */
static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
{
	struct packet_sock *po =
		from_timer(po, t, rx_ring.prb_bdqc.retire_blk_timer);
	struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
	unsigned int frozen;
	struct tpacket_block_desc *pbd;

	spin_lock(&po->sk.sk_receive_queue.lock);

	frozen = prb_queue_frozen(pkc);
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	if (unlikely(pkc->delete_blk_timer))
		goto out;

	/* We only need to plug the race when the block is partially filled.
	 * tpacket_rcv:
	 *		lock(); increment BLOCK_NUM_PKTS; unlock()
	 *		copy_bits() is in progress ...
	 *		timer fires on other cpu:
	 *		we can't retire the current block because copy_bits
	 *		is in progress.
	 */
	if (BLOCK_NUM_PKTS(pbd)) {
		while (atomic_read(&pkc->blk_fill_in_prog)) {
			/* Waiting for skb_copy_bits to finish... */
			cpu_relax();
		}
	}

	if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
		if (!frozen) {
			if (!BLOCK_NUM_PKTS(pbd)) {
				/* An empty block. Just refresh the timer. */
				goto refresh_timer;
			}
			prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
			if (!prb_dispatch_next_block(pkc, po))
				goto refresh_timer;
			else
				goto out;
		} else {
			/* Case 1. Queue was frozen because user-space was
			 *	   lagging behind.
			 */
			if (prb_curr_blk_in_use(pbd)) {
				/*
				 * Ok, user-space is still behind.
				 * So just refresh the timer.
				 */
				goto refresh_timer;
			} else {
				/* Case 2. queue was frozen, user-space caught
				 * up, now the link went idle && the timer
				 * fired. We don't have a block to close. So we
				 * open this block and restart the timer.
				 * Opening a block thaws the queue and restarts
				 * the timer; thawing/timer-refresh is a side
				 * effect.
				 */
				prb_open_block(pkc, pbd);
				goto out;
			}
		}
	}

refresh_timer:
	_prb_refresh_rx_retire_blk_timer(pkc);

out:
	spin_unlock(&po->sk.sk_receive_queue.lock);
}
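/* Illustrative user-space sketch (not part of the original file): the
 * 'tmo' discussed above is either derived via prb_calc_retire_blk_tmo()
 * or supplied in tp_retire_blk_tov when the TPACKET_V3 ring is created.
 * Sizes are arbitrary example values; error handling elided.
 *
 *	struct tpacket_req3 req = {
 *		.tp_block_size	   = 1 << 20,
 *		.tp_block_nr	   = 8,
 *		.tp_frame_size	   = 2048,
 *		.tp_frame_nr	   = ((1 << 20) / 2048) * 8,
 *		.tp_retire_blk_tov = 10,	// retire a block after 10 ms
 *	};
 *	int version = TPACKET_V3;
 *
 *	setsockopt(fd, SOL_PACKET, PACKET_VERSION, &version, sizeof(version));
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 */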
static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
		struct tpacket_block_desc *pbd1, __u32 status)
{
	/* Flush everything minus the block header */

#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
	u8 *start, *end;

	start = (u8 *)pbd1;

	/* Skip the block header(we know header WILL fit in 4K) */
	start += PAGE_SIZE;

	end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
	for (; start < end; start += PAGE_SIZE)
		flush_dcache_page(pgv_to_page(start));

	smp_wmb();
#endif

	/* Now update the block status. */

	BLOCK_STATUS(pbd1) = status;

	/* Flush the block header */

#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
	start = (u8 *)pbd1;
	flush_dcache_page(pgv_to_page(start));

	smp_wmb();
#endif
}
/*
 * Side effects of prb_close_block():
 *
 * 1) Mark the block as closed.
 * 2) Increment active_blk_num
 *
 * Note:We DONT refresh the timer on purpose.
 *	Because almost always the next block will be opened.
 */
static void prb_close_block(struct tpacket_kbdq_core *pkc1,
		struct tpacket_block_desc *pbd1,
		struct packet_sock *po, unsigned int stat)
{
	__u32 status = TP_STATUS_USER | stat;

	struct tpacket3_hdr *last_pkt;
	struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
	struct sock *sk = &po->sk;

	if (po->stats.stats3.tp_drops)
		status |= TP_STATUS_LOSING;

	last_pkt = (struct tpacket3_hdr *)pkc1->prev;
	last_pkt->tp_next_offset = 0;

	/* Get the ts of the last pkt */
	if (BLOCK_NUM_PKTS(pbd1)) {
		h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
		h1->ts_last_pkt.ts_nsec	= last_pkt->tp_nsec;
	} else {
		/* Ok, we tmo'd - so get the current time.
		 *
		 * It shouldn't really happen as we don't close empty
		 * blocks. See prb_retire_rx_blk_timer_expired().
		 */
		struct timespec ts;
		getnstimeofday(&ts);
		h1->ts_last_pkt.ts_sec = ts.tv_sec;
		h1->ts_last_pkt.ts_nsec = ts.tv_nsec;
	}

	smp_wmb();

	/* Flush the block */
	prb_flush_block(pkc1, pbd1, status);

	sk->sk_data_ready(sk);

	pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
}
static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
{
	pkc->reset_pending_on_curr_blk = 0;
}

/*
 * Side effect of opening a block:
 *
 * 1) prb_queue is thawed.
 * 2) retire_blk_timer is refreshed.
 *
 */
static void prb_open_block(struct tpacket_kbdq_core *pkc1,
	struct tpacket_block_desc *pbd1)
{
	struct timespec ts;
	struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;

	smp_rmb();

	/* We could have just memset this but we will lose the
	 * flexibility of making the priv area sticky
	 */

	BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
	BLOCK_NUM_PKTS(pbd1) = 0;
	BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);

	getnstimeofday(&ts);

	h1->ts_first_pkt.ts_sec = ts.tv_sec;
	h1->ts_first_pkt.ts_nsec = ts.tv_nsec;

	pkc1->pkblk_start = (char *)pbd1;
	pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);

	BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
	BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;

	pbd1->version = pkc1->version;
	pkc1->prev = pkc1->nxt_offset;
	pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;

	prb_thaw_queue(pkc1);
	_prb_refresh_rx_retire_blk_timer(pkc1);

	smp_wmb();
}
/*
 * Queue freeze logic:
 * 1) Assume tp_block_nr = 8 blocks.
 * 2) At time 't0', user opens Rx ring.
 * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
 * 4) user-space is either sleeping or processing block '0'.
 * 5) tpacket_rcv is currently filling block '7', since there is no space left,
 *    it will close block-7, loop around and try to fill block '0'.
 *    call-flow:
 *    __packet_lookup_frame_in_block
 *      prb_retire_current_block()
 *      prb_dispatch_next_block()
 *        |->(BLOCK_STATUS == USER) evaluates to true
 *    5.1) Since block-0 is currently in-use, we just freeze the queue.
 * 6) Now there are two cases:
 *    6.1) Link goes idle right after the queue is frozen.
 *         But remember, the last open_block() refreshed the timer.
 *         When this timer expires, it will refresh itself so that we can
 *         re-open block-0 in near future.
 *    6.2) Link is busy and keeps on receiving packets. This is a simple
 *         case and __packet_lookup_frame_in_block will check if block-0
 *         is free and can now be re-used.
 */
static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
			struct packet_sock *po)
{
	pkc->reset_pending_on_curr_blk = 1;
	po->stats.stats3.tp_freeze_q_cnt++;
}
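/* Illustrative user-space sketch (not part of the original file): the
 * freeze/thaw cycle above pairs with the canonical consumer loop, which
 * returns each block by rewriting its status.  next_block() and
 * walk_block() are hypothetical helpers; error handling elided.
 *
 *	for (;;) {
 *		struct tpacket_block_desc *bd = next_block(ring);
 *
 *		if (!(bd->hdr.bh1.block_status & TP_STATUS_USER)) {
 *			poll(&pfd, 1, -1);	// wait for the kernel
 *			continue;
 *		}
 *		walk_block(bd);
 *		bd->hdr.bh1.block_status = TP_STATUS_KERNEL;	// release it
 *	}
 */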
#define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))

/*
 * If the next block is free then we will dispatch it
 * and return a good offset.
 * Else, we will freeze the queue.
 * So, caller must check the return value.
 */
static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
		struct packet_sock *po)
{
	struct tpacket_block_desc *pbd;

	smp_rmb();

	/* 1. Get current block num */
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	/* 2. If this block is currently in_use then freeze the queue */
	if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
		prb_freeze_queue(pkc, po);
		return NULL;
	}

	/*
	 * 3.
	 * open this block and return the offset where the first packet
	 * needs to get stored.
	 */
	prb_open_block(pkc, pbd);
	return (void *)pkc->nxt_offset;
}
static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
		struct packet_sock *po, unsigned int status)
{
	struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	/* retire/close the current block */
	if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
		/*
		 * Plug the case where copy_bits() is in progress on
		 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
		 * have space to copy the pkt in the current block and
		 * called prb_retire_current_block()
		 *
		 * We don't need to worry about the TMO case because
		 * the timer-handler already handled this case.
		 */
		if (!(status & TP_STATUS_BLK_TMO)) {
			while (atomic_read(&pkc->blk_fill_in_prog)) {
				/* Waiting for skb_copy_bits to finish... */
				cpu_relax();
			}
		}
		prb_close_block(pkc, pbd, po, status);
		return;
	}
}

static int prb_curr_blk_in_use(struct tpacket_block_desc *pbd)
{
	return TP_STATUS_USER & BLOCK_STATUS(pbd);
}

static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
{
	return pkc->reset_pending_on_curr_blk;
}

static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
{
	struct tpacket_kbdq_core *pkc  = GET_PBDQC_FROM_RB(rb);
	atomic_dec(&pkc->blk_fill_in_prog);
}
static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	ppd->hv1.tp_rxhash = skb_get_hash(pkc->skb);
}

static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	ppd->hv1.tp_rxhash = 0;
}

static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	if (skb_vlan_tag_present(pkc->skb)) {
		ppd->hv1.tp_vlan_tci = skb_vlan_tag_get(pkc->skb);
		ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto);
		ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
	} else {
		ppd->hv1.tp_vlan_tci = 0;
		ppd->hv1.tp_vlan_tpid = 0;
		ppd->tp_status = TP_STATUS_AVAILABLE;
	}
}

static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	ppd->hv1.tp_padding = 0;
	prb_fill_vlan_info(pkc, ppd);

	if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
		prb_fill_rxhash(pkc, ppd);
	else
		prb_clear_rxhash(pkc, ppd);
}

static void prb_fill_curr_block(char *curr,
				struct tpacket_kbdq_core *pkc,
				struct tpacket_block_desc *pbd,
				unsigned int len)
{
	struct tpacket3_hdr *ppd;

	ppd  = (struct tpacket3_hdr *)curr;
	ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
	pkc->prev = curr;
	pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
	BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
	BLOCK_NUM_PKTS(pbd) += 1;
	atomic_inc(&pkc->blk_fill_in_prog);
	prb_run_all_ft_ops(pkc, ppd);
}
/* Assumes caller has the sk->rx_queue.lock */
static void *__packet_lookup_frame_in_block(struct packet_sock *po,
					    struct sk_buff *skb,
					    int status,
					    unsigned int len)
{
	struct tpacket_kbdq_core *pkc;
	struct tpacket_block_desc *pbd;
	char *curr, *end;

	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	/* Queue is frozen when user space is lagging behind */
	if (prb_queue_frozen(pkc)) {
		/*
		 * Check if that last block which caused the queue to freeze,
		 * is still in_use by user-space.
		 */
		if (prb_curr_blk_in_use(pbd)) {
			/* Can't record this packet */
			return NULL;
		} else {
			/*
			 * Ok, the block was released by user-space.
			 * Now let's open that block.
			 * opening a block also thaws the queue.
			 * Thawing is a side effect.
			 */
			prb_open_block(pkc, pbd);
		}
	}

	smp_mb();
	curr = pkc->nxt_offset;
	pkc->skb = skb;
	end = (char *)pbd + pkc->kblk_size;

	/* first try the current block */
	if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
		prb_fill_curr_block(curr, pkc, pbd, len);
		return (void *)curr;
	}

	/* Ok, close the current block */
	prb_retire_current_block(pkc, po, 0);

	/* Now, try to dispatch the next block */
	curr = (char *)prb_dispatch_next_block(pkc, po);
	if (curr) {
		pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
		prb_fill_curr_block(curr, pkc, pbd, len);
		return (void *)curr;
	}

	/*
	 * No free blocks are available. user_space hasn't caught up yet.
	 * Queue was just frozen and now this packet will get dropped.
	 */
	return NULL;
}
static void *packet_current_rx_frame(struct packet_sock *po,
					    struct sk_buff *skb,
					    int status, unsigned int len)
{
	char *curr = NULL;
	switch (po->tp_version) {
	case TPACKET_V1:
	case TPACKET_V2:
		curr = packet_lookup_frame(po, &po->rx_ring,
					po->rx_ring.head, status);
		return curr;
	case TPACKET_V3:
		return __packet_lookup_frame_in_block(po, skb, status, len);
	default:
		WARN(1, "TPACKET version not supported\n");
		BUG();
		return NULL;
	}
}

static void *prb_lookup_block(struct packet_sock *po,
				     struct packet_ring_buffer *rb,
				     unsigned int idx,
				     int status)
{
	struct tpacket_kbdq_core *pkc  = GET_PBDQC_FROM_RB(rb);
	struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);

	if (status != BLOCK_STATUS(pbd))
		return NULL;
	return pbd;
}

static int prb_previous_blk_num(struct packet_ring_buffer *rb)
{
	unsigned int prev;
	if (rb->prb_bdqc.kactive_blk_num)
		prev = rb->prb_bdqc.kactive_blk_num-1;
	else
		prev = rb->prb_bdqc.knum_blocks-1;
	return prev;
}

/* Assumes caller has held the rx_queue.lock */
static void *__prb_previous_block(struct packet_sock *po,
					 struct packet_ring_buffer *rb,
					 int status)
{
	unsigned int previous = prb_previous_blk_num(rb);
	return prb_lookup_block(po, rb, previous, status);
}

static void *packet_previous_rx_frame(struct packet_sock *po,
					     struct packet_ring_buffer *rb,
					     int status)
{
	if (po->tp_version <= TPACKET_V2)
		return packet_previous_frame(po, rb, status);

	return __prb_previous_block(po, rb, status);
}

static void packet_increment_rx_head(struct packet_sock *po,
					    struct packet_ring_buffer *rb)
{
	switch (po->tp_version) {
	case TPACKET_V1:
	case TPACKET_V2:
		return packet_increment_head(rb);
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
		return;
	}
}

static void *packet_previous_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status)
{
	unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
	return packet_lookup_frame(po, rb, previous, status);
}

static void packet_increment_head(struct packet_ring_buffer *buff)
{
	buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
}
static void packet_inc_pending(struct packet_ring_buffer *rb)
{
	this_cpu_inc(*rb->pending_refcnt);
}

static void packet_dec_pending(struct packet_ring_buffer *rb)
{
	this_cpu_dec(*rb->pending_refcnt);
}

static unsigned int packet_read_pending(const struct packet_ring_buffer *rb)
{
	unsigned int refcnt = 0;
	int cpu;

	/* We don't use pending refcount in rx_ring. */
	if (rb->pending_refcnt == NULL)
		return 0;

	for_each_possible_cpu(cpu)
		refcnt += *per_cpu_ptr(rb->pending_refcnt, cpu);

	return refcnt;
}
static int packet_alloc_pending(struct packet_sock *po)
{
	po->rx_ring.pending_refcnt = NULL;

	po->tx_ring.pending_refcnt = alloc_percpu(unsigned int);
	if (unlikely(po->tx_ring.pending_refcnt == NULL))
		return -ENOBUFS;

	return 0;
}

static void packet_free_pending(struct packet_sock *po)
{
	free_percpu(po->tx_ring.pending_refcnt);
}

#define ROOM_POW_OFF	2
#define ROOM_NONE	0x0
#define ROOM_LOW	0x1
#define ROOM_NORMAL	0x2

static bool __tpacket_has_room(struct packet_sock *po, int pow_off)
{
	int idx, len;

	len = po->rx_ring.frame_max + 1;
	idx = po->rx_ring.head;
	if (pow_off)
		idx += len >> pow_off;
	if (idx >= len)
		idx -= len;
	return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
}

static bool __tpacket_v3_has_room(struct packet_sock *po, int pow_off)
{
	int idx, len;

	len = po->rx_ring.prb_bdqc.knum_blocks;
	idx = po->rx_ring.prb_bdqc.kactive_blk_num;
	if (pow_off)
		idx += len >> pow_off;
	if (idx >= len)
		idx -= len;
	return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
}
static int __packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
{
	struct sock *sk = &po->sk;
	int ret = ROOM_NONE;

	if (po->prot_hook.func != tpacket_rcv) {
		int avail = sk->sk_rcvbuf - atomic_read(&sk->sk_rmem_alloc)
					- (skb ? skb->truesize : 0);
		if (avail > (sk->sk_rcvbuf >> ROOM_POW_OFF))
			return ROOM_NORMAL;
		else if (avail > 0)
			return ROOM_LOW;
		else
			return ROOM_NONE;
	}

	if (po->tp_version == TPACKET_V3) {
		if (__tpacket_v3_has_room(po, ROOM_POW_OFF))
			ret = ROOM_NORMAL;
		else if (__tpacket_v3_has_room(po, 0))
			ret = ROOM_LOW;
	} else {
		if (__tpacket_has_room(po, ROOM_POW_OFF))
			ret = ROOM_NORMAL;
		else if (__tpacket_has_room(po, 0))
			ret = ROOM_LOW;
	}

	return ret;
}

static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
{
	int ret;
	bool has_room;

	spin_lock_bh(&po->sk.sk_receive_queue.lock);
	ret = __packet_rcv_has_room(po, skb);
	has_room = ret == ROOM_NORMAL;
	if (po->pressure == has_room)
		po->pressure = !has_room;
	spin_unlock_bh(&po->sk.sk_receive_queue.lock);

	return ret;
}
static void packet_sock_destruct(struct sock *sk)
{
	skb_queue_purge(&sk->sk_error_queue);

	WARN_ON(atomic_read(&sk->sk_rmem_alloc));
	WARN_ON(refcount_read(&sk->sk_wmem_alloc));

	if (!sock_flag(sk, SOCK_DEAD)) {
		pr_err("Attempt to release alive packet socket: %p\n", sk);
		return;
	}

	sk_refcnt_debug_dec(sk);
}

static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb)
{
	u32 rxhash;
	int i, count = 0;

	rxhash = skb_get_hash(skb);
	for (i = 0; i < ROLLOVER_HLEN; i++)
		if (po->rollover->history[i] == rxhash)
			count++;

	po->rollover->history[prandom_u32() % ROLLOVER_HLEN] = rxhash;
	return count > (ROLLOVER_HLEN >> 1);
}
static unsigned int fanout_demux_hash(struct packet_fanout *f,
				      struct sk_buff *skb,
				      unsigned int num)
{
	return reciprocal_scale(__skb_get_hash_symmetric(skb), num);
}

static unsigned int fanout_demux_lb(struct packet_fanout *f,
				    struct sk_buff *skb,
				    unsigned int num)
{
	unsigned int val = atomic_inc_return(&f->rr_cur);

	return val % num;
}

static unsigned int fanout_demux_cpu(struct packet_fanout *f,
				     struct sk_buff *skb,
				     unsigned int num)
{
	return smp_processor_id() % num;
}

static unsigned int fanout_demux_rnd(struct packet_fanout *f,
				     struct sk_buff *skb,
				     unsigned int num)
{
	return prandom_u32_max(num);
}

static unsigned int fanout_demux_rollover(struct packet_fanout *f,
					  struct sk_buff *skb,
					  unsigned int idx, bool try_self,
					  unsigned int num)
{
	struct packet_sock *po, *po_next, *po_skip = NULL;
	unsigned int i, j, room = ROOM_NONE;

	po = pkt_sk(f->arr[idx]);

	if (try_self) {
		room = packet_rcv_has_room(po, skb);
		if (room == ROOM_NORMAL ||
		    (room == ROOM_LOW && !fanout_flow_is_huge(po, skb)))
			return idx;
		po_skip = po;
	}

	i = j = min_t(int, po->rollover->sock, num - 1);
	do {
		po_next = pkt_sk(f->arr[i]);
		if (po_next != po_skip && !po_next->pressure &&
		    packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) {
			if (i != j)
				po->rollover->sock = i;
			atomic_long_inc(&po->rollover->num);
			if (room == ROOM_LOW)
				atomic_long_inc(&po->rollover->num_huge);
			return i;
		}

		if (++i == num)
			i = 0;
	} while (i != j);

	atomic_long_inc(&po->rollover->num_failed);
	return idx;
}

static unsigned int fanout_demux_qm(struct packet_fanout *f,
				    struct sk_buff *skb,
				    unsigned int num)
{
	return skb_get_queue_mapping(skb) % num;
}

static unsigned int fanout_demux_bpf(struct packet_fanout *f,
				     struct sk_buff *skb,
				     unsigned int num)
{
	struct bpf_prog *prog;
	unsigned int ret = 0;

	rcu_read_lock();
	prog = rcu_dereference(f->bpf_prog);
	if (prog)
		ret = bpf_prog_run_clear_cb(prog, skb) % num;
	rcu_read_unlock();

	return ret;
}

static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
{
	return f->flags & (flag >> 8);
}
static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
			     struct packet_type *pt, struct net_device *orig_dev)
{
	struct packet_fanout *f = pt->af_packet_priv;
	unsigned int num = READ_ONCE(f->num_members);
	struct net *net = read_pnet(&f->net);
	struct packet_sock *po;
	unsigned int idx;

	if (!net_eq(dev_net(dev), net) || !num) {
		kfree_skb(skb);
		return 0;
	}

	if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) {
		skb = ip_check_defrag(net, skb, IP_DEFRAG_AF_PACKET);
		if (!skb)
			return 0;
	}
	switch (f->type) {
	case PACKET_FANOUT_HASH:
	default:
		idx = fanout_demux_hash(f, skb, num);
		break;
	case PACKET_FANOUT_LB:
		idx = fanout_demux_lb(f, skb, num);
		break;
	case PACKET_FANOUT_CPU:
		idx = fanout_demux_cpu(f, skb, num);
		break;
	case PACKET_FANOUT_RND:
		idx = fanout_demux_rnd(f, skb, num);
		break;
	case PACKET_FANOUT_QM:
		idx = fanout_demux_qm(f, skb, num);
		break;
	case PACKET_FANOUT_ROLLOVER:
		idx = fanout_demux_rollover(f, skb, 0, false, num);
		break;
	case PACKET_FANOUT_CBPF:
	case PACKET_FANOUT_EBPF:
		idx = fanout_demux_bpf(f, skb, num);
		break;
	}

	if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER))
		idx = fanout_demux_rollover(f, skb, idx, true, num);

	po = pkt_sk(f->arr[idx]);
	return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
}

DEFINE_MUTEX(fanout_mutex);
EXPORT_SYMBOL_GPL(fanout_mutex);
static LIST_HEAD(fanout_list);
static u16 fanout_next_id;
static void __fanout_link(struct sock *sk, struct packet_sock *po)
{
	struct packet_fanout *f = po->fanout;

	spin_lock(&f->lock);
	f->arr[f->num_members] = sk;
	smp_wmb();
	f->num_members++;
	if (f->num_members == 1)
		dev_add_pack(&f->prot_hook);
	spin_unlock(&f->lock);
}

static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
{
	struct packet_fanout *f = po->fanout;
	int i;

	spin_lock(&f->lock);
	for (i = 0; i < f->num_members; i++) {
		if (f->arr[i] == sk)
			break;
	}
	BUG_ON(i >= f->num_members);
	f->arr[i] = f->arr[f->num_members - 1];
	f->num_members--;
	if (f->num_members == 0)
		__dev_remove_pack(&f->prot_hook);
	spin_unlock(&f->lock);
}

static bool match_fanout_group(struct packet_type *ptype, struct sock *sk)
{
	if (sk->sk_family != PF_PACKET)
		return false;

	return ptype->af_packet_priv == pkt_sk(sk)->fanout;
}
static void fanout_init_data(struct packet_fanout *f)
{
	switch (f->type) {
	case PACKET_FANOUT_LB:
		atomic_set(&f->rr_cur, 0);
		break;
	case PACKET_FANOUT_CBPF:
	case PACKET_FANOUT_EBPF:
		RCU_INIT_POINTER(f->bpf_prog, NULL);
		break;
	}
}

static void __fanout_set_data_bpf(struct packet_fanout *f, struct bpf_prog *new)
{
	struct bpf_prog *old;

	spin_lock(&f->lock);
	old = rcu_dereference_protected(f->bpf_prog, lockdep_is_held(&f->lock));
	rcu_assign_pointer(f->bpf_prog, new);
	spin_unlock(&f->lock);

	if (old) {
		synchronize_net();
		bpf_prog_destroy(old);
	}
}

static int fanout_set_data_cbpf(struct packet_sock *po, char __user *data,
				unsigned int len)
{
	struct bpf_prog *new;
	struct sock_fprog fprog;
	int ret;

	if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
		return -EPERM;
	if (len != sizeof(fprog))
		return -EINVAL;
	if (copy_from_user(&fprog, data, len))
		return -EFAULT;

	ret = bpf_prog_create_from_user(&new, &fprog, NULL, false);
	if (ret)
		return ret;

	__fanout_set_data_bpf(po->fanout, new);
	return 0;
}

static int fanout_set_data_ebpf(struct packet_sock *po, char __user *data,
				unsigned int len)
{
	struct bpf_prog *new;
	u32 fd;

	if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
		return -EPERM;
	if (len != sizeof(fd))
		return -EINVAL;
	if (copy_from_user(&fd, data, len))
		return -EFAULT;

	new = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
	if (IS_ERR(new))
		return PTR_ERR(new);

	__fanout_set_data_bpf(po->fanout, new);
	return 0;
}

static int fanout_set_data(struct packet_sock *po, char __user *data,
			   unsigned int len)
{
	switch (po->fanout->type) {
	case PACKET_FANOUT_CBPF:
		return fanout_set_data_cbpf(po, data, len);
	case PACKET_FANOUT_EBPF:
		return fanout_set_data_ebpf(po, data, len);
	default:
		return -EINVAL;
	}
}

static void fanout_release_data(struct packet_fanout *f)
{
	switch (f->type) {
	case PACKET_FANOUT_CBPF:
	case PACKET_FANOUT_EBPF:
		__fanout_set_data_bpf(f, NULL);
	}
}

static bool __fanout_id_is_free(struct sock *sk, u16 candidate_id)
{
	struct packet_fanout *f;

	list_for_each_entry(f, &fanout_list, list) {
		if (f->id == candidate_id &&
		    read_pnet(&f->net) == sock_net(sk))
			return false;
	}

	return true;
}

static bool fanout_find_new_id(struct sock *sk, u16 *new_id)
{
	u16 id = fanout_next_id;

	do {
		if (__fanout_id_is_free(sk, id)) {
			*new_id = id;
			fanout_next_id = id + 1;
			return true;
		}

		id++;
	} while (id != fanout_next_id);

	return false;
}
static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
{
	struct packet_rollover *rollover = NULL;
	struct packet_sock *po = pkt_sk(sk);
	struct packet_fanout *f, *match;
	u8 type = type_flags & 0xff;
	u8 flags = type_flags >> 8;
	int err;

	switch (type) {
	case PACKET_FANOUT_ROLLOVER:
		if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)
			return -EINVAL;
	case PACKET_FANOUT_HASH:
	case PACKET_FANOUT_LB:
	case PACKET_FANOUT_CPU:
	case PACKET_FANOUT_RND:
	case PACKET_FANOUT_QM:
	case PACKET_FANOUT_CBPF:
	case PACKET_FANOUT_EBPF:
		break;
	default:
		return -EINVAL;
	}

	mutex_lock(&fanout_mutex);

	err = -EALREADY;
	if (po->fanout)
		goto out;

	if (type == PACKET_FANOUT_ROLLOVER ||
	    (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)) {
		err = -ENOMEM;
		rollover = kzalloc(sizeof(*rollover), GFP_KERNEL);
		if (!rollover)
			goto out;
		atomic_long_set(&rollover->num, 0);
		atomic_long_set(&rollover->num_huge, 0);
		atomic_long_set(&rollover->num_failed, 0);
		po->rollover = rollover;
	}

	if (type_flags & PACKET_FANOUT_FLAG_UNIQUEID) {
		err = -EINVAL;
		if (id != 0)
			goto out;
		if (!fanout_find_new_id(sk, &id)) {
			err = -ENOMEM;
			goto out;
		}
		/* ephemeral flag for the first socket in the group: drop it */
		flags &= ~(PACKET_FANOUT_FLAG_UNIQUEID >> 8);
	}

	match = NULL;
	list_for_each_entry(f, &fanout_list, list) {
		if (f->id == id &&
		    read_pnet(&f->net) == sock_net(sk)) {
			match = f;
			break;
		}
	}
	err = -EINVAL;
	if (match && match->flags != flags)
		goto out;
	if (!match) {
		err = -ENOMEM;
		match = kzalloc(sizeof(*match), GFP_KERNEL);
		if (!match)
			goto out;
		write_pnet(&match->net, sock_net(sk));
		match->id = id;
		match->type = type;
		match->flags = flags;
		INIT_LIST_HEAD(&match->list);
		spin_lock_init(&match->lock);
		refcount_set(&match->sk_ref, 0);
		fanout_init_data(match);
		match->prot_hook.type = po->prot_hook.type;
		match->prot_hook.dev = po->prot_hook.dev;
		match->prot_hook.func = packet_rcv_fanout;
		match->prot_hook.af_packet_priv = match;
		match->prot_hook.id_match = match_fanout_group;
		list_add(&match->list, &fanout_list);
	}
	err = -EINVAL;

	spin_lock(&po->bind_lock);
	if (po->running &&
	    match->type == type &&
	    match->prot_hook.type == po->prot_hook.type &&
	    match->prot_hook.dev == po->prot_hook.dev) {
		err = -ENOSPC;
		if (refcount_read(&match->sk_ref) < PACKET_FANOUT_MAX) {
			__dev_remove_pack(&po->prot_hook);
			po->fanout = match;
			refcount_set(&match->sk_ref, refcount_read(&match->sk_ref) + 1);
			__fanout_link(sk, po);
			err = 0;
		}
	}
	spin_unlock(&po->bind_lock);

	if (err && !refcount_read(&match->sk_ref)) {
		list_del(&match->list);
		kfree(match);
	}

out:
	if (err && rollover) {
		kfree_rcu(rollover, rcu);
		po->rollover = NULL;
	}
	mutex_unlock(&fanout_mutex);
	return err;
}
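/* Illustrative user-space sketch (not part of the original file): a socket
 * joins a group with PACKET_FANOUT; as fanout_add() above expects, the
 * group id sits in the low 16 bits and type/flags in the high 16 bits.
 *
 *	uint16_t id = 42;
 *	uint32_t arg = id |
 *		       ((PACKET_FANOUT_HASH | PACKET_FANOUT_FLAG_DEFRAG) << 16);
 *
 *	setsockopt(fd, SOL_PACKET, PACKET_FANOUT, &arg, sizeof(arg));
 */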
/* If pkt_sk(sk)->fanout->sk_ref is zero, this function removes
 * pkt_sk(sk)->fanout from fanout_list and returns pkt_sk(sk)->fanout.
 * It is the responsibility of the caller to call fanout_release_data() and
 * free the returned packet_fanout (after synchronize_net())
 */
static struct packet_fanout *fanout_release(struct sock *sk)
{
	struct packet_sock *po = pkt_sk(sk);
	struct packet_fanout *f;

	mutex_lock(&fanout_mutex);
	f = po->fanout;
	if (f) {
		po->fanout = NULL;

		if (refcount_dec_and_test(&f->sk_ref))
			list_del(&f->list);
		else
			f = NULL;

		if (po->rollover) {
			kfree_rcu(po->rollover, rcu);
			po->rollover = NULL;
		}
	}
	mutex_unlock(&fanout_mutex);

	return f;
}
static bool packet_extra_vlan_len_allowed(const struct net_device *dev,
					  struct sk_buff *skb)
{
	/* Earlier code assumed this would be a VLAN pkt, double-check
	 * this now that we have the actual packet in hand. We can only
	 * do this check on Ethernet devices.
	 */
	if (unlikely(dev->type != ARPHRD_ETHER))
		return false;

	skb_reset_mac_header(skb);
	return likely(eth_hdr(skb)->h_proto == htons(ETH_P_8021Q));
}

static const struct proto_ops packet_ops;

static const struct proto_ops packet_ops_spkt;
static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
			   struct packet_type *pt, struct net_device *orig_dev)
{
	struct sock *sk;
	struct sockaddr_pkt *spkt;

	/*
	 *	When we registered the protocol we saved the socket in the data
	 *	field for just this event.
	 */
	sk = pt->af_packet_priv;

	/*
	 *	Yank back the headers [hope the device set this
	 *	right or kerboom...]
	 *
	 *	Incoming packets have ll header pulled,
	 *	push it back.
	 *
	 *	For outgoing ones skb->data == skb_mac_header(skb)
	 *	so that this procedure is noop.
	 */
	if (skb->pkt_type == PACKET_LOOPBACK)
		goto out;

	if (!net_eq(dev_net(dev), sock_net(sk)))
		goto out;

	skb = skb_share_check(skb, GFP_ATOMIC);
	if (skb == NULL)
		goto oom;

	/* drop any routing info */
	skb_dst_drop(skb);

	/* drop conntrack reference */
	nf_reset(skb);

	spkt = &PACKET_SKB_CB(skb)->sa.pkt;

	skb_push(skb, skb->data - skb_mac_header(skb));

	/*
	 *	The SOCK_PACKET socket receives _all_ frames.
	 */
	spkt->spkt_family = dev->type;
	strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
	spkt->spkt_protocol = skb->protocol;

	/*
	 *	Charge the memory to the socket. This is done specifically
	 *	to prevent sockets using all the memory up.
	 */
	if (sock_queue_rcv_skb(sk, skb) == 0)
		return 0;

out:
	kfree_skb(skb);
oom:
	return 0;
}
/*
 *	Output a raw packet to a device layer. This bypasses all the other
 *	protocol layers and you must therefore supply it with a complete frame
 */
static int packet_sendmsg_spkt(struct socket *sock, struct msghdr *msg,
			       size_t len)
{
	struct sock *sk = sock->sk;
	DECLARE_SOCKADDR(struct sockaddr_pkt *, saddr, msg->msg_name);
	struct sk_buff *skb = NULL;
	struct net_device *dev;
	struct sockcm_cookie sockc;
	__be16 proto = 0;
	int err;
	int extra_len = 0;

	/* Get and verify the address. */
	if (saddr) {
		if (msg->msg_namelen < sizeof(struct sockaddr))
			return -EINVAL;
		if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
			proto = saddr->spkt_protocol;
	} else
		return -ENOTCONN;	/* SOCK_PACKET must be sent giving an address */

	/* Find the device first to size check it */
	saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;
retry:
	rcu_read_lock();
	dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
	err = -ENODEV;
	if (dev == NULL)
		goto out_unlock;
	err = -ENETDOWN;
	if (!(dev->flags & IFF_UP))
		goto out_unlock;

	/*
	 * You may not queue a frame bigger than the mtu. This is the lowest level
	 * raw protocol and you must do your own fragmentation at this level.
	 */
	if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
		if (!netif_supports_nofcs(dev)) {
			err = -EPROTONOSUPPORT;
			goto out_unlock;
		}
		extra_len = 4; /* We're doing our own CRC */
	}

	err = -EMSGSIZE;
	if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len)
		goto out_unlock;

	if (!skb) {
		size_t reserved = LL_RESERVED_SPACE(dev);
		int tlen = dev->needed_tailroom;
		unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;

		rcu_read_unlock();
		skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL);
		if (skb == NULL)
			return -ENOBUFS;
		/* FIXME: Save some space for broken drivers that write a hard
		 * header at transmission time by themselves. PPP is the notable
		 * one here. This should really be fixed at the driver level.
		 */
		skb_reserve(skb, reserved);
		skb_reset_network_header(skb);

		/* Try to align data part correctly */
		if (hhlen) {
			skb->data -= hhlen;
			skb->tail -= hhlen;
			if (len < hhlen)
				skb_reset_network_header(skb);
		}
		err = memcpy_from_msg(skb_put(skb, len), msg, len);
		if (err)
			goto out_free;
		goto retry;
	}

	if (!dev_validate_header(dev, skb->data, len)) {
		err = -EINVAL;
		goto out_unlock;
	}
	if (len > (dev->mtu + dev->hard_header_len + extra_len) &&
	    !packet_extra_vlan_len_allowed(dev, skb)) {
		err = -EMSGSIZE;
		goto out_unlock;
	}

	sockc.tsflags = sk->sk_tsflags;
	if (msg->msg_controllen) {
		err = sock_cmsg_send(sk, msg, &sockc);
		if (unlikely(err))
			goto out_unlock;
	}

	skb->protocol = proto;
	skb->dev = dev;
	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	sock_tx_timestamp(sk, sockc.tsflags, &skb_shinfo(skb)->tx_flags);

	if (unlikely(extra_len == 4))
		skb->no_fcs = 1;

	skb_probe_transport_header(skb, 0);

	dev_queue_xmit(skb);
	rcu_read_unlock();
	return len;

out_unlock:
	rcu_read_unlock();
out_free:
	kfree_skb(skb);
	return err;
}
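/* Illustrative user-space sketch (not part of the original file): the
 * legacy SOCK_PACKET transmit path above is driven like this; the caller
 * supplies a complete frame and names the device in spkt_device.
 *
 *	struct sockaddr_pkt spkt = { .spkt_family = AF_PACKET };
 *
 *	strncpy((char *)spkt.spkt_device, "eth0", sizeof(spkt.spkt_device));
 *	sendto(fd, frame, frame_len, 0,
 *	       (struct sockaddr *)&spkt, sizeof(spkt));
 */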
static unsigned int run_filter(struct sk_buff *skb,
			       const struct sock *sk,
			       unsigned int res)
{
	struct sk_filter *filter;

	rcu_read_lock();
	filter = rcu_dereference(sk->sk_filter);
	if (filter != NULL)
		res = bpf_prog_run_clear_cb(filter->prog, skb);
	rcu_read_unlock();

	return res;
}

static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb,
			   size_t *len)
{
	struct virtio_net_hdr vnet_hdr;

	if (*len < sizeof(vnet_hdr))
		return -EINVAL;
	*len -= sizeof(vnet_hdr);

	if (virtio_net_hdr_from_skb(skb, &vnet_hdr, vio_le(), true))
		return -EINVAL;

	return memcpy_to_msg(msg, (void *)&vnet_hdr, sizeof(vnet_hdr));
}
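/* Illustrative user-space sketch (not part of the original file):
 * packet_rcv_vnet() only runs when PACKET_VNET_HDR is enabled, after which
 * every receive is prefixed by a struct virtio_net_hdr:
 *
 *	int one = 1;
 *	struct virtio_net_hdr vh;
 *	char buf[2048];
 *	struct iovec iov[2] = {
 *		{ .iov_base = &vh, .iov_len = sizeof(vh) },
 *		{ .iov_base = buf, .iov_len = sizeof(buf) },
 *	};
 *	struct msghdr mh = { .msg_iov = iov, .msg_iovlen = 2 };
 *
 *	setsockopt(fd, SOL_PACKET, PACKET_VNET_HDR, &one, sizeof(one));
 *	recvmsg(fd, &mh, 0);
 */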
/*
 * This function makes lazy skb cloning in hope that most of packets
 * are discarded by BPF.
 *
 * Note tricky part: we DO mangle shared skb! skb->data, skb->len
 * and skb->cb are mangled. It works because (and until) packets
 * falling here are owned by current CPU. Output packets are cloned
 * by dev_queue_xmit_nit(), input packets are processed by net_bh
 * sequentially, so that if we return skb to original state on exit,
 * we will not harm anyone.
 */
static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
		      struct packet_type *pt, struct net_device *orig_dev)
{
	struct sock *sk;
	struct sockaddr_ll *sll;
	struct packet_sock *po;
	u8 *skb_head = skb->data;
	int skb_len = skb->len;
	unsigned int snaplen, res;
	bool is_drop_n_account = false;

	if (skb->pkt_type == PACKET_LOOPBACK)
		goto drop;

	sk = pt->af_packet_priv;
	po = pkt_sk(sk);

	if (!net_eq(dev_net(dev), sock_net(sk)))
		goto drop;

	skb->dev = dev;

	if (dev->header_ops) {
		/* The device has an explicit notion of ll header,
		 * exported to higher levels.
		 *
		 * Otherwise, the device hides details of its frame
		 * structure, so that corresponding packet head is
		 * never delivered to user.
		 */
		if (sk->sk_type != SOCK_DGRAM)
			skb_push(skb, skb->data - skb_mac_header(skb));
		else if (skb->pkt_type == PACKET_OUTGOING) {
			/* Special case: outgoing packets have ll header at head */
			skb_pull(skb, skb_network_offset(skb));
		}
	}

	snaplen = skb->len;

	res = run_filter(skb, sk, snaplen);
	if (!res)
		goto drop_n_restore;
	if (snaplen > res)
		snaplen = res;

	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
		goto drop_n_acct;

	if (skb_shared(skb)) {
		struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
		if (nskb == NULL)
			goto drop_n_acct;

		if (skb_head != skb->data) {
			skb->data = skb_head;
			skb->len = skb_len;
		}
		consume_skb(skb);
		skb = nskb;
	}

	sock_skb_cb_check_size(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8);

	sll = &PACKET_SKB_CB(skb)->sa.ll;
	sll->sll_hatype = dev->type;
	sll->sll_pkttype = skb->pkt_type;
	if (unlikely(po->origdev))
		sll->sll_ifindex = orig_dev->ifindex;
	else
		sll->sll_ifindex = dev->ifindex;

	sll->sll_halen = dev_parse_header(skb, sll->sll_addr);

	/* sll->sll_family and sll->sll_protocol are set in packet_recvmsg().
	 * Use their space for storing the original skb length.
	 */
	PACKET_SKB_CB(skb)->sa.origlen = skb->len;

	if (pskb_trim(skb, snaplen))
		goto drop_n_acct;

	skb_set_owner_r(skb, sk);
	skb->dev = NULL;
	skb_dst_drop(skb);

	/* drop conntrack reference */
	nf_reset(skb);

	spin_lock(&sk->sk_receive_queue.lock);
	po->stats.stats1.tp_packets++;
	sock_skb_set_dropcount(sk, skb);
	__skb_queue_tail(&sk->sk_receive_queue, skb);
	spin_unlock(&sk->sk_receive_queue.lock);
	sk->sk_data_ready(sk);
	return 0;

drop_n_acct:
	is_drop_n_account = true;
	spin_lock(&sk->sk_receive_queue.lock);
	po->stats.stats1.tp_drops++;
	atomic_inc(&sk->sk_drops);
	spin_unlock(&sk->sk_receive_queue.lock);

drop_n_restore:
	if (skb_head != skb->data && skb_shared(skb)) {
		skb->data = skb_head;
		skb->len = skb_len;
	}
drop:
	if (!is_drop_n_account)
		consume_skb(skb);
	else
		kfree_skb(skb);
	return 0;
}
static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
		       struct packet_type *pt, struct net_device *orig_dev)
{
	struct sock *sk;
	struct packet_sock *po;
	struct sockaddr_ll *sll;
	union tpacket_uhdr h;
	u8 *skb_head = skb->data;
	int skb_len = skb->len;
	unsigned int snaplen, res;
	unsigned long status = TP_STATUS_USER;
	unsigned short macoff, netoff, hdrlen;
	struct sk_buff *copy_skb = NULL;
	struct timespec ts;
	__u32 ts_status;
	bool is_drop_n_account = false;
	bool do_vnet = false;

	/* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
	 * We may add members to them until current aligned size without forcing
	 * userspace to call getsockopt(..., PACKET_HDRLEN, ...).
	 */
	BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h2)) != 32);
	BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h3)) != 48);

	if (skb->pkt_type == PACKET_LOOPBACK)
		goto drop;

	sk = pt->af_packet_priv;
	po = pkt_sk(sk);

	if (!net_eq(dev_net(dev), sock_net(sk)))
		goto drop;

	if (dev->header_ops) {
		if (sk->sk_type != SOCK_DGRAM)
			skb_push(skb, skb->data - skb_mac_header(skb));
		else if (skb->pkt_type == PACKET_OUTGOING) {
			/* Special case: outgoing packets have ll header at head */
			skb_pull(skb, skb_network_offset(skb));
		}
	}

	snaplen = skb->len;

	res = run_filter(skb, sk, snaplen);
	if (!res)
		goto drop_n_restore;

	if (skb->ip_summed == CHECKSUM_PARTIAL)
		status |= TP_STATUS_CSUMNOTREADY;
	else if (skb->pkt_type != PACKET_OUTGOING &&
		 (skb->ip_summed == CHECKSUM_COMPLETE ||
		  skb_csum_unnecessary(skb)))
		status |= TP_STATUS_CSUM_VALID;

	if (snaplen > res)
		snaplen = res;

	if (sk->sk_type == SOCK_DGRAM) {
		macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
				  po->tp_reserve;
	} else {
		unsigned int maclen = skb_network_offset(skb);
		netoff = TPACKET_ALIGN(po->tp_hdrlen +
				       (maclen < 16 ? 16 : maclen)) +
				       po->tp_reserve;
		if (po->has_vnet_hdr) {
			netoff += sizeof(struct virtio_net_hdr);
			do_vnet = true;
		}
		macoff = netoff - maclen;
	}
	if (po->tp_version <= TPACKET_V2) {
		if (macoff + snaplen > po->rx_ring.frame_size) {
			if (po->copy_thresh &&
			    atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
				if (skb_shared(skb)) {
					copy_skb = skb_clone(skb, GFP_ATOMIC);
				} else {
					copy_skb = skb_get(skb);
					skb_head = skb->data;
				}
				if (copy_skb)
					skb_set_owner_r(copy_skb, sk);
			}
			snaplen = po->rx_ring.frame_size - macoff;
			if ((int)snaplen < 0) {
				snaplen = 0;
				do_vnet = false;
			}
		}
	} else if (unlikely(macoff + snaplen >
			    GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) {
		u32 nval;

		nval = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len - macoff;
		pr_err_once("tpacket_rcv: packet too big, clamped from %u to %u. macoff=%u\n",
			    snaplen, nval, macoff);
		snaplen = nval;
		if (unlikely((int)snaplen < 0)) {
			snaplen = 0;
			macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len;
			do_vnet = false;
		}
	}
	spin_lock(&sk->sk_receive_queue.lock);
	h.raw = packet_current_rx_frame(po, skb,
					TP_STATUS_KERNEL, (macoff+snaplen));
	if (!h.raw)
		goto drop_n_account;
	if (po->tp_version <= TPACKET_V2) {
		packet_increment_rx_head(po, &po->rx_ring);
	/*
	 * LOSING will be reported till you read the stats,
	 * because it's COR - Clear On Read.
	 * Anyways, moving it for V1/V2 only as V3 doesn't need this
	 * at packet level.
	 */
		if (po->stats.stats1.tp_drops)
			status |= TP_STATUS_LOSING;
	}
	po->stats.stats1.tp_packets++;
	if (copy_skb) {
		status |= TP_STATUS_COPY;
		__skb_queue_tail(&sk->sk_receive_queue, copy_skb);
	}
	spin_unlock(&sk->sk_receive_queue.lock);

	if (do_vnet) {
		if (virtio_net_hdr_from_skb(skb, h.raw + macoff -
					    sizeof(struct virtio_net_hdr),
					    vio_le(), true)) {
			spin_lock(&sk->sk_receive_queue.lock);
			goto drop_n_account;
		}
	}

	skb_copy_bits(skb, 0, h.raw + macoff, snaplen);

	if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
		getnstimeofday(&ts);

	status |= ts_status;

	switch (po->tp_version) {
	case TPACKET_V1:
		h.h1->tp_len = skb->len;
		h.h1->tp_snaplen = snaplen;
		h.h1->tp_mac = macoff;
		h.h1->tp_net = netoff;
		h.h1->tp_sec = ts.tv_sec;
		h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
		hdrlen = sizeof(*h.h1);
		break;
	case TPACKET_V2:
		h.h2->tp_len = skb->len;
		h.h2->tp_snaplen = snaplen;
		h.h2->tp_mac = macoff;
		h.h2->tp_net = netoff;
		h.h2->tp_sec = ts.tv_sec;
		h.h2->tp_nsec = ts.tv_nsec;
		if (skb_vlan_tag_present(skb)) {
			h.h2->tp_vlan_tci = skb_vlan_tag_get(skb);
			h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto);
			status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
		} else {
			h.h2->tp_vlan_tci = 0;
			h.h2->tp_vlan_tpid = 0;
		}
		memset(h.h2->tp_padding, 0, sizeof(h.h2->tp_padding));
		hdrlen = sizeof(*h.h2);
		break;
	case TPACKET_V3:
		/* tp_nxt_offset,vlan are already populated above.
		 * So DONT clear those fields here
		 */
		h.h3->tp_status |= status;
		h.h3->tp_len = skb->len;
		h.h3->tp_snaplen = snaplen;
		h.h3->tp_mac = macoff;
		h.h3->tp_net = netoff;
		h.h3->tp_sec  = ts.tv_sec;
		h.h3->tp_nsec = ts.tv_nsec;
		memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding));
		hdrlen = sizeof(*h.h3);
		break;
	default:
		BUG();
	}

	sll = h.raw + TPACKET_ALIGN(hdrlen);
	sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
	sll->sll_family = AF_PACKET;
	sll->sll_hatype = dev->type;
	sll->sll_protocol = skb->protocol;
	sll->sll_pkttype = skb->pkt_type;
	if (unlikely(po->origdev))
		sll->sll_ifindex = orig_dev->ifindex;
	else
		sll->sll_ifindex = dev->ifindex;

	smp_mb();

#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
	if (po->tp_version <= TPACKET_V2) {
		u8 *start, *end;

		end = (u8 *) PAGE_ALIGN((unsigned long) h.raw +
					macoff + snaplen);

		for (start = h.raw; start < end; start += PAGE_SIZE)
			flush_dcache_page(pgv_to_page(start));
	}
	smp_wmb();
#endif

	if (po->tp_version <= TPACKET_V2) {
		__packet_set_status(po, h.raw, status);
		sk->sk_data_ready(sk);
	} else {
		prb_clear_blk_fill_status(&po->rx_ring);
	}

drop_n_restore:
	if (skb_head != skb->data && skb_shared(skb)) {
		skb->data = skb_head;
		skb->len = skb_len;
	}
drop:
	if (!is_drop_n_account)
		consume_skb(skb);
	else
		kfree_skb(skb);
	return 0;

drop_n_account:
	is_drop_n_account = true;
	po->stats.stats1.tp_drops++;
	spin_unlock(&sk->sk_receive_queue.lock);

	sk->sk_data_ready(sk);
	kfree_skb(copy_skb);
	goto drop_n_restore;
}
static void tpacket_destruct_skb(struct sk_buff *skb)
{
	struct packet_sock *po = pkt_sk(skb->sk);

	if (likely(po->tx_ring.pg_vec)) {
		void *ph;
		__u32 ts;

		ph = skb_shinfo(skb)->destructor_arg;
		packet_dec_pending(&po->tx_ring);

		ts = __packet_set_timestamp(po, ph, skb);
		__packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
	}

	sock_wfree(skb);
}

static void tpacket_set_protocol(const struct net_device *dev,
				 struct sk_buff *skb)
{
	if (dev->type == ARPHRD_ETHER) {
		skb_reset_mac_header(skb);
		skb->protocol = eth_hdr(skb)->h_proto;
	}
}
static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len)
{
	if ((vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
	    (__virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
	     __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2 >
	      __virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len)))
		vnet_hdr->hdr_len = __cpu_to_virtio16(vio_le(),
			 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
			__virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2);

	if (__virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len) > len)
		return -EINVAL;

	return 0;
}

static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len,
				 struct virtio_net_hdr *vnet_hdr)
{
	if (*len < sizeof(*vnet_hdr))
		return -EINVAL;
	*len -= sizeof(*vnet_hdr);

	if (!copy_from_iter_full(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter))
		return -EFAULT;

	return __packet_snd_vnet_parse(vnet_hdr, *len);
}
2476 static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
2477 void *frame, struct net_device *dev, void *data, int tp_len,
2478 __be16 proto, unsigned char *addr, int hlen, int copylen,
2479 const struct sockcm_cookie *sockc)
2481 union tpacket_uhdr ph;
2482 int to_write, offset, len, nr_frags, len_max;
2483 struct socket *sock = po->sk.sk_socket;
2489 skb->protocol = proto;
2491 skb->priority = po->sk.sk_priority;
2492 skb->mark = po->sk.sk_mark;
2493 sock_tx_timestamp(&po->sk, sockc->tsflags, &skb_shinfo(skb)->tx_flags);
2494 skb_shinfo(skb)->destructor_arg = ph.raw;
2496 skb_reserve(skb, hlen);
2497 skb_reset_network_header(skb);
2501 if (sock->type == SOCK_DGRAM) {
2502 err = dev_hard_header(skb, dev, ntohs(proto), addr,
2504 if (unlikely(err < 0))
2506 } else if (copylen) {
2507 int hdrlen = min_t(int, copylen, tp_len);
2509 skb_push(skb, dev->hard_header_len);
2510 skb_put(skb, copylen - dev->hard_header_len);
2511 err = skb_store_bits(skb, 0, data, hdrlen);
2514 if (!dev_validate_header(dev, skb->data, hdrlen))
2517 tpacket_set_protocol(dev, skb);
2523 offset = offset_in_page(data);
2524 len_max = PAGE_SIZE - offset;
2525 len = ((to_write > len_max) ? len_max : to_write);
2527 skb->data_len = to_write;
2528 skb->len += to_write;
2529 skb->truesize += to_write;
2530 refcount_add(to_write, &po->sk.sk_wmem_alloc);
2532 while (likely(to_write)) {
2533 nr_frags = skb_shinfo(skb)->nr_frags;
2535 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
2536 pr_err("Packet exceed the number of skb frags(%lu)\n",
2541 page = pgv_to_page(data);
2543 flush_dcache_page(page);
2545 skb_fill_page_desc(skb, nr_frags, page, offset, len);
2548 len_max = PAGE_SIZE;
2549 len = ((to_write > len_max) ? len_max : to_write);
2552 skb_probe_transport_header(skb, 0);
2557 static int tpacket_parse_header(struct packet_sock *po, void *frame,
2558 int size_max, void **data)
2560 union tpacket_uhdr ph;
2565 switch (po->tp_version) {
2567 if (ph.h3->tp_next_offset != 0) {
2568 pr_warn_once("variable sized slot not supported");
2571 tp_len = ph.h3->tp_len;
2574 tp_len = ph.h2->tp_len;
2577 tp_len = ph.h1->tp_len;
2580 if (unlikely(tp_len > size_max)) {
2581 pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
2585 if (unlikely(po->tp_tx_has_off)) {
2586 int off_min, off_max;
2588 off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2589 off_max = po->tx_ring.frame_size - tp_len;
2590 if (po->sk.sk_type == SOCK_DGRAM) {
2591 switch (po->tp_version) {
2593 off = ph.h3->tp_net;
2596 off = ph.h2->tp_net;
2599 off = ph.h1->tp_net;
2603 switch (po->tp_version) {
2605 off = ph.h3->tp_mac;
2608 off = ph.h2->tp_mac;
2611 off = ph.h1->tp_mac;
2615 if (unlikely((off < off_min) || (off_max < off)))
2618 off = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2621 *data = frame + off;
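/*
 * Illustrative userspace sketch (not part of this file; "fd" is an
 * assumed ring-mapped socket): the po->tp_tx_has_off branch above lets
 * a sender place its payload at a chosen offset within a TX ring frame
 * via tp_mac (or tp_net for SOCK_DGRAM), once the option is enabled:
 *
 *	int one = 1;
 *
 *	if (setsockopt(fd, SOL_PACKET, PACKET_TX_HAS_OFF,
 *		       &one, sizeof(one)) < 0)
 *		perror("setsockopt(PACKET_TX_HAS_OFF)");
 */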
2625 static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
2627 struct sk_buff *skb;
2628 struct net_device *dev;
2629 struct virtio_net_hdr *vnet_hdr = NULL;
2630 struct sockcm_cookie sockc;
2632 int err, reserve = 0;
2634 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
2635 bool need_wait = !(msg->msg_flags & MSG_DONTWAIT);
2636 int tp_len, size_max;
2637 unsigned char *addr;
2640 int status = TP_STATUS_AVAILABLE;
2641 int hlen, tlen, copylen = 0;
2643 mutex_lock(&po->pg_vec_lock);
2645 if (likely(saddr == NULL)) {
2646 dev = packet_cached_dev_get(po);
2651 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2653 if (msg->msg_namelen < (saddr->sll_halen +
2654 offsetof(struct sockaddr_ll, sll_addr)))
2657 proto = saddr->sll_protocol;
2658 addr = saddr->sll_addr;
2659 dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
2663 if (unlikely(dev == NULL))
2666 if (unlikely(!(dev->flags & IFF_UP)))
2669 sockc.tsflags = po->sk.sk_tsflags;
2670 if (msg->msg_controllen) {
2671 err = sock_cmsg_send(&po->sk, msg, &sockc);
2676 if (po->sk.sk_socket->type == SOCK_RAW)
2677 reserve = dev->hard_header_len;
2678 size_max = po->tx_ring.frame_size
2679 - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
2681 if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !po->has_vnet_hdr)
2682 size_max = dev->mtu + reserve + VLAN_HLEN;
2685 ph = packet_current_frame(po, &po->tx_ring,
2686 TP_STATUS_SEND_REQUEST);
2687 if (unlikely(ph == NULL)) {
2688 if (need_wait && need_resched())
2694 tp_len = tpacket_parse_header(po, ph, size_max, &data);
2698 status = TP_STATUS_SEND_REQUEST;
2699 hlen = LL_RESERVED_SPACE(dev);
2700 tlen = dev->needed_tailroom;
2701 if (po->has_vnet_hdr) {
2703 data += sizeof(*vnet_hdr);
2704 tp_len -= sizeof(*vnet_hdr);
2706 __packet_snd_vnet_parse(vnet_hdr, tp_len)) {
2710 copylen = __virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len);
2713 copylen = max_t(int, copylen, dev->hard_header_len);
2714 skb = sock_alloc_send_skb(&po->sk,
2715 hlen + tlen + sizeof(struct sockaddr_ll) +
2716 (copylen - dev->hard_header_len), !need_wait, &err);
2719 if (unlikely(skb == NULL)) {
2720 /* we assume the socket was initially writeable ... */
2721 if (likely(len_sum > 0))
2725 tp_len = tpacket_fill_skb(po, skb, ph, dev, data, tp_len, proto,
2726 addr, hlen, copylen, &sockc);
2727 if (likely(tp_len >= 0) &&
2728 tp_len > dev->mtu + reserve &&
2729 !po->has_vnet_hdr &&
2730 !packet_extra_vlan_len_allowed(dev, skb))
2733 if (unlikely(tp_len < 0)) {
2736 __packet_set_status(po, ph,
2737 TP_STATUS_AVAILABLE);
2738 packet_increment_head(&po->tx_ring);
2742 status = TP_STATUS_WRONG_FORMAT;
2748 if (po->has_vnet_hdr && virtio_net_hdr_to_skb(skb, vnet_hdr,
2754 skb->destructor = tpacket_destruct_skb;
2755 __packet_set_status(po, ph, TP_STATUS_SENDING);
2756 packet_inc_pending(&po->tx_ring);
2758 status = TP_STATUS_SEND_REQUEST;
2759 err = po->xmit(skb);
2760 if (unlikely(err > 0)) {
2761 err = net_xmit_errno(err);
2762 if (err && __packet_get_status(po, ph) ==
2763 TP_STATUS_AVAILABLE) {
2764 /* skb was destructed already */
2769 * skb was dropped but not destructed yet;
2770 * let's treat it like congestion or err < 0
2774 packet_increment_head(&po->tx_ring);
2776 } while (likely((ph != NULL) ||
2777 /* Note: packet_read_pending() might be slow if we have
2778 * to call it as it's a per-cpu variable, but in the fast path
2779 * we already short-circuit the loop with the first
2780 * condition, and luckily don't have to go that path anyway. */
2783 (need_wait && packet_read_pending(&po->tx_ring))));
2789 __packet_set_status(po, ph, status);
2794 mutex_unlock(&po->pg_vec_lock);
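/*
 * Illustrative userspace sketch (not part of this file; "fd", "ring",
 * "idx", "frame_size", "frame_nr", "pkt" and "pkt_len" are assumed):
 * the producer side of the loop above for a TPACKET_V2 TX ring. A
 * frame is claimed by writing TP_STATUS_SEND_REQUEST and flushed with
 * an empty send():
 *
 *	struct tpacket2_hdr *hdr = (void *)(ring + idx * frame_size);
 *
 *	if (hdr->tp_status == TP_STATUS_AVAILABLE) {
 *		memcpy((char *)hdr + TPACKET2_HDRLEN -
 *		       sizeof(struct sockaddr_ll), pkt, pkt_len);
 *		hdr->tp_len = pkt_len;
 *		__sync_synchronize();		// publish data before status
 *		hdr->tp_status = TP_STATUS_SEND_REQUEST;
 *		send(fd, NULL, 0, 0);		// kick tpacket_snd()
 *		idx = (idx + 1) % frame_nr;
 *	}
 */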
2798 static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
2799 size_t reserve, size_t len,
2800 size_t linear, int noblock,
2803 struct sk_buff *skb;
2805 /* Under a page? Don't bother with paged skb. */
2806 if (prepad + len < PAGE_SIZE || !linear)
2809 skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
2814 skb_reserve(skb, reserve);
2815 skb_put(skb, linear);
2816 skb->data_len = len - linear;
2817 skb->len += len - linear;
2822 static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
2824 struct sock *sk = sock->sk;
2825 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
2826 struct sk_buff *skb;
2827 struct net_device *dev;
2829 unsigned char *addr;
2830 int err, reserve = 0;
2831 struct sockcm_cookie sockc;
2832 struct virtio_net_hdr vnet_hdr = { 0 };
2834 struct packet_sock *po = pkt_sk(sk);
2835 bool has_vnet_hdr = false;
2836 int hlen, tlen, linear;
2840 * Get and verify the address.
2843 if (likely(saddr == NULL)) {
2844 dev = packet_cached_dev_get(po);
2849 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2851 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
2853 proto = saddr->sll_protocol;
2854 addr = saddr->sll_addr;
2855 dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
2859 if (unlikely(dev == NULL))
2862 if (unlikely(!(dev->flags & IFF_UP)))
2865 sockc.tsflags = sk->sk_tsflags;
2866 sockc.mark = sk->sk_mark;
2867 if (msg->msg_controllen) {
2868 err = sock_cmsg_send(sk, msg, &sockc);
2873 if (sock->type == SOCK_RAW)
2874 reserve = dev->hard_header_len;
2875 if (po->has_vnet_hdr) {
2876 err = packet_snd_vnet_parse(msg, &len, &vnet_hdr);
2879 has_vnet_hdr = true;
2882 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
2883 if (!netif_supports_nofcs(dev)) {
2884 err = -EPROTONOSUPPORT;
2887 extra_len = 4; /* We're doing our own CRC */
2891 if (!vnet_hdr.gso_type &&
2892 (len > dev->mtu + reserve + VLAN_HLEN + extra_len))
2896 hlen = LL_RESERVED_SPACE(dev);
2897 tlen = dev->needed_tailroom;
2898 linear = __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len);
2899 linear = max(linear, min_t(int, len, dev->hard_header_len));
2900 skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, linear,
2901 msg->msg_flags & MSG_DONTWAIT, &err);
2905 skb_set_network_header(skb, reserve);
2908 if (sock->type == SOCK_DGRAM) {
2909 offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len);
2910 if (unlikely(offset < 0))
2914 /* Returns -EFAULT on error */
2915 err = skb_copy_datagram_from_iter(skb, offset, &msg->msg_iter, len);
2919 if (sock->type == SOCK_RAW &&
2920 !dev_validate_header(dev, skb->data, len)) {
2925 sock_tx_timestamp(sk, sockc.tsflags, &skb_shinfo(skb)->tx_flags);
2927 if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + extra_len) &&
2928 !packet_extra_vlan_len_allowed(dev, skb)) {
2933 skb->protocol = proto;
2935 skb->priority = sk->sk_priority;
2936 skb->mark = sockc.mark;
2939 err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le());
2942 len += sizeof(vnet_hdr);
2945 skb_probe_transport_header(skb, reserve);
2947 if (unlikely(extra_len == 4))
2950 err = po->xmit(skb);
2951 if (err > 0 && (err = net_xmit_errno(err)) != 0)
2967 static int packet_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
2969 struct sock *sk = sock->sk;
2970 struct packet_sock *po = pkt_sk(sk);
2972 if (po->tx_ring.pg_vec)
2973 return tpacket_snd(po, msg);
2975 return packet_snd(sock, msg, len);
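/*
 * Illustrative userspace sketch (not part of this file; "fd",
 * "ifindex", "dest_mac", "frame" and "frame_len" are assumed): with no
 * TX ring mapped, the dispatch above takes the plain packet_snd()
 * path, i.e. an ordinary sendto(). On a SOCK_DGRAM packet socket the
 * kernel builds the link-layer header from sll_addr:
 *
 *	struct sockaddr_ll sll = {
 *		.sll_family   = AF_PACKET,
 *		.sll_protocol = htons(ETH_P_IP),
 *		.sll_ifindex  = ifindex,
 *		.sll_halen    = ETH_ALEN,
 *	};
 *
 *	memcpy(sll.sll_addr, dest_mac, ETH_ALEN);
 *	if (sendto(fd, frame, frame_len, 0,
 *		   (struct sockaddr *)&sll, sizeof(sll)) < 0)
 *		perror("sendto");
 */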
2979 * Close a PACKET socket. This is fairly simple. We immediately go
2980 * to 'closed' state and remove our protocol entry in the device list.
2983 static int packet_release(struct socket *sock)
2985 struct sock *sk = sock->sk;
2986 struct packet_sock *po;
2987 struct packet_fanout *f;
2989 union tpacket_req_u req_u;
2997 mutex_lock(&net->packet.sklist_lock);
2998 sk_del_node_init_rcu(sk);
2999 mutex_unlock(&net->packet.sklist_lock);
3002 sock_prot_inuse_add(net, sk->sk_prot, -1);
3005 spin_lock(&po->bind_lock);
3006 unregister_prot_hook(sk, false);
3007 packet_cached_dev_reset(po);
3009 if (po->prot_hook.dev) {
3010 dev_put(po->prot_hook.dev);
3011 po->prot_hook.dev = NULL;
3013 spin_unlock(&po->bind_lock);
3015 packet_flush_mclist(sk);
3017 if (po->rx_ring.pg_vec) {
3018 memset(&req_u, 0, sizeof(req_u));
3019 packet_set_ring(sk, &req_u, 1, 0);
3022 if (po->tx_ring.pg_vec) {
3023 memset(&req_u, 0, sizeof(req_u));
3024 packet_set_ring(sk, &req_u, 1, 1);
3027 f = fanout_release(sk);
3032 fanout_release_data(f);
3036 * Now the socket is dead. No more input will appear.
3043 skb_queue_purge(&sk->sk_receive_queue);
3044 packet_free_pending(po);
3045 sk_refcnt_debug_release(sk);
3052 * Attach a packet hook.
3055 static int packet_do_bind(struct sock *sk, const char *name, int ifindex,
3058 struct packet_sock *po = pkt_sk(sk);
3059 struct net_device *dev_curr;
3062 struct net_device *dev = NULL;
3064 bool unlisted = false;
3067 spin_lock(&po->bind_lock);
3076 dev = dev_get_by_name_rcu(sock_net(sk), name);
3081 } else if (ifindex) {
3082 dev = dev_get_by_index_rcu(sock_net(sk), ifindex);
3092 proto_curr = po->prot_hook.type;
3093 dev_curr = po->prot_hook.dev;
3095 need_rehook = proto_curr != proto || dev_curr != dev;
3100 __unregister_prot_hook(sk, true);
3102 dev_curr = po->prot_hook.dev;
3104 unlisted = !dev_get_by_index_rcu(sock_net(sk), dev->ifindex);
3109 po->prot_hook.type = proto;
3111 if (unlikely(unlisted)) {
3113 po->prot_hook.dev = NULL;
3115 packet_cached_dev_reset(po);
3117 po->prot_hook.dev = dev;
3118 po->ifindex = dev ? dev->ifindex : 0;
3119 packet_cached_dev_assign(po, dev);
3125 if (proto == 0 || !need_rehook)
3128 if (!unlisted && (!dev || (dev->flags & IFF_UP))) {
3129 register_prot_hook(sk);
3131 sk->sk_err = ENETDOWN;
3132 if (!sock_flag(sk, SOCK_DEAD))
3133 sk->sk_error_report(sk);
3138 spin_unlock(&po->bind_lock);
3144 * Bind a packet socket to a device
3147 static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
3150 struct sock *sk = sock->sk;
3151 char name[sizeof(uaddr->sa_data) + 1];
3157 if (addr_len != sizeof(struct sockaddr))
3159 /* uaddr->sa_data comes from userspace; it's not guaranteed to be zero-terminated. */
3162 memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data));
3163 name[sizeof(uaddr->sa_data)] = 0;
3165 return packet_do_bind(sk, name, 0, pkt_sk(sk)->num);
3168 static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
3170 struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
3171 struct sock *sk = sock->sk;
3177 if (addr_len < sizeof(struct sockaddr_ll))
3179 if (sll->sll_family != AF_PACKET)
3182 return packet_do_bind(sk, NULL, sll->sll_ifindex,
3183 sll->sll_protocol ? : pkt_sk(sk)->num);
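/*
 * Illustrative userspace sketch (not part of this file; "fd" is
 * assumed): only sll_family, sll_protocol and sll_ifindex matter to
 * the bind handler above; a zero sll_protocol keeps the socket's
 * current protocol:
 *
 *	struct sockaddr_ll sll = {
 *		.sll_family   = AF_PACKET,
 *		.sll_protocol = htons(ETH_P_ALL),
 *		.sll_ifindex  = if_nametoindex("eth0"),	// example name
 *	};
 *
 *	if (bind(fd, (struct sockaddr *)&sll, sizeof(sll)) < 0)
 *		perror("bind");
 */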
3186 static struct proto packet_proto = {
3188 .owner = THIS_MODULE,
3189 .obj_size = sizeof(struct packet_sock),
3193 * Create a packet socket (SOCK_RAW, SOCK_DGRAM, or the legacy SOCK_PACKET).
3196 static int packet_create(struct net *net, struct socket *sock, int protocol,
3200 struct packet_sock *po;
3201 __be16 proto = (__force __be16)protocol; /* weird, but documented */
3204 if (!ns_capable(net->user_ns, CAP_NET_RAW))
3206 if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
3207 sock->type != SOCK_PACKET)
3208 return -ESOCKTNOSUPPORT;
3210 sock->state = SS_UNCONNECTED;
3213 sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto, kern);
3217 sock->ops = &packet_ops;
3218 if (sock->type == SOCK_PACKET)
3219 sock->ops = &packet_ops_spkt;
3221 sock_init_data(sock, sk);
3224 sk->sk_family = PF_PACKET;
3226 po->xmit = dev_queue_xmit;
3228 err = packet_alloc_pending(po);
3232 packet_cached_dev_reset(po);
3234 sk->sk_destruct = packet_sock_destruct;
3235 sk_refcnt_debug_inc(sk);
3238 * Attach a protocol block
3241 spin_lock_init(&po->bind_lock);
3242 mutex_init(&po->pg_vec_lock);
3243 po->rollover = NULL;
3244 po->prot_hook.func = packet_rcv;
3246 if (sock->type == SOCK_PACKET)
3247 po->prot_hook.func = packet_rcv_spkt;
3249 po->prot_hook.af_packet_priv = sk;
3252 po->prot_hook.type = proto;
3253 register_prot_hook(sk);
3256 mutex_lock(&net->packet.sklist_lock);
3257 sk_add_node_rcu(sk, &net->packet.sklist);
3258 mutex_unlock(&net->packet.sklist_lock);
3261 sock_prot_inuse_add(net, &packet_proto, 1);
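/*
 * Illustrative userspace sketch (not part of this file): the create
 * handler above is reached via socket(2) and requires CAP_NET_RAW;
 * passing protocol 0 defers the protocol hook until bind():
 *
 *	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *
 *	if (fd < 0)
 *		perror("socket(AF_PACKET)");
 */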
3272 * Pull a packet from our receive queue and hand it to the user.
3273 * If necessary we block.
3276 static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
3279 struct sock *sk = sock->sk;
3280 struct sk_buff *skb;
3282 int vnet_hdr_len = 0;
3283 unsigned int origlen = 0;
3286 if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
3290 /* What error should we return now? EUNATTACH? */
3291 if (pkt_sk(sk)->ifindex < 0)
3295 if (flags & MSG_ERRQUEUE) {
3296 err = sock_recv_errqueue(sk, msg, len,
3297 SOL_PACKET, PACKET_TX_TIMESTAMP);
3302 * Call the generic datagram receiver. This handles all sorts
3303 * of horrible races and re-entrancy so we can forget about it
3304 * in the protocol layers.
3306 * Now it will return ENETDOWN if the device has just gone down,
3307 * but then it will block.
3310 skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
3313 * An error occurred so return it. Because skb_recv_datagram()
3314 * handles the blocking for us, we don't need to worry about blocking retries.
3321 if (pkt_sk(sk)->pressure)
3322 packet_rcv_has_room(pkt_sk(sk), NULL);
3324 if (pkt_sk(sk)->has_vnet_hdr) {
3325 err = packet_rcv_vnet(msg, skb, &len);
3328 vnet_hdr_len = sizeof(struct virtio_net_hdr);
3331 /* You lose any data beyond the buffer you gave. If it worries
3332 * a user program, it can ask the device for its MTU anyway.
3338 msg->msg_flags |= MSG_TRUNC;
3341 err = skb_copy_datagram_msg(skb, 0, msg, copied);
3345 if (sock->type != SOCK_PACKET) {
3346 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
3348 /* Original length was stored in sockaddr_ll fields */
3349 origlen = PACKET_SKB_CB(skb)->sa.origlen;
3350 sll->sll_family = AF_PACKET;
3351 sll->sll_protocol = skb->protocol;
3354 sock_recv_ts_and_drops(msg, sk, skb);
3356 if (msg->msg_name) {
3357 /* If the address length field is there to be filled
3358 * in, we fill it in now.
3360 if (sock->type == SOCK_PACKET) {
3361 __sockaddr_check_size(sizeof(struct sockaddr_pkt));
3362 msg->msg_namelen = sizeof(struct sockaddr_pkt);
3364 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
3366 msg->msg_namelen = sll->sll_halen +
3367 offsetof(struct sockaddr_ll, sll_addr);
3369 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
3373 if (pkt_sk(sk)->auxdata) {
3374 struct tpacket_auxdata aux;
3376 aux.tp_status = TP_STATUS_USER;
3377 if (skb->ip_summed == CHECKSUM_PARTIAL)
3378 aux.tp_status |= TP_STATUS_CSUMNOTREADY;
3379 else if (skb->pkt_type != PACKET_OUTGOING &&
3380 (skb->ip_summed == CHECKSUM_COMPLETE ||
3381 skb_csum_unnecessary(skb)))
3382 aux.tp_status |= TP_STATUS_CSUM_VALID;
3384 aux.tp_len = origlen;
3385 aux.tp_snaplen = skb->len;
3387 aux.tp_net = skb_network_offset(skb);
3388 if (skb_vlan_tag_present(skb)) {
3389 aux.tp_vlan_tci = skb_vlan_tag_get(skb);
3390 aux.tp_vlan_tpid = ntohs(skb->vlan_proto);
3391 aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
3393 aux.tp_vlan_tci = 0;
3394 aux.tp_vlan_tpid = 0;
3396 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
3400 * Free or return the buffer as appropriate. Again this
3401 * hides all the races and re-entrancy issues from us.
3403 err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
3406 skb_free_datagram(sk, skb);
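/*
 * Illustrative userspace sketch (not part of this file; "mh" is an
 * assumed msghdr whose msg_control buffer was filled by recvmsg()):
 * picking up the PACKET_AUXDATA control message emitted above:
 *
 *	struct cmsghdr *cmsg;
 *
 *	for (cmsg = CMSG_FIRSTHDR(&mh); cmsg;
 *	     cmsg = CMSG_NXTHDR(&mh, cmsg)) {
 *		if (cmsg->cmsg_level == SOL_PACKET &&
 *		    cmsg->cmsg_type == PACKET_AUXDATA) {
 *			struct tpacket_auxdata *aux =
 *				(void *)CMSG_DATA(cmsg);
 *
 *			printf("len %u snaplen %u\n",
 *			       aux->tp_len, aux->tp_snaplen);
 *		}
 *	}
 */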
3411 static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
3412 int *uaddr_len, int peer)
3414 struct net_device *dev;
3415 struct sock *sk = sock->sk;
3420 uaddr->sa_family = AF_PACKET;
3421 memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data));
3423 dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
3425 strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data));
3427 *uaddr_len = sizeof(*uaddr);
3432 static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
3433 int *uaddr_len, int peer)
3435 struct net_device *dev;
3436 struct sock *sk = sock->sk;
3437 struct packet_sock *po = pkt_sk(sk);
3438 DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
3443 sll->sll_family = AF_PACKET;
3444 sll->sll_ifindex = po->ifindex;
3445 sll->sll_protocol = po->num;
3446 sll->sll_pkttype = 0;
3448 dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
3450 sll->sll_hatype = dev->type;
3451 sll->sll_halen = dev->addr_len;
3452 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
3454 sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */
3458 *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
3463 static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
3467 case PACKET_MR_MULTICAST:
3468 if (i->alen != dev->addr_len)
3471 return dev_mc_add(dev, i->addr);
3473 return dev_mc_del(dev, i->addr);
3475 case PACKET_MR_PROMISC:
3476 return dev_set_promiscuity(dev, what);
3477 case PACKET_MR_ALLMULTI:
3478 return dev_set_allmulti(dev, what);
3479 case PACKET_MR_UNICAST:
3480 if (i->alen != dev->addr_len)
3483 return dev_uc_add(dev, i->addr);
3485 return dev_uc_del(dev, i->addr);
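/*
 * Illustrative userspace sketch (not part of this file; "fd" and
 * "ifindex" are assumed): the PACKET_MR_* requests dispatched above
 * are driven through setsockopt(), e.g. enabling promiscuous mode on
 * one interface:
 *
 *	struct packet_mreq mreq = {
 *		.mr_ifindex = ifindex,
 *		.mr_type    = PACKET_MR_PROMISC,
 *	};
 *
 *	if (setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP,
 *		       &mreq, sizeof(mreq)) < 0)
 *		perror("setsockopt(PACKET_ADD_MEMBERSHIP)");
 */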
3493 static void packet_dev_mclist_delete(struct net_device *dev,
3494 struct packet_mclist **mlp)
3496 struct packet_mclist *ml;
3498 while ((ml = *mlp) != NULL) {
3499 if (ml->ifindex == dev->ifindex) {
3500 packet_dev_mc(dev, ml, -1);
3508 static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
3510 struct packet_sock *po = pkt_sk(sk);
3511 struct packet_mclist *ml, *i;
3512 struct net_device *dev;
3518 dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
3523 if (mreq->mr_alen > dev->addr_len)
3527 i = kmalloc(sizeof(*i), GFP_KERNEL);
3532 for (ml = po->mclist; ml; ml = ml->next) {
3533 if (ml->ifindex == mreq->mr_ifindex &&
3534 ml->type == mreq->mr_type &&
3535 ml->alen == mreq->mr_alen &&
3536 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3538 /* Free the new element ... */
3544 i->type = mreq->mr_type;
3545 i->ifindex = mreq->mr_ifindex;
3546 i->alen = mreq->mr_alen;
3547 memcpy(i->addr, mreq->mr_address, i->alen);
3548 memset(i->addr + i->alen, 0, sizeof(i->addr) - i->alen);
3550 i->next = po->mclist;
3552 err = packet_dev_mc(dev, i, 1);
3554 po->mclist = i->next;
3563 static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
3565 struct packet_mclist *ml, **mlp;
3569 for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
3570 if (ml->ifindex == mreq->mr_ifindex &&
3571 ml->type == mreq->mr_type &&
3572 ml->alen == mreq->mr_alen &&
3573 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3574 if (--ml->count == 0) {
3575 struct net_device *dev;
3577 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3579 packet_dev_mc(dev, ml, -1);
3589 static void packet_flush_mclist(struct sock *sk)
3591 struct packet_sock *po = pkt_sk(sk);
3592 struct packet_mclist *ml;
3598 while ((ml = po->mclist) != NULL) {
3599 struct net_device *dev;
3601 po->mclist = ml->next;
3602 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3604 packet_dev_mc(dev, ml, -1);
3611 packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
3613 struct sock *sk = sock->sk;
3614 struct packet_sock *po = pkt_sk(sk);
3617 if (level != SOL_PACKET)
3618 return -ENOPROTOOPT;
3621 case PACKET_ADD_MEMBERSHIP:
3622 case PACKET_DROP_MEMBERSHIP:
3624 struct packet_mreq_max mreq;
3626 memset(&mreq, 0, sizeof(mreq));
3627 if (len < sizeof(struct packet_mreq))
3629 if (len > sizeof(mreq))
3631 if (copy_from_user(&mreq, optval, len))
3633 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
3635 if (optname == PACKET_ADD_MEMBERSHIP)
3636 ret = packet_mc_add(sk, &mreq);
3638 ret = packet_mc_drop(sk, &mreq);
3642 case PACKET_RX_RING:
3643 case PACKET_TX_RING:
3645 union tpacket_req_u req_u;
3648 switch (po->tp_version) {
3651 len = sizeof(req_u.req);
3655 len = sizeof(req_u.req3);
3660 if (copy_from_user(&req_u.req, optval, len))
3662 return packet_set_ring(sk, &req_u, 0,
3663 optname == PACKET_TX_RING);
3665 case PACKET_COPY_THRESH:
3669 if (optlen != sizeof(val))
3671 if (copy_from_user(&val, optval, sizeof(val)))
3674 pkt_sk(sk)->copy_thresh = val;
3677 case PACKET_VERSION:
3681 if (optlen != sizeof(val))
3683 if (copy_from_user(&val, optval, sizeof(val)))
3694 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3697 po->tp_version = val;
3703 case PACKET_RESERVE:
3707 if (optlen != sizeof(val))
3709 if (copy_from_user(&val, optval, sizeof(val)))
3714 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3717 po->tp_reserve = val;
3727 if (optlen != sizeof(val))
3729 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3731 if (copy_from_user(&val, optval, sizeof(val)))
3733 po->tp_loss = !!val;
3736 case PACKET_AUXDATA:
3740 if (optlen < sizeof(val))
3742 if (copy_from_user(&val, optval, sizeof(val)))
3745 po->auxdata = !!val;
3748 case PACKET_ORIGDEV:
3752 if (optlen < sizeof(val))
3754 if (copy_from_user(&val, optval, sizeof(val)))
3757 po->origdev = !!val;
3760 case PACKET_VNET_HDR:
3764 if (sock->type != SOCK_RAW)
3766 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3768 if (optlen < sizeof(val))
3770 if (copy_from_user(&val, optval, sizeof(val)))
3773 po->has_vnet_hdr = !!val;
3776 case PACKET_TIMESTAMP:
3780 if (optlen != sizeof(val))
3782 if (copy_from_user(&val, optval, sizeof(val)))
3785 po->tp_tstamp = val;
3792 if (optlen != sizeof(val))
3794 if (copy_from_user(&val, optval, sizeof(val)))
3797 return fanout_add(sk, val & 0xffff, val >> 16);
3799 case PACKET_FANOUT_DATA:
3804 return fanout_set_data(po, optval, optlen);
3806 case PACKET_TX_HAS_OFF:
3810 if (optlen != sizeof(val))
3812 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3814 if (copy_from_user(&val, optval, sizeof(val)))
3816 po->tp_tx_has_off = !!val;
3819 case PACKET_QDISC_BYPASS:
3823 if (optlen != sizeof(val))
3825 if (copy_from_user(&val, optval, sizeof(val)))
3828 po->xmit = val ? packet_direct_xmit : dev_queue_xmit;
3832 return -ENOPROTOOPT;
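/*
 * Illustrative userspace sketch (not part of this file; "fd" is
 * assumed): a typical option sequence against the handler above.
 * PACKET_VERSION must be set before PACKET_RX_RING, because the ring
 * header layout is version dependent and is locked once a ring exists:
 *
 *	int ver = TPACKET_V2;
 *	struct tpacket_req req = {
 *		.tp_block_size = 1 << 16,	// example geometry
 *		.tp_block_nr   = 64,
 *		.tp_frame_size = 1 << 11,
 *		.tp_frame_nr   = ((1 << 16) / (1 << 11)) * 64,
 *	};
 *
 *	if (setsockopt(fd, SOL_PACKET, PACKET_VERSION,
 *		       &ver, sizeof(ver)) < 0 ||
 *	    setsockopt(fd, SOL_PACKET, PACKET_RX_RING,
 *		       &req, sizeof(req)) < 0)
 *		perror("setsockopt");
 */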
3836 static int packet_getsockopt(struct socket *sock, int level, int optname,
3837 char __user *optval, int __user *optlen)
3840 int val, lv = sizeof(val);
3841 struct sock *sk = sock->sk;
3842 struct packet_sock *po = pkt_sk(sk);
3844 union tpacket_stats_u st;
3845 struct tpacket_rollover_stats rstats;
3846 struct packet_rollover *rollover;
3848 if (level != SOL_PACKET)
3849 return -ENOPROTOOPT;
3851 if (get_user(len, optlen))
3858 case PACKET_STATISTICS:
3859 spin_lock_bh(&sk->sk_receive_queue.lock);
3860 memcpy(&st, &po->stats, sizeof(st));
3861 memset(&po->stats, 0, sizeof(po->stats));
3862 spin_unlock_bh(&sk->sk_receive_queue.lock);
3864 if (po->tp_version == TPACKET_V3) {
3865 lv = sizeof(struct tpacket_stats_v3);
3866 st.stats3.tp_packets += st.stats3.tp_drops;
3869 lv = sizeof(struct tpacket_stats);
3870 st.stats1.tp_packets += st.stats1.tp_drops;
3875 case PACKET_AUXDATA:
3878 case PACKET_ORIGDEV:
3881 case PACKET_VNET_HDR:
3882 val = po->has_vnet_hdr;
3884 case PACKET_VERSION:
3885 val = po->tp_version;
3888 if (len > sizeof(int))
3890 if (len < sizeof(int))
3892 if (copy_from_user(&val, optval, len))
3896 val = sizeof(struct tpacket_hdr);
3899 val = sizeof(struct tpacket2_hdr);
3902 val = sizeof(struct tpacket3_hdr);
3908 case PACKET_RESERVE:
3909 val = po->tp_reserve;
3914 case PACKET_TIMESTAMP:
3915 val = po->tp_tstamp;
3919 val = po->fanout ? ((u32)po->fanout->id |
3920 ((u32)po->fanout->type << 16) |
3921 ((u32)po->fanout->flags << 24)) : 0;
3924 case PACKET_ROLLOVER_STATS:
3926 rollover = rcu_dereference(po->rollover);
3928 rstats.tp_all = atomic_long_read(&rollover->num);
3929 rstats.tp_huge = atomic_long_read(&rollover->num_huge);
3930 rstats.tp_failed = atomic_long_read(&rollover->num_failed);
3932 lv = sizeof(rstats);
3938 case PACKET_TX_HAS_OFF:
3939 val = po->tp_tx_has_off;
3941 case PACKET_QDISC_BYPASS:
3942 val = packet_use_direct_xmit(po);
3945 return -ENOPROTOOPT;
3950 if (put_user(len, optlen))
3952 if (copy_to_user(optval, data, len))
3958 #ifdef CONFIG_COMPAT
3959 static int compat_packet_setsockopt(struct socket *sock, int level, int optname,
3960 char __user *optval, unsigned int optlen)
3962 struct packet_sock *po = pkt_sk(sock->sk);
3964 if (level != SOL_PACKET)
3965 return -ENOPROTOOPT;
3967 if (optname == PACKET_FANOUT_DATA &&
3968 po->fanout && po->fanout->type == PACKET_FANOUT_CBPF) {
3969 optval = (char __user *)get_compat_bpf_fprog(optval);
3972 optlen = sizeof(struct sock_fprog);
3975 return packet_setsockopt(sock, level, optname, optval, optlen);
3979 static int packet_notifier(struct notifier_block *this,
3980 unsigned long msg, void *ptr)
3983 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
3984 struct net *net = dev_net(dev);
3987 sk_for_each_rcu(sk, &net->packet.sklist) {
3988 struct packet_sock *po = pkt_sk(sk);
3991 case NETDEV_UNREGISTER:
3993 packet_dev_mclist_delete(dev, &po->mclist);
3997 if (dev->ifindex == po->ifindex) {
3998 spin_lock(&po->bind_lock);
4000 __unregister_prot_hook(sk, false);
4001 sk->sk_err = ENETDOWN;
4002 if (!sock_flag(sk, SOCK_DEAD))
4003 sk->sk_error_report(sk);
4005 if (msg == NETDEV_UNREGISTER) {
4006 packet_cached_dev_reset(po);
4008 if (po->prot_hook.dev)
4009 dev_put(po->prot_hook.dev);
4010 po->prot_hook.dev = NULL;
4012 spin_unlock(&po->bind_lock);
4016 if (dev->ifindex == po->ifindex) {
4017 spin_lock(&po->bind_lock);
4019 register_prot_hook(sk);
4020 spin_unlock(&po->bind_lock);
4030 static int packet_ioctl(struct socket *sock, unsigned int cmd,
4033 struct sock *sk = sock->sk;
4038 int amount = sk_wmem_alloc_get(sk);
4040 return put_user(amount, (int __user *)arg);
4044 struct sk_buff *skb;
4047 spin_lock_bh(&sk->sk_receive_queue.lock);
4048 skb = skb_peek(&sk->sk_receive_queue);
4051 spin_unlock_bh(&sk->sk_receive_queue.lock);
4052 return put_user(amount, (int __user *)arg);
4055 return sock_get_timestamp(sk, (struct timeval __user *)arg);
4057 return sock_get_timestampns(sk, (struct timespec __user *)arg);
4067 case SIOCGIFBRDADDR:
4068 case SIOCSIFBRDADDR:
4069 case SIOCGIFNETMASK:
4070 case SIOCSIFNETMASK:
4071 case SIOCGIFDSTADDR:
4072 case SIOCSIFDSTADDR:
4074 return inet_dgram_ops.ioctl(sock, cmd, arg);
4078 return -ENOIOCTLCMD;
4083 static unsigned int packet_poll(struct file *file, struct socket *sock,
4086 struct sock *sk = sock->sk;
4087 struct packet_sock *po = pkt_sk(sk);
4088 unsigned int mask = datagram_poll(file, sock, wait);
4090 spin_lock_bh(&sk->sk_receive_queue.lock);
4091 if (po->rx_ring.pg_vec) {
4092 if (!packet_previous_rx_frame(po, &po->rx_ring,
4094 mask |= POLLIN | POLLRDNORM;
4096 if (po->pressure && __packet_rcv_has_room(po, NULL) == ROOM_NORMAL)
4098 spin_unlock_bh(&sk->sk_receive_queue.lock);
4099 spin_lock_bh(&sk->sk_write_queue.lock);
4100 if (po->tx_ring.pg_vec) {
4101 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
4102 mask |= POLLOUT | POLLWRNORM;
4104 spin_unlock_bh(&sk->sk_write_queue.lock);
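/*
 * Illustrative userspace sketch (not part of this file; "fd", "ring",
 * "idx", "frame_size", "frame_nr" and "handle" are assumed): consuming
 * a TPACKET_V2 RX ring against the poll semantics above. POLLIN is
 * raised while the current frame is owned by userspace; writing
 * TP_STATUS_KERNEL hands it back:
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN };
 *	struct tpacket2_hdr *hdr = (void *)(ring + idx * frame_size);
 *
 *	while (!(hdr->tp_status & TP_STATUS_USER))
 *		poll(&pfd, 1, -1);
 *	handle((char *)hdr + hdr->tp_mac, hdr->tp_snaplen);
 *	__sync_synchronize();		// finish reads before release
 *	hdr->tp_status = TP_STATUS_KERNEL;
 *	idx = (idx + 1) % frame_nr;
 */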
4109 /* Dirty? Well, I still have not learned a better way to account
4113 static void packet_mm_open(struct vm_area_struct *vma)
4115 struct file *file = vma->vm_file;
4116 struct socket *sock = file->private_data;
4117 struct sock *sk = sock->sk;
4120 atomic_inc(&pkt_sk(sk)->mapped);
4123 static void packet_mm_close(struct vm_area_struct *vma)
4125 struct file *file = vma->vm_file;
4126 struct socket *sock = file->private_data;
4127 struct sock *sk = sock->sk;
4130 atomic_dec(&pkt_sk(sk)->mapped);
4133 static const struct vm_operations_struct packet_mmap_ops = {
4134 .open = packet_mm_open,
4135 .close = packet_mm_close,
4138 static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
4143 for (i = 0; i < len; i++) {
4144 if (likely(pg_vec[i].buffer)) {
4145 if (is_vmalloc_addr(pg_vec[i].buffer))
4146 vfree(pg_vec[i].buffer);
4148 free_pages((unsigned long)pg_vec[i].buffer,
4150 pg_vec[i].buffer = NULL;
4156 static char *alloc_one_pg_vec_page(unsigned long order)
4159 gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
4160 __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
4162 buffer = (char *) __get_free_pages(gfp_flags, order);
4166 /* __get_free_pages failed, fall back to vmalloc */
4167 buffer = vzalloc((1 << order) * PAGE_SIZE);
4171 /* vmalloc failed, let's dig into swap here */
4172 gfp_flags &= ~__GFP_NORETRY;
4173 buffer = (char *) __get_free_pages(gfp_flags, order);
4177 /* complete and utter failure */
4181 static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
4183 unsigned int block_nr = req->tp_block_nr;
4187 pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);
4188 if (unlikely(!pg_vec))
4191 for (i = 0; i < block_nr; i++) {
4192 pg_vec[i].buffer = alloc_one_pg_vec_page(order);
4193 if (unlikely(!pg_vec[i].buffer))
4194 goto out_free_pgvec;
4201 free_pg_vec(pg_vec, order, block_nr);
4206 static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
4207 int closing, int tx_ring)
4209 struct pgv *pg_vec = NULL;
4210 struct packet_sock *po = pkt_sk(sk);
4211 int was_running, order = 0;
4212 struct packet_ring_buffer *rb;
4213 struct sk_buff_head *rb_queue;
4216 /* Alias added to keep code churn minimal */
4217 struct tpacket_req *req = &req_u->req;
4221 rb = tx_ring ? &po->tx_ring : &po->rx_ring;
4222 rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
4226 if (atomic_read(&po->mapped))
4228 if (packet_read_pending(rb))
4232 if (req->tp_block_nr) {
4233 /* Sanity tests and some calculations */
4235 if (unlikely(rb->pg_vec))
4238 switch (po->tp_version) {
4240 po->tp_hdrlen = TPACKET_HDRLEN;
4243 po->tp_hdrlen = TPACKET2_HDRLEN;
4246 po->tp_hdrlen = TPACKET3_HDRLEN;
4251 if (unlikely((int)req->tp_block_size <= 0))
4253 if (unlikely(!PAGE_ALIGNED(req->tp_block_size)))
4255 if (po->tp_version >= TPACKET_V3 &&
4256 req->tp_block_size <=
4257 BLK_PLUS_PRIV((u64)req_u->req3.tp_sizeof_priv))
4259 if (unlikely(req->tp_frame_size < po->tp_hdrlen +
4262 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
4265 rb->frames_per_block = req->tp_block_size / req->tp_frame_size;
4266 if (unlikely(rb->frames_per_block == 0))
4268 if (unlikely(req->tp_block_size > UINT_MAX / req->tp_block_nr))
4270 if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
4275 order = get_order(req->tp_block_size);
4276 pg_vec = alloc_pg_vec(req, order);
4277 if (unlikely(!pg_vec))
4279 switch (po->tp_version) {
4281 /* Block transmit is not supported yet */
4283 init_prb_bdqc(po, rb, pg_vec, req_u);
4285 struct tpacket_req3 *req3 = &req_u->req3;
4287 if (req3->tp_retire_blk_tov ||
4288 req3->tp_sizeof_priv ||
4289 req3->tp_feature_req_word) {
4302 if (unlikely(req->tp_frame_nr))
4307 /* Detach socket from network */
4308 spin_lock(&po->bind_lock);
4309 was_running = po->running;
4313 __unregister_prot_hook(sk, false);
4315 spin_unlock(&po->bind_lock);
4320 mutex_lock(&po->pg_vec_lock);
4321 if (closing || atomic_read(&po->mapped) == 0) {
4323 spin_lock_bh(&rb_queue->lock);
4324 swap(rb->pg_vec, pg_vec);
4325 rb->frame_max = (req->tp_frame_nr - 1);
4327 rb->frame_size = req->tp_frame_size;
4328 spin_unlock_bh(&rb_queue->lock);
4330 swap(rb->pg_vec_order, order);
4331 swap(rb->pg_vec_len, req->tp_block_nr);
4333 rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
4334 po->prot_hook.func = (po->rx_ring.pg_vec) ?
4335 tpacket_rcv : packet_rcv;
4336 skb_queue_purge(rb_queue);
4337 if (atomic_read(&po->mapped))
4338 pr_err("packet_mmap: vma is busy: %d\n",
4339 atomic_read(&po->mapped));
4341 mutex_unlock(&po->pg_vec_lock);
4343 spin_lock(&po->bind_lock);
4346 register_prot_hook(sk);
4348 spin_unlock(&po->bind_lock);
4349 if (pg_vec && (po->tp_version > TPACKET_V2)) {
4350 /* Because we don't support block-based V3 on tx-ring */
4352 prb_shutdown_retire_blk_timer(po, rb_queue);
4356 free_pg_vec(pg_vec, order, req->tp_block_nr);
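/*
 * Worked example of the geometry checks above (a sketch, not
 * normative): tp_block_size = 65536 is page aligned and
 * tp_frame_size = 2048 is a multiple of TPACKET_ALIGNMENT and at
 * least tp_hdrlen + reserve, so frames_per_block = 65536 / 2048 = 32;
 * with tp_block_nr = 64 the request is accepted only if
 * tp_frame_nr == 32 * 64 == 2048.
 */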
4362 static int packet_mmap(struct file *file, struct socket *sock,
4363 struct vm_area_struct *vma)
4365 struct sock *sk = sock->sk;
4366 struct packet_sock *po = pkt_sk(sk);
4367 unsigned long size, expected_size;
4368 struct packet_ring_buffer *rb;
4369 unsigned long start;
4376 mutex_lock(&po->pg_vec_lock);
4379 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
4381 expected_size += rb->pg_vec_len
4387 if (expected_size == 0)
4390 size = vma->vm_end - vma->vm_start;
4391 if (size != expected_size)
4394 start = vma->vm_start;
4395 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
4396 if (rb->pg_vec == NULL)
4399 for (i = 0; i < rb->pg_vec_len; i++) {
4401 void *kaddr = rb->pg_vec[i].buffer;
4404 for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
4405 page = pgv_to_page(kaddr);
4406 err = vm_insert_page(vma, start, page);
4415 atomic_inc(&po->mapped);
4416 vma->vm_ops = &packet_mmap_ops;
4420 mutex_unlock(&po->pg_vec_lock);
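/*
 * Illustrative userspace sketch (not part of this file; "fd" and a
 * configured "req" are assumed): mapping a single ring as laid out by
 * the handler above. When both RX and TX rings are configured they
 * appear back to back in one mapping, RX first:
 *
 *	size_t len = (size_t)req.tp_block_size * req.tp_block_nr;
 *	char *ring = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			  MAP_SHARED, fd, 0);
 *
 *	if (ring == MAP_FAILED)
 *		perror("mmap");
 */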
4424 static const struct proto_ops packet_ops_spkt = {
4425 .family = PF_PACKET,
4426 .owner = THIS_MODULE,
4427 .release = packet_release,
4428 .bind = packet_bind_spkt,
4429 .connect = sock_no_connect,
4430 .socketpair = sock_no_socketpair,
4431 .accept = sock_no_accept,
4432 .getname = packet_getname_spkt,
4433 .poll = datagram_poll,
4434 .ioctl = packet_ioctl,
4435 .listen = sock_no_listen,
4436 .shutdown = sock_no_shutdown,
4437 .setsockopt = sock_no_setsockopt,
4438 .getsockopt = sock_no_getsockopt,
4439 .sendmsg = packet_sendmsg_spkt,
4440 .recvmsg = packet_recvmsg,
4441 .mmap = sock_no_mmap,
4442 .sendpage = sock_no_sendpage,
4445 static const struct proto_ops packet_ops = {
4446 .family = PF_PACKET,
4447 .owner = THIS_MODULE,
4448 .release = packet_release,
4449 .bind = packet_bind,
4450 .connect = sock_no_connect,
4451 .socketpair = sock_no_socketpair,
4452 .accept = sock_no_accept,
4453 .getname = packet_getname,
4454 .poll = packet_poll,
4455 .ioctl = packet_ioctl,
4456 .listen = sock_no_listen,
4457 .shutdown = sock_no_shutdown,
4458 .setsockopt = packet_setsockopt,
4459 .getsockopt = packet_getsockopt,
4460 #ifdef CONFIG_COMPAT
4461 .compat_setsockopt = compat_packet_setsockopt,
4463 .sendmsg = packet_sendmsg,
4464 .recvmsg = packet_recvmsg,
4465 .mmap = packet_mmap,
4466 .sendpage = sock_no_sendpage,
4469 static const struct net_proto_family packet_family_ops = {
4470 .family = PF_PACKET,
4471 .create = packet_create,
4472 .owner = THIS_MODULE,
4475 static struct notifier_block packet_netdev_notifier = {
4476 .notifier_call = packet_notifier,
4479 #ifdef CONFIG_PROC_FS
4481 static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
4484 struct net *net = seq_file_net(seq);
4487 return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
4490 static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4492 struct net *net = seq_file_net(seq);
4493 return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
4496 static void packet_seq_stop(struct seq_file *seq, void *v)
4502 static int packet_seq_show(struct seq_file *seq, void *v)
4504 if (v == SEQ_START_TOKEN)
4505 seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n");
4507 struct sock *s = sk_entry(v);
4508 const struct packet_sock *po = pkt_sk(s);
4511 "%pK %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n",
4513 refcount_read(&s->sk_refcnt),
4518 atomic_read(&s->sk_rmem_alloc),
4519 from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),
4526 static const struct seq_operations packet_seq_ops = {
4527 .start = packet_seq_start,
4528 .next = packet_seq_next,
4529 .stop = packet_seq_stop,
4530 .show = packet_seq_show,
4533 static int packet_seq_open(struct inode *inode, struct file *file)
4535 return seq_open_net(inode, file, &packet_seq_ops,
4536 sizeof(struct seq_net_private));
4539 static const struct file_operations packet_seq_fops = {
4540 .owner = THIS_MODULE,
4541 .open = packet_seq_open,
4543 .llseek = seq_lseek,
4544 .release = seq_release_net,
4549 static int __net_init packet_net_init(struct net *net)
4551 mutex_init(&net->packet.sklist_lock);
4552 INIT_HLIST_HEAD(&net->packet.sklist);
4554 if (!proc_create("packet", 0, net->proc_net, &packet_seq_fops))
4560 static void __net_exit packet_net_exit(struct net *net)
4562 remove_proc_entry("packet", net->proc_net);
4563 WARN_ON_ONCE(!hlist_empty(&net->packet.sklist));
4566 static struct pernet_operations packet_net_ops = {
4567 .init = packet_net_init,
4568 .exit = packet_net_exit,
4572 static void __exit packet_exit(void)
4574 unregister_netdevice_notifier(&packet_netdev_notifier);
4575 unregister_pernet_subsys(&packet_net_ops);
4576 sock_unregister(PF_PACKET);
4577 proto_unregister(&packet_proto);
4580 static int __init packet_init(void)
4582 int rc = proto_register(&packet_proto, 0);
4587 sock_register(&packet_family_ops);
4588 register_pernet_subsys(&packet_net_ops);
4589 register_netdevice_notifier(&packet_netdev_notifier);
4594 module_init(packet_init);
4595 module_exit(packet_exit);
4596 MODULE_LICENSE("GPL");
4597 MODULE_ALIAS_NETPROTO(PF_PACKET);