xref: /kernel/linux/linux-5.10/net/packet/af_packet.c (revision 8c2ecf20)
1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * INET		An implementation of the TCP/IP protocol suite for the LINUX
4 *		operating system.  INET is implemented using the  BSD Socket
5 *		interface as the means of communication with the user level.
6 *
7 *		PACKET - implements raw packet sockets.
8 *
9 * Authors:	Ross Biro
10 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
11 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
12 *
13 * Fixes:
14 *		Alan Cox	:	verify_area() now used correctly
15 *		Alan Cox	:	new skbuff lists, look ma no backlogs!
16 *		Alan Cox	:	tidied skbuff lists.
17 *		Alan Cox	:	Now uses generic datagram routines I
18 *					added. Also fixed the peek/read crash
19 *					from all old Linux datagram code.
20 *		Alan Cox	:	Uses the improved datagram code.
21 *		Alan Cox	:	Added NULL's for socket options.
22 *		Alan Cox	:	Re-commented the code.
23 *		Alan Cox	:	Use new kernel side addressing
24 *		Rob Janssen	:	Correct MTU usage.
25 *		Dave Platt	:	Counter leaks caused by incorrect
26 *					interrupt locking and some slightly
27 *					dubious gcc output. Can you read
28 *					compiler: it said _VOLATILE_
29 *	Richard Kooijman	:	Timestamp fixes.
30 *		Alan Cox	:	New buffers. Use sk->mac.raw.
31 *		Alan Cox	:	sendmsg/recvmsg support.
32 *		Alan Cox	:	Protocol setting support
33 *	Alexey Kuznetsov	:	Untied from IPv4 stack.
34 *	Cyrus Durgin		:	Fixed kerneld for kmod.
35 *	Michal Ostrowski	:	Module initialization cleanup.
36 *	Ulises Alonso		:	Frame number limit removal and
37 *					packet_set_ring memory leak.
38 *		Eric Biederman	:	Allow for > 8 byte hardware addresses.
39 *					The convention is that longer addresses
40 *					will simply extend the hardware address
41 *					byte arrays at the end of sockaddr_ll
42 *					and packet_mreq.
43 *		Johann Baudy	:	Added TX RING.
44 *		Chetan Loke	:	Implemented TPACKET_V3 block abstraction
45 *					layer.
46 *					Copyright (C) 2011, <lokec@ccs.neu.edu>
47 */
48
49#include <linux/types.h>
50#include <linux/mm.h>
51#include <linux/capability.h>
52#include <linux/fcntl.h>
53#include <linux/socket.h>
54#include <linux/in.h>
55#include <linux/inet.h>
56#include <linux/netdevice.h>
57#include <linux/if_packet.h>
58#include <linux/wireless.h>
59#include <linux/kernel.h>
60#include <linux/kmod.h>
61#include <linux/slab.h>
62#include <linux/vmalloc.h>
63#include <net/net_namespace.h>
64#include <net/ip.h>
65#include <net/protocol.h>
66#include <linux/skbuff.h>
67#include <net/sock.h>
68#include <linux/errno.h>
69#include <linux/timer.h>
70#include <linux/uaccess.h>
71#include <asm/ioctls.h>
72#include <asm/page.h>
73#include <asm/cacheflush.h>
74#include <asm/io.h>
75#include <linux/proc_fs.h>
76#include <linux/seq_file.h>
77#include <linux/poll.h>
78#include <linux/module.h>
79#include <linux/init.h>
80#include <linux/mutex.h>
81#include <linux/if_vlan.h>
82#include <linux/virtio_net.h>
83#include <linux/errqueue.h>
84#include <linux/net_tstamp.h>
85#include <linux/percpu.h>
86#ifdef CONFIG_INET
87#include <net/inet_common.h>
88#endif
89#include <linux/bpf.h>
90#include <net/compat.h>
91
92#include "internal.h"
93
94/*
95   Assumptions:
96   - If the device has no dev->header_ops->create, there is no LL header
97     visible above the device. In this case, its hard_header_len should be 0.
98     The device may prepend its own header internally. In this case, its
99     needed_headroom should be set to the space needed for it to add its
100     internal header.
101     For example, a WiFi driver pretending to be an Ethernet driver should
102     set its hard_header_len to be the Ethernet header length, and set its
103     needed_headroom to be (the real WiFi header length - the fake Ethernet
104     header length).
105   - packet sockets receive packets with the ll header already pulled,
106     so SOCK_RAW should push it back.
107
108On receive:
109-----------
110
111Incoming, dev_has_header(dev) == true
112   mac_header -> ll header
113   data       -> data
114
115Outgoing, dev_has_header(dev) == true
116   mac_header -> ll header
117   data       -> ll header
118
119Incoming, dev_has_header(dev) == false
120   mac_header -> data
121     However drivers often make it point to the ll header.
122     This is incorrect because the ll header should be invisible to us.
123   data       -> data
124
125Outgoing, dev_has_header(dev) == false
126   mac_header -> data. ll header is invisible to us.
127   data       -> data
128
129In summary:
130  If dev_has_header(dev) == false we are unable to restore the ll header,
131    because it is invisible to us.
132
133
134On transmit:
135------------
136
137dev->header_ops != NULL
138   mac_header -> ll header
139   data       -> ll header
140
141dev->header_ops == NULL (ll header is invisible to us)
142   mac_header -> data
143   data       -> data
144
145   We should set network_header on output to the correct position,
146   packet classifier depends on it.
147 */
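/* A user-space sketch of the SOCK_RAW vs. SOCK_DGRAM distinction above
 * (illustration only, not part of this file; assumes the usual
 * <sys/socket.h>, <linux/if_packet.h> and <linux/if_ether.h> includes).
 * SOCK_RAW hands the application the frame with the ll header pushed back,
 * SOCK_DGRAM strips it and reports the link-level address via sockaddr_ll:
 *
 *	int raw   = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *	int dgram = socket(AF_PACKET, SOCK_DGRAM, htons(ETH_P_ALL));
 *	char buf[2048];
 *	struct sockaddr_ll sll;
 *	socklen_t slen = sizeof(sll);
 *
 *	// buf starts with the Ethernet header here
 *	recvfrom(raw, buf, sizeof(buf), 0, NULL, NULL);
 *	// buf starts at the network header here; the source MAC is in
 *	// sll.sll_addr, its length in sll.sll_halen
 *	recvfrom(dgram, buf, sizeof(buf), 0, (struct sockaddr *)&sll, &slen);
 */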
148
149/* Private packet socket structures. */
150
151/* identical to struct packet_mreq except it has
152 * a longer address field.
153 */
154struct packet_mreq_max {
155	int		mr_ifindex;
156	unsigned short	mr_type;
157	unsigned short	mr_alen;
158	unsigned char	mr_address[MAX_ADDR_LEN];
159};
160
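/* Illustrative user-space counterpart of packet_mreq_max (sketch only;
 * assumes the standard <linux/if_packet.h> uapi and an AF_PACKET socket
 * "fd"). The most common use is putting the bound interface into
 * promiscuous mode for this socket:
 *
 *	struct packet_mreq mreq = {
 *		.mr_ifindex = if_nametoindex("eth0"),
 *		.mr_type    = PACKET_MR_PROMISC,
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP,
 *		   &mreq, sizeof(mreq));
 */
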
161union tpacket_uhdr {
162	struct tpacket_hdr  *h1;
163	struct tpacket2_hdr *h2;
164	struct tpacket3_hdr *h3;
165	void *raw;
166};
167
168static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
169		int closing, int tx_ring);
170
171#define V3_ALIGNMENT	(8)
172
173#define BLK_HDR_LEN	(ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))
174
175#define BLK_PLUS_PRIV(sz_of_priv) \
176	(BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))
177
178#define BLOCK_STATUS(x)	((x)->hdr.bh1.block_status)
179#define BLOCK_NUM_PKTS(x)	((x)->hdr.bh1.num_pkts)
180#define BLOCK_O2FP(x)		((x)->hdr.bh1.offset_to_first_pkt)
181#define BLOCK_LEN(x)		((x)->hdr.bh1.blk_len)
182#define BLOCK_SNUM(x)		((x)->hdr.bh1.seq_num)
183#define BLOCK_O2PRIV(x)	((x)->offset_to_priv)
184
185struct packet_sock;
186static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
187		       struct packet_type *pt, struct net_device *orig_dev);
188
189static void *packet_previous_frame(struct packet_sock *po,
190		struct packet_ring_buffer *rb,
191		int status);
192static void packet_increment_head(struct packet_ring_buffer *buff);
193static int prb_curr_blk_in_use(struct tpacket_block_desc *);
194static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
195			struct packet_sock *);
196static void prb_retire_current_block(struct tpacket_kbdq_core *,
197		struct packet_sock *, unsigned int status);
198static int prb_queue_frozen(struct tpacket_kbdq_core *);
199static void prb_open_block(struct tpacket_kbdq_core *,
200		struct tpacket_block_desc *);
201static void prb_retire_rx_blk_timer_expired(struct timer_list *);
202static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
203static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
204static void prb_clear_rxhash(struct tpacket_kbdq_core *,
205		struct tpacket3_hdr *);
206static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
207		struct tpacket3_hdr *);
208static void packet_flush_mclist(struct sock *sk);
209static u16 packet_pick_tx_queue(struct sk_buff *skb);
210
211struct packet_skb_cb {
212	union {
213		struct sockaddr_pkt pkt;
214		union {
215			/* Trick: alias skb original length with
216			 * ll.sll_family and ll.protocol in order
217			 * to save room.
218			 */
219			unsigned int origlen;
220			struct sockaddr_ll ll;
221		};
222	} sa;
223};
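/* Illustrative layout behind the aliasing trick above: the first two
 * members of struct sockaddr_ll in <linux/if_packet.h> are
 *
 *	unsigned short	sll_family;	// overlays bytes 0-1 of origlen
 *	__be16		sll_protocol;	// overlays bytes 2-3 of origlen
 *
 * so a 4-byte origlen can be parked over them while the skb sits in the
 * receive queue; sll_family and sll_protocol are only filled in when the
 * address is copied out to user space in recvmsg().
 */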
224
225#define vio_le() virtio_legacy_is_little_endian()
226
227#define PACKET_SKB_CB(__skb)	((struct packet_skb_cb *)((__skb)->cb))
228
229#define GET_PBDQC_FROM_RB(x)	((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
230#define GET_PBLOCK_DESC(x, bid)	\
231	((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
232#define GET_CURR_PBLOCK_DESC_FROM_CORE(x)	\
233	((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
234#define GET_NEXT_PRB_BLK_NUM(x) \
235	(((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
236	((x)->kactive_blk_num+1) : 0)
237
238static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
239static void __fanout_link(struct sock *sk, struct packet_sock *po);
240
241static int packet_direct_xmit(struct sk_buff *skb)
242{
243	return dev_direct_xmit(skb, packet_pick_tx_queue(skb));
244}
245
246static struct net_device *packet_cached_dev_get(struct packet_sock *po)
247{
248	struct net_device *dev;
249
250	rcu_read_lock();
251	dev = rcu_dereference(po->cached_dev);
252	if (likely(dev))
253		dev_hold(dev);
254	rcu_read_unlock();
255
256	return dev;
257}
258
259static void packet_cached_dev_assign(struct packet_sock *po,
260				     struct net_device *dev)
261{
262	rcu_assign_pointer(po->cached_dev, dev);
263}
264
265static void packet_cached_dev_reset(struct packet_sock *po)
266{
267	RCU_INIT_POINTER(po->cached_dev, NULL);
268}
269
270static bool packet_use_direct_xmit(const struct packet_sock *po)
271{
272	/* Paired with WRITE_ONCE() in packet_setsockopt() */
273	return READ_ONCE(po->xmit) == packet_direct_xmit;
274}
275
276static u16 packet_pick_tx_queue(struct sk_buff *skb)
277{
278	struct net_device *dev = skb->dev;
279	const struct net_device_ops *ops = dev->netdev_ops;
280	int cpu = raw_smp_processor_id();
281	u16 queue_index;
282
283#ifdef CONFIG_XPS
284	skb->sender_cpu = cpu + 1;
285#endif
286	skb_record_rx_queue(skb, cpu % dev->real_num_tx_queues);
287	if (ops->ndo_select_queue) {
288		queue_index = ops->ndo_select_queue(dev, skb, NULL);
289		queue_index = netdev_cap_txqueue(dev, queue_index);
290	} else {
291		queue_index = netdev_pick_tx(dev, skb, NULL);
292	}
293
294	return queue_index;
295}
296
297/* __register_prot_hook must be invoked through register_prot_hook
298 * or from a context in which asynchronous accesses to the packet
299 * socket is not possible (packet_create()).
300 */
301static void __register_prot_hook(struct sock *sk)
302{
303	struct packet_sock *po = pkt_sk(sk);
304
305	if (!po->running) {
306		if (po->fanout)
307			__fanout_link(sk, po);
308		else
309			dev_add_pack(&po->prot_hook);
310
311		sock_hold(sk);
312		po->running = 1;
313	}
314}
315
316static void register_prot_hook(struct sock *sk)
317{
318	lockdep_assert_held_once(&pkt_sk(sk)->bind_lock);
319	__register_prot_hook(sk);
320}
321
322/* If the sync parameter is true, we will temporarily drop
323 * the po->bind_lock and do a synchronize_net to make sure no
324 * asynchronous packet processing paths still refer to the elements
325 * of po->prot_hook.  If the sync parameter is false, it is the
326 * caller's responsibility to take care of this.
327 */
328static void __unregister_prot_hook(struct sock *sk, bool sync)
329{
330	struct packet_sock *po = pkt_sk(sk);
331
332	lockdep_assert_held_once(&po->bind_lock);
333
334	po->running = 0;
335
336	if (po->fanout)
337		__fanout_unlink(sk, po);
338	else
339		__dev_remove_pack(&po->prot_hook);
340
341	__sock_put(sk);
342
343	if (sync) {
344		spin_unlock(&po->bind_lock);
345		synchronize_net();
346		spin_lock(&po->bind_lock);
347	}
348}
349
350static void unregister_prot_hook(struct sock *sk, bool sync)
351{
352	struct packet_sock *po = pkt_sk(sk);
353
354	if (po->running)
355		__unregister_prot_hook(sk, sync);
356}
357
358static inline struct page * __pure pgv_to_page(void *addr)
359{
360	if (is_vmalloc_addr(addr))
361		return vmalloc_to_page(addr);
362	return virt_to_page(addr);
363}
364
365static void __packet_set_status(struct packet_sock *po, void *frame, int status)
366{
367	union tpacket_uhdr h;
368
369	/* WRITE_ONCE() are paired with READ_ONCE() in __packet_get_status */
370
371	h.raw = frame;
372	switch (po->tp_version) {
373	case TPACKET_V1:
374		WRITE_ONCE(h.h1->tp_status, status);
375		flush_dcache_page(pgv_to_page(&h.h1->tp_status));
376		break;
377	case TPACKET_V2:
378		WRITE_ONCE(h.h2->tp_status, status);
379		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
380		break;
381	case TPACKET_V3:
382		WRITE_ONCE(h.h3->tp_status, status);
383		flush_dcache_page(pgv_to_page(&h.h3->tp_status));
384		break;
385	default:
386		WARN(1, "TPACKET version not supported.\n");
387		BUG();
388	}
389
390	smp_wmb();
391}
392
393static int __packet_get_status(const struct packet_sock *po, void *frame)
394{
395	union tpacket_uhdr h;
396
397	smp_rmb();
398
399	/* READ_ONCE() are paired with WRITE_ONCE() in __packet_set_status */
400
401	h.raw = frame;
402	switch (po->tp_version) {
403	case TPACKET_V1:
404		flush_dcache_page(pgv_to_page(&h.h1->tp_status));
405		return READ_ONCE(h.h1->tp_status);
406	case TPACKET_V2:
407		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
408		return READ_ONCE(h.h2->tp_status);
409	case TPACKET_V3:
410		flush_dcache_page(pgv_to_page(&h.h3->tp_status));
411		return READ_ONCE(h.h3->tp_status);
412	default:
413		WARN(1, "TPACKET version not supported.\n");
414		BUG();
415		return 0;
416	}
417}
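/* User-space side of the status handshake above (sketch; assumes a
 * TPACKET_V2 ring already configured with PACKET_VERSION/PACKET_RX_RING,
 * described by "req" and mmap()ed at "ring", and "fd" being the socket).
 * The kernel publishes a frame by setting TP_STATUS_USER; user space hands
 * it back by storing TP_STATUS_KERNEL, mirroring the barrier pairing here:
 *
 *	struct tpacket2_hdr *hdr = (struct tpacket2_hdr *)
 *		((char *)ring + idx * req.tp_frame_size);
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN };
 *
 *	while (!(__atomic_load_n(&hdr->tp_status, __ATOMIC_ACQUIRE) &
 *		 TP_STATUS_USER))
 *		poll(&pfd, 1, -1);
 *	// frame data starts at (char *)hdr + hdr->tp_mac
 *	__atomic_store_n(&hdr->tp_status, TP_STATUS_KERNEL, __ATOMIC_RELEASE);
 */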
418
419static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec64 *ts,
420				   unsigned int flags)
421{
422	struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
423
424	if (shhwtstamps &&
425	    (flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
426	    ktime_to_timespec64_cond(shhwtstamps->hwtstamp, ts))
427		return TP_STATUS_TS_RAW_HARDWARE;
428
429	if ((flags & SOF_TIMESTAMPING_SOFTWARE) &&
430	    ktime_to_timespec64_cond(skb->tstamp, ts))
431		return TP_STATUS_TS_SOFTWARE;
432
433	return 0;
434}
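/* The tp_tstamp flags checked above are set from user space with the
 * PACKET_TIMESTAMP socket option (sketch; raw hardware timestamping also
 * requires the NIC to be configured via the SIOCSHWTSTAMP ioctl first):
 *
 *	int req = SOF_TIMESTAMPING_SOFTWARE;
 *	setsockopt(fd, SOL_PACKET, PACKET_TIMESTAMP, &req, sizeof(req));
 *
 * TP_STATUS_TS_SOFTWARE / TP_STATUS_TS_RAW_HARDWARE in the frame status
 * then report which source actually provided the timestamp.
 */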
435
436static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
437				    struct sk_buff *skb)
438{
439	union tpacket_uhdr h;
440	struct timespec64 ts;
441	__u32 ts_status;
442
443	if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
444		return 0;
445
446	h.raw = frame;
447	/*
448	 * versions 1 through 3 overflow the timestamps in y2106, since they
449	 * all store the seconds in a 32-bit unsigned integer.
450	 * If we create a version 4, that should have a 64-bit timestamp,
451	 * either 64-bit seconds + 32-bit nanoseconds, or just 64-bit
452	 * nanoseconds.
453	 */
454	switch (po->tp_version) {
455	case TPACKET_V1:
456		h.h1->tp_sec = ts.tv_sec;
457		h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
458		break;
459	case TPACKET_V2:
460		h.h2->tp_sec = ts.tv_sec;
461		h.h2->tp_nsec = ts.tv_nsec;
462		break;
463	case TPACKET_V3:
464		h.h3->tp_sec = ts.tv_sec;
465		h.h3->tp_nsec = ts.tv_nsec;
466		break;
467	default:
468		WARN(1, "TPACKET version not supported.\n");
469		BUG();
470	}
471
472	/* one flush is safe, as both fields always lie on the same cacheline */
473	flush_dcache_page(pgv_to_page(&h.h1->tp_sec));
474	smp_wmb();
475
476	return ts_status;
477}
478
479static void *packet_lookup_frame(const struct packet_sock *po,
480				 const struct packet_ring_buffer *rb,
481				 unsigned int position,
482				 int status)
483{
484	unsigned int pg_vec_pos, frame_offset;
485	union tpacket_uhdr h;
486
487	pg_vec_pos = position / rb->frames_per_block;
488	frame_offset = position % rb->frames_per_block;
489
490	h.raw = rb->pg_vec[pg_vec_pos].buffer +
491		(frame_offset * rb->frame_size);
492
493	if (status != __packet_get_status(po, h.raw))
494		return NULL;
495
496	return h.raw;
497}
498
499static void *packet_current_frame(struct packet_sock *po,
500		struct packet_ring_buffer *rb,
501		int status)
502{
503	return packet_lookup_frame(po, rb, rb->head, status);
504}
505
506static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
507{
508	del_timer_sync(&pkc->retire_blk_timer);
509}
510
511static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
512		struct sk_buff_head *rb_queue)
513{
514	struct tpacket_kbdq_core *pkc;
515
516	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
517
518	spin_lock_bh(&rb_queue->lock);
519	pkc->delete_blk_timer = 1;
520	spin_unlock_bh(&rb_queue->lock);
521
522	prb_del_retire_blk_timer(pkc);
523}
524
525static void prb_setup_retire_blk_timer(struct packet_sock *po)
526{
527	struct tpacket_kbdq_core *pkc;
528
529	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
530	timer_setup(&pkc->retire_blk_timer, prb_retire_rx_blk_timer_expired,
531		    0);
532	pkc->retire_blk_timer.expires = jiffies;
533}
534
535static int prb_calc_retire_blk_tmo(struct packet_sock *po,
536				int blk_size_in_bytes)
537{
538	struct net_device *dev;
539	unsigned int mbits, div;
540	struct ethtool_link_ksettings ecmd;
541	int err;
542
543	rtnl_lock();
544	dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
545	if (unlikely(!dev)) {
546		rtnl_unlock();
547		return DEFAULT_PRB_RETIRE_TOV;
548	}
549	err = __ethtool_get_link_ksettings(dev, &ecmd);
550	rtnl_unlock();
551	if (err)
552		return DEFAULT_PRB_RETIRE_TOV;
553
554	/* If the link speed is so slow, you don't really
555	 * need to worry about perf anyway.
556	 */
557	if (ecmd.base.speed < SPEED_1000 ||
558	    ecmd.base.speed == SPEED_UNKNOWN)
559		return DEFAULT_PRB_RETIRE_TOV;
560
561	div = ecmd.base.speed / 1000;
562	mbits = (blk_size_in_bytes * 8) / (1024 * 1024);
563
564	if (div)
565		mbits /= div;
566
567	if (div)
568		return mbits + 1;
569	return mbits;
570}
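/* A rough worked example of the heuristic above: with a 1 MiB block,
 * mbits = (1048576 * 8) / (1024 * 1024) = 8.  On a 1 Gbit/s link div = 1,
 * so the timeout comes out as 8 + 1 = 9 ms, roughly the time it takes to
 * fill the block; on a 10 Gbit/s link div = 10, 8 / 10 = 0, and the
 * returned minimum is 0 + 1 = 1 ms.
 */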
571
572static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
573			union tpacket_req_u *req_u)
574{
575	p1->feature_req_word = req_u->req3.tp_feature_req_word;
576}
577
578static void init_prb_bdqc(struct packet_sock *po,
579			struct packet_ring_buffer *rb,
580			struct pgv *pg_vec,
581			union tpacket_req_u *req_u)
582{
583	struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb);
584	struct tpacket_block_desc *pbd;
585
586	memset(p1, 0x0, sizeof(*p1));
587
588	p1->knxt_seq_num = 1;
589	p1->pkbdq = pg_vec;
590	pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
591	p1->pkblk_start	= pg_vec[0].buffer;
592	p1->kblk_size = req_u->req3.tp_block_size;
593	p1->knum_blocks	= req_u->req3.tp_block_nr;
594	p1->hdrlen = po->tp_hdrlen;
595	p1->version = po->tp_version;
596	p1->last_kactive_blk_num = 0;
597	po->stats.stats3.tp_freeze_q_cnt = 0;
598	if (req_u->req3.tp_retire_blk_tov)
599		p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
600	else
601		p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
602						req_u->req3.tp_block_size);
603	p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
604	p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
605	rwlock_init(&p1->blk_fill_in_prog_lock);
606
607	p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
608	prb_init_ft_ops(p1, req_u);
609	prb_setup_retire_blk_timer(po);
610	prb_open_block(p1, pbd);
611}
612
613/*  Do NOT update the last_blk_num first.
614 *  Assumes sk_buff_head lock is held.
615 */
616static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
617{
618	mod_timer(&pkc->retire_blk_timer,
619			jiffies + pkc->tov_in_jiffies);
620	pkc->last_kactive_blk_num = pkc->kactive_blk_num;
621}
622
623/*
624 * Timer logic:
625 * 1) We refresh the timer only when we open a block.
626 *    By doing this we don't waste cycles refreshing the timer
627 *    on a packet-by-packet basis.
628 *
629 * With a 1MB block-size, on a 1Gbps line, it will take
630 * i) ~8 ms to fill a block + ii) memcpy etc.
631 * In this cut we are not accounting for the memcpy time.
632 *
633 * So, if the user sets the 'tmo' to 10ms then the timer
634 * will never fire while the block is still getting filled
635 * (which is what we want). However, the user could choose
636 * to close a block early and that's fine.
637 *
638 * But when the timer does fire, we check whether or not to refresh it.
639 * Since the tmo granularity is in msecs, it is not too expensive
640 * to refresh the timer, let's say every 8 msecs.
641 * Either the user can set the 'tmo' or we can derive it based on
642 * a) line-speed and b) block-size.
643 * prb_calc_retire_blk_tmo() calculates the tmo.
644 *
645 */
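/* Where the 'tmo' above comes from in practice (user-space sketch, not
 * part of this file; the block/frame sizes are arbitrary example values
 * and "fd" is an AF_PACKET socket). tp_retire_blk_tov is the per-block
 * timeout in msecs; leaving it 0 lets prb_calc_retire_blk_tmo() derive
 * one from the link speed:
 *
 *	int ver = TPACKET_V3;
 *	struct tpacket_req3 req = {
 *		.tp_block_size	   = 1 << 20,
 *		.tp_block_nr	   = 64,
 *		.tp_frame_size	   = 2048,
 *		.tp_frame_nr	   = ((1 << 20) / 2048) * 64,
 *		.tp_retire_blk_tov = 10,	// msec, as in the text above
 *	};
 *
 *	setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 *	void *ring = mmap(NULL, (size_t)req.tp_block_size * req.tp_block_nr,
 *			  PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 */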
646static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
647{
648	struct packet_sock *po =
649		from_timer(po, t, rx_ring.prb_bdqc.retire_blk_timer);
650	struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
651	unsigned int frozen;
652	struct tpacket_block_desc *pbd;
653
654	spin_lock(&po->sk.sk_receive_queue.lock);
655
656	frozen = prb_queue_frozen(pkc);
657	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
658
659	if (unlikely(pkc->delete_blk_timer))
660		goto out;
661
662	/* We only need to plug the race when the block is partially filled.
663	 * tpacket_rcv:
664	 *		lock(); increment BLOCK_NUM_PKTS; unlock()
665	 *		copy_bits() is in progress ...
666	 *		timer fires on other cpu:
667	 *		we can't retire the current block because copy_bits
668	 *		is in progress.
669	 *
670	 */
671	if (BLOCK_NUM_PKTS(pbd)) {
672		/* Waiting for skb_copy_bits to finish... */
673		write_lock(&pkc->blk_fill_in_prog_lock);
674		write_unlock(&pkc->blk_fill_in_prog_lock);
675	}
676
677	if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
678		if (!frozen) {
679			if (!BLOCK_NUM_PKTS(pbd)) {
680				/* An empty block. Just refresh the timer. */
681				goto refresh_timer;
682			}
683			prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
684			if (!prb_dispatch_next_block(pkc, po))
685				goto refresh_timer;
686			else
687				goto out;
688		} else {
689			/* Case 1. Queue was frozen because user-space was
690			 *	   lagging behind.
691			 */
692			if (prb_curr_blk_in_use(pbd)) {
693				/*
694				 * Ok, user-space is still behind.
695				 * So just refresh the timer.
696				 */
697				goto refresh_timer;
698			} else {
699			       /* Case 2. The queue was frozen, user-space caught up,
700				* now the link went idle and the timer fired.
701				* We don't have a block to close, so we open this
702				* block and restart the timer.
703				* Opening a block thaws the queue and restarts the timer;
704				* thawing/timer-refresh is a side effect.
705				*/
706				prb_open_block(pkc, pbd);
707				goto out;
708			}
709		}
710	}
711
712refresh_timer:
713	_prb_refresh_rx_retire_blk_timer(pkc);
714
715out:
716	spin_unlock(&po->sk.sk_receive_queue.lock);
717}
718
719static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
720		struct tpacket_block_desc *pbd1, __u32 status)
721{
722	/* Flush everything minus the block header */
723
724#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
725	u8 *start, *end;
726
727	start = (u8 *)pbd1;
728
729	/* Skip the block header (we know the header WILL fit in 4K) */
730	start += PAGE_SIZE;
731
732	end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
733	for (; start < end; start += PAGE_SIZE)
734		flush_dcache_page(pgv_to_page(start));
735
736	smp_wmb();
737#endif
738
739	/* Now update the block status. */
740
741	BLOCK_STATUS(pbd1) = status;
742
743	/* Flush the block header */
744
745#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
746	start = (u8 *)pbd1;
747	flush_dcache_page(pgv_to_page(start));
748
749	smp_wmb();
750#endif
751}
752
753/*
754 * Side effect:
755 *
756 * 1) flush the block
757 * 2) Increment active_blk_num
758 *
759 * Note: We DON'T refresh the timer on purpose,
760 *	because almost always the next block will be opened.
761 */
762static void prb_close_block(struct tpacket_kbdq_core *pkc1,
763		struct tpacket_block_desc *pbd1,
764		struct packet_sock *po, unsigned int stat)
765{
766	__u32 status = TP_STATUS_USER | stat;
767
768	struct tpacket3_hdr *last_pkt;
769	struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
770	struct sock *sk = &po->sk;
771
772	if (atomic_read(&po->tp_drops))
773		status |= TP_STATUS_LOSING;
774
775	last_pkt = (struct tpacket3_hdr *)pkc1->prev;
776	last_pkt->tp_next_offset = 0;
777
778	/* Get the ts of the last pkt */
779	if (BLOCK_NUM_PKTS(pbd1)) {
780		h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
781		h1->ts_last_pkt.ts_nsec	= last_pkt->tp_nsec;
782	} else {
783		/* Ok, we tmo'd - so get the current time.
784		 *
785		 * It shouldn't really happen as we don't close empty
786		 * blocks. See prb_retire_rx_blk_timer_expired().
787		 */
788		struct timespec64 ts;
789		ktime_get_real_ts64(&ts);
790		h1->ts_last_pkt.ts_sec = ts.tv_sec;
791		h1->ts_last_pkt.ts_nsec	= ts.tv_nsec;
792	}
793
794	smp_wmb();
795
796	/* Flush the block */
797	prb_flush_block(pkc1, pbd1, status);
798
799	sk->sk_data_ready(sk);
800
801	pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
802}
803
804static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
805{
806	pkc->reset_pending_on_curr_blk = 0;
807}
808
809/*
810 * Side effect of opening a block:
811 *
812 * 1) prb_queue is thawed.
813 * 2) retire_blk_timer is refreshed.
814 *
815 */
816static void prb_open_block(struct tpacket_kbdq_core *pkc1,
817	struct tpacket_block_desc *pbd1)
818{
819	struct timespec64 ts;
820	struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
821
822	smp_rmb();
823
824	/* We could have just memset this but we would lose the
825	 * flexibility of making the priv area sticky.
826	 */
827
828	BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
829	BLOCK_NUM_PKTS(pbd1) = 0;
830	BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
831
832	ktime_get_real_ts64(&ts);
833
834	h1->ts_first_pkt.ts_sec = ts.tv_sec;
835	h1->ts_first_pkt.ts_nsec = ts.tv_nsec;
836
837	pkc1->pkblk_start = (char *)pbd1;
838	pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
839
840	BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
841	BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;
842
843	pbd1->version = pkc1->version;
844	pkc1->prev = pkc1->nxt_offset;
845	pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;
846
847	prb_thaw_queue(pkc1);
848	_prb_refresh_rx_retire_blk_timer(pkc1);
849
850	smp_wmb();
851}
852
853/*
854 * Queue freeze logic:
855 * 1) Assume tp_block_nr = 8 blocks.
856 * 2) At time 't0', user opens Rx ring.
857 * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
858 * 4) user-space is either sleeping or processing block '0'.
859 * 5) tpacket_rcv is currently filling block '7', since there is no space left,
860 *    it will close block-7, loop around and try to fill block '0'.
861 *    call-flow:
862 *    __packet_lookup_frame_in_block
863 *      prb_retire_current_block()
864 *      prb_dispatch_next_block()
865 *        |->(BLOCK_STATUS == USER) evaluates to true
866 *    5.1) Since block-0 is currently in-use, we just freeze the queue.
867 * 6) Now there are two cases:
868 *    6.1) Link goes idle right after the queue is frozen.
869 *         But remember, the last open_block() refreshed the timer.
870 *         When this timer expires, it will refresh itself so that we can
871 *         re-open block-0 in the near future.
872 *    6.2) Link is busy and keeps on receiving packets. This is a simple
873 *         case and __packet_lookup_frame_in_block will check if block-0
874 *         is free and can now be re-used.
875 */
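/* User-space half of the freeze/thaw cycle described above (sketch;
 * assumes the TPACKET_V3 ring from the PACKET_RX_RING example earlier,
 * mmap()ed at "ring"). A block stays in_use until its status is handed
 * back to the kernel, which is what lets step 6.2 re-use block-0:
 *
 *	struct tpacket_block_desc *pbd = (struct tpacket_block_desc *)
 *		((char *)ring + blk * (size_t)req.tp_block_size);
 *
 *	if (__atomic_load_n(&pbd->hdr.bh1.block_status, __ATOMIC_ACQUIRE) &
 *	    TP_STATUS_USER) {
 *		// walk pbd->hdr.bh1.num_pkts packets, starting at
 *		// offset_to_first_pkt and following tp_next_offset,
 *		// then release the block so the kernel can refill it:
 *		__atomic_store_n(&pbd->hdr.bh1.block_status,
 *				 TP_STATUS_KERNEL, __ATOMIC_RELEASE);
 *	}
 */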
876static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
877				  struct packet_sock *po)
878{
879	pkc->reset_pending_on_curr_blk = 1;
880	po->stats.stats3.tp_freeze_q_cnt++;
881}
882
883#define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))
884
885/*
886 * If the next block is free then we will dispatch it
887 * and return a good offset.
888 * Else, we will freeze the queue.
889 * So, caller must check the return value.
890 */
891static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
892		struct packet_sock *po)
893{
894	struct tpacket_block_desc *pbd;
895
896	smp_rmb();
897
898	/* 1. Get current block num */
899	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
900
901	/* 2. If this block is currently in_use then freeze the queue */
902	if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
903		prb_freeze_queue(pkc, po);
904		return NULL;
905	}
906
907	/*
908	 * 3.
909	 * open this block and return the offset where the first packet
910	 * needs to get stored.
911	 */
912	prb_open_block(pkc, pbd);
913	return (void *)pkc->nxt_offset;
914}
915
916static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
917		struct packet_sock *po, unsigned int status)
918{
919	struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
920
921	/* retire/close the current block */
922	if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
923		/*
924		 * Plug the case where copy_bits() is in progress on
925		 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
926		 * have space to copy the pkt in the current block and
927		 * called prb_retire_current_block()
928		 *
929		 * We don't need to worry about the TMO case because
930		 * the timer-handler already handled this case.
931		 */
932		if (!(status & TP_STATUS_BLK_TMO)) {
933			/* Waiting for skb_copy_bits to finish... */
934			write_lock(&pkc->blk_fill_in_prog_lock);
935			write_unlock(&pkc->blk_fill_in_prog_lock);
936		}
937		prb_close_block(pkc, pbd, po, status);
938		return;
939	}
940}
941
942static int prb_curr_blk_in_use(struct tpacket_block_desc *pbd)
943{
944	return TP_STATUS_USER & BLOCK_STATUS(pbd);
945}
946
947static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
948{
949	return pkc->reset_pending_on_curr_blk;
950}
951
952static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
953	__releases(&pkc->blk_fill_in_prog_lock)
954{
955	struct tpacket_kbdq_core *pkc  = GET_PBDQC_FROM_RB(rb);
956
957	read_unlock(&pkc->blk_fill_in_prog_lock);
958}
959
960static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
961			struct tpacket3_hdr *ppd)
962{
963	ppd->hv1.tp_rxhash = skb_get_hash(pkc->skb);
964}
965
966static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
967			struct tpacket3_hdr *ppd)
968{
969	ppd->hv1.tp_rxhash = 0;
970}
971
972static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
973			struct tpacket3_hdr *ppd)
974{
975	if (skb_vlan_tag_present(pkc->skb)) {
976		ppd->hv1.tp_vlan_tci = skb_vlan_tag_get(pkc->skb);
977		ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto);
978		ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
979	} else {
980		ppd->hv1.tp_vlan_tci = 0;
981		ppd->hv1.tp_vlan_tpid = 0;
982		ppd->tp_status = TP_STATUS_AVAILABLE;
983	}
984}
985
986static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
987			struct tpacket3_hdr *ppd)
988{
989	ppd->hv1.tp_padding = 0;
990	prb_fill_vlan_info(pkc, ppd);
991
992	if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
993		prb_fill_rxhash(pkc, ppd);
994	else
995		prb_clear_rxhash(pkc, ppd);
996}
997
998static void prb_fill_curr_block(char *curr,
999				struct tpacket_kbdq_core *pkc,
1000				struct tpacket_block_desc *pbd,
1001				unsigned int len)
1002	__acquires(&pkc->blk_fill_in_prog_lock)
1003{
1004	struct tpacket3_hdr *ppd;
1005
1006	ppd  = (struct tpacket3_hdr *)curr;
1007	ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
1008	pkc->prev = curr;
1009	pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
1010	BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
1011	BLOCK_NUM_PKTS(pbd) += 1;
1012	read_lock(&pkc->blk_fill_in_prog_lock);
1013	prb_run_all_ft_ops(pkc, ppd);
1014}
1015
1016/* Assumes caller has the sk->rx_queue.lock */
1017static void *__packet_lookup_frame_in_block(struct packet_sock *po,
1018					    struct sk_buff *skb,
1019					    unsigned int len
1020					    )
1021{
1022	struct tpacket_kbdq_core *pkc;
1023	struct tpacket_block_desc *pbd;
1024	char *curr, *end;
1025
1026	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
1027	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1028
1029	/* Queue is frozen when user space is lagging behind */
1030	if (prb_queue_frozen(pkc)) {
1031		/*
1032		 * Check if that last block, which caused the queue to freeze,
1033		 * is still in_use by user-space.
1034		 */
1035		if (prb_curr_blk_in_use(pbd)) {
1036			/* Can't record this packet */
1037			return NULL;
1038		} else {
1039			/*
1040			 * Ok, the block was released by user-space.
1041			 * Now let's open that block.
1042			 * opening a block also thaws the queue.
1043			 * Thawing is a side effect.
1044			 */
1045			prb_open_block(pkc, pbd);
1046		}
1047	}
1048
1049	smp_mb();
1050	curr = pkc->nxt_offset;
1051	pkc->skb = skb;
1052	end = (char *)pbd + pkc->kblk_size;
1053
1054	/* first try the current block */
1055	if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
1056		prb_fill_curr_block(curr, pkc, pbd, len);
1057		return (void *)curr;
1058	}
1059
1060	/* Ok, close the current block */
1061	prb_retire_current_block(pkc, po, 0);
1062
1063	/* Now, try to dispatch the next block */
1064	curr = (char *)prb_dispatch_next_block(pkc, po);
1065	if (curr) {
1066		pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1067		prb_fill_curr_block(curr, pkc, pbd, len);
1068		return (void *)curr;
1069	}
1070
1071	/*
1072	 * No free blocks are available. user_space hasn't caught up yet.
1073	 * Queue was just frozen and now this packet will get dropped.
1074	 */
1075	return NULL;
1076}
1077
1078static void *packet_current_rx_frame(struct packet_sock *po,
1079					    struct sk_buff *skb,
1080					    int status, unsigned int len)
1081{
1082	char *curr = NULL;
1083	switch (po->tp_version) {
1084	case TPACKET_V1:
1085	case TPACKET_V2:
1086		curr = packet_lookup_frame(po, &po->rx_ring,
1087					po->rx_ring.head, status);
1088		return curr;
1089	case TPACKET_V3:
1090		return __packet_lookup_frame_in_block(po, skb, len);
1091	default:
1092		WARN(1, "TPACKET version not supported\n");
1093		BUG();
1094		return NULL;
1095	}
1096}
1097
1098static void *prb_lookup_block(const struct packet_sock *po,
1099			      const struct packet_ring_buffer *rb,
1100			      unsigned int idx,
1101			      int status)
1102{
1103	struct tpacket_kbdq_core *pkc  = GET_PBDQC_FROM_RB(rb);
1104	struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);
1105
1106	if (status != BLOCK_STATUS(pbd))
1107		return NULL;
1108	return pbd;
1109}
1110
1111static int prb_previous_blk_num(struct packet_ring_buffer *rb)
1112{
1113	unsigned int prev;
1114	if (rb->prb_bdqc.kactive_blk_num)
1115		prev = rb->prb_bdqc.kactive_blk_num-1;
1116	else
1117		prev = rb->prb_bdqc.knum_blocks-1;
1118	return prev;
1119}
1120
1121/* Assumes caller has held the rx_queue.lock */
1122static void *__prb_previous_block(struct packet_sock *po,
1123					 struct packet_ring_buffer *rb,
1124					 int status)
1125{
1126	unsigned int previous = prb_previous_blk_num(rb);
1127	return prb_lookup_block(po, rb, previous, status);
1128}
1129
1130static void *packet_previous_rx_frame(struct packet_sock *po,
1131					     struct packet_ring_buffer *rb,
1132					     int status)
1133{
1134	if (po->tp_version <= TPACKET_V2)
1135		return packet_previous_frame(po, rb, status);
1136
1137	return __prb_previous_block(po, rb, status);
1138}
1139
1140static void packet_increment_rx_head(struct packet_sock *po,
1141					    struct packet_ring_buffer *rb)
1142{
1143	switch (po->tp_version) {
1144	case TPACKET_V1:
1145	case TPACKET_V2:
1146		return packet_increment_head(rb);
1147	case TPACKET_V3:
1148	default:
1149		WARN(1, "TPACKET version not supported.\n");
1150		BUG();
1151		return;
1152	}
1153}
1154
1155static void *packet_previous_frame(struct packet_sock *po,
1156		struct packet_ring_buffer *rb,
1157		int status)
1158{
1159	unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
1160	return packet_lookup_frame(po, rb, previous, status);
1161}
1162
1163static void packet_increment_head(struct packet_ring_buffer *buff)
1164{
1165	buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
1166}
1167
1168static void packet_inc_pending(struct packet_ring_buffer *rb)
1169{
1170	this_cpu_inc(*rb->pending_refcnt);
1171}
1172
1173static void packet_dec_pending(struct packet_ring_buffer *rb)
1174{
1175	this_cpu_dec(*rb->pending_refcnt);
1176}
1177
1178static unsigned int packet_read_pending(const struct packet_ring_buffer *rb)
1179{
1180	unsigned int refcnt = 0;
1181	int cpu;
1182
1183	/* We don't use pending refcount in rx_ring. */
1184	if (rb->pending_refcnt == NULL)
1185		return 0;
1186
1187	for_each_possible_cpu(cpu)
1188		refcnt += *per_cpu_ptr(rb->pending_refcnt, cpu);
1189
1190	return refcnt;
1191}
1192
1193static int packet_alloc_pending(struct packet_sock *po)
1194{
1195	po->rx_ring.pending_refcnt = NULL;
1196
1197	po->tx_ring.pending_refcnt = alloc_percpu(unsigned int);
1198	if (unlikely(po->tx_ring.pending_refcnt == NULL))
1199		return -ENOBUFS;
1200
1201	return 0;
1202}
1203
1204static void packet_free_pending(struct packet_sock *po)
1205{
1206	free_percpu(po->tx_ring.pending_refcnt);
1207}
1208
1209#define ROOM_POW_OFF	2
1210#define ROOM_NONE	0x0
1211#define ROOM_LOW	0x1
1212#define ROOM_NORMAL	0x2
1213
1214static bool __tpacket_has_room(const struct packet_sock *po, int pow_off)
1215{
1216	int idx, len;
1217
1218	len = READ_ONCE(po->rx_ring.frame_max) + 1;
1219	idx = READ_ONCE(po->rx_ring.head);
1220	if (pow_off)
1221		idx += len >> pow_off;
1222	if (idx >= len)
1223		idx -= len;
1224	return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
1225}
1226
1227static bool __tpacket_v3_has_room(const struct packet_sock *po, int pow_off)
1228{
1229	int idx, len;
1230
1231	len = READ_ONCE(po->rx_ring.prb_bdqc.knum_blocks);
1232	idx = READ_ONCE(po->rx_ring.prb_bdqc.kactive_blk_num);
1233	if (pow_off)
1234		idx += len >> pow_off;
1235	if (idx >= len)
1236		idx -= len;
1237	return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
1238}
1239
1240static int __packet_rcv_has_room(const struct packet_sock *po,
1241				 const struct sk_buff *skb)
1242{
1243	const struct sock *sk = &po->sk;
1244	int ret = ROOM_NONE;
1245
1246	if (po->prot_hook.func != tpacket_rcv) {
1247		int rcvbuf = READ_ONCE(sk->sk_rcvbuf);
1248		int avail = rcvbuf - atomic_read(&sk->sk_rmem_alloc)
1249				   - (skb ? skb->truesize : 0);
1250
1251		if (avail > (rcvbuf >> ROOM_POW_OFF))
1252			return ROOM_NORMAL;
1253		else if (avail > 0)
1254			return ROOM_LOW;
1255		else
1256			return ROOM_NONE;
1257	}
1258
1259	if (po->tp_version == TPACKET_V3) {
1260		if (__tpacket_v3_has_room(po, ROOM_POW_OFF))
1261			ret = ROOM_NORMAL;
1262		else if (__tpacket_v3_has_room(po, 0))
1263			ret = ROOM_LOW;
1264	} else {
1265		if (__tpacket_has_room(po, ROOM_POW_OFF))
1266			ret = ROOM_NORMAL;
1267		else if (__tpacket_has_room(po, 0))
1268			ret = ROOM_LOW;
1269	}
1270
1271	return ret;
1272}
1273
1274static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
1275{
1276	int pressure, ret;
1277
1278	ret = __packet_rcv_has_room(po, skb);
1279	pressure = ret != ROOM_NORMAL;
1280
1281	if (READ_ONCE(po->pressure) != pressure)
1282		WRITE_ONCE(po->pressure, pressure);
1283
1284	return ret;
1285}
1286
1287static void packet_rcv_try_clear_pressure(struct packet_sock *po)
1288{
1289	if (READ_ONCE(po->pressure) &&
1290	    __packet_rcv_has_room(po, NULL) == ROOM_NORMAL)
1291		WRITE_ONCE(po->pressure,  0);
1292}
1293
1294static void packet_sock_destruct(struct sock *sk)
1295{
1296	skb_queue_purge(&sk->sk_error_queue);
1297
1298	WARN_ON(atomic_read(&sk->sk_rmem_alloc));
1299	WARN_ON(refcount_read(&sk->sk_wmem_alloc));
1300
1301	if (!sock_flag(sk, SOCK_DEAD)) {
1302		pr_err("Attempt to release alive packet socket: %p\n", sk);
1303		return;
1304	}
1305
1306	sk_refcnt_debug_dec(sk);
1307}
1308
1309static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb)
1310{
1311	u32 *history = po->rollover->history;
1312	u32 victim, rxhash;
1313	int i, count = 0;
1314
1315	rxhash = skb_get_hash(skb);
1316	for (i = 0; i < ROLLOVER_HLEN; i++)
1317		if (READ_ONCE(history[i]) == rxhash)
1318			count++;
1319
1320	victim = prandom_u32() % ROLLOVER_HLEN;
1321
1322	/* Avoid dirtying the cache line if possible */
1323	if (READ_ONCE(history[victim]) != rxhash)
1324		WRITE_ONCE(history[victim], rxhash);
1325
1326	return count > (ROLLOVER_HLEN >> 1);
1327}
1328
1329static unsigned int fanout_demux_hash(struct packet_fanout *f,
1330				      struct sk_buff *skb,
1331				      unsigned int num)
1332{
1333	return reciprocal_scale(__skb_get_hash_symmetric(skb), num);
1334}
1335
1336static unsigned int fanout_demux_lb(struct packet_fanout *f,
1337				    struct sk_buff *skb,
1338				    unsigned int num)
1339{
1340	unsigned int val = atomic_inc_return(&f->rr_cur);
1341
1342	return val % num;
1343}
1344
1345static unsigned int fanout_demux_cpu(struct packet_fanout *f,
1346				     struct sk_buff *skb,
1347				     unsigned int num)
1348{
1349	return smp_processor_id() % num;
1350}
1351
1352static unsigned int fanout_demux_rnd(struct packet_fanout *f,
1353				     struct sk_buff *skb,
1354				     unsigned int num)
1355{
1356	return prandom_u32_max(num);
1357}
1358
1359static unsigned int fanout_demux_rollover(struct packet_fanout *f,
1360					  struct sk_buff *skb,
1361					  unsigned int idx, bool try_self,
1362					  unsigned int num)
1363{
1364	struct packet_sock *po, *po_next, *po_skip = NULL;
1365	unsigned int i, j, room = ROOM_NONE;
1366
1367	po = pkt_sk(rcu_dereference(f->arr[idx]));
1368
1369	if (try_self) {
1370		room = packet_rcv_has_room(po, skb);
1371		if (room == ROOM_NORMAL ||
1372		    (room == ROOM_LOW && !fanout_flow_is_huge(po, skb)))
1373			return idx;
1374		po_skip = po;
1375	}
1376
1377	i = j = min_t(int, po->rollover->sock, num - 1);
1378	do {
1379		po_next = pkt_sk(rcu_dereference(f->arr[i]));
1380		if (po_next != po_skip && !READ_ONCE(po_next->pressure) &&
1381		    packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) {
1382			if (i != j)
1383				po->rollover->sock = i;
1384			atomic_long_inc(&po->rollover->num);
1385			if (room == ROOM_LOW)
1386				atomic_long_inc(&po->rollover->num_huge);
1387			return i;
1388		}
1389
1390		if (++i == num)
1391			i = 0;
1392	} while (i != j);
1393
1394	atomic_long_inc(&po->rollover->num_failed);
1395	return idx;
1396}
1397
1398static unsigned int fanout_demux_qm(struct packet_fanout *f,
1399				    struct sk_buff *skb,
1400				    unsigned int num)
1401{
1402	return skb_get_queue_mapping(skb) % num;
1403}
1404
1405static unsigned int fanout_demux_bpf(struct packet_fanout *f,
1406				     struct sk_buff *skb,
1407				     unsigned int num)
1408{
1409	struct bpf_prog *prog;
1410	unsigned int ret = 0;
1411
1412	rcu_read_lock();
1413	prog = rcu_dereference(f->bpf_prog);
1414	if (prog)
1415		ret = bpf_prog_run_clear_cb(prog, skb) % num;
1416	rcu_read_unlock();
1417
1418	return ret;
1419}
1420
1421static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
1422{
1423	return f->flags & (flag >> 8);
1424}
1425
1426static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
1427			     struct packet_type *pt, struct net_device *orig_dev)
1428{
1429	struct packet_fanout *f = pt->af_packet_priv;
1430	unsigned int num = READ_ONCE(f->num_members);
1431	struct net *net = read_pnet(&f->net);
1432	struct packet_sock *po;
1433	unsigned int idx;
1434
1435	if (!net_eq(dev_net(dev), net) || !num) {
1436		kfree_skb(skb);
1437		return 0;
1438	}
1439
1440	if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) {
1441		skb = ip_check_defrag(net, skb, IP_DEFRAG_AF_PACKET);
1442		if (!skb)
1443			return 0;
1444	}
1445	switch (f->type) {
1446	case PACKET_FANOUT_HASH:
1447	default:
1448		idx = fanout_demux_hash(f, skb, num);
1449		break;
1450	case PACKET_FANOUT_LB:
1451		idx = fanout_demux_lb(f, skb, num);
1452		break;
1453	case PACKET_FANOUT_CPU:
1454		idx = fanout_demux_cpu(f, skb, num);
1455		break;
1456	case PACKET_FANOUT_RND:
1457		idx = fanout_demux_rnd(f, skb, num);
1458		break;
1459	case PACKET_FANOUT_QM:
1460		idx = fanout_demux_qm(f, skb, num);
1461		break;
1462	case PACKET_FANOUT_ROLLOVER:
1463		idx = fanout_demux_rollover(f, skb, 0, false, num);
1464		break;
1465	case PACKET_FANOUT_CBPF:
1466	case PACKET_FANOUT_EBPF:
1467		idx = fanout_demux_bpf(f, skb, num);
1468		break;
1469	}
1470
1471	if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER))
1472		idx = fanout_demux_rollover(f, skb, idx, true, num);
1473
1474	po = pkt_sk(rcu_dereference(f->arr[idx]));
1475	return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
1476}
1477
1478DEFINE_MUTEX(fanout_mutex);
1479EXPORT_SYMBOL_GPL(fanout_mutex);
1480static LIST_HEAD(fanout_list);
1481static u16 fanout_next_id;
1482
1483static void __fanout_link(struct sock *sk, struct packet_sock *po)
1484{
1485	struct packet_fanout *f = po->fanout;
1486
1487	spin_lock(&f->lock);
1488	rcu_assign_pointer(f->arr[f->num_members], sk);
1489	smp_wmb();
1490	f->num_members++;
1491	if (f->num_members == 1)
1492		dev_add_pack(&f->prot_hook);
1493	spin_unlock(&f->lock);
1494}
1495
1496static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
1497{
1498	struct packet_fanout *f = po->fanout;
1499	int i;
1500
1501	spin_lock(&f->lock);
1502	for (i = 0; i < f->num_members; i++) {
1503		if (rcu_dereference_protected(f->arr[i],
1504					      lockdep_is_held(&f->lock)) == sk)
1505			break;
1506	}
1507	BUG_ON(i >= f->num_members);
1508	rcu_assign_pointer(f->arr[i],
1509			   rcu_dereference_protected(f->arr[f->num_members - 1],
1510						     lockdep_is_held(&f->lock)));
1511	f->num_members--;
1512	if (f->num_members == 0)
1513		__dev_remove_pack(&f->prot_hook);
1514	spin_unlock(&f->lock);
1515}
1516
1517static bool match_fanout_group(struct packet_type *ptype, struct sock *sk)
1518{
1519	if (sk->sk_family != PF_PACKET)
1520		return false;
1521
1522	return ptype->af_packet_priv == pkt_sk(sk)->fanout;
1523}
1524
1525static void fanout_init_data(struct packet_fanout *f)
1526{
1527	switch (f->type) {
1528	case PACKET_FANOUT_LB:
1529		atomic_set(&f->rr_cur, 0);
1530		break;
1531	case PACKET_FANOUT_CBPF:
1532	case PACKET_FANOUT_EBPF:
1533		RCU_INIT_POINTER(f->bpf_prog, NULL);
1534		break;
1535	}
1536}
1537
1538static void __fanout_set_data_bpf(struct packet_fanout *f, struct bpf_prog *new)
1539{
1540	struct bpf_prog *old;
1541
1542	spin_lock(&f->lock);
1543	old = rcu_dereference_protected(f->bpf_prog, lockdep_is_held(&f->lock));
1544	rcu_assign_pointer(f->bpf_prog, new);
1545	spin_unlock(&f->lock);
1546
1547	if (old) {
1548		synchronize_net();
1549		bpf_prog_destroy(old);
1550	}
1551}
1552
1553static int fanout_set_data_cbpf(struct packet_sock *po, sockptr_t data,
1554				unsigned int len)
1555{
1556	struct bpf_prog *new;
1557	struct sock_fprog fprog;
1558	int ret;
1559
1560	if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
1561		return -EPERM;
1562
1563	ret = copy_bpf_fprog_from_user(&fprog, data, len);
1564	if (ret)
1565		return ret;
1566
1567	ret = bpf_prog_create_from_user(&new, &fprog, NULL, false);
1568	if (ret)
1569		return ret;
1570
1571	__fanout_set_data_bpf(po->fanout, new);
1572	return 0;
1573}
1574
1575static int fanout_set_data_ebpf(struct packet_sock *po, sockptr_t data,
1576				unsigned int len)
1577{
1578	struct bpf_prog *new;
1579	u32 fd;
1580
1581	if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
1582		return -EPERM;
1583	if (len != sizeof(fd))
1584		return -EINVAL;
1585	if (copy_from_sockptr(&fd, data, len))
1586		return -EFAULT;
1587
1588	new = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
1589	if (IS_ERR(new))
1590		return PTR_ERR(new);
1591
1592	__fanout_set_data_bpf(po->fanout, new);
1593	return 0;
1594}
1595
1596static int fanout_set_data(struct packet_sock *po, sockptr_t data,
1597			   unsigned int len)
1598{
1599	switch (po->fanout->type) {
1600	case PACKET_FANOUT_CBPF:
1601		return fanout_set_data_cbpf(po, data, len);
1602	case PACKET_FANOUT_EBPF:
1603		return fanout_set_data_ebpf(po, data, len);
1604	default:
1605		return -EINVAL;
1606	}
1607}
1608
1609static void fanout_release_data(struct packet_fanout *f)
1610{
1611	switch (f->type) {
1612	case PACKET_FANOUT_CBPF:
1613	case PACKET_FANOUT_EBPF:
1614		__fanout_set_data_bpf(f, NULL);
1615	}
1616}
1617
1618static bool __fanout_id_is_free(struct sock *sk, u16 candidate_id)
1619{
1620	struct packet_fanout *f;
1621
1622	list_for_each_entry(f, &fanout_list, list) {
1623		if (f->id == candidate_id &&
1624		    read_pnet(&f->net) == sock_net(sk)) {
1625			return false;
1626		}
1627	}
1628	return true;
1629}
1630
1631static bool fanout_find_new_id(struct sock *sk, u16 *new_id)
1632{
1633	u16 id = fanout_next_id;
1634
1635	do {
1636		if (__fanout_id_is_free(sk, id)) {
1637			*new_id = id;
1638			fanout_next_id = id + 1;
1639			return true;
1640		}
1641
1642		id++;
1643	} while (id != fanout_next_id);
1644
1645	return false;
1646}
1647
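/* How user space produces args->id and args->type_flags for fanout_add()
 * below (sketch; the classic 4-byte form described in packet(7) -- the
 * longer struct fanout_args form additionally carries max_num_members;
 * "group_id" and "fd" are placeholders). The group id sits in the low
 * 16 bits and the mode plus flag bits in the high 16 bits, which is why
 * the code splits type_flags into a low "type" byte and a high "flags"
 * byte:
 *
 *	int fanout_arg = (group_id & 0xffff) |
 *			 ((PACKET_FANOUT_HASH |
 *			   PACKET_FANOUT_FLAG_ROLLOVER) << 16);
 *	setsockopt(fd, SOL_PACKET, PACKET_FANOUT,
 *		   &fanout_arg, sizeof(fanout_arg));
 */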
1648static int fanout_add(struct sock *sk, struct fanout_args *args)
1649{
1650	struct packet_rollover *rollover = NULL;
1651	struct packet_sock *po = pkt_sk(sk);
1652	u16 type_flags = args->type_flags;
1653	struct packet_fanout *f, *match;
1654	u8 type = type_flags & 0xff;
1655	u8 flags = type_flags >> 8;
1656	u16 id = args->id;
1657	int err;
1658
1659	switch (type) {
1660	case PACKET_FANOUT_ROLLOVER:
1661		if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)
1662			return -EINVAL;
1663	case PACKET_FANOUT_HASH:
1664	case PACKET_FANOUT_LB:
1665	case PACKET_FANOUT_CPU:
1666	case PACKET_FANOUT_RND:
1667	case PACKET_FANOUT_QM:
1668	case PACKET_FANOUT_CBPF:
1669	case PACKET_FANOUT_EBPF:
1670		break;
1671	default:
1672		return -EINVAL;
1673	}
1674
1675	mutex_lock(&fanout_mutex);
1676
1677	err = -EALREADY;
1678	if (po->fanout)
1679		goto out;
1680
1681	if (type == PACKET_FANOUT_ROLLOVER ||
1682	    (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)) {
1683		err = -ENOMEM;
1684		rollover = kzalloc(sizeof(*rollover), GFP_KERNEL);
1685		if (!rollover)
1686			goto out;
1687		atomic_long_set(&rollover->num, 0);
1688		atomic_long_set(&rollover->num_huge, 0);
1689		atomic_long_set(&rollover->num_failed, 0);
1690	}
1691
1692	if (type_flags & PACKET_FANOUT_FLAG_UNIQUEID) {
1693		if (id != 0) {
1694			err = -EINVAL;
1695			goto out;
1696		}
1697		if (!fanout_find_new_id(sk, &id)) {
1698			err = -ENOMEM;
1699			goto out;
1700		}
1701		/* ephemeral flag for the first socket in the group: drop it */
1702		flags &= ~(PACKET_FANOUT_FLAG_UNIQUEID >> 8);
1703	}
1704
1705	match = NULL;
1706	list_for_each_entry(f, &fanout_list, list) {
1707		if (f->id == id &&
1708		    read_pnet(&f->net) == sock_net(sk)) {
1709			match = f;
1710			break;
1711		}
1712	}
1713	err = -EINVAL;
1714	if (match) {
1715		if (match->flags != flags)
1716			goto out;
1717		if (args->max_num_members &&
1718		    args->max_num_members != match->max_num_members)
1719			goto out;
1720	} else {
1721		if (args->max_num_members > PACKET_FANOUT_MAX)
1722			goto out;
1723		if (!args->max_num_members)
1724			/* legacy PACKET_FANOUT_MAX */
1725			args->max_num_members = 256;
1726		err = -ENOMEM;
1727		match = kvzalloc(struct_size(match, arr, args->max_num_members),
1728				 GFP_KERNEL);
1729		if (!match)
1730			goto out;
1731		write_pnet(&match->net, sock_net(sk));
1732		match->id = id;
1733		match->type = type;
1734		match->flags = flags;
1735		INIT_LIST_HEAD(&match->list);
1736		spin_lock_init(&match->lock);
1737		refcount_set(&match->sk_ref, 0);
1738		fanout_init_data(match);
1739		match->prot_hook.type = po->prot_hook.type;
1740		match->prot_hook.dev = po->prot_hook.dev;
1741		match->prot_hook.func = packet_rcv_fanout;
1742		match->prot_hook.af_packet_priv = match;
1743		match->prot_hook.af_packet_net = read_pnet(&match->net);
1744		match->prot_hook.id_match = match_fanout_group;
1745		match->max_num_members = args->max_num_members;
1746		list_add(&match->list, &fanout_list);
1747	}
1748	err = -EINVAL;
1749
1750	spin_lock(&po->bind_lock);
1751	if (po->running &&
1752	    match->type == type &&
1753	    match->prot_hook.type == po->prot_hook.type &&
1754	    match->prot_hook.dev == po->prot_hook.dev) {
1755		err = -ENOSPC;
1756		if (refcount_read(&match->sk_ref) < match->max_num_members) {
1757			__dev_remove_pack(&po->prot_hook);
1758
1759			/* Paired with packet_setsockopt(PACKET_FANOUT_DATA) */
1760			WRITE_ONCE(po->fanout, match);
1761
1762			po->rollover = rollover;
1763			rollover = NULL;
1764			refcount_set(&match->sk_ref, refcount_read(&match->sk_ref) + 1);
1765			__fanout_link(sk, po);
1766			err = 0;
1767		}
1768	}
1769	spin_unlock(&po->bind_lock);
1770
1771	if (err && !refcount_read(&match->sk_ref)) {
1772		list_del(&match->list);
1773		kvfree(match);
1774	}
1775
1776out:
1777	kfree(rollover);
1778	mutex_unlock(&fanout_mutex);
1779	return err;
1780}
1781
1782/* If pkt_sk(sk)->fanout->sk_ref is zero, this function removes
1783 * pkt_sk(sk)->fanout from fanout_list and returns pkt_sk(sk)->fanout.
1784 * It is the responsibility of the caller to call fanout_release_data() and
1785 * free the returned packet_fanout (after synchronize_net())
1786 */
1787static struct packet_fanout *fanout_release(struct sock *sk)
1788{
1789	struct packet_sock *po = pkt_sk(sk);
1790	struct packet_fanout *f;
1791
1792	mutex_lock(&fanout_mutex);
1793	f = po->fanout;
1794	if (f) {
1795		po->fanout = NULL;
1796
1797		if (refcount_dec_and_test(&f->sk_ref))
1798			list_del(&f->list);
1799		else
1800			f = NULL;
1801	}
1802	mutex_unlock(&fanout_mutex);
1803
1804	return f;
1805}
1806
1807static bool packet_extra_vlan_len_allowed(const struct net_device *dev,
1808					  struct sk_buff *skb)
1809{
1810	/* Earlier code assumed this would be a VLAN pkt, double-check
1811	 * this now that we have the actual packet in hand. We can only
1812	 * do this check on Ethernet devices.
1813	 */
1814	if (unlikely(dev->type != ARPHRD_ETHER))
1815		return false;
1816
1817	skb_reset_mac_header(skb);
1818	return likely(eth_hdr(skb)->h_proto == htons(ETH_P_8021Q));
1819}
1820
1821static const struct proto_ops packet_ops;
1822
1823static const struct proto_ops packet_ops_spkt;
1824
1825static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
1826			   struct packet_type *pt, struct net_device *orig_dev)
1827{
1828	struct sock *sk;
1829	struct sockaddr_pkt *spkt;
1830
1831	/*
1832	 *	When we registered the protocol we saved the socket in the data
1833	 *	field for just this event.
1834	 */
1835
1836	sk = pt->af_packet_priv;
1837
1838	/*
1839	 *	Yank back the headers [hope the device set this
1840	 *	right or kerboom...]
1841	 *
1842	 *	Incoming packets have ll header pulled,
1843	 *	push it back.
1844	 *
1845	 *	For outgoing ones skb->data == skb_mac_header(skb)
1846	 *	so that this procedure is a no-op.
1847	 */
1848
1849	if (skb->pkt_type == PACKET_LOOPBACK)
1850		goto out;
1851
1852	if (!net_eq(dev_net(dev), sock_net(sk)))
1853		goto out;
1854
1855	skb = skb_share_check(skb, GFP_ATOMIC);
1856	if (skb == NULL)
1857		goto oom;
1858
1859	/* drop any routing info */
1860	skb_dst_drop(skb);
1861
1862	/* drop conntrack reference */
1863	nf_reset_ct(skb);
1864
1865	spkt = &PACKET_SKB_CB(skb)->sa.pkt;
1866
1867	skb_push(skb, skb->data - skb_mac_header(skb));
1868
1869	/*
1870	 *	The SOCK_PACKET socket receives _all_ frames.
1871	 */
1872
1873	spkt->spkt_family = dev->type;
1874	strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
1875	spkt->spkt_protocol = skb->protocol;
1876
1877	/*
1878	 *	Charge the memory to the socket. This is done specifically
1879	 *	to prevent sockets from using up all the memory.
1880	 */
1881
1882	if (sock_queue_rcv_skb(sk, skb) == 0)
1883		return 0;
1884
1885out:
1886	kfree_skb(skb);
1887oom:
1888	return 0;
1889}
1890
1891static void packet_parse_headers(struct sk_buff *skb, struct socket *sock)
1892{
1893	int depth;
1894
1895	if ((!skb->protocol || skb->protocol == htons(ETH_P_ALL)) &&
1896	    sock->type == SOCK_RAW) {
1897		skb_reset_mac_header(skb);
1898		skb->protocol = dev_parse_header_protocol(skb);
1899	}
1900
1901	/* Move network header to the right position for VLAN tagged packets */
1902	if (likely(skb->dev->type == ARPHRD_ETHER) &&
1903	    eth_type_vlan(skb->protocol) &&
1904	    vlan_get_protocol_and_depth(skb, skb->protocol, &depth) != 0)
1905		skb_set_network_header(skb, depth);
1906
1907	skb_probe_transport_header(skb);
1908}
1909
1910/*
1911 *	Output a raw packet to a device layer. This bypasses all the other
1912 *	protocol layers and you must therefore supply it with a complete frame
1913 */
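/*
 *	A user-space sketch of the legacy SOCK_PACKET path handled below
 *	(illustration only; "frame"/"frame_len" stand for a prebuilt frame
 *	including its link-layer header, and new code should prefer
 *	SOCK_RAW with struct sockaddr_ll instead):
 *
 *		int fd = socket(AF_PACKET, SOCK_PACKET, htons(ETH_P_ALL));
 *		struct sockaddr_pkt spkt = { .spkt_family = AF_PACKET };
 *
 *		strncpy((char *)spkt.spkt_device, "eth0",
 *			sizeof(spkt.spkt_device));
 *		spkt.spkt_protocol = htons(ETH_P_IP);
 *		sendto(fd, frame, frame_len, 0,
 *		       (struct sockaddr *)&spkt, sizeof(spkt));
 */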
1914
1915static int packet_sendmsg_spkt(struct socket *sock, struct msghdr *msg,
1916			       size_t len)
1917{
1918	struct sock *sk = sock->sk;
1919	DECLARE_SOCKADDR(struct sockaddr_pkt *, saddr, msg->msg_name);
1920	struct sk_buff *skb = NULL;
1921	struct net_device *dev;
1922	struct sockcm_cookie sockc;
1923	__be16 proto = 0;
1924	int err;
1925	int extra_len = 0;
1926
1927	/*
1928	 *	Get and verify the address.
1929	 */
1930
1931	if (saddr) {
1932		if (msg->msg_namelen < sizeof(struct sockaddr))
1933			return -EINVAL;
1934		if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
1935			proto = saddr->spkt_protocol;
1936	} else
1937		return -ENOTCONN;	/* SOCK_PACKET must be sent giving an address */
1938
1939	/*
1940	 *	Find the device first to size check it
1941	 */
1942
1943	saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;
1944retry:
1945	rcu_read_lock();
1946	dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
1947	err = -ENODEV;
1948	if (dev == NULL)
1949		goto out_unlock;
1950
1951	err = -ENETDOWN;
1952	if (!(dev->flags & IFF_UP))
1953		goto out_unlock;
1954
1955	/*
1956	 * You may not queue a frame bigger than the mtu. This is the lowest level
1957	 * raw protocol and you must do your own fragmentation at this level.
1958	 */
1959
1960	if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
1961		if (!netif_supports_nofcs(dev)) {
1962			err = -EPROTONOSUPPORT;
1963			goto out_unlock;
1964		}
1965		extra_len = 4; /* We're doing our own CRC */
1966	}
1967
1968	err = -EMSGSIZE;
1969	if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len)
1970		goto out_unlock;
1971
1972	if (!skb) {
1973		size_t reserved = LL_RESERVED_SPACE(dev);
1974		int tlen = dev->needed_tailroom;
1975		unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;
1976
1977		rcu_read_unlock();
1978		skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL);
1979		if (skb == NULL)
1980			return -ENOBUFS;
1981		/* FIXME: Save some space for broken drivers that write a hard
1982		 * header at transmission time by themselves. PPP is the notable
1983		 * one here. This should really be fixed at the driver level.
1984		 */
1985		skb_reserve(skb, reserved);
1986		skb_reset_network_header(skb);
1987
1988		/* Try to align data part correctly */
1989		if (hhlen) {
1990			skb->data -= hhlen;
1991			skb->tail -= hhlen;
1992			if (len < hhlen)
1993				skb_reset_network_header(skb);
1994		}
1995		err = memcpy_from_msg(skb_put(skb, len), msg, len);
1996		if (err)
1997			goto out_free;
1998		goto retry;
1999	}
2000
2001	if (!dev_validate_header(dev, skb->data, len) || !skb->len) {
2002		err = -EINVAL;
2003		goto out_unlock;
2004	}
2005	if (len > (dev->mtu + dev->hard_header_len + extra_len) &&
2006	    !packet_extra_vlan_len_allowed(dev, skb)) {
2007		err = -EMSGSIZE;
2008		goto out_unlock;
2009	}
2010
2011	sockcm_init(&sockc, sk);
2012	if (msg->msg_controllen) {
2013		err = sock_cmsg_send(sk, msg, &sockc);
2014		if (unlikely(err))
2015			goto out_unlock;
2016	}
2017
2018	skb->protocol = proto;
2019	skb->dev = dev;
2020	skb->priority = sk->sk_priority;
2021	skb->mark = sk->sk_mark;
2022	skb->tstamp = sockc.transmit_time;
2023
2024	skb_setup_tx_timestamp(skb, sockc.tsflags);
2025
2026	if (unlikely(extra_len == 4))
2027		skb->no_fcs = 1;
2028
2029	packet_parse_headers(skb, sock);
2030
2031	dev_queue_xmit(skb);
2032	rcu_read_unlock();
2033	return len;
2034
2035out_unlock:
2036	rcu_read_unlock();
2037out_free:
2038	kfree_skb(skb);
2039	return err;
2040}
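
/*
 * A minimal userspace sketch of driving the SOCK_PACKET path above with
 * sendto() (illustrative only and untested; "eth0" and the frame buffer are
 * hypothetical placeholders, and in practice the buffer must carry a fully
 * built link-layer frame: destination MAC, source MAC, EtherType, payload):
 *
 *	#include <sys/socket.h>
 *	#include <arpa/inet.h>
 *	#include <linux/if_packet.h>
 *	#include <linux/if_ether.h>
 *	#include <string.h>
 *
 *	int fd = socket(AF_PACKET, SOCK_PACKET, htons(ETH_P_ALL));
 *	unsigned char frame[ETH_ZLEN] = { 0 };
 *	struct sockaddr_pkt spkt;
 *
 *	memset(&spkt, 0, sizeof(spkt));
 *	strncpy((char *)spkt.spkt_device, "eth0", sizeof(spkt.spkt_device) - 1);
 *	spkt.spkt_protocol = htons(ETH_P_IP);
 *
 *	sendto(fd, frame, sizeof(frame), 0,
 *	       (struct sockaddr *)&spkt, sizeof(spkt));
 *
 * The address is mandatory (sendmsg() without one fails with -ENOTCONN
 * above), and passing the full sockaddr_pkt length is what lets
 * spkt_protocol be used as the frame's protocol.
 */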
2041
2042static unsigned int run_filter(struct sk_buff *skb,
2043			       const struct sock *sk,
2044			       unsigned int res)
2045{
2046	struct sk_filter *filter;
2047
2048	rcu_read_lock();
2049	filter = rcu_dereference(sk->sk_filter);
2050	if (filter != NULL)
2051		res = bpf_prog_run_clear_cb(filter->prog, skb);
2052	rcu_read_unlock();
2053
2054	return res;
2055}
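
/*
 * run_filter() executes whatever socket filter userspace attached (classic
 * BPF via SO_ATTACH_FILTER, or eBPF via SO_ATTACH_BPF); the returned value
 * caps the snapped length and 0 drops the packet. A minimal sketch of a
 * classic BPF filter that accepts only ARP frames (illustrative only; the
 * offsets assume an Ethernet device on a SOCK_RAW socket):
 *
 *	#include <sys/socket.h>
 *	#include <linux/filter.h>
 *	#include <linux/if_ether.h>
 *
 *	struct sock_filter code[] = {
 *		BPF_STMT(BPF_LD  | BPF_H   | BPF_ABS, 12),
 *		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, ETH_P_ARP, 0, 1),
 *		BPF_STMT(BPF_RET | BPF_K, 0xffffffff),
 *		BPF_STMT(BPF_RET | BPF_K, 0),
 *	};
 *	struct sock_fprog prog = {
 *		.len    = sizeof(code) / sizeof(code[0]),
 *		.filter = code,
 *	};
 *
 *	setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog));
 *
 * The program loads the EtherType at offset 12, returns 0xffffffff (keep the
 * whole frame) when it equals ETH_P_ARP, and returns 0 (drop) otherwise.
 */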
2056
2057static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb,
2058			   size_t *len)
2059{
2060	struct virtio_net_hdr vnet_hdr;
2061
2062	if (*len < sizeof(vnet_hdr))
2063		return -EINVAL;
2064	*len -= sizeof(vnet_hdr);
2065
2066	if (virtio_net_hdr_from_skb(skb, &vnet_hdr, vio_le(), true, 0))
2067		return -EINVAL;
2068
2069	return memcpy_to_msg(msg, (void *)&vnet_hdr, sizeof(vnet_hdr));
2070}
2071
2072/*
2073 * This function performs lazy skb cloning in the hope that most packets
2074 * are discarded by BPF.
2075 *
2076 * Note the tricky part: we DO mangle the shared skb! skb->data, skb->len
2077 * and skb->cb are mangled. It works because (and until) packets
2078 * falling here are owned by the current CPU. Output packets are cloned
2079 * by dev_queue_xmit_nit(), input packets are processed by net_bh
2080 * sequentially, so if we return the skb to its original state on exit,
2081 * we will not harm anyone.
2082 */
2083
2084static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
2085		      struct packet_type *pt, struct net_device *orig_dev)
2086{
2087	struct sock *sk;
2088	struct sockaddr_ll *sll;
2089	struct packet_sock *po;
2090	u8 *skb_head = skb->data;
2091	int skb_len = skb->len;
2092	unsigned int snaplen, res;
2093	bool is_drop_n_account = false;
2094
2095	if (skb->pkt_type == PACKET_LOOPBACK)
2096		goto drop;
2097
2098	sk = pt->af_packet_priv;
2099	po = pkt_sk(sk);
2100
2101	if (!net_eq(dev_net(dev), sock_net(sk)))
2102		goto drop;
2103
2104	skb->dev = dev;
2105
2106	if (dev_has_header(dev)) {
2107		/* The device has an explicit notion of LL header,
2108		 * exported to higher levels.
2109		 *
2110		 * Otherwise, the device hides the details of its frame
2111		 * structure, so the corresponding packet head is
2112		 * never delivered to the user.
2113		 */
2114		if (sk->sk_type != SOCK_DGRAM)
2115			skb_push(skb, skb->data - skb_mac_header(skb));
2116		else if (skb->pkt_type == PACKET_OUTGOING) {
2117			/* Special case: outgoing packets have ll header at head */
2118			skb_pull(skb, skb_network_offset(skb));
2119		}
2120	}
2121
2122	snaplen = skb->len;
2123
2124	res = run_filter(skb, sk, snaplen);
2125	if (!res)
2126		goto drop_n_restore;
2127	if (snaplen > res)
2128		snaplen = res;
2129
2130	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
2131		goto drop_n_acct;
2132
2133	if (skb_shared(skb)) {
2134		struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
2135		if (nskb == NULL)
2136			goto drop_n_acct;
2137
2138		if (skb_head != skb->data) {
2139			skb->data = skb_head;
2140			skb->len = skb_len;
2141		}
2142		consume_skb(skb);
2143		skb = nskb;
2144	}
2145
2146	sock_skb_cb_check_size(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8);
2147
2148	sll = &PACKET_SKB_CB(skb)->sa.ll;
2149	sll->sll_hatype = dev->type;
2150	sll->sll_pkttype = skb->pkt_type;
2151	if (unlikely(packet_sock_flag(po, PACKET_SOCK_ORIGDEV)))
2152		sll->sll_ifindex = orig_dev->ifindex;
2153	else
2154		sll->sll_ifindex = dev->ifindex;
2155
2156	sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
2157
2158	/* sll->sll_family and sll->sll_protocol are set in packet_recvmsg().
2159	 * Use their space for storing the original skb length.
2160	 */
2161	PACKET_SKB_CB(skb)->sa.origlen = skb->len;
2162
2163	if (pskb_trim(skb, snaplen))
2164		goto drop_n_acct;
2165
2166	skb_set_owner_r(skb, sk);
2167	skb->dev = NULL;
2168	skb_dst_drop(skb);
2169
2170	/* drop conntrack reference */
2171	nf_reset_ct(skb);
2172
2173	spin_lock(&sk->sk_receive_queue.lock);
2174	po->stats.stats1.tp_packets++;
2175	sock_skb_set_dropcount(sk, skb);
2176	__skb_queue_tail(&sk->sk_receive_queue, skb);
2177	spin_unlock(&sk->sk_receive_queue.lock);
2178	sk->sk_data_ready(sk);
2179	return 0;
2180
2181drop_n_acct:
2182	is_drop_n_account = true;
2183	atomic_inc(&po->tp_drops);
2184	atomic_inc(&sk->sk_drops);
2185
2186drop_n_restore:
2187	if (skb_head != skb->data && skb_shared(skb)) {
2188		skb->data = skb_head;
2189		skb->len = skb_len;
2190	}
2191drop:
2192	if (!is_drop_n_account)
2193		consume_skb(skb);
2194	else
2195		kfree_skb(skb);
2196	return 0;
2197}
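
/*
 * What packet_rcv() queues is what a plain (non-mmap) receiver sees. A
 * minimal userspace sketch of reading one frame and the metadata filled in
 * above (illustrative only; the buffer size is arbitrary):
 *
 *	#include <sys/socket.h>
 *	#include <linux/if_packet.h>
 *
 *	unsigned char buf[2048];
 *	struct sockaddr_ll from;
 *	socklen_t fromlen = sizeof(from);
 *
 *	ssize_t n = recvfrom(fd, buf, sizeof(buf), 0,
 *			     (struct sockaddr *)&from, &fromlen);
 *
 * On return, sll_ifindex identifies the receiving device (or the original
 * one when PACKET_ORIGDEV is set), sll_hatype is the ARPHRD_* type,
 * sll_pkttype is PACKET_HOST/PACKET_BROADCAST/..., and sll_halen/sll_addr
 * hold the link-layer source address parsed by dev_parse_header() above.
 */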
2198
2199static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
2200		       struct packet_type *pt, struct net_device *orig_dev)
2201{
2202	struct sock *sk;
2203	struct packet_sock *po;
2204	struct sockaddr_ll *sll;
2205	union tpacket_uhdr h;
2206	u8 *skb_head = skb->data;
2207	int skb_len = skb->len;
2208	unsigned int snaplen, res;
2209	unsigned long status = TP_STATUS_USER;
2210	unsigned short macoff, hdrlen;
2211	unsigned int netoff;
2212	struct sk_buff *copy_skb = NULL;
2213	struct timespec64 ts;
2214	__u32 ts_status;
2215	bool is_drop_n_account = false;
2216	unsigned int slot_id = 0;
2217	bool do_vnet = false;
2218
2219	/* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
2220	 * We can add members to them up to the current aligned size without forcing
2221	 * userspace to call getsockopt(..., PACKET_HDRLEN, ...).
2222	 */
2223	BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h2)) != 32);
2224	BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h3)) != 48);
2225
2226	if (skb->pkt_type == PACKET_LOOPBACK)
2227		goto drop;
2228
2229	sk = pt->af_packet_priv;
2230	po = pkt_sk(sk);
2231
2232	if (!net_eq(dev_net(dev), sock_net(sk)))
2233		goto drop;
2234
2235	if (dev_has_header(dev)) {
2236		if (sk->sk_type != SOCK_DGRAM)
2237			skb_push(skb, skb->data - skb_mac_header(skb));
2238		else if (skb->pkt_type == PACKET_OUTGOING) {
2239			/* Special case: outgoing packets have ll header at head */
2240			skb_pull(skb, skb_network_offset(skb));
2241		}
2242	}
2243
2244	snaplen = skb->len;
2245
2246	res = run_filter(skb, sk, snaplen);
2247	if (!res)
2248		goto drop_n_restore;
2249
2250	/* If we are flooded, just give up */
2251	if (__packet_rcv_has_room(po, skb) == ROOM_NONE) {
2252		atomic_inc(&po->tp_drops);
2253		goto drop_n_restore;
2254	}
2255
2256	if (skb->ip_summed == CHECKSUM_PARTIAL)
2257		status |= TP_STATUS_CSUMNOTREADY;
2258	else if (skb->pkt_type != PACKET_OUTGOING &&
2259		 skb_csum_unnecessary(skb))
2260		status |= TP_STATUS_CSUM_VALID;
2261
2262	if (snaplen > res)
2263		snaplen = res;
2264
2265	if (sk->sk_type == SOCK_DGRAM) {
2266		macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
2267				  po->tp_reserve;
2268	} else {
2269		unsigned int maclen = skb_network_offset(skb);
2270		netoff = TPACKET_ALIGN(po->tp_hdrlen +
2271				       (maclen < 16 ? 16 : maclen)) +
2272				       po->tp_reserve;
2273		if (po->has_vnet_hdr) {
2274			netoff += sizeof(struct virtio_net_hdr);
2275			do_vnet = true;
2276		}
2277		macoff = netoff - maclen;
2278	}
2279	if (netoff > USHRT_MAX) {
2280		atomic_inc(&po->tp_drops);
2281		goto drop_n_restore;
2282	}
2283	if (po->tp_version <= TPACKET_V2) {
2284		if (macoff + snaplen > po->rx_ring.frame_size) {
2285			if (po->copy_thresh &&
2286			    atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
2287				if (skb_shared(skb)) {
2288					copy_skb = skb_clone(skb, GFP_ATOMIC);
2289				} else {
2290					copy_skb = skb_get(skb);
2291					skb_head = skb->data;
2292				}
2293				if (copy_skb) {
2294					memset(&PACKET_SKB_CB(copy_skb)->sa.ll, 0,
2295					       sizeof(PACKET_SKB_CB(copy_skb)->sa.ll));
2296					skb_set_owner_r(copy_skb, sk);
2297				}
2298			}
2299			snaplen = po->rx_ring.frame_size - macoff;
2300			if ((int)snaplen < 0) {
2301				snaplen = 0;
2302				do_vnet = false;
2303			}
2304		}
2305	} else if (unlikely(macoff + snaplen >
2306			    GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) {
2307		u32 nval;
2308
2309		nval = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len - macoff;
2310		pr_err_once("tpacket_rcv: packet too big, clamped from %u to %u. macoff=%u\n",
2311			    snaplen, nval, macoff);
2312		snaplen = nval;
2313		if (unlikely((int)snaplen < 0)) {
2314			snaplen = 0;
2315			macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len;
2316			do_vnet = false;
2317		}
2318	}
2319	spin_lock(&sk->sk_receive_queue.lock);
2320	h.raw = packet_current_rx_frame(po, skb,
2321					TP_STATUS_KERNEL, (macoff+snaplen));
2322	if (!h.raw)
2323		goto drop_n_account;
2324
2325	if (po->tp_version <= TPACKET_V2) {
2326		slot_id = po->rx_ring.head;
2327		if (test_bit(slot_id, po->rx_ring.rx_owner_map))
2328			goto drop_n_account;
2329		__set_bit(slot_id, po->rx_ring.rx_owner_map);
2330	}
2331
2332	if (do_vnet &&
2333	    virtio_net_hdr_from_skb(skb, h.raw + macoff -
2334				    sizeof(struct virtio_net_hdr),
2335				    vio_le(), true, 0)) {
2336		if (po->tp_version == TPACKET_V3)
2337			prb_clear_blk_fill_status(&po->rx_ring);
2338		goto drop_n_account;
2339	}
2340
2341	if (po->tp_version <= TPACKET_V2) {
2342		packet_increment_rx_head(po, &po->rx_ring);
2343	/*
2344	 * LOSING will be reported until you read the stats,
2345	 * because it's COR - Clear On Read.
2346	 * Anyway, this is done for V1/V2 only, as V3 doesn't need it
2347	 * at the packet level.
2348	 */
2349		if (atomic_read(&po->tp_drops))
2350			status |= TP_STATUS_LOSING;
2351	}
2352
2353	po->stats.stats1.tp_packets++;
2354	if (copy_skb) {
2355		status |= TP_STATUS_COPY;
2356		__skb_queue_tail(&sk->sk_receive_queue, copy_skb);
2357	}
2358	spin_unlock(&sk->sk_receive_queue.lock);
2359
2360	skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
2361
2362	/* Always timestamp; prefer an existing software timestamp taken
2363	 * closer to the time of capture.
2364	 */
2365	ts_status = tpacket_get_timestamp(skb, &ts,
2366					  po->tp_tstamp | SOF_TIMESTAMPING_SOFTWARE);
2367	if (!ts_status)
2368		ktime_get_real_ts64(&ts);
2369
2370	status |= ts_status;
2371
2372	switch (po->tp_version) {
2373	case TPACKET_V1:
2374		h.h1->tp_len = skb->len;
2375		h.h1->tp_snaplen = snaplen;
2376		h.h1->tp_mac = macoff;
2377		h.h1->tp_net = netoff;
2378		h.h1->tp_sec = ts.tv_sec;
2379		h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
2380		hdrlen = sizeof(*h.h1);
2381		break;
2382	case TPACKET_V2:
2383		h.h2->tp_len = skb->len;
2384		h.h2->tp_snaplen = snaplen;
2385		h.h2->tp_mac = macoff;
2386		h.h2->tp_net = netoff;
2387		h.h2->tp_sec = ts.tv_sec;
2388		h.h2->tp_nsec = ts.tv_nsec;
2389		if (skb_vlan_tag_present(skb)) {
2390			h.h2->tp_vlan_tci = skb_vlan_tag_get(skb);
2391			h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto);
2392			status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
2393		} else {
2394			h.h2->tp_vlan_tci = 0;
2395			h.h2->tp_vlan_tpid = 0;
2396		}
2397		memset(h.h2->tp_padding, 0, sizeof(h.h2->tp_padding));
2398		hdrlen = sizeof(*h.h2);
2399		break;
2400	case TPACKET_V3:
2401		/* tp_next_offset and the vlan fields are already populated above,
2402		 * so don't clear those fields here.
2403		 */
2404		h.h3->tp_status |= status;
2405		h.h3->tp_len = skb->len;
2406		h.h3->tp_snaplen = snaplen;
2407		h.h3->tp_mac = macoff;
2408		h.h3->tp_net = netoff;
2409		h.h3->tp_sec  = ts.tv_sec;
2410		h.h3->tp_nsec = ts.tv_nsec;
2411		memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding));
2412		hdrlen = sizeof(*h.h3);
2413		break;
2414	default:
2415		BUG();
2416	}
2417
2418	sll = h.raw + TPACKET_ALIGN(hdrlen);
2419	sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
2420	sll->sll_family = AF_PACKET;
2421	sll->sll_hatype = dev->type;
2422	sll->sll_protocol = skb->protocol;
2423	sll->sll_pkttype = skb->pkt_type;
2424	if (unlikely(packet_sock_flag(po, PACKET_SOCK_ORIGDEV)))
2425		sll->sll_ifindex = orig_dev->ifindex;
2426	else
2427		sll->sll_ifindex = dev->ifindex;
2428
2429	smp_mb();
2430
2431#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
2432	if (po->tp_version <= TPACKET_V2) {
2433		u8 *start, *end;
2434
2435		end = (u8 *) PAGE_ALIGN((unsigned long) h.raw +
2436					macoff + snaplen);
2437
2438		for (start = h.raw; start < end; start += PAGE_SIZE)
2439			flush_dcache_page(pgv_to_page(start));
2440	}
2441	smp_wmb();
2442#endif
2443
2444	if (po->tp_version <= TPACKET_V2) {
2445		spin_lock(&sk->sk_receive_queue.lock);
2446		__packet_set_status(po, h.raw, status);
2447		__clear_bit(slot_id, po->rx_ring.rx_owner_map);
2448		spin_unlock(&sk->sk_receive_queue.lock);
2449		sk->sk_data_ready(sk);
2450	} else if (po->tp_version == TPACKET_V3) {
2451		prb_clear_blk_fill_status(&po->rx_ring);
2452	}
2453
2454drop_n_restore:
2455	if (skb_head != skb->data && skb_shared(skb)) {
2456		skb->data = skb_head;
2457		skb->len = skb_len;
2458	}
2459drop:
2460	if (!is_drop_n_account)
2461		consume_skb(skb);
2462	else
2463		kfree_skb(skb);
2464	return 0;
2465
2466drop_n_account:
2467	spin_unlock(&sk->sk_receive_queue.lock);
2468	atomic_inc(&po->tp_drops);
2469	is_drop_n_account = true;
2470
2471	sk->sk_data_ready(sk);
2472	kfree_skb(copy_skb);
2473	goto drop_n_restore;
2474}
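
/*
 * tpacket_rcv() above publishes frames to a memory-mapped RX ring by setting
 * TP_STATUS_USER in tp_status. A condensed userspace sketch of the matching
 * TPACKET_V2 consumer (illustrative only and untested; the ring geometry is
 * arbitrary, error handling is omitted, and handle() is a hypothetical
 * application callback):
 *
 *	#include <sys/socket.h>
 *	#include <sys/mman.h>
 *	#include <poll.h>
 *	#include <linux/if_packet.h>
 *
 *	struct tpacket_req req = {
 *		.tp_block_size = 4096,
 *		.tp_block_nr   = 64,
 *		.tp_frame_size = 2048,
 *		.tp_frame_nr   = 64 * (4096 / 2048),
 *	};
 *	int ver = TPACKET_V2;
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN };
 *
 *	setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 *	char *ring = mmap(NULL, (size_t)req.tp_block_size * req.tp_block_nr,
 *			  PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *
 *	for (unsigned int i = 0; ; i = (i + 1) % req.tp_frame_nr) {
 *		struct tpacket2_hdr *hdr =
 *			(void *)(ring + i * req.tp_frame_size);
 *
 *		while (!(hdr->tp_status & TP_STATUS_USER))
 *			poll(&pfd, 1, -1);
 *		handle((char *)hdr + hdr->tp_mac, hdr->tp_snaplen);
 *		hdr->tp_status = TP_STATUS_KERNEL;
 *	}
 *
 * Writing TP_STATUS_KERNEL hands the slot back to the kernel so it can be
 * refilled on the next packet_current_rx_frame() call.
 */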
2475
2476static void tpacket_destruct_skb(struct sk_buff *skb)
2477{
2478	struct packet_sock *po = pkt_sk(skb->sk);
2479
2480	if (likely(po->tx_ring.pg_vec)) {
2481		void *ph;
2482		__u32 ts;
2483
2484		ph = skb_zcopy_get_nouarg(skb);
2485		packet_dec_pending(&po->tx_ring);
2486
2487		ts = __packet_set_timestamp(po, ph, skb);
2488		__packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
2489
2490		if (!packet_read_pending(&po->tx_ring))
2491			complete(&po->skb_completion);
2492	}
2493
2494	sock_wfree(skb);
2495}
2496
2497static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len)
2498{
2499	if ((vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
2500	    (__virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
2501	     __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2 >
2502	      __virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len)))
2503		vnet_hdr->hdr_len = __cpu_to_virtio16(vio_le(),
2504			 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
2505			__virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2);
2506
2507	if (__virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len) > len)
2508		return -EINVAL;
2509
2510	return 0;
2511}
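
/*
 * Worked example for the clamp above (values assume TCP over IPv4 on
 * Ethernet): csum_start = 14 + 20 = 34 points at the TCP header and
 * csum_offset = 16 at the checksum field inside it, so hdr_len is raised to
 * at least 34 + 16 + 2 = 52; any shorter buffer then fails the hdr_len > len
 * check with -EINVAL, guaranteeing the checksum field lies within the data
 * that was actually supplied.
 */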
2512
2513static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len,
2514				 struct virtio_net_hdr *vnet_hdr)
2515{
2516	if (*len < sizeof(*vnet_hdr))
2517		return -EINVAL;
2518	*len -= sizeof(*vnet_hdr);
2519
2520	if (!copy_from_iter_full(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter))
2521		return -EFAULT;
2522
2523	return __packet_snd_vnet_parse(vnet_hdr, *len);
2524}
2525
2526static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
2527		void *frame, struct net_device *dev, void *data, int tp_len,
2528		__be16 proto, unsigned char *addr, int hlen, int copylen,
2529		const struct sockcm_cookie *sockc)
2530{
2531	union tpacket_uhdr ph;
2532	int to_write, offset, len, nr_frags, len_max;
2533	struct socket *sock = po->sk.sk_socket;
2534	struct page *page;
2535	int err;
2536
2537	ph.raw = frame;
2538
2539	skb->protocol = proto;
2540	skb->dev = dev;
2541	skb->priority = po->sk.sk_priority;
2542	skb->mark = po->sk.sk_mark;
2543	skb->tstamp = sockc->transmit_time;
2544	skb_setup_tx_timestamp(skb, sockc->tsflags);
2545	skb_zcopy_set_nouarg(skb, ph.raw);
2546
2547	skb_reserve(skb, hlen);
2548	skb_reset_network_header(skb);
2549
2550	to_write = tp_len;
2551
2552	if (sock->type == SOCK_DGRAM) {
2553		err = dev_hard_header(skb, dev, ntohs(proto), addr,
2554				NULL, tp_len);
2555		if (unlikely(err < 0))
2556			return -EINVAL;
2557	} else if (copylen) {
2558		int hdrlen = min_t(int, copylen, tp_len);
2559
2560		skb_push(skb, dev->hard_header_len);
2561		skb_put(skb, copylen - dev->hard_header_len);
2562		err = skb_store_bits(skb, 0, data, hdrlen);
2563		if (unlikely(err))
2564			return err;
2565		if (!dev_validate_header(dev, skb->data, hdrlen))
2566			return -EINVAL;
2567
2568		data += hdrlen;
2569		to_write -= hdrlen;
2570	}
2571
2572	offset = offset_in_page(data);
2573	len_max = PAGE_SIZE - offset;
2574	len = ((to_write > len_max) ? len_max : to_write);
2575
2576	skb->data_len = to_write;
2577	skb->len += to_write;
2578	skb->truesize += to_write;
2579	refcount_add(to_write, &po->sk.sk_wmem_alloc);
2580
2581	while (likely(to_write)) {
2582		nr_frags = skb_shinfo(skb)->nr_frags;
2583
2584		if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
2585			pr_err("Packet exceeds the number of skb frags (%lu)\n",
2586			       MAX_SKB_FRAGS);
2587			return -EFAULT;
2588		}
2589
2590		page = pgv_to_page(data);
2591		data += len;
2592		flush_dcache_page(page);
2593		get_page(page);
2594		skb_fill_page_desc(skb, nr_frags, page, offset, len);
2595		to_write -= len;
2596		offset = 0;
2597		len_max = PAGE_SIZE;
2598		len = ((to_write > len_max) ? len_max : to_write);
2599	}
2600
2601	packet_parse_headers(skb, sock);
2602
2603	return tp_len;
2604}
2605
2606static int tpacket_parse_header(struct packet_sock *po, void *frame,
2607				int size_max, void **data)
2608{
2609	union tpacket_uhdr ph;
2610	int tp_len, off;
2611
2612	ph.raw = frame;
2613
2614	switch (po->tp_version) {
2615	case TPACKET_V3:
2616		if (ph.h3->tp_next_offset != 0) {
2617			pr_warn_once("variable sized slot not supported");
2618			return -EINVAL;
2619		}
2620		tp_len = ph.h3->tp_len;
2621		break;
2622	case TPACKET_V2:
2623		tp_len = ph.h2->tp_len;
2624		break;
2625	default:
2626		tp_len = ph.h1->tp_len;
2627		break;
2628	}
2629	if (unlikely(tp_len > size_max)) {
2630		pr_err("packet size too large (%d > %d)\n", tp_len, size_max);
2631		return -EMSGSIZE;
2632	}
2633
2634	if (unlikely(po->tp_tx_has_off)) {
2635		int off_min, off_max;
2636
2637		off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2638		off_max = po->tx_ring.frame_size - tp_len;
2639		if (po->sk.sk_type == SOCK_DGRAM) {
2640			switch (po->tp_version) {
2641			case TPACKET_V3:
2642				off = ph.h3->tp_net;
2643				break;
2644			case TPACKET_V2:
2645				off = ph.h2->tp_net;
2646				break;
2647			default:
2648				off = ph.h1->tp_net;
2649				break;
2650			}
2651		} else {
2652			switch (po->tp_version) {
2653			case TPACKET_V3:
2654				off = ph.h3->tp_mac;
2655				break;
2656			case TPACKET_V2:
2657				off = ph.h2->tp_mac;
2658				break;
2659			default:
2660				off = ph.h1->tp_mac;
2661				break;
2662			}
2663		}
2664		if (unlikely((off < off_min) || (off_max < off)))
2665			return -EINVAL;
2666	} else {
2667		off = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2668	}
2669
2670	*data = frame + off;
2671	return tp_len;
2672}
2673
2674static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
2675{
2676	struct sk_buff *skb = NULL;
2677	struct net_device *dev;
2678	struct virtio_net_hdr *vnet_hdr = NULL;
2679	struct sockcm_cookie sockc;
2680	__be16 proto;
2681	int err, reserve = 0;
2682	void *ph;
2683	DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
2684	bool need_wait = !(msg->msg_flags & MSG_DONTWAIT);
2685	unsigned char *addr = NULL;
2686	int tp_len, size_max;
2687	void *data;
2688	int len_sum = 0;
2689	int status = TP_STATUS_AVAILABLE;
2690	int hlen, tlen, copylen = 0;
2691	long timeo = 0;
2692
2693	mutex_lock(&po->pg_vec_lock);
2694
2695	/* The packet_sendmsg() check on tx_ring.pg_vec was lockless;
2696	 * we need to confirm it under the protection of pg_vec_lock.
2697	 */
2698	if (unlikely(!po->tx_ring.pg_vec)) {
2699		err = -EBUSY;
2700		goto out;
2701	}
2702	if (likely(saddr == NULL)) {
2703		dev	= packet_cached_dev_get(po);
2704		proto	= READ_ONCE(po->num);
2705	} else {
2706		err = -EINVAL;
2707		if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2708			goto out;
2709		if (msg->msg_namelen < (saddr->sll_halen
2710					+ offsetof(struct sockaddr_ll,
2711						sll_addr)))
2712			goto out;
2713		proto	= saddr->sll_protocol;
2714		dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
2715		if (po->sk.sk_socket->type == SOCK_DGRAM) {
2716			if (dev && msg->msg_namelen < dev->addr_len +
2717				   offsetof(struct sockaddr_ll, sll_addr))
2718				goto out_put;
2719			addr = saddr->sll_addr;
2720		}
2721	}
2722
2723	err = -ENXIO;
2724	if (unlikely(dev == NULL))
2725		goto out;
2726	err = -ENETDOWN;
2727	if (unlikely(!(dev->flags & IFF_UP)))
2728		goto out_put;
2729
2730	sockcm_init(&sockc, &po->sk);
2731	if (msg->msg_controllen) {
2732		err = sock_cmsg_send(&po->sk, msg, &sockc);
2733		if (unlikely(err))
2734			goto out_put;
2735	}
2736
2737	if (po->sk.sk_socket->type == SOCK_RAW)
2738		reserve = dev->hard_header_len;
2739	size_max = po->tx_ring.frame_size
2740		- (po->tp_hdrlen - sizeof(struct sockaddr_ll));
2741
2742	if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !po->has_vnet_hdr)
2743		size_max = dev->mtu + reserve + VLAN_HLEN;
2744
2745	reinit_completion(&po->skb_completion);
2746
2747	do {
2748		ph = packet_current_frame(po, &po->tx_ring,
2749					  TP_STATUS_SEND_REQUEST);
2750		if (unlikely(ph == NULL)) {
2751			if (need_wait && skb) {
2752				timeo = sock_sndtimeo(&po->sk, msg->msg_flags & MSG_DONTWAIT);
2753				timeo = wait_for_completion_interruptible_timeout(&po->skb_completion, timeo);
2754				if (timeo <= 0) {
2755					err = !timeo ? -ETIMEDOUT : -ERESTARTSYS;
2756					goto out_put;
2757				}
2758			}
2759			/* check for additional frames */
2760			continue;
2761		}
2762
2763		skb = NULL;
2764		tp_len = tpacket_parse_header(po, ph, size_max, &data);
2765		if (tp_len < 0)
2766			goto tpacket_error;
2767
2768		status = TP_STATUS_SEND_REQUEST;
2769		hlen = LL_RESERVED_SPACE(dev);
2770		tlen = dev->needed_tailroom;
2771		if (po->has_vnet_hdr) {
2772			vnet_hdr = data;
2773			data += sizeof(*vnet_hdr);
2774			tp_len -= sizeof(*vnet_hdr);
2775			if (tp_len < 0 ||
2776			    __packet_snd_vnet_parse(vnet_hdr, tp_len)) {
2777				tp_len = -EINVAL;
2778				goto tpacket_error;
2779			}
2780			copylen = __virtio16_to_cpu(vio_le(),
2781						    vnet_hdr->hdr_len);
2782		}
2783		copylen = max_t(int, copylen, dev->hard_header_len);
2784		skb = sock_alloc_send_skb(&po->sk,
2785				hlen + tlen + sizeof(struct sockaddr_ll) +
2786				(copylen - dev->hard_header_len),
2787				!need_wait, &err);
2788
2789		if (unlikely(skb == NULL)) {
2790			/* we assume the socket was initially writeable ... */
2791			if (likely(len_sum > 0))
2792				err = len_sum;
2793			goto out_status;
2794		}
2795		tp_len = tpacket_fill_skb(po, skb, ph, dev, data, tp_len, proto,
2796					  addr, hlen, copylen, &sockc);
2797		if (likely(tp_len >= 0) &&
2798		    tp_len > dev->mtu + reserve &&
2799		    !po->has_vnet_hdr &&
2800		    !packet_extra_vlan_len_allowed(dev, skb))
2801			tp_len = -EMSGSIZE;
2802
2803		if (unlikely(tp_len < 0)) {
2804tpacket_error:
2805			if (po->tp_loss) {
2806				__packet_set_status(po, ph,
2807						TP_STATUS_AVAILABLE);
2808				packet_increment_head(&po->tx_ring);
2809				kfree_skb(skb);
2810				continue;
2811			} else {
2812				status = TP_STATUS_WRONG_FORMAT;
2813				err = tp_len;
2814				goto out_status;
2815			}
2816		}
2817
2818		if (po->has_vnet_hdr) {
2819			if (virtio_net_hdr_to_skb(skb, vnet_hdr, vio_le())) {
2820				tp_len = -EINVAL;
2821				goto tpacket_error;
2822			}
2823			virtio_net_hdr_set_proto(skb, vnet_hdr);
2824		}
2825
2826		skb->destructor = tpacket_destruct_skb;
2827		__packet_set_status(po, ph, TP_STATUS_SENDING);
2828		packet_inc_pending(&po->tx_ring);
2829
2830		status = TP_STATUS_SEND_REQUEST;
2831		/* Paired with WRITE_ONCE() in packet_setsockopt() */
2832		err = READ_ONCE(po->xmit)(skb);
2833		if (unlikely(err != 0)) {
2834			if (err > 0)
2835				err = net_xmit_errno(err);
2836			if (err && __packet_get_status(po, ph) ==
2837				   TP_STATUS_AVAILABLE) {
2838				/* skb was destructed already */
2839				skb = NULL;
2840				goto out_status;
2841			}
2842			/*
2843			 * skb was dropped but not destructed yet;
2844			 * let's treat it like congestion or err < 0
2845			 */
2846			err = 0;
2847		}
2848		packet_increment_head(&po->tx_ring);
2849		len_sum += tp_len;
2850	} while (likely((ph != NULL) ||
2851		/* Note: packet_read_pending() might be slow if we have
2852		 * to call it, as it's a per-CPU variable, but in the
2853		 * fast path we already short-circuit the loop with the
2854		 * first condition, and luckily don't have to take that
2855		 * path anyway.
2856		 */
2857		 (need_wait && packet_read_pending(&po->tx_ring))));
2858
2859	err = len_sum;
2860	goto out_put;
2861
2862out_status:
2863	__packet_set_status(po, ph, status);
2864	kfree_skb(skb);
2865out_put:
2866	dev_put(dev);
2867out:
2868	mutex_unlock(&po->pg_vec_lock);
2869	return err;
2870}
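
/*
 * A condensed userspace sketch of the producer side that tpacket_snd() above
 * drains, using a TPACKET_V2 TX ring (illustrative only and untested; the
 * ring geometry, the slot index and the frame buffer are hypothetical, and
 * the data offset assumes the default layout without PACKET_TX_HAS_OFF):
 *
 *	#include <sys/socket.h>
 *	#include <sys/mman.h>
 *	#include <linux/if_packet.h>
 *	#include <string.h>
 *
 *	struct tpacket_req req = {
 *		.tp_block_size = 4096,
 *		.tp_block_nr   = 64,
 *		.tp_frame_size = 2048,
 *		.tp_frame_nr   = 64 * (4096 / 2048),
 *	};
 *	int ver = TPACKET_V2;
 *
 *	setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));
 *	setsockopt(fd, SOL_PACKET, PACKET_TX_RING, &req, sizeof(req));
 *	char *ring = mmap(NULL, (size_t)req.tp_block_size * req.tp_block_nr,
 *			  PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *
 *	struct tpacket2_hdr *hdr = (void *)(ring + slot * req.tp_frame_size);
 *	char *data = (char *)hdr + TPACKET2_HDRLEN - sizeof(struct sockaddr_ll);
 *
 *	memcpy(data, frame, frame_len);
 *	hdr->tp_len    = frame_len;
 *	hdr->tp_status = TP_STATUS_SEND_REQUEST;
 *	send(fd, NULL, 0, 0);
 *
 * For SOCK_RAW the copied frame must start with the link-layer header, and
 * the socket is expected to be bound to a device beforehand (or a
 * sockaddr_ll passed in msg_name); the zero-length send() merely kicks the
 * kernel into walking the TP_STATUS_SEND_REQUEST slots.
 */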
2871
2872static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
2873				        size_t reserve, size_t len,
2874				        size_t linear, int noblock,
2875				        int *err)
2876{
2877	struct sk_buff *skb;
2878
2879	/* Under a page?  Don't bother with paged skb. */
2880	if (prepad + len < PAGE_SIZE || !linear)
2881		linear = len;
2882
2883	skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
2884				   err, 0);
2885	if (!skb)
2886		return NULL;
2887
2888	skb_reserve(skb, reserve);
2889	skb_put(skb, linear);
2890	skb->data_len = len - linear;
2891	skb->len += len - linear;
2892
2893	return skb;
2894}
2895
2896static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
2897{
2898	struct sock *sk = sock->sk;
2899	DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
2900	struct sk_buff *skb;
2901	struct net_device *dev;
2902	__be16 proto;
2903	unsigned char *addr = NULL;
2904	int err, reserve = 0;
2905	struct sockcm_cookie sockc;
2906	struct virtio_net_hdr vnet_hdr = { 0 };
2907	int offset = 0;
2908	struct packet_sock *po = pkt_sk(sk);
2909	bool has_vnet_hdr = false;
2910	int hlen, tlen, linear;
2911	int extra_len = 0;
2912
2913	/*
2914	 *	Get and verify the address.
2915	 */
2916
2917	if (likely(saddr == NULL)) {
2918		dev	= packet_cached_dev_get(po);
2919		proto	= READ_ONCE(po->num);
2920	} else {
2921		err = -EINVAL;
2922		if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2923			goto out;
2924		if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
2925			goto out;
2926		proto	= saddr->sll_protocol;
2927		dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
2928		if (sock->type == SOCK_DGRAM) {
2929			if (dev && msg->msg_namelen < dev->addr_len +
2930				   offsetof(struct sockaddr_ll, sll_addr))
2931				goto out_unlock;
2932			addr = saddr->sll_addr;
2933		}
2934	}
2935
2936	err = -ENXIO;
2937	if (unlikely(dev == NULL))
2938		goto out_unlock;
2939	err = -ENETDOWN;
2940	if (unlikely(!(dev->flags & IFF_UP)))
2941		goto out_unlock;
2942
2943	sockcm_init(&sockc, sk);
2944	sockc.mark = sk->sk_mark;
2945	if (msg->msg_controllen) {
2946		err = sock_cmsg_send(sk, msg, &sockc);
2947		if (unlikely(err))
2948			goto out_unlock;
2949	}
2950
2951	if (sock->type == SOCK_RAW)
2952		reserve = dev->hard_header_len;
2953	if (po->has_vnet_hdr) {
2954		err = packet_snd_vnet_parse(msg, &len, &vnet_hdr);
2955		if (err)
2956			goto out_unlock;
2957		has_vnet_hdr = true;
2958	}
2959
2960	if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
2961		if (!netif_supports_nofcs(dev)) {
2962			err = -EPROTONOSUPPORT;
2963			goto out_unlock;
2964		}
2965		extra_len = 4; /* We're doing our own CRC */
2966	}
2967
2968	err = -EMSGSIZE;
2969	if (!vnet_hdr.gso_type &&
2970	    (len > dev->mtu + reserve + VLAN_HLEN + extra_len))
2971		goto out_unlock;
2972
2973	err = -ENOBUFS;
2974	hlen = LL_RESERVED_SPACE(dev);
2975	tlen = dev->needed_tailroom;
2976	linear = __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len);
2977	linear = max(linear, min_t(int, len, dev->hard_header_len));
2978	skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, linear,
2979			       msg->msg_flags & MSG_DONTWAIT, &err);
2980	if (skb == NULL)
2981		goto out_unlock;
2982
2983	skb_reset_network_header(skb);
2984
2985	err = -EINVAL;
2986	if (sock->type == SOCK_DGRAM) {
2987		offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len);
2988		if (unlikely(offset < 0))
2989			goto out_free;
2990	} else if (reserve) {
2991		skb_reserve(skb, -reserve);
2992		if (len < reserve + sizeof(struct ipv6hdr) &&
2993		    dev->min_header_len != dev->hard_header_len)
2994			skb_reset_network_header(skb);
2995	}
2996
2997	/* Returns -EFAULT on error */
2998	err = skb_copy_datagram_from_iter(skb, offset, &msg->msg_iter, len);
2999	if (err)
3000		goto out_free;
3001
3002	if ((sock->type == SOCK_RAW &&
3003	     !dev_validate_header(dev, skb->data, len)) || !skb->len) {
3004		err = -EINVAL;
3005		goto out_free;
3006	}
3007
3008	skb_setup_tx_timestamp(skb, sockc.tsflags);
3009
3010	if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + extra_len) &&
3011	    !packet_extra_vlan_len_allowed(dev, skb)) {
3012		err = -EMSGSIZE;
3013		goto out_free;
3014	}
3015
3016	skb->protocol = proto;
3017	skb->dev = dev;
3018	skb->priority = sk->sk_priority;
3019	skb->mark = sockc.mark;
3020	skb->tstamp = sockc.transmit_time;
3021
3022	if (unlikely(extra_len == 4))
3023		skb->no_fcs = 1;
3024
3025	packet_parse_headers(skb, sock);
3026
3027	if (has_vnet_hdr) {
3028		err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le());
3029		if (err)
3030			goto out_free;
3031		len += sizeof(vnet_hdr);
3032		virtio_net_hdr_set_proto(skb, &vnet_hdr);
3033	}
3034
3035	/* Paired with WRITE_ONCE() in packet_setsockopt() */
3036	err = READ_ONCE(po->xmit)(skb);
3037	if (unlikely(err != 0)) {
3038		if (err > 0)
3039			err = net_xmit_errno(err);
3040		if (err)
3041			goto out_unlock;
3042	}
3043
3044	dev_put(dev);
3045
3046	return len;
3047
3048out_free:
3049	kfree_skb(skb);
3050out_unlock:
3051	if (dev)
3052		dev_put(dev);
3053out:
3054	return err;
3055}
3056
3057static int packet_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
3058{
3059	struct sock *sk = sock->sk;
3060	struct packet_sock *po = pkt_sk(sk);
3061
3062	/* Reading tx_ring.pg_vec without holding pg_vec_lock is racy.
3063	 * tpacket_snd() will redo the check safely.
3064	 */
3065	if (data_race(po->tx_ring.pg_vec))
3066		return tpacket_snd(po, msg);
3067
3068	return packet_snd(sock, msg, len);
3069}
3070
3071/*
3072 *	Close a PACKET socket. This is fairly simple. We immediately go
3073 *	to 'closed' state and remove our protocol entry from the device list.
3074 */
3075
3076static int packet_release(struct socket *sock)
3077{
3078	struct sock *sk = sock->sk;
3079	struct packet_sock *po;
3080	struct packet_fanout *f;
3081	struct net *net;
3082	union tpacket_req_u req_u;
3083
3084	if (!sk)
3085		return 0;
3086
3087	net = sock_net(sk);
3088	po = pkt_sk(sk);
3089
3090	mutex_lock(&net->packet.sklist_lock);
3091	sk_del_node_init_rcu(sk);
3092	mutex_unlock(&net->packet.sklist_lock);
3093
3094	preempt_disable();
3095	sock_prot_inuse_add(net, sk->sk_prot, -1);
3096	preempt_enable();
3097
3098	spin_lock(&po->bind_lock);
3099	unregister_prot_hook(sk, false);
3100	packet_cached_dev_reset(po);
3101
3102	if (po->prot_hook.dev) {
3103		dev_put(po->prot_hook.dev);
3104		po->prot_hook.dev = NULL;
3105	}
3106	spin_unlock(&po->bind_lock);
3107
3108	packet_flush_mclist(sk);
3109
3110	lock_sock(sk);
3111	if (po->rx_ring.pg_vec) {
3112		memset(&req_u, 0, sizeof(req_u));
3113		packet_set_ring(sk, &req_u, 1, 0);
3114	}
3115
3116	if (po->tx_ring.pg_vec) {
3117		memset(&req_u, 0, sizeof(req_u));
3118		packet_set_ring(sk, &req_u, 1, 1);
3119	}
3120	release_sock(sk);
3121
3122	f = fanout_release(sk);
3123
3124	synchronize_net();
3125
3126	kfree(po->rollover);
3127	if (f) {
3128		fanout_release_data(f);
3129		kvfree(f);
3130	}
3131	/*
3132	 *	Now the socket is dead. No more input will appear.
3133	 */
3134	sock_orphan(sk);
3135	sock->sk = NULL;
3136
3137	/* Purge queues */
3138
3139	skb_queue_purge(&sk->sk_receive_queue);
3140	packet_free_pending(po);
3141	sk_refcnt_debug_release(sk);
3142
3143	sock_put(sk);
3144	return 0;
3145}
3146
3147/*
3148 *	Attach a packet hook.
3149 */
3150
3151static int packet_do_bind(struct sock *sk, const char *name, int ifindex,
3152			  __be16 proto)
3153{
3154	struct packet_sock *po = pkt_sk(sk);
3155	struct net_device *dev_curr;
3156	__be16 proto_curr;
3157	bool need_rehook;
3158	struct net_device *dev = NULL;
3159	int ret = 0;
3160	bool unlisted = false;
3161
3162	lock_sock(sk);
3163	spin_lock(&po->bind_lock);
3164	if (!proto)
3165		proto = po->num;
3166
3167	rcu_read_lock();
3168
3169	if (po->fanout) {
3170		ret = -EINVAL;
3171		goto out_unlock;
3172	}
3173
3174	if (name) {
3175		dev = dev_get_by_name_rcu(sock_net(sk), name);
3176		if (!dev) {
3177			ret = -ENODEV;
3178			goto out_unlock;
3179		}
3180	} else if (ifindex) {
3181		dev = dev_get_by_index_rcu(sock_net(sk), ifindex);
3182		if (!dev) {
3183			ret = -ENODEV;
3184			goto out_unlock;
3185		}
3186	}
3187
3188	if (dev)
3189		dev_hold(dev);
3190
3191	proto_curr = po->prot_hook.type;
3192	dev_curr = po->prot_hook.dev;
3193
3194	need_rehook = proto_curr != proto || dev_curr != dev;
3195
3196	if (need_rehook) {
3197		if (po->running) {
3198			rcu_read_unlock();
3199			/* prevents packet_notifier() from calling
3200			 * register_prot_hook()
3201			 */
3202			WRITE_ONCE(po->num, 0);
3203			__unregister_prot_hook(sk, true);
3204			rcu_read_lock();
3205			dev_curr = po->prot_hook.dev;
3206			if (dev)
3207				unlisted = !dev_get_by_index_rcu(sock_net(sk),
3208								 dev->ifindex);
3209		}
3210
3211		BUG_ON(po->running);
3212		WRITE_ONCE(po->num, proto);
3213		po->prot_hook.type = proto;
3214
3215		if (unlikely(unlisted)) {
3216			dev_put(dev);
3217			po->prot_hook.dev = NULL;
3218			WRITE_ONCE(po->ifindex, -1);
3219			packet_cached_dev_reset(po);
3220		} else {
3221			po->prot_hook.dev = dev;
3222			WRITE_ONCE(po->ifindex, dev ? dev->ifindex : 0);
3223			packet_cached_dev_assign(po, dev);
3224		}
3225	}
3226	if (dev_curr)
3227		dev_put(dev_curr);
3228
3229	if (proto == 0 || !need_rehook)
3230		goto out_unlock;
3231
3232	if (!unlisted && (!dev || (dev->flags & IFF_UP))) {
3233		register_prot_hook(sk);
3234	} else {
3235		sk->sk_err = ENETDOWN;
3236		if (!sock_flag(sk, SOCK_DEAD))
3237			sk->sk_error_report(sk);
3238	}
3239
3240out_unlock:
3241	rcu_read_unlock();
3242	spin_unlock(&po->bind_lock);
3243	release_sock(sk);
3244	return ret;
3245}
3246
3247/*
3248 *	Bind a packet socket to a device
3249 */
3250
3251static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
3252			    int addr_len)
3253{
3254	struct sock *sk = sock->sk;
3255	char name[sizeof(uaddr->sa_data_min) + 1];
3256
3257	/*
3258	 *	Check legality
3259	 */
3260
3261	if (addr_len != sizeof(struct sockaddr))
3262		return -EINVAL;
3263	/* uaddr->sa_data comes from userspace; it's not guaranteed to be
3264	 * zero-terminated.
3265	 */
3266	memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data_min));
3267	name[sizeof(uaddr->sa_data_min)] = 0;
3268
3269	return packet_do_bind(sk, name, 0, 0);
3270}
3271
3272static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
3273{
3274	struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
3275	struct sock *sk = sock->sk;
3276
3277	/*
3278	 *	Check legality
3279	 */
3280
3281	if (addr_len < sizeof(struct sockaddr_ll))
3282		return -EINVAL;
3283	if (sll->sll_family != AF_PACKET)
3284		return -EINVAL;
3285
3286	return packet_do_bind(sk, NULL, sll->sll_ifindex, sll->sll_protocol);
3287}
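
/*
 * A minimal userspace sketch of the bind() that reaches packet_do_bind()
 * through packet_bind() above (illustrative only; "eth0" is an arbitrary
 * interface name):
 *
 *	#include <sys/socket.h>
 *	#include <arpa/inet.h>
 *	#include <linux/if_packet.h>
 *	#include <linux/if_ether.h>
 *	#include <net/if.h>
 *	#include <string.h>
 *
 *	struct sockaddr_ll sll;
 *
 *	memset(&sll, 0, sizeof(sll));
 *	sll.sll_family   = AF_PACKET;
 *	sll.sll_protocol = htons(ETH_P_ALL);
 *	sll.sll_ifindex  = if_nametoindex("eth0");
 *
 *	bind(fd, (struct sockaddr *)&sll, sizeof(sll));
 *
 * Only sll_family, sll_protocol and sll_ifindex matter here; a zero
 * sll_protocol keeps the protocol the socket already has, and a zero
 * sll_ifindex binds the socket to no particular device.
 */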
3288
3289static struct proto packet_proto = {
3290	.name	  = "PACKET",
3291	.owner	  = THIS_MODULE,
3292	.obj_size = sizeof(struct packet_sock),
3293};
3294
3295/*
3296 *	Create a packet of type SOCK_PACKET.
3297 */
3298
3299static int packet_create(struct net *net, struct socket *sock, int protocol,
3300			 int kern)
3301{
3302	struct sock *sk;
3303	struct packet_sock *po;
3304	__be16 proto = (__force __be16)protocol; /* weird, but documented */
3305	int err;
3306
3307	if (!ns_capable(net->user_ns, CAP_NET_RAW))
3308		return -EPERM;
3309	if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
3310	    sock->type != SOCK_PACKET)
3311		return -ESOCKTNOSUPPORT;
3312
3313	sock->state = SS_UNCONNECTED;
3314
3315	err = -ENOBUFS;
3316	sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto, kern);
3317	if (sk == NULL)
3318		goto out;
3319
3320	sock->ops = &packet_ops;
3321	if (sock->type == SOCK_PACKET)
3322		sock->ops = &packet_ops_spkt;
3323
3324	sock_init_data(sock, sk);
3325
3326	po = pkt_sk(sk);
3327	init_completion(&po->skb_completion);
3328	sk->sk_family = PF_PACKET;
3329	po->num = proto;
3330	po->xmit = dev_queue_xmit;
3331
3332	err = packet_alloc_pending(po);
3333	if (err)
3334		goto out2;
3335
3336	packet_cached_dev_reset(po);
3337
3338	sk->sk_destruct = packet_sock_destruct;
3339	sk_refcnt_debug_inc(sk);
3340
3341	/*
3342	 *	Attach a protocol block
3343	 */
3344
3345	spin_lock_init(&po->bind_lock);
3346	mutex_init(&po->pg_vec_lock);
3347	po->rollover = NULL;
3348	po->prot_hook.func = packet_rcv;
3349
3350	if (sock->type == SOCK_PACKET)
3351		po->prot_hook.func = packet_rcv_spkt;
3352
3353	po->prot_hook.af_packet_priv = sk;
3354	po->prot_hook.af_packet_net = sock_net(sk);
3355
3356	if (proto) {
3357		po->prot_hook.type = proto;
3358		__register_prot_hook(sk);
3359	}
3360
3361	mutex_lock(&net->packet.sklist_lock);
3362	sk_add_node_tail_rcu(sk, &net->packet.sklist);
3363	mutex_unlock(&net->packet.sklist_lock);
3364
3365	preempt_disable();
3366	sock_prot_inuse_add(net, &packet_proto, 1);
3367	preempt_enable();
3368
3369	return 0;
3370out2:
3371	sk_free(sk);
3372out:
3373	return err;
3374}
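
/*
 * The corresponding userspace call is plain socket(2); a minimal sketch
 * (illustrative only):
 *
 *	#include <sys/socket.h>
 *	#include <arpa/inet.h>
 *	#include <linux/if_ether.h>
 *
 *	int raw  = socket(AF_PACKET, SOCK_RAW,   htons(ETH_P_ALL));
 *	int dgrm = socket(AF_PACKET, SOCK_DGRAM, htons(ETH_P_IP));
 *
 * Both need CAP_NET_RAW in the socket's user namespace. SOCK_RAW delivers
 * and expects frames with the link-layer header, SOCK_DGRAM without it.
 * The protocol argument is passed in network byte order (hence the cast to
 * __be16 above), and a protocol of 0 creates the socket without registering
 * the protocol hook, so nothing is received until bind() later supplies a
 * non-zero one.
 */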
3375
3376/*
3377 *	Pull a packet from our receive queue and hand it to the user.
3378 *	If necessary we block.
3379 */
3380
3381static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
3382			  int flags)
3383{
3384	struct sock *sk = sock->sk;
3385	struct sk_buff *skb;
3386	int copied, err;
3387	int vnet_hdr_len = 0;
3388	unsigned int origlen = 0;
3389
3390	err = -EINVAL;
3391	if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
3392		goto out;
3393
3394#if 0
3395	/* What error should we return now? EUNATTACH? */
3396	if (pkt_sk(sk)->ifindex < 0)
3397		return -ENODEV;
3398#endif
3399
3400	if (flags & MSG_ERRQUEUE) {
3401		err = sock_recv_errqueue(sk, msg, len,
3402					 SOL_PACKET, PACKET_TX_TIMESTAMP);
3403		goto out;
3404	}
3405
3406	/*
3407	 *	Call the generic datagram receiver. This handles all sorts
3408	 *	of horrible races and re-entrancy so we can forget about it
3409	 *	in the protocol layers.
3410	 *
3411	 *	Now it will return ENETDOWN if the device has just gone down,
3412	 *	but then it will block.
3413	 */
3414
3415	skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
3416
3417	/*
3418	 *	An error occurred, so return it. Because skb_recv_datagram()
3419	 *	handles the blocking, we don't need to see or worry about
3420	 *	blocking retries.
3421	 */
3422
3423	if (skb == NULL)
3424		goto out;
3425
3426	packet_rcv_try_clear_pressure(pkt_sk(sk));
3427
3428	if (pkt_sk(sk)->has_vnet_hdr) {
3429		err = packet_rcv_vnet(msg, skb, &len);
3430		if (err)
3431			goto out_free;
3432		vnet_hdr_len = sizeof(struct virtio_net_hdr);
3433	}
3434
3435	/* You lose any data beyond the buffer you gave. If this worries
3436	 * a user program, it can ask the device for its MTU
3437	 * anyway.
3438	 */
3439	copied = skb->len;
3440	if (copied > len) {
3441		copied = len;
3442		msg->msg_flags |= MSG_TRUNC;
3443	}
3444
3445	err = skb_copy_datagram_msg(skb, 0, msg, copied);
3446	if (err)
3447		goto out_free;
3448
3449	if (sock->type != SOCK_PACKET) {
3450		struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
3451
3452		/* Original length was stored in sockaddr_ll fields */
3453		origlen = PACKET_SKB_CB(skb)->sa.origlen;
3454		sll->sll_family = AF_PACKET;
3455		sll->sll_protocol = skb->protocol;
3456	}
3457
3458	sock_recv_ts_and_drops(msg, sk, skb);
3459
3460	if (msg->msg_name) {
3461		const size_t max_len = min(sizeof(skb->cb),
3462					   sizeof(struct sockaddr_storage));
3463		int copy_len;
3464
3465		/* If the address length field is there to be filled
3466		 * in, we fill it in now.
3467		 */
3468		if (sock->type == SOCK_PACKET) {
3469			__sockaddr_check_size(sizeof(struct sockaddr_pkt));
3470			msg->msg_namelen = sizeof(struct sockaddr_pkt);
3471			copy_len = msg->msg_namelen;
3472		} else {
3473			struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
3474
3475			msg->msg_namelen = sll->sll_halen +
3476				offsetof(struct sockaddr_ll, sll_addr);
3477			copy_len = msg->msg_namelen;
3478			if (msg->msg_namelen < sizeof(struct sockaddr_ll)) {
3479				memset(msg->msg_name +
3480				       offsetof(struct sockaddr_ll, sll_addr),
3481				       0, sizeof(sll->sll_addr));
3482				msg->msg_namelen = sizeof(struct sockaddr_ll);
3483			}
3484		}
3485		if (WARN_ON_ONCE(copy_len > max_len)) {
3486			copy_len = max_len;
3487			msg->msg_namelen = copy_len;
3488		}
3489		memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa, copy_len);
3490	}
3491
3492	if (packet_sock_flag(pkt_sk(sk), PACKET_SOCK_AUXDATA)) {
3493		struct tpacket_auxdata aux;
3494
3495		aux.tp_status = TP_STATUS_USER;
3496		if (skb->ip_summed == CHECKSUM_PARTIAL)
3497			aux.tp_status |= TP_STATUS_CSUMNOTREADY;
3498		else if (skb->pkt_type != PACKET_OUTGOING &&
3499			 skb_csum_unnecessary(skb))
3500			aux.tp_status |= TP_STATUS_CSUM_VALID;
3501
3502		aux.tp_len = origlen;
3503		aux.tp_snaplen = skb->len;
3504		aux.tp_mac = 0;
3505		aux.tp_net = skb_network_offset(skb);
3506		if (skb_vlan_tag_present(skb)) {
3507			aux.tp_vlan_tci = skb_vlan_tag_get(skb);
3508			aux.tp_vlan_tpid = ntohs(skb->vlan_proto);
3509			aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
3510		} else {
3511			aux.tp_vlan_tci = 0;
3512			aux.tp_vlan_tpid = 0;
3513		}
3514		put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
3515	}
3516
3517	/*
3518	 *	Free or return the buffer as appropriate. Again this
3519	 *	hides all the races and re-entrancy issues from us.
3520	 */
3521	err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
3522
3523out_free:
3524	skb_free_datagram(sk, skb);
3525out:
3526	return err;
3527}
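
/*
 * A minimal userspace sketch of consuming the PACKET_AUXDATA control message
 * emitted above (illustrative only; the buffer sizes are arbitrary):
 *
 *	#include <sys/socket.h>
 *	#include <linux/if_packet.h>
 *
 *	int one = 1;
 *	unsigned char buf[2048];
 *	union {
 *		struct cmsghdr cmsg;
 *		char cbuf[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
 *	} ctl;
 *	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
 *	struct msghdr msg = {
 *		.msg_iov = &iov, .msg_iovlen = 1,
 *		.msg_control = &ctl, .msg_controllen = sizeof(ctl),
 *	};
 *	struct cmsghdr *cmsg;
 *
 *	setsockopt(fd, SOL_PACKET, PACKET_AUXDATA, &one, sizeof(one));
 *	recvmsg(fd, &msg, 0);
 *	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
 *		if (cmsg->cmsg_level == SOL_PACKET &&
 *		    cmsg->cmsg_type == PACKET_AUXDATA) {
 *			struct tpacket_auxdata *aux = (void *)CMSG_DATA(cmsg);
 *			use(aux);
 *		}
 *	}
 *
 * use() is a hypothetical consumer: tp_len is the original wire length,
 * tp_snaplen what was actually copied, and tp_vlan_tci/tp_vlan_tpid are
 * meaningful only when tp_status has TP_STATUS_VLAN_VALID (and
 * TP_STATUS_VLAN_TPID_VALID) set, exactly as filled in above.
 */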
3528
3529static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
3530			       int peer)
3531{
3532	struct net_device *dev;
3533	struct sock *sk	= sock->sk;
3534
3535	if (peer)
3536		return -EOPNOTSUPP;
3537
3538	uaddr->sa_family = AF_PACKET;
3539	memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data_min));
3540	rcu_read_lock();
3541	dev = dev_get_by_index_rcu(sock_net(sk), READ_ONCE(pkt_sk(sk)->ifindex));
3542	if (dev)
3543		strscpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data_min));
3544	rcu_read_unlock();
3545
3546	return sizeof(*uaddr);
3547}
3548
3549static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
3550			  int peer)
3551{
3552	struct net_device *dev;
3553	struct sock *sk = sock->sk;
3554	struct packet_sock *po = pkt_sk(sk);
3555	DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
3556	int ifindex;
3557
3558	if (peer)
3559		return -EOPNOTSUPP;
3560
3561	ifindex = READ_ONCE(po->ifindex);
3562	sll->sll_family = AF_PACKET;
3563	sll->sll_ifindex = ifindex;
3564	sll->sll_protocol = READ_ONCE(po->num);
3565	sll->sll_pkttype = 0;
3566	rcu_read_lock();
3567	dev = dev_get_by_index_rcu(sock_net(sk), ifindex);
3568	if (dev) {
3569		sll->sll_hatype = dev->type;
3570		sll->sll_halen = dev->addr_len;
3571		memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
3572	} else {
3573		sll->sll_hatype = 0;	/* Bad: we have no ARPHRD_UNSPEC */
3574		sll->sll_halen = 0;
3575	}
3576	rcu_read_unlock();
3577
3578	return offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
3579}
3580
3581static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
3582			 int what)
3583{
3584	switch (i->type) {
3585	case PACKET_MR_MULTICAST:
3586		if (i->alen != dev->addr_len)
3587			return -EINVAL;
3588		if (what > 0)
3589			return dev_mc_add(dev, i->addr);
3590		else
3591			return dev_mc_del(dev, i->addr);
3592		break;
3593	case PACKET_MR_PROMISC:
3594		return dev_set_promiscuity(dev, what);
3595	case PACKET_MR_ALLMULTI:
3596		return dev_set_allmulti(dev, what);
3597	case PACKET_MR_UNICAST:
3598		if (i->alen != dev->addr_len)
3599			return -EINVAL;
3600		if (what > 0)
3601			return dev_uc_add(dev, i->addr);
3602		else
3603			return dev_uc_del(dev, i->addr);
3604		break;
3605	default:
3606		break;
3607	}
3608	return 0;
3609}
3610
3611static void packet_dev_mclist_delete(struct net_device *dev,
3612				     struct packet_mclist **mlp)
3613{
3614	struct packet_mclist *ml;
3615
3616	while ((ml = *mlp) != NULL) {
3617		if (ml->ifindex == dev->ifindex) {
3618			packet_dev_mc(dev, ml, -1);
3619			*mlp = ml->next;
3620			kfree(ml);
3621		} else
3622			mlp = &ml->next;
3623	}
3624}
3625
3626static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
3627{
3628	struct packet_sock *po = pkt_sk(sk);
3629	struct packet_mclist *ml, *i;
3630	struct net_device *dev;
3631	int err;
3632
3633	rtnl_lock();
3634
3635	err = -ENODEV;
3636	dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
3637	if (!dev)
3638		goto done;
3639
3640	err = -EINVAL;
3641	if (mreq->mr_alen > dev->addr_len)
3642		goto done;
3643
3644	err = -ENOBUFS;
3645	i = kmalloc(sizeof(*i), GFP_KERNEL);
3646	if (i == NULL)
3647		goto done;
3648
3649	err = 0;
3650	for (ml = po->mclist; ml; ml = ml->next) {
3651		if (ml->ifindex == mreq->mr_ifindex &&
3652		    ml->type == mreq->mr_type &&
3653		    ml->alen == mreq->mr_alen &&
3654		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3655			ml->count++;
3656			/* Free the new element ... */
3657			kfree(i);
3658			goto done;
3659		}
3660	}
3661
3662	i->type = mreq->mr_type;
3663	i->ifindex = mreq->mr_ifindex;
3664	i->alen = mreq->mr_alen;
3665	memcpy(i->addr, mreq->mr_address, i->alen);
3666	memset(i->addr + i->alen, 0, sizeof(i->addr) - i->alen);
3667	i->count = 1;
3668	i->next = po->mclist;
3669	po->mclist = i;
3670	err = packet_dev_mc(dev, i, 1);
3671	if (err) {
3672		po->mclist = i->next;
3673		kfree(i);
3674	}
3675
3676done:
3677	rtnl_unlock();
3678	return err;
3679}
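
/*
 * A minimal userspace sketch of the setsockopt() that ends up in
 * packet_mc_add() above (illustrative only; "eth0" is an arbitrary
 * interface name):
 *
 *	#include <sys/socket.h>
 *	#include <linux/if_packet.h>
 *	#include <net/if.h>
 *	#include <string.h>
 *
 *	struct packet_mreq mr;
 *
 *	memset(&mr, 0, sizeof(mr));
 *	mr.mr_ifindex = if_nametoindex("eth0");
 *	mr.mr_type    = PACKET_MR_PROMISC;
 *
 *	setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP, &mr, sizeof(mr));
 *
 * For PACKET_MR_MULTICAST or PACKET_MR_UNICAST, mr_alen must match the
 * device's address length and mr_address must carry the address, as
 * checked in packet_dev_mc() above.
 */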
3680
3681static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
3682{
3683	struct packet_mclist *ml, **mlp;
3684
3685	rtnl_lock();
3686
3687	for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
3688		if (ml->ifindex == mreq->mr_ifindex &&
3689		    ml->type == mreq->mr_type &&
3690		    ml->alen == mreq->mr_alen &&
3691		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3692			if (--ml->count == 0) {
3693				struct net_device *dev;
3694				*mlp = ml->next;
3695				dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3696				if (dev)
3697					packet_dev_mc(dev, ml, -1);
3698				kfree(ml);
3699			}
3700			break;
3701		}
3702	}
3703	rtnl_unlock();
3704	return 0;
3705}
3706
3707static void packet_flush_mclist(struct sock *sk)
3708{
3709	struct packet_sock *po = pkt_sk(sk);
3710	struct packet_mclist *ml;
3711
3712	if (!po->mclist)
3713		return;
3714
3715	rtnl_lock();
3716	while ((ml = po->mclist) != NULL) {
3717		struct net_device *dev;
3718
3719		po->mclist = ml->next;
3720		dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3721		if (dev != NULL)
3722			packet_dev_mc(dev, ml, -1);
3723		kfree(ml);
3724	}
3725	rtnl_unlock();
3726}
3727
3728static int
3729packet_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval,
3730		  unsigned int optlen)
3731{
3732	struct sock *sk = sock->sk;
3733	struct packet_sock *po = pkt_sk(sk);
3734	int ret;
3735
3736	if (level != SOL_PACKET)
3737		return -ENOPROTOOPT;
3738
3739	switch (optname) {
3740	case PACKET_ADD_MEMBERSHIP:
3741	case PACKET_DROP_MEMBERSHIP:
3742	{
3743		struct packet_mreq_max mreq;
3744		int len = optlen;
3745		memset(&mreq, 0, sizeof(mreq));
3746		if (len < sizeof(struct packet_mreq))
3747			return -EINVAL;
3748		if (len > sizeof(mreq))
3749			len = sizeof(mreq);
3750		if (copy_from_sockptr(&mreq, optval, len))
3751			return -EFAULT;
3752		if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
3753			return -EINVAL;
3754		if (optname == PACKET_ADD_MEMBERSHIP)
3755			ret = packet_mc_add(sk, &mreq);
3756		else
3757			ret = packet_mc_drop(sk, &mreq);
3758		return ret;
3759	}
3760
3761	case PACKET_RX_RING:
3762	case PACKET_TX_RING:
3763	{
3764		union tpacket_req_u req_u;
3765		int len;
3766
3767		lock_sock(sk);
3768		switch (po->tp_version) {
3769		case TPACKET_V1:
3770		case TPACKET_V2:
3771			len = sizeof(req_u.req);
3772			break;
3773		case TPACKET_V3:
3774		default:
3775			len = sizeof(req_u.req3);
3776			break;
3777		}
3778		if (optlen < len) {
3779			ret = -EINVAL;
3780		} else {
3781			if (copy_from_sockptr(&req_u.req, optval, len))
3782				ret = -EFAULT;
3783			else
3784				ret = packet_set_ring(sk, &req_u, 0,
3785						    optname == PACKET_TX_RING);
3786		}
3787		release_sock(sk);
3788		return ret;
3789	}
3790	case PACKET_COPY_THRESH:
3791	{
3792		int val;
3793
3794		if (optlen != sizeof(val))
3795			return -EINVAL;
3796		if (copy_from_sockptr(&val, optval, sizeof(val)))
3797			return -EFAULT;
3798
3799		pkt_sk(sk)->copy_thresh = val;
3800		return 0;
3801	}
3802	case PACKET_VERSION:
3803	{
3804		int val;
3805
3806		if (optlen != sizeof(val))
3807			return -EINVAL;
3808		if (copy_from_sockptr(&val, optval, sizeof(val)))
3809			return -EFAULT;
3810		switch (val) {
3811		case TPACKET_V1:
3812		case TPACKET_V2:
3813		case TPACKET_V3:
3814			break;
3815		default:
3816			return -EINVAL;
3817		}
3818		lock_sock(sk);
3819		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3820			ret = -EBUSY;
3821		} else {
3822			po->tp_version = val;
3823			ret = 0;
3824		}
3825		release_sock(sk);
3826		return ret;
3827	}
3828	case PACKET_RESERVE:
3829	{
3830		unsigned int val;
3831
3832		if (optlen != sizeof(val))
3833			return -EINVAL;
3834		if (copy_from_sockptr(&val, optval, sizeof(val)))
3835			return -EFAULT;
3836		if (val > INT_MAX)
3837			return -EINVAL;
3838		lock_sock(sk);
3839		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3840			ret = -EBUSY;
3841		} else {
3842			po->tp_reserve = val;
3843			ret = 0;
3844		}
3845		release_sock(sk);
3846		return ret;
3847	}
3848	case PACKET_LOSS:
3849	{
3850		unsigned int val;
3851
3852		if (optlen != sizeof(val))
3853			return -EINVAL;
3854		if (copy_from_sockptr(&val, optval, sizeof(val)))
3855			return -EFAULT;
3856
3857		lock_sock(sk);
3858		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3859			ret = -EBUSY;
3860		} else {
3861			po->tp_loss = !!val;
3862			ret = 0;
3863		}
3864		release_sock(sk);
3865		return ret;
3866	}
3867	case PACKET_AUXDATA:
3868	{
3869		int val;
3870
3871		if (optlen < sizeof(val))
3872			return -EINVAL;
3873		if (copy_from_sockptr(&val, optval, sizeof(val)))
3874			return -EFAULT;
3875
3876		packet_sock_flag_set(po, PACKET_SOCK_AUXDATA, val);
3877		return 0;
3878	}
3879	case PACKET_ORIGDEV:
3880	{
3881		int val;
3882
3883		if (optlen < sizeof(val))
3884			return -EINVAL;
3885		if (copy_from_sockptr(&val, optval, sizeof(val)))
3886			return -EFAULT;
3887
3888		packet_sock_flag_set(po, PACKET_SOCK_ORIGDEV, val);
3889		return 0;
3890	}
3891	case PACKET_VNET_HDR:
3892	{
3893		int val;
3894
3895		if (sock->type != SOCK_RAW)
3896			return -EINVAL;
3897		if (optlen < sizeof(val))
3898			return -EINVAL;
3899		if (copy_from_sockptr(&val, optval, sizeof(val)))
3900			return -EFAULT;
3901
3902		lock_sock(sk);
3903		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3904			ret = -EBUSY;
3905		} else {
3906			po->has_vnet_hdr = !!val;
3907			ret = 0;
3908		}
3909		release_sock(sk);
3910		return ret;
3911	}
3912	case PACKET_TIMESTAMP:
3913	{
3914		int val;
3915
3916		if (optlen != sizeof(val))
3917			return -EINVAL;
3918		if (copy_from_sockptr(&val, optval, sizeof(val)))
3919			return -EFAULT;
3920
3921		po->tp_tstamp = val;
3922		return 0;
3923	}
3924	case PACKET_FANOUT:
3925	{
3926		struct fanout_args args = { 0 };
3927
3928		if (optlen != sizeof(int) && optlen != sizeof(args))
3929			return -EINVAL;
3930		if (copy_from_sockptr(&args, optval, optlen))
3931			return -EFAULT;
3932
3933		return fanout_add(sk, &args);
3934	}
3935	case PACKET_FANOUT_DATA:
3936	{
3937		/* Paired with the WRITE_ONCE() in fanout_add() */
3938		if (!READ_ONCE(po->fanout))
3939			return -EINVAL;
3940
3941		return fanout_set_data(po, optval, optlen);
3942	}
3943	case PACKET_IGNORE_OUTGOING:
3944	{
3945		int val;
3946
3947		if (optlen != sizeof(val))
3948			return -EINVAL;
3949		if (copy_from_sockptr(&val, optval, sizeof(val)))
3950			return -EFAULT;
3951		if (val < 0 || val > 1)
3952			return -EINVAL;
3953
3954		WRITE_ONCE(po->prot_hook.ignore_outgoing, !!val);
3955		return 0;
3956	}
3957	case PACKET_TX_HAS_OFF:
3958	{
3959		unsigned int val;
3960
3961		if (optlen != sizeof(val))
3962			return -EINVAL;
3963		if (copy_from_sockptr(&val, optval, sizeof(val)))
3964			return -EFAULT;
3965
3966		lock_sock(sk);
3967		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3968			ret = -EBUSY;
3969		} else {
3970			po->tp_tx_has_off = !!val;
3971			ret = 0;
3972		}
3973		release_sock(sk);
3974		return 0;
3975	}
3976	case PACKET_QDISC_BYPASS:
3977	{
3978		int val;
3979
3980		if (optlen != sizeof(val))
3981			return -EINVAL;
3982		if (copy_from_sockptr(&val, optval, sizeof(val)))
3983			return -EFAULT;
3984
3985		/* Paired with all lockless reads of po->xmit */
3986		WRITE_ONCE(po->xmit, val ? packet_direct_xmit : dev_queue_xmit);
3987		return 0;
3988	}
3989	default:
3990		return -ENOPROTOOPT;
3991	}
3992}
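
/*
 * A minimal userspace sketch of joining a fanout group through the
 * PACKET_FANOUT case above (illustrative only; the group id 42 is
 * arbitrary):
 *
 *	#include <sys/socket.h>
 *	#include <linux/if_packet.h>
 *
 *	int fanout_arg = 42 | (PACKET_FANOUT_HASH << 16);
 *
 *	setsockopt(fd, SOL_PACKET, PACKET_FANOUT, &fanout_arg,
 *		   sizeof(fanout_arg));
 *
 * Every bound socket that joins group 42 with the same mode and flags then
 * receives a share of the traffic; the handler above also accepts the
 * larger struct fanout_args, which additionally carries a maximum group
 * size.
 */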
3993
3994static int packet_getsockopt(struct socket *sock, int level, int optname,
3995			     char __user *optval, int __user *optlen)
3996{
3997	int len;
3998	int val, lv = sizeof(val);
3999	struct sock *sk = sock->sk;
4000	struct packet_sock *po = pkt_sk(sk);
4001	void *data = &val;
4002	union tpacket_stats_u st;
4003	struct tpacket_rollover_stats rstats;
4004	int drops;
4005
4006	if (level != SOL_PACKET)
4007		return -ENOPROTOOPT;
4008
4009	if (get_user(len, optlen))
4010		return -EFAULT;
4011
4012	if (len < 0)
4013		return -EINVAL;
4014
4015	switch (optname) {
4016	case PACKET_STATISTICS:
4017		spin_lock_bh(&sk->sk_receive_queue.lock);
4018		memcpy(&st, &po->stats, sizeof(st));
4019		memset(&po->stats, 0, sizeof(po->stats));
4020		spin_unlock_bh(&sk->sk_receive_queue.lock);
4021		drops = atomic_xchg(&po->tp_drops, 0);
4022
4023		if (po->tp_version == TPACKET_V3) {
4024			lv = sizeof(struct tpacket_stats_v3);
4025			st.stats3.tp_drops = drops;
4026			st.stats3.tp_packets += drops;
4027			data = &st.stats3;
4028		} else {
4029			lv = sizeof(struct tpacket_stats);
4030			st.stats1.tp_drops = drops;
4031			st.stats1.tp_packets += drops;
4032			data = &st.stats1;
4033		}
4034
4035		break;
4036	case PACKET_AUXDATA:
4037		val = packet_sock_flag(po, PACKET_SOCK_AUXDATA);
4038		break;
4039	case PACKET_ORIGDEV:
4040		val = packet_sock_flag(po, PACKET_SOCK_ORIGDEV);
4041		break;
4042	case PACKET_VNET_HDR:
4043		val = po->has_vnet_hdr;
4044		break;
4045	case PACKET_VERSION:
4046		val = po->tp_version;
4047		break;
4048	case PACKET_HDRLEN:
4049		if (len > sizeof(int))
4050			len = sizeof(int);
4051		if (len < sizeof(int))
4052			return -EINVAL;
4053		if (copy_from_user(&val, optval, len))
4054			return -EFAULT;
4055		switch (val) {
4056		case TPACKET_V1:
4057			val = sizeof(struct tpacket_hdr);
4058			break;
4059		case TPACKET_V2:
4060			val = sizeof(struct tpacket2_hdr);
4061			break;
4062		case TPACKET_V3:
4063			val = sizeof(struct tpacket3_hdr);
4064			break;
4065		default:
4066			return -EINVAL;
4067		}
4068		break;
4069	case PACKET_RESERVE:
4070		val = po->tp_reserve;
4071		break;
4072	case PACKET_LOSS:
4073		val = po->tp_loss;
4074		break;
4075	case PACKET_TIMESTAMP:
4076		val = po->tp_tstamp;
4077		break;
4078	case PACKET_FANOUT:
4079		val = (po->fanout ?
4080		       ((u32)po->fanout->id |
4081			((u32)po->fanout->type << 16) |
4082			((u32)po->fanout->flags << 24)) :
4083		       0);
4084		break;
4085	case PACKET_IGNORE_OUTGOING:
4086		val = READ_ONCE(po->prot_hook.ignore_outgoing);
4087		break;
4088	case PACKET_ROLLOVER_STATS:
4089		if (!po->rollover)
4090			return -EINVAL;
4091		rstats.tp_all = atomic_long_read(&po->rollover->num);
4092		rstats.tp_huge = atomic_long_read(&po->rollover->num_huge);
4093		rstats.tp_failed = atomic_long_read(&po->rollover->num_failed);
4094		data = &rstats;
4095		lv = sizeof(rstats);
4096		break;
4097	case PACKET_TX_HAS_OFF:
4098		val = po->tp_tx_has_off;
4099		break;
4100	case PACKET_QDISC_BYPASS:
4101		val = packet_use_direct_xmit(po);
4102		break;
4103	default:
4104		return -ENOPROTOOPT;
4105	}
4106
4107	if (len > lv)
4108		len = lv;
4109	if (put_user(len, optlen))
4110		return -EFAULT;
4111	if (copy_to_user(optval, data, len))
4112		return -EFAULT;
4113	return 0;
4114}
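/*
 * Editor's note (illustrative sketch, not part of the original source):
 * PACKET_STATISTICS is read-and-clear -- the handler above copies the
 * counters, zeroes them, and folds the drop count into tp_packets.
 * Assuming fd is a TPACKET_V1/V2 AF_PACKET socket, userspace reads them
 * roughly like this:
 *
 *	struct tpacket_stats st;
 *	socklen_t len = sizeof(st);
 *	getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &st, &len);
 *	printf("received %u, dropped %u\n", st.tp_packets, st.tp_drops);
 *
 * TPACKET_V3 sockets receive the larger struct tpacket_stats_v3 instead.
 */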
4115
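/*
 * Editor's note: packet_notifier() below runs for every netdevice event.
 * It walks all packet sockets in the device's namespace under RCU; when
 * the bound device goes down or unregisters, the protocol hook is
 * unregistered and the socket is flagged with ENETDOWN (and, on
 * unregister, detached from the device entirely).  When the device comes
 * back up, the hook is re-registered if a protocol is still configured.
 */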
4116static int packet_notifier(struct notifier_block *this,
4117			   unsigned long msg, void *ptr)
4118{
4119	struct sock *sk;
4120	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
4121	struct net *net = dev_net(dev);
4122
4123	rcu_read_lock();
4124	sk_for_each_rcu(sk, &net->packet.sklist) {
4125		struct packet_sock *po = pkt_sk(sk);
4126
4127		switch (msg) {
4128		case NETDEV_UNREGISTER:
4129			if (po->mclist)
4130				packet_dev_mclist_delete(dev, &po->mclist);
4131			fallthrough;
4132
4133		case NETDEV_DOWN:
4134			if (dev->ifindex == po->ifindex) {
4135				spin_lock(&po->bind_lock);
4136				if (po->running) {
4137					__unregister_prot_hook(sk, false);
4138					sk->sk_err = ENETDOWN;
4139					if (!sock_flag(sk, SOCK_DEAD))
4140						sk->sk_error_report(sk);
4141				}
4142				if (msg == NETDEV_UNREGISTER) {
4143					packet_cached_dev_reset(po);
4144					WRITE_ONCE(po->ifindex, -1);
4145					if (po->prot_hook.dev)
4146						dev_put(po->prot_hook.dev);
4147					po->prot_hook.dev = NULL;
4148				}
4149				spin_unlock(&po->bind_lock);
4150			}
4151			break;
4152		case NETDEV_UP:
4153			if (dev->ifindex == po->ifindex) {
4154				spin_lock(&po->bind_lock);
4155				if (po->num)
4156					register_prot_hook(sk);
4157				spin_unlock(&po->bind_lock);
4158			}
4159			break;
4160		}
4161	}
4162	rcu_read_unlock();
4163	return NOTIFY_DONE;
4164}
4165
4166
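/*
 * Editor's note: for packet sockets SIOCOUTQ reports the bytes still
 * accounted to the socket's write allocation, while SIOCINQ reports the
 * length of the packet at the head of the receive queue (not the total
 * queued bytes).  The remaining commands, when CONFIG_INET is enabled,
 * are simply forwarded to the inet ioctl handler.
 */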
4167static int packet_ioctl(struct socket *sock, unsigned int cmd,
4168			unsigned long arg)
4169{
4170	struct sock *sk = sock->sk;
4171
4172	switch (cmd) {
4173	case SIOCOUTQ:
4174	{
4175		int amount = sk_wmem_alloc_get(sk);
4176
4177		return put_user(amount, (int __user *)arg);
4178	}
4179	case SIOCINQ:
4180	{
4181		struct sk_buff *skb;
4182		int amount = 0;
4183
4184		spin_lock_bh(&sk->sk_receive_queue.lock);
4185		skb = skb_peek(&sk->sk_receive_queue);
4186		if (skb)
4187			amount = skb->len;
4188		spin_unlock_bh(&sk->sk_receive_queue.lock);
4189		return put_user(amount, (int __user *)arg);
4190	}
4191#ifdef CONFIG_INET
4192	case SIOCADDRT:
4193	case SIOCDELRT:
4194	case SIOCDARP:
4195	case SIOCGARP:
4196	case SIOCSARP:
4197	case SIOCGIFADDR:
4198	case SIOCSIFADDR:
4199	case SIOCGIFBRDADDR:
4200	case SIOCSIFBRDADDR:
4201	case SIOCGIFNETMASK:
4202	case SIOCSIFNETMASK:
4203	case SIOCGIFDSTADDR:
4204	case SIOCSIFDSTADDR:
4205	case SIOCSIFFLAGS:
4206		return inet_dgram_ops.ioctl(sock, cmd, arg);
4207#endif
4208
4209	default:
4210		return -ENOIOCTLCMD;
4211	}
4212	return 0;
4213}
4214
4215static __poll_t packet_poll(struct file *file, struct socket *sock,
4216				poll_table *wait)
4217{
4218	struct sock *sk = sock->sk;
4219	struct packet_sock *po = pkt_sk(sk);
4220	__poll_t mask = datagram_poll(file, sock, wait);
4221
4222	spin_lock_bh(&sk->sk_receive_queue.lock);
4223	if (po->rx_ring.pg_vec) {
4224		if (!packet_previous_rx_frame(po, &po->rx_ring,
4225			TP_STATUS_KERNEL))
4226			mask |= EPOLLIN | EPOLLRDNORM;
4227	}
4228	packet_rcv_try_clear_pressure(po);
4229	spin_unlock_bh(&sk->sk_receive_queue.lock);
4230	spin_lock_bh(&sk->sk_write_queue.lock);
4231	if (po->tx_ring.pg_vec) {
4232		if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
4233			mask |= EPOLLOUT | EPOLLWRNORM;
4234	}
4235	spin_unlock_bh(&sk->sk_write_queue.lock);
4236	return mask;
4237}
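/*
 * Editor's note (illustrative sketch, not part of the original source):
 * with an RX ring mapped, a typical consumer blocks in poll() and then
 * walks frames whose status has left TP_STATUS_KERNEL.  Assuming fd is
 * the ring's AF_PACKET socket:
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLRDNORM };
 *	poll(&pfd, 1, -1);
 *	(consume frames marked TP_STATUS_USER, then hand each frame back
 *	 to the kernel by resetting its status to TP_STATUS_KERNEL)
 *
 * The TX-ring side is symmetric: packet_poll() above signals EPOLLOUT
 * while the current frame is TP_STATUS_AVAILABLE.
 */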
4238
4239
4240/* Dirty? Well, I still have not found a better way to account
4241 * for user mmaps.
4242 */
4243
4244static void packet_mm_open(struct vm_area_struct *vma)
4245{
4246	struct file *file = vma->vm_file;
4247	struct socket *sock = file->private_data;
4248	struct sock *sk = sock->sk;
4249
4250	if (sk)
4251		atomic_long_inc(&pkt_sk(sk)->mapped);
4252}
4253
4254static void packet_mm_close(struct vm_area_struct *vma)
4255{
4256	struct file *file = vma->vm_file;
4257	struct socket *sock = file->private_data;
4258	struct sock *sk = sock->sk;
4259
4260	if (sk)
4261		atomic_long_dec(&pkt_sk(sk)->mapped);
4262}
4263
4264static const struct vm_operations_struct packet_mmap_ops = {
4265	.open	=	packet_mm_open,
4266	.close	=	packet_mm_close,
4267};
4268
4269static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
4270			unsigned int len)
4271{
4272	int i;
4273
4274	for (i = 0; i < len; i++) {
4275		if (likely(pg_vec[i].buffer)) {
4276			if (is_vmalloc_addr(pg_vec[i].buffer))
4277				vfree(pg_vec[i].buffer);
4278			else
4279				free_pages((unsigned long)pg_vec[i].buffer,
4280					   order);
4281			pg_vec[i].buffer = NULL;
4282		}
4283	}
4284	kfree(pg_vec);
4285}
4286
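/*
 * Editor's note: alloc_one_pg_vec_page() below tries three allocation
 * strategies in order of preference: physically contiguous pages without
 * retrying, then vmalloc(), and finally contiguous pages again with
 * __GFP_NORETRY cleared so the allocator may reclaim.  free_pg_vec()
 * above checks is_vmalloc_addr(), so either flavour is freed correctly.
 */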
4287static char *alloc_one_pg_vec_page(unsigned long order)
4288{
4289	char *buffer;
4290	gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
4291			  __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
4292
4293	buffer = (char *) __get_free_pages(gfp_flags, order);
4294	if (buffer)
4295		return buffer;
4296
4297	/* __get_free_pages failed, fall back to vmalloc */
4298	buffer = vzalloc(array_size((1 << order), PAGE_SIZE));
4299	if (buffer)
4300		return buffer;
4301
4302	/* vmalloc also failed; retry the page allocator, this time allowing reclaim */
4303	gfp_flags &= ~__GFP_NORETRY;
4304	buffer = (char *) __get_free_pages(gfp_flags, order);
4305	if (buffer)
4306		return buffer;
4307
4308	/* complete and utter failure */
4309	return NULL;
4310}
4311
4312static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
4313{
4314	unsigned int block_nr = req->tp_block_nr;
4315	struct pgv *pg_vec;
4316	int i;
4317
4318	pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL | __GFP_NOWARN);
4319	if (unlikely(!pg_vec))
4320		goto out;
4321
4322	for (i = 0; i < block_nr; i++) {
4323		pg_vec[i].buffer = alloc_one_pg_vec_page(order);
4324		if (unlikely(!pg_vec[i].buffer))
4325			goto out_free_pgvec;
4326	}
4327
4328out:
4329	return pg_vec;
4330
4331out_free_pgvec:
4332	free_pg_vec(pg_vec, order, block_nr);
4333	pg_vec = NULL;
4334	goto out;
4335}
4336
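/*
 * Editor's note: packet_set_ring() below is the PACKET_RX_RING /
 * PACKET_TX_RING worker.  It validates the requested geometry (page-aligned
 * block size, frame size large enough for the chosen tpacket header plus
 * tp_reserve, frame count consistent with blocks * frames-per-block),
 * allocates the block vector, temporarily unregisters the protocol hook,
 * and swaps the new ring in under pg_vec_lock, refusing the switch while
 * the old ring is still mmapped.
 */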
4337static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
4338		int closing, int tx_ring)
4339{
4340	struct pgv *pg_vec = NULL;
4341	struct packet_sock *po = pkt_sk(sk);
4342	unsigned long *rx_owner_map = NULL;
4343	int was_running, order = 0;
4344	struct packet_ring_buffer *rb;
4345	struct sk_buff_head *rb_queue;
4346	__be16 num;
4347	int err;
4348	/* Local alias, kept to minimize code churn */
4349	struct tpacket_req *req = &req_u->req;
4350
4351	rb = tx_ring ? &po->tx_ring : &po->rx_ring;
4352	rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
4353
4354	err = -EBUSY;
4355	if (!closing) {
4356		if (atomic_long_read(&po->mapped))
4357			goto out;
4358		if (packet_read_pending(rb))
4359			goto out;
4360	}
4361
4362	if (req->tp_block_nr) {
4363		unsigned int min_frame_size;
4364
4365		/* Sanity tests and some calculations */
4366		err = -EBUSY;
4367		if (unlikely(rb->pg_vec))
4368			goto out;
4369
4370		switch (po->tp_version) {
4371		case TPACKET_V1:
4372			po->tp_hdrlen = TPACKET_HDRLEN;
4373			break;
4374		case TPACKET_V2:
4375			po->tp_hdrlen = TPACKET2_HDRLEN;
4376			break;
4377		case TPACKET_V3:
4378			po->tp_hdrlen = TPACKET3_HDRLEN;
4379			break;
4380		}
4381
4382		err = -EINVAL;
4383		if (unlikely((int)req->tp_block_size <= 0))
4384			goto out;
4385		if (unlikely(!PAGE_ALIGNED(req->tp_block_size)))
4386			goto out;
4387		min_frame_size = po->tp_hdrlen + po->tp_reserve;
4388		if (po->tp_version >= TPACKET_V3 &&
4389		    req->tp_block_size <
4390		    BLK_PLUS_PRIV((u64)req_u->req3.tp_sizeof_priv) + min_frame_size)
4391			goto out;
4392		if (unlikely(req->tp_frame_size < min_frame_size))
4393			goto out;
4394		if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
4395			goto out;
4396
4397		rb->frames_per_block = req->tp_block_size / req->tp_frame_size;
4398		if (unlikely(rb->frames_per_block == 0))
4399			goto out;
4400		if (unlikely(rb->frames_per_block > UINT_MAX / req->tp_block_nr))
4401			goto out;
4402		if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
4403					req->tp_frame_nr))
4404			goto out;
4405
4406		err = -ENOMEM;
4407		order = get_order(req->tp_block_size);
4408		pg_vec = alloc_pg_vec(req, order);
4409		if (unlikely(!pg_vec))
4410			goto out;
4411		switch (po->tp_version) {
4412		case TPACKET_V3:
4413			/* Block transmit is not supported yet */
4414			if (!tx_ring) {
4415				init_prb_bdqc(po, rb, pg_vec, req_u);
4416			} else {
4417				struct tpacket_req3 *req3 = &req_u->req3;
4418
4419				if (req3->tp_retire_blk_tov ||
4420				    req3->tp_sizeof_priv ||
4421				    req3->tp_feature_req_word) {
4422					err = -EINVAL;
4423					goto out_free_pg_vec;
4424				}
4425			}
4426			break;
4427		default:
4428			if (!tx_ring) {
4429				rx_owner_map = bitmap_alloc(req->tp_frame_nr,
4430					GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO);
4431				if (!rx_owner_map)
4432					goto out_free_pg_vec;
4433			}
4434			break;
4435		}
4436	}
4437	/* Done */
4438	else {
4439		err = -EINVAL;
4440		if (unlikely(req->tp_frame_nr))
4441			goto out;
4442	}
4443
4444
4445	/* Detach socket from network */
4446	spin_lock(&po->bind_lock);
4447	was_running = po->running;
4448	num = po->num;
4449	if (was_running) {
4450		WRITE_ONCE(po->num, 0);
4451		__unregister_prot_hook(sk, false);
4452	}
4453	spin_unlock(&po->bind_lock);
4454
4455	synchronize_net();
4456
4457	err = -EBUSY;
4458	mutex_lock(&po->pg_vec_lock);
4459	if (closing || atomic_long_read(&po->mapped) == 0) {
4460		err = 0;
4461		spin_lock_bh(&rb_queue->lock);
4462		swap(rb->pg_vec, pg_vec);
4463		if (po->tp_version <= TPACKET_V2)
4464			swap(rb->rx_owner_map, rx_owner_map);
4465		rb->frame_max = (req->tp_frame_nr - 1);
4466		rb->head = 0;
4467		rb->frame_size = req->tp_frame_size;
4468		spin_unlock_bh(&rb_queue->lock);
4469
4470		swap(rb->pg_vec_order, order);
4471		swap(rb->pg_vec_len, req->tp_block_nr);
4472
4473		rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
4474		po->prot_hook.func = (po->rx_ring.pg_vec) ?
4475						tpacket_rcv : packet_rcv;
4476		skb_queue_purge(rb_queue);
4477		if (atomic_long_read(&po->mapped))
4478			pr_err("packet_mmap: vma is busy: %ld\n",
4479			       atomic_long_read(&po->mapped));
4480	}
4481	mutex_unlock(&po->pg_vec_lock);
4482
4483	spin_lock(&po->bind_lock);
4484	if (was_running) {
4485		WRITE_ONCE(po->num, num);
4486		register_prot_hook(sk);
4487	}
4488	spin_unlock(&po->bind_lock);
4489	if (pg_vec && (po->tp_version > TPACKET_V2)) {
4490		/* Because we don't support block-based V3 on tx-ring */
4491		if (!tx_ring)
4492			prb_shutdown_retire_blk_timer(po, rb_queue);
4493	}
4494
4495out_free_pg_vec:
4496	if (pg_vec) {
4497		bitmap_free(rx_owner_map);
4498		free_pg_vec(pg_vec, order, req->tp_block_nr);
4499	}
4500out:
4501	return err;
4502}
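/*
 * Editor's note (illustrative sketch, not part of the original source):
 * a userspace RX ring is typically set up as below.  fd is assumed to be
 * an open AF_PACKET socket; the geometry (64 blocks of 64 KiB holding
 * 2 KiB frames) is arbitrary but satisfies the checks above on a
 * 4 KiB-page system:
 *
 *	int ver = TPACKET_V2;
 *	setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));
 *
 *	struct tpacket_req req = {
 *		.tp_block_size = 1 << 16,
 *		.tp_block_nr   = 64,
 *		.tp_frame_size = 1 << 11,
 *		.tp_frame_nr   = 64 * ((1 << 16) / (1 << 11)),
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 *
 *	void *ring = mmap(NULL, (size_t)req.tp_block_size * req.tp_block_nr,
 *			  PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *
 * The mmap() must cover the configured ring(s) exactly and start at
 * offset 0, as enforced by packet_mmap() below.
 */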
4503
4504static int packet_mmap(struct file *file, struct socket *sock,
4505		struct vm_area_struct *vma)
4506{
4507	struct sock *sk = sock->sk;
4508	struct packet_sock *po = pkt_sk(sk);
4509	unsigned long size, expected_size;
4510	struct packet_ring_buffer *rb;
4511	unsigned long start;
4512	int err = -EINVAL;
4513	int i;
4514
4515	if (vma->vm_pgoff)
4516		return -EINVAL;
4517
4518	mutex_lock(&po->pg_vec_lock);
4519
4520	expected_size = 0;
4521	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
4522		if (rb->pg_vec) {
4523			expected_size += rb->pg_vec_len
4524						* rb->pg_vec_pages
4525						* PAGE_SIZE;
4526		}
4527	}
4528
4529	if (expected_size == 0)
4530		goto out;
4531
4532	size = vma->vm_end - vma->vm_start;
4533	if (size != expected_size)
4534		goto out;
4535
4536	start = vma->vm_start;
4537	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
4538		if (rb->pg_vec == NULL)
4539			continue;
4540
4541		for (i = 0; i < rb->pg_vec_len; i++) {
4542			struct page *page;
4543			void *kaddr = rb->pg_vec[i].buffer;
4544			int pg_num;
4545
4546			for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
4547				page = pgv_to_page(kaddr);
4548				err = vm_insert_page(vma, start, page);
4549				if (unlikely(err))
4550					goto out;
4551				start += PAGE_SIZE;
4552				kaddr += PAGE_SIZE;
4553			}
4554		}
4555	}
4556
4557	atomic_long_inc(&po->mapped);
4558	vma->vm_ops = &packet_mmap_ops;
4559	err = 0;
4560
4561out:
4562	mutex_unlock(&po->pg_vec_lock);
4563	return err;
4564}
4565
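/*
 * Editor's note: two proto_ops tables follow.  packet_ops_spkt serves the
 * legacy SOCK_PACKET interface and deliberately lacks setsockopt,
 * getsockopt and mmap support; packet_ops is the full-featured table used
 * for SOCK_RAW and SOCK_DGRAM PF_PACKET sockets, wiring up the ring,
 * fanout and socket-option machinery implemented above.
 */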
4566static const struct proto_ops packet_ops_spkt = {
4567	.family =	PF_PACKET,
4568	.owner =	THIS_MODULE,
4569	.release =	packet_release,
4570	.bind =		packet_bind_spkt,
4571	.connect =	sock_no_connect,
4572	.socketpair =	sock_no_socketpair,
4573	.accept =	sock_no_accept,
4574	.getname =	packet_getname_spkt,
4575	.poll =		datagram_poll,
4576	.ioctl =	packet_ioctl,
4577	.gettstamp =	sock_gettstamp,
4578	.listen =	sock_no_listen,
4579	.shutdown =	sock_no_shutdown,
4580	.sendmsg =	packet_sendmsg_spkt,
4581	.recvmsg =	packet_recvmsg,
4582	.mmap =		sock_no_mmap,
4583	.sendpage =	sock_no_sendpage,
4584};
4585
4586static const struct proto_ops packet_ops = {
4587	.family =	PF_PACKET,
4588	.owner =	THIS_MODULE,
4589	.release =	packet_release,
4590	.bind =		packet_bind,
4591	.connect =	sock_no_connect,
4592	.socketpair =	sock_no_socketpair,
4593	.accept =	sock_no_accept,
4594	.getname =	packet_getname,
4595	.poll =		packet_poll,
4596	.ioctl =	packet_ioctl,
4597	.gettstamp =	sock_gettstamp,
4598	.listen =	sock_no_listen,
4599	.shutdown =	sock_no_shutdown,
4600	.setsockopt =	packet_setsockopt,
4601	.getsockopt =	packet_getsockopt,
4602	.sendmsg =	packet_sendmsg,
4603	.recvmsg =	packet_recvmsg,
4604	.mmap =		packet_mmap,
4605	.sendpage =	sock_no_sendpage,
4606};
4607
4608static const struct net_proto_family packet_family_ops = {
4609	.family =	PF_PACKET,
4610	.create =	packet_create,
4611	.owner	=	THIS_MODULE,
4612};
4613
4614static struct notifier_block packet_netdev_notifier = {
4615	.notifier_call =	packet_notifier,
4616};
4617
4618#ifdef CONFIG_PROC_FS
4619
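/*
 * Editor's note: these seq_file handlers back /proc/net/packet.  They walk
 * the per-namespace socket list under RCU and print one line per packet
 * socket in the format of the header emitted by packet_seq_show(); the
 * file can be inspected with e.g. "cat /proc/net/packet".
 */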
4620static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
4621	__acquires(RCU)
4622{
4623	struct net *net = seq_file_net(seq);
4624
4625	rcu_read_lock();
4626	return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
4627}
4628
4629static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4630{
4631	struct net *net = seq_file_net(seq);
4632	return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
4633}
4634
4635static void packet_seq_stop(struct seq_file *seq, void *v)
4636	__releases(RCU)
4637{
4638	rcu_read_unlock();
4639}
4640
4641static int packet_seq_show(struct seq_file *seq, void *v)
4642{
4643	if (v == SEQ_START_TOKEN)
4644		seq_puts(seq, "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
4645	else {
4646		struct sock *s = sk_entry(v);
4647		const struct packet_sock *po = pkt_sk(s);
4648
4649		seq_printf(seq,
4650			   "%pK %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
4651			   s,
4652			   refcount_read(&s->sk_refcnt),
4653			   s->sk_type,
4654			   ntohs(READ_ONCE(po->num)),
4655			   READ_ONCE(po->ifindex),
4656			   po->running,
4657			   atomic_read(&s->sk_rmem_alloc),
4658			   from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),
4659			   sock_i_ino(s));
4660	}
4661
4662	return 0;
4663}
4664
4665static const struct seq_operations packet_seq_ops = {
4666	.start	= packet_seq_start,
4667	.next	= packet_seq_next,
4668	.stop	= packet_seq_stop,
4669	.show	= packet_seq_show,
4670};
4671#endif
4672
4673static int __net_init packet_net_init(struct net *net)
4674{
4675	mutex_init(&net->packet.sklist_lock);
4676	INIT_HLIST_HEAD(&net->packet.sklist);
4677
4678#ifdef CONFIG_PROC_FS
4679	if (!proc_create_net("packet", 0, net->proc_net, &packet_seq_ops,
4680			sizeof(struct seq_net_private)))
4681		return -ENOMEM;
4682#endif /* CONFIG_PROC_FS */
4683
4684	return 0;
4685}
4686
4687static void __net_exit packet_net_exit(struct net *net)
4688{
4689	remove_proc_entry("packet", net->proc_net);
4690	WARN_ON_ONCE(!hlist_empty(&net->packet.sklist));
4691}
4692
4693static struct pernet_operations packet_net_ops = {
4694	.init = packet_net_init,
4695	.exit = packet_net_exit,
4696};
4697
4698
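/*
 * Editor's note: packet_exit() below tears down, in this order, the
 * netdevice notifier, the per-namespace state, the PF_PACKET socket
 * family and the protocol -- the reverse of the registration order in
 * packet_init(), which likewise unwinds in reverse on partial failure.
 */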
4699static void __exit packet_exit(void)
4700{
4701	unregister_netdevice_notifier(&packet_netdev_notifier);
4702	unregister_pernet_subsys(&packet_net_ops);
4703	sock_unregister(PF_PACKET);
4704	proto_unregister(&packet_proto);
4705}
4706
4707static int __init packet_init(void)
4708{
4709	int rc;
4710
4711	rc = proto_register(&packet_proto, 0);
4712	if (rc)
4713		goto out;
4714	rc = sock_register(&packet_family_ops);
4715	if (rc)
4716		goto out_proto;
4717	rc = register_pernet_subsys(&packet_net_ops);
4718	if (rc)
4719		goto out_sock;
4720	rc = register_netdevice_notifier(&packet_netdev_notifier);
4721	if (rc)
4722		goto out_pernet;
4723
4724	return 0;
4725
4726out_pernet:
4727	unregister_pernet_subsys(&packet_net_ops);
4728out_sock:
4729	sock_unregister(PF_PACKET);
4730out_proto:
4731	proto_unregister(&packet_proto);
4732out:
4733	return rc;
4734}
4735
4736module_init(packet_init);
4737module_exit(packet_exit);
4738MODULE_LICENSE("GPL");
4739MODULE_ALIAS_NETPROTO(PF_PACKET);
4740