TOMOYO Linux Cross Reference
Linux/net/packet/af_packet.c


  1 /*
  2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
  3  *              operating system.  INET is implemented using the  BSD Socket
  4  *              interface as the means of communication with the user level.
  5  *
  6  *              PACKET - implements raw packet sockets.
  7  *
  8  * Authors:     Ross Biro
  9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 11  *
 12  * Fixes:
 13  *              Alan Cox        :       verify_area() now used correctly
 14  *              Alan Cox        :       new skbuff lists, look ma no backlogs!
 15  *              Alan Cox        :       tidied skbuff lists.
 16  *              Alan Cox        :       Now uses generic datagram routines I
 17  *                                      added. Also fixed the peek/read crash
 18  *                                      from all old Linux datagram code.
 19  *              Alan Cox        :       Uses the improved datagram code.
 20  *              Alan Cox        :       Added NULL's for socket options.
 21  *              Alan Cox        :       Re-commented the code.
 22  *              Alan Cox        :       Use new kernel side addressing
 23  *              Rob Janssen     :       Correct MTU usage.
 24  *              Dave Platt      :       Counter leaks caused by incorrect
 25  *                                      interrupt locking and some slightly
 26  *                                      dubious gcc output. Can you read
 27  *                                      compiler: it said _VOLATILE_
 28  *      Richard Kooijman        :       Timestamp fixes.
 29  *              Alan Cox        :       New buffers. Use sk->mac.raw.
 30  *              Alan Cox        :       sendmsg/recvmsg support.
 31  *              Alan Cox        :       Protocol setting support
 32  *      Alexey Kuznetsov        :       Untied from IPv4 stack.
 33  *      Cyrus Durgin            :       Fixed kerneld for kmod.
 34  *      Michal Ostrowski        :       Module initialization cleanup.
 35  *         Ulises Alonso        :       Frame number limit removal and
 36  *                                      packet_set_ring memory leak.
 37  *              Eric Biederman  :       Allow for > 8 byte hardware addresses.
 38  *                                      The convention is that longer addresses
 39  *                                      will simply extend the hardware address
 40  *                                      byte arrays at the end of sockaddr_ll
 41  *                                      and packet_mreq.
 42  *              Johann Baudy    :       Added TX RING.
 43  *              Chetan Loke     :       Implemented TPACKET_V3 block abstraction
 44  *                                      layer.
 45  *                                      Copyright (C) 2011, <lokec@ccs.neu.edu>
 46  *
 47  *
 48  *              This program is free software; you can redistribute it and/or
 49  *              modify it under the terms of the GNU General Public License
 50  *              as published by the Free Software Foundation; either version
 51  *              2 of the License, or (at your option) any later version.
 52  *
 53  */
 54 
 55 #include <linux/types.h>
 56 #include <linux/mm.h>
 57 #include <linux/capability.h>
 58 #include <linux/fcntl.h>
 59 #include <linux/socket.h>
 60 #include <linux/in.h>
 61 #include <linux/inet.h>
 62 #include <linux/netdevice.h>
 63 #include <linux/if_packet.h>
 64 #include <linux/wireless.h>
 65 #include <linux/kernel.h>
 66 #include <linux/kmod.h>
 67 #include <linux/slab.h>
 68 #include <linux/vmalloc.h>
 69 #include <net/net_namespace.h>
 70 #include <net/ip.h>
 71 #include <net/protocol.h>
 72 #include <linux/skbuff.h>
 73 #include <net/sock.h>
 74 #include <linux/errno.h>
 75 #include <linux/timer.h>
 76 #include <asm/uaccess.h>
 77 #include <asm/ioctls.h>
 78 #include <asm/page.h>
 79 #include <asm/cacheflush.h>
 80 #include <asm/io.h>
 81 #include <linux/proc_fs.h>
 82 #include <linux/seq_file.h>
 83 #include <linux/poll.h>
 84 #include <linux/module.h>
 85 #include <linux/init.h>
 86 #include <linux/mutex.h>
 87 #include <linux/if_vlan.h>
 88 #include <linux/virtio_net.h>
 89 #include <linux/errqueue.h>
 90 #include <linux/net_tstamp.h>
 91 #include <net/flow_keys.h>
 92 
 93 #ifdef CONFIG_INET
 94 #include <net/inet_common.h>
 95 #endif
 96 
 97 #include "internal.h"
 98 
 99 /*
100    Assumptions:
101    - if a device has no dev->hard_header routine, it adds and removes the ll
102      header itself. In this case the ll header is invisible outside of the
103      device, but higher levels should still reserve dev->hard_header_len.
104      Some devices are clever enough to reallocate the skb when the header
105      will not fit into the reserved space (tunnel); others are silly
106      (PPP).
107    - a packet socket receives packets with the ll header already pulled,
108      so SOCK_RAW should push it back.
109 
110 On receive:
111 -----------
112 
113 Incoming, dev->hard_header!=NULL
114    mac_header -> ll header
115    data       -> data
116 
117 Outgoing, dev->hard_header!=NULL
118    mac_header -> ll header
119    data       -> ll header
120 
121 Incoming, dev->hard_header==NULL
122    mac_header -> UNKNOWN position. It very likely points to the ll
123                  header. PPP does this, which is wrong because it introduces
124                  asymmetry between the rx and tx paths.
125    data       -> data
126 
127 Outgoing, dev->hard_header==NULL
128    mac_header -> data. ll header is still not built!
129    data       -> data
130 
131 Summary
132   If dev->hard_header==NULL we are unlikely to restore a sensible ll header.
133 
134 
135 On transmit:
136 ------------
137 
138 dev->hard_header != NULL
139    mac_header -> ll header
140    data       -> ll header
141 
142 dev->hard_header == NULL (ll header is added by device, we cannot control it)
143    mac_header -> data
144    data       -> data
145 
146    We should set nh.raw on output to the correct position;
147    the packet classifier depends on it.
148  */
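
What the conventions above mean for a user of this family: with SOCK_RAW the
link-layer header is part of the buffer handed to user space, while SOCK_DGRAM
strips it and only describes it via sockaddr_ll. A minimal user-space sketch of
the SOCK_RAW receive side (illustrative only, not part of this file; error
handling omitted, requires CAP_NET_RAW):

#include <stdio.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>
#include <arpa/inet.h>

int main(void)
{
        /* SOCK_RAW: frames arrive with the ll (Ethernet) header included */
        int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
        unsigned char buf[2048];
        struct sockaddr_ll from;
        socklen_t fromlen = sizeof(from);
        ssize_t n;

        n = recvfrom(fd, buf, sizeof(buf), 0,
                     (struct sockaddr *)&from, &fromlen);
        if (n >= (ssize_t)sizeof(struct ethhdr)) {
                struct ethhdr *eh = (struct ethhdr *)buf;
                printf("ifindex %d proto 0x%04x\n",
                       from.sll_ifindex, ntohs(eh->h_proto));
        }
        return 0;
}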
149 
150 /* Private packet socket structures. */
151 
152 /* identical to struct packet_mreq except it has
153  * a longer address field.
154  */
155 struct packet_mreq_max {
156         int             mr_ifindex;
157         unsigned short  mr_type;
158         unsigned short  mr_alen;
159         unsigned char   mr_address[MAX_ADDR_LEN];
160 };
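
struct packet_mreq_max mirrors the user-visible struct packet_mreq from
<linux/if_packet.h>, with room for longer hardware addresses. User space fills
the smaller struct and passes it via setsockopt(); a sketch of enabling
promiscuous mode on one interface (illustrative only; error handling omitted):

#include <string.h>
#include <sys/socket.h>
#include <net/if.h>             /* if_nametoindex() */
#include <linux/if_packet.h>

static int enable_promisc(int fd, const char *ifname)
{
        struct packet_mreq mreq;

        memset(&mreq, 0, sizeof(mreq));
        mreq.mr_ifindex = if_nametoindex(ifname);
        mreq.mr_type = PACKET_MR_PROMISC;       /* no mr_address needed */

        return setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP,
                          &mreq, sizeof(mreq));
}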
161 
162 static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
163                 int closing, int tx_ring);
164 
165 
166 #define V3_ALIGNMENT    (8)
167 
168 #define BLK_HDR_LEN     (ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))
169 
170 #define BLK_PLUS_PRIV(sz_of_priv) \
171         (BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))
172 
173 #define PGV_FROM_VMALLOC 1
174 
175 #define BLOCK_STATUS(x) ((x)->hdr.bh1.block_status)
176 #define BLOCK_NUM_PKTS(x)       ((x)->hdr.bh1.num_pkts)
177 #define BLOCK_O2FP(x)           ((x)->hdr.bh1.offset_to_first_pkt)
178 #define BLOCK_LEN(x)            ((x)->hdr.bh1.blk_len)
179 #define BLOCK_SNUM(x)           ((x)->hdr.bh1.seq_num)
180 #define BLOCK_O2PRIV(x) ((x)->offset_to_priv)
181 #define BLOCK_PRIV(x)           ((void *)((char *)(x) + BLOCK_O2PRIV(x)))
182 
183 struct packet_sock;
184 static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
185 
186 static void *packet_previous_frame(struct packet_sock *po,
187                 struct packet_ring_buffer *rb,
188                 int status);
189 static void packet_increment_head(struct packet_ring_buffer *buff);
190 static int prb_curr_blk_in_use(struct tpacket_kbdq_core *,
191                         struct tpacket_block_desc *);
192 static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
193                         struct packet_sock *);
194 static void prb_retire_current_block(struct tpacket_kbdq_core *,
195                 struct packet_sock *, unsigned int status);
196 static int prb_queue_frozen(struct tpacket_kbdq_core *);
197 static void prb_open_block(struct tpacket_kbdq_core *,
198                 struct tpacket_block_desc *);
199 static void prb_retire_rx_blk_timer_expired(unsigned long);
200 static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
201 static void prb_init_blk_timer(struct packet_sock *,
202                 struct tpacket_kbdq_core *,
203                 void (*func) (unsigned long));
204 static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
205 static void prb_clear_rxhash(struct tpacket_kbdq_core *,
206                 struct tpacket3_hdr *);
207 static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
208                 struct tpacket3_hdr *);
209 static void packet_flush_mclist(struct sock *sk);
210 
211 struct packet_skb_cb {
212         unsigned int origlen;
213         union {
214                 struct sockaddr_pkt pkt;
215                 struct sockaddr_ll ll;
216         } sa;
217 };
218 
219 #define PACKET_SKB_CB(__skb)    ((struct packet_skb_cb *)((__skb)->cb))
220 
221 #define GET_PBDQC_FROM_RB(x)    ((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
222 #define GET_PBLOCK_DESC(x, bid) \
223         ((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
224 #define GET_CURR_PBLOCK_DESC_FROM_CORE(x)       \
225         ((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
226 #define GET_NEXT_PRB_BLK_NUM(x) \
227         (((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
228         ((x)->kactive_blk_num+1) : 0)
229 
230 static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
231 static void __fanout_link(struct sock *sk, struct packet_sock *po);
232 
233 /* register_prot_hook must be invoked with the po->bind_lock held,
234  * or from a context in which asynchronous accesses to the packet
235  * socket are not possible (packet_create()).
236  */
237 static void register_prot_hook(struct sock *sk)
238 {
239         struct packet_sock *po = pkt_sk(sk);
240         if (!po->running) {
241                 if (po->fanout)
242                         __fanout_link(sk, po);
243                 else
244                         dev_add_pack(&po->prot_hook);
245                 sock_hold(sk);
246                 po->running = 1;
247         }
248 }
249 
250 /* {,__}unregister_prot_hook() must be invoked with the po->bind_lock
251  * held.   If the sync parameter is true, we will temporarily drop
252  * the po->bind_lock and do a synchronize_net to make sure no
253  * asynchronous packet processing paths still refer to the elements
254  * of po->prot_hook.  If the sync parameter is false, it is the
255  * caller's responsibility to take care of this.
256  */
257 static void __unregister_prot_hook(struct sock *sk, bool sync)
258 {
259         struct packet_sock *po = pkt_sk(sk);
260 
261         po->running = 0;
262         if (po->fanout)
263                 __fanout_unlink(sk, po);
264         else
265                 __dev_remove_pack(&po->prot_hook);
266         __sock_put(sk);
267 
268         if (sync) {
269                 spin_unlock(&po->bind_lock);
270                 synchronize_net();
271                 spin_lock(&po->bind_lock);
272         }
273 }
274 
275 static void unregister_prot_hook(struct sock *sk, bool sync)
276 {
277         struct packet_sock *po = pkt_sk(sk);
278 
279         if (po->running)
280                 __unregister_prot_hook(sk, sync);
281 }
282 
283 static inline __pure struct page *pgv_to_page(void *addr)
284 {
285         if (is_vmalloc_addr(addr))
286                 return vmalloc_to_page(addr);
287         return virt_to_page(addr);
288 }
289 
290 static void __packet_set_status(struct packet_sock *po, void *frame, int status)
291 {
292         union {
293                 struct tpacket_hdr *h1;
294                 struct tpacket2_hdr *h2;
295                 void *raw;
296         } h;
297 
298         h.raw = frame;
299         switch (po->tp_version) {
300         case TPACKET_V1:
301                 h.h1->tp_status = status;
302                 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
303                 break;
304         case TPACKET_V2:
305                 h.h2->tp_status = status;
306                 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
307                 break;
308         case TPACKET_V3:
309         default:
310                 WARN(1, "TPACKET version not supported.\n");
311                 BUG();
312         }
313 
314         smp_wmb();
315 }
316 
317 static int __packet_get_status(struct packet_sock *po, void *frame)
318 {
319         union {
320                 struct tpacket_hdr *h1;
321                 struct tpacket2_hdr *h2;
322                 void *raw;
323         } h;
324 
325         smp_rmb();
326 
327         h.raw = frame;
328         switch (po->tp_version) {
329         case TPACKET_V1:
330                 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
331                 return h.h1->tp_status;
332         case TPACKET_V2:
333                 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
334                 return h.h2->tp_status;
335         case TPACKET_V3:
336         default:
337                 WARN(1, "TPACKET version not supported.\n");
338                 BUG();
339                 return 0;
340         }
341 }
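
__packet_set_status() and __packet_get_status() are the kernel half of the mmap
ring handshake: the kernel hands a frame to user space by setting TP_STATUS_USER,
and user space returns it by writing TP_STATUS_KERNEL. A sketch of the matching
user-space loop for a TPACKET_V2 RX ring (illustrative only; assumes the ring was
configured with PACKET_RX_RING and mmap'd at 'ring', with frame_nr and frame_size
taken from the tpacket_req used there; a real consumer also needs the appropriate
memory barriers around the status accesses):

#include <poll.h>
#include <linux/if_packet.h>

/* Drain whatever frames the kernel has handed to user space. */
static void drain_ring(int fd, char *ring,
                       unsigned int frame_nr, unsigned int frame_size)
{
        static unsigned int idx;
        struct pollfd pfd = { .fd = fd, .events = POLLIN };

        poll(&pfd, 1, -1);

        for (;;) {
                struct tpacket2_hdr *hdr =
                        (struct tpacket2_hdr *)(ring + idx * frame_size);

                if (!(hdr->tp_status & TP_STATUS_USER))
                        break;          /* frame still owned by the kernel */

                /* packet data is at (char *)hdr + hdr->tp_mac, hdr->tp_snaplen bytes */

                hdr->tp_status = TP_STATUS_KERNEL;      /* hand the frame back */
                idx = (idx + 1) % frame_nr;
        }
}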
342 
343 static void *packet_lookup_frame(struct packet_sock *po,
344                 struct packet_ring_buffer *rb,
345                 unsigned int position,
346                 int status)
347 {
348         unsigned int pg_vec_pos, frame_offset;
349         union {
350                 struct tpacket_hdr *h1;
351                 struct tpacket2_hdr *h2;
352                 void *raw;
353         } h;
354 
355         pg_vec_pos = position / rb->frames_per_block;
356         frame_offset = position % rb->frames_per_block;
357 
358         h.raw = rb->pg_vec[pg_vec_pos].buffer +
359                 (frame_offset * rb->frame_size);
360 
361         if (status != __packet_get_status(po, h.raw))
362                 return NULL;
363 
364         return h.raw;
365 }
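
packet_lookup_frame() translates a flat frame index into a (block, offset) pair:
with tp_block_size 8192 and tp_frame_size 2048 there are 4 frames per block, so
frame 10 lives in pg_vec[2] at byte offset 2 * 2048. The same arithmetic as a
standalone sketch (illustrative only):

/* Flat frame index -> address inside the pg_vec, as in packet_lookup_frame(). */
static void *frame_addr(char **pg_vec, unsigned int frames_per_block,
                        unsigned int frame_size, unsigned int position)
{
        unsigned int block  = position / frames_per_block;
        unsigned int offset = position % frames_per_block;

        return pg_vec[block] + offset * frame_size;
}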
366 
367 static void *packet_current_frame(struct packet_sock *po,
368                 struct packet_ring_buffer *rb,
369                 int status)
370 {
371         return packet_lookup_frame(po, rb, rb->head, status);
372 }
373 
374 static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
375 {
376         del_timer_sync(&pkc->retire_blk_timer);
377 }
378 
379 static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
380                 int tx_ring,
381                 struct sk_buff_head *rb_queue)
382 {
383         struct tpacket_kbdq_core *pkc;
384 
385         pkc = tx_ring ? &po->tx_ring.prb_bdqc : &po->rx_ring.prb_bdqc;
386 
387         spin_lock(&rb_queue->lock);
388         pkc->delete_blk_timer = 1;
389         spin_unlock(&rb_queue->lock);
390 
391         prb_del_retire_blk_timer(pkc);
392 }
393 
394 static void prb_init_blk_timer(struct packet_sock *po,
395                 struct tpacket_kbdq_core *pkc,
396                 void (*func) (unsigned long))
397 {
398         init_timer(&pkc->retire_blk_timer);
399         pkc->retire_blk_timer.data = (long)po;
400         pkc->retire_blk_timer.function = func;
401         pkc->retire_blk_timer.expires = jiffies;
402 }
403 
404 static void prb_setup_retire_blk_timer(struct packet_sock *po, int tx_ring)
405 {
406         struct tpacket_kbdq_core *pkc;
407 
408         if (tx_ring)
409                 BUG();
410 
411         pkc = tx_ring ? &po->tx_ring.prb_bdqc : &po->rx_ring.prb_bdqc;
412         prb_init_blk_timer(po, pkc, prb_retire_rx_blk_timer_expired);
413 }
414 
415 static int prb_calc_retire_blk_tmo(struct packet_sock *po,
416                                 int blk_size_in_bytes)
417 {
418         struct net_device *dev;
419         unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
420         struct ethtool_cmd ecmd;
421         int err;
422         u32 speed;
423 
424         rtnl_lock();
425         dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
426         if (unlikely(!dev)) {
427                 rtnl_unlock();
428                 return DEFAULT_PRB_RETIRE_TOV;
429         }
430         err = __ethtool_get_settings(dev, &ecmd);
431         speed = ethtool_cmd_speed(&ecmd);
432         rtnl_unlock();
433         if (!err) {
434                 /*
435                  * If the link speed is that slow, you don't really
436                  * need to worry about perf anyway.
437                  */
438                 if (speed < SPEED_1000 || speed == SPEED_UNKNOWN) {
439                         return DEFAULT_PRB_RETIRE_TOV;
440                 } else {
441                         msec = 1;
442                         div = speed / 1000;
443                 }
444         }
445 
446         mbits = (blk_size_in_bytes * 8) / (1024 * 1024);
447 
448         if (div)
449                 mbits /= div;
450 
451         tmo = mbits * msec;
452 
453         if (div)
454                 return tmo+1;
455         return tmo;
456 }
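
The math above is easier to follow with concrete numbers: for a 1 MiB block on a
1 Gbit/s link, msec = 1, div = 1000/1000 = 1 and
mbits = (1048576 * 8) / (1024 * 1024) = 8, so the function returns 8 + 1 = 9 ms,
roughly the time it takes to fill the block. The same arithmetic as a standalone
sketch (hypothetical helper; the link speed is passed in instead of being read
via ethtool):

/* Mirror of the arithmetic in prb_calc_retire_blk_tmo() for links >= 1 Gbit/s. */
static unsigned int retire_tmo_msec(unsigned int blk_size_in_bytes,
                                    unsigned int speed_mbps)
{
        unsigned int mbits, div, tmo;

        if (speed_mbps < 1000)
                return 0;       /* caller would use DEFAULT_PRB_RETIRE_TOV */

        div = speed_mbps / 1000;
        mbits = (blk_size_in_bytes * 8) / (1024 * 1024);
        if (div)
                mbits /= div;
        tmo = mbits * 1;        /* msec = 1 */

        return tmo + 1;         /* e.g. 1 MiB at 1 Gbit/s -> 9 ms */
}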
457 
458 static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
459                         union tpacket_req_u *req_u)
460 {
461         p1->feature_req_word = req_u->req3.tp_feature_req_word;
462 }
463 
464 static void init_prb_bdqc(struct packet_sock *po,
465                         struct packet_ring_buffer *rb,
466                         struct pgv *pg_vec,
467                         union tpacket_req_u *req_u, int tx_ring)
468 {
469         struct tpacket_kbdq_core *p1 = &rb->prb_bdqc;
470         struct tpacket_block_desc *pbd;
471 
472         memset(p1, 0x0, sizeof(*p1));
473 
474         p1->knxt_seq_num = 1;
475         p1->pkbdq = pg_vec;
476         pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
477         p1->pkblk_start = pg_vec[0].buffer;
478         p1->kblk_size = req_u->req3.tp_block_size;
479         p1->knum_blocks = req_u->req3.tp_block_nr;
480         p1->hdrlen = po->tp_hdrlen;
481         p1->version = po->tp_version;
482         p1->last_kactive_blk_num = 0;
483         po->stats_u.stats3.tp_freeze_q_cnt = 0;
484         if (req_u->req3.tp_retire_blk_tov)
485                 p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
486         else
487                 p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
488                                                 req_u->req3.tp_block_size);
489         p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
490         p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
491 
492         prb_init_ft_ops(p1, req_u);
493         prb_setup_retire_blk_timer(po, tx_ring);
494         prb_open_block(p1, pbd);
495 }
496 
497 /*  Do NOT update the last_blk_num first.
498  *  Assumes sk_buff_head lock is held.
499  */
500 static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
501 {
502         mod_timer(&pkc->retire_blk_timer,
503                         jiffies + pkc->tov_in_jiffies);
504         pkc->last_kactive_blk_num = pkc->kactive_blk_num;
505 }
506 
507 /*
508  * Timer logic:
509  * 1) We refresh the timer only when we open a block.
510  *    By doing this we don't waste cycles refreshing the timer
511  *    on a packet-by-packet basis.
512  *
513  * With a 1MB block-size, on a 1Gbps line, it will take
514  * i) ~8 ms to fill a block + ii) memcpy etc.
515  * In this cut we are not accounting for the memcpy time.
516  *
517  * So, if the user sets the 'tmo' to 10ms then the timer
518  * will never fire while the block is still getting filled
519  * (which is what we want). However, the user could choose
520  * to close a block early and that's fine.
521  *
522  * But when the timer does fire, we check whether or not to refresh it.
523  * Since the tmo granularity is in msecs, it is not too expensive
524  * to refresh the timer, let's say every '8' msecs.
525  * Either the user can set the 'tmo' or we can derive it based on
526  * a) line-speed and b) block-size.
527  * prb_calc_retire_blk_tmo() calculates the tmo.
528  *
529  */
530 static void prb_retire_rx_blk_timer_expired(unsigned long data)
531 {
532         struct packet_sock *po = (struct packet_sock *)data;
533         struct tpacket_kbdq_core *pkc = &po->rx_ring.prb_bdqc;
534         unsigned int frozen;
535         struct tpacket_block_desc *pbd;
536 
537         spin_lock(&po->sk.sk_receive_queue.lock);
538 
539         frozen = prb_queue_frozen(pkc);
540         pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
541 
542         if (unlikely(pkc->delete_blk_timer))
543                 goto out;
544 
545         /* We only need to plug the race when the block is partially filled.
546          * tpacket_rcv:
547          *              lock(); increment BLOCK_NUM_PKTS; unlock()
548          *              copy_bits() is in progress ...
549          *              the timer fires on another cpu:
550          *              we can't retire the current block because copy_bits
551          *              is in progress.
552          *
553          */
554         if (BLOCK_NUM_PKTS(pbd)) {
555                 while (atomic_read(&pkc->blk_fill_in_prog)) {
556                         /* Waiting for skb_copy_bits to finish... */
557                         cpu_relax();
558                 }
559         }
560 
561         if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
562                 if (!frozen) {
563                         prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
564                         if (!prb_dispatch_next_block(pkc, po))
565                                 goto refresh_timer;
566                         else
567                                 goto out;
568                 } else {
569                         /* Case 1. Queue was frozen because user-space was
570                          *         lagging behind.
571                          */
572                         if (prb_curr_blk_in_use(pkc, pbd)) {
573                                 /*
574                                  * Ok, user-space is still behind.
575                                  * So just refresh the timer.
576                                  */
577                                 goto refresh_timer;
578                         } else {
579                                /* Case 2. The queue was frozen, user-space caught up,
580                                 * now the link went idle and the timer fired.
581                                 * We don't have a block to close, so we open this
582                                 * block and restart the timer.
583                                 * Opening a block thaws the queue and restarts the timer;
584                                 * thawing/timer-refresh is a side effect.
585                                 */
586                                 prb_open_block(pkc, pbd);
587                                 goto out;
588                         }
589                 }
590         }
591 
592 refresh_timer:
593         _prb_refresh_rx_retire_blk_timer(pkc);
594 
595 out:
596         spin_unlock(&po->sk.sk_receive_queue.lock);
597 }
598 
599 static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
600                 struct tpacket_block_desc *pbd1, __u32 status)
601 {
602         /* Flush everything minus the block header */
603 
604 #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
605         u8 *start, *end;
606 
607         start = (u8 *)pbd1;
608 
609         /* Skip the block header (we know the header WILL fit in 4K) */
610         start += PAGE_SIZE;
611 
612         end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
613         for (; start < end; start += PAGE_SIZE)
614                 flush_dcache_page(pgv_to_page(start));
615 
616         smp_wmb();
617 #endif
618 
619         /* Now update the block status. */
620 
621         BLOCK_STATUS(pbd1) = status;
622 
623         /* Flush the block header */
624 
625 #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
626         start = (u8 *)pbd1;
627         flush_dcache_page(pgv_to_page(start));
628 
629         smp_wmb();
630 #endif
631 }
632 
633 /*
634  * Side effect:
635  *
636  * 1) flush the block
637  * 2) Increment active_blk_num
638  *
639  * Note: We DON'T refresh the timer on purpose,
640  *      because almost always the next block will be opened.
641  */
642 static void prb_close_block(struct tpacket_kbdq_core *pkc1,
643                 struct tpacket_block_desc *pbd1,
644                 struct packet_sock *po, unsigned int stat)
645 {
646         __u32 status = TP_STATUS_USER | stat;
647 
648         struct tpacket3_hdr *last_pkt;
649         struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
650 
651         if (po->stats.tp_drops)
652                 status |= TP_STATUS_LOSING;
653 
654         last_pkt = (struct tpacket3_hdr *)pkc1->prev;
655         last_pkt->tp_next_offset = 0;
656 
657         /* Get the ts of the last pkt */
658         if (BLOCK_NUM_PKTS(pbd1)) {
659                 h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
660                 h1->ts_last_pkt.ts_nsec = last_pkt->tp_nsec;
661         } else {
662                 /* Ok, we tmo'd - so get the current time */
663                 struct timespec ts;
664                 getnstimeofday(&ts);
665                 h1->ts_last_pkt.ts_sec = ts.tv_sec;
666                 h1->ts_last_pkt.ts_nsec = ts.tv_nsec;
667         }
668 
669         smp_wmb();
670 
671         /* Flush the block */
672         prb_flush_block(pkc1, pbd1, status);
673 
674         pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
675 }
676 
677 static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
678 {
679         pkc->reset_pending_on_curr_blk = 0;
680 }
681 
682 /*
683  * Side effect of opening a block:
684  *
685  * 1) prb_queue is thawed.
686  * 2) retire_blk_timer is refreshed.
687  *
688  */
689 static void prb_open_block(struct tpacket_kbdq_core *pkc1,
690         struct tpacket_block_desc *pbd1)
691 {
692         struct timespec ts;
693         struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
694 
695         smp_rmb();
696 
697         /* We could have just memset this, but we would lose the
698          * flexibility of making the priv area sticky.
699          */
700 
701         BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
702         BLOCK_NUM_PKTS(pbd1) = 0;
703         BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
704 
705         getnstimeofday(&ts);
706 
707         h1->ts_first_pkt.ts_sec = ts.tv_sec;
708         h1->ts_first_pkt.ts_nsec = ts.tv_nsec;
709 
710         pkc1->pkblk_start = (char *)pbd1;
711         pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
712 
713         BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
714         BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;
715 
716         pbd1->version = pkc1->version;
717         pkc1->prev = pkc1->nxt_offset;
718         pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;
719 
720         prb_thaw_queue(pkc1);
721         _prb_refresh_rx_retire_blk_timer(pkc1);
722 
723         smp_wmb();
724 }
725 
726 /*
727  * Queue freeze logic:
728  * 1) Assume tp_block_nr = 8 blocks.
729  * 2) At time 't0', user opens Rx ring.
730  * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
731  * 4) user-space is either sleeping or processing block '0'.
732  * 5) tpacket_rcv is currently filling block '7'; since there is no space left,
733  *    it will close block-7, loop around and try to fill block '0'.
734  *    call-flow:
735  *    __packet_lookup_frame_in_block
736  *      prb_retire_current_block()
737  *      prb_dispatch_next_block()
738  *        |->(BLOCK_STATUS == USER) evaluates to true
739  *    5.1) Since block-0 is currently in-use, we just freeze the queue.
740  * 6) Now there are two cases:
741  *    6.1) Link goes idle right after the queue is frozen.
742  *         But remember, the last open_block() refreshed the timer.
743  *         When this timer expires, it will refresh itself so that we can
744  *         re-open block-0 in the near future.
745  *    6.2) Link is busy and keeps on receiving packets. This is a simple
746  *         case and __packet_lookup_frame_in_block will check if block-0
747  *         is free and can now be re-used.
748  */
749 static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
750                                   struct packet_sock *po)
751 {
752         pkc->reset_pending_on_curr_blk = 1;
753         po->stats_u.stats3.tp_freeze_q_cnt++;
754 }
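
What un-freezes the queue in practice is user space releasing the block it was
processing: once the consumer writes TP_STATUS_KERNEL into the block descriptor,
__packet_lookup_frame_in_block() below finds the block free, re-opens it and
thaws the queue. A sketch of the TPACKET_V3 consumer side of that handshake
(illustrative only; 'pbd' points into a PACKET_RX_RING mmap'd by user space):

#include <linux/if_packet.h>

/* Walk one TPACKET_V3 block and hand it back to the kernel. */
static void consume_block(struct tpacket_block_desc *pbd)
{
        struct tpacket3_hdr *ppd;
        unsigned int i;

        if (!(pbd->hdr.bh1.block_status & TP_STATUS_USER))
                return;                         /* still owned by the kernel */

        ppd = (struct tpacket3_hdr *)((char *)pbd +
                                      pbd->hdr.bh1.offset_to_first_pkt);

        for (i = 0; i < pbd->hdr.bh1.num_pkts; i++) {
                /* frame data is at (char *)ppd + ppd->tp_mac, ppd->tp_snaplen bytes */
                ppd = (struct tpacket3_hdr *)((char *)ppd + ppd->tp_next_offset);
        }

        /* Releasing the block is what lets a frozen queue thaw. */
        pbd->hdr.bh1.block_status = TP_STATUS_KERNEL;
}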
755 
756 #define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))
757 
758 /*
759  * If the next block is free then we will dispatch it
760  * and return a good offset.
761  * Otherwise, we will freeze the queue.
762  * So, the caller must check the return value.
763  */
764 static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
765                 struct packet_sock *po)
766 {
767         struct tpacket_block_desc *pbd;
768 
769         smp_rmb();
770 
771         /* 1. Get current block num */
772         pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
773 
774         /* 2. If this block is currently in_use then freeze the queue */
775         if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
776                 prb_freeze_queue(pkc, po);
777                 return NULL;
778         }
779 
780         /*
781          * 3.
782          * open this block and return the offset where the first packet
783          * needs to get stored.
784          */
785         prb_open_block(pkc, pbd);
786         return (void *)pkc->nxt_offset;
787 }
788 
789 static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
790                 struct packet_sock *po, unsigned int status)
791 {
792         struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
793 
794         /* retire/close the current block */
795         if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
796                 /*
797                  * Plug the case where copy_bits() is in progress on
798                  * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
799                  * have space to copy the pkt in the current block and
800                  * called prb_retire_current_block()
801                  *
802                  * We don't need to worry about the TMO case because
803                  * the timer-handler already handled this case.
804                  */
805                 if (!(status & TP_STATUS_BLK_TMO)) {
806                         while (atomic_read(&pkc->blk_fill_in_prog)) {
807                                 /* Waiting for skb_copy_bits to finish... */
808                                 cpu_relax();
809                         }
810                 }
811                 prb_close_block(pkc, pbd, po, status);
812                 return;
813         }
814 }
815 
816 static int prb_curr_blk_in_use(struct tpacket_kbdq_core *pkc,
817                                       struct tpacket_block_desc *pbd)
818 {
819         return TP_STATUS_USER & BLOCK_STATUS(pbd);
820 }
821 
822 static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
823 {
824         return pkc->reset_pending_on_curr_blk;
825 }
826 
827 static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
828 {
829         struct tpacket_kbdq_core *pkc  = GET_PBDQC_FROM_RB(rb);
830         atomic_dec(&pkc->blk_fill_in_prog);
831 }
832 
833 static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
834                         struct tpacket3_hdr *ppd)
835 {
836         ppd->hv1.tp_rxhash = skb_get_rxhash(pkc->skb);
837 }
838 
839 static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
840                         struct tpacket3_hdr *ppd)
841 {
842         ppd->hv1.tp_rxhash = 0;
843 }
844 
845 static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
846                         struct tpacket3_hdr *ppd)
847 {
848         if (vlan_tx_tag_present(pkc->skb)) {
849                 ppd->hv1.tp_vlan_tci = vlan_tx_tag_get(pkc->skb);
850                 ppd->tp_status = TP_STATUS_VLAN_VALID;
851         } else {
852                 ppd->hv1.tp_vlan_tci = 0;
853                 ppd->tp_status = TP_STATUS_AVAILABLE;
854         }
855 }
856 
857 static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
858                         struct tpacket3_hdr *ppd)
859 {
860         prb_fill_vlan_info(pkc, ppd);
861 
862         if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
863                 prb_fill_rxhash(pkc, ppd);
864         else
865                 prb_clear_rxhash(pkc, ppd);
866 }
867 
868 static void prb_fill_curr_block(char *curr,
869                                 struct tpacket_kbdq_core *pkc,
870                                 struct tpacket_block_desc *pbd,
871                                 unsigned int len)
872 {
873         struct tpacket3_hdr *ppd;
874 
875         ppd  = (struct tpacket3_hdr *)curr;
876         ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
877         pkc->prev = curr;
878         pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
879         BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
880         BLOCK_NUM_PKTS(pbd) += 1;
881         atomic_inc(&pkc->blk_fill_in_prog);
882         prb_run_all_ft_ops(pkc, ppd);
883 }
884 
885 /* Assumes caller has the sk->rx_queue.lock */
886 static void *__packet_lookup_frame_in_block(struct packet_sock *po,
887                                             struct sk_buff *skb,
888                                                 int status,
889                                             unsigned int len
890                                             )
891 {
892         struct tpacket_kbdq_core *pkc;
893         struct tpacket_block_desc *pbd;
894         char *curr, *end;
895 
896         pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
897         pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
898 
899         /* Queue is frozen when user space is lagging behind */
900         if (prb_queue_frozen(pkc)) {
901                 /*
902                  * Check if the last block, which caused the queue to freeze,
903                  * is still in_use by user-space.
904                  */
905                 if (prb_curr_blk_in_use(pkc, pbd)) {
906                         /* Can't record this packet */
907                         return NULL;
908                 } else {
909                         /*
910                          * Ok, the block was released by user-space.
911                          * Now let's open that block.
912                          * Opening a block also thaws the queue.
913                          * Thawing is a side effect.
914                          */
915                         prb_open_block(pkc, pbd);
916                 }
917         }
918 
919         smp_mb();
920         curr = pkc->nxt_offset;
921         pkc->skb = skb;
922         end = (char *)pbd + pkc->kblk_size;
923 
924         /* first try the current block */
925         if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
926                 prb_fill_curr_block(curr, pkc, pbd, len);
927                 return (void *)curr;
928         }
929 
930         /* Ok, close the current block */
931         prb_retire_current_block(pkc, po, 0);
932 
933         /* Now, try to dispatch the next block */
934         curr = (char *)prb_dispatch_next_block(pkc, po);
935         if (curr) {
936                 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
937                 prb_fill_curr_block(curr, pkc, pbd, len);
938                 return (void *)curr;
939         }
940 
941         /*
942          * No free blocks are available. User-space hasn't caught up yet.
943          * The queue was just frozen and now this packet will get dropped.
944          */
945         return NULL;
946 }
947 
948 static void *packet_current_rx_frame(struct packet_sock *po,
949                                             struct sk_buff *skb,
950                                             int status, unsigned int len)
951 {
952         char *curr = NULL;
953         switch (po->tp_version) {
954         case TPACKET_V1:
955         case TPACKET_V2:
956                 curr = packet_lookup_frame(po, &po->rx_ring,
957                                         po->rx_ring.head, status);
958                 return curr;
959         case TPACKET_V3:
960                 return __packet_lookup_frame_in_block(po, skb, status, len);
961         default:
962                 WARN(1, "TPACKET version not supported\n");
963                 BUG();
964                 return NULL;
965         }
966 }
967 
968 static void *prb_lookup_block(struct packet_sock *po,
969                                      struct packet_ring_buffer *rb,
970                                      unsigned int previous,
971                                      int status)
972 {
973         struct tpacket_kbdq_core *pkc  = GET_PBDQC_FROM_RB(rb);
974         struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, previous);
975 
976         if (status != BLOCK_STATUS(pbd))
977                 return NULL;
978         return pbd;
979 }
980 
981 static int prb_previous_blk_num(struct packet_ring_buffer *rb)
982 {
983         unsigned int prev;
984         if (rb->prb_bdqc.kactive_blk_num)
985                 prev = rb->prb_bdqc.kactive_blk_num-1;
986         else
987                 prev = rb->prb_bdqc.knum_blocks-1;
988         return prev;
989 }
990 
991 /* Assumes caller has held the rx_queue.lock */
992 static void *__prb_previous_block(struct packet_sock *po,
993                                          struct packet_ring_buffer *rb,
994                                          int status)
995 {
996         unsigned int previous = prb_previous_blk_num(rb);
997         return prb_lookup_block(po, rb, previous, status);
998 }
999 
1000 static void *packet_previous_rx_frame(struct packet_sock *po,
1001                                              struct packet_ring_buffer *rb,
1002                                              int status)
1003 {
1004         if (po->tp_version <= TPACKET_V2)
1005                 return packet_previous_frame(po, rb, status);
1006 
1007         return __prb_previous_block(po, rb, status);
1008 }
1009 
1010 static void packet_increment_rx_head(struct packet_sock *po,
1011                                             struct packet_ring_buffer *rb)
1012 {
1013         switch (po->tp_version) {
1014         case TPACKET_V1:
1015         case TPACKET_V2:
1016                 return packet_increment_head(rb);
1017         case TPACKET_V3:
1018         default:
1019                 WARN(1, "TPACKET version not supported.\n");
1020                 BUG();
1021                 return;
1022         }
1023 }
1024 
1025 static void *packet_previous_frame(struct packet_sock *po,
1026                 struct packet_ring_buffer *rb,
1027                 int status)
1028 {
1029         unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
1030         return packet_lookup_frame(po, rb, previous, status);
1031 }
1032 
1033 static void packet_increment_head(struct packet_ring_buffer *buff)
1034 {
1035         buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
1036 }
1037 
1038 static void packet_sock_destruct(struct sock *sk)
1039 {
1040         skb_queue_purge(&sk->sk_error_queue);
1041 
1042         WARN_ON(atomic_read(&sk->sk_rmem_alloc));
1043         WARN_ON(atomic_read(&sk->sk_wmem_alloc));
1044 
1045         if (!sock_flag(sk, SOCK_DEAD)) {
1046                 pr_err("Attempt to release alive packet socket: %p\n", sk);
1047                 return;
1048         }
1049 
1050         sk_refcnt_debug_dec(sk);
1051 }
1052 
1053 static int fanout_rr_next(struct packet_fanout *f, unsigned int num)
1054 {
1055         int x = atomic_read(&f->rr_cur) + 1;
1056 
1057         if (x >= num)
1058                 x = 0;
1059 
1060         return x;
1061 }
1062 
1063 static struct sock *fanout_demux_hash(struct packet_fanout *f, struct sk_buff *skb, unsigned int num)
1064 {
1065         u32 idx, hash = skb->rxhash;
1066 
1067         idx = ((u64)hash * num) >> 32;
1068 
1069         return f->arr[idx];
1070 }
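
The ((u64)hash * num) >> 32 above is the usual trick for mapping a 32-bit hash
uniformly onto [0, num) without a division: the hash is treated as a fraction of
2^32 and scaled by the number of sockets. As a standalone sketch (illustrative
only):

#include <stdint.h>

/* Map a 32-bit hash onto [0, num) by scaling instead of taking a modulus. */
static uint32_t scale_hash(uint32_t hash, uint32_t num)
{
        return (uint32_t)(((uint64_t)hash * num) >> 32);
}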
1071 
1072 static struct sock *fanout_demux_lb(struct packet_fanout *f, struct sk_buff *skb, unsigned int num)
1073 {
1074         int cur, old;
1075 
1076         cur = atomic_read(&f->rr_cur);
1077         while ((old = atomic_cmpxchg(&f->rr_cur, cur,
1078                                      fanout_rr_next(f, num))) != cur)
1079                 cur = old;
1080         return f->arr[cur];
1081 }
1082 
1083 static struct sock *fanout_demux_cpu(struct packet_fanout *f, struct sk_buff *skb, unsigned int num)
1084 {
1085         unsigned int cpu = smp_processor_id();
1086 
1087         return f->arr[cpu % num];
1088 }
1089 
1090 static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
1091                              struct packet_type *pt, struct net_device *orig_dev)
1092 {
1093         struct packet_fanout *f = pt->af_packet_priv;
1094         unsigned int num = f->num_members;
1095         struct packet_sock *po;
1096         struct sock *sk;
1097 
1098         if (!net_eq(dev_net(dev), read_pnet(&f->net)) ||
1099             !num) {
1100                 kfree_skb(skb);
1101                 return 0;
1102         }
1103 
1104         switch (f->type) {
1105         case PACKET_FANOUT_HASH:
1106         default:
1107                 if (f->defrag) {
1108                         skb = ip_check_defrag(skb, IP_DEFRAG_AF_PACKET);
1109                         if (!skb)
1110                                 return 0;
1111                 }
1112                 skb_get_rxhash(skb);
1113                 sk = fanout_demux_hash(f, skb, num);
1114                 break;
1115         case PACKET_FANOUT_LB:
1116                 sk = fanout_demux_lb(f, skb, num);
1117                 break;
1118         case PACKET_FANOUT_CPU:
1119                 sk = fanout_demux_cpu(f, skb, num);
1120                 break;
1121         }
1122 
1123         po = pkt_sk(sk);
1124 
1125         return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
1126 }
1127 
1128 DEFINE_MUTEX(fanout_mutex);
1129 EXPORT_SYMBOL_GPL(fanout_mutex);
1130 static LIST_HEAD(fanout_list);
1131 
1132 static void __fanout_link(struct sock *sk, struct packet_sock *po)
1133 {
1134         struct packet_fanout *f = po->fanout;
1135 
1136         spin_lock(&f->lock);
1137         f->arr[f->num_members] = sk;
1138         smp_wmb();
1139         f->num_members++;
1140         spin_unlock(&f->lock);
1141 }
1142 
1143 static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
1144 {
1145         struct packet_fanout *f = po->fanout;
1146         int i;
1147 
1148         spin_lock(&f->lock);
1149         for (i = 0; i < f->num_members; i++) {
1150                 if (f->arr[i] == sk)
1151                         break;
1152         }
1153         BUG_ON(i >= f->num_members);
1154         f->arr[i] = f->arr[f->num_members - 1];
1155         f->num_members--;
1156         spin_unlock(&f->lock);
1157 }
1158 
1159 static bool match_fanout_group(struct packet_type *ptype, struct sock * sk)
1160 {
1161         if (ptype->af_packet_priv == (void*)((struct packet_sock *)sk)->fanout)
1162                 return true;
1163 
1164         return false;
1165 }
1166 
1167 static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
1168 {
1169         struct packet_sock *po = pkt_sk(sk);
1170         struct packet_fanout *f, *match;
1171         u8 type = type_flags & 0xff;
1172         u8 defrag = (type_flags & PACKET_FANOUT_FLAG_DEFRAG) ? 1 : 0;
1173         int err;
1174 
1175         switch (type) {
1176         case PACKET_FANOUT_HASH:
1177         case PACKET_FANOUT_LB:
1178         case PACKET_FANOUT_CPU:
1179                 break;
1180         default:
1181                 return -EINVAL;
1182         }
1183 
1184         if (!po->running)
1185                 return -EINVAL;
1186 
1187         if (po->fanout)
1188                 return -EALREADY;
1189 
1190         mutex_lock(&fanout_mutex);
1191         match = NULL;
1192         list_for_each_entry(f, &fanout_list, list) {
1193                 if (f->id == id &&
1194                     read_pnet(&f->net) == sock_net(sk)) {
1195                         match = f;
1196                         break;
1197                 }
1198         }
1199         err = -EINVAL;
1200         if (match && match->defrag != defrag)
1201                 goto out;
1202         if (!match) {
1203                 err = -ENOMEM;
1204                 match = kzalloc(sizeof(*match), GFP_KERNEL);
1205                 if (!match)
1206                         goto out;
1207                 write_pnet(&match->net, sock_net(sk));
1208                 match->id = id;
1209                 match->type = type;
1210                 match->defrag = defrag;
1211                 atomic_set(&match->rr_cur, 0);
1212                 INIT_LIST_HEAD(&match->list);
1213                 spin_lock_init(&match->lock);
1214                 atomic_set(&match->sk_ref, 0);
1215                 match->prot_hook.type = po->prot_hook.type;
1216                 match->prot_hook.dev = po->prot_hook.dev;
1217                 match->prot_hook.func = packet_rcv_fanout;
1218                 match->prot_hook.af_packet_priv = match;
1219                 match->prot_hook.id_match = match_fanout_group;
1220                 dev_add_pack(&match->prot_hook);
1221                 list_add(&match->list, &fanout_list);
1222         }
1223         err = -EINVAL;
1224         if (match->type == type &&
1225             match->prot_hook.type == po->prot_hook.type &&
1226             match->prot_hook.dev == po->prot_hook.dev) {
1227                 err = -ENOSPC;
1228                 if (atomic_read(&match->sk_ref) < PACKET_FANOUT_MAX) {
1229                         __dev_remove_pack(&po->prot_hook);
1230                         po->fanout = match;
1231                         atomic_inc(&match->sk_ref);
1232                         __fanout_link(sk, po);
1233                         err = 0;
1234                 }
1235         }
1236 out:
1237         mutex_unlock(&fanout_mutex);
1238         return err;
1239 }
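
From user space a socket joins a fanout group with a single setsockopt(): the low
16 bits of the integer argument carry the group id, the high 16 bits the type and
flags (e.g. PACKET_FANOUT_FLAG_DEFRAG). A sketch (illustrative only; as
fanout_add() above requires, the socket must already be bound and running):

#include <sys/socket.h>
#include <linux/if_packet.h>

/* Join fanout group 'id', distributing packets by flow hash. */
static int join_fanout(int fd, unsigned short id)
{
        int arg = id | (PACKET_FANOUT_HASH << 16);

        return setsockopt(fd, SOL_PACKET, PACKET_FANOUT, &arg, sizeof(arg));
}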
1240 
1241 static void fanout_release(struct sock *sk)
1242 {
1243         struct packet_sock *po = pkt_sk(sk);
1244         struct packet_fanout *f;
1245 
1246         f = po->fanout;
1247         if (!f)
1248                 return;
1249 
1250         mutex_lock(&fanout_mutex);
1251         po->fanout = NULL;
1252 
1253         if (atomic_dec_and_test(&f->sk_ref)) {
1254                 list_del(&f->list);
1255                 dev_remove_pack(&f->prot_hook);
1256                 kfree(f);
1257         }
1258         mutex_unlock(&fanout_mutex);
1259 }
1260 
1261 static const struct proto_ops packet_ops;
1262 
1263 static const struct proto_ops packet_ops_spkt;
1264 
1265 static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
1266                            struct packet_type *pt, struct net_device *orig_dev)
1267 {
1268         struct sock *sk;
1269         struct sockaddr_pkt *spkt;
1270 
1271         /*
1272          *      When we registered the protocol we saved the socket in the data
1273          *      field for just this event.
1274          */
1275 
1276         sk = pt->af_packet_priv;
1277 
1278         /*
1279          *      Yank back the headers [hope the device set this
1280          *      right or kerboom...]
1281          *
1282          *      Incoming packets have the ll header pulled;
1283          *      push it back.
1284          *
1285          *      For outgoing ones skb->data == skb_mac_header(skb),
1286          *      so this procedure is a no-op.
1287          */
1288 
1289         if (skb->pkt_type == PACKET_LOOPBACK)
1290                 goto out;
1291 
1292         if (!net_eq(dev_net(dev), sock_net(sk)))
1293                 goto out;
1294 
1295         skb = skb_share_check(skb, GFP_ATOMIC);
1296         if (skb == NULL)
1297                 goto oom;
1298 
1299         /* drop any routing info */
1300         skb_dst_drop(skb);
1301 
1302         /* drop conntrack reference */
1303         nf_reset(skb);
1304 
1305         spkt = &PACKET_SKB_CB(skb)->sa.pkt;
1306 
1307         skb_push(skb, skb->data - skb_mac_header(skb));
1308 
1309         /*
1310          *      The SOCK_PACKET socket receives _all_ frames.
1311          */
1312 
1313         spkt->spkt_family = dev->type;
1314         strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
1315         spkt->spkt_protocol = skb->protocol;
1316 
1317         /*
1318          *      Charge the memory to the socket. This is done specifically
1319          *      to prevent sockets from using up all the memory.
1320          */
1321 
1322         if (sock_queue_rcv_skb(sk, skb) == 0)
1323                 return 0;
1324 
1325 out:
1326         kfree_skb(skb);
1327 oom:
1328         return 0;
1329 }
1330 
1331 
1332 /*
1333  *      Output a raw packet to a device layer. This bypasses all the other
1334  *      protocol layers and you must therefore supply it with a complete frame.
1335  */
1336 
1337 static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
1338                                struct msghdr *msg, size_t len)
1339 {
1340         struct sock *sk = sock->sk;
1341         struct sockaddr_pkt *saddr = (struct sockaddr_pkt *)msg->msg_name;
1342         struct sk_buff *skb = NULL;
1343         struct net_device *dev;
1344         __be16 proto = 0;
1345         int err;
1346         int extra_len = 0;
1347         struct flow_keys keys;
1348 
1349         /*
1350          *      Get and verify the address.
1351          */
1352 
1353         if (saddr) {
1354                 if (msg->msg_namelen < sizeof(struct sockaddr))
1355                         return -EINVAL;
1356                 if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
1357                         proto = saddr->spkt_protocol;
1358         } else
1359                 return -ENOTCONN;       /* SOCK_PACKET must be sent giving an address */
1360 
1361         /*
1362          *      Find the device first to size check it
1363          */
1364 
1365         saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;
1366 retry:
1367         rcu_read_lock();
1368         dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
1369         err = -ENODEV;
1370         if (dev == NULL)
1371                 goto out_unlock;
1372 
1373         err = -ENETDOWN;
1374         if (!(dev->flags & IFF_UP))
1375                 goto out_unlock;
1376 
1377         /*
1378          * You may not queue a frame bigger than the mtu. This is the lowest level
1379          * raw protocol and you must do your own fragmentation at this level.
1380          */
1381 
1382         if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
1383                 if (!netif_supports_nofcs(dev)) {
1384                         err = -EPROTONOSUPPORT;
1385                         goto out_unlock;
1386                 }
1387                 extra_len = 4; /* We're doing our own CRC */
1388         }
1389 
1390         err = -EMSGSIZE;
1391         if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len)
1392                 goto out_unlock;
1393 
1394         if (!skb) {
1395                 size_t reserved = LL_RESERVED_SPACE(dev);
1396                 int tlen = dev->needed_tailroom;
1397                 unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;
1398 
1399                 rcu_read_unlock();
1400                 skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL);
1401                 if (skb == NULL)
1402                         return -ENOBUFS;
1403                 /* FIXME: Save some space for broken drivers that write a hard
1404                  * header at transmission time by themselves. PPP is the notable
1405                  * one here. This should really be fixed at the driver level.
1406                  */
1407                 skb_reserve(skb, reserved);
1408                 skb_reset_network_header(skb);
1409 
1410                 /* Try to align data part correctly */
1411                 if (hhlen) {
1412                         skb->data -= hhlen;
1413                         skb->tail -= hhlen;
1414                         if (len < hhlen)
1415                                 skb_reset_network_header(skb);
1416                 }
1417                 err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
1418                 if (err)
1419                         goto out_free;
1420                 goto retry;
1421         }
1422 
1423         if (len > (dev->mtu + dev->hard_header_len + extra_len)) {
1424                 /* Earlier code assumed this would be a VLAN pkt,
1425                  * double-check this now that we have the actual
1426                  * packet in hand.
1427                  */
1428                 struct ethhdr *ehdr;
1429                 skb_reset_mac_header(skb);
1430                 ehdr = eth_hdr(skb);
1431                 if (ehdr->h_proto != htons(ETH_P_8021Q)) {
1432                         err = -EMSGSIZE;
1433                         goto out_unlock;
1434                 }
1435         }
1436 
1437         skb->protocol = proto;
1438         skb->dev = dev;
1439         skb->priority = sk->sk_priority;
1440         skb->mark = sk->sk_mark;
1441         err = sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
1442         if (err < 0)
1443                 goto out_unlock;
1444 
1445         if (unlikely(extra_len == 4))
1446                 skb->no_fcs = 1;
1447 
1448         if (skb_flow_dissect(skb, &keys))
1449                 skb_set_transport_header(skb, keys.thoff);
1450         else
1451                 skb_reset_transport_header(skb);
1452 
1453         dev_queue_xmit(skb);
1454         rcu_read_unlock();
1455         return len;
1456 
1457 out_unlock:
1458         rcu_read_unlock();
1459 out_free:
1460         kfree_skb(skb);
1461         return err;
1462 }
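
packet_sendmsg_spkt() above serves the obsolete SOCK_PACKET interface, where the
destination device is named in a sockaddr_pkt. The same "complete frame" rule
applies to the modern AF_PACKET/SOCK_RAW transmit path; a sketch of the latter
(illustrative only; 'frame' must already contain the full link-layer header, and
ifindex/dest_mac are caller-supplied placeholders):

#include <string.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>
#include <arpa/inet.h>

/* Transmit one complete Ethernet frame out of the interface 'ifindex'. */
static ssize_t send_frame(int fd, int ifindex,
                          const unsigned char *dest_mac,
                          const void *frame, size_t len)
{
        struct sockaddr_ll sll;

        memset(&sll, 0, sizeof(sll));
        sll.sll_family   = AF_PACKET;
        sll.sll_protocol = htons(ETH_P_ALL);
        sll.sll_ifindex  = ifindex;
        sll.sll_halen    = ETH_ALEN;
        memcpy(sll.sll_addr, dest_mac, ETH_ALEN);

        return sendto(fd, frame, len, 0,
                      (struct sockaddr *)&sll, sizeof(sll));
}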
1463 
1464 static unsigned int run_filter(const struct sk_buff *skb,
1465                                       const struct sock *sk,
1466                                       unsigned int res)
1467 {
1468         struct sk_filter *filter;
1469 
1470         rcu_read_lock();
1471         filter = rcu_dereference(sk->sk_filter);
1472         if (filter != NULL)
1473                 res = SK_RUN_FILTER(filter, skb);
1474         rcu_read_unlock();
1475 
1476         return res;
1477 }
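
run_filter() above executes whatever classic BPF program the socket's owner
attached with SO_ATTACH_FILTER: a return value of 0 drops the packet, a non-zero
value caps the snapshot length. A sketch of attaching a trivial "accept
everything, snap to 96 bytes" filter from user space (illustrative only):

#include <sys/socket.h>
#include <linux/filter.h>

/* Classic BPF: unconditionally return 96, i.e. accept and truncate to 96 bytes. */
static int attach_trivial_filter(int fd)
{
        struct sock_filter code[] = {
                { BPF_RET | BPF_K, 0, 0, 96 },
        };
        struct sock_fprog prog = {
                .len = sizeof(code) / sizeof(code[0]),
                .filter = code,
        };

        return setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER,
                          &prog, sizeof(prog));
}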
1478 
1479 /*
1480  * This function does lazy skb cloning in the hope that most packets
1481  * are discarded by BPF.
1482  *
1483  * Note the tricky part: we DO mangle a shared skb! skb->data, skb->len
1484  * and skb->cb are mangled. It works because (and until) packets
1485  * falling here are owned by the current CPU. Output packets are cloned
1486  * by dev_queue_xmit_nit(), input packets are processed by net_bh
1487  * sequentially, so that if we return the skb to its original state on exit,
1488  * we will not harm anyone.
1489  */
1490 
1491 static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
1492                       struct packet_type *pt, struct net_device *orig_dev)
1493 {
1494         struct sock *sk;
1495         struct sockaddr_ll *sll;
1496         struct packet_sock *po;
1497         u8 *skb_head = skb->data;
1498         int skb_len = skb->len;
1499         unsigned int snaplen, res;
1500 
1501         if (skb->pkt_type == PACKET_LOOPBACK)
1502                 goto drop;
1503 
1504         sk = pt->af_packet_priv;
1505         po = pkt_sk(sk);
1506 
1507         if (!net_eq(dev_net(dev), sock_net(sk)))
1508                 goto drop;
1509 
1510         skb->dev = dev;
1511 
1512         if (dev->header_ops) {
1513                 /* The device has an explicit notion of ll header,
1514                  * exported to higher levels.
1515                  *
1516                  * Otherwise, the device hides details of its frame
1517                  * structure, so that the corresponding packet head is
1518                  * never delivered to the user.
1519                  */
1520                 if (sk->sk_type != SOCK_DGRAM)
1521                         skb_push(skb, skb->data - skb_mac_header(skb));
1522                 else if (skb->pkt_type == PACKET_OUTGOING) {
1523                         /* Special case: outgoing packets have ll header at head */
1524                         skb_pull(skb, skb_network_offset(skb));
1525                 }
1526         }
1527 
1528         snaplen = skb->len;
1529 
1530         res = run_filter(skb, sk, snaplen);
1531         if (!res)
1532                 goto drop_n_restore;
1533         if (snaplen > res)
1534                 snaplen = res;
1535 
1536         if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
1537                 goto drop_n_acct;
1538 
1539         if (skb_shared(skb)) {
1540                 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
1541                 if (nskb == NULL)
1542                         goto drop_n_acct;
1543 
1544                 if (skb_head != skb->data) {
1545                         skb->data = skb_head;
1546                         skb->len = skb_len;
1547                 }
1548                 consume_skb(skb);
1549                 skb = nskb;
1550         }
1551 
1552         BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
1553                      sizeof(skb->cb));
1554 
1555         sll = &PACKET_SKB_CB(skb)->sa.ll;
1556         sll->sll_family = AF_PACKET;
1557         sll->sll_hatype = dev->type;
1558         sll->sll_protocol = skb->protocol;
1559         sll->sll_pkttype = skb->pkt_type;
1560         if (unlikely(po->origdev))
1561                 sll->sll_ifindex = orig_dev->ifindex;
1562         else
1563                 sll->sll_ifindex = dev->ifindex;
1564 
1565         sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1566 
1567         PACKET_SKB_CB(skb)->origlen = skb->len;
1568 
1569         if (pskb_trim(skb, snaplen))
1570                 goto drop_n_acct;
1571 
1572         skb_set_owner_r(skb, sk);
1573         skb->dev = NULL;
1574         skb_dst_drop(skb);
1575 
1576         /* drop conntrack reference */
1577         nf_reset(skb);
1578 
1579         spin_lock(&sk->sk_receive_queue.lock);
1580         po->stats.tp_packets++;
1581         skb->dropcount = atomic_read(&sk->sk_drops);
1582         __skb_queue_tail(&sk->sk_receive_queue, skb);
1583         spin_unlock(&sk->sk_receive_queue.lock);
1584         sk->sk_data_ready(sk, skb->len);
1585         return 0;
1586 
1587 drop_n_acct:
1588         spin_lock(&sk->sk_receive_queue.lock);
1589         po->stats.tp_drops++;
1590         atomic_inc(&sk->sk_drops);
1591         spin_unlock(&sk->sk_receive_queue.lock);
1592 
1593 drop_n_restore:
1594         if (skb_head != skb->data && skb_shared(skb)) {
1595                 skb->data = skb_head;
1596                 skb->len = skb_len;
1597         }
1598 drop:
1599         consume_skb(skb);
1600         return 0;
1601 }
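
/*
 * Illustrative userspace sketch (not part of af_packet.c): packet_rcv()
 * is the non-mmap receive path; every matching frame is queued on
 * sk_receive_queue together with the sockaddr_ll built above.  A user
 * can read it back with a plain recvfrom() (CAP_NET_RAW required,
 * error handling omitted):
 *
 *   #include <sys/socket.h>
 *   #include <arpa/inet.h>
 *   #include <linux/if_packet.h>
 *   #include <linux/if_ether.h>
 *
 *   int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *   unsigned char buf[2048];
 *   struct sockaddr_ll from;
 *   socklen_t fromlen = sizeof(from);
 *   ssize_t n = recvfrom(fd, buf, sizeof(buf), 0,
 *                        (struct sockaddr *)&from, &fromlen);
 *   // from.sll_ifindex, sll_pkttype and sll_protocol describe the frame,
 *   // exactly as filled in by packet_rcv() above.
 */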
1602 
1603 static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
1604                        struct packet_type *pt, struct net_device *orig_dev)
1605 {
1606         struct sock *sk;
1607         struct packet_sock *po;
1608         struct sockaddr_ll *sll;
1609         union {
1610                 struct tpacket_hdr *h1;
1611                 struct tpacket2_hdr *h2;
1612                 struct tpacket3_hdr *h3;
1613                 void *raw;
1614         } h;
1615         u8 *skb_head = skb->data;
1616         int skb_len = skb->len;
1617         unsigned int snaplen, res;
1618         unsigned long status = TP_STATUS_USER;
1619         unsigned short macoff, netoff, hdrlen;
1620         struct sk_buff *copy_skb = NULL;
1621         struct timeval tv;
1622         struct timespec ts;
1623         struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
1624 
1625         if (skb->pkt_type == PACKET_LOOPBACK)
1626                 goto drop;
1627 
1628         sk = pt->af_packet_priv;
1629         po = pkt_sk(sk);
1630 
1631         if (!net_eq(dev_net(dev), sock_net(sk)))
1632                 goto drop;
1633 
1634         if (dev->header_ops) {
1635                 if (sk->sk_type != SOCK_DGRAM)
1636                         skb_push(skb, skb->data - skb_mac_header(skb));
1637                 else if (skb->pkt_type == PACKET_OUTGOING) {
1638                         /* Special case: outgoing packets have ll header at head */
1639                         skb_pull(skb, skb_network_offset(skb));
1640                 }
1641         }
1642 
1643         if (skb->ip_summed == CHECKSUM_PARTIAL)
1644                 status |= TP_STATUS_CSUMNOTREADY;
1645 
1646         snaplen = skb->len;
1647 
1648         res = run_filter(skb, sk, snaplen);
1649         if (!res)
1650                 goto drop_n_restore;
1651         if (snaplen > res)
1652                 snaplen = res;
1653 
1654         if (sk->sk_type == SOCK_DGRAM) {
1655                 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
1656                                   po->tp_reserve;
1657         } else {
1658                 unsigned int maclen = skb_network_offset(skb);
1659                 netoff = TPACKET_ALIGN(po->tp_hdrlen +
1660                                        (maclen < 16 ? 16 : maclen)) +
1661                         po->tp_reserve;
1662                 macoff = netoff - maclen;
1663         }
1664         if (po->tp_version <= TPACKET_V2) {
1665                 if (macoff + snaplen > po->rx_ring.frame_size) {
1666                         if (po->copy_thresh &&
1667                             atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1668                                 if (skb_shared(skb)) {
1669                                         copy_skb = skb_clone(skb, GFP_ATOMIC);
1670                                 } else {
1671                                         copy_skb = skb_get(skb);
1672                                         skb_head = skb->data;
1673                                 }
1674                                 if (copy_skb)
1675                                         skb_set_owner_r(copy_skb, sk);
1676                         }
1677                         snaplen = po->rx_ring.frame_size - macoff;
1678                         if ((int)snaplen < 0)
1679                                 snaplen = 0;
1680                 }
1681         }
1682         spin_lock(&sk->sk_receive_queue.lock);
1683         h.raw = packet_current_rx_frame(po, skb,
1684                                         TP_STATUS_KERNEL, (macoff+snaplen));
1685         if (!h.raw)
1686                 goto ring_is_full;
1687         if (po->tp_version <= TPACKET_V2) {
1688                 packet_increment_rx_head(po, &po->rx_ring);
1689         /*
1690          * LOSING will be reported until you read the stats,
1691          * because it is COR - Clear On Read.
1692          * It is set here for V1/V2 only, as V3 does not need this
1693          * at the packet level.
1694          */
1695                 if (po->stats.tp_drops)
1696                         status |= TP_STATUS_LOSING;
1697         }
1698         po->stats.tp_packets++;
1699         if (copy_skb) {
1700                 status |= TP_STATUS_COPY;
1701                 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
1702         }
1703         spin_unlock(&sk->sk_receive_queue.lock);
1704 
1705         skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
1706 
1707         switch (po->tp_version) {
1708         case TPACKET_V1:
1709                 h.h1->tp_len = skb->len;
1710                 h.h1->tp_snaplen = snaplen;
1711                 h.h1->tp_mac = macoff;
1712                 h.h1->tp_net = netoff;
1713                 if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE)
1714                                 && shhwtstamps->syststamp.tv64)
1715                         tv = ktime_to_timeval(shhwtstamps->syststamp);
1716                 else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE)
1717                                 && shhwtstamps->hwtstamp.tv64)
1718                         tv = ktime_to_timeval(shhwtstamps->hwtstamp);
1719                 else if (skb->tstamp.tv64)
1720                         tv = ktime_to_timeval(skb->tstamp);
1721                 else
1722                         do_gettimeofday(&tv);
1723                 h.h1->tp_sec = tv.tv_sec;
1724                 h.h1->tp_usec = tv.tv_usec;
1725                 hdrlen = sizeof(*h.h1);
1726                 break;
1727         case TPACKET_V2:
1728                 h.h2->tp_len = skb->len;
1729                 h.h2->tp_snaplen = snaplen;
1730                 h.h2->tp_mac = macoff;
1731                 h.h2->tp_net = netoff;
1732                 if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE)
1733                                 && shhwtstamps->syststamp.tv64)
1734                         ts = ktime_to_timespec(shhwtstamps->syststamp);
1735                 else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE)
1736                                 && shhwtstamps->hwtstamp.tv64)
1737                         ts = ktime_to_timespec(shhwtstamps->hwtstamp);
1738                 else if (skb->tstamp.tv64)
1739                         ts = ktime_to_timespec(skb->tstamp);
1740                 else
1741                         getnstimeofday(&ts);
1742                 h.h2->tp_sec = ts.tv_sec;
1743                 h.h2->tp_nsec = ts.tv_nsec;
1744                 if (vlan_tx_tag_present(skb)) {
1745                         h.h2->tp_vlan_tci = vlan_tx_tag_get(skb);
1746                         status |= TP_STATUS_VLAN_VALID;
1747                 } else {
1748                         h.h2->tp_vlan_tci = 0;
1749                 }
1750                 h.h2->tp_padding = 0;
1751                 hdrlen = sizeof(*h.h2);
1752                 break;
1753         case TPACKET_V3:
1754                 /* tp_nxt_offset and vlan are already populated above,
1755                  * so do NOT clear those fields here.
1756                  */
1757                 h.h3->tp_status |= status;
1758                 h.h3->tp_len = skb->len;
1759                 h.h3->tp_snaplen = snaplen;
1760                 h.h3->tp_mac = macoff;
1761                 h.h3->tp_net = netoff;
1762                 if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE)
1763                                 && shhwtstamps->syststamp.tv64)
1764                         ts = ktime_to_timespec(shhwtstamps->syststamp);
1765                 else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE)
1766                                 && shhwtstamps->hwtstamp.tv64)
1767                         ts = ktime_to_timespec(shhwtstamps->hwtstamp);
1768                 else if (skb->tstamp.tv64)
1769                         ts = ktime_to_timespec(skb->tstamp);
1770                 else
1771                         getnstimeofday(&ts);
1772                 h.h3->tp_sec  = ts.tv_sec;
1773                 h.h3->tp_nsec = ts.tv_nsec;
1774                 hdrlen = sizeof(*h.h3);
1775                 break;
1776         default:
1777                 BUG();
1778         }
1779 
1780         sll = h.raw + TPACKET_ALIGN(hdrlen);
1781         sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1782         sll->sll_family = AF_PACKET;
1783         sll->sll_hatype = dev->type;
1784         sll->sll_protocol = skb->protocol;
1785         sll->sll_pkttype = skb->pkt_type;
1786         if (unlikely(po->origdev))
1787                 sll->sll_ifindex = orig_dev->ifindex;
1788         else
1789                 sll->sll_ifindex = dev->ifindex;
1790 
1791         smp_mb();
1792 #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
1793         {
1794                 u8 *start, *end;
1795 
1796                 if (po->tp_version <= TPACKET_V2) {
1797                         end = (u8 *)PAGE_ALIGN((unsigned long)h.raw
1798                                 + macoff + snaplen);
1799                         for (start = h.raw; start < end; start += PAGE_SIZE)
1800                                 flush_dcache_page(pgv_to_page(start));
1801                 }
1802                 smp_wmb();
1803         }
1804 #endif
1805         if (po->tp_version <= TPACKET_V2)
1806                 __packet_set_status(po, h.raw, status);
1807         else
1808                 prb_clear_blk_fill_status(&po->rx_ring);
1809 
1810         sk->sk_data_ready(sk, 0);
1811 
1812 drop_n_restore:
1813         if (skb_head != skb->data && skb_shared(skb)) {
1814                 skb->data = skb_head;
1815                 skb->len = skb_len;
1816         }
1817 drop:
1818         kfree_skb(skb);
1819         return 0;
1820 
1821 ring_is_full:
1822         po->stats.tp_drops++;
1823         spin_unlock(&sk->sk_receive_queue.lock);
1824 
1825         sk->sk_data_ready(sk, 0);
1826         kfree_skb(copy_skb);
1827         goto drop_n_restore;
1828 }
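
/*
 * Illustrative userspace sketch (not part of af_packet.c): tpacket_rcv()
 * copies frames straight into the mmap()ed RX ring and flips tp_status
 * to TP_STATUS_USER.  A typical TPACKET_V2 consumer sets up the ring,
 * maps it, and walks the frames (sizes are examples only, error handling
 * omitted, "fd" is an AF_PACKET socket):
 *
 *   #include <sys/mman.h>
 *   #include <sys/socket.h>
 *   #include <poll.h>
 *   #include <linux/if_packet.h>
 *
 *   struct tpacket_req req = {
 *           .tp_block_size = 4096,
 *           .tp_block_nr   = 64,
 *           .tp_frame_size = 2048,
 *           .tp_frame_nr   = 128,   // block_size * block_nr / frame_size
 *   };
 *   int ver = TPACKET_V2;
 *   setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));
 *   setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 *   char *ring = mmap(NULL, (size_t)req.tp_block_size * req.tp_block_nr,
 *                     PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *
 *   for (unsigned int i = 0; ; i = (i + 1) % req.tp_frame_nr) {
 *           struct tpacket2_hdr *hdr = (void *)(ring + i * req.tp_frame_size);
 *           struct pollfd pfd = { .fd = fd, .events = POLLIN };
 *
 *           while (!(hdr->tp_status & TP_STATUS_USER))
 *                   poll(&pfd, 1, -1);
 *           // frame data starts at (char *)hdr + hdr->tp_mac;
 *           // hdr->tp_snaplen bytes of it were copied by tpacket_rcv()
 *           hdr->tp_status = TP_STATUS_KERNEL;   // hand the slot back
 *   }
 */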
1829 
1830 static void tpacket_destruct_skb(struct sk_buff *skb)
1831 {
1832         struct packet_sock *po = pkt_sk(skb->sk);
1833         void *ph;
1834 
1835         if (likely(po->tx_ring.pg_vec)) {
1836                 ph = skb_shinfo(skb)->destructor_arg;
1837                 BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
1838                 atomic_dec(&po->tx_ring.pending);
1839                 __packet_set_status(po, ph, TP_STATUS_AVAILABLE);
1840         }
1841 
1842         sock_wfree(skb);
1843 }
1844 
1845 static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
1846                 void *frame, struct net_device *dev, int size_max,
1847                 __be16 proto, unsigned char *addr, int hlen)
1848 {
1849         union {
1850                 struct tpacket_hdr *h1;
1851                 struct tpacket2_hdr *h2;
1852                 void *raw;
1853         } ph;
1854         int to_write, offset, len, tp_len, nr_frags, len_max;
1855         struct socket *sock = po->sk.sk_socket;
1856         struct page *page;
1857         void *data;
1858         int err;
1859         struct flow_keys keys;
1860 
1861         ph.raw = frame;
1862 
1863         skb->protocol = proto;
1864         skb->dev = dev;
1865         skb->priority = po->sk.sk_priority;
1866         skb->mark = po->sk.sk_mark;
1867         skb_shinfo(skb)->destructor_arg = ph.raw;
1868 
1869         switch (po->tp_version) {
1870         case TPACKET_V2:
1871                 tp_len = ph.h2->tp_len;
1872                 break;
1873         default:
1874                 tp_len = ph.h1->tp_len;
1875                 break;
1876         }
1877         if (unlikely(tp_len > size_max)) {
1878                 pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
1879                 return -EMSGSIZE;
1880         }
1881 
1882         skb_reserve(skb, hlen);
1883         skb_reset_network_header(skb);
1884 
1885         if (skb_flow_dissect(skb, &keys))
1886                 skb_set_transport_header(skb, keys.thoff);
1887         else
1888                 skb_reset_transport_header(skb);
1889 
1890         if (po->tp_tx_has_off) {
1891                 int off_min, off_max, off;
1892                 off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
1893                 off_max = po->tx_ring.frame_size - tp_len;
1894                 if (sock->type == SOCK_DGRAM) {
1895                         switch (po->tp_version) {
1896                         case TPACKET_V2:
1897                                 off = ph.h2->tp_net;
1898                                 break;
1899                         default:
1900                                 off = ph.h1->tp_net;
1901                                 break;
1902                         }
1903                 } else {
1904                         switch (po->tp_version) {
1905                         case TPACKET_V2:
1906                                 off = ph.h2->tp_mac;
1907                                 break;
1908                         default:
1909                                 off = ph.h1->tp_mac;
1910                                 break;
1911                         }
1912                 }
1913                 if (unlikely((off < off_min) || (off_max < off)))
1914                         return -EINVAL;
1915                 data = ph.raw + off;
1916         } else {
1917                 data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
1918         }
1919         to_write = tp_len;
1920 
1921         if (sock->type == SOCK_DGRAM) {
1922                 err = dev_hard_header(skb, dev, ntohs(proto), addr,
1923                                 NULL, tp_len);
1924                 if (unlikely(err < 0))
1925                         return -EINVAL;
1926         } else if (dev->hard_header_len) {
1927                 /* net device doesn't like empty head */
1928                 if (unlikely(tp_len <= dev->hard_header_len)) {
1929                         pr_err("packet size is too short (%d < %d)\n",
1930                                tp_len, dev->hard_header_len);
1931                         return -EINVAL;
1932                 }
1933 
1934                 skb_push(skb, dev->hard_header_len);
1935                 err = skb_store_bits(skb, 0, data,
1936                                 dev->hard_header_len);
1937                 if (unlikely(err))
1938                         return err;
1939 
1940                 data += dev->hard_header_len;
1941                 to_write -= dev->hard_header_len;
1942         }
1943 
1944         offset = offset_in_page(data);
1945         len_max = PAGE_SIZE - offset;
1946         len = ((to_write > len_max) ? len_max : to_write);
1947 
1948         skb->data_len = to_write;
1949         skb->len += to_write;
1950         skb->truesize += to_write;
1951         atomic_add(to_write, &po->sk.sk_wmem_alloc);
1952 
1953         while (likely(to_write)) {
1954                 nr_frags = skb_shinfo(skb)->nr_frags;
1955 
1956                 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
1957                         pr_err("Packet exceeds the number of skb frags (%lu)\n",
1958                                MAX_SKB_FRAGS);
1959                         return -EFAULT;
1960                 }
1961 
1962                 page = pgv_to_page(data);
1963                 data += len;
1964                 flush_dcache_page(page);
1965                 get_page(page);
1966                 skb_fill_page_desc(skb, nr_frags, page, offset, len);
1967                 to_write -= len;
1968                 offset = 0;
1969                 len_max = PAGE_SIZE;
1970                 len = ((to_write > len_max) ? len_max : to_write);
1971         }
1972 
1973         return tp_len;
1974 }
1975 
1976 static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
1977 {
1978         struct sk_buff *skb;
1979         struct net_device *dev;
1980         __be16 proto;
1981         bool need_rls_dev = false;
1982         int err, reserve = 0;
1983         void *ph;
1984         struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
1985         int tp_len, size_max;
1986         unsigned char *addr;
1987         int len_sum = 0;
1988         int status = TP_STATUS_AVAILABLE;
1989         int hlen, tlen;
1990 
1991         mutex_lock(&po->pg_vec_lock);
1992 
1993         if (saddr == NULL) {
1994                 dev = po->prot_hook.dev;
1995                 proto   = po->num;
1996                 addr    = NULL;
1997         } else {
1998                 err = -EINVAL;
1999                 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2000                         goto out;
2001                 if (msg->msg_namelen < (saddr->sll_halen
2002                                         + offsetof(struct sockaddr_ll,
2003                                                 sll_addr)))
2004                         goto out;
2005                 proto   = saddr->sll_protocol;
2006                 addr    = saddr->sll_addr;
2007                 dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
2008                 need_rls_dev = true;
2009         }
2010 
2011         err = -ENXIO;
2012         if (unlikely(dev == NULL))
2013                 goto out;
2014 
2015         reserve = dev->hard_header_len;
2016 
2017         err = -ENETDOWN;
2018         if (unlikely(!(dev->flags & IFF_UP)))
2019                 goto out_put;
2020 
2021         size_max = po->tx_ring.frame_size
2022                 - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
2023 
2024         if (size_max > dev->mtu + reserve)
2025                 size_max = dev->mtu + reserve;
2026 
2027         do {
2028                 ph = packet_current_frame(po, &po->tx_ring,
2029                                 TP_STATUS_SEND_REQUEST);
2030 
2031                 if (unlikely(ph == NULL)) {
2032                         schedule();
2033                         continue;
2034                 }
2035 
2036                 status = TP_STATUS_SEND_REQUEST;
2037                 hlen = LL_RESERVED_SPACE(dev);
2038                 tlen = dev->needed_tailroom;
2039                 skb = sock_alloc_send_skb(&po->sk,
2040                                 hlen + tlen + sizeof(struct sockaddr_ll),
2041                                 0, &err);
2042 
2043                 if (unlikely(skb == NULL))
2044                         goto out_status;
2045 
2046                 tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
2047                                 addr, hlen);
2048 
2049                 if (unlikely(tp_len < 0)) {
2050                         if (po->tp_loss) {
2051                                 __packet_set_status(po, ph,
2052                                                 TP_STATUS_AVAILABLE);
2053                                 packet_increment_head(&po->tx_ring);
2054                                 kfree_skb(skb);
2055                                 continue;
2056                         } else {
2057                                 status = TP_STATUS_WRONG_FORMAT;
2058                                 err = tp_len;
2059                                 goto out_status;
2060                         }
2061                 }
2062 
2063                 skb->destructor = tpacket_destruct_skb;
2064                 __packet_set_status(po, ph, TP_STATUS_SENDING);
2065                 atomic_inc(&po->tx_ring.pending);
2066 
2067                 status = TP_STATUS_SEND_REQUEST;
2068                 err = dev_queue_xmit(skb);
2069                 if (unlikely(err > 0)) {
2070                         err = net_xmit_errno(err);
2071                         if (err && __packet_get_status(po, ph) ==
2072                                    TP_STATUS_AVAILABLE) {
2073                                 /* skb was destructed already */
2074                                 skb = NULL;
2075                                 goto out_status;
2076                         }
2077                         /*
2078                          * skb was dropped but not destructed yet;
2079                          * let's treat it like congestion or err < 0
2080                          */
2081                         err = 0;
2082                 }
2083                 packet_increment_head(&po->tx_ring);
2084                 len_sum += tp_len;
2085         } while (likely((ph != NULL) ||
2086                         ((!(msg->msg_flags & MSG_DONTWAIT)) &&
2087                          (atomic_read(&po->tx_ring.pending))))
2088                 );
2089 
2090         err = len_sum;
2091         goto out_put;
2092 
2093 out_status:
2094         __packet_set_status(po, ph, status);
2095         kfree_skb(skb);
2096 out_put:
2097         if (need_rls_dev)
2098                 dev_put(dev);
2099 out:
2100         mutex_unlock(&po->pg_vec_lock);
2101         return err;
2102 }
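
/*
 * Illustrative userspace sketch (not part of af_packet.c): tpacket_snd()
 * drains a mmap()ed TX ring; userspace fills a frame, marks it
 * TP_STATUS_SEND_REQUEST and kicks the socket with send().  Assuming a
 * TPACKET_V2 socket already bound to an interface, and a prebuilt
 * Ethernet frame in frame_bytes/frame_len (both hypothetical names,
 * error handling omitted):
 *
 *   #include <string.h>
 *   #include <sys/mman.h>
 *   #include <sys/socket.h>
 *   #include <linux/if_packet.h>
 *
 *   struct tpacket_req req = {
 *           .tp_block_size = 4096, .tp_block_nr = 16,
 *           .tp_frame_size = 2048, .tp_frame_nr = 32,
 *   };
 *   setsockopt(fd, SOL_PACKET, PACKET_TX_RING, &req, sizeof(req));
 *   char *ring = mmap(NULL, (size_t)req.tp_block_size * req.tp_block_nr,
 *                     PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *
 *   struct tpacket2_hdr *hdr = (void *)ring;                 // frame 0
 *   // default data offset used by tpacket_fill_skb() when
 *   // tp_tx_has_off is not set:
 *   char *data = ring + TPACKET2_HDRLEN - sizeof(struct sockaddr_ll);
 *   memcpy(data, frame_bytes, frame_len);
 *   hdr->tp_len    = frame_len;
 *   hdr->tp_status = TP_STATUS_SEND_REQUEST;
 *   send(fd, NULL, 0, 0);     // tpacket_snd() walks the ring and transmits
 */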
2103 
2104 static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
2105                                         size_t reserve, size_t len,
2106                                         size_t linear, int noblock,
2107                                         int *err)
2108 {
2109         struct sk_buff *skb;
2110 
2111         /* Under a page?  Don't bother with paged skb. */
2112         if (prepad + len < PAGE_SIZE || !linear)
2113                 linear = len;
2114 
2115         skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
2116                                    err);
2117         if (!skb)
2118                 return NULL;
2119 
2120         skb_reserve(skb, reserve);
2121         skb_put(skb, linear);
2122         skb->data_len = len - linear;
2123         skb->len += len - linear;
2124 
2125         return skb;
2126 }
2127 
2128 static int packet_snd(struct socket *sock,
2129                           struct msghdr *msg, size_t len)
2130 {
2131         struct sock *sk = sock->sk;
2132         struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
2133         struct sk_buff *skb;
2134         struct net_device *dev;
2135         __be16 proto;
2136         bool need_rls_dev = false;
2137         unsigned char *addr;
2138         int err, reserve = 0;
2139         struct virtio_net_hdr vnet_hdr = { 0 };
2140         int offset = 0;
2141         int vnet_hdr_len;
2142         struct packet_sock *po = pkt_sk(sk);
2143         unsigned short gso_type = 0;
2144         int hlen, tlen;
2145         int extra_len = 0;
2146         struct flow_keys keys;
2147 
2148         /*
2149          *      Get and verify the address.
2150          */
2151 
2152         if (saddr == NULL) {
2153                 dev = po->prot_hook.dev;
2154                 proto   = po->num;
2155                 addr    = NULL;
2156         } else {
2157                 err = -EINVAL;
2158                 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2159                         goto out;
2160                 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
2161                         goto out;
2162                 proto   = saddr->sll_protocol;
2163                 addr    = saddr->sll_addr;
2164                 dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
2165                 need_rls_dev = true;
2166         }
2167 
2168         err = -ENXIO;
2169         if (dev == NULL)
2170                 goto out_unlock;
2171         if (sock->type == SOCK_RAW)
2172                 reserve = dev->hard_header_len;
2173 
2174         err = -ENETDOWN;
2175         if (!(dev->flags & IFF_UP))
2176                 goto out_unlock;
2177 
2178         if (po->has_vnet_hdr) {
2179                 vnet_hdr_len = sizeof(vnet_hdr);
2180 
2181                 err = -EINVAL;
2182                 if (len < vnet_hdr_len)
2183                         goto out_unlock;
2184 
2185                 len -= vnet_hdr_len;
2186 
2187                 err = memcpy_fromiovec((void *)&vnet_hdr, msg->msg_iov,
2188                                        vnet_hdr_len);
2189                 if (err < 0)
2190                         goto out_unlock;
2191 
2192                 if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
2193                     (vnet_hdr.csum_start + vnet_hdr.csum_offset + 2 >
2194                       vnet_hdr.hdr_len))
2195                         vnet_hdr.hdr_len = vnet_hdr.csum_start +
2196                                                  vnet_hdr.csum_offset + 2;
2197 
2198                 err = -EINVAL;
2199                 if (vnet_hdr.hdr_len > len)
2200                         goto out_unlock;
2201 
2202                 if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
2203                         switch (vnet_hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
2204                         case VIRTIO_NET_HDR_GSO_TCPV4:
2205                                 gso_type = SKB_GSO_TCPV4;
2206                                 break;
2207                         case VIRTIO_NET_HDR_GSO_TCPV6:
2208                                 gso_type = SKB_GSO_TCPV6;
2209                                 break;
2210                         case VIRTIO_NET_HDR_GSO_UDP:
2211                                 gso_type = SKB_GSO_UDP;
2212                                 break;
2213                         default:
2214                                 goto out_unlock;
2215                         }
2216 
2217                         if (vnet_hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN)
2218                                 gso_type |= SKB_GSO_TCP_ECN;
2219 
2220                         if (vnet_hdr.gso_size == 0)
2221                                 goto out_unlock;
2222 
2223                 }
2224         }
2225 
2226         if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
2227                 if (!netif_supports_nofcs(dev)) {
2228                         err = -EPROTONOSUPPORT;
2229                         goto out_unlock;
2230                 }
2231                 extra_len = 4; /* We're doing our own CRC */
2232         }
2233 
2234         err = -EMSGSIZE;
2235         if (!gso_type && (len > dev->mtu + reserve + VLAN_HLEN + extra_len))
2236                 goto out_unlock;
2237 
2238         err = -ENOBUFS;
2239         hlen = LL_RESERVED_SPACE(dev);
2240         tlen = dev->needed_tailroom;
2241         skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, vnet_hdr.hdr_len,
2242                                msg->msg_flags & MSG_DONTWAIT, &err);
2243         if (skb == NULL)
2244                 goto out_unlock;
2245 
2246         skb_set_network_header(skb, reserve);
2247 
2248         err = -EINVAL;
2249         if (sock->type == SOCK_DGRAM &&
2250             (offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len)) < 0)
2251                 goto out_free;
2252 
2253         /* Returns -EFAULT on error */
2254         err = skb_copy_datagram_from_iovec(skb, offset, msg->msg_iov, 0, len);
2255         if (err)
2256                 goto out_free;
2257         err = sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
2258         if (err < 0)
2259                 goto out_free;
2260 
2261         if (!gso_type && (len > dev->mtu + reserve + extra_len)) {
2262                 /* Earlier code assumed this would be a VLAN pkt,
2263                  * double-check this now that we have the actual
2264                  * packet in hand.
2265                  */
2266                 struct ethhdr *ehdr;
2267                 skb_reset_mac_header(skb);
2268                 ehdr = eth_hdr(skb);
2269                 if (ehdr->h_proto != htons(ETH_P_8021Q)) {
2270                         err = -EMSGSIZE;
2271                         goto out_free;
2272                 }
2273         }
2274 
2275         skb->protocol = proto;
2276         skb->dev = dev;
2277         skb->priority = sk->sk_priority;
2278         skb->mark = sk->sk_mark;
2279 
2280         if (po->has_vnet_hdr) {
2281                 if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
2282                         if (!skb_partial_csum_set(skb, vnet_hdr.csum_start,
2283                                                   vnet_hdr.csum_offset)) {
2284                                 err = -EINVAL;
2285                                 goto out_free;
2286                         }
2287                 }
2288 
2289                 skb_shinfo(skb)->gso_size = vnet_hdr.gso_size;
2290                 skb_shinfo(skb)->gso_type = gso_type;
2291 
2292                 /* Header must be checked, and gso_segs computed. */
2293                 skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
2294                 skb_shinfo(skb)->gso_segs = 0;
2295 
2296                 len += vnet_hdr_len;
2297         }
2298 
2299         if (skb->ip_summed == CHECKSUM_PARTIAL)
2300                 skb_set_transport_header(skb, skb_checksum_start_offset(skb));
2301         else if (skb_flow_dissect(skb, &keys))
2302                 skb_set_transport_header(skb, keys.thoff);
2303         else
2304                 skb_set_transport_header(skb, reserve);
2305 
2306         if (unlikely(extra_len == 4))
2307                 skb->no_fcs = 1;
2308 
2309         /*
2310          *      Now send it
2311          */
2312 
2313         err = dev_queue_xmit(skb);
2314         if (err > 0 && (err = net_xmit_errno(err)) != 0)
2315                 goto out_unlock;
2316 
2317         if (need_rls_dev)
2318                 dev_put(dev);
2319 
2320         return len;
2321 
2322 out_free:
2323         kfree_skb(skb);
2324 out_unlock:
2325         if (dev && need_rls_dev)
2326                 dev_put(dev);
2327 out:
2328         return err;
2329 }
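
/*
 * Illustrative userspace sketch (not part of af_packet.c): packet_snd()
 * is the plain sendto()/sendmsg() transmit path.  On a SOCK_DGRAM packet
 * socket the kernel builds the link-layer header itself via
 * dev_hard_header() above, so the buffer holds only the payload.
 * ifindex, dest_mac, buf and buf_len are hypothetical names here:
 *
 *   #include <string.h>
 *   #include <sys/socket.h>
 *   #include <arpa/inet.h>
 *   #include <linux/if_packet.h>
 *   #include <linux/if_ether.h>
 *
 *   int fd = socket(AF_PACKET, SOCK_DGRAM, htons(ETH_P_IP));
 *   struct sockaddr_ll dst = {
 *           .sll_family   = AF_PACKET,
 *           .sll_protocol = htons(ETH_P_IP),
 *           .sll_ifindex  = ifindex,          // e.g. from if_nametoindex()
 *           .sll_halen    = ETH_ALEN,
 *   };
 *   memcpy(dst.sll_addr, dest_mac, ETH_ALEN);
 *   sendto(fd, buf, buf_len, 0, (struct sockaddr *)&dst, sizeof(dst));
 */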
2330 
2331 static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
2332                 struct msghdr *msg, size_t len)
2333 {
2334         struct sock *sk = sock->sk;
2335         struct packet_sock *po = pkt_sk(sk);
2336         if (po->tx_ring.pg_vec)
2337                 return tpacket_snd(po, msg);
2338         else
2339                 return packet_snd(sock, msg, len);
2340 }
2341 
2342 /*
2343  *      Close a PACKET socket. This is fairly simple. We immediately go
2344  *      to 'closed' state and remove our protocol entry in the device list.
2345  */
2346 
2347 static int packet_release(struct socket *sock)
2348 {
2349         struct sock *sk = sock->sk;
2350         struct packet_sock *po;
2351         struct net *net;
2352         union tpacket_req_u req_u;
2353 
2354         if (!sk)
2355                 return 0;
2356 
2357         net = sock_net(sk);
2358         po = pkt_sk(sk);
2359 
2360         mutex_lock(&net->packet.sklist_lock);
2361         sk_del_node_init_rcu(sk);
2362         mutex_unlock(&net->packet.sklist_lock);
2363 
2364         preempt_disable();
2365         sock_prot_inuse_add(net, sk->sk_prot, -1);
2366         preempt_enable();
2367 
2368         spin_lock(&po->bind_lock);
2369         unregister_prot_hook(sk, false);
2370         if (po->prot_hook.dev) {
2371                 dev_put(po->prot_hook.dev);
2372                 po->prot_hook.dev = NULL;
2373         }
2374         spin_unlock(&po->bind_lock);
2375 
2376         packet_flush_mclist(sk);
2377 
2378         if (po->rx_ring.pg_vec) {
2379                 memset(&req_u, 0, sizeof(req_u));
2380                 packet_set_ring(sk, &req_u, 1, 0);
2381         }
2382 
2383         if (po->tx_ring.pg_vec) {
2384                 memset(&req_u, 0, sizeof(req_u));
2385                 packet_set_ring(sk, &req_u, 1, 1);
2386         }
2387 
2388         fanout_release(sk);
2389 
2390         synchronize_net();
2391         /*
2392          *      Now the socket is dead. No more input will appear.
2393          */
2394         sock_orphan(sk);
2395         sock->sk = NULL;
2396 
2397         /* Purge queues */
2398 
2399         skb_queue_purge(&sk->sk_receive_queue);
2400         sk_refcnt_debug_release(sk);
2401 
2402         sock_put(sk);
2403         return 0;
2404 }
2405 
2406 /*
2407  *      Attach a packet hook.
2408  */
2409 
2410 static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
2411 {
2412         struct packet_sock *po = pkt_sk(sk);
2413 
2414         if (po->fanout) {
2415                 if (dev)
2416                         dev_put(dev);
2417 
2418                 return -EINVAL;
2419         }
2420 
2421         lock_sock(sk);
2422 
2423         spin_lock(&po->bind_lock);
2424         unregister_prot_hook(sk, true);
2425         po->num = protocol;
2426         po->prot_hook.type = protocol;
2427         if (po->prot_hook.dev)
2428                 dev_put(po->prot_hook.dev);
2429         po->prot_hook.dev = dev;
2430 
2431         po->ifindex = dev ? dev->ifindex : 0;
2432 
2433         if (protocol == 0)
2434                 goto out_unlock;
2435 
2436         if (!dev || (dev->flags & IFF_UP)) {
2437                 register_prot_hook(sk);
2438         } else {
2439                 sk->sk_err = ENETDOWN;
2440                 if (!sock_flag(sk, SOCK_DEAD))
2441                         sk->sk_error_report(sk);
2442         }
2443 
2444 out_unlock:
2445         spin_unlock(&po->bind_lock);
2446         release_sock(sk);
2447         return 0;
2448 }
2449 
2450 /*
2451  *      Bind a packet socket to a device
2452  */
2453 
2454 static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
2455                             int addr_len)
2456 {
2457         struct sock *sk = sock->sk;
2458         char name[15];
2459         struct net_device *dev;
2460         int err = -ENODEV;
2461 
2462         /*
2463          *      Check legality
2464          */
2465 
2466         if (addr_len != sizeof(struct sockaddr))
2467                 return -EINVAL;
2468         strlcpy(name, uaddr->sa_data, sizeof(name));
2469 
2470         dev = dev_get_by_name(sock_net(sk), name);
2471         if (dev)
2472                 err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
2473         return err;
2474 }
2475 
2476 static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
2477 {
2478         struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
2479         struct sock *sk = sock->sk;
2480         struct net_device *dev = NULL;
2481         int err;
2482 
2483 
2484         /*
2485          *      Check legality
2486          */
2487 
2488         if (addr_len < sizeof(struct sockaddr_ll))
2489                 return -EINVAL;
2490         if (sll->sll_family != AF_PACKET)
2491                 return -EINVAL;
2492 
2493         if (sll->sll_ifindex) {
2494                 err = -ENODEV;
2495                 dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
2496                 if (dev == NULL)
2497                         goto out;
2498         }
2499         err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
2500 
2501 out:
2502         return err;
2503 }
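
/*
 * Illustrative userspace sketch (not part of af_packet.c): packet_bind()
 * expects a sockaddr_ll; sll_ifindex selects the device (0 means any)
 * and sll_protocol, if non-zero, replaces the protocol given to
 * socket().  "eth0" is just an example name:
 *
 *   #include <sys/socket.h>
 *   #include <arpa/inet.h>
 *   #include <net/if.h>
 *   #include <linux/if_packet.h>
 *   #include <linux/if_ether.h>
 *
 *   struct sockaddr_ll sll = {
 *           .sll_family   = AF_PACKET,
 *           .sll_protocol = htons(ETH_P_ALL),
 *           .sll_ifindex  = if_nametoindex("eth0"),
 *   };
 *   bind(fd, (struct sockaddr *)&sll, sizeof(sll));
 */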
2504 
2505 static struct proto packet_proto = {
2506         .name     = "PACKET",
2507         .owner    = THIS_MODULE,
2508         .obj_size = sizeof(struct packet_sock),
2509 };
2510 
2511 /*
2512  *      Create a packet of type SOCK_PACKET.
2513  */
2514 
2515 static int packet_create(struct net *net, struct socket *sock, int protocol,
2516                          int kern)
2517 {
2518         struct sock *sk;
2519         struct packet_sock *po;
2520         __be16 proto = (__force __be16)protocol; /* weird, but documented */
2521         int err;
2522 
2523         if (!ns_capable(net->user_ns, CAP_NET_RAW))
2524                 return -EPERM;
2525         if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
2526             sock->type != SOCK_PACKET)
2527                 return -ESOCKTNOSUPPORT;
2528 
2529         sock->state = SS_UNCONNECTED;
2530 
2531         err = -ENOBUFS;
2532         sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
2533         if (sk == NULL)
2534                 goto out;
2535 
2536         sock->ops = &packet_ops;
2537         if (sock->type == SOCK_PACKET)
2538                 sock->ops = &packet_ops_spkt;
2539 
2540         sock_init_data(sock, sk);
2541 
2542         po = pkt_sk(sk);
2543         sk->sk_family = PF_PACKET;
2544         po->num = proto;
2545 
2546         sk->sk_destruct = packet_sock_destruct;
2547         sk_refcnt_debug_inc(sk);
2548 
2549         /*
2550          *      Attach a protocol block
2551          */
2552 
2553         spin_lock_init(&po->bind_lock);
2554         mutex_init(&po->pg_vec_lock);
2555         po->prot_hook.func = packet_rcv;
2556 
2557         if (sock->type == SOCK_PACKET)
2558                 po->prot_hook.func = packet_rcv_spkt;
2559 
2560         po->prot_hook.af_packet_priv = sk;
2561 
2562         if (proto) {
2563                 po->prot_hook.type = proto;
2564                 register_prot_hook(sk);
2565         }
2566 
2567         mutex_lock(&net->packet.sklist_lock);
2568         sk_add_node_rcu(sk, &net->packet.sklist);
2569         mutex_unlock(&net->packet.sklist_lock);
2570 
2571         preempt_disable();
2572         sock_prot_inuse_add(net, &packet_proto, 1);
2573         preempt_enable();
2574 
2575         return 0;
2576 out:
2577         return err;
2578 }
2579 
2580 static int packet_recv_error(struct sock *sk, struct msghdr *msg, int len)
2581 {
2582         struct sock_exterr_skb *serr;
2583         struct sk_buff *skb, *skb2;
2584         int copied, err;
2585 
2586         err = -EAGAIN;
2587         skb = skb_dequeue(&sk->sk_error_queue);
2588         if (skb == NULL)
2589                 goto out;
2590 
2591         copied = skb->len;
2592         if (copied > len) {
2593                 msg->msg_flags |= MSG_TRUNC;
2594                 copied = len;
2595         }
2596         err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
2597         if (err)
2598                 goto out_free_skb;
2599 
2600         sock_recv_timestamp(msg, sk, skb);
2601 
2602         serr = SKB_EXT_ERR(skb);
2603         put_cmsg(msg, SOL_PACKET, PACKET_TX_TIMESTAMP,
2604                  sizeof(serr->ee), &serr->ee);
2605 
2606         msg->msg_flags |= MSG_ERRQUEUE;
2607         err = copied;
2608 
2609         /* Reset and regenerate socket error */
2610         spin_lock_bh(&sk->sk_error_queue.lock);
2611         sk->sk_err = 0;
2612         if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) {
2613                 sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno;
2614                 spin_unlock_bh(&sk->sk_error_queue.lock);
2615                 sk->sk_error_report(sk);
2616         } else
2617                 spin_unlock_bh(&sk->sk_error_queue.lock);
2618 
2619 out_free_skb:
2620         kfree_skb(skb);
2621 out:
2622         return err;
2623 }
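
/*
 * Illustrative userspace sketch (not part of af_packet.c):
 * packet_recv_error() services recvmsg(..., MSG_ERRQUEUE), which is how
 * transmit timestamps requested with SO_TIMESTAMPING come back to the
 * user.  Roughly (flag choice is an example, error handling omitted):
 *
 *   #include <sys/socket.h>
 *   #include <sys/uio.h>
 *   #include <linux/net_tstamp.h>
 *   #include <linux/if_packet.h>
 *
 *   int flags = SOF_TIMESTAMPING_TX_SOFTWARE | SOF_TIMESTAMPING_SOFTWARE;
 *   setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &flags, sizeof(flags));
 *   // ... transmit a frame, then drain the error queue:
 *   char data[2048], ctrl[512];
 *   struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
 *   struct msghdr msg = {
 *           .msg_iov = &iov, .msg_iovlen = 1,
 *           .msg_control = ctrl, .msg_controllen = sizeof(ctrl),
 *   };
 *   if (recvmsg(fd, &msg, MSG_ERRQUEUE) >= 0) {
 *           struct cmsghdr *c;
 *           for (c = CMSG_FIRSTHDR(&msg); c; c = CMSG_NXTHDR(&msg, c))
 *                   if (c->cmsg_level == SOL_PACKET &&
 *                       c->cmsg_type  == PACKET_TX_TIMESTAMP)
 *                           ; // CMSG_DATA(c) is the sock_extended_err
 *                             // put there by packet_recv_error()
 *   }
 */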
2624 
2625 /*
2626  *      Pull a packet from our receive queue and hand it to the user.
2627  *      If necessary we block.
2628  */
2629 
2630 static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
2631                           struct msghdr *msg, size_t len, int flags)
2632 {
2633         struct sock *sk = sock->sk;
2634         struct sk_buff *skb;
2635         int copied, err;
2636         struct sockaddr_ll *sll;
2637         int vnet_hdr_len = 0;
2638 
2639         err = -EINVAL;
2640         if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
2641                 goto out;
2642 
2643 #if 0
2644         /* What error should we return now? EUNATTACH? */
2645         if (pkt_sk(sk)->ifindex < 0)
2646                 return -ENODEV;
2647 #endif
2648 
2649         if (flags & MSG_ERRQUEUE) {
2650                 err = packet_recv_error(sk, msg, len);
2651                 goto out;
2652         }
2653 
2654         /*
2655          *      Call the generic datagram receiver. This handles all sorts
2656          *      of horrible races and re-entrancy so we can forget about it
2657          *      in the protocol layers.
2658          *
2659          *      Now it will return ENETDOWN, if the device has just gone down,
2660          *      but then it will block.
2661          */
2662 
2663         skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
2664 
2665         /*
2666          *      An error occurred, so return it. Because skb_recv_datagram()
2667          *      handles the blocking, we don't need to see or worry about
2668          *      blocking retries.
2669          */
2670 
2671         if (skb == NULL)
2672                 goto out;
2673 
2674         if (pkt_sk(sk)->has_vnet_hdr) {
2675                 struct virtio_net_hdr vnet_hdr = { 0 };
2676 
2677                 err = -EINVAL;
2678                 vnet_hdr_len = sizeof(vnet_hdr);
2679                 if (len < vnet_hdr_len)
2680                         goto out_free;
2681 
2682                 len -= vnet_hdr_len;
2683 
2684                 if (skb_is_gso(skb)) {
2685                         struct skb_shared_info *sinfo = skb_shinfo(skb);
2686 
2687                         /* This is a hint as to how much should be linear. */
2688                         vnet_hdr.hdr_len = skb_headlen(skb);
2689                         vnet_hdr.gso_size = sinfo->gso_size;
2690                         if (sinfo->gso_type & SKB_GSO_TCPV4)
2691                                 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
2692                         else if (sinfo->gso_type & SKB_GSO_TCPV6)
2693                                 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
2694                         else if (sinfo->gso_type & SKB_GSO_UDP)
2695                                 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_UDP;
2696                         else if (sinfo->gso_type & SKB_GSO_FCOE)
2697                                 goto out_free;
2698                         else
2699                                 BUG();
2700                         if (sinfo->gso_type & SKB_GSO_TCP_ECN)
2701                                 vnet_hdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
2702                 } else
2703                         vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;
2704 
2705                 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2706                         vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
2707                         vnet_hdr.csum_start = skb_checksum_start_offset(skb);
2708                         vnet_hdr.csum_offset = skb->csum_offset;
2709                 } else if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
2710                         vnet_hdr.flags = VIRTIO_NET_HDR_F_DATA_VALID;
2711                 } /* else everything is zero */
2712 
2713                 err = memcpy_toiovec(msg->msg_iov, (void *)&vnet_hdr,
2714                                      vnet_hdr_len);
2715                 if (err < 0)
2716                         goto out_free;
2717         }
2718 
2719         /*
2720          *      If the address length field is there to be filled in, we fill
2721          *      it in now.
2722          */
2723 
2724         sll = &PACKET_SKB_CB(skb)->sa.ll;
2725         if (sock->type == SOCK_PACKET)
2726                 msg->msg_namelen = sizeof(struct sockaddr_pkt);
2727         else
2728                 msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);
2729 
2730         /*
2731          *      You lose any data beyond the buffer you gave. If it worries a
2732          *      user program, it can ask the device for its MTU anyway.
2733          */
2734 
2735         copied = skb->len;
2736         if (copied > len) {
2737                 copied = len;
2738                 msg->msg_flags |= MSG_TRUNC;
2739         }
2740 
2741         err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
2742         if (err)
2743                 goto out_free;
2744 
2745         sock_recv_ts_and_drops(msg, sk, skb);
2746 
2747         if (msg->msg_name)
2748                 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
2749                        msg->msg_namelen);
2750 
2751         if (pkt_sk(sk)->auxdata) {
2752                 struct tpacket_auxdata aux;
2753 
2754                 aux.tp_status = TP_STATUS_USER;
2755                 if (skb->ip_summed == CHECKSUM_PARTIAL)
2756                         aux.tp_status |= TP_STATUS_CSUMNOTREADY;
2757                 aux.tp_len = PACKET_SKB_CB(skb)->origlen;
2758                 aux.tp_snaplen = skb->len;
2759                 aux.tp_mac = 0;
2760                 aux.tp_net = skb_network_offset(skb);
2761                 if (vlan_tx_tag_present(skb)) {
2762                         aux.tp_vlan_tci = vlan_tx_tag_get(skb);
2763                         aux.tp_status |= TP_STATUS_VLAN_VALID;
2764                 } else {
2765                         aux.tp_vlan_tci = 0;
2766                 }
2767                 aux.tp_padding = 0;
2768                 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
2769         }
2770 
2771         /*
2772          *      Free or return the buffer as appropriate. Again this
2773          *      hides all the races and re-entrancy issues from us.
2774          */
2775         err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
2776 
2777 out_free:
2778         skb_free_datagram(sk, skb);
2779 out:
2780         return err;
2781 }
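
/*
 * Illustrative userspace sketch (not part of af_packet.c): with
 * PACKET_AUXDATA enabled, packet_recvmsg() above attaches a
 * tpacket_auxdata control message to every packet, carrying the original
 * length, VLAN tag and checksum status even when the copy was truncated:
 *
 *   #include <sys/socket.h>
 *   #include <linux/if_packet.h>
 *
 *   int one = 1;
 *   setsockopt(fd, SOL_PACKET, PACKET_AUXDATA, &one, sizeof(one));
 *   // after a recvmsg(fd, &msg, 0) call, where msg is a struct msghdr
 *   // with msg_control/msg_controllen set up as usual:
 *   struct cmsghdr *c;
 *   for (c = CMSG_FIRSTHDR(&msg); c; c = CMSG_NXTHDR(&msg, c)) {
 *           if (c->cmsg_level != SOL_PACKET || c->cmsg_type != PACKET_AUXDATA)
 *                   continue;
 *           struct tpacket_auxdata *aux = (void *)CMSG_DATA(c);
 *           // aux->tp_len is the wire length; aux->tp_vlan_tci is valid
 *           // when aux->tp_status & TP_STATUS_VLAN_VALID.
 *   }
 */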
2782 
2783 static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
2784                                int *uaddr_len, int peer)
2785 {
2786         struct net_device *dev;
2787         struct sock *sk = sock->sk;
2788 
2789         if (peer)
2790                 return -EOPNOTSUPP;
2791 
2792         uaddr->sa_family = AF_PACKET;
2793         memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data));
2794         rcu_read_lock();
2795         dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
2796         if (dev)
2797                 strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data));
2798         rcu_read_unlock();
2799         *uaddr_len = sizeof(*uaddr);
2800 
2801         return 0;
2802 }
2803 
2804 static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
2805                           int *uaddr_len, int peer)
2806 {
2807         struct net_device *dev;
2808         struct sock *sk = sock->sk;
2809         struct packet_sock *po = pkt_sk(sk);
2810         DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
2811 
2812         if (peer)
2813                 return -EOPNOTSUPP;
2814 
2815         sll->sll_family = AF_PACKET;
2816         sll->sll_ifindex = po->ifindex;
2817         sll->sll_protocol = po->num;
2818         sll->sll_pkttype = 0;
2819         rcu_read_lock();
2820         dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
2821         if (dev) {
2822                 sll->sll_hatype = dev->type;
2823                 sll->sll_halen = dev->addr_len;
2824                 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
2825         } else {
2826                 sll->sll_hatype = 0;    /* Bad: we have no ARPHRD_UNSPEC */
2827                 sll->sll_halen = 0;
2828         }
2829         rcu_read_unlock();
2830         *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
2831 
2832         return 0;
2833 }
2834 
2835 static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
2836                          int what)
2837 {
2838         switch (i->type) {
2839         case PACKET_MR_MULTICAST:
2840                 if (i->alen != dev->addr_len)
2841                         return -EINVAL;
2842                 if (what > 0)
2843                         return dev_mc_add(dev, i->addr);
2844                 else
2845                         return dev_mc_del(dev, i->addr);
2846                 break;
2847         case PACKET_MR_PROMISC:
2848                 return dev_set_promiscuity(dev, what);
2849                 break;
2850         case PACKET_MR_ALLMULTI:
2851                 return dev_set_allmulti(dev, what);
2852                 break;
2853         case PACKET_MR_UNICAST:
2854                 if (i->alen != dev->addr_len)
2855                         return -EINVAL;
2856                 if (what > 0)
2857                         return dev_uc_add(dev, i->addr);
2858                 else
2859                         return dev_uc_del(dev, i->addr);
2860                 break;
2861         default:
2862                 break;
2863         }
2864         return 0;
2865 }
2866 
2867 static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
2868 {
2869         for ( ; i; i = i->next) {
2870                 if (i->ifindex == dev->ifindex)
2871                         packet_dev_mc(dev, i, what);
2872         }
2873 }
2874 
2875 static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
2876 {
2877         struct packet_sock *po = pkt_sk(sk);
2878         struct packet_mclist *ml, *i;
2879         struct net_device *dev;
2880         int err;
2881 
2882         rtnl_lock();
2883 
2884         err = -ENODEV;
2885         dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
2886         if (!dev)
2887                 goto done;
2888 
2889         err = -EINVAL;
2890         if (mreq->mr_alen > dev->addr_len)
2891                 goto done;
2892 
2893         err = -ENOBUFS;
2894         i = kmalloc(sizeof(*i), GFP_KERNEL);
2895         if (i == NULL)
2896                 goto done;
2897 
2898         err = 0;
2899         for (ml = po->mclist; ml; ml = ml->next) {
2900                 if (ml->ifindex == mreq->mr_ifindex &&
2901                     ml->type == mreq->mr_type &&
2902                     ml->alen == mreq->mr_alen &&
2903                     memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
2904                         ml->count++;
2905                         /* Free the new element ... */
2906                         kfree(i);
2907                         goto done;
2908                 }
2909         }
2910 
2911         i->type = mreq->mr_type;
2912         i->ifindex = mreq->mr_ifindex;
2913         i->alen = mreq->mr_alen;
2914         memcpy(i->addr, mreq->mr_address, i->alen);
2915         i->count = 1;
2916         i->next = po->mclist;
2917         po->mclist = i;
2918         err = packet_dev_mc(dev, i, 1);
2919         if (err) {
2920                 po->mclist = i->next;
2921                 kfree(i);
2922         }
2923 
2924 done:
2925         rtnl_unlock();
2926         return err;
2927 }
2928 
2929 static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
2930 {
2931         struct packet_mclist *ml, **mlp;
2932 
2933         rtnl_lock();
2934 
2935         for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
2936                 if (ml->ifindex == mreq->mr_ifindex &&
2937                     ml->type == mreq->mr_type &&
2938                     ml->alen == mreq->mr_alen &&
2939                     memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
2940                         if (--ml->count == 0) {
2941                                 struct net_device *dev;
2942                                 *mlp = ml->next;
2943                                 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
2944                                 if (dev)
2945                                         packet_dev_mc(dev, ml, -1);
2946                                 kfree(ml);
2947                         }
2948                         rtnl_unlock();
2949                         return 0;
2950                 }
2951         }
2952         rtnl_unlock();
2953         return -EADDRNOTAVAIL;
2954 }
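
/*
 * Illustrative userspace sketch (not part of af_packet.c): the membership
 * helpers above are reached through setsockopt(PACKET_ADD_MEMBERSHIP /
 * PACKET_DROP_MEMBERSHIP); for example, putting one interface into
 * promiscuous mode for the lifetime of the socket ("eth0" is just an
 * example name):
 *
 *   #include <sys/socket.h>
 *   #include <net/if.h>
 *   #include <linux/if_packet.h>
 *
 *   struct packet_mreq mreq = {
 *           .mr_ifindex = if_nametoindex("eth0"),
 *           .mr_type    = PACKET_MR_PROMISC,
 *   };
 *   setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP, &mreq, sizeof(mreq));
 */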
2955 
2956 static void packet_flush_mclist(struct sock *sk)
2957 {
2958         struct packet_sock *po = pkt_sk(sk);
2959         struct packet_mclist *ml;
2960 
2961         if (!po->mclist)
2962                 return;
2963 
2964         rtnl_lock();
2965         while ((ml = po->mclist) != NULL) {
2966                 struct net_device *dev;
2967 
2968                 po->mclist = ml->next;
2969                 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
2970                 if (dev != NULL)
2971                         packet_dev_mc(dev, ml, -1);
2972                 kfree(ml);
2973         }
2974         rtnl_unlock();
2975 }
2976 
2977 static int
2978 packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
2979 {
2980         struct sock *sk = sock->sk;
2981         struct packet_sock *po = pkt_sk(sk);
2982         int ret;
2983 
2984         if (level != SOL_PACKET)
2985                 return -ENOPROTOOPT;
2986 
2987         switch (optname) {
2988         case PACKET_ADD_MEMBERSHIP:
2989         case PACKET_DROP_MEMBERSHIP:
2990         {
2991                 struct packet_mreq_max mreq;
2992                 int len = optlen;
2993                 memset(&mreq, 0, sizeof(mreq));
2994                 if (len < sizeof(struct packet_mreq))
2995                         return -EINVAL;
2996                 if (len > sizeof(mreq))
2997                         len = sizeof(mreq);
2998                 if (copy_from_user(&mreq, optval, len))
2999                         return -EFAULT;
3000                 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
3001                         return -EINVAL;
3002                 if (optname == PACKET_ADD_MEMBERSHIP)
3003                         ret = packet_mc_add(sk, &mreq);
3004                 else
3005                         ret = packet_mc_drop(sk, &mreq);
3006                 return ret;
3007         }
3008 
3009         case PACKET_RX_RING:
3010         case PACKET_TX_RING:
3011         {
3012                 union tpacket_req_u req_u;
3013                 int len;
3014 
3015                 switch (po->tp_version) {
3016                 case TPACKET_V1:
3017                 case TPACKET_V2:
3018                         len = sizeof(req_u.req);
3019                         break;
3020                 case TPACKET_V3:
3021                 default:
3022                         len = sizeof(req_u.req3);
3023                         break;
3024                 }
3025                 if (optlen < len)
3026                         return -EINVAL;
3027                 if (pkt_sk(sk)->has_vnet_hdr)
3028                         return -EINVAL;
3029                 if (copy_from_user(&req_u.req, optval, len))
3030                         return -EFAULT;
3031                 return packet_set_ring(sk, &req_u, 0,
3032                         optname == PACKET_TX_RING);
3033         }
3034         case PACKET_COPY_THRESH:
3035         {
3036                 int val;
3037 
3038                 if (optlen != sizeof(val))
3039                         return -EINVAL;
3040                 if (copy_from_user(&val, optval, sizeof(val)))
3041                         return -EFAULT;
3042 
3043                 pkt_sk(sk)->copy_thresh = val;
3044                 return 0;
3045         }
3046         case PACKET_VERSION:
3047         {
3048                 int val;
3049 
3050                 if (optlen != sizeof(val))
3051                         return -EINVAL;
3052                 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3053                         return -EBUSY;
3054                 if (copy_from_user(&val, optval, sizeof(val)))
3055                         return -EFAULT;
3056                 switch (val) {
3057                 case TPACKET_V1:
3058                 case TPACKET_V2:
3059                 case TPACKET_V3:
3060                         po->tp_version = val;
3061                         return 0;
3062                 default:
3063                         return -EINVAL;
3064                 }
3065         }
3066         case PACKET_RESERVE:
3067         {
3068                 unsigned int val;
3069 
3070                 if (optlen != sizeof(val))
3071                         return -EINVAL;
3072                 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3073                         return -EBUSY;
3074                 if (copy_from_user(&val, optval, sizeof(val)))
3075                         return -EFAULT;
3076                 po->tp_reserve = val;
3077                 return 0;
3078         }
3079         case PACKET_LOSS:
3080         {
3081                 unsigned int val;
3082 
3083                 if (optlen != sizeof(val))
3084                         return -EINVAL;
3085                 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3086                         return -EBUSY;
3087                 if (copy_from_user(&val, optval, sizeof(val)))
3088                         return -EFAULT;
3089                 po->tp_loss = !!val;
3090                 return 0;
3091         }
3092         case PACKET_AUXDATA:
3093         {
3094                 int val;
3095 
3096                 if (optlen < sizeof(val))
3097                         return -EINVAL;
3098                 if (copy_from_user(&val, optval, sizeof(val)))
3099                         return -EFAULT;
3100 
3101                 po->auxdata = !!val;
3102                 return 0;
3103         }
3104         case PACKET_ORIGDEV:
3105         {
3106                 int val;
3107 
3108                 if (optlen < sizeof(val))
3109                         return -EINVAL;
3110                 if (copy_from_user(&val, optval, sizeof(val)))
3111                         return -EFAULT;
3112 
3113                 po->origdev = !!val;
3114                 return 0;
3115         }
3116         case PACKET_VNET_HDR:
3117         {
3118                 int val;
3119 
3120                 if (sock->type != SOCK_RAW)
3121                         return -EINVAL;
3122                 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3123                         return -EBUSY;
3124                 if (optlen < sizeof(val))
3125                         return -EINVAL;
3126                 if (copy_from_user(&val, optval, sizeof(val)))
3127                         return -EFAULT;
3128 
3129                 po->has_vnet_hdr = !!val;
3130                 return 0;
3131         }
3132         case PACKET_TIMESTAMP:
3133         {
3134                 int val;
3135 
3136                 if (optlen != sizeof(val))
3137                         return -EINVAL;
3138                 if (copy_from_user(&val, optval, sizeof(val)))
3139                         return -EFAULT;
3140 
3141                 po->tp_tstamp = val;
3142                 return 0;
3143         }
3144         case PACKET_FANOUT:
3145         {
3146                 int val;
3147 
3148                 if (optlen != sizeof(val))
3149                         return -EINVAL;
3150                 if (copy_from_user(&val, optval, sizeof(val)))
3151                         return -EFAULT;
3152 
3153                 return fanout_add(sk, val & 0xffff, val >> 16);
3154         }
3155         case PACKET_TX_HAS_OFF:
3156         {
3157                 unsigned int val;
3158 
3159                 if (optlen != sizeof(val))
3160                         return -EINVAL;
3161                 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3162                         return -EBUSY;
3163                 if (copy_from_user(&val, optval, sizeof(val)))
3164                         return -EFAULT;
3165                 po->tp_tx_has_off = !!val;
3166                 return 0;
3167         }
3168         default:
3169                 return -ENOPROTOOPT;
3170         }
3171 }
3172 
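/*
 * Usage sketch (userspace, not part of af_packet.c): a minimal example of
 * exercising the PACKET_ADD_MEMBERSHIP case handled by packet_setsockopt()
 * above, here requesting promiscuous mode on one interface of an already
 * open PF_PACKET socket.  The interface name handling and the lack of
 * error checking around if_nametoindex() are illustrative assumptions.
 */
#include <sys/socket.h>
#include <linux/if_packet.h>
#include <net/if.h>
#include <string.h>

static int enable_promisc(int fd, const char *ifname)
{
        struct packet_mreq mreq;

        memset(&mreq, 0, sizeof(mreq));
        mreq.mr_ifindex = if_nametoindex(ifname);
        mreq.mr_type = PACKET_MR_PROMISC;

        /* Dispatched to packet_mc_add() by the case above. */
        return setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP,
                          &mreq, sizeof(mreq));
}
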
3173 static int packet_getsockopt(struct socket *sock, int level, int optname,
3174                              char __user *optval, int __user *optlen)
3175 {
3176         int len;
3177         int val, lv = sizeof(val);
3178         struct sock *sk = sock->sk;
3179         struct packet_sock *po = pkt_sk(sk);
3180         void *data = &val;
3181         struct tpacket_stats st;
3182         union tpacket_stats_u st_u;
3183 
3184         if (level != SOL_PACKET)
3185                 return -ENOPROTOOPT;
3186 
3187         if (get_user(len, optlen))
3188                 return -EFAULT;
3189 
3190         if (len < 0)
3191                 return -EINVAL;
3192 
3193         switch (optname) {
3194         case PACKET_STATISTICS:
3195                 spin_lock_bh(&sk->sk_receive_queue.lock);
3196                 if (po->tp_version == TPACKET_V3) {
3197                         lv = sizeof(struct tpacket_stats_v3);
3198                         memcpy(&st_u.stats3, &po->stats,
3199                                sizeof(struct tpacket_stats));
3200                         st_u.stats3.tp_freeze_q_cnt =
3201                                         po->stats_u.stats3.tp_freeze_q_cnt;
3202                         st_u.stats3.tp_packets += po->stats.tp_drops;
3203                         data = &st_u.stats3;
3204                 } else {
3205                         lv = sizeof(struct tpacket_stats);
3206                         st = po->stats;
3207                         st.tp_packets += st.tp_drops;
3208                         data = &st;
3209                 }
3210                 memset(&po->stats, 0, sizeof(st));
3211                 spin_unlock_bh(&sk->sk_receive_queue.lock);
3212                 break;
3213         case PACKET_AUXDATA:
3214                 val = po->auxdata;
3215                 break;
3216         case PACKET_ORIGDEV:
3217                 val = po->origdev;
3218                 break;
3219         case PACKET_VNET_HDR:
3220                 val = po->has_vnet_hdr;
3221                 break;
3222         case PACKET_VERSION:
3223                 val = po->tp_version;
3224                 break;
3225         case PACKET_HDRLEN:
3226                 if (len > sizeof(int))
3227                         len = sizeof(int);
3228                 if (copy_from_user(&val, optval, len))
3229                         return -EFAULT;
3230                 switch (val) {
3231                 case TPACKET_V1:
3232                         val = sizeof(struct tpacket_hdr);
3233                         break;
3234                 case TPACKET_V2:
3235                         val = sizeof(struct tpacket2_hdr);
3236                         break;
3237                 case TPACKET_V3:
3238                         val = sizeof(struct tpacket3_hdr);
3239                         break;
3240                 default:
3241                         return -EINVAL;
3242                 }
3243                 break;
3244         case PACKET_RESERVE:
3245                 val = po->tp_reserve;
3246                 break;
3247         case PACKET_LOSS:
3248                 val = po->tp_loss;
3249                 break;
3250         case PACKET_TIMESTAMP:
3251                 val = po->tp_tstamp;
3252                 break;
3253         case PACKET_FANOUT:
3254                 val = (po->fanout ?
3255                        ((u32)po->fanout->id |
3256                         ((u32)po->fanout->type << 16)) :
3257                        0);
3258                 break;
3259         case PACKET_TX_HAS_OFF:
3260                 val = po->tp_tx_has_off;
3261                 break;
3262         default:
3263                 return -ENOPROTOOPT;
3264         }
3265 
3266         if (len > lv)
3267                 len = lv;
3268         if (put_user(len, optlen))
3269                 return -EFAULT;
3270         if (copy_to_user(optval, data, len))
3271                 return -EFAULT;
3272         return 0;
3273 }
3274 
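/*
 * Usage sketch (userspace, not part of af_packet.c): reading the
 * PACKET_STATISTICS counters filled in by packet_getsockopt() above, for a
 * TPACKET_V1/V2 socket (TPACKET_V3 returns the larger tpacket_stats_v3).
 * As the memset() in that case shows, the counters are zeroed on each
 * read, and tp_packets already includes tp_drops.
 */
#include <sys/socket.h>
#include <linux/if_packet.h>
#include <stdio.h>

static void dump_packet_stats(int fd)
{
        struct tpacket_stats st;
        socklen_t len = sizeof(st);

        if (getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &st, &len) == 0)
                printf("received %u, dropped %u\n", st.tp_packets, st.tp_drops);
}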
3275 
3276 static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
3277 {
3278         struct sock *sk;
3279         struct net_device *dev = data;
3280         struct net *net = dev_net(dev);
3281 
3282         rcu_read_lock();
3283         sk_for_each_rcu(sk, &net->packet.sklist) {
3284                 struct packet_sock *po = pkt_sk(sk);
3285 
3286                 switch (msg) {
3287                 case NETDEV_UNREGISTER:
3288                         if (po->mclist)
3289                                 packet_dev_mclist(dev, po->mclist, -1);
3290                         /* fallthrough */
3291 
3292                 case NETDEV_DOWN:
3293                         if (dev->ifindex == po->ifindex) {
3294                                 spin_lock(&po->bind_lock);
3295                                 if (po->running) {
3296                                         __unregister_prot_hook(sk, false);
3297                                         sk->sk_err = ENETDOWN;
3298                                         if (!sock_flag(sk, SOCK_DEAD))
3299                                                 sk->sk_error_report(sk);
3300                                 }
3301                                 if (msg == NETDEV_UNREGISTER) {
3302                                         po->ifindex = -1;
3303                                         if (po->prot_hook.dev)
3304                                                 dev_put(po->prot_hook.dev);
3305                                         po->prot_hook.dev = NULL;
3306                                 }
3307                                 spin_unlock(&po->bind_lock);
3308                         }
3309                         break;
3310                 case NETDEV_UP:
3311                         if (dev->ifindex == po->ifindex) {
3312                                 spin_lock(&po->bind_lock);
3313                                 if (po->num)
3314                                         register_prot_hook(sk);
3315                                 spin_unlock(&po->bind_lock);
3316                         }
3317                         break;
3318                 }
3319         }
3320         rcu_read_unlock();
3321         return NOTIFY_DONE;
3322 }
3323 
3324 
3325 static int packet_ioctl(struct socket *sock, unsigned int cmd,
3326                         unsigned long arg)
3327 {
3328         struct sock *sk = sock->sk;
3329 
3330         switch (cmd) {
3331         case SIOCOUTQ:
3332         {
3333                 int amount = sk_wmem_alloc_get(sk);
3334 
3335                 return put_user(amount, (int __user *)arg);
3336         }
3337         case SIOCINQ:
3338         {
3339                 struct sk_buff *skb;
3340                 int amount = 0;
3341 
3342                 spin_lock_bh(&sk->sk_receive_queue.lock);
3343                 skb = skb_peek(&sk->sk_receive_queue);
3344                 if (skb)
3345                         amount = skb->len;
3346                 spin_unlock_bh(&sk->sk_receive_queue.lock);
3347                 return put_user(amount, (int __user *)arg);
3348         }
3349         case SIOCGSTAMP:
3350                 return sock_get_timestamp(sk, (struct timeval __user *)arg);
3351         case SIOCGSTAMPNS:
3352                 return sock_get_timestampns(sk, (struct timespec __user *)arg);
3353 
3354 #ifdef CONFIG_INET
3355         case SIOCADDRT:
3356         case SIOCDELRT:
3357         case SIOCDARP:
3358         case SIOCGARP:
3359         case SIOCSARP:
3360         case SIOCGIFADDR:
3361         case SIOCSIFADDR:
3362         case SIOCGIFBRDADDR:
3363         case SIOCSIFBRDADDR:
3364         case SIOCGIFNETMASK:
3365         case SIOCSIFNETMASK:
3366         case SIOCGIFDSTADDR:
3367         case SIOCSIFDSTADDR:
3368         case SIOCSIFFLAGS:
3369                 return inet_dgram_ops.ioctl(sock, cmd, arg);
3370 #endif
3371 
3372         default:
3373                 return -ENOIOCTLCMD;
3374         }
3375         return 0;
3376 }
3377 
3378 static unsigned int packet_poll(struct file *file, struct socket *sock,
3379                                 poll_table *wait)
3380 {
3381         struct sock *sk = sock->sk;
3382         struct packet_sock *po = pkt_sk(sk);
3383         unsigned int mask = datagram_poll(file, sock, wait);
3384 
3385         spin_lock_bh(&sk->sk_receive_queue.lock);
3386         if (po->rx_ring.pg_vec) {
3387                 if (!packet_previous_rx_frame(po, &po->rx_ring,
3388                         TP_STATUS_KERNEL))
3389                         mask |= POLLIN | POLLRDNORM;
3390         }
3391         spin_unlock_bh(&sk->sk_receive_queue.lock);
3392         spin_lock_bh(&sk->sk_write_queue.lock);
3393         if (po->tx_ring.pg_vec) {
3394                 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
3395                         mask |= POLLOUT | POLLWRNORM;
3396         }
3397         spin_unlock_bh(&sk->sk_write_queue.lock);
3398         return mask;
3399 }
3400 
3401 
3402 /* Dirty? Well, I still did not learn a better way to account
3403  * for user mmaps.
3404  */
3405 
3406 static void packet_mm_open(struct vm_area_struct *vma)
3407 {
3408         struct file *file = vma->vm_file;
3409         struct socket *sock = file->private_data;
3410         struct sock *sk = sock->sk;
3411 
3412         if (sk)
3413                 atomic_inc(&pkt_sk(sk)->mapped);
3414 }
3415 
3416 static void packet_mm_close(struct vm_area_struct *vma)
3417 {
3418         struct file *file = vma->vm_file;
3419         struct socket *sock = file->private_data;
3420         struct sock *sk = sock->sk;
3421 
3422         if (sk)
3423                 atomic_dec(&pkt_sk(sk)->mapped);
3424 }
3425 
3426 static const struct vm_operations_struct packet_mmap_ops = {
3427         .open   =       packet_mm_open,
3428         .close  =       packet_mm_close,
3429 };
3430 
3431 static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
3432                         unsigned int len)
3433 {
3434         int i;
3435 
3436         for (i = 0; i < len; i++) {
3437                 if (likely(pg_vec[i].buffer)) {
3438                         if (is_vmalloc_addr(pg_vec[i].buffer))
3439                                 vfree(pg_vec[i].buffer);
3440                         else
3441                                 free_pages((unsigned long)pg_vec[i].buffer,
3442                                            order);
3443                         pg_vec[i].buffer = NULL;
3444                 }
3445         }
3446         kfree(pg_vec);
3447 }
3448 
3449 static char *alloc_one_pg_vec_page(unsigned long order)
3450 {
3451         char *buffer = NULL;
3452         gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
3453                           __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
3454 
3455         buffer = (char *) __get_free_pages(gfp_flags, order);
3456 
3457         if (buffer)
3458                 return buffer;
3459 
3460         /*
3461          * __get_free_pages failed, fall back to vmalloc
3462          */
3463         buffer = vzalloc((1 << order) * PAGE_SIZE);
3464 
3465         if (buffer)
3466                 return buffer;
3467 
3468         /*
3469          * vmalloc failed, let's dig into swap here
3470          */
3471         gfp_flags &= ~__GFP_NORETRY;
3472         buffer = (char *)__get_free_pages(gfp_flags, order);
3473         if (buffer)
3474                 return buffer;
3475 
3476         /*
3477          * complete and utter failure
3478          */
3479         return NULL;
3480 }
3481 
3482 static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
3483 {
3484         unsigned int block_nr = req->tp_block_nr;
3485         struct pgv *pg_vec;
3486         int i;
3487 
3488         pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);
3489         if (unlikely(!pg_vec))
3490                 goto out;
3491 
3492         for (i = 0; i < block_nr; i++) {
3493                 pg_vec[i].buffer = alloc_one_pg_vec_page(order);
3494                 if (unlikely(!pg_vec[i].buffer))
3495                         goto out_free_pgvec;
3496         }
3497 
3498 out:
3499         return pg_vec;
3500 
3501 out_free_pgvec:
3502         free_pg_vec(pg_vec, order, block_nr);
3503         pg_vec = NULL;
3504         goto out;
3505 }
3506 
3507 static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
3508                 int closing, int tx_ring)
3509 {
3510         struct pgv *pg_vec = NULL;
3511         struct packet_sock *po = pkt_sk(sk);
3512         int was_running, order = 0;
3513         struct packet_ring_buffer *rb;
3514         struct sk_buff_head *rb_queue;
3515         __be16 num;
3516         int err = -EINVAL;
3517         /* Added to keep code churn minimal */
3518         struct tpacket_req *req = &req_u->req;
3519 
3520         /* Opening a Tx-ring is NOT supported in TPACKET_V3 */
3521         if (!closing && tx_ring && (po->tp_version > TPACKET_V2)) {
3522                 WARN(1, "Tx-ring is not supported.\n");
3523                 goto out;
3524         }
3525 
3526         rb = tx_ring ? &po->tx_ring : &po->rx_ring;
3527         rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
3528 
3529         err = -EBUSY;
3530         if (!closing) {
3531                 if (atomic_read(&po->mapped))
3532                         goto out;
3533                 if (atomic_read(&rb->pending))
3534                         goto out;
3535         }
3536 
3537         if (req->tp_block_nr) {
3538                 /* Sanity tests and some calculations */
3539                 err = -EBUSY;
3540                 if (unlikely(rb->pg_vec))
3541                         goto out;
3542 
3543                 switch (po->tp_version) {
3544                 case TPACKET_V1:
3545                         po->tp_hdrlen = TPACKET_HDRLEN;
3546                         break;
3547                 case TPACKET_V2:
3548                         po->tp_hdrlen = TPACKET2_HDRLEN;
3549                         break;
3550                 case TPACKET_V3:
3551                         po->tp_hdrlen = TPACKET3_HDRLEN;
3552                         break;
3553                 }
3554 
3555                 err = -EINVAL;
3556                 if (unlikely((int)req->tp_block_size <= 0))
3557                         goto out;
3558                 if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
3559                         goto out;
3560                 if (unlikely(req->tp_frame_size < po->tp_hdrlen +
3561                                         po->tp_reserve))
3562                         goto out;
3563                 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
3564                         goto out;
3565 
3566                 rb->frames_per_block = req->tp_block_size/req->tp_frame_size;
3567                 if (unlikely(rb->frames_per_block <= 0))
3568                         goto out;
3569                 if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
3570                                         req->tp_frame_nr))
3571                         goto out;
3572 
3573                 err = -ENOMEM;
3574                 order = get_order(req->tp_block_size);
3575                 pg_vec = alloc_pg_vec(req, order);
3576                 if (unlikely(!pg_vec))
3577                         goto out;
3578                 switch (po->tp_version) {
3579                 case TPACKET_V3:
3580                 /* Transmit path is not supported. We checked
3581                  * it above but just being paranoid
3582                  */
3583                         if (!tx_ring)
3584                                 init_prb_bdqc(po, rb, pg_vec, req_u, tx_ring);
3585                         break;
3586                 default:
3587                         break;
3588                 }
3589         }
3590         /* Done */
3591         else {
3592                 err = -EINVAL;
3593                 if (unlikely(req->tp_frame_nr))
3594                         goto out;
3595         }
3596 
3597         lock_sock(sk);
3598 
3599         /* Detach socket from network */
3600         spin_lock(&po->bind_lock);
3601         was_running = po->running;
3602         num = po->num;
3603         if (was_running) {
3604                 po->num = 0;
3605                 __unregister_prot_hook(sk, false);
3606         }
3607         spin_unlock(&po->bind_lock);
3608 
3609         synchronize_net();
3610 
3611         err = -EBUSY;
3612         mutex_lock(&po->pg_vec_lock);
3613         if (closing || atomic_read(&po->mapped) == 0) {
3614                 err = 0;
3615                 spin_lock_bh(&rb_queue->lock);
3616                 swap(rb->pg_vec, pg_vec);
3617                 rb->frame_max = (req->tp_frame_nr - 1);
3618                 rb->head = 0;
3619                 rb->frame_size = req->tp_frame_size;
3620                 spin_unlock_bh(&rb_queue->lock);
3621 
3622                 swap(rb->pg_vec_order, order);
3623                 swap(rb->pg_vec_len, req->tp_block_nr);
3624 
3625                 rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
3626                 po->prot_hook.func = (po->rx_ring.pg_vec) ?
3627                                                 tpacket_rcv : packet_rcv;
3628                 skb_queue_purge(rb_queue);
3629                 if (atomic_read(&po->mapped))
3630                         pr_err("packet_mmap: vma is busy: %d\n",
3631                                atomic_read(&po->mapped));
3632         }
3633         mutex_unlock(&po->pg_vec_lock);
3634 
3635         spin_lock(&po->bind_lock);
3636         if (was_running) {
3637                 po->num = num;
3638                 register_prot_hook(sk);
3639         }
3640         spin_unlock(&po->bind_lock);
3641         if (closing && (po->tp_version > TPACKET_V2)) {
3642                 /* Because block-based TPACKET_V3 is not supported on the tx-ring */
3643                 if (!tx_ring)
3644                         prb_shutdown_retire_blk_timer(po, tx_ring, rb_queue);
3645         }
3646         release_sock(sk);
3647 
3648         if (pg_vec)
3649                 free_pg_vec(pg_vec, order, req->tp_block_nr);
3650 out:
3651         return err;
3652 }
3653 
3654 static int packet_mmap(struct file *file, struct socket *sock,
3655                 struct vm_area_struct *vma)
3656 {
3657         struct sock *sk = sock->sk;
3658         struct packet_sock *po = pkt_sk(sk);
3659         unsigned long size, expected_size;
3660         struct packet_ring_buffer *rb;
3661         unsigned long start;
3662         int err = -EINVAL;
3663         int i;
3664 
3665         if (vma->vm_pgoff)
3666                 return -EINVAL;
3667 
3668         mutex_lock(&po->pg_vec_lock);
3669 
3670         expected_size = 0;
3671         for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
3672                 if (rb->pg_vec) {
3673                         expected_size += rb->pg_vec_len
3674                                                 * rb->pg_vec_pages
3675                                                 * PAGE_SIZE;
3676                 }
3677         }
3678 
3679         if (expected_size == 0)
3680                 goto out;
3681 
3682         size = vma->vm_end - vma->vm_start;
3683         if (size != expected_size)
3684                 goto out;
3685 
3686         start = vma->vm_start;
3687         for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
3688                 if (rb->pg_vec == NULL)
3689                         continue;
3690 
3691                 for (i = 0; i < rb->pg_vec_len; i++) {
3692                         struct page *page;
3693                         void *kaddr = rb->pg_vec[i].buffer;
3694                         int pg_num;
3695 
3696                         for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
3697                                 page = pgv_to_page(kaddr);
3698                                 err = vm_insert_page(vma, start, page);
3699                                 if (unlikely(err))
3700                                         goto out;
3701                                 start += PAGE_SIZE;
3702                                 kaddr += PAGE_SIZE;
3703                         }
3704                 }
3705         }
3706 
3707         atomic_inc(&po->mapped);
3708         vma->vm_ops = &packet_mmap_ops;
3709         err = 0;
3710 
3711 out:
3712         mutex_unlock(&po->pg_vec_lock);
3713         return err;
3714 }
3715 
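/*
 * Usage sketch (userspace, not part of af_packet.c): the sequence the two
 * functions above normally serve -- PACKET_VERSION and PACKET_RX_RING via
 * packet_setsockopt()/packet_set_ring(), then an mmap() of the whole ring
 * handled by packet_mmap().  The TPACKET_V2 geometry below (16 one-page
 * blocks, two 2048-byte frames per block, assuming 4 KiB pages) is an
 * arbitrary illustrative choice.
 */
#include <sys/socket.h>
#include <sys/mman.h>
#include <linux/if_packet.h>

static void *map_rx_ring(int fd)
{
        int version = TPACKET_V2;
        struct tpacket_req req = {
                .tp_block_size = 4096,          /* multiple of PAGE_SIZE */
                .tp_block_nr   = 16,
                .tp_frame_size = 2048,          /* TPACKET_ALIGNMENT aligned */
                .tp_frame_nr   = 16 * (4096 / 2048),
        };
        size_t len = (size_t)req.tp_block_size * req.tp_block_nr;

        if (setsockopt(fd, SOL_PACKET, PACKET_VERSION,
                       &version, sizeof(version)) < 0)
                return MAP_FAILED;
        if (setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req)) < 0)
                return MAP_FAILED;

        /* The size must cover the full ring, as checked in packet_mmap(). */
        return mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
}
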
3716 static const struct proto_ops packet_ops_spkt = {
3717         .family =       PF_PACKET,
3718         .owner =        THIS_MODULE,
3719         .release =      packet_release,
3720         .bind =         packet_bind_spkt,
3721         .connect =      sock_no_connect,
3722         .socketpair =   sock_no_socketpair,
3723         .accept =       sock_no_accept,
3724         .getname =      packet_getname_spkt,
3725         .poll =         datagram_poll,
3726         .ioctl =        packet_ioctl,
3727         .listen =       sock_no_listen,
3728         .shutdown =     sock_no_shutdown,
3729         .setsockopt =   sock_no_setsockopt,
3730         .getsockopt =   sock_no_getsockopt,
3731         .sendmsg =      packet_sendmsg_spkt,
3732         .recvmsg =      packet_recvmsg,
3733         .mmap =         sock_no_mmap,
3734         .sendpage =     sock_no_sendpage,
3735 };
3736 
3737 static const struct proto_ops packet_ops = {
3738         .family =       PF_PACKET,
3739         .owner =        THIS_MODULE,
3740         .release =      packet_release,
3741         .bind =         packet_bind,
3742         .connect =      sock_no_connect,
3743         .socketpair =   sock_no_socketpair,
3744         .accept =       sock_no_accept,
3745         .getname =      packet_getname,
3746         .poll =         packet_poll,
3747         .ioctl =        packet_ioctl,
3748         .listen =       sock_no_listen,
3749         .shutdown =     sock_no_shutdown,
3750         .setsockopt =   packet_setsockopt,
3751         .getsockopt =   packet_getsockopt,
3752         .sendmsg =      packet_sendmsg,
3753         .recvmsg =      packet_recvmsg,
3754         .mmap =         packet_mmap,
3755         .sendpage =     sock_no_sendpage,
3756 };
3757 
3758 static const struct net_proto_family packet_family_ops = {
3759         .family =       PF_PACKET,
3760         .create =       packet_create,
3761         .owner  =       THIS_MODULE,
3762 };
3763 
3764 static struct notifier_block packet_netdev_notifier = {
3765         .notifier_call =        packet_notifier,
3766 };
3767 
3768 #ifdef CONFIG_PROC_FS
3769 
3770 static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
3771         __acquires(RCU)
3772 {
3773         struct net *net = seq_file_net(seq);
3774 
3775         rcu_read_lock();
3776         return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
3777 }
3778 
3779 static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3780 {
3781         struct net *net = seq_file_net(seq);
3782         return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
3783 }
3784 
3785 static void packet_seq_stop(struct seq_file *seq, void *v)
3786         __releases(RCU)
3787 {
3788         rcu_read_unlock();
3789 }
3790 
3791 static int packet_seq_show(struct seq_file *seq, void *v)
3792 {
3793         if (v == SEQ_START_TOKEN)
3794                 seq_puts(seq, "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
3795         else {
3796                 struct sock *s = sk_entry(v);
3797                 const struct packet_sock *po = pkt_sk(s);
3798 
3799                 seq_printf(seq,
3800                            "%pK %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
3801                            s,
3802                            atomic_read(&s->sk_refcnt),
3803                            s->sk_type,
3804                            ntohs(po->num),
3805                            po->ifindex,
3806                            po->running,
3807                            atomic_read(&s->sk_rmem_alloc),
3808                            from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),
3809                            sock_i_ino(s));
3810         }
3811 
3812         return 0;
3813 }
3814 
3815 static const struct seq_operations packet_seq_ops = {
3816         .start  = packet_seq_start,
3817         .next   = packet_seq_next,
3818         .stop   = packet_seq_stop,
3819         .show   = packet_seq_show,
3820 };
3821 
3822 static int packet_seq_open(struct inode *inode, struct file *file)
3823 {
3824         return seq_open_net(inode, file, &packet_seq_ops,
3825                             sizeof(struct seq_net_private));
3826 }
3827 
3828 static const struct file_operations packet_seq_fops = {
3829         .owner          = THIS_MODULE,
3830         .open           = packet_seq_open,
3831         .read           = seq_read,
3832         .llseek         = seq_lseek,
3833         .release        = seq_release_net,
3834 };
3835 
3836 #endif
3837 
3838 static int __net_init packet_net_init(struct net *net)
3839 {
3840         mutex_init(&net->packet.sklist_lock);
3841         INIT_HLIST_HEAD(&net->packet.sklist);
3842 
3843         if (!proc_create("packet", 0, net->proc_net, &packet_seq_fops))
3844                 return -ENOMEM;
3845 
3846         return 0;
3847 }
3848 
3849 static void __net_exit packet_net_exit(struct net *net)
3850 {
3851         remove_proc_entry("packet", net->proc_net);
3852 }
3853 
3854 static struct pernet_operations packet_net_ops = {
3855         .init = packet_net_init,
3856         .exit = packet_net_exit,
3857 };
3858 
3859 
3860 static void __exit packet_exit(void)
3861 {
3862         unregister_netdevice_notifier(&packet_netdev_notifier);
3863         unregister_pernet_subsys(&packet_net_ops);
3864         sock_unregister(PF_PACKET);
3865         proto_unregister(&packet_proto);
3866 }
3867 
3868 static int __init packet_init(void)
3869 {
3870         int rc = proto_register(&packet_proto, 0);
3871 
3872         if (rc != 0)
3873                 goto out;
3874 
3875         sock_register(&packet_family_ops);
3876         register_pernet_subsys(&packet_net_ops);
3877         register_netdevice_notifier(&packet_netdev_notifier);
3878 out:
3879         return rc;
3880 }
3881 
3882 module_init(packet_init);
3883 module_exit(packet_exit);
3884 MODULE_LICENSE("GPL");
3885 MODULE_ALIAS_NETPROTO(PF_PACKET);
3886 
