Linux/net/core/dev.c

  1 /*
  2  *      NET3    Protocol independent device support routines.
  3  *
  4  *              This program is free software; you can redistribute it and/or
  5  *              modify it under the terms of the GNU General Public License
  6  *              as published by the Free Software Foundation; either version
  7  *              2 of the License, or (at your option) any later version.
  8  *
  9  *      Derived from the non IP parts of dev.c 1.0.19
 10  *              Authors:        Ross Biro
 11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
 13  *
 14  *      Additional Authors:
 15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
 16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
 17  *              David Hinds <dahinds@users.sourceforge.net>
 18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
 19  *              Adam Sulmicki <adam@cfar.umd.edu>
 20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
 21  *
 22  *      Changes:
 23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
 24  *                                      to 2 if register_netdev gets called
 25  *                                      before net_dev_init & also removed a
 26  *                                      few lines of code in the process.
 27  *              Alan Cox        :       device private ioctl copies fields back.
 28  *              Alan Cox        :       Transmit queue code does relevant
 29  *                                      stunts to keep the queue safe.
 30  *              Alan Cox        :       Fixed double lock.
 31  *              Alan Cox        :       Fixed promisc NULL pointer trap
 32  *              ????????        :       Support the full private ioctl range
 33  *              Alan Cox        :       Moved ioctl permission check into
 34  *                                      drivers
 35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
 36  *              Alan Cox        :       100 backlog just doesn't cut it when
 37  *                                      you start doing multicast video 8)
 38  *              Alan Cox        :       Rewrote net_bh and list manager.
 39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
 40  *              Alan Cox        :       Took out transmit every packet pass
 41  *                                      Saved a few bytes in the ioctl handler
 42  *              Alan Cox        :       Network driver sets packet type before
 43  *                                      calling netif_rx. Saves a function
 44  *                                      call a packet.
 45  *              Alan Cox        :       Hashed net_bh()
 46  *              Richard Kooijman:       Timestamp fixes.
 47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
 48  *              Alan Cox        :       Device lock protection.
 49  *              Alan Cox        :       Fixed nasty side effect of device close
 50  *                                      changes.
 51  *              Rudi Cilibrasi  :       Pass the right thing to
 52  *                                      set_mac_address()
 53  *              Dave Miller     :       32bit quantity for the device lock to
 54  *                                      make it work out on a Sparc.
 55  *              Bjorn Ekwall    :       Added KERNELD hack.
 56  *              Alan Cox        :       Cleaned up the backlog initialise.
 57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
 58  *                                      1 device.
 59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
 60  *                                      is no device open function.
 61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
 62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
 63  *              Cyrus Durgin    :       Cleaned for KMOD
 64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
 65  *                                      A network device unload needs to purge
 66  *                                      the backlog queue.
 67  *      Paul Rusty Russell      :       SIOCSIFNAME
 68  *              Pekka Riikonen  :       Netdev boot-time settings code
 69  *              Andrew Morton   :       Make unregister_netdevice wait
 70  *                                      indefinitely on dev->refcnt
 71  *              J Hadi Salim    :       - Backlog queue sampling
 72  *                                      - netif_rx() feedback
 73  */
 74 
 75 #include <asm/uaccess.h>
 76 #include <linux/bitops.h>
 77 #include <linux/capability.h>
 78 #include <linux/cpu.h>
 79 #include <linux/types.h>
 80 #include <linux/kernel.h>
 81 #include <linux/hash.h>
 82 #include <linux/slab.h>
 83 #include <linux/sched.h>
 84 #include <linux/mutex.h>
 85 #include <linux/string.h>
 86 #include <linux/mm.h>
 87 #include <linux/socket.h>
 88 #include <linux/sockios.h>
 89 #include <linux/errno.h>
 90 #include <linux/interrupt.h>
 91 #include <linux/if_ether.h>
 92 #include <linux/netdevice.h>
 93 #include <linux/etherdevice.h>
 94 #include <linux/ethtool.h>
 95 #include <linux/notifier.h>
 96 #include <linux/skbuff.h>
 97 #include <net/net_namespace.h>
 98 #include <net/sock.h>
 99 #include <linux/rtnetlink.h>
100 #include <linux/stat.h>
101 #include <net/dst.h>
102 #include <net/pkt_sched.h>
103 #include <net/checksum.h>
104 #include <net/xfrm.h>
105 #include <linux/highmem.h>
106 #include <linux/init.h>
107 #include <linux/module.h>
108 #include <linux/netpoll.h>
109 #include <linux/rcupdate.h>
110 #include <linux/delay.h>
111 #include <net/iw_handler.h>
112 #include <asm/current.h>
113 #include <linux/audit.h>
114 #include <linux/dmaengine.h>
115 #include <linux/err.h>
116 #include <linux/ctype.h>
117 #include <linux/if_arp.h>
118 #include <linux/if_vlan.h>
119 #include <linux/ip.h>
120 #include <net/ip.h>
121 #include <linux/ipv6.h>
122 #include <linux/in.h>
123 #include <linux/jhash.h>
124 #include <linux/random.h>
125 #include <trace/events/napi.h>
126 #include <trace/events/net.h>
127 #include <trace/events/skb.h>
128 #include <linux/pci.h>
129 #include <linux/inetdevice.h>
130 #include <linux/cpu_rmap.h>
131 #include <linux/static_key.h>
132 
133 #include "net-sysfs.h"
134 
135 /* Instead of increasing this, you should create a hash table. */
136 #define MAX_GRO_SKBS 8
137 
138 /* This should be increased if a protocol with a bigger head is added. */
139 #define GRO_MAX_HEAD (MAX_HEADER + 128)
140 
141 static DEFINE_SPINLOCK(ptype_lock);
142 static DEFINE_SPINLOCK(offload_lock);
143 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
144 struct list_head ptype_all __read_mostly;       /* Taps */
145 static struct list_head offload_base __read_mostly;
146 
147 /*
148  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
149  * semaphore.
150  *
151  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
152  *
153  * Writers must hold the rtnl semaphore while they loop through the
154  * dev_base_head list, and hold dev_base_lock for writing when they do the
155  * actual updates.  This allows pure readers to access the list even
156  * while a writer is preparing to update it.
157  *
158  * To put it another way, dev_base_lock is held for writing only to
159  * protect against pure readers; the rtnl semaphore provides the
160  * protection against other writers.
161  *
 162  * For example usage, see register_netdevice() and
163  * unregister_netdevice(), which must be called with the rtnl
164  * semaphore held.
165  */
166 DEFINE_RWLOCK(dev_base_lock);
167 EXPORT_SYMBOL(dev_base_lock);
168 
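/*
 * Editor's illustration (not part of the original file): a minimal sketch of
 * how the locking scheme above is used from the caller side.  The "lo"
 * lookup is only an example.
 *
 *	struct net_device *dev;
 *
 *	// Pure reader: either dev_base_lock for reading ...
 *	read_lock(&dev_base_lock);
 *	dev = __dev_get_by_name(&init_net, "lo");
 *	read_unlock(&dev_base_lock);
 *
 *	// ... or an RCU read-side section.
 *	rcu_read_lock();
 *	dev = dev_get_by_name_rcu(&init_net, "lo");
 *	rcu_read_unlock();
 *
 *	// Writers hold the rtnl semaphore for the whole operation and take
 *	// dev_base_lock for writing only around the actual list update;
 *	// see list_netdevice()/unlist_netdevice() below.
 */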
169 seqcount_t devnet_rename_seq;
170 
171 static inline void dev_base_seq_inc(struct net *net)
172 {
173         while (++net->dev_base_seq == 0);
174 }
175 
176 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
177 {
178         unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
179 
180         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
181 }
182 
183 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
184 {
185         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
186 }
187 
188 static inline void rps_lock(struct softnet_data *sd)
189 {
190 #ifdef CONFIG_RPS
191         spin_lock(&sd->input_pkt_queue.lock);
192 #endif
193 }
194 
195 static inline void rps_unlock(struct softnet_data *sd)
196 {
197 #ifdef CONFIG_RPS
198         spin_unlock(&sd->input_pkt_queue.lock);
199 #endif
200 }
201 
202 /* Device list insertion */
203 static void list_netdevice(struct net_device *dev)
204 {
205         struct net *net = dev_net(dev);
206 
207         ASSERT_RTNL();
208 
209         write_lock_bh(&dev_base_lock);
210         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
211         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
212         hlist_add_head_rcu(&dev->index_hlist,
213                            dev_index_hash(net, dev->ifindex));
214         write_unlock_bh(&dev_base_lock);
215 
216         dev_base_seq_inc(net);
217 }
218 
219 /* Device list removal
 220  * caller must respect an RCU grace period before freeing/reusing dev
221  */
222 static void unlist_netdevice(struct net_device *dev)
223 {
224         ASSERT_RTNL();
225 
226         /* Unlink dev from the device chain */
227         write_lock_bh(&dev_base_lock);
228         list_del_rcu(&dev->dev_list);
229         hlist_del_rcu(&dev->name_hlist);
230         hlist_del_rcu(&dev->index_hlist);
231         write_unlock_bh(&dev_base_lock);
232 
233         dev_base_seq_inc(dev_net(dev));
234 }
235 
236 /*
237  *      Our notifier list
238  */
239 
240 static RAW_NOTIFIER_HEAD(netdev_chain);
241 
242 /*
243  *      Device drivers call our routines to queue packets here. We empty the
244  *      queue in the local softnet handler.
245  */
246 
247 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
248 EXPORT_PER_CPU_SYMBOL(softnet_data);
249 
250 #ifdef CONFIG_LOCKDEP
251 /*
252  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
253  * according to dev->type
254  */
255 static const unsigned short netdev_lock_type[] =
256         {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
257          ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
258          ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
259          ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
260          ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
261          ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
262          ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
263          ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
264          ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
265          ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
266          ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
267          ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
268          ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
269          ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
270          ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
271 
272 static const char *const netdev_lock_name[] =
273         {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
274          "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
275          "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
276          "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
277          "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
278          "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
279          "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
280          "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
281          "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
282          "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
283          "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
284          "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
285          "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
286          "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
287          "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
288 
289 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
290 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
291 
292 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
293 {
294         int i;
295 
296         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
297                 if (netdev_lock_type[i] == dev_type)
298                         return i;
299         /* the last key is used by default */
300         return ARRAY_SIZE(netdev_lock_type) - 1;
301 }
302 
303 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
304                                                  unsigned short dev_type)
305 {
306         int i;
307 
308         i = netdev_lock_pos(dev_type);
309         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
310                                    netdev_lock_name[i]);
311 }
312 
313 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
314 {
315         int i;
316 
317         i = netdev_lock_pos(dev->type);
318         lockdep_set_class_and_name(&dev->addr_list_lock,
319                                    &netdev_addr_lock_key[i],
320                                    netdev_lock_name[i]);
321 }
322 #else
323 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
324                                                  unsigned short dev_type)
325 {
326 }
327 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
328 {
329 }
330 #endif
331 
332 /*******************************************************************************
333 
334                 Protocol management and registration routines
335 
336 *******************************************************************************/
337 
338 /*
339  *      Add a protocol ID to the list. Now that the input handler is
340  *      smarter we can dispense with all the messy stuff that used to be
341  *      here.
342  *
 343  *      BEWARE!!! Protocol handlers that mangle input packets
 344  *      MUST BE last in the hash buckets, and the check of protocol handlers
 345  *      MUST start from the promiscuous ptype_all chain in net_bh.
 346  *      This holds today; do not change it.
 347  *      Explanation follows: if a protocol handler that mangles packets
 348  *      were first on the list, it could not tell that the packet is
 349  *      cloned and should be copied-on-write, so it would modify it and
 350  *      subsequent readers would get a broken packet.
351  *                                                      --ANK (980803)
352  */
353 
354 static inline struct list_head *ptype_head(const struct packet_type *pt)
355 {
356         if (pt->type == htons(ETH_P_ALL))
357                 return &ptype_all;
358         else
359                 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
360 }
361 
362 /**
363  *      dev_add_pack - add packet handler
364  *      @pt: packet type declaration
365  *
366  *      Add a protocol handler to the networking stack. The passed &packet_type
367  *      is linked into kernel lists and may not be freed until it has been
368  *      removed from the kernel lists.
369  *
 370  *      This call does not sleep, therefore it cannot guarantee that
 371  *      all CPUs that are in the middle of receiving packets will see
 372  *      the new packet type (until the next packet is received).
373  */
374 
375 void dev_add_pack(struct packet_type *pt)
376 {
377         struct list_head *head = ptype_head(pt);
378 
379         spin_lock(&ptype_lock);
380         list_add_rcu(&pt->list, head);
381         spin_unlock(&ptype_lock);
382 }
383 EXPORT_SYMBOL(dev_add_pack);
384 
385 /**
386  *      __dev_remove_pack        - remove packet handler
387  *      @pt: packet type declaration
388  *
389  *      Remove a protocol handler that was previously added to the kernel
390  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
391  *      from the kernel lists and can be freed or reused once this function
392  *      returns.
393  *
394  *      The packet type might still be in use by receivers
 395  *      and must not be freed until after all the CPUs have gone
396  *      through a quiescent state.
397  */
398 void __dev_remove_pack(struct packet_type *pt)
399 {
400         struct list_head *head = ptype_head(pt);
401         struct packet_type *pt1;
402 
403         spin_lock(&ptype_lock);
404 
405         list_for_each_entry(pt1, head, list) {
406                 if (pt == pt1) {
407                         list_del_rcu(&pt->list);
408                         goto out;
409                 }
410         }
411 
412         pr_warn("dev_remove_pack: %p not found\n", pt);
413 out:
414         spin_unlock(&ptype_lock);
415 }
416 EXPORT_SYMBOL(__dev_remove_pack);
417 
418 /**
419  *      dev_remove_pack  - remove packet handler
420  *      @pt: packet type declaration
421  *
422  *      Remove a protocol handler that was previously added to the kernel
423  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
424  *      from the kernel lists and can be freed or reused once this function
425  *      returns.
426  *
427  *      This call sleeps to guarantee that no CPU is looking at the packet
428  *      type after return.
429  */
430 void dev_remove_pack(struct packet_type *pt)
431 {
432         __dev_remove_pack(pt);
433 
434         synchronize_net();
435 }
436 EXPORT_SYMBOL(dev_remove_pack);
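 437 
/*
 * Editor's illustration (not part of the original file): a minimal ETH_P_ALL
 * tap registered with dev_add_pack() and torn down with dev_remove_pack().
 * The names are hypothetical.
 *
 *	static int my_tap_rcv(struct sk_buff *skb, struct net_device *dev,
 *			      struct packet_type *pt, struct net_device *orig_dev)
 *	{
 *		// The skb may be shared; clone it before any modification.
 *		kfree_skb(skb);
 *		return 0;
 *	}
 *
 *	static struct packet_type my_tap __read_mostly = {
 *		.type	= cpu_to_be16(ETH_P_ALL),
 *		.func	= my_tap_rcv,
 *	};
 *
 *	// module init:	dev_add_pack(&my_tap);
 *	// module exit:	dev_remove_pack(&my_tap);	// sleeps, see above
 */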
437 
438 
439 /**
440  *      dev_add_offload - register offload handlers
441  *      @po: protocol offload declaration
442  *
443  *      Add protocol offload handlers to the networking stack. The passed
444  *      &proto_offload is linked into kernel lists and may not be freed until
445  *      it has been removed from the kernel lists.
446  *
 447  *      This call does not sleep, therefore it cannot guarantee that
 448  *      all CPUs that are in the middle of receiving packets will see
 449  *      the new offload handlers (until the next packet is received).
450  */
451 void dev_add_offload(struct packet_offload *po)
452 {
453         struct list_head *head = &offload_base;
454 
455         spin_lock(&offload_lock);
456         list_add_rcu(&po->list, head);
457         spin_unlock(&offload_lock);
458 }
459 EXPORT_SYMBOL(dev_add_offload);
460 
461 /**
462  *      __dev_remove_offload     - remove offload handler
463  *      @po: packet offload declaration
464  *
465  *      Remove a protocol offload handler that was previously added to the
466  *      kernel offload handlers by dev_add_offload(). The passed &offload_type
467  *      is removed from the kernel lists and can be freed or reused once this
468  *      function returns.
469  *
470  *      The packet type might still be in use by receivers
 471  *      and must not be freed until after all the CPUs have gone
472  *      through a quiescent state.
473  */
474 void __dev_remove_offload(struct packet_offload *po)
475 {
476         struct list_head *head = &offload_base;
477         struct packet_offload *po1;
478 
479         spin_lock(&offload_lock);
480 
481         list_for_each_entry(po1, head, list) {
482                 if (po == po1) {
483                         list_del_rcu(&po->list);
484                         goto out;
485                 }
486         }
487 
488         pr_warn("dev_remove_offload: %p not found\n", po);
489 out:
490         spin_unlock(&offload_lock);
491 }
492 EXPORT_SYMBOL(__dev_remove_offload);
493 
494 /**
495  *      dev_remove_offload       - remove packet offload handler
496  *      @po: packet offload declaration
497  *
498  *      Remove a packet offload handler that was previously added to the kernel
499  *      offload handlers by dev_add_offload(). The passed &offload_type is
500  *      removed from the kernel lists and can be freed or reused once this
501  *      function returns.
502  *
503  *      This call sleeps to guarantee that no CPU is looking at the packet
504  *      type after return.
505  */
506 void dev_remove_offload(struct packet_offload *po)
507 {
508         __dev_remove_offload(po);
509 
510         synchronize_net();
511 }
512 EXPORT_SYMBOL(dev_remove_offload);
513 
514 /******************************************************************************
515 
516                       Device Boot-time Settings Routines
517 
518 *******************************************************************************/
519 
520 /* Boot time configuration table */
521 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
522 
523 /**
524  *      netdev_boot_setup_add   - add new setup entry
525  *      @name: name of the device
526  *      @map: configured settings for the device
527  *
528  *      Adds new setup entry to the dev_boot_setup list.  The function
 529  *      returns 0 on error and 1 on success.  This is a generic routine
 530  *      for all netdevices.
531  */
532 static int netdev_boot_setup_add(char *name, struct ifmap *map)
533 {
534         struct netdev_boot_setup *s;
535         int i;
536 
537         s = dev_boot_setup;
538         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
539                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
540                         memset(s[i].name, 0, sizeof(s[i].name));
541                         strlcpy(s[i].name, name, IFNAMSIZ);
542                         memcpy(&s[i].map, map, sizeof(s[i].map));
543                         break;
544                 }
545         }
546 
547         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
548 }
549 
550 /**
551  *      netdev_boot_setup_check - check boot time settings
552  *      @dev: the netdevice
553  *
554  *      Check boot time settings for the device.
555  *      The found settings are set for the device to be used
556  *      later in the device probing.
 557  *      Returns 0 if no settings are found, 1 if they are.
558  */
559 int netdev_boot_setup_check(struct net_device *dev)
560 {
561         struct netdev_boot_setup *s = dev_boot_setup;
562         int i;
563 
564         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
565                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
566                     !strcmp(dev->name, s[i].name)) {
567                         dev->irq        = s[i].map.irq;
568                         dev->base_addr  = s[i].map.base_addr;
569                         dev->mem_start  = s[i].map.mem_start;
570                         dev->mem_end    = s[i].map.mem_end;
571                         return 1;
572                 }
573         }
574         return 0;
575 }
576 EXPORT_SYMBOL(netdev_boot_setup_check);
577 
578 
579 /**
580  *      netdev_boot_base        - get address from boot time settings
581  *      @prefix: prefix for network device
582  *      @unit: id for network device
583  *
584  *      Check boot time settings for the base address of device.
585  *      The found settings are set for the device to be used
586  *      later in the device probing.
587  *      Returns 0 if no settings found.
588  */
589 unsigned long netdev_boot_base(const char *prefix, int unit)
590 {
591         const struct netdev_boot_setup *s = dev_boot_setup;
592         char name[IFNAMSIZ];
593         int i;
594 
595         sprintf(name, "%s%d", prefix, unit);
596 
597         /*
 598          * If the device is already registered then return a base of 1
599          * to indicate not to probe for this interface
600          */
601         if (__dev_get_by_name(&init_net, name))
602                 return 1;
603 
604         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
605                 if (!strcmp(name, s[i].name))
606                         return s[i].map.base_addr;
607         return 0;
608 }
609 
610 /*
 611  * Saves the settings configured at boot time for any netdevice.
612  */
613 int __init netdev_boot_setup(char *str)
614 {
615         int ints[5];
616         struct ifmap map;
617 
618         str = get_options(str, ARRAY_SIZE(ints), ints);
619         if (!str || !*str)
620                 return 0;
621 
622         /* Save settings */
623         memset(&map, 0, sizeof(map));
624         if (ints[0] > 0)
625                 map.irq = ints[1];
626         if (ints[0] > 1)
627                 map.base_addr = ints[2];
628         if (ints[0] > 2)
629                 map.mem_start = ints[3];
630         if (ints[0] > 3)
631                 map.mem_end = ints[4];
632 
633         /* Add new entry to the list */
634         return netdev_boot_setup_add(str, &map);
635 }
636 
637 __setup("netdev=", netdev_boot_setup);
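 638 
/*
 * Editor's illustration (not part of the original file): get_options() above
 * consumes up to four integers (irq, base_addr, mem_start, mem_end) and the
 * remainder of the string is taken as the device name, so a boot line might
 * look like:
 *
 *	netdev=9,0x300,0,0,eth1
 */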
638 
639 /*******************************************************************************
640 
641                             Device Interface Subroutines
642 
643 *******************************************************************************/
644 
645 /**
646  *      __dev_get_by_name       - find a device by its name
647  *      @net: the applicable net namespace
648  *      @name: name to find
649  *
650  *      Find an interface by name. Must be called under RTNL semaphore
651  *      or @dev_base_lock. If the name is found a pointer to the device
652  *      is returned. If the name is not found then %NULL is returned. The
653  *      reference counters are not incremented so the caller must be
654  *      careful with locks.
655  */
656 
657 struct net_device *__dev_get_by_name(struct net *net, const char *name)
658 {
659         struct net_device *dev;
660         struct hlist_head *head = dev_name_hash(net, name);
661 
662         hlist_for_each_entry(dev, head, name_hlist)
663                 if (!strncmp(dev->name, name, IFNAMSIZ))
664                         return dev;
665 
666         return NULL;
667 }
668 EXPORT_SYMBOL(__dev_get_by_name);
669 
670 /**
671  *      dev_get_by_name_rcu     - find a device by its name
672  *      @net: the applicable net namespace
673  *      @name: name to find
674  *
675  *      Find an interface by name.
676  *      If the name is found a pointer to the device is returned.
677  *      If the name is not found then %NULL is returned.
678  *      The reference counters are not incremented so the caller must be
679  *      careful with locks. The caller must hold RCU lock.
680  */
681 
682 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
683 {
684         struct net_device *dev;
685         struct hlist_head *head = dev_name_hash(net, name);
686 
687         hlist_for_each_entry_rcu(dev, head, name_hlist)
688                 if (!strncmp(dev->name, name, IFNAMSIZ))
689                         return dev;
690 
691         return NULL;
692 }
693 EXPORT_SYMBOL(dev_get_by_name_rcu);
694 
695 /**
696  *      dev_get_by_name         - find a device by its name
697  *      @net: the applicable net namespace
698  *      @name: name to find
699  *
700  *      Find an interface by name. This can be called from any
701  *      context and does its own locking. The returned handle has
702  *      the usage count incremented and the caller must use dev_put() to
703  *      release it when it is no longer needed. %NULL is returned if no
704  *      matching device is found.
705  */
706 
707 struct net_device *dev_get_by_name(struct net *net, const char *name)
708 {
709         struct net_device *dev;
710 
711         rcu_read_lock();
712         dev = dev_get_by_name_rcu(net, name);
713         if (dev)
714                 dev_hold(dev);
715         rcu_read_unlock();
716         return dev;
717 }
718 EXPORT_SYMBOL(dev_get_by_name);
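 719 
/*
 * Editor's illustration (not part of the original file): caller-side use of
 * the two lookup flavours.  The refcounted variant must be balanced with
 * dev_put(); the _rcu variant is only valid inside the RCU read-side section.
 *
 *	struct net_device *dev;
 *
 *	dev = dev_get_by_name(&init_net, "eth0");
 *	if (dev) {
 *		// ... use dev ...
 *		dev_put(dev);
 *	}
 *
 *	rcu_read_lock();
 *	dev = dev_get_by_name_rcu(&init_net, "eth0");
 *	// dev (if non-NULL) is only guaranteed to stay around until
 *	// rcu_read_unlock().
 *	rcu_read_unlock();
 */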
719 
720 /**
721  *      __dev_get_by_index - find a device by its ifindex
722  *      @net: the applicable net namespace
723  *      @ifindex: index of device
724  *
725  *      Search for an interface by index. Returns %NULL if the device
726  *      is not found or a pointer to the device. The device has not
727  *      had its reference counter increased so the caller must be careful
728  *      about locking. The caller must hold either the RTNL semaphore
729  *      or @dev_base_lock.
730  */
731 
732 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
733 {
734         struct net_device *dev;
735         struct hlist_head *head = dev_index_hash(net, ifindex);
736 
737         hlist_for_each_entry(dev, head, index_hlist)
738                 if (dev->ifindex == ifindex)
739                         return dev;
740 
741         return NULL;
742 }
743 EXPORT_SYMBOL(__dev_get_by_index);
744 
745 /**
746  *      dev_get_by_index_rcu - find a device by its ifindex
747  *      @net: the applicable net namespace
748  *      @ifindex: index of device
749  *
750  *      Search for an interface by index. Returns %NULL if the device
751  *      is not found or a pointer to the device. The device has not
752  *      had its reference counter increased so the caller must be careful
753  *      about locking. The caller must hold RCU lock.
754  */
755 
756 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
757 {
758         struct net_device *dev;
759         struct hlist_head *head = dev_index_hash(net, ifindex);
760 
761         hlist_for_each_entry_rcu(dev, head, index_hlist)
762                 if (dev->ifindex == ifindex)
763                         return dev;
764 
765         return NULL;
766 }
767 EXPORT_SYMBOL(dev_get_by_index_rcu);
768 
769 
770 /**
771  *      dev_get_by_index - find a device by its ifindex
772  *      @net: the applicable net namespace
773  *      @ifindex: index of device
774  *
775  *      Search for an interface by index. Returns NULL if the device
776  *      is not found or a pointer to the device. The device returned has
777  *      had a reference added and the pointer is safe until the user calls
778  *      dev_put to indicate they have finished with it.
779  */
780 
781 struct net_device *dev_get_by_index(struct net *net, int ifindex)
782 {
783         struct net_device *dev;
784 
785         rcu_read_lock();
786         dev = dev_get_by_index_rcu(net, ifindex);
787         if (dev)
788                 dev_hold(dev);
789         rcu_read_unlock();
790         return dev;
791 }
792 EXPORT_SYMBOL(dev_get_by_index);
793 
794 /**
795  *      netdev_get_name - get a netdevice name, knowing its ifindex.
796  *      @net: network namespace
797  *      @name: a pointer to the buffer where the name will be stored.
798  *      @ifindex: the ifindex of the interface to get the name from.
799  *
800  *      The use of raw_seqcount_begin() and cond_resched() before
801  *      retrying is required as we want to give the writers a chance
802  *      to complete when CONFIG_PREEMPT is not set.
803  */
804 int netdev_get_name(struct net *net, char *name, int ifindex)
805 {
806         struct net_device *dev;
807         unsigned int seq;
808 
809 retry:
810         seq = raw_seqcount_begin(&devnet_rename_seq);
811         rcu_read_lock();
812         dev = dev_get_by_index_rcu(net, ifindex);
813         if (!dev) {
814                 rcu_read_unlock();
815                 return -ENODEV;
816         }
817 
818         strcpy(name, dev->name);
819         rcu_read_unlock();
820         if (read_seqcount_retry(&devnet_rename_seq, seq)) {
821                 cond_resched();
822                 goto retry;
823         }
824 
825         return 0;
826 }
827 
828 /**
829  *      dev_getbyhwaddr_rcu - find a device by its hardware address
830  *      @net: the applicable net namespace
831  *      @type: media type of device
832  *      @ha: hardware address
833  *
834  *      Search for an interface by MAC address. Returns NULL if the device
835  *      is not found or a pointer to the device.
836  *      The caller must hold RCU or RTNL.
837  *      The returned device has not had its ref count increased
838  *      and the caller must therefore be careful about locking
839  *
840  */
841 
842 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
843                                        const char *ha)
844 {
845         struct net_device *dev;
846 
847         for_each_netdev_rcu(net, dev)
848                 if (dev->type == type &&
849                     !memcmp(dev->dev_addr, ha, dev->addr_len))
850                         return dev;
851 
852         return NULL;
853 }
854 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
855 
856 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
857 {
858         struct net_device *dev;
859 
860         ASSERT_RTNL();
861         for_each_netdev(net, dev)
862                 if (dev->type == type)
863                         return dev;
864 
865         return NULL;
866 }
867 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
868 
869 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
870 {
871         struct net_device *dev, *ret = NULL;
872 
873         rcu_read_lock();
874         for_each_netdev_rcu(net, dev)
875                 if (dev->type == type) {
876                         dev_hold(dev);
877                         ret = dev;
878                         break;
879                 }
880         rcu_read_unlock();
881         return ret;
882 }
883 EXPORT_SYMBOL(dev_getfirstbyhwtype);
884 
885 /**
886  *      dev_get_by_flags_rcu - find any device with given flags
887  *      @net: the applicable net namespace
888  *      @if_flags: IFF_* values
889  *      @mask: bitmask of bits in if_flags to check
890  *
891  *      Search for any interface with the given flags. Returns NULL if a device
892  *      is not found or a pointer to the device. Must be called inside
893  *      rcu_read_lock(), and result refcount is unchanged.
894  */
895 
896 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
897                                     unsigned short mask)
898 {
899         struct net_device *dev, *ret;
900 
901         ret = NULL;
902         for_each_netdev_rcu(net, dev) {
903                 if (((dev->flags ^ if_flags) & mask) == 0) {
904                         ret = dev;
905                         break;
906                 }
907         }
908         return ret;
909 }
910 EXPORT_SYMBOL(dev_get_by_flags_rcu);
911 
912 /**
913  *      dev_valid_name - check if name is okay for network device
914  *      @name: name string
915  *
916  *      Network device names need to be valid file names to
 917  *      allow sysfs to work.  We also disallow any kind of
918  *      whitespace.
919  */
920 bool dev_valid_name(const char *name)
921 {
922         if (*name == '\0')
923                 return false;
924         if (strlen(name) >= IFNAMSIZ)
925                 return false;
926         if (!strcmp(name, ".") || !strcmp(name, ".."))
927                 return false;
928 
929         while (*name) {
930                 if (*name == '/' || *name == ':' || isspace(*name))
931                         return false;
932                 name++;
933         }
934         return true;
935 }
936 EXPORT_SYMBOL(dev_valid_name);
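 937 
/*
 * Editor's note: with the checks above, names such as "eth0" or "bond0.100"
 * are accepted, while "", ".", "..", anything containing '/', ':' or
 * whitespace, and names of IFNAMSIZ (16) characters or more are rejected.
 */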
937 
938 /**
939  *      __dev_alloc_name - allocate a name for a device
940  *      @net: network namespace to allocate the device name in
941  *      @name: name format string
942  *      @buf:  scratch buffer and result name string
943  *
 944  *      Passed a format string - e.g. "lt%d" - it will try to find a suitable
 945  *      id. It scans the list of devices to build up a free map, then chooses
946  *      the first empty slot. The caller must hold the dev_base or rtnl lock
947  *      while allocating the name and adding the device in order to avoid
948  *      duplicates.
949  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
950  *      Returns the number of the unit assigned or a negative errno code.
951  */
952 
953 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
954 {
955         int i = 0;
956         const char *p;
957         const int max_netdevices = 8*PAGE_SIZE;
958         unsigned long *inuse;
959         struct net_device *d;
960 
961         p = strnchr(name, IFNAMSIZ-1, '%');
962         if (p) {
963                 /*
964                  * Verify the string as this thing may have come from
 965                  * the user.  There must be exactly one "%d" and no other "%"
966                  * characters.
967                  */
968                 if (p[1] != 'd' || strchr(p + 2, '%'))
969                         return -EINVAL;
970 
971                 /* Use one page as a bit array of possible slots */
972                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
973                 if (!inuse)
974                         return -ENOMEM;
975 
976                 for_each_netdev(net, d) {
977                         if (!sscanf(d->name, name, &i))
978                                 continue;
979                         if (i < 0 || i >= max_netdevices)
980                                 continue;
981 
982                         /*  avoid cases where sscanf is not exact inverse of printf */
983                         snprintf(buf, IFNAMSIZ, name, i);
984                         if (!strncmp(buf, d->name, IFNAMSIZ))
985                                 set_bit(i, inuse);
986                 }
987 
988                 i = find_first_zero_bit(inuse, max_netdevices);
989                 free_page((unsigned long) inuse);
990         }
991 
992         if (buf != name)
993                 snprintf(buf, IFNAMSIZ, name, i);
994         if (!__dev_get_by_name(net, buf))
995                 return i;
996 
997         /* It is possible to run out of possible slots
998          * when the name is long and there isn't enough space left
999          * for the digits, or if all bits are used.
1000          */
1001         return -ENFILE;
1002 }
1003 
1004 /**
1005  *      dev_alloc_name - allocate a name for a device
1006  *      @dev: device
1007  *      @name: name format string
1008  *
1009  *      Passed a format string - e.g. "lt%d" - it will try to find a suitable
1010  *      id. It scans the list of devices to build up a free map, then chooses
1011  *      the first empty slot. The caller must hold the dev_base or rtnl lock
1012  *      while allocating the name and adding the device in order to avoid
1013  *      duplicates.
1014  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1015  *      Returns the number of the unit assigned or a negative errno code.
1016  */
1017 
1018 int dev_alloc_name(struct net_device *dev, const char *name)
1019 {
1020         char buf[IFNAMSIZ];
1021         struct net *net;
1022         int ret;
1023 
1024         BUG_ON(!dev_net(dev));
1025         net = dev_net(dev);
1026         ret = __dev_alloc_name(net, name, buf);
1027         if (ret >= 0)
1028                 strlcpy(dev->name, buf, IFNAMSIZ);
1029         return ret;
1030 }
1031 EXPORT_SYMBOL(dev_alloc_name);
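1032 
/*
 * Editor's illustration (not part of the original file): with "veth0" and
 * "veth1" already registered in the namespace, the call below would set
 * dev->name to "veth2" and return 2.  The caller holds the rtnl (or dev_base)
 * lock, as noted above; dev and out_free are hypothetical.
 *
 *	err = dev_alloc_name(dev, "veth%d");
 *	if (err < 0)
 *		goto out_free;
 */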
1032 
1033 static int dev_alloc_name_ns(struct net *net,
1034                              struct net_device *dev,
1035                              const char *name)
1036 {
1037         char buf[IFNAMSIZ];
1038         int ret;
1039 
1040         ret = __dev_alloc_name(net, name, buf);
1041         if (ret >= 0)
1042                 strlcpy(dev->name, buf, IFNAMSIZ);
1043         return ret;
1044 }
1045 
1046 static int dev_get_valid_name(struct net *net,
1047                               struct net_device *dev,
1048                               const char *name)
1049 {
1050         BUG_ON(!net);
1051 
1052         if (!dev_valid_name(name))
1053                 return -EINVAL;
1054 
1055         if (strchr(name, '%'))
1056                 return dev_alloc_name_ns(net, dev, name);
1057         else if (__dev_get_by_name(net, name))
1058                 return -EEXIST;
1059         else if (dev->name != name)
1060                 strlcpy(dev->name, name, IFNAMSIZ);
1061 
1062         return 0;
1063 }
1064 
1065 /**
1066  *      dev_change_name - change name of a device
1067  *      @dev: device
1068  *      @newname: name (or format string) must be at least IFNAMSIZ
1069  *
1070  *      Change the name of a device. A format string such as "eth%d"
1071  *      can be passed for wildcarding.
1072  */
1073 int dev_change_name(struct net_device *dev, const char *newname)
1074 {
1075         char oldname[IFNAMSIZ];
1076         int err = 0;
1077         int ret;
1078         struct net *net;
1079 
1080         ASSERT_RTNL();
1081         BUG_ON(!dev_net(dev));
1082 
1083         net = dev_net(dev);
1084         if (dev->flags & IFF_UP)
1085                 return -EBUSY;
1086 
1087         write_seqcount_begin(&devnet_rename_seq);
1088 
1089         if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1090                 write_seqcount_end(&devnet_rename_seq);
1091                 return 0;
1092         }
1093 
1094         memcpy(oldname, dev->name, IFNAMSIZ);
1095 
1096         err = dev_get_valid_name(net, dev, newname);
1097         if (err < 0) {
1098                 write_seqcount_end(&devnet_rename_seq);
1099                 return err;
1100         }
1101 
1102 rollback:
1103         ret = device_rename(&dev->dev, dev->name);
1104         if (ret) {
1105                 memcpy(dev->name, oldname, IFNAMSIZ);
1106                 write_seqcount_end(&devnet_rename_seq);
1107                 return ret;
1108         }
1109 
1110         write_seqcount_end(&devnet_rename_seq);
1111 
1112         write_lock_bh(&dev_base_lock);
1113         hlist_del_rcu(&dev->name_hlist);
1114         write_unlock_bh(&dev_base_lock);
1115 
1116         synchronize_rcu();
1117 
1118         write_lock_bh(&dev_base_lock);
1119         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1120         write_unlock_bh(&dev_base_lock);
1121 
1122         ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1123         ret = notifier_to_errno(ret);
1124 
1125         if (ret) {
1126                 /* err >= 0 after dev_alloc_name() or stores the first errno */
1127                 if (err >= 0) {
1128                         err = ret;
1129                         write_seqcount_begin(&devnet_rename_seq);
1130                         memcpy(dev->name, oldname, IFNAMSIZ);
1131                         goto rollback;
1132                 } else {
1133                         pr_err("%s: name change rollback failed: %d\n",
1134                                dev->name, ret);
1135                 }
1136         }
1137 
1138         return err;
1139 }
1140 
1141 /**
1142  *      dev_set_alias - change ifalias of a device
1143  *      @dev: device
1144  *      @alias: name up to IFALIASZ
1145  *      @len: limit of bytes to copy from @alias
1146  *
1147  *      Set the ifalias for a device.
1148  */
1149 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1150 {
1151         char *new_ifalias;
1152 
1153         ASSERT_RTNL();
1154 
1155         if (len >= IFALIASZ)
1156                 return -EINVAL;
1157 
1158         if (!len) {
1159                 kfree(dev->ifalias);
1160                 dev->ifalias = NULL;
1161                 return 0;
1162         }
1163 
1164         new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1165         if (!new_ifalias)
1166                 return -ENOMEM;
1167         dev->ifalias = new_ifalias;
1168 
1169         strlcpy(dev->ifalias, alias, len+1);
1170         return len;
1171 }
1172 
1173 
1174 /**
1175  *      netdev_features_change - device changes features
1176  *      @dev: device to cause notification
1177  *
1178  *      Called to indicate a device has changed features.
1179  */
1180 void netdev_features_change(struct net_device *dev)
1181 {
1182         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1183 }
1184 EXPORT_SYMBOL(netdev_features_change);
1185 
1186 /**
1187  *      netdev_state_change - device changes state
1188  *      @dev: device to cause notification
1189  *
1190  *      Called to indicate a device has changed state. This function calls
1191  *      the notifier chains for netdev_chain and sends a NEWLINK message
1192  *      to the routing socket.
1193  */
1194 void netdev_state_change(struct net_device *dev)
1195 {
1196         if (dev->flags & IFF_UP) {
1197                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
1198                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1199         }
1200 }
1201 EXPORT_SYMBOL(netdev_state_change);
1202 
1203 /**
1204  *      netdev_notify_peers - notify network peers about existence of @dev
1205  *      @dev: network device
1206  *
1207  * Generate traffic such that interested network peers are aware of
1208  * @dev, such as by generating a gratuitous ARP. This may be used when
1209  * a device wants to inform the rest of the network about some sort of
1210  * reconfiguration such as a failover event or virtual machine
1211  * migration.
1212  */
1213 void netdev_notify_peers(struct net_device *dev)
1214 {
1215         rtnl_lock();
1216         call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1217         rtnl_unlock();
1218 }
1219 EXPORT_SYMBOL(netdev_notify_peers);
1220 
1221 static int __dev_open(struct net_device *dev)
1222 {
1223         const struct net_device_ops *ops = dev->netdev_ops;
1224         int ret;
1225 
1226         ASSERT_RTNL();
1227 
1228         if (!netif_device_present(dev))
1229                 return -ENODEV;
1230 
1231         /* Block netpoll from trying to do any rx path servicing.
1232          * If we don't do this there is a chance ndo_poll_controller
1233          * or ndo_poll may be running while we open the device
1234          */
1235         ret = netpoll_rx_disable(dev);
1236         if (ret)
1237                 return ret;
1238 
1239         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1240         ret = notifier_to_errno(ret);
1241         if (ret)
1242                 return ret;
1243 
1244         set_bit(__LINK_STATE_START, &dev->state);
1245 
1246         if (ops->ndo_validate_addr)
1247                 ret = ops->ndo_validate_addr(dev);
1248 
1249         if (!ret && ops->ndo_open)
1250                 ret = ops->ndo_open(dev);
1251 
1252         netpoll_rx_enable(dev);
1253 
1254         if (ret)
1255                 clear_bit(__LINK_STATE_START, &dev->state);
1256         else {
1257                 dev->flags |= IFF_UP;
1258                 net_dmaengine_get();
1259                 dev_set_rx_mode(dev);
1260                 dev_activate(dev);
1261                 add_device_randomness(dev->dev_addr, dev->addr_len);
1262         }
1263 
1264         return ret;
1265 }
1266 
1267 /**
1268  *      dev_open        - prepare an interface for use.
1269  *      @dev:   device to open
1270  *
1271  *      Takes a device from down to up state. The device's private open
1272  *      function is invoked and then the multicast lists are loaded. Finally
1273  *      the device is moved into the up state and a %NETDEV_UP message is
1274  *      sent to the netdev notifier chain.
1275  *
1276  *      Calling this function on an active interface is a nop. On a failure
1277  *      a negative errno code is returned.
1278  */
1279 int dev_open(struct net_device *dev)
1280 {
1281         int ret;
1282 
1283         if (dev->flags & IFF_UP)
1284                 return 0;
1285 
1286         ret = __dev_open(dev);
1287         if (ret < 0)
1288                 return ret;
1289 
1290         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1291         call_netdevice_notifiers(NETDEV_UP, dev);
1292 
1293         return ret;
1294 }
1295 EXPORT_SYMBOL(dev_open);
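1296 
/*
 * Editor's illustration (not part of the original file): dev_open() and
 * dev_close() are called with the rtnl semaphore held, e.g. by code that
 * brings a lower device up on behalf of an upper one:
 *
 *	rtnl_lock();
 *	err = dev_open(slave_dev);
 *	rtnl_unlock();
 */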
1296 
1297 static int __dev_close_many(struct list_head *head)
1298 {
1299         struct net_device *dev;
1300 
1301         ASSERT_RTNL();
1302         might_sleep();
1303 
1304         list_for_each_entry(dev, head, unreg_list) {
1305                 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1306 
1307                 clear_bit(__LINK_STATE_START, &dev->state);
1308 
1309                 /* Synchronize to scheduled poll. We cannot touch the poll list, it
1310                  * can even be on a different cpu. So just clear netif_running().
1311                  *
1312                  * dev->stop() will invoke napi_disable() on all of its
1313                  * napi_struct instances on this device.
1314                  */
1315                 smp_mb__after_clear_bit(); /* Commit netif_running(). */
1316         }
1317 
1318         dev_deactivate_many(head);
1319 
1320         list_for_each_entry(dev, head, unreg_list) {
1321                 const struct net_device_ops *ops = dev->netdev_ops;
1322 
1323                 /*
1324                  *      Call the device-specific close. This cannot fail.
1325                  *      It is only called if the device is UP.
1326                  *
1327                  *      We allow it to be called even after a DETACH hot-plug
1328                  *      event.
1329                  */
1330                 if (ops->ndo_stop)
1331                         ops->ndo_stop(dev);
1332 
1333                 dev->flags &= ~IFF_UP;
1334                 net_dmaengine_put();
1335         }
1336 
1337         return 0;
1338 }
1339 
1340 static int __dev_close(struct net_device *dev)
1341 {
1342         int retval;
1343         LIST_HEAD(single);
1344 
1345         /* Temporarily disable netpoll until the interface is down */
1346         retval = netpoll_rx_disable(dev);
1347         if (retval)
1348                 return retval;
1349 
1350         list_add(&dev->unreg_list, &single);
1351         retval = __dev_close_many(&single);
1352         list_del(&single);
1353 
1354         netpoll_rx_enable(dev);
1355         return retval;
1356 }
1357 
1358 static int dev_close_many(struct list_head *head)
1359 {
1360         struct net_device *dev, *tmp;
1361         LIST_HEAD(tmp_list);
1362 
1363         list_for_each_entry_safe(dev, tmp, head, unreg_list)
1364                 if (!(dev->flags & IFF_UP))
1365                         list_move(&dev->unreg_list, &tmp_list);
1366 
1367         __dev_close_many(head);
1368 
1369         list_for_each_entry(dev, head, unreg_list) {
1370                 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1371                 call_netdevice_notifiers(NETDEV_DOWN, dev);
1372         }
1373 
1374         /* rollback_registered_many needs the complete original list */
1375         list_splice(&tmp_list, head);
1376         return 0;
1377 }
1378 
1379 /**
1380  *      dev_close - shutdown an interface.
1381  *      @dev: device to shutdown
1382  *
1383  *      This function moves an active device into down state. A
1384  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1385  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1386  *      chain.
1387  */
1388 int dev_close(struct net_device *dev)
1389 {
1390         int ret = 0;
1391         if (dev->flags & IFF_UP) {
1392                 LIST_HEAD(single);
1393 
1394                 /* Block netpoll rx while the interface is going down */
1395                 ret = netpoll_rx_disable(dev);
1396                 if (ret)
1397                         return ret;
1398 
1399                 list_add(&dev->unreg_list, &single);
1400                 dev_close_many(&single);
1401                 list_del(&single);
1402 
1403                 netpoll_rx_enable(dev);
1404         }
1405         return ret;
1406 }
1407 EXPORT_SYMBOL(dev_close);
1408 
1409 
1410 /**
1411  *      dev_disable_lro - disable Large Receive Offload on a device
1412  *      @dev: device
1413  *
1414  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1415  *      called under RTNL.  This is needed if received packets may be
1416  *      forwarded to another interface.
1417  */
1418 void dev_disable_lro(struct net_device *dev)
1419 {
1420         /*
1421          * If we're trying to disable lro on a vlan device
1422          * use the underlying physical device instead
1423          */
1424         if (is_vlan_dev(dev))
1425                 dev = vlan_dev_real_dev(dev);
1426 
1427         dev->wanted_features &= ~NETIF_F_LRO;
1428         netdev_update_features(dev);
1429 
1430         if (unlikely(dev->features & NETIF_F_LRO))
1431                 netdev_WARN(dev, "failed to disable LRO!\n");
1432 }
1433 EXPORT_SYMBOL(dev_disable_lro);
1434 
1435 
1436 static int dev_boot_phase = 1;
1437 
1438 /**
1439  *      register_netdevice_notifier - register a network notifier block
1440  *      @nb: notifier
1441  *
1442  *      Register a notifier to be called when network device events occur.
1443  *      The notifier passed is linked into the kernel structures and must
1444  *      not be reused until it has been unregistered. A negative errno code
1445  *      is returned on a failure.
1446  *
1447  *      When registered, all registration and up events are replayed
1448  *      to the new notifier to allow it to have a race-free
1449  *      view of the network device list.
1450  */
1451 
1452 int register_netdevice_notifier(struct notifier_block *nb)
1453 {
1454         struct net_device *dev;
1455         struct net_device *last;
1456         struct net *net;
1457         int err;
1458 
1459         rtnl_lock();
1460         err = raw_notifier_chain_register(&netdev_chain, nb);
1461         if (err)
1462                 goto unlock;
1463         if (dev_boot_phase)
1464                 goto unlock;
1465         for_each_net(net) {
1466                 for_each_netdev(net, dev) {
1467                         err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1468                         err = notifier_to_errno(err);
1469                         if (err)
1470                                 goto rollback;
1471 
1472                         if (!(dev->flags & IFF_UP))
1473                                 continue;
1474 
1475                         nb->notifier_call(nb, NETDEV_UP, dev);
1476                 }
1477         }
1478 
1479 unlock:
1480         rtnl_unlock();
1481         return err;
1482 
1483 rollback:
1484         last = dev;
1485         for_each_net(net) {
1486                 for_each_netdev(net, dev) {
1487                         if (dev == last)
1488                                 goto outroll;
1489 
1490                         if (dev->flags & IFF_UP) {
1491                                 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1492                                 nb->notifier_call(nb, NETDEV_DOWN, dev);
1493                         }
1494                         nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1495                 }
1496         }
1497 
1498 outroll:
1499         raw_notifier_chain_unregister(&netdev_chain, nb);
1500         goto unlock;
1501 }
1502 EXPORT_SYMBOL(register_netdevice_notifier);
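1503 
/*
 * Editor's illustration (not part of the original file): a minimal notifier
 * block.  In this kernel the net_device pointer is passed directly as the
 * last argument (see call_netdevice_notifiers() below).  Names are
 * hypothetical.
 *
 *	static int my_netdev_event(struct notifier_block *nb,
 *				   unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = ptr;
 *
 *		switch (event) {
 *		case NETDEV_UP:
 *			pr_info("%s is up\n", dev->name);
 *			break;
 *		case NETDEV_UNREGISTER:
 *			// drop any reference held on dev here
 *			break;
 *		}
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block my_netdev_nb = {
 *		.notifier_call = my_netdev_event,
 *	};
 *
 *	// register_netdevice_notifier(&my_netdev_nb);
 *	// ... later: unregister_netdevice_notifier(&my_netdev_nb);
 */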
1503 
1504 /**
1505  *      unregister_netdevice_notifier - unregister a network notifier block
1506  *      @nb: notifier
1507  *
1508  *      Unregister a notifier previously registered by
1509  *      register_netdevice_notifier(). The notifier is unlinked from the
1510  *      kernel structures and may then be reused. A negative errno code
1511  *      is returned on a failure.
1512  *
1513  *      After unregistering, unregister and down device events are synthesized
1514  *      for all devices on the device list to the removed notifier, removing
1515  *      the need for special-case cleanup code.
1516  */
1517 
1518 int unregister_netdevice_notifier(struct notifier_block *nb)
1519 {
1520         struct net_device *dev;
1521         struct net *net;
1522         int err;
1523 
1524         rtnl_lock();
1525         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1526         if (err)
1527                 goto unlock;
1528 
1529         for_each_net(net) {
1530                 for_each_netdev(net, dev) {
1531                         if (dev->flags & IFF_UP) {
1532                                 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1533                                 nb->notifier_call(nb, NETDEV_DOWN, dev);
1534                         }
1535                         nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1536                 }
1537         }
1538 unlock:
1539         rtnl_unlock();
1540         return err;
1541 }
1542 EXPORT_SYMBOL(unregister_netdevice_notifier);
1543 
1544 /**
1545  *      call_netdevice_notifiers - call all network notifier blocks
1546  *      @val: value passed unmodified to notifier function
1547  *      @dev: net_device pointer passed unmodified to notifier function
1548  *
1549  *      Call all network notifier blocks.  Parameters and return value
1550  *      are as for raw_notifier_call_chain().
1551  */
1552 
1553 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1554 {
1555         ASSERT_RTNL();
1556         return raw_notifier_call_chain(&netdev_chain, val, dev);
1557 }
1558 EXPORT_SYMBOL(call_netdevice_notifiers);
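
/*
 * Editor's note: illustrative sketch, not part of dev.c.  Core code raises
 * events through this helper while holding RTNL; for example, a hypothetical
 * MTU-change path would do:
 */
static void example_change_mtu(struct net_device *dev, int new_mtu)
{
	ASSERT_RTNL();
	dev->mtu = new_mtu;
	call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
}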
1559 
1560 static struct static_key netstamp_needed __read_mostly;
1561 #ifdef HAVE_JUMP_LABEL
1562 static atomic_t netstamp_needed_deferred;
1563 static atomic_t netstamp_wanted;
1564 static void netstamp_clear(struct work_struct *work)
1565 {
1566         int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1567         int wanted;
1568 
1569         wanted = atomic_add_return(deferred, &netstamp_wanted);
1570         if (wanted > 0)
1571                 static_key_enable(&netstamp_needed);
1572         else
1573                 static_key_disable(&netstamp_needed);
1574 }
1575 static DECLARE_WORK(netstamp_work, netstamp_clear);
1576 #endif
1577 
1578 void net_enable_timestamp(void)
1579 {
1580 #ifdef HAVE_JUMP_LABEL
1581         int wanted;
1582 
1583         while (1) {
1584                 wanted = atomic_read(&netstamp_wanted);
1585                 if (wanted <= 0)
1586                         break;
1587                 if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted + 1) == wanted)
1588                         return;
1589         }
1590         atomic_inc(&netstamp_needed_deferred);
1591         schedule_work(&netstamp_work);
1592 #else
1593         static_key_slow_inc(&netstamp_needed);
1594 #endif
1595 }
1596 EXPORT_SYMBOL(net_enable_timestamp);
1597 
1598 void net_disable_timestamp(void)
1599 {
1600 #ifdef HAVE_JUMP_LABEL
1601         int wanted;
1602 
1603         while (1) {
1604                 wanted = atomic_read(&netstamp_wanted);
1605                 if (wanted <= 1)
1606                         break;
1607                 if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted - 1) == wanted)
1608                         return;
1609         }
1610         atomic_dec(&netstamp_needed_deferred);
1611         schedule_work(&netstamp_work);
1612 #else
1613         static_key_slow_dec(&netstamp_needed);
1614 #endif
1615 }
1616 EXPORT_SYMBOL(net_disable_timestamp);
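
/*
 * Editor's note: illustrative sketch, not part of dev.c.  Users of RX
 * timestamps hold the static key for as long as they need it and must pair
 * every enable with a disable; hypothetical open/close hooks:
 */
static void example_timestamping_open(void)
{
	net_enable_timestamp();
}

static void example_timestamping_close(void)
{
	net_disable_timestamp();
}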
1617 
1618 static inline void net_timestamp_set(struct sk_buff *skb)
1619 {
1620         skb->tstamp.tv64 = 0;
1621         if (static_key_false(&netstamp_needed))
1622                 __net_timestamp(skb);
1623 }
1624 
1625 #define net_timestamp_check(COND, SKB)                  \
1626         if (static_key_false(&netstamp_needed)) {               \
1627                 if ((COND) && !(SKB)->tstamp.tv64)      \
1628                         __net_timestamp(SKB);           \
1629         }                                               \
1630 
1631 static inline bool is_skb_forwardable(struct net_device *dev,
1632                                       struct sk_buff *skb)
1633 {
1634         unsigned int len;
1635 
1636         if (!(dev->flags & IFF_UP))
1637                 return false;
1638 
1639         len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1640         if (skb->len <= len)
1641                 return true;
1642 
1643         /* if TSO is enabled, we don't care about the length as the packet
1644          * could be forwarded without being segmented before
1645          */
1646         if (skb_is_gso(skb))
1647                 return true;
1648 
1649         return false;
1650 }
1651 
1652 /**
1653  * dev_forward_skb - loopback an skb to another netif
1654  *
1655  * @dev: destination network device
1656  * @skb: buffer to forward
1657  *
1658  * return values:
1659  *      NET_RX_SUCCESS  (no congestion)
1660  *      NET_RX_DROP     (packet was dropped, but freed)
1661  *
1662  * dev_forward_skb can be used for injecting an skb from the
1663  * start_xmit function of one device into the receive queue
1664  * of another device.
1665  *
1666  * The receiving device may be in another namespace, so
1667  * we have to clear all information in the skb that could
1668  * impact namespace isolation.
1669  */
1670 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1671 {
1672         if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1673                 if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1674                         atomic_long_inc(&dev->rx_dropped);
1675                         kfree_skb(skb);
1676                         return NET_RX_DROP;
1677                 }
1678         }
1679 
1680         skb_orphan(skb);
1681 
1682         if (unlikely(!is_skb_forwardable(dev, skb))) {
1683                 atomic_long_inc(&dev->rx_dropped);
1684                 kfree_skb(skb);
1685                 return NET_RX_DROP;
1686         }
1687         skb->skb_iif = 0;
1688         skb->dev = dev;
1689         skb_dst_drop(skb);
1690         skb->tstamp.tv64 = 0;
1691         skb->pkt_type = PACKET_HOST;
1692         skb->protocol = eth_type_trans(skb, dev);
1693         skb->mark = 0;
1694         secpath_reset(skb);
1695         nf_reset(skb);
1696         nf_reset_trace(skb);
1697         return netif_rx(skb);
1698 }
1699 EXPORT_SYMBOL_GPL(dev_forward_skb);
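
/*
 * Editor's note: illustrative sketch, not part of dev.c.  A veth-style
 * paired device can hand frames to its peer from ndo_start_xmit();
 * example_get_peer() is a hypothetical lookup of the other end:
 */
static struct net_device *example_get_peer(struct net_device *dev);

static netdev_tx_t example_pair_xmit(struct sk_buff *skb,
				     struct net_device *dev)
{
	struct net_device *peer = example_get_peer(dev);
	unsigned int len = skb->len;	/* skb may be freed below */

	if (dev_forward_skb(peer, skb) == NET_RX_SUCCESS) {
		dev->stats.tx_packets++;
		dev->stats.tx_bytes += len;
	} else {
		dev->stats.tx_dropped++;
	}
	return NETDEV_TX_OK;
}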
1700 
1701 static inline int deliver_skb(struct sk_buff *skb,
1702                               struct packet_type *pt_prev,
1703                               struct net_device *orig_dev)
1704 {
1705         if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1706                 return -ENOMEM;
1707         atomic_inc(&skb->users);
1708         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1709 }
1710 
1711 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1712 {
1713         if (!ptype->af_packet_priv || !skb->sk)
1714                 return false;
1715 
1716         if (ptype->id_match)
1717                 return ptype->id_match(ptype, skb->sk);
1718         else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1719                 return true;
1720 
1721         return false;
1722 }
1723 
1724 /*
1725  *      Support routine. Sends outgoing frames to any network
1726  *      taps currently in use.
1727  */
1728 
1729 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1730 {
1731         struct packet_type *ptype;
1732         struct sk_buff *skb2 = NULL;
1733         struct packet_type *pt_prev = NULL;
1734 
1735         rcu_read_lock();
1736         list_for_each_entry_rcu(ptype, &ptype_all, list) {
1737                 /* Never send packets back to the socket
1738                  * they originated from - MvS (miquels@drinkel.ow.org)
1739                  */
1740                 if ((ptype->dev == dev || !ptype->dev) &&
1741                     (!skb_loop_sk(ptype, skb))) {
1742                         if (pt_prev) {
1743                                 deliver_skb(skb2, pt_prev, skb->dev);
1744                                 pt_prev = ptype;
1745                                 continue;
1746                         }
1747 
1748                         skb2 = skb_clone(skb, GFP_ATOMIC);
1749                         if (!skb2)
1750                                 break;
1751 
1752                         net_timestamp_set(skb2);
1753 
1754                         /* The network header should already be correctly
1755                            set by the sender, so the check below is just
1756                            protection against buggy protocols.
1757                          */
1758                         skb_reset_mac_header(skb2);
1759 
1760                         if (skb_network_header(skb2) < skb2->data ||
1761                             skb2->network_header > skb2->tail) {
1762                                 net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1763                                                      ntohs(skb2->protocol),
1764                                                      dev->name);
1765                                 skb_reset_network_header(skb2);
1766                         }
1767 
1768                         skb2->transport_header = skb2->network_header;
1769                         skb2->pkt_type = PACKET_OUTGOING;
1770                         pt_prev = ptype;
1771                 }
1772         }
1773         if (pt_prev)
1774                 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1775         rcu_read_unlock();
1776 }
1777 
1778 /**
1779  * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1780  * @dev: Network device
1781  * @txq: number of queues available
1782  *
1783  * If real_num_tx_queues is changed the tc mappings may no longer be
1784  * valid. To resolve this, verify that each tc mapping remains valid and,
1785  * if it is not, reset that priority's mapping to TC0.  Once no priorities
1786  * map to an offset/count pair it will no longer be used.  In the worst
1787  * case, if TC0 itself is invalid, nothing can be done, so all priority
1788  * mappings are disabled.  It is expected that drivers will fix this
1789  * mapping if they can before calling netif_set_real_num_tx_queues.
1790  */
1791 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1792 {
1793         int i;
1794         struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1795 
1796         /* If TC0 is invalidated disable TC mapping */
1797         if (tc->offset + tc->count > txq) {
1798                 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1799                 dev->num_tc = 0;
1800                 return;
1801         }
1802 
1803         /* Invalidated prio to tc mappings set to TC0 */
1804         for (i = 1; i < TC_BITMASK + 1; i++) {
1805                 int q = netdev_get_prio_tc_map(dev, i);
1806 
1807                 tc = &dev->tc_to_txq[q];
1808                 if (tc->offset + tc->count > txq) {
1809                         pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1810                                 i, q);
1811                         netdev_set_prio_tc_map(dev, i, 0);
1812                 }
1813         }
1814 }
1815 
1816 #ifdef CONFIG_XPS
1817 static DEFINE_MUTEX(xps_map_mutex);
1818 #define xmap_dereference(P)             \
1819         rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1820 
1821 static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1822                                         int cpu, u16 index)
1823 {
1824         struct xps_map *map = NULL;
1825         int pos;
1826 
1827         if (dev_maps)
1828                 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1829 
1830         for (pos = 0; map && pos < map->len; pos++) {
1831                 if (map->queues[pos] == index) {
1832                         if (map->len > 1) {
1833                                 map->queues[pos] = map->queues[--map->len];
1834                         } else {
1835                                 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1836                                 kfree_rcu(map, rcu);
1837                                 map = NULL;
1838                         }
1839                         break;
1840                 }
1841         }
1842 
1843         return map;
1844 }
1845 
1846 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1847 {
1848         struct xps_dev_maps *dev_maps;
1849         int cpu, i;
1850         bool active = false;
1851 
1852         mutex_lock(&xps_map_mutex);
1853         dev_maps = xmap_dereference(dev->xps_maps);
1854 
1855         if (!dev_maps)
1856                 goto out_no_maps;
1857 
1858         for_each_possible_cpu(cpu) {
1859                 for (i = index; i < dev->num_tx_queues; i++) {
1860                         if (!remove_xps_queue(dev_maps, cpu, i))
1861                                 break;
1862                 }
1863                 if (i == dev->num_tx_queues)
1864                         active = true;
1865         }
1866 
1867         if (!active) {
1868                 RCU_INIT_POINTER(dev->xps_maps, NULL);
1869                 kfree_rcu(dev_maps, rcu);
1870         }
1871 
1872         for (i = index; i < dev->num_tx_queues; i++)
1873                 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1874                                              NUMA_NO_NODE);
1875 
1876 out_no_maps:
1877         mutex_unlock(&xps_map_mutex);
1878 }
1879 
1880 static struct xps_map *expand_xps_map(struct xps_map *map,
1881                                       int cpu, u16 index)
1882 {
1883         struct xps_map *new_map;
1884         int alloc_len = XPS_MIN_MAP_ALLOC;
1885         int i, pos;
1886 
1887         for (pos = 0; map && pos < map->len; pos++) {
1888                 if (map->queues[pos] != index)
1889                         continue;
1890                 return map;
1891         }
1892 
1893         /* Need to add queue to this CPU's existing map */
1894         if (map) {
1895                 if (pos < map->alloc_len)
1896                         return map;
1897 
1898                 alloc_len = map->alloc_len * 2;
1899         }
1900 
1901         /* Need to allocate new map to store queue on this CPU's map */
1902         new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
1903                                cpu_to_node(cpu));
1904         if (!new_map)
1905                 return NULL;
1906 
1907         for (i = 0; i < pos; i++)
1908                 new_map->queues[i] = map->queues[i];
1909         new_map->alloc_len = alloc_len;
1910         new_map->len = pos;
1911 
1912         return new_map;
1913 }
1914 
1915 int netif_set_xps_queue(struct net_device *dev, struct cpumask *mask, u16 index)
1916 {
1917         struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
1918         struct xps_map *map, *new_map;
1919         int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
1920         int cpu, numa_node_id = -2;
1921         bool active = false;
1922 
1923         mutex_lock(&xps_map_mutex);
1924 
1925         dev_maps = xmap_dereference(dev->xps_maps);
1926 
1927         /* allocate memory for queue storage */
1928         for_each_online_cpu(cpu) {
1929                 if (!cpumask_test_cpu(cpu, mask))
1930                         continue;
1931 
1932                 if (!new_dev_maps)
1933                         new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
1934                 if (!new_dev_maps) {
1935                         mutex_unlock(&xps_map_mutex);
1936                         return -ENOMEM;
1937                 }
1938 
1939                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
1940                                  NULL;
1941 
1942                 map = expand_xps_map(map, cpu, index);
1943                 if (!map)
1944                         goto error;
1945 
1946                 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1947         }
1948 
1949         if (!new_dev_maps)
1950                 goto out_no_new_maps;
1951 
1952         for_each_possible_cpu(cpu) {
1953                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
1954                         /* add queue to CPU maps */
1955                         int pos = 0;
1956 
1957                         map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1958                         while ((pos < map->len) && (map->queues[pos] != index))
1959                                 pos++;
1960 
1961                         if (pos == map->len)
1962                                 map->queues[map->len++] = index;
1963 #ifdef CONFIG_NUMA
1964                         if (numa_node_id == -2)
1965                                 numa_node_id = cpu_to_node(cpu);
1966                         else if (numa_node_id != cpu_to_node(cpu))
1967                                 numa_node_id = -1;
1968 #endif
1969                 } else if (dev_maps) {
1970                         /* fill in the new device map from the old device map */
1971                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
1972                         RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1973                 }
1974 
1975         }
1976 
1977         rcu_assign_pointer(dev->xps_maps, new_dev_maps);
1978 
1979         /* Cleanup old maps */
1980         if (dev_maps) {
1981                 for_each_possible_cpu(cpu) {
1982                         new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1983                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
1984                         if (map && map != new_map)
1985                                 kfree_rcu(map, rcu);
1986                 }
1987 
1988                 kfree_rcu(dev_maps, rcu);
1989         }
1990 
1991         dev_maps = new_dev_maps;
1992         active = true;
1993 
1994 out_no_new_maps:
1995         /* update Tx queue numa node */
1996         netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
1997                                      (numa_node_id >= 0) ? numa_node_id :
1998                                      NUMA_NO_NODE);
1999 
2000         if (!dev_maps)
2001                 goto out_no_maps;
2002 
2003         /* removes queue from unused CPUs */
2004         for_each_possible_cpu(cpu) {
2005                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
2006                         continue;
2007 
2008                 if (remove_xps_queue(dev_maps, cpu, index))
2009                         active = true;
2010         }
2011 
2012         /* free map if not active */
2013         if (!active) {
2014                 RCU_INIT_POINTER(dev->xps_maps, NULL);
2015                 kfree_rcu(dev_maps, rcu);
2016         }
2017 
2018 out_no_maps:
2019         mutex_unlock(&xps_map_mutex);
2020 
2021         return 0;
2022 error:
2023         /* remove any maps that we added */
2024         for_each_possible_cpu(cpu) {
2025                 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2026                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2027                                  NULL;
2028                 if (new_map && new_map != map)
2029                         kfree(new_map);
2030         }
2031 
2032         mutex_unlock(&xps_map_mutex);
2033 
2034         kfree(new_dev_maps);
2035         return -ENOMEM;
2036 }
2037 EXPORT_SYMBOL(netif_set_xps_queue);
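
/*
 * Editor's note: illustrative sketch, not part of dev.c.  A multiqueue
 * driver can pin each TX queue to a single CPU once its queues exist;
 * this simply assigns queues to online CPUs round-robin:
 */
static void example_setup_xps(struct net_device *dev)
{
	unsigned int i;

	for (i = 0; i < dev->real_num_tx_queues; i++) {
		cpumask_t cpus;

		cpumask_clear(&cpus);
		cpumask_set_cpu(i % num_online_cpus(), &cpus);
		netif_set_xps_queue(dev, &cpus, i);
	}
}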
2038 
2039 #endif
2040 /*
2041  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2042  * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
2043  */
2044 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2045 {
2046         int rc;
2047 
2048         if (txq < 1 || txq > dev->num_tx_queues)
2049                 return -EINVAL;
2050 
2051         if (dev->reg_state == NETREG_REGISTERED ||
2052             dev->reg_state == NETREG_UNREGISTERING) {
2053                 ASSERT_RTNL();
2054 
2055                 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2056                                                   txq);
2057                 if (rc)
2058                         return rc;
2059 
2060                 if (dev->num_tc)
2061                         netif_setup_tc(dev, txq);
2062 
2063                 if (txq < dev->real_num_tx_queues) {
2064                         qdisc_reset_all_tx_gt(dev, txq);
2065 #ifdef CONFIG_XPS
2066                         netif_reset_xps_queues_gt(dev, txq);
2067 #endif
2068                 }
2069         }
2070 
2071         dev->real_num_tx_queues = txq;
2072         return 0;
2073 }
2074 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2075 
2076 #ifdef CONFIG_RPS
2077 /**
2078  *      netif_set_real_num_rx_queues - set actual number of RX queues used
2079  *      @dev: Network device
2080  *      @rxq: Actual number of RX queues
2081  *
2082  *      This must be called either with the rtnl_lock held or before
2083  *      registration of the net device.  Returns 0 on success, or a
2084  *      negative error code.  If called before registration, it always
2085  *      succeeds.
2086  */
2087 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2088 {
2089         int rc;
2090 
2091         if (rxq < 1 || rxq > dev->num_rx_queues)
2092                 return -EINVAL;
2093 
2094         if (dev->reg_state == NETREG_REGISTERED) {
2095                 ASSERT_RTNL();
2096 
2097                 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2098                                                   rxq);
2099                 if (rc)
2100                         return rc;
2101         }
2102 
2103         dev->real_num_rx_queues = rxq;
2104         return 0;
2105 }
2106 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
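
/*
 * Editor's note: illustrative sketch, not part of dev.c.  A driver that
 * allocates its maximum number of queues up front but only brings up
 * "nchan" channels trims both directions with the two helpers above,
 * under RTNL once the device is registered:
 */
static int example_set_channels(struct net_device *dev, unsigned int nchan)
{
	int err;

	ASSERT_RTNL();

	err = netif_set_real_num_tx_queues(dev, nchan);
	if (err)
		return err;

	return netif_set_real_num_rx_queues(dev, nchan);
}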
2107 #endif
2108 
2109 /**
2110  * netif_get_num_default_rss_queues - default number of RSS queues
2111  *
2112  * This routine should set an upper limit on the number of RSS queues
2113  * used by default by multiqueue devices.
2114  */
2115 int netif_get_num_default_rss_queues(void)
2116 {
2117         return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2118 }
2119 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
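
/*
 * Editor's note: illustrative sketch, not part of dev.c.  Drivers typically
 * clamp their hardware maximum with this default; "hw_max_queues" is a
 * hypothetical per-device limit:
 */
static unsigned int example_pick_rss_queues(unsigned int hw_max_queues)
{
	return min_t(unsigned int, hw_max_queues,
		     netif_get_num_default_rss_queues());
}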
2120 
2121 static inline void __netif_reschedule(struct Qdisc *q)
2122 {
2123         struct softnet_data *sd;
2124         unsigned long flags;
2125 
2126         local_irq_save(flags);
2127         sd = &__get_cpu_var(softnet_data);
2128         q->next_sched = NULL;
2129         *sd->output_queue_tailp = q;
2130         sd->output_queue_tailp = &q->next_sched;
2131         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2132         local_irq_restore(flags);
2133 }
2134 
2135 void __netif_schedule(struct Qdisc *q)
2136 {
2137         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2138                 __netif_reschedule(q);
2139 }
2140 EXPORT_SYMBOL(__netif_schedule);
2141 
2142 void dev_kfree_skb_irq(struct sk_buff *skb)
2143 {
2144         if (atomic_dec_and_test(&skb->users)) {
2145                 struct softnet_data *sd;
2146                 unsigned long flags;
2147 
2148                 local_irq_save(flags);
2149                 sd = &__get_cpu_var(softnet_data);
2150                 skb->next = sd->completion_queue;
2151                 sd->completion_queue = skb;
2152                 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2153                 local_irq_restore(flags);
2154         }
2155 }
2156 EXPORT_SYMBOL(dev_kfree_skb_irq);
2157 
2158 void dev_kfree_skb_any(struct sk_buff *skb)
2159 {
2160         if (in_irq() || irqs_disabled())
2161                 dev_kfree_skb_irq(skb);
2162         else
2163                 dev_kfree_skb(skb);
2164 }
2165 EXPORT_SYMBOL(dev_kfree_skb_any);
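
/*
 * Editor's note: illustrative sketch, not part of dev.c.  A TX completion
 * handler that may run in hard-IRQ context (or with IRQs disabled) frees
 * the transmitted buffer with dev_kfree_skb_any():
 */
static void example_tx_complete(struct net_device *dev, struct sk_buff *skb)
{
	dev->stats.tx_packets++;
	dev->stats.tx_bytes += skb->len;
	dev_kfree_skb_any(skb);
}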
2166 
2167 
2168 /**
2169  * netif_device_detach - mark device as removed
2170  * @dev: network device
2171  *
2172  * Mark device as removed from system and therefore no longer available.
2173  */
2174 void netif_device_detach(struct net_device *dev)
2175 {
2176         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2177             netif_running(dev)) {
2178                 netif_tx_stop_all_queues(dev);
2179         }
2180 }
2181 EXPORT_SYMBOL(netif_device_detach);
2182 
2183 /**
2184  * netif_device_attach - mark device as attached
2185  * @dev: network device
2186  *
2187  * Mark device as attached to the system and restart it if needed.
2188  */
2189 void netif_device_attach(struct net_device *dev)
2190 {
2191         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2192             netif_running(dev)) {
2193                 netif_tx_wake_all_queues(dev);
2194                 __netdev_watchdog_up(dev);
2195         }
2196 }
2197 EXPORT_SYMBOL(netif_device_attach);
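
/*
 * Editor's note: illustrative sketch, not part of dev.c.  The usual pairing
 * is a driver's suspend/resume (or error-recovery) path; names are
 * hypothetical and the netdev is assumed to be the device's drvdata:
 */
static int example_suspend(struct device *d)
{
	struct net_device *dev = dev_get_drvdata(d);

	netif_device_detach(dev);
	return 0;
}

static int example_resume(struct device *d)
{
	struct net_device *dev = dev_get_drvdata(d);

	netif_device_attach(dev);
	return 0;
}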
2198 
2199 static void skb_warn_bad_offload(const struct sk_buff *skb)
2200 {
2201         static const netdev_features_t null_features = 0;
2202         struct net_device *dev = skb->dev;
2203         const char *driver = "";
2204 
2205         if (!net_ratelimit())
2206                 return;
2207 
2208         if (dev && dev->dev.parent)
2209                 driver = dev_driver_string(dev->dev.parent);
2210 
2211         WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2212              "gso_type=%d ip_summed=%d\n",
2213              driver, dev ? &dev->features : &null_features,
2214              skb->sk ? &skb->sk->sk_route_caps : &null_features,
2215              skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2216              skb_shinfo(skb)->gso_type, skb->ip_summed);
2217 }
2218 
2219 /*
2220  * Invalidate hardware checksum when packet is to be mangled, and
2221  * complete checksum manually on outgoing path.
2222  */
2223 int skb_checksum_help(struct sk_buff *skb)
2224 {
2225         __wsum csum;
2226         int ret = 0, offset;
2227 
2228         if (skb->ip_summed == CHECKSUM_COMPLETE)
2229                 goto out_set_summed;
2230 
2231         if (unlikely(skb_shinfo(skb)->gso_size)) {
2232                 skb_warn_bad_offload(skb);
2233                 return -EINVAL;
2234         }
2235 
2236         /* Before computing a checksum, we should make sure no frag could
2237          * be modified by an external entity: otherwise the checksum could be wrong.
2238          */
2239         if (skb_has_shared_frag(skb)) {
2240                 ret = __skb_linearize(skb);
2241                 if (ret)
2242                         goto out;
2243         }
2244 
2245         offset = skb_checksum_start_offset(skb);
2246         BUG_ON(offset >= skb_headlen(skb));
2247         csum = skb_checksum(skb, offset, skb->len - offset, 0);
2248 
2249         offset += skb->csum_offset;
2250         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2251 
2252         if (skb_cloned(skb) &&
2253             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2254                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2255                 if (ret)
2256                         goto out;
2257         }
2258 
2259         *(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
2260 out_set_summed:
2261         skb->ip_summed = CHECKSUM_NONE;
2262 out:
2263         return ret;
2264 }
2265 EXPORT_SYMBOL(skb_checksum_help);
2266 
2267 __be16 skb_network_protocol(struct sk_buff *skb)
2268 {
2269         __be16 type = skb->protocol;
2270         int vlan_depth = ETH_HLEN;
2271 
2272         /* Tunnel gso handlers can set protocol to ethernet. */
2273         if (type == htons(ETH_P_TEB)) {
2274                 struct ethhdr *eth;
2275 
2276                 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2277                         return 0;
2278 
2279                 eth = (struct ethhdr *)skb_mac_header(skb);
2280                 type = eth->h_proto;
2281         }
2282 
2283         while (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) {
2284                 struct vlan_hdr *vh;
2285 
2286                 if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
2287                         return 0;
2288 
2289                 vh = (struct vlan_hdr *)(skb->data + vlan_depth);
2290                 type = vh->h_vlan_encapsulated_proto;
2291                 vlan_depth += VLAN_HLEN;
2292         }
2293 
2294         return type;
2295 }
2296 
2297 /**
2298  *      skb_mac_gso_segment - mac layer segmentation handler.
2299  *      @skb: buffer to segment
2300  *      @features: features for the output path (see dev->features)
2301  */
2302 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2303                                     netdev_features_t features)
2304 {
2305         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2306         struct packet_offload *ptype;
2307         __be16 type = skb_network_protocol(skb);
2308 
2309         if (unlikely(!type))
2310                 return ERR_PTR(-EINVAL);
2311 
2312         __skb_pull(skb, skb->mac_len);
2313 
2314         rcu_read_lock();
2315         list_for_each_entry_rcu(ptype, &offload_base, list) {
2316                 if (ptype->type == type && ptype->callbacks.gso_segment) {
2317                         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
2318                                 int err;
2319 
2320                                 err = ptype->callbacks.gso_send_check(skb);
2321                                 segs = ERR_PTR(err);
2322                                 if (err || skb_gso_ok(skb, features))
2323                                         break;
2324                                 __skb_push(skb, (skb->data -
2325                                                  skb_network_header(skb)));
2326                         }
2327                         segs = ptype->callbacks.gso_segment(skb, features);
2328                         break;
2329                 }
2330         }
2331         rcu_read_unlock();
2332 
2333         __skb_push(skb, skb->data - skb_mac_header(skb));
2334 
2335         return segs;
2336 }
2337 EXPORT_SYMBOL(skb_mac_gso_segment);
2338 
2339 
2340 /* openvswitch calls this on rx path, so we need a different check.
2341  */
2342 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2343 {
2344         if (tx_path)
2345                 return skb->ip_summed != CHECKSUM_PARTIAL &&
2346                        skb->ip_summed != CHECKSUM_NONE;
2347 
2348         return skb->ip_summed == CHECKSUM_NONE;
2349 }
2350 
2351 /**
2352  *      __skb_gso_segment - Perform segmentation on skb.
2353  *      @skb: buffer to segment
2354  *      @features: features for the output path (see dev->features)
2355  *      @tx_path: whether it is called in TX path
2356  *
2357  *      This function segments the given skb and returns a list of segments.
2358  *
2359  *      It may return NULL if the skb requires no segmentation.  This is
2360  *      only possible when GSO is used for verifying header integrity.
2361  */
2362 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2363                                   netdev_features_t features, bool tx_path)
2364 {
2365         struct sk_buff *segs;
2366 
2367         if (unlikely(skb_needs_check(skb, tx_path))) {
2368                 int err;
2369 
2370                 /* We're going to init ->check field in TCP or UDP header */
2371                 if (skb_header_cloned(skb) &&
2372                     (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
2373                         return ERR_PTR(err);
2374         }
2375 
2376         SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2377         skb_reset_mac_header(skb);
2378         skb_reset_mac_len(skb);
2379 
2380         segs = skb_mac_gso_segment(skb, features);
2381 
2382         if (unlikely(skb_needs_check(skb, tx_path)))
2383                 skb_warn_bad_offload(skb);
2384 
2385         return segs;
2386 }
2387 EXPORT_SYMBOL(__skb_gso_segment);
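
/*
 * Editor's note: illustrative sketch, not part of dev.c.  A caller that must
 * fall back to software GSO (as the TX path below does via dev_gso_segment())
 * segments the skb and walks the resulting list; example_xmit_one() is a
 * hypothetical single-frame transmit helper:
 */
static int example_xmit_one(struct sk_buff *skb);

static int example_gso_fallback(struct sk_buff *skb,
				netdev_features_t features)
{
	struct sk_buff *segs, *nskb;

	segs = __skb_gso_segment(skb, features, true);
	if (IS_ERR(segs))
		return PTR_ERR(segs);
	if (!segs)			/* header check only, no segmentation */
		return example_xmit_one(skb);

	consume_skb(skb);		/* the segments now carry the data */
	while (segs) {
		nskb = segs;
		segs = segs->next;
		nskb->next = NULL;
		example_xmit_one(nskb);
	}
	return 0;
}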
2388 
2389 /* Take action when hardware reception checksum errors are detected. */
2390 #ifdef CONFIG_BUG
2391 void netdev_rx_csum_fault(struct net_device *dev)
2392 {
2393         if (net_ratelimit()) {
2394                 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2395                 dump_stack();
2396         }
2397 }
2398 EXPORT_SYMBOL(netdev_rx_csum_fault);
2399 #endif
2400 
2401 /* Actually, we should eliminate this check as soon as we know that:
2402  * 1. An IOMMU is present and allows mapping all of the memory.
2403  * 2. No high memory really exists on this machine.
2404  */
2405 
2406 static int illegal_highdma(const struct net_device *dev, struct sk_buff *skb)
2407 {
2408 #ifdef CONFIG_HIGHMEM
2409         int i;
2410         if (!(dev->features & NETIF_F_HIGHDMA)) {
2411                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2412                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2413                         if (PageHighMem(skb_frag_page(frag)))
2414                                 return 1;
2415                 }
2416         }
2417 
2418         if (PCI_DMA_BUS_IS_PHYS) {
2419                 struct device *pdev = dev->dev.parent;
2420 
2421                 if (!pdev)
2422                         return 0;
2423                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2424                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2425                         dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2426                         if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2427                                 return 1;
2428                 }
2429         }
2430 #endif
2431         return 0;
2432 }
2433 
2434 struct dev_gso_cb {
2435         void (*destructor)(struct sk_buff *skb);
2436 };
2437 
2438 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2439 
2440 static void dev_gso_skb_destructor(struct sk_buff *skb)
2441 {
2442         struct dev_gso_cb *cb;
2443 
2444         do {
2445                 struct sk_buff *nskb = skb->next;
2446 
2447                 skb->next = nskb->next;
2448                 nskb->next = NULL;
2449                 kfree_skb(nskb);
2450         } while (skb->next);
2451 
2452         cb = DEV_GSO_CB(skb);
2453         if (cb->destructor)
2454                 cb->destructor(skb);
2455 }
2456 
2457 /**
2458  *      dev_gso_segment - Perform emulated hardware segmentation on skb.
2459  *      @skb: buffer to segment
2460  *      @features: device features as applicable to this skb
2461  *
2462  *      This function segments the given skb and stores the list of segments
2463  *      in skb->next.
2464  */
2465 static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
2466 {
2467         struct sk_buff *segs;
2468 
2469         segs = skb_gso_segment(skb, features);
2470 
2471         /* Verifying header integrity only. */
2472         if (!segs)
2473                 return 0;
2474 
2475         if (IS_ERR(segs))
2476                 return PTR_ERR(segs);
2477 
2478         skb->next = segs;
2479         DEV_GSO_CB(skb)->destructor = skb->destructor;
2480         skb->destructor = dev_gso_skb_destructor;
2481 
2482         return 0;
2483 }
2484 
2485 static netdev_features_t harmonize_features(struct sk_buff *skb,
2486                                             __be16 protocol,
2487                                             const struct net_device *dev,
2488                                             netdev_features_t features)
2489 {
2490         if (skb->ip_summed != CHECKSUM_NONE &&
2491             !can_checksum_protocol(features, protocol)) {
2492                 features &= ~NETIF_F_ALL_CSUM;
2493         }
2494         if (illegal_highdma(dev, skb))
2495                 features &= ~NETIF_F_SG;
2496 
2497         return features;
2498 }
2499 
2500 netdev_features_t netif_skb_dev_features(struct sk_buff *skb,
2501                                          const struct net_device *dev)
2502 {
2503         __be16 protocol = skb->protocol;
2504         netdev_features_t features = dev->features;
2505 
2506         if (skb_shinfo(skb)->gso_segs > dev->gso_max_segs)
2507                 features &= ~NETIF_F_GSO_MASK;
2508 
2509         if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) {
2510                 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2511                 protocol = veh->h_vlan_encapsulated_proto;
2512         } else if (!vlan_tx_tag_present(skb)) {
2513                 return harmonize_features(skb, protocol, dev, features);
2514         }
2515 
2516         features &= (dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX |
2517                                                NETIF_F_HW_VLAN_STAG_TX);
2518 
2519         if (protocol != htons(ETH_P_8021Q) && protocol != htons(ETH_P_8021AD)) {
2520                 return harmonize_features(skb, protocol, dev, features);
2521         } else {
2522                 features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2523                                 NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_CTAG_TX |
2524                                 NETIF_F_HW_VLAN_STAG_TX;
2525                 return harmonize_features(skb, protocol, dev, features);
2526         }
2527 
2528         return harmonize_features(skb, protocol, dev, features);
2529 }
2530 EXPORT_SYMBOL(netif_skb_dev_features);
2531 
2532 /*
2533  * Returns true if either:
2534  *      1. skb has frag_list and the device doesn't support FRAGLIST, or
2535  *      2. skb is fragmented and the device does not support SG.
2536  */
2537 static inline int skb_needs_linearize(struct sk_buff *skb,
2538                                       netdev_features_t features)
2539 {
2540         return skb_is_nonlinear(skb) &&
2541                         ((skb_has_frag_list(skb) &&
2542                                 !(features & NETIF_F_FRAGLIST)) ||
2543                         (skb_shinfo(skb)->nr_frags &&
2544                                 !(features & NETIF_F_SG)));
2545 }
2546 
2547 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2548                         struct netdev_queue *txq)
2549 {
2550         const struct net_device_ops *ops = dev->netdev_ops;
2551         int rc = NETDEV_TX_OK;
2552         unsigned int skb_len;
2553 
2554         if (likely(!skb->next)) {
2555                 netdev_features_t features;
2556 
2557                 /*
2558                  * If the device doesn't need skb->dst, release it right now while
2559                  * it's hot in this CPU's cache.
2560                  */
2561                 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2562                         skb_dst_drop(skb);
2563 
2564                 features = netif_skb_features(skb);
2565 
2566                 if (vlan_tx_tag_present(skb) &&
2567                     !vlan_hw_offload_capable(features, skb->vlan_proto)) {
2568                         skb = __vlan_put_tag(skb, skb->vlan_proto,
2569                                              vlan_tx_tag_get(skb));
2570                         if (unlikely(!skb))
2571                                 goto out;
2572 
2573                         skb->vlan_tci = 0;
2574                 }
2575 
2576                 /* If encapsulation offload request, verify we are testing
2577                  * hardware encapsulation features instead of standard
2578                  * features for the netdev
2579                  */
2580                 if (skb->encapsulation)
2581                         features &= dev->hw_enc_features;
2582 
2583                 if (netif_needs_gso(skb, features)) {
2584                         if (unlikely(dev_gso_segment(skb, features)))
2585                                 goto out_kfree_skb;
2586                         if (skb->next)
2587                                 goto gso;
2588                 } else {
2589                         if (skb_needs_linearize(skb, features) &&
2590                             __skb_linearize(skb))
2591                                 goto out_kfree_skb;
2592 
2593                         /* If packet is not checksummed and device does not
2594                          * support checksumming for this protocol, complete
2595                          * checksumming here.
2596                          */
2597                         if (skb->ip_summed == CHECKSUM_PARTIAL) {
2598                                 if (skb->encapsulation)
2599                                         skb_set_inner_transport_header(skb,
2600                                                 skb_checksum_start_offset(skb));
2601                                 else
2602                                         skb_set_transport_header(skb,
2603                                                 skb_checksum_start_offset(skb));
2604                                 if (!(features & NETIF_F_ALL_CSUM) &&
2605                                      skb_checksum_help(skb))
2606                                         goto out_kfree_skb;
2607                         }
2608                 }
2609 
2610                 if (!list_empty(&ptype_all))
2611                         dev_queue_xmit_nit(skb, dev);
2612 
2613                 skb_len = skb->len;
2614                 rc = ops->ndo_start_xmit(skb, dev);
2615                 trace_net_dev_xmit(skb, rc, dev, skb_len);
2616                 if (rc == NETDEV_TX_OK)
2617                         txq_trans_update(txq);
2618                 return rc;
2619         }
2620 
2621 gso:
2622         do {
2623                 struct sk_buff *nskb = skb->next;
2624 
2625                 skb->next = nskb->next;
2626                 nskb->next = NULL;
2627 
2628                 if (!list_empty(&ptype_all))
2629                         dev_queue_xmit_nit(nskb, dev);
2630 
2631                 skb_len = nskb->len;
2632                 rc = ops->ndo_start_xmit(nskb, dev);
2633                 trace_net_dev_xmit(nskb, rc, dev, skb_len);
2634                 if (unlikely(rc != NETDEV_TX_OK)) {
2635                         if (rc & ~NETDEV_TX_MASK)
2636                                 goto out_kfree_gso_skb;
2637                         nskb->next = skb->next;
2638                         skb->next = nskb;
2639                         return rc;
2640                 }
2641                 txq_trans_update(txq);
2642                 if (unlikely(netif_xmit_stopped(txq) && skb->next))
2643                         return NETDEV_TX_BUSY;
2644         } while (skb->next);
2645 
2646 out_kfree_gso_skb:
2647         if (likely(skb->next == NULL)) {
2648                 skb->destructor = DEV_GSO_CB(skb)->destructor;
2649                 consume_skb(skb);
2650                 return rc;
2651         }
2652 out_kfree_skb:
2653         kfree_skb(skb);
2654 out:
2655         return rc;
2656 }
2657 
2658 static void qdisc_pkt_len_init(struct sk_buff *skb)
2659 {
2660         const struct skb_shared_info *shinfo = skb_shinfo(skb);
2661 
2662         qdisc_skb_cb(skb)->pkt_len = skb->len;
2663 
2664         /* To get a more precise estimate of bytes sent on the wire,
2665          * we add to pkt_len the header size of every segment
2666          */
2667         if (shinfo->gso_size)  {
2668                 unsigned int hdr_len;
2669                 u16 gso_segs = shinfo->gso_segs;
2670 
2671                 /* mac layer + network layer */
2672                 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2673 
2674                 /* + transport layer */
2675                 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2676                         hdr_len += tcp_hdrlen(skb);
2677                 else
2678                         hdr_len += sizeof(struct udphdr);
2679 
2680                 if (shinfo->gso_type & SKB_GSO_DODGY)
2681                         gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
2682                                                 shinfo->gso_size);
2683 
2684                 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
2685         }
2686 }
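
/*
 * Editor's note: worked example (not part of dev.c).  For a TCP GSO skb of
 * skb->len = 65226 bytes with gso_size = 1448 and 14 + 20 + 32 = 66 bytes of
 * Ethernet/IP/TCP headers, gso_segs = 45, so the code above accounts for the
 * 44 extra header copies: pkt_len = 65226 + 44 * 66 = 68130, which matches
 * 45 on-wire frames of 1514 bytes each.
 */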
2687 
2688 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2689                                  struct net_device *dev,
2690                                  struct netdev_queue *txq)
2691 {
2692         spinlock_t *root_lock = qdisc_lock(q);
2693         bool contended;
2694         int rc;
2695 
2696         qdisc_pkt_len_init(skb);
2697         qdisc_calculate_pkt_len(skb, q);
2698         /*
2699          * Heuristic to force contended enqueues to serialize on a
2700          * separate lock before trying to get the qdisc main lock.
2701          * This permits the __QDISC_STATE_RUNNING owner to get the lock more often
2702          * and dequeue packets faster.
2703          */
2704         contended = qdisc_is_running(q);
2705         if (unlikely(contended))
2706                 spin_lock(&q->busylock);
2707 
2708         spin_lock(root_lock);
2709         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2710                 kfree_skb(skb);
2711                 rc = NET_XMIT_DROP;
2712         } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2713                    qdisc_run_begin(q)) {
2714                 /*
2715                  * This is a work-conserving queue; there are no old skbs
2716                  * waiting to be sent out; and the qdisc is not running -
2717                  * xmit the skb directly.
2718                  */
2719                 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2720                         skb_dst_force(skb);
2721 
2722                 qdisc_bstats_update(q, skb);
2723 
2724                 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2725                         if (unlikely(contended)) {
2726                                 spin_unlock(&q->busylock);
2727                                 contended = false;
2728                         }
2729                         __qdisc_run(q);
2730                 } else
2731                         qdisc_run_end(q);
2732 
2733                 rc = NET_XMIT_SUCCESS;
2734         } else {
2735                 skb_dst_force(skb);
2736                 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2737                 if (qdisc_run_begin(q)) {
2738                         if (unlikely(contended)) {
2739                                 spin_unlock(&q->busylock);
2740                                 contended = false;
2741                         }
2742                         __qdisc_run(q);
2743                 }
2744         }
2745         spin_unlock(root_lock);
2746         if (unlikely(contended))
2747                 spin_unlock(&q->busylock);
2748         return rc;
2749 }
2750 
2751 #if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
2752 static void skb_update_prio(struct sk_buff *skb)
2753 {
2754         struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2755 
2756         if (!skb->priority && skb->sk && map) {
2757                 unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2758 
2759                 if (prioidx < map->priomap_len)
2760                         skb->priority = map->priomap[prioidx];
2761         }
2762 }
2763 #else
2764 #define skb_update_prio(skb)
2765 #endif
2766 
2767 static DEFINE_PER_CPU(int, xmit_recursion);
2768 #define RECURSION_LIMIT 10
2769 
2770 /**
2771  *      dev_loopback_xmit - loop back @skb
2772  *      @skb: buffer to transmit
2773  */
2774 int dev_loopback_xmit(struct sk_buff *skb)
2775 {
2776         skb_reset_mac_header(skb);
2777         __skb_pull(skb, skb_network_offset(skb));
2778         skb->pkt_type = PACKET_LOOPBACK;
2779         skb->ip_summed = CHECKSUM_UNNECESSARY;
2780         WARN_ON(!skb_dst(skb));
2781         skb_dst_force(skb);
2782         netif_rx_ni(skb);
2783         return 0;
2784 }
2785 EXPORT_SYMBOL(dev_loopback_xmit);
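
/*
 * Editor's note: illustrative sketch, not part of dev.c.  The IPv4/IPv6
 * output paths use this to deliver a local copy of a multicast frame; a
 * minimal caller clones the skb (the clone shares the required dst and
 * device) and loops the clone back:
 */
static void example_loopback_copy(struct sk_buff *skb)
{
	struct sk_buff *copy = skb_clone(skb, GFP_ATOMIC);

	if (copy)
		dev_loopback_xmit(copy);
}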
2786 
2787 /**
2788  *      dev_queue_xmit - transmit a buffer
2789  *      @skb: buffer to transmit
2790  *
2791  *      Queue a buffer for transmission to a network device. The caller must
2792  *      have set the device and priority and built the buffer before calling
2793  *      this function. The function can be called from an interrupt.
2794  *
2795  *      A negative errno code is returned on a failure. A success does not
2796  *      guarantee the frame will be transmitted as it may be dropped due
2797  *      to congestion or traffic shaping.
2798  *
2799  * -----------------------------------------------------------------------------------
2800  *      I notice this method can also return errors from the queue disciplines,
2801  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2802  *      be positive.
2803  *
2804  *      Regardless of the return value, the skb is consumed, so it is currently
2805  *      difficult to retry a send to this method.  (You can bump the ref count
2806  *      before sending to hold a reference for retry if you are careful.)
2807  *
2808  *      When calling this method, interrupts MUST be enabled.  This is because
2809  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2810  *          --BLG
2811  */
2812 int dev_queue_xmit(struct sk_buff *skb)
2813 {
2814         struct net_device *dev = skb->dev;
2815         struct netdev_queue *txq;
2816         struct Qdisc *q;
2817         int rc = -ENOMEM;
2818 
2819         skb_reset_mac_header(skb);
2820 
2821         /* Disable soft irqs for various locks below. Also
2822          * stops preemption for RCU.
2823          */
2824         rcu_read_lock_bh();
2825 
2826         skb_update_prio(skb);
2827 
2828         txq = netdev_pick_tx(dev, skb);
2829         q = rcu_dereference_bh(txq->qdisc);
2830 
2831 #ifdef CONFIG_NET_CLS_ACT
2832         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2833 #endif
2834         trace_net_dev_queue(skb);
2835         if (q->enqueue) {
2836                 rc = __dev_xmit_skb(skb, q, dev, txq);
2837                 goto out;
2838         }
2839 
2840         /* The device has no queue. Common case for software devices:
2841            loopback, all the sorts of tunnels...
2842 
2843            Really, it is unlikely that netif_tx_lock protection is necessary
2844            here.  (e.g. loopback and IP tunnels are clean, ignoring statistics
2845            counters.)
2846            However, it is possible that they rely on the protection
2847            we provide here.
2848 
2849            Check this and take the lock. It is not prone to deadlocks.
2850            Or take the noqueue qdisc path, which is even simpler 8)
2851          */
2852         if (dev->flags & IFF_UP) {
2853                 int cpu = smp_processor_id(); /* ok because BHs are off */
2854 
2855                 if (txq->xmit_lock_owner != cpu) {
2856 
2857                         if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2858                                 goto recursion_alert;
2859 
2860                         HARD_TX_LOCK(dev, txq, cpu);
2861 
2862                         if (!netif_xmit_stopped(txq)) {
2863                                 __this_cpu_inc(xmit_recursion);
2864                                 rc = dev_hard_start_xmit(skb, dev, txq);
2865                                 __this_cpu_dec(xmit_recursion);
2866                                 if (dev_xmit_complete(rc)) {
2867                                         HARD_TX_UNLOCK(dev, txq);
2868                                         goto out;
2869                                 }
2870                         }
2871                         HARD_TX_UNLOCK(dev, txq);
2872                         net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2873                                              dev->name);
2874                 } else {
2875                         /* Recursion is detected! It is possible,
2876                          * unfortunately
2877                          */
2878 recursion_alert:
2879                         net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
2880                                              dev->name);
2881                 }
2882         }
2883 
2884         rc = -ENETDOWN;
2885         rcu_read_unlock_bh();
2886 
2887         kfree_skb(skb);
2888         return rc;
2889 out:
2890         rcu_read_unlock_bh();
2891         return rc;
2892 }
2893 EXPORT_SYMBOL(dev_queue_xmit);
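
/*
 * Editor's note: illustrative sketch, not part of dev.c.  A module that
 * builds its own frame fills in skb->dev, the protocol and the link-layer
 * header before handing it to the stack; the skb is consumed whatever the
 * return value is.  ETH_P_802_2 is just an example protocol here and the
 * function name is hypothetical:
 */
static int example_send_raw(struct net_device *dev,
			    const void *payload, unsigned int len)
{
	struct sk_buff *skb;

	skb = alloc_skb(LL_RESERVED_SPACE(dev) + len, GFP_ATOMIC);
	if (!skb)
		return -ENOMEM;

	skb_reserve(skb, LL_RESERVED_SPACE(dev));
	memcpy(skb_put(skb, len), payload, len);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_802_2);
	skb->priority = 0;

	if (dev_hard_header(skb, dev, ETH_P_802_2,
			    dev->broadcast, dev->dev_addr, skb->len) < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	return dev_queue_xmit(skb);
}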
2894 
2895 
2896 /*=======================================================================
2897                         Receiver routines
2898   =======================================================================*/
2899 
2900 int netdev_max_backlog __read_mostly = 1000;
2901 EXPORT_SYMBOL(netdev_max_backlog);
2902 
2903 int netdev_tstamp_prequeue __read_mostly = 1;
2904 int netdev_budget __read_mostly = 300;
2905 int weight_p __read_mostly = 64;            /* old backlog weight */
2906 
2907 /* Called with irq disabled */
2908 static inline void ____napi_schedule(struct softnet_data *sd,
2909                                      struct napi_struct *napi)
2910 {
2911         list_add_tail(&napi->poll_list, &sd->poll_list);
2912         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2913 }
2914 
2915 #ifdef CONFIG_RPS
2916 
2917 /* One global table that all flow-based protocols share. */
2918 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2919 EXPORT_SYMBOL(rps_sock_flow_table);
2920 
2921 struct static_key rps_needed __read_mostly;
2922 
2923 static struct rps_dev_flow *
2924 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2925             struct rps_dev_flow *rflow, u16 next_cpu)
2926 {
2927         if (next_cpu != RPS_NO_CPU) {
2928 #ifdef CONFIG_RFS_ACCEL
2929                 struct netdev_rx_queue *rxqueue;
2930                 struct rps_dev_flow_table *flow_table;
2931                 struct rps_dev_flow *old_rflow;
2932                 u32 flow_id;
2933                 u16 rxq_index;
2934                 int rc;
2935 
2936                 /* Should we steer this flow to a different hardware queue? */
2937                 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2938                     !(dev->features & NETIF_F_NTUPLE))
2939                         goto out;
2940                 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2941                 if (rxq_index == skb_get_rx_queue(skb))
2942                         goto out;
2943 
2944                 rxqueue = dev->_rx + rxq_index;
2945                 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2946                 if (!flow_table)
2947                         goto out;
2948                 flow_id = skb->rxhash & flow_table->mask;
2949                 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2950                                                         rxq_index, flow_id);
2951                 if (rc < 0)
2952                         goto out;
2953                 old_rflow = rflow;
2954                 rflow = &flow_table->flows[flow_id];
2955                 rflow->filter = rc;
2956                 if (old_rflow->filter == rflow->filter)
2957                         old_rflow->filter = RPS_NO_FILTER;
2958         out:
2959 #endif
2960                 rflow->last_qtail =
2961                         per_cpu(softnet_data, next_cpu).input_queue_head;
2962         }
2963 
2964         rflow->cpu = next_cpu;
2965         return rflow;
2966 }
2967 
2968 /*
2969  * get_rps_cpu is called from netif_receive_skb and returns the target
2970  * CPU from the RPS map of the receiving queue for a given skb.
2971  * rcu_read_lock must be held on entry.
2972  */
2973 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2974                        struct rps_dev_flow **rflowp)
2975 {
2976         struct netdev_rx_queue *rxqueue;
2977         struct rps_map *map;
2978         struct rps_dev_flow_table *flow_table;
2979         struct rps_sock_flow_table *sock_flow_table;
2980         int cpu = -1;
2981         u16 tcpu;
2982 
2983         if (skb_rx_queue_recorded(skb)) {
2984                 u16 index = skb_get_rx_queue(skb);
2985                 if (unlikely(index >= dev->real_num_rx_queues)) {
2986                         WARN_ONCE(dev->real_num_rx_queues > 1,
2987                                   "%s received packet on queue %u, but number "
2988                                   "of RX queues is %u\n",
2989                                   dev->name, index, dev->real_num_rx_queues);
2990                         goto done;
2991                 }
2992                 rxqueue = dev->_rx + index;
2993         } else
2994                 rxqueue = dev->_rx;
2995 
2996         map = rcu_dereference(rxqueue->rps_map);
2997         if (map) {
2998                 if (map->len == 1 &&
2999                     !rcu_access_pointer(rxqueue->rps_flow_table)) {
3000                         tcpu = map->cpus[0];
3001                         if (cpu_online(tcpu))
3002                                 cpu = tcpu;
3003                         goto done;
3004                 }
3005         } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
3006                 goto done;
3007         }
3008 
3009         skb_reset_network_header(skb);
3010         if (!skb_get_rxhash(skb))
3011                 goto done;
3012 
3013         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3014         sock_flow_table = rcu_dereference(rps_sock_flow_table);
3015         if (flow_table && sock_flow_table) {
3016                 u16 next_cpu;
3017                 struct rps_dev_flow *rflow;
3018 
3019                 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
3020                 tcpu = rflow->cpu;
3021 
3022                 next_cpu = sock_flow_table->ents[skb->rxhash &
3023                     sock_flow_table->mask];
3024 
3025                 /*
3026                  * If the desired CPU (where last recvmsg was done) is
3027                  * different from current CPU (one in the rx-queue flow
3028                  * table entry), switch if one of the following holds:
3029                  *   - Current CPU is unset (equal to RPS_NO_CPU).
3030                  *   - Current CPU is offline.
3031                  *   - The current CPU's queue tail has advanced beyond the
3032                  *     last packet that was enqueued using this table entry.
3033                  *     This guarantees that all previous packets for the flow
3034                  *     have been dequeued, thus preserving in order delivery.
3035                  */
3036                 if (unlikely(tcpu != next_cpu) &&
3037                     (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
3038                      ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3039                       rflow->last_qtail)) >= 0)) {
3040                         tcpu = next_cpu;
3041                         rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3042                 }
3043 
3044                 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
3045                         *rflowp = rflow;
3046                         cpu = tcpu;
3047                         goto done;
3048                 }
3049         }
3050 
3051         if (map) {
3052                 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
3053 
3054                 if (cpu_online(tcpu)) {
3055                         cpu = tcpu;
3056                         goto done;
3057                 }
3058         }
3059 
3060 done:
3061         return cpu;
3062 }
3063 
3064 #ifdef CONFIG_RFS_ACCEL
3065 
3066 /**
3067  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3068  * @dev: Device on which the filter was set
3069  * @rxq_index: RX queue index
3070  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3071  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3072  *
3073  * Drivers that implement ndo_rx_flow_steer() should periodically call
3074  * this function for each installed filter and remove the filters for
3075  * which it returns %true.
3076  */
3077 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3078                          u32 flow_id, u16 filter_id)
3079 {
3080         struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3081         struct rps_dev_flow_table *flow_table;
3082         struct rps_dev_flow *rflow;
3083         bool expire = true;
3084         int cpu;
3085 
3086         rcu_read_lock();
3087         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3088         if (flow_table && flow_id <= flow_table->mask) {
3089                 rflow = &flow_table->flows[flow_id];
3090                 cpu = ACCESS_ONCE(rflow->cpu);
3091                 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
3092                     ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3093                            rflow->last_qtail) <
3094                      (int)(10 * flow_table->mask)))
3095                         expire = false;
3096         }
3097         rcu_read_unlock();
3098         return expire;
3099 }
3100 EXPORT_SYMBOL(rps_may_expire_flow);
3101 
3102 #endif /* CONFIG_RFS_ACCEL */
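/*
 * Example (not part of dev.c): a minimal sketch of how a driver that
 * implements ndo_rx_flow_steer() might periodically expire its accelerated
 * RFS filters using rps_may_expire_flow() above (declared in
 * <linux/netdevice.h>).  "struct mydrv_filter" and
 * "mydrv_remove_hw_filter()" are hypothetical bookkeeping; only
 * rps_may_expire_flow() itself is the real kernel API.
 */
#if 0   /* illustration only */
struct mydrv_filter {
        u32 flow_id;            /* flow_id passed to ndo_rx_flow_steer() */
        u16 filter_id;          /* id returned by ndo_rx_flow_steer() */
        u16 rxq_index;          /* RX queue the flow was steered to */
        bool in_use;
};

/* Called from a periodic work item or timer in the hypothetical driver. */
static void mydrv_expire_rfs_filters(struct net_device *dev,
                                     struct mydrv_filter *table, int n)
{
        int i;

        for (i = 0; i < n; i++) {
                struct mydrv_filter *f = &table[i];

                if (!f->in_use)
                        continue;
                if (rps_may_expire_flow(dev, f->rxq_index,
                                        f->flow_id, f->filter_id)) {
                        mydrv_remove_hw_filter(dev, f); /* hypothetical */
                        f->in_use = false;
                }
        }
}
#endif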
3103 
3104 /* Called from hardirq (IPI) context */
3105 static void rps_trigger_softirq(void *data)
3106 {
3107         struct softnet_data *sd = data;
3108 
3109         ____napi_schedule(sd, &sd->backlog);
3110         sd->received_rps++;
3111 }
3112 
3113 #endif /* CONFIG_RPS */
3114 
3115 /*
3116  * Check if this softnet_data structure belongs to another CPU.
3117  * If yes, queue it to our IPI list and return 1;
3118  * if no, return 0.
3119  */
3120 static int rps_ipi_queued(struct softnet_data *sd)
3121 {
3122 #ifdef CONFIG_RPS
3123         struct softnet_data *mysd = &__get_cpu_var(softnet_data);
3124 
3125         if (sd != mysd) {
3126                 sd->rps_ipi_next = mysd->rps_ipi_list;
3127                 mysd->rps_ipi_list = sd;
3128 
3129                 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3130                 return 1;
3131         }
3132 #endif /* CONFIG_RPS */
3133         return 0;
3134 }
3135 
3136 /*
3137  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3138  * queue (may be a remote CPU queue).
3139  */
3140 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3141                               unsigned int *qtail)
3142 {
3143         struct softnet_data *sd;
3144         unsigned long flags;
3145 
3146         sd = &per_cpu(softnet_data, cpu);
3147 
3148         local_irq_save(flags);
3149 
3150         rps_lock(sd);
3151         if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
3152                 if (skb_queue_len(&sd->input_pkt_queue)) {
3153 enqueue:
3154                         __skb_queue_tail(&sd->input_pkt_queue, skb);
3155                         input_queue_tail_incr_save(sd, qtail);
3156                         rps_unlock(sd);
3157                         local_irq_restore(flags);
3158                         return NET_RX_SUCCESS;
3159                 }
3160 
3161                 /* Schedule NAPI for the backlog device.
3162                  * We can use a non-atomic operation since we own the queue lock.
3163                  */
3164                 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3165                         if (!rps_ipi_queued(sd))
3166                                 ____napi_schedule(sd, &sd->backlog);
3167                 }
3168                 goto enqueue;
3169         }
3170 
3171         sd->dropped++;
3172         rps_unlock(sd);
3173 
3174         local_irq_restore(flags);
3175 
3176         atomic_long_inc(&skb->dev->rx_dropped);
3177         kfree_skb(skb);
3178         return NET_RX_DROP;
3179 }
3180 
3181 /**
3182  *      netif_rx        -       post buffer to the network code
3183  *      @skb: buffer to post
3184  *
3185  *      This function receives a packet from a device driver and queues it for
3186  *      the upper (protocol) levels to process.  It always succeeds. The buffer
3187  *      may be dropped during processing for congestion control or by the
3188  *      protocol layers.
3189  *
3190  *      return values:
3191  *      NET_RX_SUCCESS  (no congestion)
3192  *      NET_RX_DROP     (packet was dropped)
3193  *
3194  */
3195 
3196 int netif_rx(struct sk_buff *skb)
3197 {
3198         int ret;
3199 
3200         /* if netpoll wants it, pretend we never saw it */
3201         if (netpoll_rx(skb))
3202                 return NET_RX_DROP;
3203 
3204         net_timestamp_check(netdev_tstamp_prequeue, skb);
3205 
3206         trace_netif_rx(skb);
3207 #ifdef CONFIG_RPS
3208         if (static_key_false(&rps_needed)) {
3209                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3210                 int cpu;
3211 
3212                 preempt_disable();
3213                 rcu_read_lock();
3214 
3215                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3216                 if (cpu < 0)
3217                         cpu = smp_processor_id();
3218 
3219                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3220 
3221                 rcu_read_unlock();
3222                 preempt_enable();
3223         } else
3224 #endif
3225         {
3226                 unsigned int qtail;
3227                 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3228                 put_cpu();
3229         }
3230         return ret;
3231 }
3232 EXPORT_SYMBOL(netif_rx);
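/*
 * Example (not part of dev.c): a minimal sketch of the classic non-NAPI
 * receive path that feeds packets into netif_rx() above.  The device and
 * "mydrv_read"-style frame handling are hypothetical; the skb allocation,
 * eth_type_trans() and netif_rx() calls are the real API.
 */
#if 0   /* illustration only */
static void mydrv_rx_one(struct net_device *dev, const void *frame, int len)
{
        struct sk_buff *skb;

        /* Reserve NET_IP_ALIGN so the IP header ends up aligned. */
        skb = netdev_alloc_skb_ip_align(dev, len);
        if (!skb) {
                dev->stats.rx_dropped++;
                return;
        }

        memcpy(skb_put(skb, len), frame, len);  /* copy frame from hardware */
        skb->protocol = eth_type_trans(skb, dev);

        netif_rx(skb);          /* queue for the per-CPU backlog */

        dev->stats.rx_packets++;
        dev->stats.rx_bytes += len;
}
#endif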
3233 
3234 int netif_rx_ni(struct sk_buff *skb)
3235 {
3236         int err;
3237 
3238         preempt_disable();
3239         err = netif_rx(skb);
3240         if (local_softirq_pending())
3241                 do_softirq();
3242         preempt_enable();
3243 
3244         return err;
3245 }
3246 EXPORT_SYMBOL(netif_rx_ni);
3247 
3248 static void net_tx_action(struct softirq_action *h)
3249 {
3250         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3251 
3252         if (sd->completion_queue) {
3253                 struct sk_buff *clist;
3254 
3255                 local_irq_disable();
3256                 clist = sd->completion_queue;
3257                 sd->completion_queue = NULL;
3258                 local_irq_enable();
3259 
3260                 while (clist) {
3261                         struct sk_buff *skb = clist;
3262                         clist = clist->next;
3263 
3264                         WARN_ON(atomic_read(&skb->users));
3265                         trace_kfree_skb(skb, net_tx_action);
3266                         __kfree_skb(skb);
3267                 }
3268         }
3269 
3270         if (sd->output_queue) {
3271                 struct Qdisc *head;
3272 
3273                 local_irq_disable();
3274                 head = sd->output_queue;
3275                 sd->output_queue = NULL;
3276                 sd->output_queue_tailp = &sd->output_queue;
3277                 local_irq_enable();
3278 
3279                 while (head) {
3280                         struct Qdisc *q = head;
3281                         spinlock_t *root_lock;
3282 
3283                         head = head->next_sched;
3284 
3285                         root_lock = qdisc_lock(q);
3286                         if (spin_trylock(root_lock)) {
3287                                 smp_mb__before_clear_bit();
3288                                 clear_bit(__QDISC_STATE_SCHED,
3289                                           &q->state);
3290                                 qdisc_run(q);
3291                                 spin_unlock(root_lock);
3292                         } else {
3293                                 if (!test_bit(__QDISC_STATE_DEACTIVATED,
3294                                               &q->state)) {
3295                                         __netif_reschedule(q);
3296                                 } else {
3297                                         smp_mb__before_clear_bit();
3298                                         clear_bit(__QDISC_STATE_SCHED,
3299                                                   &q->state);
3300                                 }
3301                         }
3302                 }
3303         }
3304 }
3305 
3306 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3307     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3308 /* This hook is defined here for ATM LANE */
3309 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3310                              unsigned char *addr) __read_mostly;
3311 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3312 #endif
3313 
3314 #ifdef CONFIG_NET_CLS_ACT
3315 /* TODO: Maybe we should just force sch_ingress to be compiled in
3316  * whenever CONFIG_NET_CLS_ACT is? Otherwise we pay for a few useless
3317  * instructions (a compare and two extra stores) when the ingress
3318  * scheduler is not built in but CONFIG_NET_CLS_ACT is.
3319  * NOTE: This doesn't remove any functionality; if you don't have
3320  * the ingress scheduler, you just can't add policies on ingress.
3321  *
3322  */
3323 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3324 {
3325         struct net_device *dev = skb->dev;
3326         u32 ttl = G_TC_RTTL(skb->tc_verd);
3327         int result = TC_ACT_OK;
3328         struct Qdisc *q;
3329 
3330         if (unlikely(MAX_RED_LOOP < ttl++)) {
3331                 net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3332                                      skb->skb_iif, dev->ifindex);
3333                 return TC_ACT_SHOT;
3334         }
3335 
3336         skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3337         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3338 
3339         q = rxq->qdisc;
3340         if (q != &noop_qdisc) {
3341                 spin_lock(qdisc_lock(q));
3342                 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3343                         result = qdisc_enqueue_root(skb, q);
3344                 spin_unlock(qdisc_lock(q));
3345         }
3346 
3347         return result;
3348 }
3349 
3350 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3351                                          struct packet_type **pt_prev,
3352                                          int *ret, struct net_device *orig_dev)
3353 {
3354         struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3355 
3356         if (!rxq || rxq->qdisc == &noop_qdisc)
3357                 goto out;
3358 
3359         if (*pt_prev) {
3360                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3361                 *pt_prev = NULL;
3362         }
3363 
3364         switch (ing_filter(skb, rxq)) {
3365         case TC_ACT_SHOT:
3366         case TC_ACT_STOLEN:
3367                 kfree_skb(skb);
3368                 return NULL;
3369         }
3370 
3371 out:
3372         skb->tc_verd = 0;
3373         return skb;
3374 }
3375 #endif
3376 
3377 /**
3378  *      netdev_is_rx_handler_busy - check if receive handler is registered
3379  *      @dev: device to check
3380  *
3381  *      Check if a receive handler is already registered for a given device.
3382  *      Return true if there is one.
3383  *
3384  *      The caller must hold the rtnl_mutex.
3385  */
3386 bool netdev_is_rx_handler_busy(struct net_device *dev)
3387 {
3388         ASSERT_RTNL();
3389         return dev && rtnl_dereference(dev->rx_handler);
3390 }
3391 EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);
3392 
3393 /**
3394  *      netdev_rx_handler_register - register receive handler
3395  *      @dev: device to register a handler for
3396  *      @rx_handler: receive handler to register
3397  *      @rx_handler_data: data pointer that is used by rx handler
3398  *
3399  *      Register a receive handler for a device. This handler will then be
3400  *      called from __netif_receive_skb. A negative errno code is returned
3401  *      on a failure.
3402  *
3403  *      The caller must hold the rtnl_mutex.
3404  *
3405  *      For a general description of rx_handler, see enum rx_handler_result.
3406  */
3407 int netdev_rx_handler_register(struct net_device *dev,
3408                                rx_handler_func_t *rx_handler,
3409                                void *rx_handler_data)
3410 {
3411         ASSERT_RTNL();
3412 
3413         if (dev->rx_handler)
3414                 return -EBUSY;
3415 
3416         /* Note: rx_handler_data must be set before rx_handler */
3417         rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3418         rcu_assign_pointer(dev->rx_handler, rx_handler);
3419 
3420         return 0;
3421 }
3422 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3423 
3424 /**
3425  *      netdev_rx_handler_unregister - unregister receive handler
3426  *      @dev: device to unregister a handler from
3427  *
3428  *      Unregister a receive handler from a device.
3429  *
3430  *      The caller must hold the rtnl_mutex.
3431  */
3432 void netdev_rx_handler_unregister(struct net_device *dev)
3433 {
3434 
3435         ASSERT_RTNL();
3436         RCU_INIT_POINTER(dev->rx_handler, NULL);
3437         /* A reader seeing a non-NULL rx_handler in an rcu_read_lock()
3438          * section is guaranteed to see a non-NULL rx_handler_data
3439          * as well.
3440          */
3441         synchronize_net();
3442         RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3443 }
3444 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
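/*
 * Example (not part of dev.c): a minimal sketch of how a stacked device
 * (bridge/bonding-like) might use the rx_handler API above.  "myupper"
 * and its port structure are hypothetical; netdev_is_rx_handler_busy(),
 * netdev_rx_handler_register() and netdev_rx_handler_unregister() are the
 * real entry points and must be called with the RTNL lock held.
 */
#if 0   /* illustration only */
static rx_handler_result_t myupper_handle_frame(struct sk_buff **pskb)
{
        struct sk_buff *skb = *pskb;
        struct myupper_port *port;

        /* rx_handler_data was set before rx_handler, see register below */
        port = rcu_dereference(skb->dev->rx_handler_data);

        if (myupper_steal_frame(port, skb))     /* hypothetical */
                return RX_HANDLER_CONSUMED;     /* skb already freed/queued */

        return RX_HANDLER_PASS;                 /* let the stack see it */
}

static int myupper_add_port(struct net_device *port_dev,
                            struct myupper_port *port)
{
        ASSERT_RTNL();

        if (netdev_is_rx_handler_busy(port_dev))
                return -EBUSY;

        return netdev_rx_handler_register(port_dev, myupper_handle_frame,
                                          port);
}

static void myupper_del_port(struct net_device *port_dev)
{
        ASSERT_RTNL();
        netdev_rx_handler_unregister(port_dev);
}
#endif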
3445 
3446 /*
3447  * Limit the use of PFMEMALLOC reserves to those protocols that implement
3448  * the special handling of PFMEMALLOC skbs.
3449  */
3450 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3451 {
3452         switch (skb->protocol) {
3453         case __constant_htons(ETH_P_ARP):
3454         case __constant_htons(ETH_P_IP):
3455         case __constant_htons(ETH_P_IPV6):
3456         case __constant_htons(ETH_P_8021Q):
3457         case __constant_htons(ETH_P_8021AD):
3458                 return true;
3459         default:
3460                 return false;
3461         }
3462 }
3463 
3464 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
3465 {
3466         struct packet_type *ptype, *pt_prev;
3467         rx_handler_func_t *rx_handler;
3468         struct net_device *orig_dev;
3469         struct net_device *null_or_dev;
3470         bool deliver_exact = false;
3471         int ret = NET_RX_DROP;
3472         __be16 type;
3473 
3474         net_timestamp_check(!netdev_tstamp_prequeue, skb);
3475 
3476         trace_netif_receive_skb(skb);
3477 
3478         /* if we've gotten here through NAPI, check netpoll */
3479         if (netpoll_receive_skb(skb))
3480                 goto out;
3481 
3482         orig_dev = skb->dev;
3483 
3484         skb_reset_network_header(skb);
3485         if (!skb_transport_header_was_set(skb))
3486                 skb_reset_transport_header(skb);
3487         skb_reset_mac_len(skb);
3488 
3489         pt_prev = NULL;
3490 
3491 another_round:
3492         skb->skb_iif = skb->dev->ifindex;
3493 
3494         __this_cpu_inc(softnet_data.processed);
3495 
3496         if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
3497             skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
3498                 skb = vlan_untag(skb);
3499                 if (unlikely(!skb))
3500                         goto out;
3501         }
3502 
3503 #ifdef CONFIG_NET_CLS_ACT
3504         if (skb->tc_verd & TC_NCLS) {
3505                 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3506                 goto ncls;
3507         }
3508 #endif
3509 
3510         if (pfmemalloc)
3511                 goto skip_taps;
3512 
3513         list_for_each_entry_rcu(ptype, &ptype_all, list) {
3514                 if (!ptype->dev || ptype->dev == skb->dev) {
3515                         if (pt_prev)
3516                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3517                         pt_prev = ptype;
3518                 }
3519         }
3520 
3521 skip_taps:
3522 #ifdef CONFIG_NET_CLS_ACT
3523         skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3524         if (!skb)
3525                 goto out;
3526 ncls:
3527 #endif
3528 
3529         if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
3530                 goto drop;
3531 
3532         if (vlan_tx_tag_present(skb)) {
3533                 if (pt_prev) {
3534                         ret = deliver_skb(skb, pt_prev, orig_dev);
3535                         pt_prev = NULL;
3536                 }
3537                 if (vlan_do_receive(&skb))
3538                         goto another_round;
3539                 else if (unlikely(!skb))
3540                         goto out;
3541         }
3542 
3543         rx_handler = rcu_dereference(skb->dev->rx_handler);
3544         if (rx_handler) {
3545                 if (pt_prev) {
3546                         ret = deliver_skb(skb, pt_prev, orig_dev);
3547                         pt_prev = NULL;
3548                 }
3549                 switch (rx_handler(&skb)) {
3550                 case RX_HANDLER_CONSUMED:
3551                         ret = NET_RX_SUCCESS;
3552                         goto out;
3553                 case RX_HANDLER_ANOTHER:
3554                         goto another_round;
3555                 case RX_HANDLER_EXACT:
3556                         deliver_exact = true;
3557                 case RX_HANDLER_PASS:
3558                         break;
3559                 default:
3560                         BUG();
3561                 }
3562         }
3563 
3564         if (unlikely(vlan_tx_tag_present(skb))) {
3565                 if (vlan_tx_tag_get_id(skb))
3566                         skb->pkt_type = PACKET_OTHERHOST;
3567                 /* Note: we might in the future use prio bits
3568                  * and set skb->priority like in vlan_do_receive().
3569                  * For the time being, just ignore the Priority Code Point.
3570                  */
3571                 skb->vlan_tci = 0;
3572         }
3573 
3574         /* deliver only exact match when indicated */
3575         null_or_dev = deliver_exact ? skb->dev : NULL;
3576 
3577         type = skb->protocol;
3578         list_for_each_entry_rcu(ptype,
3579                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3580                 if (ptype->type == type &&
3581                     (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3582                      ptype->dev == orig_dev)) {
3583                         if (pt_prev)
3584                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3585                         pt_prev = ptype;
3586                 }
3587         }
3588 
3589         if (pt_prev) {
3590                 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
3591                         goto drop;
3592                 else
3593                         ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3594         } else {
3595 drop:
3596                 atomic_long_inc(&skb->dev->rx_dropped);
3597                 kfree_skb(skb);
3598                 /* Jamal, now you will not be able to escape explaining
3599                  * to me how you were going to use this. :-)
3600                  */
3601                 ret = NET_RX_DROP;
3602         }
3603 
3604 out:
3605         return ret;
3606 }
3607 
3608 static int __netif_receive_skb(struct sk_buff *skb)
3609 {
3610         int ret;
3611 
3612         if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
3613                 unsigned long pflags = current->flags;
3614 
3615                 /*
3616                  * PFMEMALLOC skbs are special, they should
3617                  * - be delivered to SOCK_MEMALLOC sockets only
3618                  * - stay away from userspace
3619                  * - have bounded memory usage
3620                  *
3621                  * Use PF_MEMALLOC as this saves us from propagating the allocation
3622                  * context down to all allocation sites.
3623                  */
3624                 current->flags |= PF_MEMALLOC;
3625                 ret = __netif_receive_skb_core(skb, true);
3626                 tsk_restore_flags(current, pflags, PF_MEMALLOC);
3627         } else
3628                 ret = __netif_receive_skb_core(skb, false);
3629 
3630         return ret;
3631 }
3632 
3633 /**
3634  *      netif_receive_skb - process receive buffer from network
3635  *      @skb: buffer to process
3636  *
3637  *      netif_receive_skb() is the main receive data processing function.
3638  *      It always succeeds. The buffer may be dropped during processing
3639  *      for congestion control or by the protocol layers.
3640  *
3641  *      This function may only be called from softirq context and interrupts
3642  *      should be enabled.
3643  *
3644  *      Return values (usually ignored):
3645  *      NET_RX_SUCCESS: no congestion
3646  *      NET_RX_DROP: packet was dropped
3647  */
3648 int netif_receive_skb(struct sk_buff *skb)
3649 {
3650         int ret;
3651 
3652         net_timestamp_check(netdev_tstamp_prequeue, skb);
3653 
3654         if (skb_defer_rx_timestamp(skb))
3655                 return NET_RX_SUCCESS;
3656 
3657         rcu_read_lock();
3658 
3659 #ifdef CONFIG_RPS
3660         if (static_key_false(&rps_needed)) {
3661                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3662                 int cpu = get_rps_cpu(skb->dev, skb, &rflow);
3663 
3664                 if (cpu >= 0) {
3665                         ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3666                         rcu_read_unlock();
3667                         return ret;
3668                 }
3669         }
3670 #endif
3671         ret = __netif_receive_skb(skb);
3672         rcu_read_unlock();
3673         return ret;
3674 }
3675 EXPORT_SYMBOL(netif_receive_skb);
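/*
 * Example (not part of dev.c): a minimal sketch of a NAPI driver's per-packet
 * handoff into netif_receive_skb() above.  The ring accessor
 * "mydrv_next_rx_skb()" and "struct mydrv_priv" are hypothetical; the point
 * is that this runs from the driver's ->poll() callback in softirq context,
 * one skb at a time, optionally going through napi_gro_receive() instead.
 */
#if 0   /* illustration only */
static int mydrv_rx_poll_once(struct mydrv_priv *priv, struct napi_struct *napi)
{
        struct sk_buff *skb;

        skb = mydrv_next_rx_skb(priv);          /* hypothetical: completed skb */
        if (!skb)
                return 0;

        skb->protocol = eth_type_trans(skb, priv->netdev);
        skb_record_rx_queue(skb, priv->rx_queue_index);

        /* Either call is valid from ->poll(); GRO usually performs better. */
        if (priv->netdev->features & NETIF_F_GRO)
                napi_gro_receive(napi, skb);
        else
                netif_receive_skb(skb);

        return 1;
}
#endif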
3676 
3677 /* Network device is going away, flush any packets still pending
3678  * Called with irqs disabled.
3679  */
3680 static void flush_backlog(void *arg)
3681 {
3682         struct net_device *dev = arg;
3683         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3684         struct sk_buff *skb, *tmp;
3685 
3686         rps_lock(sd);
3687         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3688                 if (skb->dev == dev) {
3689                         __skb_unlink(skb, &sd->input_pkt_queue);
3690                         kfree_skb(skb);
3691                         input_queue_head_incr(sd);
3692                 }
3693         }
3694         rps_unlock(sd);
3695 
3696         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3697                 if (skb->dev == dev) {
3698                         __skb_unlink(skb, &sd->process_queue);
3699                         kfree_skb(skb);
3700                         input_queue_head_incr(sd);
3701                 }
3702         }
3703 }
3704 
3705 static int napi_gro_complete(struct sk_buff *skb)
3706 {
3707         struct packet_offload *ptype;
3708         __be16 type = skb->protocol;
3709         struct list_head *head = &offload_base;
3710         int err = -ENOENT;
3711 
3712         BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
3713 
3714         if (NAPI_GRO_CB(skb)->count == 1) {
3715                 skb_shinfo(skb)->gso_size = 0;
3716                 goto out;
3717         }
3718 
3719         rcu_read_lock();
3720         list_for_each_entry_rcu(ptype, head, list) {
3721                 if (ptype->type != type || !ptype->callbacks.gro_complete)
3722                         continue;
3723 
3724                 err = ptype->callbacks.gro_complete(skb);
3725                 break;
3726         }
3727         rcu_read_unlock();
3728 
3729         if (err) {
3730                 WARN_ON(&ptype->list == head);
3731                 kfree_skb(skb);
3732                 return NET_RX_SUCCESS;
3733         }
3734 
3735 out:
3736         return netif_receive_skb(skb);
3737 }
3738 
3739 /* napi->gro_list contains packets ordered by age,
3740  * with the youngest packets at its head.
3741  * Complete skbs in reverse order to reduce latencies.
3742  */
3743 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
3744 {
3745         struct sk_buff *skb, *prev = NULL;
3746 
3747         /* scan list and build reverse chain */
3748         for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3749                 skb->prev = prev;
3750                 prev = skb;
3751         }
3752 
3753         for (skb = prev; skb; skb = prev) {
3754                 skb->next = NULL;
3755 
3756                 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3757                         return;
3758 
3759                 prev = skb->prev;
3760                 napi_gro_complete(skb);
3761                 napi->gro_count--;
3762         }
3763 
3764         napi->gro_list = NULL;
3765 }
3766 EXPORT_SYMBOL(napi_gro_flush);
3767 
3768 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
3769 {
3770         struct sk_buff *p;
3771         unsigned int maclen = skb->dev->hard_header_len;
3772 
3773         for (p = napi->gro_list; p; p = p->next) {
3774                 unsigned long diffs;
3775 
3776                 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3777                 diffs |= p->vlan_tci ^ skb->vlan_tci;
3778                 if (maclen == ETH_HLEN)
3779                         diffs |= compare_ether_header(skb_mac_header(p),
3780                                                       skb_gro_mac_header(skb));
3781                 else if (!diffs)
3782                         diffs = memcmp(skb_mac_header(p),
3783                                        skb_gro_mac_header(skb),
3784                                        maclen);
3785                 NAPI_GRO_CB(p)->same_flow = !diffs;
3786                 NAPI_GRO_CB(p)->flush = 0;
3787         }
3788 }
3789 
3790 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3791 {
3792         struct sk_buff **pp = NULL;
3793         struct packet_offload *ptype;
3794         __be16 type = skb->protocol;
3795         struct list_head *head = &offload_base;
3796         int same_flow;
3797         enum gro_result ret;
3798 
3799         if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3800                 goto normal;
3801 
3802         if (skb_is_gso(skb) || skb_has_frag_list(skb))
3803                 goto normal;
3804 
3805         gro_list_prepare(napi, skb);
3806 
3807         rcu_read_lock();
3808         list_for_each_entry_rcu(ptype, head, list) {
3809                 if (ptype->type != type || !ptype->callbacks.gro_receive)
3810                         continue;
3811 
3812                 skb_set_network_header(skb, skb_gro_offset(skb));
3813                 skb_reset_mac_len(skb);
3814                 NAPI_GRO_CB(skb)->same_flow = 0;
3815                 NAPI_GRO_CB(skb)->flush = 0;
3816                 NAPI_GRO_CB(skb)->free = 0;
3817 
3818                 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
3819                 break;
3820         }
3821         rcu_read_unlock();
3822 
3823         if (&ptype->list == head)
3824                 goto normal;
3825 
3826         same_flow = NAPI_GRO_CB(skb)->same_flow;
3827         ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3828 
3829         if (pp) {
3830                 struct sk_buff *nskb = *pp;
3831 
3832                 *pp = nskb->next;
3833                 nskb->next = NULL;
3834                 napi_gro_complete(nskb);
3835                 napi->gro_count--;
3836         }
3837 
3838         if (same_flow)
3839                 goto ok;
3840 
3841         if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3842                 goto normal;
3843 
3844         napi->gro_count++;
3845         NAPI_GRO_CB(skb)->count = 1;
3846         NAPI_GRO_CB(skb)->age = jiffies;
3847         skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3848         skb->next = napi->gro_list;
3849         napi->gro_list = skb;
3850         ret = GRO_HELD;
3851 
3852 pull:
3853         if (skb_headlen(skb) < skb_gro_offset(skb)) {
3854                 int grow = skb_gro_offset(skb) - skb_headlen(skb);
3855 
3856                 BUG_ON(skb->end - skb->tail < grow);
3857 
3858                 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3859 
3860                 skb->tail += grow;
3861                 skb->data_len -= grow;
3862 
3863                 skb_shinfo(skb)->frags[0].page_offset += grow;
3864                 skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);
3865 
3866                 if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
3867                         skb_frag_unref(skb, 0);
3868                         memmove(skb_shinfo(skb)->frags,
3869                                 skb_shinfo(skb)->frags + 1,
3870                                 --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3871                 }
3872         }
3873 
3874 ok:
3875         return ret;
3876 
3877 normal:
3878         ret = GRO_NORMAL;
3879         goto pull;
3880 }
3881 
3882 
3883 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3884 {
3885         switch (ret) {
3886         case GRO_NORMAL:
3887                 if (netif_receive_skb(skb))
3888                         ret = GRO_DROP;
3889                 break;
3890 
3891         case GRO_DROP:
3892                 kfree_skb(skb);
3893                 break;
3894 
3895         case GRO_MERGED_FREE:
3896                 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
3897                         kmem_cache_free(skbuff_head_cache, skb);
3898                 else
3899                         __kfree_skb(skb);
3900                 break;
3901 
3902         case GRO_HELD:
3903         case GRO_MERGED:
3904                 break;
3905         }
3906 
3907         return ret;
3908 }
3909 
3910 static void skb_gro_reset_offset(struct sk_buff *skb)
3911 {
3912         const struct skb_shared_info *pinfo = skb_shinfo(skb);
3913         const skb_frag_t *frag0 = &pinfo->frags[0];
3914 
3915         NAPI_GRO_CB(skb)->data_offset = 0;
3916         NAPI_GRO_CB(skb)->frag0 = NULL;
3917         NAPI_GRO_CB(skb)->frag0_len = 0;
3918 
3919         if (skb->mac_header == skb->tail &&
3920             pinfo->nr_frags &&
3921             !PageHighMem(skb_frag_page(frag0))) {
3922                 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
3923                 NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int,
3924                                                     skb_frag_size(frag0),
3925                                                     skb->end - skb->tail);
3926         }
3927 }
3928 
3929 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3930 {
3931         skb_gro_reset_offset(skb);
3932 
3933         return napi_skb_finish(dev_gro_receive(napi, skb), skb);
3934 }
3935 EXPORT_SYMBOL(napi_gro_receive);
3936 
3937 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3938 {
3939         __skb_pull(skb, skb_headlen(skb));
3940         /* restore the reserve we had after netdev_alloc_skb_ip_align() */
3941         skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
3942         skb->vlan_tci = 0;
3943         skb->dev = napi->dev;
3944         skb->skb_iif = 0;
3945         skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
3946 
3947         napi->skb = skb;
3948 }
3949 
3950 struct sk_buff *napi_get_frags(struct napi_struct *napi)
3951 {
3952         struct sk_buff *skb = napi->skb;
3953 
3954         if (!skb) {
3955                 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3956                 if (skb)
3957                         napi->skb = skb;
3958         }
3959         return skb;
3960 }
3961 EXPORT_SYMBOL(napi_get_frags);
3962 
3963 static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3964                                gro_result_t ret)
3965 {
3966         switch (ret) {
3967         case GRO_NORMAL:
3968         case GRO_HELD:
3969                 skb->protocol = eth_type_trans(skb, skb->dev);
3970 
3971                 if (ret == GRO_HELD)
3972                         skb_gro_pull(skb, -ETH_HLEN);
3973                 else if (netif_receive_skb(skb))
3974                         ret = GRO_DROP;
3975                 break;
3976 
3977         case GRO_DROP:
3978         case GRO_MERGED_FREE:
3979                 napi_reuse_skb(napi, skb);
3980                 break;
3981 
3982         case GRO_MERGED:
3983                 break;
3984         }
3985 
3986         return ret;
3987 }
3988 
3989 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3990 {
3991         struct sk_buff *skb = napi->skb;
3992         struct ethhdr *eth;
3993         unsigned int hlen;
3994         unsigned int off;
3995 
3996         napi->skb = NULL;
3997 
3998         skb_reset_mac_header(skb);
3999         skb_gro_reset_offset(skb);
4000 
4001         off = skb_gro_offset(skb);
4002         hlen = off + sizeof(*eth);
4003         eth = skb_gro_header_fast(skb, off);
4004         if (skb_gro_header_hard(skb, hlen)) {
4005                 eth = skb_gro_header_slow(skb, hlen, off);
4006                 if (unlikely(!eth)) {
4007                         napi_reuse_skb(napi, skb);
4008                         skb = NULL;
4009                         goto out;
4010                 }
4011         }
4012 
4013         skb_gro_pull(skb, sizeof(*eth));
4014 
4015         /*
4016          * This works because the only protocols we care about don't require
4017          * special handling.  We'll fix it up properly at the end.
4018          */
4019         skb->protocol = eth->h_proto;
4020 
4021 out:
4022         return skb;
4023 }
4024 
4025 gro_result_t napi_gro_frags(struct napi_struct *napi)
4026 {
4027         struct sk_buff *skb = napi_frags_skb(napi);
4028 
4029         if (!skb)
4030                 return GRO_DROP;
4031 
4032         return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4033 }
4034 EXPORT_SYMBOL(napi_gro_frags);
4035 
4036 /*
4037  * net_rps_action sends any pending IPIs for RPS.
4038  * Note: called with local IRQs disabled, but exits with local IRQs enabled.
4039  */
4040 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
4041 {
4042 #ifdef CONFIG_RPS
4043         struct softnet_data *remsd = sd->rps_ipi_list;
4044 
4045         if (remsd) {
4046                 sd->rps_ipi_list = NULL;
4047 
4048                 local_irq_enable();
4049 
4050                 /* Send pending IPI's to kick RPS processing on remote cpus. */
4051                 while (remsd) {
4052                         struct softnet_data *next = remsd->rps_ipi_next;
4053 
4054                         if (cpu_online(remsd->cpu))
4055                                 __smp_call_function_single(remsd->cpu,
4056                                                            &remsd->csd, 0);
4057                         remsd = next;
4058                 }
4059         } else
4060 #endif
4061                 local_irq_enable();
4062 }
4063 
4064 static int process_backlog(struct napi_struct *napi, int quota)
4065 {
4066         int work = 0;
4067         struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4068 
4069 #ifdef CONFIG_RPS
4070         /* Check if we have pending IPIs; it's better to send them now
4071          * rather than waiting for net_rx_action() to end.
4072          */
4073         if (sd->rps_ipi_list) {
4074                 local_irq_disable();
4075                 net_rps_action_and_irq_enable(sd);
4076         }
4077 #endif
4078         napi->weight = weight_p;
4079         local_irq_disable();
4080         while (work < quota) {
4081                 struct sk_buff *skb;
4082                 unsigned int qlen;
4083 
4084                 while ((skb = __skb_dequeue(&sd->process_queue))) {
4085                         rcu_read_lock();
4086                         local_irq_enable();
4087                         __netif_receive_skb(skb);
4088                         rcu_read_unlock();
4089                         local_irq_disable();
4090                         input_queue_head_incr(sd);
4091                         if (++work >= quota) {
4092                                 local_irq_enable();
4093                                 return work;
4094                         }
4095                 }
4096 
4097                 rps_lock(sd);
4098                 qlen = skb_queue_len(&sd->input_pkt_queue);
4099                 if (qlen)
4100                         skb_queue_splice_tail_init(&sd->input_pkt_queue,
4101                                                    &sd->process_queue);
4102 
4103                 if (qlen < quota - work) {
4104                         /*
4105                          * Inline a custom version of __napi_complete().
4106                          * Only the current CPU owns and manipulates this napi,
4107                          * and NAPI_STATE_SCHED is the only possible flag set on backlog,
4108                          * so we can use a plain write instead of clear_bit(),
4109                          * and we don't need an smp_mb() memory barrier.
4110                          */
4111                         list_del(&napi->poll_list);
4112                         napi->state = 0;
4113 
4114                         quota = work + qlen;
4115                 }
4116                 rps_unlock(sd);
4117         }
4118         local_irq_enable();
4119 
4120         return work;
4121 }
4122 
4123 /**
4124  * __napi_schedule - schedule for receive
4125  * @n: entry to schedule
4126  *
4127  * The entry's receive function will be scheduled to run
4128  */
4129 void __napi_schedule(struct napi_struct *n)
4130 {
4131         unsigned long flags;
4132 
4133         local_irq_save(flags);
4134         ____napi_schedule(&__get_cpu_var(softnet_data), n);
4135         local_irq_restore(flags);
4136 }
4137 EXPORT_SYMBOL(__napi_schedule);
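/*
 * Example (not part of dev.c): a minimal sketch of the usual driver-side
 * NAPI scheduling pattern built on __napi_schedule() above.  The interrupt
 * masking helpers ("mydrv_mask_rx_irq()"/"mydrv_unmask_rx_irq()"), the ring
 * cleaner and the private structure are hypothetical; napi_schedule_prep(),
 * __napi_schedule() and napi_complete() are the real API.
 */
#if 0   /* illustration only */
static irqreturn_t mydrv_isr(int irq, void *data)
{
        struct mydrv_priv *priv = data;

        if (napi_schedule_prep(&priv->napi)) {
                mydrv_mask_rx_irq(priv);        /* hypothetical: stop RX irqs */
                __napi_schedule(&priv->napi);   /* run ->poll() from softirq */
        }
        return IRQ_HANDLED;
}

static int mydrv_poll(struct napi_struct *napi, int budget)
{
        struct mydrv_priv *priv = container_of(napi, struct mydrv_priv, napi);
        int work = mydrv_clean_rx_ring(priv, budget);   /* hypothetical */

        if (work < budget) {
                /* All done: leave polled mode and re-enable interrupts. */
                napi_complete(napi);
                mydrv_unmask_rx_irq(priv);      /* hypothetical */
        }
        return work;
}
#endif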
4138 
4139 void __napi_complete(struct napi_struct *n)
4140 {
4141         BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4142         BUG_ON(n->gro_list);
4143 
4144         list_del(&n->poll_list);
4145         smp_mb__before_clear_bit();
4146         clear_bit(NAPI_STATE_SCHED, &n->state);
4147 }
4148 EXPORT_SYMBOL(__napi_complete);
4149 
4150 void napi_complete(struct napi_struct *n)
4151 {
4152         unsigned long flags;
4153 
4154         /*
4155          * Don't let napi dequeue from the CPU poll list,
4156          * just in case it's running on a different CPU.
4157          */
4158         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4159                 return;
4160 
4161         napi_gro_flush(n, false);
4162         local_irq_save(flags);
4163         __napi_complete(n);
4164         local_irq_restore(flags);
4165 }
4166 EXPORT_SYMBOL(napi_complete);
4167 
4168 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4169                     int (*poll)(struct napi_struct *, int), int weight)
4170 {
4171         INIT_LIST_HEAD(&napi->poll_list);
4172         napi->gro_count = 0;
4173         napi->gro_list = NULL;
4174         napi->skb = NULL;
4175         napi->poll = poll;
4176         if (weight > NAPI_POLL_WEIGHT)
4177                 pr_err_once("netif_napi_add() called with weight %d on device %s\n",
4178                             weight, dev->name);
4179         napi->weight = weight;
4180         list_add(&napi->dev_list, &dev->napi_list);
4181         napi->dev = dev;
4182 #ifdef CONFIG_NETPOLL
4183         spin_lock_init(&napi->poll_lock);
4184         napi->poll_owner = -1;
4185 #endif
4186         set_bit(NAPI_STATE_SCHED, &napi->state);
4187 }
4188 EXPORT_SYMBOL(netif_napi_add);
4189 
4190 void netif_napi_del(struct napi_struct *napi)
4191 {
4192         struct sk_buff *skb, *next;
4193 
4194         list_del_init(&napi->dev_list);
4195         napi_free_frags(napi);
4196 
4197         for (skb = napi->gro_list; skb; skb = next) {
4198                 next = skb->next;
4199                 skb->next = NULL;
4200                 kfree_skb(skb);
4201         }
4202 
4203         napi->gro_list = NULL;
4204         napi->gro_count = 0;
4205 }
4206 EXPORT_SYMBOL(netif_napi_del);
4207 
4208 static void net_rx_action(struct softirq_action *h)
4209 {
4210         struct softnet_data *sd = &__get_cpu_var(softnet_data);
4211         unsigned long time_limit = jiffies + 2;
4212         int budget = netdev_budget;
4213         void *have;
4214 
4215         local_irq_disable();
4216 
4217         while (!list_empty(&sd->poll_list)) {
4218                 struct napi_struct *n;
4219                 int work, weight;
4220 
4221                 /* If the softirq window is exhausted then punt.
4222                  * Allow this to run for 2 jiffies, which allows
4223                  * an average latency of 1.5/HZ.
4224                  */
4225                 if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit)))
4226                         goto softnet_break;
4227 
4228                 local_irq_enable();
4229 
4230                 /* Even though interrupts have been re-enabled, this
4231                  * access is safe because interrupts can only add new
4232                  * entries to the tail of this list, and only ->poll()
4233                  * calls can remove this head entry from the list.
4234                  */
4235                 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
4236 
4237                 have = netpoll_poll_lock(n);
4238 
4239                 weight = n->weight;
4240 
4241                 /* This NAPI_STATE_SCHED test is for avoiding a race
4242                  * with netpoll's poll_napi().  Only the entity which
4243                  * obtains the lock and sees NAPI_STATE_SCHED set will
4244                  * actually make the ->poll() call.  Therefore we avoid
4245                  * accidentally calling ->poll() when NAPI is not scheduled.
4246                  */
4247                 work = 0;
4248                 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
4249                         work = n->poll(n, weight);
4250                         trace_napi_poll(n);
4251                 }
4252 
4253                 WARN_ON_ONCE(work > weight);
4254 
4255                 budget -= work;
4256 
4257                 local_irq_disable();
4258 
4259                 /* Drivers must not modify the NAPI state if they
4260                  * consume the entire weight.  In such cases this code
4261                  * still "owns" the NAPI instance and therefore can
4262                  * move the instance around on the list at-will.
4263                  */
4264                 if (unlikely(work == weight)) {
4265                         if (unlikely(napi_disable_pending(n))) {
4266                                 local_irq_enable();
4267                                 napi_complete(n);
4268                                 local_irq_disable();
4269                         } else {
4270                                 if (n->gro_list) {
4271                                         /* flush too old packets
4272                                          * If HZ < 1000, flush all packets.
4273                                          */
4274                                         local_irq_enable();
4275                                         napi_gro_flush(n, HZ >= 1000);
4276                                         local_irq_disable();
4277                                 }
4278                                 list_move_tail(&n->poll_list, &sd->poll_list);
4279                         }
4280                 }
4281 
4282                 netpoll_poll_unlock(have);
4283         }
4284 out:
4285         net_rps_action_and_irq_enable(sd);
4286 
4287 #ifdef CONFIG_NET_DMA
4288         /*
4289          * There may not be any more sk_buffs coming right now, so push
4290          * any pending DMA copies to hardware
4291          */
4292         dma_issue_pending_all();
4293 #endif
4294 
4295         return;
4296 
4297 softnet_break:
4298         sd->time_squeeze++;
4299         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4300         goto out;
4301 }
4302 
4303 struct netdev_upper {
4304         struct net_device *dev;
4305         bool master;
4306         struct list_head list;
4307         struct rcu_head rcu;
4308         struct list_head search_list;
4309 };
4310 
4311 static void __append_search_uppers(struct list_head *search_list,
4312                                    struct net_device *dev)
4313 {
4314         struct netdev_upper *upper;
4315 
4316         list_for_each_entry(upper, &dev->upper_dev_list, list) {
4317                 /* check if this upper is not already in search list */
4318                 if (list_empty(&upper->search_list))
4319                         list_add_tail(&upper->search_list, search_list);
4320         }
4321 }
4322 
4323 static bool __netdev_search_upper_dev(struct net_device *dev,
4324                                       struct net_device *upper_dev)
4325 {
4326         LIST_HEAD(search_list);
4327         struct netdev_upper *upper;
4328         struct netdev_upper *tmp;
4329         bool ret = false;
4330 
4331         __append_search_uppers(&search_list, dev);
4332         list_for_each_entry(upper, &search_list, search_list) {
4333                 if (upper->dev == upper_dev) {
4334                         ret = true;
4335                         break;
4336                 }
4337                 __append_search_uppers(&search_list, upper->dev);
4338         }
4339         list_for_each_entry_safe(upper, tmp, &search_list, search_list)
4340                 INIT_LIST_HEAD(&upper->search_list);
4341         return ret;
4342 }
4343 
4344 static struct netdev_upper *__netdev_find_upper(struct net_device *dev,
4345                                                 struct net_device *upper_dev)
4346 {
4347         struct netdev_upper *upper;
4348 
4349         list_for_each_entry(upper, &dev->upper_dev_list, list) {
4350                 if (upper->dev == upper_dev)
4351                         return upper;
4352         }
4353         return NULL;
4354 }
4355 
4356 /**
4357  * netdev_has_upper_dev - Check if device is linked to an upper device
4358  * @dev: device
4359  * @upper_dev: upper device to check
4360  *
4361  * Find out if a device is linked to the specified upper device and return true
4362  * if it is. Note that this checks only the immediate upper device,
4363  * not the complete stack of devices. The caller must hold the RTNL lock.
4364  */
4365 bool netdev_has_upper_dev(struct net_device *dev,
4366                           struct net_device *upper_dev)
4367 {
4368         ASSERT_RTNL();
4369 
4370         return __netdev_find_upper(dev, upper_dev);
4371 }
4372 EXPORT_SYMBOL(netdev_has_upper_dev);
4373 
4374 /**
4375  * netdev_has_any_upper_dev - Check if device is linked to some device
4376  * @dev: device
4377  *
4378  * Find out if a device is linked to an upper device and return true in case
4379  * it is. The caller must hold the RTNL lock.
4380  */
4381 bool netdev_has_any_upper_dev(struct net_device *dev)
4382 {
4383         ASSERT_RTNL();
4384 
4385         return !list_empty(&dev->upper_dev_list);
4386 }
4387 EXPORT_SYMBOL(netdev_has_any_upper_dev);
4388 
4389 /**
4390  * netdev_master_upper_dev_get - Get master upper device
4391  * @dev: device
4392  *
4393  * Find a master upper device and return a pointer to it, or NULL if
4394  * there is none. The caller must hold the RTNL lock.
4395  */
4396 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4397 {
4398         struct netdev_upper *upper;
4399 
4400         ASSERT_RTNL();
4401 
4402         if (list_empty(&dev->upper_dev_list))
4403                 return NULL;
4404 
4405         upper = list_first_entry(&dev->upper_dev_list,
4406                                  struct netdev_upper, list);
4407         if (likely(upper->master))
4408                 return upper->dev;
4409         return NULL;
4410 }
4411 EXPORT_SYMBOL(netdev_master_upper_dev_get);
4412 
4413 /**
4414  * netdev_master_upper_dev_get_rcu - Get master upper device
4415  * @dev: device
4416  *
4417  * Find a master upper device and return a pointer to it, or NULL if
4418  * there is none. The caller must hold the RCU read lock.
4419  */
4420 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
4421 {
4422         struct netdev_upper *upper;
4423 
4424         upper = list_first_or_null_rcu(&dev->upper_dev_list,
4425                                        struct netdev_upper, list);
4426         if (upper && likely(upper->master))
4427                 return upper->dev;
4428         return NULL;
4429 }
4430 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
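/*
 * Example (not part of dev.c): a minimal sketch of looking up a port's
 * master with the helper above.  "port_dev" and the calling context are
 * hypothetical; the plain netdev_master_upper_dev_get() variant is meant
 * for RTNL-protected control paths, while the _rcu variant shown here is
 * for data-path use under rcu_read_lock().
 */
#if 0   /* illustration only */
static bool port_has_master(struct net_device *port_dev)
{
        struct net_device *master;
        bool ret;

        rcu_read_lock();
        master = netdev_master_upper_dev_get_rcu(port_dev);
        ret = master != NULL;
        rcu_read_unlock();

        return ret;
}
#endif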
4431 
4432 static int __netdev_upper_dev_link(struct net_device *dev,
4433                                    struct net_device *upper_dev, bool master)
4434 {
4435         struct netdev_upper *upper;
4436 
4437         ASSERT_RTNL();
4438 
4439         if (dev == upper_dev)
4440                 return -EBUSY;
4441 
4442         /* To prevent loops, check that dev is not an upper device of upper_dev. */
4443         if (__netdev_search_upper_dev(upper_dev, dev))
4444                 return -EBUSY;
4445 
4446         if (__netdev_find_upper(dev, upper_dev))
4447                 return -EEXIST;
4448 
4449         if (master && netdev_master_upper_dev_get(dev))
4450                 return -EBUSY;
4451 
4452         upper = kmalloc(sizeof(*upper), GFP_KERNEL);
4453         if (!upper)
4454                 return -ENOMEM;
4455 
4456         upper->dev = upper_dev;
4457         upper->master = master;
4458         INIT_LIST_HEAD(&upper->search_list);
4459 
4460         /* Ensure that master upper link is always the first item in list. */
4461         if (master)
4462                 list_add_rcu(&upper->list, &dev->upper_dev_list);
4463         else
4464                 list_add_tail_rcu(&upper->list, &dev->upper_dev_list);
4465         dev_hold(upper_dev);
4466 
4467         return 0;
4468 }
4469 
4470 /**
4471  * netdev_upper_dev_link - Add a link to the upper device
4472  * @dev: device
4473  * @upper_dev: new upper device
4474  *
4475  * Adds a link to a device which is upper to this one. The caller must hold
4476  * the RTNL lock. On failure a negative errno code is returned.
4477  * On success the reference counts are adjusted and the function
4478  * returns zero.
4479  */
4480 int netdev_upper_dev_link(struct net_device *dev,
4481                           struct net_device *upper_dev)
4482 {
4483         return __netdev_upper_dev_link(dev, upper_dev, false);
4484 }
4485 EXPORT_SYMBOL(netdev_upper_dev_link);
4486 
4487 /**
4488  * netdev_master_upper_dev_link - Add a master link to the upper device
4489  * @dev: device
4490  * @upper_dev: new upper device
4491  *
4492  * Adds a link to a device which is upper to this one. In this case, only
4493  * one master upper device can be linked, although other non-master devices
4494  * might be linked as well. The caller must hold the RTNL lock.
4495  * On a failure a negative errno code is returned. On success the reference
4496  * counts are adjusted and the function returns zero.
4497  */
4498 int netdev_master_upper_dev_link(struct net_device *dev,
4499                                  struct net_device *upper_dev)
4500 {
4501         return __netdev_upper_dev_link(dev, upper_dev, true);
4502 }
4503 EXPORT_SYMBOL(netdev_master_upper_dev_link);
4504 
4505 /**
4506  * netdev_upper_dev_unlink - Removes a link to upper device
4507  * @dev: device
4508  * @upper_dev: upper device to unlink
4509  *
4510  * Removes a link to a device which is upper to this one. The caller must hold
4511  * the RTNL lock.
4512  */
4513 void netdev_upper_dev_unlink(struct net_device *dev,
4514                              struct net_device *upper_dev)
4515 {
4516         struct netdev_upper *upper;
4517 
4518         ASSERT_RTNL();
4519 
4520         upper = __netdev_find_upper(dev, upper_dev);
4521         if (!upper)
4522                 return;
4523         list_del_rcu(&upper->list);
4524         dev_put(upper_dev);
4525         kfree_rcu(upper, rcu);
4526 }
4527 EXPORT_SYMBOL(netdev_upper_dev_unlink);
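/*
 * Example (not part of dev.c): a minimal sketch of a bonding/team-like
 * master enslaving and releasing a port with the helpers above.  The
 * "myagg" device and functions are hypothetical; the calls must be made
 * with the RTNL lock held, and netdev_master_upper_dev_link() refuses a
 * second master or a link that would create a loop.
 */
#if 0   /* illustration only */
static int myagg_enslave(struct net_device *agg_dev,
                         struct net_device *port_dev)
{
        int err;

        ASSERT_RTNL();

        if (netdev_has_upper_dev(port_dev, agg_dev))
                return -EEXIST;         /* already linked to this master */

        /* port_dev becomes a lower device, agg_dev its (single) master */
        err = netdev_master_upper_dev_link(port_dev, agg_dev);
        if (err)
                return err;

        /* ... program hardware, copy MAC address, etc. ... */
        return 0;
}

static void myagg_release(struct net_device *agg_dev,
                          struct net_device *port_dev)
{
        ASSERT_RTNL();
        netdev_upper_dev_unlink(port_dev, agg_dev);
}
#endif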
4528 
4529 static void dev_change_rx_flags(struct net_device *dev, int flags)
4530 {
4531         const struct net_device_ops *ops = dev->netdev_ops;
4532 
4533         if (ops->ndo_change_rx_flags)
4534                 ops->ndo_change_rx_flags(dev, flags);
4535 }
4536 
4537 static int __dev_set_promiscuity(struct net_device *dev, int inc)
4538 {
4539         unsigned int old_flags = dev->flags;
4540         kuid_t uid;
4541         kgid_t gid;
4542 
4543         ASSERT_RTNL();
4544 
4545         dev->flags |= IFF_PROMISC;
4546         dev->promiscuity += inc;
4547         if (dev->promiscuity == 0) {
4548                 /*
4549                  * Avoid overflow.
4550                  * If inc would cause an overflow, leave promiscuity untouched and return an error.
4551                  */
4552                 if (inc < 0)
4553                         dev->flags &= ~IFF_PROMISC;
4554                 else {
4555                         dev->promiscuity -= inc;
4556                         pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
4557                                 dev->name);
4558                         return -EOVERFLOW;
4559                 }
4560         }
4561         if (dev->flags != old_flags) {
4562                 pr_info("device %s %s promiscuous mode\n",
4563                         dev->name,
4564                         dev->flags & IFF_PROMISC ? "entered" : "left");
4565                 if (audit_enabled) {
4566                         current_uid_gid(&uid, &gid);
4567                         audit_log(current->audit_context, GFP_ATOMIC,
4568                                 AUDIT_ANOM_PROMISCUOUS,
4569                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4570                                 dev->name, (dev->flags & IFF_PROMISC),
4571                                 (old_flags & IFF_PROMISC),
4572                                 from_kuid(&init_user_ns, audit_get_loginuid(current)),
4573                                 from_kuid(&init_user_ns, uid),
4574                                 from_kgid(&init_user_ns, gid),
4575                                 audit_get_sessionid(current));
4576                 }
4577 
4578                 dev_change_rx_flags(dev, IFF_PROMISC);
4579         }
4580         return 0;
4581 }
4582 
4583 /**
4584  *      dev_set_promiscuity     - update promiscuity count on a device
4585  *      @dev: device
4586  *      @inc: modifier
4587  *
4588  *      Add or remove promiscuity from a device. While the count in the device
4589  *      remains above zero the interface remains promiscuous. Once it hits zero
4590  *      the device reverts back to normal filtering operation. A negative inc
4591  *      value is used to drop promiscuity on the device.
4592  *      Return 0 if successful or a negative errno code on error.
4593  */
4594 int dev_set_promiscuity(struct net_device *dev, int inc)
4595 {
4596         unsigned int old_flags = dev->flags;
4597         int err;
4598 
4599         err = __dev_set_promiscuity(dev, inc);
4600         if (err < 0)
4601                 return err;
4602         if (dev->flags != old_flags)
4603                 dev_set_rx_mode(dev);
4604         return err;
4605 }
4606 EXPORT_SYMBOL(dev_set_promiscuity);
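
/* Illustrative sketch (hypothetical caller, not part of this file): a
 * subsystem that needs to see every frame on a device takes one
 * promiscuity reference while active and drops it on shutdown.  The
 * device only leaves promiscuous mode once all such users have dropped
 * their references; RTNL must be held across the calls.
 */
static int example_capture_start(struct net_device *dev)
{
        int err;

        rtnl_lock();
        err = dev_set_promiscuity(dev, 1);
        rtnl_unlock();
        return err;
}

static void example_capture_stop(struct net_device *dev)
{
        rtnl_lock();
        dev_set_promiscuity(dev, -1);
        rtnl_unlock();
}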
4607 
4608 /**
4609  *      dev_set_allmulti        - update allmulti count on a device
4610  *      @dev: device
4611  *      @inc: modifier
4612  *
4613  *      Add or remove reception of all multicast frames on a device. While the
4614  *      count in the device remains above zero the interface keeps receiving
4615  *      all multicast frames. Once it hits zero the device reverts back to normal
4616  *      filtering operation. A negative @inc value is used to drop the counter
4617  *      when releasing a resource needing all multicasts.
4618  *      Return 0 if successful or a negative errno code on error.
4619  */
4620 
4621 int dev_set_allmulti(struct net_device *dev, int inc)
4622 {
4623         unsigned int old_flags = dev->flags;
4624 
4625         ASSERT_RTNL();
4626 
4627         dev->flags |= IFF_ALLMULTI;
4628         dev->allmulti += inc;
4629         if (dev->allmulti == 0) {
4630                 /*
4631                  * Avoid overflow.
4632                  * If inc causes overflow, leave allmulti untouched and return an error.
4633                  */
4634                 if (inc < 0)
4635                         dev->flags &= ~IFF_ALLMULTI;
4636                 else {
4637                         dev->allmulti -= inc;
4638                         pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
4639                                 dev->name);
4640                         return -EOVERFLOW;
4641                 }
4642         }
4643         if (dev->flags ^ old_flags) {
4644                 dev_change_rx_flags(dev, IFF_ALLMULTI);
4645                 dev_set_rx_mode(dev);
4646         }
4647         return 0;
4648 }
4649 EXPORT_SYMBOL(dev_set_allmulti);
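
/* Illustrative sketch (hypothetical caller, not part of this file): the
 * allmulti counter is used the same way as the promiscuity counter,
 * e.g. by a multicast routing user that must receive every multicast
 * frame while it is active.  RTNL must be held, as enforced by the
 * ASSERT_RTNL() above.
 */
static int example_mcast_router_attach(struct net_device *dev)
{
        int err;

        rtnl_lock();
        err = dev_set_allmulti(dev, 1);
        rtnl_unlock();
        return err;
}

static void example_mcast_router_detach(struct net_device *dev)
{
        rtnl_lock();
        dev_set_allmulti(dev, -1);
        rtnl_unlock();
}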
4650 
4651 /*
4652  *      Upload unicast and multicast address lists to device and
4653  *      configure RX filtering. When the device doesn't support unicast
4654  *      filtering it is put in promiscuous mode while unicast addresses
4655  *      are present.
4656  */
4657 void __dev_set_rx_mode(struct net_device *dev)
4658 {
4659         const struct net_device_ops *ops = dev->netdev_ops;
4660 
4661         /* dev_open will call this function so the list will stay sane. */
4662         if (!(dev->flags&IFF_UP))
4663                 return;
4664 
4665         if (!netif_device_present(dev))
4666                 return;
4667 
4668         if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
4669         /* Unicast address changes may only happen under the rtnl,
4670                  * therefore calling __dev_set_promiscuity here is safe.
4671                  */
4672                 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4673                         __dev_set_promiscuity(dev, 1);
4674                         dev->uc_promisc = true;
4675                 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4676                         __dev_set_promiscuity(dev, -1);
4677                         dev->uc_promisc = false;
4678                 }
4679         }
4680 
4681         if (ops->ndo_set_rx_mode)
4682                 ops->ndo_set_rx_mode(dev);
4683 }
4684 EXPORT_SYMBOL(__dev_set_rx_mode);
4685 
4686 void dev_set_rx_mode(struct net_device *dev)
4687 {
4688         netif_addr_lock_bh(dev);
4689         __dev_set_rx_mode(dev);
4690         netif_addr_unlock_bh(dev);
4691 }
4692 
4693 /**
4694  *      dev_get_flags - get flags reported to userspace
4695  *      @dev: device
4696  *
4697  *      Get the combination of flag bits exported through APIs to userspace.
4698  */
4699 unsigned int dev_get_flags(const struct net_device *dev)
4700 {
4701         unsigned int flags;
4702 
4703         flags = (dev->flags & ~(IFF_PROMISC |
4704                                 IFF_ALLMULTI |
4705                                 IFF_RUNNING |
4706                                 IFF_LOWER_UP |
4707                                 IFF_DORMANT)) |
4708                 (dev->gflags & (IFF_PROMISC |
4709                                 IFF_ALLMULTI));
4710 
4711         if (netif_running(dev)) {
4712                 if (netif_oper_up(dev))
4713                         flags |= IFF_RUNNING;
4714                 if (netif_carrier_ok(dev))
4715                         flags |= IFF_LOWER_UP;
4716                 if (netif_dormant(dev))
4717                         flags |= IFF_DORMANT;
4718         }
4719 
4720         return flags;
4721 }
4722 EXPORT_SYMBOL(dev_get_flags);
4723 
4724 int __dev_change_flags(struct net_device *dev, unsigned int flags)
4725 {
4726         unsigned int old_flags = dev->flags;
4727         int ret;
4728 
4729         ASSERT_RTNL();
4730 
4731         /*
4732          *      Set the flags on our device.
4733          */
4734 
4735         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4736                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4737                                IFF_AUTOMEDIA)) |
4738                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4739                                     IFF_ALLMULTI));
4740 
4741         /*
4742          *      Load in the correct multicast list now the flags have changed.
4743          */
4744 
4745         if ((old_flags ^ flags) & IFF_MULTICAST)
4746                 dev_change_rx_flags(dev, IFF_MULTICAST);
4747 
4748         dev_set_rx_mode(dev);
4749 
4750         /*
4751          *      Have we downed the interface? We handle IFF_UP ourselves
4752          *      according to user attempts to set it, rather than blindly
4753          *      setting it.
4754          */
4755 
4756         ret = 0;
4757         if ((old_flags ^ flags) & IFF_UP) {     /* Bit is different  ? */
4758                 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4759 
4760                 if (!ret)
4761                         dev_set_rx_mode(dev);
4762         }
4763 
4764         if ((flags ^ dev->gflags) & IFF_PROMISC) {
4765                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
4766 
4767                 dev->gflags ^= IFF_PROMISC;
4768                 dev_set_promiscuity(dev, inc);
4769         }
4770 
4771         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4772            is important. Some (broken) drivers set IFF_PROMISC when
4773            IFF_ALLMULTI is requested, without asking us and without reporting it.
4774          */
4775         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4776                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4777 
4778                 dev->gflags ^= IFF_ALLMULTI;
4779                 dev_set_allmulti(dev, inc);
4780         }
4781 
4782         return ret;
4783 }
4784 
4785 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4786 {
4787         unsigned int changes = dev->flags ^ old_flags;
4788 
4789         if (changes & IFF_UP) {
4790                 if (dev->flags & IFF_UP)
4791                         call_netdevice_notifiers(NETDEV_UP, dev);
4792                 else
4793                         call_netdevice_notifiers(NETDEV_DOWN, dev);
4794         }
4795 
4796         if (dev->flags & IFF_UP &&
4797             (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4798                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
4799 }
4800 
4801 /**
4802  *      dev_change_flags - change device settings
4803  *      @dev: device
4804  *      @flags: device state flags
4805  *
4806  *      Change settings on a device based on the given state flags. The
4807  *      flags are in the userspace exported format.
4808  */
4809 int dev_change_flags(struct net_device *dev, unsigned int flags)
4810 {
4811         int ret;
4812         unsigned int changes, old_flags = dev->flags;
4813 
4814         ret = __dev_change_flags(dev, flags);
4815         if (ret < 0)
4816                 return ret;
4817 
4818         changes = old_flags ^ dev->flags;
4819         if (changes)
4820                 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4821 
4822         __dev_notify_flags(dev, old_flags);
4823         return ret;
4824 }
4825 EXPORT_SYMBOL(dev_change_flags);
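
/* Illustrative sketch (hypothetical caller, not part of this file):
 * bringing an interface administratively up by adding IFF_UP to the
 * flags reported by dev_get_flags(), roughly what the SIOCSIFFLAGS
 * ioctl path does.  RTNL must be held around dev_change_flags().
 */
static int example_bring_up(struct net_device *dev)
{
        int err;

        rtnl_lock();
        err = dev_change_flags(dev, dev_get_flags(dev) | IFF_UP);
        rtnl_unlock();
        return err;
}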
4826 
4827 /**
4828  *      dev_set_mtu - Change maximum transfer unit
4829  *      @dev: device
4830  *      @new_mtu: new transfer unit
4831  *
4832  *      Change the maximum transfer size of the network device.
4833  */
4834 int dev_set_mtu(struct net_device *dev, int new_mtu)
4835 {
4836         const struct net_device_ops *ops = dev->netdev_ops;
4837         int err;
4838 
4839         if (new_mtu == dev->mtu)
4840                 return 0;
4841 
4842         /*      MTU must not be negative.    */
4843         if (new_mtu < 0)
4844                 return -EINVAL;
4845 
4846         if (!netif_device_present(dev))
4847                 return -ENODEV;
4848 
4849         err = 0;
4850         if (ops->ndo_change_mtu)
4851                 err = ops->ndo_change_mtu(dev, new_mtu);
4852         else
4853                 dev->mtu = new_mtu;
4854 
4855         if (!err)
4856                 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4857         return err;
4858 }
4859 EXPORT_SYMBOL(dev_set_mtu);
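
/* Illustrative sketch (hypothetical caller, not part of this file):
 * requesting a jumbo MTU.  The value 9000 is only an example; whether
 * it is accepted depends on the driver's ndo_change_mtu()
 * implementation.  The existing ioctl and rtnetlink callers hold RTNL
 * around dev_set_mtu().
 */
static int example_enable_jumbo_frames(struct net_device *dev)
{
        int err;

        rtnl_lock();
        err = dev_set_mtu(dev, 9000);
        rtnl_unlock();
        return err;
}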
4860 
4861 /**
4862  *      dev_set_group - Change group this device belongs to
4863  *      @dev: device
4864  *      @new_group: group this device should belong to
4865  */
4866 void dev_set_group(struct net_device *dev, int new_group)
4867 {
4868         dev->group = new_group;
4869 }
4870 EXPORT_SYMBOL(dev_set_group);
4871 
4872 /**
4873  *      dev_set_mac_address - Change Media Access Control Address
4874  *      @dev: device
4875  *      @sa: new address
4876  *
4877  *      Change the hardware (MAC) address of the device
4878  */
4879 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4880 {
4881         const struct net_device_ops *ops = dev->netdev_ops;
4882         int err;
4883 
4884         if (!ops->ndo_set_mac_address)
4885                 return -EOPNOTSUPP;
4886         if (sa->sa_family != dev->type)
4887                 return -EINVAL;
4888         if (!netif_device_present(dev))
4889                 return -ENODEV;
4890         err = ops->ndo_set_mac_address(dev, sa);
4891         if (err)
4892                 return err;
4893         dev->addr_assign_type = NET_ADDR_SET;
4894         call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4895         add_device_randomness(dev->dev_addr, dev->addr_len);
4896         return 0;
4897 }
4898 EXPORT_SYMBOL(dev_set_mac_address);
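
/* Illustrative sketch (hypothetical caller, not part of this file):
 * programming a new Ethernet MAC address.  The sockaddr family must
 * match dev->type, and the existing callers hold RTNL around the call.
 * ARPHRD_ETHER and ETH_ALEN come from the usual if_arp.h/if_ether.h
 * headers.
 */
static int example_set_mac(struct net_device *dev, const u8 *mac)
{
        struct sockaddr sa;
        int err;

        if (dev->type != ARPHRD_ETHER || dev->addr_len != ETH_ALEN)
                return -EINVAL;

        sa.sa_family = dev->type;
        memcpy(sa.sa_data, mac, ETH_ALEN);

        rtnl_lock();
        err = dev_set_mac_address(dev, &sa);
        rtnl_unlock();
        return err;
}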
4899 
4900 /**
4901  *      dev_change_carrier - Change device carrier
4902  *      @dev: device
4903  *      @new_carrier: new value
4904  *
4905  *      Change device carrier
4906  */
4907 int dev_change_carrier(struct net_device *dev, bool new_carrier)
4908 {
4909         const struct net_device_ops *ops = dev->netdev_ops;
4910 
4911         if (!ops->ndo_change_carrier)
4912                 return -EOPNOTSUPP;
4913         if (!netif_device_present(dev))
4914                 return -ENODEV;
4915         return ops->ndo_change_carrier(dev, new_carrier);
4916 }
4917 EXPORT_SYMBOL(dev_change_carrier);
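
/* Illustrative sketch (hypothetical caller, not part of this file):
 * forcing carrier off on a device whose driver implements
 * ndo_change_carrier, which is roughly what the IFLA_CARRIER
 * rtnetlink attribute boils down to.  Called with RTNL held.
 */
static int example_force_carrier_off(struct net_device *dev)
{
        int err;

        rtnl_lock();
        err = dev_change_carrier(dev, false);
        rtnl_unlock();
        return err;
}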
4918 
4919 /**
4920  *      dev_new_index   -       allocate an ifindex
4921  *      @net: the applicable net namespace
4922  *
4923  *      Returns a suitable unique value for a new device interface
4924  *      number.  The caller must hold the rtnl semaphore or the
4925  *      dev_base_lock to be sure it remains unique.
4926  */
4927 static int dev_new_index(struct net *net)
4928 {
4929         int ifindex = net->ifindex;
4930         for (;;) {
4931                 if (++ifindex <= 0)
4932                         ifindex = 1;
4933                 if (!__dev_get_by_index(net, ifindex))
4934                         return net->ifindex = ifindex;
4935         }
4936 }
4937 
4938 /* Delayed registration/unregistration */
4939 static LIST_HEAD(net_todo_list);
4940 
4941 static void net_set_todo(struct net_device *dev)
4942 {
4943         list_add_tail(&dev->todo_list, &net_todo_list);
4944 }
4945 
4946 static void rollback_registered_many(struct list_head *head)
4947 {
4948         struct net_device *dev, *tmp;
4949 
4950         BUG_ON(dev_boot_phase);
4951         ASSERT_RTNL();
4952 
4953         list_for_each_entry_safe(dev, tmp, head, unreg_list) {
4954                 /* Some devices get unregistered here without ever having been
4955                  * registered, as part of initialization error unwinding.
4956                  * Remove those devices and proceed with the remaining.
4957                  */
4958                 if (dev->reg_state == NETREG_UNINITIALIZED) {
4959                         pr_debug("unregister_netdevice: device %s/%p never was registered\n",
4960                                  dev->name, dev);
4961 
4962                         WARN_ON(1);
4963                         list_del(&dev->unreg_list);
4964                         continue;
4965                 }
4966                 dev->dismantle = true;
4967                 BUG_ON(dev->reg_state != NETREG_REGISTERED);
4968         }
4969 
4970         /* If device is running, close it first. */
4971         dev_close_many(head);
4972 
4973         list_for_each_entry(dev, head, unreg_list) {
4974                 /* And unlink it from device chain. */
4975                 unlist_netdevice(dev);
4976 
4977                 dev->reg_state = NETREG_UNREGISTERING;
4978         }
4979 
4980         synchronize_net();
4981 
4982         list_for_each_entry(dev, head, unreg_list) {
4983                 /* Shutdown queueing discipline. */
4984                 dev_shutdown(dev);
4985 
4986 
4987                 /* Notify protocols that we are about to destroy
4988                    this device. They should clean up all of their state.
4989                 */
4990                 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4991 
4992                 if (!dev->rtnl_link_ops ||
4993                     dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
4994                         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
4995 
4996                 /*
4997                  *      Flush the unicast and multicast chains
4998                  */
4999                 dev_uc_flush(dev);
5000                 dev_mc_flush(dev);
5001 
5002                 if (dev->netdev_ops->ndo_uninit)
5003                         dev->netdev_ops->ndo_uninit(dev);
5004 
5005                 /* The notifier chain MUST have detached all upper devices from us. */
5006                 WARN_ON(netdev_has_any_upper_dev(dev));
5007 
5008                 /* Remove entries from kobject tree */
5009                 netdev_unregister_kobject(dev);
5010 #ifdef CONFIG_XPS
5011                 /* Remove XPS queueing entries */
5012                 netif_reset_xps_queues_gt(dev, 0);
5013 #endif
5014         }
5015 
5016         synchronize_net();
5017 
5018         list_for_each_entry(dev, head, unreg_list)
5019                 dev_put(dev);
5020 }
5021 
5022 static void rollback_registered(struct net_device *dev)
5023 {
5024         LIST_HEAD(single);
5025 
5026         list_add(&dev->unreg_list, &single);
5027         rollback_registered_many(&single);
5028         list_del(&single);
5029 }
5030 
5031 static netdev_features_t netdev_fix_features(struct net_device *dev,
5032         netdev_features_t features)
5033 {
5034         /* Fix illegal checksum combinations */
5035         if ((features & NETIF_F_HW_CSUM) &&
5036             (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5037                 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5038                 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5039         }
5040 
5041         /* TSO requires that SG is present as well. */
5042         if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5043                 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5044                 features &= ~NETIF_F_ALL_TSO;
5045         }
5046 
5047         if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
5048                                         !(features & NETIF_F_IP_CSUM)) {
5049                 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
5050                 features &= ~NETIF_F_TSO;
5051                 features &= ~NETIF_F_TSO_ECN;
5052         }
5053 
5054         if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
5055                                          !(features & NETIF_F_IPV6_CSUM)) {
5056                 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
5057                 features &= ~NETIF_F_TSO6;
5058         }
5059 
5060         /* TSO ECN requires that TSO is present as well. */
5061         if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5062                 features &= ~NETIF_F_TSO_ECN;
5063 
5064         /* Software GSO depends on SG. */
5065         if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5066                 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5067                 features &= ~NETIF_F_GSO;
5068         }
5069 
5070         /* UFO needs SG and checksumming */
5071         if (features & NETIF_F_UFO) {
5072                 /* maybe split UFO into V4 and V6? */
5073                 if (!((features & NETIF_F_GEN_CSUM) ||
5074                     (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5075                             == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5076                         netdev_dbg(dev,
5077                                 "Dropping NETIF_F_UFO since no checksum offload features.\n");
5078                         features &= ~NETIF_F_UFO;
5079                 }
5080 
5081                 if (!(features & NETIF_F_SG)) {
5082                         netdev_dbg(dev,
5083                                 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5084                         features &= ~NETIF_F_UFO;
5085                 }
5086         }
5087 
5088         return features;
5089 }
5090 
5091 int __netdev_update_features(struct net_device *dev)
5092 {
5093         netdev_features_t features;
5094         int err = 0;
5095 
5096         ASSERT_RTNL();
5097 
5098         features = netdev_get_wanted_features(dev);
5099 
5100         if (dev->netdev_ops->ndo_fix_features)
5101                 features = dev->netdev_ops->ndo_fix_features(dev, features);
5102 
5103         /* driver might be less strict about feature dependencies */
5104         features = netdev_fix_features(dev, features);
5105 
5106         if (dev->features == features)
5107                 return 0;
5108 
5109         netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
5110                 &dev->features, &features);
5111 
5112         if (dev->netdev_ops->ndo_set_features)
5113                 err = dev->netdev_ops->ndo_set_features(dev, features);
5114 
5115         if (unlikely(err < 0)) {
5116                 netdev_err(dev,
5117                         "set_features() failed (%d); wanted %pNF, left %pNF\n",
5118                         err, &features, &dev->features);
5119                 return -1;
5120         }
5121 
5122         if (!err)
5123                 dev->features = features;
5124 
5125         return 1;
5126 }
5127 
5128 /**
5129  *      netdev_update_features - recalculate device features
5130  *      @dev: the device to check
5131  *
5132  *      Recalculate dev->features set and send notifications if it
5133  *      has changed. Should be called after driver or hardware dependent
5134  *      conditions might have changed that influence the features.
5135  */
5136 void netdev_update_features(struct net_device *dev)
5137 {
5138         if (__netdev_update_features(dev))
5139                 netdev_features_change(dev);
5140 }
5141 EXPORT_SYMBOL(netdev_update_features);
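
/* Illustrative sketch (hypothetical driver code, not part of this file):
 * the usual pattern is for a driver to mask out features it cannot
 * currently support in its ndo_fix_features() callback and to call
 * netdev_update_features() whenever the underlying condition changes.
 * The private struct and the "tso_usable" flag are made up for the
 * example.
 */
struct example_feat_priv {
        bool tso_usable;
};

static netdev_features_t example_fix_features(struct net_device *dev,
                                              netdev_features_t features)
{
        struct example_feat_priv *priv = netdev_priv(dev);

        if (!priv->tso_usable)
                features &= ~NETIF_F_TSO;
        return features;
}

static void example_tso_state_changed(struct net_device *dev, bool usable)
{
        struct example_feat_priv *priv = netdev_priv(dev);

        ASSERT_RTNL();
        priv->tso_usable = usable;
        netdev_update_features(dev);    /* re-runs ndo_fix_features() */
}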
5142 
5143 /**
5144  *      netdev_change_features - recalculate device features
5145  *      @dev: the device to check
5146  *
5147  *      Recalculate dev->features set and send notifications even
5148  *      if they have not changed. Should be called instead of
5149  *      netdev_update_features() if also dev->vlan_features might
5150  *      have changed to allow the changes to be propagated to stacked
5151  *      VLAN devices.
5152  */
5153 void netdev_change_features(struct net_device *dev)
5154 {
5155         __netdev_update_features(dev);
5156         netdev_features_change(dev);
5157 }
5158 EXPORT_SYMBOL(netdev_change_features);
5159 
5160 /**
5161  *      netif_stacked_transfer_operstate -      transfer operstate
5162  *      @rootdev: the root or lower level device to transfer state from
5163  *      @dev: the device to transfer operstate to
5164  *
5165  *      Transfer operational state from root to device. This is normally
5166  *      called when a stacking relationship exists between the root
5167  *      device and the device (a leaf device).
5168  */
5169 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5170                                         struct net_device *dev)
5171 {
5172         if (rootdev->operstate == IF_OPER_DORMANT)
5173                 netif_dormant_on(dev);
5174         else
5175                 netif_dormant_off(dev);
5176 
5177         if (netif_carrier_ok(rootdev)) {
5178                 if (!netif_carrier_ok(dev))
5179                         netif_carrier_on(dev);
5180         } else {
5181                 if (netif_carrier_ok(dev))
5182                         netif_carrier_off(dev);
5183         }
5184 }
5185 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
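
/* Illustrative sketch (hypothetical driver code, not part of this file):
 * a VLAN-like stacked driver typically calls this from its netdevice
 * notifier when the lower ("real") device changes state, so that the
 * stacked device mirrors the lower device's dormant and carrier state.
 */
static void example_lower_state_changed(struct net_device *lower_dev,
                                        struct net_device *stacked_dev)
{
        netif_stacked_transfer_operstate(lower_dev, stacked_dev);
}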
5186 
5187 #ifdef CONFIG_RPS
5188 static int netif_alloc_rx_queues(struct net_device *dev)
5189 {
5190         unsigned int i, count = dev->num_rx_queues;
5191         struct netdev_rx_queue *rx;
5192 
5193         BUG_ON(count < 1);
5194 
5195         rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5196         if (!rx)
5197                 return -ENOMEM;
5198 
5199         dev->_rx = rx;
5200 
5201         for (i = 0; i < count; i++)
5202                 rx[i].dev = dev;
5203         return 0;
5204 }
5205 #endif
5206 
5207 static void netdev_init_one_queue(struct net_device *dev,
5208                                   struct netdev_queue *queue, void *_unused)
5209 {
5210         /* Initialize queue lock */
5211         spin_lock_init(&queue->_xmit_lock);
5212         netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5213         queue->xmit_lock_owner = -1;
5214         netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5215         queue->dev = dev;
5216 #ifdef CONFIG_BQL
5217         dql_init(&queue->dql, HZ);
5218 #endif
5219 }
5220 
5221 static int netif_alloc_netdev_queues(struct net_device *dev)
5222 {
5223         unsigned int count = dev->num_tx_queues;
5224         struct netdev_queue *tx;
5225 
5226         BUG_ON(count < 1);
5227 
5228         tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5229         if (!tx)
5230                 return -ENOMEM;
5231 
5232         dev->_tx = tx;
5233 
5234         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5235         spin_lock_init(&dev->tx_global_lock);
5236 
5237         return 0;
5238 }
5239 
5240 /**
5241  *      register_netdevice      - register a network device
5242  *      @dev: device to register
5243  *
5244  *      Take a completed network device structure and add it to the kernel
5245  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5246  *      chain. 0 is returned on success. A negative errno code is returned
5247  *      on a failure to set up the device, or if the name is a duplicate.
5248  *
5249  *      Callers must hold the rtnl semaphore. You may want
5250  *      register_netdev() instead of this.
5251  *
5252  *      BUGS:
5253  *      The locking appears insufficient to guarantee two parallel registers
5254  *      will not get the same name.
5255  */
5256 
5257 int register_netdevice(struct net_device *dev)
5258 {
5259         int ret;
5260         struct net *net = dev_net(dev);
5261 
5262         BUG_ON(dev_boot_phase);
5263         ASSERT_RTNL();
5264 
5265         might_sleep();
5266 
5267         /* When net_device's are persistent, this will be fatal. */
5268         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5269         BUG_ON(!net);
5270 
5271         spin_lock_init(&dev->addr_list_lock);
5272         netdev_set_addr_lockdep_class(dev);
5273 
5274         dev->iflink = -1;
5275 
5276         ret = dev_get_valid_name(net, dev, dev->name);
5277         if (ret < 0)
5278                 goto out;
5279 
5280         /* Init, if this function is available */
5281         if (dev->netdev_ops->ndo_init) {
5282                 ret = dev->netdev_ops->ndo_init(dev);
5283                 if (ret) {
5284                         if (ret > 0)
5285                                 ret = -EIO;
5286                         goto out;
5287                 }
5288         }
5289 
5290         if (((dev->hw_features | dev->features) &
5291              NETIF_F_HW_VLAN_CTAG_FILTER) &&
5292             (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
5293              !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
5294                 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
5295                 ret = -EINVAL;
5296                 goto err_uninit;
5297         }
5298 
5299         ret = -EBUSY;
5300         if (!dev->ifindex)
5301                 dev->ifindex = dev_new_index(net);
5302         else if (__dev_get_by_index(net, dev->ifindex))
5303                 goto err_uninit;
5304 
5305         if (dev->iflink == -1)
5306                 dev->iflink = dev->ifindex;
5307 
5308         /* Transfer changeable features to wanted_features and enable
5309          * software offloads (GSO and GRO).
5310          */
5311         dev->hw_features |= NETIF_F_SOFT_FEATURES;
5312         dev->features |= NETIF_F_SOFT_FEATURES;
5313         dev->wanted_features = dev->features & dev->hw_features;
5314 
5315         /* Turn on no cache copy if HW is doing checksum */
5316         if (!(dev->flags & IFF_LOOPBACK)) {
5317                 dev->hw_features |= NETIF_F_NOCACHE_COPY;
5318                 if (dev->features & NETIF_F_ALL_CSUM) {
5319                         dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5320                         dev->features |= NETIF_F_NOCACHE_COPY;
5321                 }
5322         }
5323 
5324         /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
5325          */
5326         dev->vlan_features |= NETIF_F_HIGHDMA;
5327 
5328         /* Make NETIF_F_SG inheritable to tunnel devices.
5329          */
5330         dev->hw_enc_features |= NETIF_F_SG;
5331 
5332         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5333         ret = notifier_to_errno(ret);
5334         if (ret)
5335                 goto err_uninit;
5336 
5337         ret = netdev_register_kobject(dev);
5338         if (ret)
5339                 goto err_uninit;
5340         dev->reg_state = NETREG_REGISTERED;
5341 
5342         __netdev_update_features(dev);
5343 
5344         /*
5345          *      Default initial state at registration is that the
5346          *      device is present.
5347          */
5348 
5349         set_bit(__LINK_STATE_PRESENT, &dev->state);
5350 
5351         linkwatch_init_dev(dev);
5352 
5353         dev_init_scheduler(dev);
5354         dev_hold(dev);
5355         list_netdevice(dev);
5356         add_device_randomness(dev->dev_addr, dev->addr_len);
5357 
5358         /* If the device has a permanent hardware address, the driver
5359          * should set dev_addr and leave addr_assign_type at
5360          * NET_ADDR_PERM (the default value).
5361          */
5362         if (dev->addr_assign_type == NET_ADDR_PERM)
5363                 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
5364 
5365         /* Notify protocols, that a new device appeared. */
5366         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5367         ret = notifier_to_errno(ret);
5368         if (ret) {
5369                 rollback_registered(dev);
5370                 dev->reg_state = NETREG_UNREGISTERED;
5371         }
5372         /*
5373          *      Prevent userspace races by waiting until the network
5374          *      device is fully set up before sending notifications.
5375          */
5376         if (!dev->rtnl_link_ops ||
5377             dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5378                 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5379 
5380 out:
5381         return ret;
5382 
5383 err_uninit:
5384         if (dev->netdev_ops->ndo_uninit)
5385                 dev->netdev_ops->ndo_uninit(dev);
5386         goto out;
5387 }
5388 EXPORT_SYMBOL(register_netdevice);
5389 
5390 /**
5391  *      init_dummy_netdev       - init a dummy network device for NAPI
5392  *      @dev: device to init
5393  *
5394  *      This takes a network device structure and initializes the minimum
5395  *      number of fields so it can be used to schedule NAPI polls without
5396  *      registering a full blown interface. This is to be used by drivers
5397  *      that need to tie several hardware interfaces to a single NAPI
5398  *      poll scheduler due to HW limitations.
5399  */
5400 int init_dummy_netdev(struct net_device *dev)
5401 {
5402         /* Clear everything. Note we don't initialize spinlocks
5403          * as they aren't supposed to be taken by any of the
5404          * NAPI code and this dummy netdev is supposed to be
5405          * only ever used for NAPI polls
5406          */
5407         memset(dev, 0, sizeof(struct net_device));
5408 
5409         /* make sure we BUG if trying to hit standard
5410          * register/unregister code path
5411          */
5412         dev->reg_state = NETREG_DUMMY;
5413 
5414         /* NAPI wants this */
5415         INIT_LIST_HEAD(&dev->napi_list);
5416 
5417         /* a dummy interface is started by default */
5418         set_bit(__LINK_STATE_PRESENT, &dev->state);
5419         set_bit(__LINK_STATE_START, &dev->state);
5420 
5421         /* Note: We don't allocate pcpu_refcnt for dummy devices,
5422          * because users of this 'device' don't need to change
5423          * its refcount.
5424          */
5425 
5426         return 0;
5427 }
5428 EXPORT_SYMBOL_GPL(init_dummy_netdev);
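
/* Illustrative sketch (hypothetical driver code, not part of this file):
 * a driver whose hardware forces it to share one poll routine across
 * several ports embeds a never-registered dummy netdev and hangs its
 * NAPI context off it.  All names here are assumptions; the NAPI
 * weight of 64 is just the conventional default.
 */
struct example_adapter {
        struct net_device napi_dev;     /* dummy, never registered */
        struct napi_struct napi;
};

static int example_poll(struct napi_struct *napi, int budget)
{
        /* ... process up to @budget received packets here ... */
        return 0;
}

static void example_napi_setup(struct example_adapter *adapter)
{
        init_dummy_netdev(&adapter->napi_dev);
        netif_napi_add(&adapter->napi_dev, &adapter->napi,
                       example_poll, 64);
}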
5429 
5430 
5431 /**
5432  *      register_netdev - register a network device
5433  *      @dev: device to register
5434  *
5435  *      Take a completed network device structure and add it to the kernel
5436  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5437  *      chain. 0 is returned on success. A negative errno code is returned
5438  *      on a failure to set up the device, or if the name is a duplicate.
5439  *
5440  *      This is a wrapper around register_netdevice that takes the rtnl semaphore
5441  *      and expands the device name if you passed a format string to
5442  *      alloc_netdev.
5443  */
5444 int register_netdev(struct net_device *dev)
5445 {
5446         int err;
5447 
5448         rtnl_lock();
5449         err = register_netdevice(dev);
5450         rtnl_unlock();
5451         return err;
5452 }
5453 EXPORT_SYMBOL(register_netdev);
5454 
5455 int netdev_refcnt_read(const struct net_device *dev)
5456 {
5457         int i, refcnt = 0;
5458 
5459         for_each_possible_cpu(i)
5460                 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5461         return refcnt;
5462 }
5463 EXPORT_SYMBOL(netdev_refcnt_read);
5464 
5465 /**
5466  * netdev_wait_allrefs - wait until all references are gone.
5467  * @dev: target net_device
5468  *
5469  * This is called when unregistering network devices.
5470  *
5471  * Any protocol or device that holds a reference should register
5472  * for netdevice notification, and clean up and drop the
5473  * reference if they receive an UNREGISTER event.
5474  * We can get stuck here if buggy protocols don't correctly
5475  * call dev_put.
5476  */
5477 static void netdev_wait_allrefs(struct net_device *dev)
5478 {
5479         unsigned long rebroadcast_time, warning_time;
5480         int refcnt;
5481 
5482         linkwatch_forget_dev(dev);
5483 
5484         rebroadcast_time = warning_time = jiffies;
5485         refcnt = netdev_refcnt_read(dev);
5486 
5487         while (refcnt != 0) {
5488                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5489                         rtnl_lock();
5490 
5491                         /* Rebroadcast unregister notification */
5492                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5493 
5494                         __rtnl_unlock();
5495                         rcu_barrier();
5496                         rtnl_lock();
5497 
5498                         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
5499                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5500                                      &dev->state)) {
5501                                 /* We must not have linkwatch events
5502                                  * pending on unregister. If this
5503                                  * happens, we simply run the queue
5504                                  * unscheduled, resulting in a noop
5505                                  * for this device.
5506                                  */
5507                                 linkwatch_run_queue();
5508                         }
5509 
5510                         __rtnl_unlock();
5511 
5512                         rebroadcast_time = jiffies;
5513                 }
5514 
5515                 msleep(250);
5516 
5517                 refcnt = netdev_refcnt_read(dev);
5518 
5519                 if (time_after(jiffies, warning_time + 10 * HZ)) {
5520                         pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
5521                                  dev->name, refcnt);
5522                         warning_time = jiffies;
5523                 }
5524         }
5525 }
5526 
5527 /* The sequence is:
5528  *
5529  *      rtnl_lock();
5530  *      ...
5531  *      register_netdevice(x1);
5532  *      register_netdevice(x2);
5533  *      ...
5534  *      unregister_netdevice(y1);
5535  *      unregister_netdevice(y2);
5536  *      ...
5537  *      rtnl_unlock();
5538  *      free_netdev(y1);
5539  *      free_netdev(y2);
5540  *
5541  * We are invoked by rtnl_unlock().
5542  * This allows us to deal with problems:
5543  * 1) We can delete sysfs objects which invoke hotplug
5544  *    without deadlocking with linkwatch via keventd.
5545  * 2) Since we run with the RTNL semaphore not held, we can sleep
5546  *    safely in order to wait for the netdev refcnt to drop to zero.
5547  *
5548  * We must not return until all unregister events added during
5549  * the interval the lock was held have been completed.
5550  */
5551 void netdev_run_todo(void)
5552 {
5553         struct list_head list;
5554 
5555         /* Snapshot list, allow later requests */
5556         list_replace_init(&net_todo_list, &list);
5557 
5558         __rtnl_unlock();
5559 
5560 
5561         /* Wait for rcu callbacks to finish before next phase */
5562         if (!list_empty(&list))
5563                 rcu_barrier();
5564 
5565         while (!list_empty(&list)) {
5566                 struct net_device *dev
5567                         = list_first_entry(&list, struct net_device, todo_list);
5568                 list_del(&dev->todo_list);
5569 
5570                 rtnl_lock();
5571                 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
5572                 __rtnl_unlock();
5573 
5574                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5575                         pr_err("network todo '%s' but state %d\n",
5576                                dev->name, dev->reg_state);
5577                         dump_stack();
5578                         continue;
5579                 }
5580 
5581                 dev->reg_state = NETREG_UNREGISTERED;
5582 
5583                 on_each_cpu(flush_backlog, dev, 1);
5584 
5585                 netdev_wait_allrefs(dev);
5586 
5587                 /* paranoia */
5588                 BUG_ON(netdev_refcnt_read(dev));
5589                 WARN_ON(rcu_access_pointer(dev->ip_ptr));
5590                 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
5591                 WARN_ON(dev->dn_ptr);
5592 
5593                 if (dev->destructor)
5594                         dev->destructor(dev);
5595 
5596                 /* Free network device */
5597                 kobject_put(&dev->dev.kobj);
5598         }
5599 }
5600 
5601 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
5602  * fields in the same order, with only the type differing.
5603  */
5604 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5605                              const struct net_device_stats *netdev_stats)
5606 {
5607 #if BITS_PER_LONG == 64
5608         BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5609         memcpy(stats64, netdev_stats, sizeof(*stats64));
5610 #else
5611         size_t i, n = sizeof(*stats64) / sizeof(u64);
5612         const unsigned long *src = (const unsigned long *)netdev_stats;
5613         u64 *dst = (u64 *)stats64;
5614 
5615         BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5616                      sizeof(*stats64) / sizeof(u64));
5617         for (i = 0; i < n; i++)
5618                 dst[i] = src[i];
5619 #endif
5620 }
5621 EXPORT_SYMBOL(netdev_stats_to_stats64);
5622 
5623 /**
5624  *      dev_get_stats   - get network device statistics
5625  *      @dev: device to get statistics from
5626  *      @storage: place to store stats
5627  *
5628  *      Get network statistics from device. Return @storage.
5629  *      The device driver may provide its own method by setting
5630  *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5631  *      otherwise the internal statistics structure is used.
5632  */
5633 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5634                                         struct rtnl_link_stats64 *storage)
5635 {
5636         const struct net_device_ops *ops = dev->netdev_ops;
5637 
5638         if (ops->ndo_get_stats64) {
5639                 memset(storage, 0, sizeof(*storage));
5640                 ops->ndo_get_stats64(dev, storage);
5641         } else if (ops->ndo_get_stats) {
5642                 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5643         } else {
5644                 netdev_stats_to_stats64(storage, &dev->stats);
5645         }
5646         storage->rx_dropped += (unsigned long)atomic_long_read(&dev->rx_dropped);
5647         return storage;
5648 }
5649 EXPORT_SYMBOL(dev_get_stats);
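
/* Illustrative sketch (hypothetical caller, not part of this file):
 * taking a 64-bit statistics snapshot with the accessor above.  The
 * rtnl_link_stats64 buffer lives on the caller's stack and is filled
 * in regardless of which of the three driver paths is taken.
 */
static u64 example_rx_packets(struct net_device *dev)
{
        struct rtnl_link_stats64 stats;

        dev_get_stats(dev, &stats);
        return stats.rx_packets;
}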
5650 
5651 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
5652 {
5653         struct netdev_queue *queue = dev_ingress_queue(dev);
5654 
5655 #ifdef CONFIG_NET_CLS_ACT
5656         if (queue)
5657                 return queue;
5658         queue = kzalloc(sizeof(*queue), GFP_KERNEL);
5659         if (!queue)
5660                 return NULL;
5661         netdev_init_one_queue(dev, queue, NULL);
5662         queue->qdisc = &noop_qdisc;
5663         queue->qdisc_sleeping = &noop_qdisc;
5664         rcu_assign_pointer(dev->ingress_queue, queue);
5665 #endif
5666         return queue;
5667 }
5668 
5669 static const struct ethtool_ops default_ethtool_ops;
5670 
5671 void netdev_set_default_ethtool_ops(struct net_device *dev,
5672                                     const struct ethtool_ops *ops)
5673 {
5674         if (dev->ethtool_ops == &default_ethtool_ops)
5675                 dev->ethtool_ops = ops;
5676 }
5677 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
5678 
5679 /**
5680  *      alloc_netdev_mqs - allocate network device
5681  *      @sizeof_priv:   size of private data to allocate space for
5682  *      @name:          device name format string
5683  *      @setup:         callback to initialize device
5684  *      @txqs:          the number of TX subqueues to allocate
5685  *      @rxqs:          the number of RX subqueues to allocate
5686  *
5687  *      Allocates a struct net_device with private data area for driver use
5688  *      and performs basic initialization.  Also allocates subqueue structs
5689  *      for each queue on the device.
5690  */
5691 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
5692                 void (*setup)(struct net_device *),
5693                 unsigned int txqs, unsigned int rxqs)
5694 {
5695         struct net_device *dev;
5696         size_t alloc_size;
5697         struct net_device *p;
5698 
5699         BUG_ON(strlen(name) >= sizeof(dev->name));
5700 
5701         if (txqs < 1) {
5702                 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
5703                 return NULL;
5704         }
5705 
5706 #ifdef CONFIG_RPS
5707         if (rxqs < 1) {
5708                 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
5709                 return NULL;
5710         }
5711 #endif
5712 
5713         alloc_size = sizeof(struct net_device);
5714         if (sizeof_priv) {
5715                 /* ensure 32-byte alignment of private area */
5716                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5717                 alloc_size += sizeof_priv;
5718         }
5719         /* ensure 32-byte alignment of whole construct */
5720         alloc_size += NETDEV_ALIGN - 1;
5721 
5722         p = kzalloc(alloc_size, GFP_KERNEL);
5723         if (!p)
5724                 return NULL;
5725 
5726         dev = PTR_ALIGN(p, NETDEV_ALIGN);
5727         dev->padded = (char *)dev - (char *)p;
5728 
5729         dev->pcpu_refcnt = alloc_percpu(int);
5730         if (!dev->pcpu_refcnt)
5731                 goto free_p;
5732 
5733         if (dev_addr_init(dev))
5734                 goto free_pcpu;
5735 
5736         dev_mc_init(dev);
5737         dev_uc_init(dev);
5738 
5739         dev_net_set(dev, &init_net);
5740 
5741         dev->gso_max_size = GSO_MAX_SIZE;
5742         dev->gso_max_segs = GSO_MAX_SEGS;
5743 
5744         INIT_LIST_HEAD(&dev->napi_list);
5745         INIT_LIST_HEAD(&dev->unreg_list);
5746         INIT_LIST_HEAD(&dev->link_watch_list);
5747         INIT_LIST_HEAD(&dev->upper_dev_list);
5748         dev->priv_flags = IFF_XMIT_DST_RELEASE;
5749         setup(dev);
5750 
5751         dev->num_tx_queues = txqs;
5752         dev->real_num_tx_queues = txqs;
5753         if (netif_alloc_netdev_queues(dev))
5754                 goto free_all;
5755 
5756 #ifdef CONFIG_RPS
5757         dev->num_rx_queues = rxqs;
5758         dev->real_num_rx_queues = rxqs;
5759         if (netif_alloc_rx_queues(dev))
5760                 goto free_all;
5761 #endif
5762 
5763         strcpy(dev->name, name);
5764         dev->group = INIT_NETDEV_GROUP;
5765         if (!dev->ethtool_ops)
5766                 dev->ethtool_ops = &default_ethtool_ops;
5767         return dev;
5768 
5769 free_all:
5770         free_netdev(dev);
5771         return NULL;
5772 
5773 free_pcpu:
5774         free_percpu(dev->pcpu_refcnt);
5775         kfree(dev->_tx);
5776 #ifdef CONFIG_RPS
5777         kfree(dev->_rx);
5778 #endif
5779 
5780 free_p:
5781         kfree(p);
5782         return NULL;
5783 }
5784 EXPORT_SYMBOL(alloc_netdev_mqs);
5785 
5786 /**
5787  *      free_netdev - free network device
5788  *      @dev: device
5789  *
5790  *      This function does the last stage of destroying an allocated device
5791  *      interface. The reference to the device object is released.
5792  *      If this is the last reference then it will be freed.
5793  */
5794 void free_netdev(struct net_device *dev)
5795 {
5796         struct napi_struct *p, *n;
5797 
5798         release_net(dev_net(dev));
5799 
5800         kfree(dev->_tx);
5801 #ifdef CONFIG_RPS
5802         kfree(dev->_rx);
5803 #endif
5804 
5805         kfree(rcu_dereference_protected(dev->ingress_queue, 1));
5806 
5807         /* Flush device addresses */
5808         dev_addr_flush(dev);
5809 
5810         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5811                 netif_napi_del(p);
5812 
5813         free_percpu(dev->pcpu_refcnt);
5814         dev->pcpu_refcnt = NULL;
5815 
5816         /*  Compatibility with error handling in drivers */
5817         if (dev->reg_state == NETREG_UNINITIALIZED) {
5818                 kfree((char *)dev - dev->padded);
5819                 return;
5820         }
5821 
5822         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5823         dev->reg_state = NETREG_RELEASED;
5824 
5825         /* will free via device release */
5826         put_device(&dev->dev);
5827 }
5828 EXPORT_SYMBOL(free_netdev);
5829 
5830 /**
5831  *      synchronize_net -  Synchronize with packet receive processing
5832  *
5833  *      Wait for packets currently being received to be done.
5834  *      Does not block later packets from starting.
5835  */
5836 void synchronize_net(void)
5837 {
5838         might_sleep();
5839         if (rtnl_is_locked())
5840                 synchronize_rcu_expedited();
5841         else
5842                 synchronize_rcu();
5843 }
5844 EXPORT_SYMBOL(synchronize_net);
5845 
5846 /**
5847  *      unregister_netdevice_queue - remove device from the kernel
5848  *      @dev: device
5849  *      @head: list
5850  *
5851  *      This function shuts down a device interface and removes it
5852  *      from the kernel tables.
5853  *      If @head is not NULL, the device is queued to be unregistered later.
5854  *
5855  *      Callers must hold the rtnl semaphore.  You may want
5856  *      unregister_netdev() instead of this.
5857  */
5858 
5859 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
5860 {
5861         ASSERT_RTNL();
5862 
5863         if (head) {
5864                 list_move_tail(&dev->unreg_list, head);
5865         } else {
5866                 rollback_registered(dev);
5867                 /* Finish processing unregister after unlock */
5868                 net_set_todo(dev);
5869         }
5870 }
5871 EXPORT_SYMBOL(unregister_netdevice_queue);
5872 
5873 /**
5874  *      unregister_netdevice_many - unregister many devices
5875  *      @head: list of devices
5876  *
5877  *  Note: As most callers use a stack-allocated list_head,
5878  *  we force a list_del() to make sure the stack won't be corrupted later.
5879  */
5880 void unregister_netdevice_many(struct list_head *head)
5881 {
5882         struct net_device *dev;
5883 
5884         if (!list_empty(head)) {
5885                 rollback_registered_many(head);
5886                 list_for_each_entry(dev, head, unreg_list)
5887                         net_set_todo(dev);
5888                 list_del(head);
5889         }
5890 }
5891 EXPORT_SYMBOL(unregister_netdevice_many);
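
/* Illustrative sketch (hypothetical caller, not part of this file):
 * queueing several devices on one list and unregistering them in a
 * single batch, the way rtnl_link_ops->dellink() implementations
 * usually do, so the synchronize_net() calls in
 * rollback_registered_many() are paid once per batch rather than once
 * per device.
 */
static void example_destroy_batch(struct net_device **devs, int count)
{
        LIST_HEAD(list);
        int i;

        rtnl_lock();
        for (i = 0; i < count; i++)
                unregister_netdevice_queue(devs[i], &list);
        unregister_netdevice_many(&list);
        rtnl_unlock();
}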
5892 
5893 /**
5894  *      unregister_netdev - remove device from the kernel
5895  *      @dev: device
5896  *
5897  *      This function shuts down a device interface and removes it
5898  *      from the kernel tables.
5899  *
5900  *      This is just a wrapper for unregister_netdevice that takes
5901  *      the rtnl semaphore.  In general you want to use this and not
5902  *      unregister_netdevice.
5903  */
5904 void unregister_netdev(struct net_device *dev)
5905 {
5906         rtnl_lock();
5907         unregister_netdevice(dev);
5908         rtnl_unlock();
5909 }
5910 EXPORT_SYMBOL(unregister_netdev);
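
/* Illustrative sketch (hypothetical driver code, not part of this file):
 * the usual lifecycle around the helpers above: allocate a device with
 * a private area, register it, and later tear it down.  The private
 * struct, the "example%d" name template and the single TX/RX queue
 * pair are assumptions; Ethernet drivers normally reach
 * alloc_netdev_mqs() via the alloc_etherdev*() wrappers.
 */
struct example_priv {
        int dummy;
};

static void example_setup(struct net_device *dev)
{
        ether_setup(dev);               /* Ethernet defaults */
}

static struct net_device *example_create(void)
{
        struct net_device *dev;

        dev = alloc_netdev_mqs(sizeof(struct example_priv), "example%d",
                               example_setup, 1, 1);
        if (!dev)
                return NULL;

        if (register_netdev(dev)) {     /* takes RTNL internally */
                free_netdev(dev);
                return NULL;
        }
        return dev;
}

static void example_destroy(struct net_device *dev)
{
        unregister_netdev(dev);         /* takes RTNL internally */
        free_netdev(dev);
}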
5911 
5912 /**
5913  *      dev_change_net_namespace - move device to a different network namespace
5914  *      @dev: device
5915  *      @net: network namespace
5916  *      @pat: If not NULL name pattern to try if the current device name
5917  *            is already taken in the destination network namespace.
5918  *
5919  *      This function shuts down a device interface and moves it
5920  *      to a new network namespace. On success 0 is returned, on
5921  *      a failure a negative errno code is returned.
5922  *
5923  *      Callers must hold the rtnl semaphore.
5924  */
5925 
5926 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
5927 {
5928         int err;
5929 
5930         ASSERT_RTNL();
5931 
5932         /* Don't allow namespace local devices to be moved. */
5933         err = -EINVAL;
5934         if (dev->features & NETIF_F_NETNS_LOCAL)
5935                 goto out;
5936 
5937         /* Ensure the device has been registered */
5938         if (dev->reg_state != NETREG_REGISTERED)
5939                 goto