
TOMOYO Linux Cross Reference
Linux/net/core/dev.c


  1 /*
  2  *      NET3    Protocol independent device support routines.
  3  *
  4  *              This program is free software; you can redistribute it and/or
  5  *              modify it under the terms of the GNU General Public License
  6  *              as published by the Free Software Foundation; either version
  7  *              2 of the License, or (at your option) any later version.
  8  *
  9  *      Derived from the non IP parts of dev.c 1.0.19
 10  *              Authors:        Ross Biro
 11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
 13  *
 14  *      Additional Authors:
 15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
 16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
 17  *              David Hinds <dahinds@users.sourceforge.net>
 18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
 19  *              Adam Sulmicki <adam@cfar.umd.edu>
 20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
 21  *
 22  *      Changes:
 23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
 24  *                                      to 2 if register_netdev gets called
 25  *                                      before net_dev_init & also removed a
 26  *                                      few lines of code in the process.
 27  *              Alan Cox        :       device private ioctl copies fields back.
 28  *              Alan Cox        :       Transmit queue code does relevant
 29  *                                      stunts to keep the queue safe.
 30  *              Alan Cox        :       Fixed double lock.
 31  *              Alan Cox        :       Fixed promisc NULL pointer trap
 32  *              ????????        :       Support the full private ioctl range
 33  *              Alan Cox        :       Moved ioctl permission check into
 34  *                                      drivers
 35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
 36  *              Alan Cox        :       100 backlog just doesn't cut it when
 37  *                                      you start doing multicast video 8)
 38  *              Alan Cox        :       Rewrote net_bh and list manager.
 39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
 40  *              Alan Cox        :       Took out transmit every packet pass
 41  *                                      Saved a few bytes in the ioctl handler
 42  *              Alan Cox        :       Network driver sets packet type before
 43  *                                      calling netif_rx. Saves a function
 44  *                                      call a packet.
 45  *              Alan Cox        :       Hashed net_bh()
 46  *              Richard Kooijman:       Timestamp fixes.
 47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
 48  *              Alan Cox        :       Device lock protection.
 49  *              Alan Cox        :       Fixed nasty side effect of device close
 50  *                                      changes.
 51  *              Rudi Cilibrasi  :       Pass the right thing to
 52  *                                      set_mac_address()
 53  *              Dave Miller     :       32bit quantity for the device lock to
 54  *                                      make it work out on a Sparc.
 55  *              Bjorn Ekwall    :       Added KERNELD hack.
 56  *              Alan Cox        :       Cleaned up the backlog initialise.
 57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
 58  *                                      1 device.
 59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
 60  *                                      is no device open function.
 61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
 62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
 63  *              Cyrus Durgin    :       Cleaned for KMOD
 64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
 65  *                                      A network device unload needs to purge
 66  *                                      the backlog queue.
 67  *      Paul Rusty Russell      :       SIOCSIFNAME
 68  *              Pekka Riikonen  :       Netdev boot-time settings code
 69  *              Andrew Morton   :       Make unregister_netdevice wait
 70  *                                      indefinitely on dev->refcnt
 71  *              J Hadi Salim    :       - Backlog queue sampling
 72  *                                      - netif_rx() feedback
 73  */
 74 
 75 #include <asm/uaccess.h>
 76 #include <linux/bitops.h>
 77 #include <linux/capability.h>
 78 #include <linux/cpu.h>
 79 #include <linux/types.h>
 80 #include <linux/kernel.h>
 81 #include <linux/hash.h>
 82 #include <linux/slab.h>
 83 #include <linux/sched.h>
 84 #include <linux/mutex.h>
 85 #include <linux/string.h>
 86 #include <linux/mm.h>
 87 #include <linux/socket.h>
 88 #include <linux/sockios.h>
 89 #include <linux/errno.h>
 90 #include <linux/interrupt.h>
 91 #include <linux/if_ether.h>
 92 #include <linux/netdevice.h>
 93 #include <linux/etherdevice.h>
 94 #include <linux/ethtool.h>
 95 #include <linux/notifier.h>
 96 #include <linux/skbuff.h>
 97 #include <net/net_namespace.h>
 98 #include <net/sock.h>
 99 #include <linux/rtnetlink.h>
100 #include <linux/stat.h>
101 #include <net/dst.h>
102 #include <net/pkt_sched.h>
103 #include <net/checksum.h>
104 #include <net/xfrm.h>
105 #include <linux/highmem.h>
106 #include <linux/init.h>
107 #include <linux/module.h>
108 #include <linux/netpoll.h>
109 #include <linux/rcupdate.h>
110 #include <linux/delay.h>
111 #include <net/iw_handler.h>
112 #include <asm/current.h>
113 #include <linux/audit.h>
114 #include <linux/dmaengine.h>
115 #include <linux/err.h>
116 #include <linux/ctype.h>
117 #include <linux/if_arp.h>
118 #include <linux/if_vlan.h>
119 #include <linux/ip.h>
120 #include <net/ip.h>
121 #include <net/mpls.h>
122 #include <linux/ipv6.h>
123 #include <linux/in.h>
124 #include <linux/jhash.h>
125 #include <linux/random.h>
126 #include <trace/events/napi.h>
127 #include <trace/events/net.h>
128 #include <trace/events/skb.h>
129 #include <linux/pci.h>
130 #include <linux/inetdevice.h>
131 #include <linux/cpu_rmap.h>
132 #include <linux/static_key.h>
133 #include <linux/hashtable.h>
134 #include <linux/vmalloc.h>
135 #include <linux/if_macvlan.h>
136 #include <linux/errqueue.h>
137 #include <linux/hrtimer.h>
138 
139 #include "net-sysfs.h"
140 
141 /* Instead of increasing this, you should create a hash table. */
142 #define MAX_GRO_SKBS 8
143 
144 /* This should be increased if a protocol with a bigger head is added. */
145 #define GRO_MAX_HEAD (MAX_HEADER + 128)
146 
147 static DEFINE_SPINLOCK(ptype_lock);
148 static DEFINE_SPINLOCK(offload_lock);
149 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
150 struct list_head ptype_all __read_mostly;       /* Taps */
151 static struct list_head offload_base __read_mostly;
152 
153 static int netif_rx_internal(struct sk_buff *skb);
154 static int call_netdevice_notifiers_info(unsigned long val,
155                                          struct net_device *dev,
156                                          struct netdev_notifier_info *info);
157 
158 /*
159  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
160  * semaphore.
161  *
162  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
163  *
164  * Writers must hold the rtnl semaphore while they loop through the
165  * dev_base_head list, and hold dev_base_lock for writing when they do the
166  * actual updates.  This allows pure readers to access the list even
167  * while a writer is preparing to update it.
168  *
169  * To put it another way, dev_base_lock is held for writing only to
170  * protect against pure readers; the rtnl semaphore provides the
171  * protection against other writers.
172  *
173  * See, for example usages, register_netdevice() and
174  * unregister_netdevice(), which must be called with the rtnl
175  * semaphore held.
176  */
177 DEFINE_RWLOCK(dev_base_lock);
178 EXPORT_SYMBOL(dev_base_lock);
179 
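As an editor's illustration of the pure-reader rule above (a minimal sketch; the helper name count_devices and its pr_info message are hypothetical, not part of this file):

static void count_devices(struct net *net)
{
        struct net_device *dev;
        int count = 0;

        rcu_read_lock();                /* pure reader: RCU alone is enough */
        for_each_netdev_rcu(net, dev)
                count++;
        rcu_read_unlock();              /* dev pointers are invalid past here
                                         * unless dev_hold() was taken */

        pr_info("%d net devices\n", count);
}
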
180 /* protects napi_hash addition/deletion and napi_gen_id */
181 static DEFINE_SPINLOCK(napi_hash_lock);
182 
183 static unsigned int napi_gen_id;
184 static DEFINE_HASHTABLE(napi_hash, 8);
185 
186 static seqcount_t devnet_rename_seq;
187 
188 static inline void dev_base_seq_inc(struct net *net)
189 {
190         while (++net->dev_base_seq == 0);
191 }
192 
193 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
194 {
195         unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
196 
197         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
198 }
199 
200 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
201 {
202         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
203 }
204 
205 static inline void rps_lock(struct softnet_data *sd)
206 {
207 #ifdef CONFIG_RPS
208         spin_lock(&sd->input_pkt_queue.lock);
209 #endif
210 }
211 
212 static inline void rps_unlock(struct softnet_data *sd)
213 {
214 #ifdef CONFIG_RPS
215         spin_unlock(&sd->input_pkt_queue.lock);
216 #endif
217 }
218 
219 /* Device list insertion */
220 static void list_netdevice(struct net_device *dev)
221 {
222         struct net *net = dev_net(dev);
223 
224         ASSERT_RTNL();
225 
226         write_lock_bh(&dev_base_lock);
227         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
228         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
229         hlist_add_head_rcu(&dev->index_hlist,
230                            dev_index_hash(net, dev->ifindex));
231         write_unlock_bh(&dev_base_lock);
232 
233         dev_base_seq_inc(net);
234 }
235 
236 /* Device list removal
237  * caller must respect a RCU grace period before freeing/reusing dev
238  */
239 static void unlist_netdevice(struct net_device *dev)
240 {
241         ASSERT_RTNL();
242 
243         /* Unlink dev from the device chain */
244         write_lock_bh(&dev_base_lock);
245         list_del_rcu(&dev->dev_list);
246         hlist_del_rcu(&dev->name_hlist);
247         hlist_del_rcu(&dev->index_hlist);
248         write_unlock_bh(&dev_base_lock);
249 
250         dev_base_seq_inc(dev_net(dev));
251 }
252 
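Editor's sketch of the caller-side contract stated above (illustrative only; the real callers appear later in this file):

        /* under RTNL */
        unlist_netdevice(dev);
        synchronize_net();      /* wait out RCU readers still walking the
                                 * old dev_base/name/index lists */
        /* only now may dev's list linkage be reused or dev be freed */
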
253 /*
254  *      Our notifier list
255  */
256 
257 static RAW_NOTIFIER_HEAD(netdev_chain);
258 
259 /*
260  *      Device drivers call our routines to queue packets here. We empty the
261  *      queue in the local softnet handler.
262  */
263 
264 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
265 EXPORT_PER_CPU_SYMBOL(softnet_data);
266 
267 #ifdef CONFIG_LOCKDEP
268 /*
269  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
270  * according to dev->type
271  */
272 static const unsigned short netdev_lock_type[] =
273         {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
274          ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
275          ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
276          ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
277          ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
278          ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
279          ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
280          ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
281          ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
282          ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
283          ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
284          ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
285          ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
286          ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
287          ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
288 
289 static const char *const netdev_lock_name[] =
290         {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
291          "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
292          "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
293          "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
294          "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
295          "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
296          "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
297          "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
298          "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
299          "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
300          "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
301          "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
302          "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
303          "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
304          "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
305 
306 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
307 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
308 
309 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
310 {
311         int i;
312 
313         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
314                 if (netdev_lock_type[i] == dev_type)
315                         return i;
316         /* the last key is used by default */
317         return ARRAY_SIZE(netdev_lock_type) - 1;
318 }
319 
320 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
321                                                  unsigned short dev_type)
322 {
323         int i;
324 
325         i = netdev_lock_pos(dev_type);
326         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
327                                    netdev_lock_name[i]);
328 }
329 
330 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
331 {
332         int i;
333 
334         i = netdev_lock_pos(dev->type);
335         lockdep_set_class_and_name(&dev->addr_list_lock,
336                                    &netdev_addr_lock_key[i],
337                                    netdev_lock_name[i]);
338 }
339 #else
340 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
341                                                  unsigned short dev_type)
342 {
343 }
344 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
345 {
346 }
347 #endif
348 
349 /*******************************************************************************
350 
351                 Protocol management and registration routines
352 
353 *******************************************************************************/
354 
355 /*
356  *      Add a protocol ID to the list. Now that the input handler is
357  *      smarter we can dispense with all the messy stuff that used to be
358  *      here.
359  *
360  *      BEWARE!!! Protocol handlers, mangling input packets,
361  *      MUST BE last in hash buckets and checking protocol handlers
362  *      MUST start from promiscuous ptype_all chain in net_bh.
363  *      It is true now, do not change it.
 364  *      Explanation: if a protocol handler that mangles packets were
 365  *      first on the list, it could not tell that the packet is cloned
 366  *      and should be copied-on-write, so it would modify it and
 367  *      subsequent readers would get a broken packet.
368  *                                                      --ANK (980803)
369  */
370 
371 static inline struct list_head *ptype_head(const struct packet_type *pt)
372 {
373         if (pt->type == htons(ETH_P_ALL))
374                 return &ptype_all;
375         else
376                 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
377 }
378 
379 /**
380  *      dev_add_pack - add packet handler
381  *      @pt: packet type declaration
382  *
383  *      Add a protocol handler to the networking stack. The passed &packet_type
384  *      is linked into kernel lists and may not be freed until it has been
385  *      removed from the kernel lists.
386  *
 387  *      This call does not sleep, therefore it cannot
 388  *      guarantee that all CPUs that are in the middle of receiving
 389  *      packets will see the new packet type (until the next received packet).
390  */
391 
392 void dev_add_pack(struct packet_type *pt)
393 {
394         struct list_head *head = ptype_head(pt);
395 
396         spin_lock(&ptype_lock);
397         list_add_rcu(&pt->list, head);
398         spin_unlock(&ptype_lock);
399 }
400 EXPORT_SYMBOL(dev_add_pack);
401 
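For illustration, a minimal sketch of a protocol tap built on this API (the names sample_rcv and sample_pt are hypothetical):

static int sample_rcv(struct sk_buff *skb, struct net_device *dev,
                      struct packet_type *pt, struct net_device *orig_dev)
{
        /* called once per matching received packet; consume our reference */
        kfree_skb(skb);
        return NET_RX_SUCCESS;
}

static struct packet_type sample_pt __read_mostly = {
        .type = cpu_to_be16(ETH_P_ALL), /* tap every protocol */
        .func = sample_rcv,
};

/* dev_add_pack(&sample_pt) hooks the handler in;
 * dev_remove_pack(&sample_pt) below sleeps until no CPU can still see it. */
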
402 /**
403  *      __dev_remove_pack        - remove packet handler
404  *      @pt: packet type declaration
405  *
406  *      Remove a protocol handler that was previously added to the kernel
407  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
408  *      from the kernel lists and can be freed or reused once this function
409  *      returns.
410  *
411  *      The packet type might still be in use by receivers
 412  *      and must not be freed until after all CPUs have gone
413  *      through a quiescent state.
414  */
415 void __dev_remove_pack(struct packet_type *pt)
416 {
417         struct list_head *head = ptype_head(pt);
418         struct packet_type *pt1;
419 
420         spin_lock(&ptype_lock);
421 
422         list_for_each_entry(pt1, head, list) {
423                 if (pt == pt1) {
424                         list_del_rcu(&pt->list);
425                         goto out;
426                 }
427         }
428 
429         pr_warn("dev_remove_pack: %p not found\n", pt);
430 out:
431         spin_unlock(&ptype_lock);
432 }
433 EXPORT_SYMBOL(__dev_remove_pack);
434 
435 /**
436  *      dev_remove_pack  - remove packet handler
437  *      @pt: packet type declaration
438  *
439  *      Remove a protocol handler that was previously added to the kernel
440  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
441  *      from the kernel lists and can be freed or reused once this function
442  *      returns.
443  *
444  *      This call sleeps to guarantee that no CPU is looking at the packet
445  *      type after return.
446  */
447 void dev_remove_pack(struct packet_type *pt)
448 {
449         __dev_remove_pack(pt);
450 
451         synchronize_net();
452 }
453 EXPORT_SYMBOL(dev_remove_pack);
454 
455 
456 /**
457  *      dev_add_offload - register offload handlers
458  *      @po: protocol offload declaration
459  *
460  *      Add protocol offload handlers to the networking stack. The passed
461  *      &proto_offload is linked into kernel lists and may not be freed until
462  *      it has been removed from the kernel lists.
463  *
 464  *      This call does not sleep, therefore it cannot
 465  *      guarantee that all CPUs that are in the middle of receiving
 466  *      packets will see the new offload handlers (until the next received packet).
467  */
468 void dev_add_offload(struct packet_offload *po)
469 {
470         struct list_head *head = &offload_base;
471 
472         spin_lock(&offload_lock);
473         list_add_rcu(&po->list, head);
474         spin_unlock(&offload_lock);
475 }
476 EXPORT_SYMBOL(dev_add_offload);
477 
478 /**
479  *      __dev_remove_offload     - remove offload handler
480  *      @po: packet offload declaration
481  *
482  *      Remove a protocol offload handler that was previously added to the
483  *      kernel offload handlers by dev_add_offload(). The passed &offload_type
484  *      is removed from the kernel lists and can be freed or reused once this
485  *      function returns.
486  *
487  *      The packet type might still be in use by receivers
 488  *      and must not be freed until after all CPUs have gone
489  *      through a quiescent state.
490  */
491 static void __dev_remove_offload(struct packet_offload *po)
492 {
493         struct list_head *head = &offload_base;
494         struct packet_offload *po1;
495 
496         spin_lock(&offload_lock);
497 
498         list_for_each_entry(po1, head, list) {
499                 if (po == po1) {
500                         list_del_rcu(&po->list);
501                         goto out;
502                 }
503         }
504 
505         pr_warn("dev_remove_offload: %p not found\n", po);
506 out:
507         spin_unlock(&offload_lock);
508 }
509 
510 /**
511  *      dev_remove_offload       - remove packet offload handler
512  *      @po: packet offload declaration
513  *
514  *      Remove a packet offload handler that was previously added to the kernel
515  *      offload handlers by dev_add_offload(). The passed &offload_type is
516  *      removed from the kernel lists and can be freed or reused once this
517  *      function returns.
518  *
519  *      This call sleeps to guarantee that no CPU is looking at the packet
520  *      type after return.
521  */
522 void dev_remove_offload(struct packet_offload *po)
523 {
524         __dev_remove_offload(po);
525 
526         synchronize_net();
527 }
528 EXPORT_SYMBOL(dev_remove_offload);
529 
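A minimal sketch of how a protocol might register GRO callbacks through this API (hypothetical names; callback signatures as used in this kernel generation, they differ in later kernels):

static struct sk_buff **sample_gro_receive(struct sk_buff **head,
                                           struct sk_buff *skb)
{
        /* try to merge skb into a flow already queued on *head */
        return NULL;
}

static int sample_gro_complete(struct sk_buff *skb, int nhoff)
{
        /* finalize the headers of the merged packet at offset nhoff */
        return 0;
}

static struct packet_offload sample_offload __read_mostly = {
        .type = cpu_to_be16(ETH_P_IP),
        .callbacks = {
                .gro_receive  = sample_gro_receive,
                .gro_complete = sample_gro_complete,
        },
};

/* dev_add_offload(&sample_offload);  ...  dev_remove_offload(&sample_offload); */
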
530 /******************************************************************************
531 
532                       Device Boot-time Settings Routines
533 
534 *******************************************************************************/
535 
536 /* Boot time configuration table */
537 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
538 
539 /**
540  *      netdev_boot_setup_add   - add new setup entry
541  *      @name: name of the device
542  *      @map: configured settings for the device
543  *
 544  *      Adds a new setup entry to the dev_boot_setup list.  The function
 545  *      returns 0 on error and 1 on success.  This is a generic routine for
 546  *      all netdevices.
547  */
548 static int netdev_boot_setup_add(char *name, struct ifmap *map)
549 {
550         struct netdev_boot_setup *s;
551         int i;
552 
553         s = dev_boot_setup;
554         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
555                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
556                         memset(s[i].name, 0, sizeof(s[i].name));
557                         strlcpy(s[i].name, name, IFNAMSIZ);
558                         memcpy(&s[i].map, map, sizeof(s[i].map));
559                         break;
560                 }
561         }
562 
563         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
564 }
565 
566 /**
567  *      netdev_boot_setup_check - check boot time settings
568  *      @dev: the netdevice
569  *
570  *      Check boot time settings for the device.
 571  *      Any settings found are applied to the device for use
 572  *      later during device probing.
 573  *      Returns 0 if no settings are found, 1 if they are.
574  */
575 int netdev_boot_setup_check(struct net_device *dev)
576 {
577         struct netdev_boot_setup *s = dev_boot_setup;
578         int i;
579 
580         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
581                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
582                     !strcmp(dev->name, s[i].name)) {
583                         dev->irq        = s[i].map.irq;
584                         dev->base_addr  = s[i].map.base_addr;
585                         dev->mem_start  = s[i].map.mem_start;
586                         dev->mem_end    = s[i].map.mem_end;
587                         return 1;
588                 }
589         }
590         return 0;
591 }
592 EXPORT_SYMBOL(netdev_boot_setup_check);
593 
594 
595 /**
596  *      netdev_boot_base        - get address from boot time settings
597  *      @prefix: prefix for network device
598  *      @unit: id for network device
599  *
 600  *      Check boot time settings for the base address of the device.
 601  *      Any settings found are applied to the device for use
 602  *      later during device probing.
 603  *      Returns 0 if no settings are found.
604  */
605 unsigned long netdev_boot_base(const char *prefix, int unit)
606 {
607         const struct netdev_boot_setup *s = dev_boot_setup;
608         char name[IFNAMSIZ];
609         int i;
610 
611         sprintf(name, "%s%d", prefix, unit);
612 
613         /*
614          * If device already registered then return base of 1
615          * to indicate not to probe for this interface
616          */
617         if (__dev_get_by_name(&init_net, name))
618                 return 1;
619 
620         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
621                 if (!strcmp(name, s[i].name))
622                         return s[i].map.base_addr;
623         return 0;
624 }
625 
626 /*
627  * Saves at boot time configured settings for any netdevice.
628  */
629 int __init netdev_boot_setup(char *str)
630 {
631         int ints[5];
632         struct ifmap map;
633 
634         str = get_options(str, ARRAY_SIZE(ints), ints);
635         if (!str || !*str)
636                 return 0;
637 
638         /* Save settings */
639         memset(&map, 0, sizeof(map));
640         if (ints[0] > 0)
641                 map.irq = ints[1];
642         if (ints[0] > 1)
643                 map.base_addr = ints[2];
644         if (ints[0] > 2)
645                 map.mem_start = ints[3];
646         if (ints[0] > 3)
647                 map.mem_end = ints[4];
648 
649         /* Add new entry to the list */
650         return netdev_boot_setup_add(str, &map);
651 }
652 
653 __setup("netdev=", netdev_boot_setup);
654 
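Putting the parsing above together, a command line entry of the form below (an editor's example, consistent with the get_options() call above) fills one dev_boot_setup slot:

        netdev=9,0x300,0,0,eth0
            ints[1] = 9      -> map.irq
            ints[2] = 0x300  -> map.base_addr
            ints[3] = 0      -> map.mem_start
            ints[4] = 0      -> map.mem_end
            remaining "eth0" -> the entry's name
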
655 /*******************************************************************************
656 
657                             Device Interface Subroutines
658 
659 *******************************************************************************/
660 
661 /**
662  *      __dev_get_by_name       - find a device by its name
663  *      @net: the applicable net namespace
664  *      @name: name to find
665  *
666  *      Find an interface by name. Must be called under RTNL semaphore
667  *      or @dev_base_lock. If the name is found a pointer to the device
668  *      is returned. If the name is not found then %NULL is returned. The
669  *      reference counters are not incremented so the caller must be
670  *      careful with locks.
671  */
672 
673 struct net_device *__dev_get_by_name(struct net *net, const char *name)
674 {
675         struct net_device *dev;
676         struct hlist_head *head = dev_name_hash(net, name);
677 
678         hlist_for_each_entry(dev, head, name_hlist)
679                 if (!strncmp(dev->name, name, IFNAMSIZ))
680                         return dev;
681 
682         return NULL;
683 }
684 EXPORT_SYMBOL(__dev_get_by_name);
685 
686 /**
687  *      dev_get_by_name_rcu     - find a device by its name
688  *      @net: the applicable net namespace
689  *      @name: name to find
690  *
691  *      Find an interface by name.
692  *      If the name is found a pointer to the device is returned.
693  *      If the name is not found then %NULL is returned.
694  *      The reference counters are not incremented so the caller must be
695  *      careful with locks. The caller must hold RCU lock.
696  */
697 
698 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
699 {
700         struct net_device *dev;
701         struct hlist_head *head = dev_name_hash(net, name);
702 
703         hlist_for_each_entry_rcu(dev, head, name_hlist)
704                 if (!strncmp(dev->name, name, IFNAMSIZ))
705                         return dev;
706 
707         return NULL;
708 }
709 EXPORT_SYMBOL(dev_get_by_name_rcu);
710 
711 /**
712  *      dev_get_by_name         - find a device by its name
713  *      @net: the applicable net namespace
714  *      @name: name to find
715  *
716  *      Find an interface by name. This can be called from any
717  *      context and does its own locking. The returned handle has
718  *      the usage count incremented and the caller must use dev_put() to
719  *      release it when it is no longer needed. %NULL is returned if no
720  *      matching device is found.
721  */
722 
723 struct net_device *dev_get_by_name(struct net *net, const char *name)
724 {
725         struct net_device *dev;
726 
727         rcu_read_lock();
728         dev = dev_get_by_name_rcu(net, name);
729         if (dev)
730                 dev_hold(dev);
731         rcu_read_unlock();
732         return dev;
733 }
734 EXPORT_SYMBOL(dev_get_by_name);
735 
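An editor's sketch contrasting the two lookup flavours documented above (device_is_up_by_name and poke_device_by_name are hypothetical helpers):

static bool device_is_up_by_name(struct net *net, const char *name)
{
        struct net_device *dev;
        bool up = false;

        rcu_read_lock();
        dev = dev_get_by_name_rcu(net, name);   /* no reference taken */
        if (dev)
                up = !!(dev->flags & IFF_UP);
        rcu_read_unlock();                      /* dev must not be used past here */
        return up;
}

static void poke_device_by_name(struct net *net, const char *name)
{
        struct net_device *dev = dev_get_by_name(net, name); /* holds a ref */

        if (!dev)
                return;
        /* ... dev stays valid here, even outside RCU/RTNL ... */
        dev_put(dev);                           /* drop the reference */
}
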
736 /**
737  *      __dev_get_by_index - find a device by its ifindex
738  *      @net: the applicable net namespace
739  *      @ifindex: index of device
740  *
741  *      Search for an interface by index. Returns %NULL if the device
742  *      is not found or a pointer to the device. The device has not
743  *      had its reference counter increased so the caller must be careful
744  *      about locking. The caller must hold either the RTNL semaphore
745  *      or @dev_base_lock.
746  */
747 
748 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
749 {
750         struct net_device *dev;
751         struct hlist_head *head = dev_index_hash(net, ifindex);
752 
753         hlist_for_each_entry(dev, head, index_hlist)
754                 if (dev->ifindex == ifindex)
755                         return dev;
756 
757         return NULL;
758 }
759 EXPORT_SYMBOL(__dev_get_by_index);
760 
761 /**
762  *      dev_get_by_index_rcu - find a device by its ifindex
763  *      @net: the applicable net namespace
764  *      @ifindex: index of device
765  *
766  *      Search for an interface by index. Returns %NULL if the device
767  *      is not found or a pointer to the device. The device has not
768  *      had its reference counter increased so the caller must be careful
769  *      about locking. The caller must hold RCU lock.
770  */
771 
772 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
773 {
774         struct net_device *dev;
775         struct hlist_head *head = dev_index_hash(net, ifindex);
776 
777         hlist_for_each_entry_rcu(dev, head, index_hlist)
778                 if (dev->ifindex == ifindex)
779                         return dev;
780 
781         return NULL;
782 }
783 EXPORT_SYMBOL(dev_get_by_index_rcu);
784 
785 
786 /**
787  *      dev_get_by_index - find a device by its ifindex
788  *      @net: the applicable net namespace
789  *      @ifindex: index of device
790  *
791  *      Search for an interface by index. Returns NULL if the device
792  *      is not found or a pointer to the device. The device returned has
793  *      had a reference added and the pointer is safe until the user calls
794  *      dev_put to indicate they have finished with it.
795  */
796 
797 struct net_device *dev_get_by_index(struct net *net, int ifindex)
798 {
799         struct net_device *dev;
800 
801         rcu_read_lock();
802         dev = dev_get_by_index_rcu(net, ifindex);
803         if (dev)
804                 dev_hold(dev);
805         rcu_read_unlock();
806         return dev;
807 }
808 EXPORT_SYMBOL(dev_get_by_index);
809 
810 /**
811  *      netdev_get_name - get a netdevice name, knowing its ifindex.
812  *      @net: network namespace
813  *      @name: a pointer to the buffer where the name will be stored.
814  *      @ifindex: the ifindex of the interface to get the name from.
815  *
816  *      The use of raw_seqcount_begin() and cond_resched() before
817  *      retrying is required as we want to give the writers a chance
818  *      to complete when CONFIG_PREEMPT is not set.
819  */
820 int netdev_get_name(struct net *net, char *name, int ifindex)
821 {
822         struct net_device *dev;
823         unsigned int seq;
824 
825 retry:
826         seq = raw_seqcount_begin(&devnet_rename_seq);
827         rcu_read_lock();
828         dev = dev_get_by_index_rcu(net, ifindex);
829         if (!dev) {
830                 rcu_read_unlock();
831                 return -ENODEV;
832         }
833 
834         strcpy(name, dev->name);
835         rcu_read_unlock();
836         if (read_seqcount_retry(&devnet_rename_seq, seq)) {
837                 cond_resched();
838                 goto retry;
839         }
840 
841         return 0;
842 }
843 
844 /**
845  *      dev_getbyhwaddr_rcu - find a device by its hardware address
846  *      @net: the applicable net namespace
847  *      @type: media type of device
848  *      @ha: hardware address
849  *
850  *      Search for an interface by MAC address. Returns NULL if the device
851  *      is not found or a pointer to the device.
852  *      The caller must hold RCU or RTNL.
853  *      The returned device has not had its ref count increased
 854  *      and the caller must therefore be careful about locking.
855  *
856  */
857 
858 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
859                                        const char *ha)
860 {
861         struct net_device *dev;
862 
863         for_each_netdev_rcu(net, dev)
864                 if (dev->type == type &&
865                     !memcmp(dev->dev_addr, ha, dev->addr_len))
866                         return dev;
867 
868         return NULL;
869 }
870 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
871 
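A short editor's sketch of the locking contract stated above (mac_address_in_use is a hypothetical helper):

static bool mac_address_in_use(struct net *net, const char *mac)
{
        struct net_device *dev;
        bool found;

        rcu_read_lock();
        dev = dev_getbyhwaddr_rcu(net, ARPHRD_ETHER, mac);
        found = dev != NULL;            /* no refcount taken; the pointer is
                                         * only valid inside this RCU section */
        rcu_read_unlock();
        return found;
}
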
872 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
873 {
874         struct net_device *dev;
875 
876         ASSERT_RTNL();
877         for_each_netdev(net, dev)
878                 if (dev->type == type)
879                         return dev;
880 
881         return NULL;
882 }
883 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
884 
885 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
886 {
887         struct net_device *dev, *ret = NULL;
888 
889         rcu_read_lock();
890         for_each_netdev_rcu(net, dev)
891                 if (dev->type == type) {
892                         dev_hold(dev);
893                         ret = dev;
894                         break;
895                 }
896         rcu_read_unlock();
897         return ret;
898 }
899 EXPORT_SYMBOL(dev_getfirstbyhwtype);
900 
901 /**
902  *      __dev_get_by_flags - find any device with given flags
903  *      @net: the applicable net namespace
904  *      @if_flags: IFF_* values
905  *      @mask: bitmask of bits in if_flags to check
906  *
907  *      Search for any interface with the given flags. Returns NULL if a device
908  *      is not found or a pointer to the device. Must be called inside
909  *      rtnl_lock(), and result refcount is unchanged.
910  */
911 
912 struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
913                                       unsigned short mask)
914 {
915         struct net_device *dev, *ret;
916 
917         ASSERT_RTNL();
918 
919         ret = NULL;
920         for_each_netdev(net, dev) {
921                 if (((dev->flags ^ if_flags) & mask) == 0) {
922                         ret = dev;
923                         break;
924                 }
925         }
926         return ret;
927 }
928 EXPORT_SYMBOL(__dev_get_by_flags);
929 
930 /**
931  *      dev_valid_name - check if name is okay for network device
932  *      @name: name string
933  *
 934  *      Network device names need to be valid file names
 935  *      to allow sysfs to work.  We also disallow any kind of
936  *      whitespace.
937  */
938 bool dev_valid_name(const char *name)
939 {
940         if (*name == '\0')
941                 return false;
942         if (strlen(name) >= IFNAMSIZ)
943                 return false;
944         if (!strcmp(name, ".") || !strcmp(name, ".."))
945                 return false;
946 
947         while (*name) {
948                 if (*name == '/' || *name == ':' || isspace(*name))
949                         return false;
950                 name++;
951         }
952         return true;
953 }
954 EXPORT_SYMBOL(dev_valid_name);
955 
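Some worked examples of the checks above (editor's illustration; name_validity_examples is a hypothetical helper):

static void name_validity_examples(void)
{
        BUG_ON(!dev_valid_name("eth0"));        /* ordinary name: accepted */
        BUG_ON(dev_valid_name(""));             /* empty: rejected */
        BUG_ON(dev_valid_name("eth 0"));        /* whitespace: rejected */
        BUG_ON(dev_valid_name("a/b"));          /* '/' would break sysfs paths */
        BUG_ON(dev_valid_name(".."));           /* "." and ".." rejected */
}
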
956 /**
957  *      __dev_alloc_name - allocate a name for a device
958  *      @net: network namespace to allocate the device name in
959  *      @name: name format string
960  *      @buf:  scratch buffer and result name string
961  *
 962  *      Passed a format string - eg "lt%d" - it will try and find a suitable
963  *      id. It scans list of devices to build up a free map, then chooses
964  *      the first empty slot. The caller must hold the dev_base or rtnl lock
965  *      while allocating the name and adding the device in order to avoid
966  *      duplicates.
967  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
968  *      Returns the number of the unit assigned or a negative errno code.
969  */
970 
971 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
972 {
973         int i = 0;
974         const char *p;
975         const int max_netdevices = 8*PAGE_SIZE;
976         unsigned long *inuse;
977         struct net_device *d;
978 
979         p = strnchr(name, IFNAMSIZ-1, '%');
980         if (p) {
981                 /*
982                  * Verify the string as this thing may have come from
983                  * the user.  There must be either one "%d" and no other "%"
984                  * characters.
985                  */
986                 if (p[1] != 'd' || strchr(p + 2, '%'))
987                         return -EINVAL;
988 
989                 /* Use one page as a bit array of possible slots */
990                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
991                 if (!inuse)
992                         return -ENOMEM;
993 
994                 for_each_netdev(net, d) {
995                         if (!sscanf(d->name, name, &i))
996                                 continue;
997                         if (i < 0 || i >= max_netdevices)
998                                 continue;
999 
1000                         /*  avoid cases where sscanf is not exact inverse of printf */
1001                         snprintf(buf, IFNAMSIZ, name, i);
1002                         if (!strncmp(buf, d->name, IFNAMSIZ))
1003                                 set_bit(i, inuse);
1004                 }
1005 
1006                 i = find_first_zero_bit(inuse, max_netdevices);
1007                 free_page((unsigned long) inuse);
1008         }
1009 
1010         if (buf != name)
1011                 snprintf(buf, IFNAMSIZ, name, i);
1012         if (!__dev_get_by_name(net, buf))
1013                 return i;
1014 
1015         /* It is possible to run out of possible slots
1016          * when the name is long and there isn't enough space left
1017          * for the digits, or if all bits are used.
1018          */
1019         return -ENFILE;
1020 }
1021 
1022 /**
1023  *      dev_alloc_name - allocate a name for a device
1024  *      @dev: device
1025  *      @name: name format string
1026  *
1027  *      Passed a format string - eg "lt%d" - it will try and find a suitable
1028  *      id. It scans list of devices to build up a free map, then chooses
1029  *      the first empty slot. The caller must hold the dev_base or rtnl lock
1030  *      while allocating the name and adding the device in order to avoid
1031  *      duplicates.
1032  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1033  *      Returns the number of the unit assigned or a negative errno code.
1034  */
1035 
1036 int dev_alloc_name(struct net_device *dev, const char *name)
1037 {
1038         char buf[IFNAMSIZ];
1039         struct net *net;
1040         int ret;
1041 
1042         BUG_ON(!dev_net(dev));
1043         net = dev_net(dev);
1044         ret = __dev_alloc_name(net, name, buf);
1045         if (ret >= 0)
1046                 strlcpy(dev->name, buf, IFNAMSIZ);
1047         return ret;
1048 }
1049 EXPORT_SYMBOL(dev_alloc_name);
1050 
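For illustration, how a driver might use this during setup (assign_unit_name and the "cam%d" pattern are hypothetical; this must run under RTNL like the registration path):

static int assign_unit_name(struct net_device *dev)
{
        int unit;

        unit = dev_alloc_name(dev, "cam%d");    /* picks the lowest free unit */
        if (unit < 0)
                return unit;                    /* -EINVAL or -ENFILE */

        pr_info("%s: assigned unit %d\n", dev->name, unit);
        return 0;
}
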
1051 static int dev_alloc_name_ns(struct net *net,
1052                              struct net_device *dev,
1053                              const char *name)
1054 {
1055         char buf[IFNAMSIZ];
1056         int ret;
1057 
1058         ret = __dev_alloc_name(net, name, buf);
1059         if (ret >= 0)
1060                 strlcpy(dev->name, buf, IFNAMSIZ);
1061         return ret;
1062 }
1063 
1064 static int dev_get_valid_name(struct net *net,
1065                               struct net_device *dev,
1066                               const char *name)
1067 {
1068         BUG_ON(!net);
1069 
1070         if (!dev_valid_name(name))
1071                 return -EINVAL;
1072 
1073         if (strchr(name, '%'))
1074                 return dev_alloc_name_ns(net, dev, name);
1075         else if (__dev_get_by_name(net, name))
1076                 return -EEXIST;
1077         else if (dev->name != name)
1078                 strlcpy(dev->name, name, IFNAMSIZ);
1079 
1080         return 0;
1081 }
1082 
1083 /**
1084  *      dev_change_name - change name of a device
1085  *      @dev: device
1086  *      @newname: name (or format string) must be at least IFNAMSIZ
1087  *
1088  *      Change the name of a device. A format string such as "eth%d"
1089  *      can be passed for wildcarding.
1090  */
1091 int dev_change_name(struct net_device *dev, const char *newname)
1092 {
1093         unsigned char old_assign_type;
1094         char oldname[IFNAMSIZ];
1095         int err = 0;
1096         int ret;
1097         struct net *net;
1098 
1099         ASSERT_RTNL();
1100         BUG_ON(!dev_net(dev));
1101 
1102         net = dev_net(dev);
1103         if (dev->flags & IFF_UP)
1104                 return -EBUSY;
1105 
1106         write_seqcount_begin(&devnet_rename_seq);
1107 
1108         if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1109                 write_seqcount_end(&devnet_rename_seq);
1110                 return 0;
1111         }
1112 
1113         memcpy(oldname, dev->name, IFNAMSIZ);
1114 
1115         err = dev_get_valid_name(net, dev, newname);
1116         if (err < 0) {
1117                 write_seqcount_end(&devnet_rename_seq);
1118                 return err;
1119         }
1120 
1121         if (oldname[0] && !strchr(oldname, '%'))
1122                 netdev_info(dev, "renamed from %s\n", oldname);
1123 
1124         old_assign_type = dev->name_assign_type;
1125         dev->name_assign_type = NET_NAME_RENAMED;
1126 
1127 rollback:
1128         ret = device_rename(&dev->dev, dev->name);
1129         if (ret) {
1130                 memcpy(dev->name, oldname, IFNAMSIZ);
1131                 dev->name_assign_type = old_assign_type;
1132                 write_seqcount_end(&devnet_rename_seq);
1133                 return ret;
1134         }
1135 
1136         write_seqcount_end(&devnet_rename_seq);
1137 
1138         netdev_adjacent_rename_links(dev, oldname);
1139 
1140         write_lock_bh(&dev_base_lock);
1141         hlist_del_rcu(&dev->name_hlist);
1142         write_unlock_bh(&dev_base_lock);
1143 
1144         synchronize_rcu();
1145 
1146         write_lock_bh(&dev_base_lock);
1147         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1148         write_unlock_bh(&dev_base_lock);
1149 
1150         ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1151         ret = notifier_to_errno(ret);
1152 
1153         if (ret) {
1154                 /* err >= 0 after dev_alloc_name() or stores the first errno */
1155                 if (err >= 0) {
1156                         err = ret;
1157                         write_seqcount_begin(&devnet_rename_seq);
1158                         memcpy(dev->name, oldname, IFNAMSIZ);
1159                         memcpy(oldname, newname, IFNAMSIZ);
1160                         dev->name_assign_type = old_assign_type;
1161                         old_assign_type = NET_NAME_RENAMED;
1162                         goto rollback;
1163                 } else {
1164                         pr_err("%s: name change rollback failed: %d\n",
1165                                dev->name, ret);
1166                 }
1167         }
1168 
1169         return err;
1170 }
1171 
1172 /**
1173  *      dev_set_alias - change ifalias of a device
1174  *      @dev: device
1175  *      @alias: name up to IFALIASZ
1176  *      @len: limit of bytes to copy from info
1177  *
1178  *      Set the ifalias for a device.
1179  */
1180 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1181 {
1182         char *new_ifalias;
1183 
1184         ASSERT_RTNL();
1185 
1186         if (len >= IFALIASZ)
1187                 return -EINVAL;
1188 
1189         if (!len) {
1190                 kfree(dev->ifalias);
1191                 dev->ifalias = NULL;
1192                 return 0;
1193         }
1194 
1195         new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1196         if (!new_ifalias)
1197                 return -ENOMEM;
1198         dev->ifalias = new_ifalias;
1199 
1200         strlcpy(dev->ifalias, alias, len+1);
1201         return len;
1202 }
1203 
1204 
1205 /**
1206  *      netdev_features_change - device changes features
1207  *      @dev: device to cause notification
1208  *
1209  *      Called to indicate a device has changed features.
1210  */
1211 void netdev_features_change(struct net_device *dev)
1212 {
1213         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1214 }
1215 EXPORT_SYMBOL(netdev_features_change);
1216 
1217 /**
1218  *      netdev_state_change - device changes state
1219  *      @dev: device to cause notification
1220  *
1221  *      Called to indicate a device has changed state. This function calls
1222  *      the notifier chains for netdev_chain and sends a NEWLINK message
1223  *      to the routing socket.
1224  */
1225 void netdev_state_change(struct net_device *dev)
1226 {
1227         if (dev->flags & IFF_UP) {
1228                 struct netdev_notifier_change_info change_info;
1229 
1230                 change_info.flags_changed = 0;
1231                 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
1232                                               &change_info.info);
1233                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1234         }
1235 }
1236 EXPORT_SYMBOL(netdev_state_change);
1237 
1238 /**
1239  *      netdev_notify_peers - notify network peers about existence of @dev
1240  *      @dev: network device
1241  *
1242  * Generate traffic such that interested network peers are aware of
1243  * @dev, such as by generating a gratuitous ARP. This may be used when
1244  * a device wants to inform the rest of the network about some sort of
1245  * reconfiguration such as a failover event or virtual machine
1246  * migration.
1247  */
1248 void netdev_notify_peers(struct net_device *dev)
1249 {
1250         rtnl_lock();
1251         call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1252         rtnl_unlock();
1253 }
1254 EXPORT_SYMBOL(netdev_notify_peers);
1255 
1256 static int __dev_open(struct net_device *dev)
1257 {
1258         const struct net_device_ops *ops = dev->netdev_ops;
1259         int ret;
1260 
1261         ASSERT_RTNL();
1262 
1263         if (!netif_device_present(dev))
1264                 return -ENODEV;
1265 
1266         /* Block netpoll from trying to do any rx path servicing.
1267          * If we don't do this there is a chance ndo_poll_controller
1268          * or ndo_poll may be running while we open the device
1269          */
1270         netpoll_poll_disable(dev);
1271 
1272         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1273         ret = notifier_to_errno(ret);
1274         if (ret)
1275                 return ret;
1276 
1277         set_bit(__LINK_STATE_START, &dev->state);
1278 
1279         if (ops->ndo_validate_addr)
1280                 ret = ops->ndo_validate_addr(dev);
1281 
1282         if (!ret && ops->ndo_open)
1283                 ret = ops->ndo_open(dev);
1284 
1285         netpoll_poll_enable(dev);
1286 
1287         if (ret)
1288                 clear_bit(__LINK_STATE_START, &dev->state);
1289         else {
1290                 dev->flags |= IFF_UP;
1291                 dev_set_rx_mode(dev);
1292                 dev_activate(dev);
1293                 add_device_randomness(dev->dev_addr, dev->addr_len);
1294         }
1295 
1296         return ret;
1297 }
1298 
1299 /**
1300  *      dev_open        - prepare an interface for use.
1301  *      @dev:   device to open
1302  *
1303  *      Takes a device from down to up state. The device's private open
1304  *      function is invoked and then the multicast lists are loaded. Finally
1305  *      the device is moved into the up state and a %NETDEV_UP message is
1306  *      sent to the netdev notifier chain.
1307  *
1308  *      Calling this function on an active interface is a nop. On a failure
1309  *      a negative errno code is returned.
1310  */
1311 int dev_open(struct net_device *dev)
1312 {
1313         int ret;
1314 
1315         if (dev->flags & IFF_UP)
1316                 return 0;
1317 
1318         ret = __dev_open(dev);
1319         if (ret < 0)
1320                 return ret;
1321 
1322         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1323         call_netdevice_notifiers(NETDEV_UP, dev);
1324 
1325         return ret;
1326 }
1327 EXPORT_SYMBOL(dev_open);
1328 
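An editor's sketch of calling this from process context (bring_up is hypothetical; __dev_open() above asserts that RTNL is held):

static int bring_up(struct net_device *dev)
{
        int err;

        rtnl_lock();
        err = dev_open(dev);    /* returns 0 if the device was already IFF_UP */
        rtnl_unlock();
        return err;
}
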
1329 static int __dev_close_many(struct list_head *head)
1330 {
1331         struct net_device *dev;
1332 
1333         ASSERT_RTNL();
1334         might_sleep();
1335 
1336         list_for_each_entry(dev, head, close_list) {
1337                 /* Temporarily disable netpoll until the interface is down */
1338                 netpoll_poll_disable(dev);
1339 
1340                 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1341 
1342                 clear_bit(__LINK_STATE_START, &dev->state);
1343 
1344                 /* Synchronize to scheduled poll. We cannot touch poll list, it
1345                  * can be even on different cpu. So just clear netif_running().
1346                  *
1347  *              dev->stop() will invoke napi_disable() on all of its
1348                  * napi_struct instances on this device.
1349                  */
1350                 smp_mb__after_atomic(); /* Commit netif_running(). */
1351         }
1352 
1353         dev_deactivate_many(head);
1354 
1355         list_for_each_entry(dev, head, close_list) {
1356                 const struct net_device_ops *ops = dev->netdev_ops;
1357 
1358                 /*
1359                  *      Call the device specific close. This cannot fail.
1360                  *      Only if device is UP
1361                  *
1362                  *      We allow it to be called even after a DETACH hot-plug
1363                  *      event.
1364                  */
1365                 if (ops->ndo_stop)
1366                         ops->ndo_stop(dev);
1367 
1368                 dev->flags &= ~IFF_UP;
1369                 netpoll_poll_enable(dev);
1370         }
1371 
1372         return 0;
1373 }
1374 
1375 static int __dev_close(struct net_device *dev)
1376 {
1377         int retval;
1378         LIST_HEAD(single);
1379 
1380         list_add(&dev->close_list, &single);
1381         retval = __dev_close_many(&single);
1382         list_del(&single);
1383 
1384         return retval;
1385 }
1386 
1387 static int dev_close_many(struct list_head *head)
1388 {
1389         struct net_device *dev, *tmp;
1390 
1391         /* Remove the devices that don't need to be closed */
1392         list_for_each_entry_safe(dev, tmp, head, close_list)
1393                 if (!(dev->flags & IFF_UP))
1394                         list_del_init(&dev->close_list);
1395 
1396         __dev_close_many(head);
1397 
1398         list_for_each_entry_safe(dev, tmp, head, close_list) {
1399                 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1400                 call_netdevice_notifiers(NETDEV_DOWN, dev);
1401                 list_del_init(&dev->close_list);
1402         }
1403 
1404         return 0;
1405 }
1406 
1407 /**
1408  *      dev_close - shutdown an interface.
1409  *      @dev: device to shutdown
1410  *
1411  *      This function moves an active device into down state. A
1412  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1413  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1414  *      chain.
1415  */
1416 int dev_close(struct net_device *dev)
1417 {
1418         if (dev->flags & IFF_UP) {
1419                 LIST_HEAD(single);
1420 
1421                 list_add(&dev->close_list, &single);
1422                 dev_close_many(&single);
1423                 list_del(&single);
1424         }
1425         return 0;
1426 }
1427 EXPORT_SYMBOL(dev_close);
1428 
1429 
1430 /**
1431  *      dev_disable_lro - disable Large Receive Offload on a device
1432  *      @dev: device
1433  *
1434  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1435  *      called under RTNL.  This is needed if received packets may be
1436  *      forwarded to another interface.
1437  */
1438 void dev_disable_lro(struct net_device *dev)
1439 {
1440         struct net_device *lower_dev;
1441         struct list_head *iter;
1442 
1443         dev->wanted_features &= ~NETIF_F_LRO;
1444         netdev_update_features(dev);
1445 
1446         if (unlikely(dev->features & NETIF_F_LRO))
1447                 netdev_WARN(dev, "failed to disable LRO!\n");
1448 
1449         netdev_for_each_lower_dev(dev, lower_dev, iter)
1450                 dev_disable_lro(lower_dev);
1451 }
1452 EXPORT_SYMBOL(dev_disable_lro);
1453 
1454 static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1455                                    struct net_device *dev)
1456 {
1457         struct netdev_notifier_info info;
1458 
1459         netdev_notifier_info_init(&info, dev);
1460         return nb->notifier_call(nb, val, &info);
1461 }
1462 
1463 static int dev_boot_phase = 1;
1464 
1465 /**
1466  *      register_netdevice_notifier - register a network notifier block
1467  *      @nb: notifier
1468  *
1469  *      Register a notifier to be called when network device events occur.
1470  *      The notifier passed is linked into the kernel structures and must
1471  *      not be reused until it has been unregistered. A negative errno code
1472  *      is returned on a failure.
1473  *
1474  *      When registered, all registration and up events are replayed
1475  *      to the new notifier to allow the device to have a race-free
1476  *      view of the network device list.
1477  */
1478 
1479 int register_netdevice_notifier(struct notifier_block *nb)
1480 {
1481         struct net_device *dev;
1482         struct net_device *last;
1483         struct net *net;
1484         int err;
1485 
1486         rtnl_lock();
1487         err = raw_notifier_chain_register(&netdev_chain, nb);
1488         if (err)
1489                 goto unlock;
1490         if (dev_boot_phase)
1491                 goto unlock;
1492         for_each_net(net) {
1493                 for_each_netdev(net, dev) {
1494                         err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1495                         err = notifier_to_errno(err);
1496                         if (err)
1497                                 goto rollback;
1498 
1499                         if (!(dev->flags & IFF_UP))
1500                                 continue;
1501 
1502                         call_netdevice_notifier(nb, NETDEV_UP, dev);
1503                 }
1504         }
1505 
1506 unlock:
1507         rtnl_unlock();
1508         return err;
1509 
1510 rollback:
1511         last = dev;
1512         for_each_net(net) {
1513                 for_each_netdev(net, dev) {
1514                         if (dev == last)
1515                                 goto outroll;
1516 
1517                         if (dev->flags & IFF_UP) {
1518                                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1519                                                         dev);
1520                                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1521                         }
1522                         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1523                 }
1524         }
1525 
1526 outroll:
1527         raw_notifier_chain_unregister(&netdev_chain, nb);
1528         goto unlock;
1529 }
1530 EXPORT_SYMBOL(register_netdevice_notifier);
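/*
 * Editorial usage sketch, not part of the original file: a minimal notifier
 * as a subsystem might register it.  The names example_netdev_event and
 * example_netdev_nb are hypothetical; netdev_notifier_info_to_dev() is the
 * standard helper for extracting the net_device from the notifier payload.
 */
static int example_netdev_event(struct notifier_block *nb,
                                unsigned long event, void *ptr)
{
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);

        switch (event) {
        case NETDEV_UP:
                pr_debug("%s is up\n", dev->name);
                break;
        case NETDEV_UNREGISTER:
                pr_debug("%s is going away\n", dev->name);
                break;
        }
        return NOTIFY_DONE;
}

static struct notifier_block example_netdev_nb = {
        .notifier_call = example_netdev_event,
};

/* In the subsystem's init path:
 *
 *      err = register_netdevice_notifier(&example_netdev_nb);
 */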
1531 
1532 /**
1533  *      unregister_netdevice_notifier - unregister a network notifier block
1534  *      @nb: notifier
1535  *
1536  *      Unregister a notifier previously registered by
1537  *      register_netdevice_notifier(). The notifier is unlinked from the
1538  *      kernel structures and may then be reused. A negative errno code
1539  *      is returned on a failure.
1540  *
1541  *      After unregistering, unregister and down device events are synthesized
1542  *      for all devices on the device list for the removed notifier, removing
1543  *      the need for special case cleanup code.
1544  */
1545 
1546 int unregister_netdevice_notifier(struct notifier_block *nb)
1547 {
1548         struct net_device *dev;
1549         struct net *net;
1550         int err;
1551 
1552         rtnl_lock();
1553         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1554         if (err)
1555                 goto unlock;
1556 
1557         for_each_net(net) {
1558                 for_each_netdev(net, dev) {
1559                         if (dev->flags & IFF_UP) {
1560                                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1561                                                         dev);
1562                                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1563                         }
1564                         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1565                 }
1566         }
1567 unlock:
1568         rtnl_unlock();
1569         return err;
1570 }
1571 EXPORT_SYMBOL(unregister_netdevice_notifier);
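/*
 * Editorial note, not part of the original file: the matching cleanup for
 * the hypothetical notifier registered above is simply
 *
 *      unregister_netdevice_notifier(&example_netdev_nb);
 *
 * and, because unregister/down events are synthesized for every device, the
 * callback needs no per-device teardown of its own.
 */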
1572 
1573 /**
1574  *      call_netdevice_notifiers_info - call all network notifier blocks
1575  *      @val: value passed unmodified to notifier function
1576  *      @dev: net_device pointer passed unmodified to notifier function
1577  *      @info: notifier information data
1578  *
1579  *      Call all network notifier blocks.  Parameters and return value
1580  *      are as for raw_notifier_call_chain().
1581  */
1582 
1583 static int call_netdevice_notifiers_info(unsigned long val,
1584                                          struct net_device *dev,
1585                                          struct netdev_notifier_info *info)
1586 {
1587         ASSERT_RTNL();
1588         netdev_notifier_info_init(info, dev);
1589         return raw_notifier_call_chain(&netdev_chain, val, info);
1590 }
1591 
1592 /**
1593  *      call_netdevice_notifiers - call all network notifier blocks
1594  *      @val: value passed unmodified to notifier function
1595  *      @dev: net_device pointer passed unmodified to notifier function
1596  *
1597  *      Call all network notifier blocks.  Parameters and return value
1598  *      are as for raw_notifier_call_chain().
1599  */
1600 
1601 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1602 {
1603         struct netdev_notifier_info info;
1604 
1605         return call_netdevice_notifiers_info(val, dev, &info);
1606 }
1607 EXPORT_SYMBOL(call_netdevice_notifiers);
1608 
1609 static struct static_key netstamp_needed __read_mostly;
1610 #ifdef HAVE_JUMP_LABEL
1611 /* We are not allowed to call static_key_slow_dec() from irq context.
1612  * If net_disable_timestamp() is called from irq context, defer the
1613  * static_key_slow_dec() calls.
1614  */
1615 static atomic_t netstamp_needed_deferred;
1616 #endif
1617 
1618 void net_enable_timestamp(void)
1619 {
1620 #ifdef HAVE_JUMP_LABEL
1621         int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1622 
1623         if (deferred) {
1624                 while (--deferred)
1625                         static_key_slow_dec(&netstamp_needed);
1626                 return;
1627         }
1628 #endif
1629         static_key_slow_inc(&netstamp_needed);
1630 }
1631 EXPORT_SYMBOL(net_enable_timestamp);
1632 
1633 void net_disable_timestamp(void)
1634 {
1635 #ifdef HAVE_JUMP_LABEL
1636         if (in_interrupt()) {
1637                 atomic_inc(&netstamp_needed_deferred);
1638                 return;
1639         }
1640 #endif
1641         static_key_slow_dec(&netstamp_needed);
1642 }
1643 EXPORT_SYMBOL(net_disable_timestamp);
1644 
1645 static inline void net_timestamp_set(struct sk_buff *skb)
1646 {
1647         skb->tstamp.tv64 = 0;
1648         if (static_key_false(&netstamp_needed))
1649                 __net_timestamp(skb);
1650 }
1651 
1652 #define net_timestamp_check(COND, SKB)                  \
1653         if (static_key_false(&netstamp_needed)) {               \
1654                 if ((COND) && !(SKB)->tstamp.tv64)      \
1655                         __net_timestamp(SKB);           \
1656         }                                               \
1657 
1658 bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb)
1659 {
1660         unsigned int len;
1661 
1662         if (!(dev->flags & IFF_UP))
1663                 return false;
1664 
1665         len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1666         if (skb->len <= len)
1667                 return true;
1668 
1669         /* if TSO is enabled, we don't care about the length, as the packet
1670          * could be forwarded without being segmented beforehand
1671          */
1672         if (skb_is_gso(skb))
1673                 return true;
1674 
1675         return false;
1676 }
1677 EXPORT_SYMBOL_GPL(is_skb_forwardable);
1678 
1679 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1680 {
1681         if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1682                 if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1683                         atomic_long_inc(&dev->rx_dropped);
1684                         kfree_skb(skb);
1685                         return NET_RX_DROP;
1686                 }
1687         }
1688 
1689         if (unlikely(!is_skb_forwardable(dev, skb))) {
1690                 atomic_long_inc(&dev->rx_dropped);
1691                 kfree_skb(skb);
1692                 return NET_RX_DROP;
1693         }
1694 
1695         skb_scrub_packet(skb, true);
1696         skb->protocol = eth_type_trans(skb, dev);
1697         skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
1698 
1699         return 0;
1700 }
1701 EXPORT_SYMBOL_GPL(__dev_forward_skb);
1702 
1703 /**
1704  * dev_forward_skb - loopback an skb to another netif
1705  *
1706  * @dev: destination network device
1707  * @skb: buffer to forward
1708  *
1709  * return values:
1710  *      NET_RX_SUCCESS  (no congestion)
1711  *      NET_RX_DROP     (packet was dropped, but freed)
1712  *
1713  * dev_forward_skb can be used for injecting an skb from the
1714  * start_xmit function of one device into the receive queue
1715  * of another device.
1716  *
1717  * The receiving device may be in another namespace, so
1718  * we have to clear all information in the skb that could
1719  * impact namespace isolation.
1720  */
1721 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1722 {
1723         return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
1724 }
1725 EXPORT_SYMBOL_GPL(dev_forward_skb);
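/*
 * Editorial usage sketch, not part of the original file: a veth-style pair
 * device could hand frames from its ndo_start_xmit() to the peer's receive
 * path with dev_forward_skb().  The function below assumes the peer has
 * already been looked up; the name is illustrative only.
 */
static netdev_tx_t example_forward_to_peer(struct sk_buff *skb,
                                           struct net_device *dev,
                                           struct net_device *peer)
{
        unsigned int len = skb->len;    /* read before the skb is consumed */

        if (likely(dev_forward_skb(peer, skb) == NET_RX_SUCCESS)) {
                dev->stats.tx_packets++;
                dev->stats.tx_bytes += len;
        } else {
                /* dev_forward_skb() has already freed the skb on drop */
                dev->stats.tx_dropped++;
        }
        return NETDEV_TX_OK;
}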
1726 
1727 static inline int deliver_skb(struct sk_buff *skb,
1728                               struct packet_type *pt_prev,
1729                               struct net_device *orig_dev)
1730 {
1731         if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1732                 return -ENOMEM;
1733         atomic_inc(&skb->users);
1734         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1735 }
1736 
1737 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1738 {
1739         if (!ptype->af_packet_priv || !skb->sk)
1740                 return false;
1741 
1742         if (ptype->id_match)
1743                 return ptype->id_match(ptype, skb->sk);
1744         else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1745                 return true;
1746 
1747         return false;
1748 }
1749 
1750 /*
1751  *      Support routine. Sends outgoing frames to any network
1752  *      taps currently in use.
1753  */
1754 
1755 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1756 {
1757         struct packet_type *ptype;
1758         struct sk_buff *skb2 = NULL;
1759         struct packet_type *pt_prev = NULL;
1760 
1761         rcu_read_lock();
1762         list_for_each_entry_rcu(ptype, &ptype_all, list) {
1763                 /* Never send packets back to the socket
1764                  * they originated from - MvS (miquels@drinkel.ow.org)
1765                  */
1766                 if ((ptype->dev == dev || !ptype->dev) &&
1767                     (!skb_loop_sk(ptype, skb))) {
1768                         if (pt_prev) {
1769                                 deliver_skb(skb2, pt_prev, skb->dev);
1770                                 pt_prev = ptype;
1771                                 continue;
1772                         }
1773 
1774                         skb2 = skb_clone(skb, GFP_ATOMIC);
1775                         if (!skb2)
1776                                 break;
1777 
1778                         net_timestamp_set(skb2);
1779 
1780                         /* The network header should already be set
1781                          * correctly by the sender, so the check below is
1782                          * just protection against buggy protocols.
1783                          */
1784                         skb_reset_mac_header(skb2);
1785 
1786                         if (skb_network_header(skb2) < skb2->data ||
1787                             skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1788                                 net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1789                                                      ntohs(skb2->protocol),
1790                                                      dev->name);
1791                                 skb_reset_network_header(skb2);
1792                         }
1793 
1794                         skb2->transport_header = skb2->network_header;
1795                         skb2->pkt_type = PACKET_OUTGOING;
1796                         pt_prev = ptype;
1797                 }
1798         }
1799         if (pt_prev)
1800                 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1801         rcu_read_unlock();
1802 }
1803 
1804 /**
1805  * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1806  * @dev: Network device
1807  * @txq: number of queues available
1808  *
1809  * If real_num_tx_queues is changed the tc mappings may no longer be
1810  * valid. To resolve this verify that each tc mapping remains valid and,
1811  * if not, zero the mapping. With no priorities mapping to this
1812  * offset/count pair it will no longer be used. In the worst case, if TC0
1813  * is invalid nothing can be done, so priority mappings are disabled
1814  * entirely. It is expected that drivers will fix this mapping if they can
1815  * before calling netif_set_real_num_tx_queues.
1816  */
1817 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1818 {
1819         int i;
1820         struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1821 
1822         /* If TC0 is invalidated disable TC mapping */
1823         if (tc->offset + tc->count > txq) {
1824                 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1825                 dev->num_tc = 0;
1826                 return;
1827         }
1828 
1829         /* Invalidated prio to tc mappings set to TC0 */
1830         for (i = 1; i < TC_BITMASK + 1; i++) {
1831                 int q = netdev_get_prio_tc_map(dev, i);
1832 
1833                 tc = &dev->tc_to_txq[q];
1834                 if (tc->offset + tc->count > txq) {
1835                         pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1836                                 i, q);
1837                         netdev_set_prio_tc_map(dev, i, 0);
1838                 }
1839         }
1840 }
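/*
 * Editorial sketch, not part of the original file: the tc state that
 * netif_setup_tc() validates is normally installed by a multiqueue driver
 * along these lines (two traffic classes of four queues each, assuming the
 * device was allocated at least eight TX queues; names are illustrative).
 */
static int example_setup_two_tcs(struct net_device *dev)
{
        int err;

        err = netdev_set_num_tc(dev, 2);
        if (err)
                return err;

        /* TC0 -> queues 0..3, TC1 -> queues 4..7 */
        netdev_set_tc_queue(dev, 0, 4, 0);
        netdev_set_tc_queue(dev, 1, 4, 4);

        /* steer priority 5 to TC1; all other priorities stay on TC0 */
        netdev_set_prio_tc_map(dev, 5, 1);
        return 0;
}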
1841 
1842 #ifdef CONFIG_XPS
1843 static DEFINE_MUTEX(xps_map_mutex);
1844 #define xmap_dereference(P)             \
1845         rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1846 
1847 static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1848                                         int cpu, u16 index)
1849 {
1850         struct xps_map *map = NULL;
1851         int pos;
1852 
1853         if (dev_maps)
1854                 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1855 
1856         for (pos = 0; map && pos < map->len; pos++) {
1857                 if (map->queues[pos] == index) {
1858                         if (map->len > 1) {
1859                                 map->queues[pos] = map->queues[--map->len];
1860                         } else {
1861                                 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1862                                 kfree_rcu(map, rcu);
1863                                 map = NULL;
1864                         }
1865                         break;
1866                 }
1867         }
1868 
1869         return map;
1870 }
1871 
1872 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1873 {
1874         struct xps_dev_maps *dev_maps;
1875         int cpu, i;
1876         bool active = false;
1877 
1878         mutex_lock(&xps_map_mutex);
1879         dev_maps = xmap_dereference(dev->xps_maps);
1880 
1881         if (!dev_maps)
1882                 goto out_no_maps;
1883 
1884         for_each_possible_cpu(cpu) {
1885                 for (i = index; i < dev->num_tx_queues; i++) {
1886                         if (!remove_xps_queue(dev_maps, cpu, i))
1887                                 break;
1888                 }
1889                 if (i == dev->num_tx_queues)
1890                         active = true;
1891         }
1892 
1893         if (!active) {
1894                 RCU_INIT_POINTER(dev->xps_maps, NULL);
1895                 kfree_rcu(dev_maps, rcu);
1896         }
1897 
1898         for (i = index; i < dev->num_tx_queues; i++)
1899                 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1900                                              NUMA_NO_NODE);
1901 
1902 out_no_maps:
1903         mutex_unlock(&xps_map_mutex);
1904 }
1905 
1906 static struct xps_map *expand_xps_map(struct xps_map *map,
1907                                       int cpu, u16 index)
1908 {
1909         struct xps_map *new_map;
1910         int alloc_len = XPS_MIN_MAP_ALLOC;
1911         int i, pos;
1912 
1913         for (pos = 0; map && pos < map->len; pos++) {
1914                 if (map->queues[pos] != index)
1915                         continue;
1916                 return map;
1917         }
1918 
1919         /* Need to add queue to this CPU's existing map */
1920         if (map) {
1921                 if (pos < map->alloc_len)
1922                         return map;
1923 
1924                 alloc_len = map->alloc_len * 2;
1925         }
1926 
1927         /* Need to allocate a new map for this CPU to store the queue in */
1928         new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
1929                                cpu_to_node(cpu));
1930         if (!new_map)
1931                 return NULL;
1932 
1933         for (i = 0; i < pos; i++)
1934                 new_map->queues[i] = map->queues[i];
1935         new_map->alloc_len = alloc_len;
1936         new_map->len = pos;
1937 
1938         return new_map;
1939 }
1940 
1941 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
1942                         u16 index)
1943 {
1944         struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
1945         struct xps_map *map, *new_map;
1946         int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
1947         int cpu, numa_node_id = -2;
1948         bool active = false;
1949 
1950         mutex_lock(&xps_map_mutex);
1951 
1952         dev_maps = xmap_dereference(dev->xps_maps);
1953 
1954         /* allocate memory for queue storage */
1955         for_each_online_cpu(cpu) {
1956                 if (!cpumask_test_cpu(cpu, mask))
1957                         continue;
1958 
1959                 if (!new_dev_maps)
1960                         new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
1961                 if (!new_dev_maps) {
1962                         mutex_unlock(&xps_map_mutex);
1963                         return -ENOMEM;
1964                 }
1965 
1966                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
1967                                  NULL;
1968 
1969                 map = expand_xps_map(map, cpu, index);
1970                 if (!map)
1971                         goto error;
1972 
1973                 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1974         }
1975 
1976         if (!new_dev_maps)
1977                 goto out_no_new_maps;
1978 
1979         for_each_possible_cpu(cpu) {
1980                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
1981                         /* add queue to CPU maps */
1982                         int pos = 0;
1983 
1984                         map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1985                         while ((pos < map->len) && (map->queues[pos] != index))
1986                                 pos++;
1987 
1988                         if (pos == map->len)
1989                                 map->queues[map->len++] = index;
1990 #ifdef CONFIG_NUMA
1991                         if (numa_node_id == -2)
1992                                 numa_node_id = cpu_to_node(cpu);
1993                         else if (numa_node_id != cpu_to_node(cpu))
1994                                 numa_node_id = -1;
1995 #endif
1996                 } else if (dev_maps) {
1997                         /* fill in the new device map from the old device map */
1998                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
1999                         RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2000                 }
2001 
2002         }
2003 
2004         rcu_assign_pointer(dev->xps_maps, new_dev_maps);
2005 
2006         /* Cleanup old maps */
2007         if (dev_maps) {
2008                 for_each_possible_cpu(cpu) {
2009                         new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2010                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
2011                         if (map && map != new_map)
2012                                 kfree_rcu(map, rcu);
2013                 }
2014 
2015                 kfree_rcu(dev_maps, rcu);
2016         }
2017 
2018         dev_maps = new_dev_maps;
2019         active = true;
2020 
2021 out_no_new_maps:
2022         /* update Tx queue numa node */
2023         netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2024                                      (numa_node_id >= 0) ? numa_node_id :
2025                                      NUMA_NO_NODE);
2026 
2027         if (!dev_maps)
2028                 goto out_no_maps;
2029 
2030         /* removes queue from unused CPUs */
2031         for_each_possible_cpu(cpu) {
2032                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
2033                         continue;
2034 
2035                 if (remove_xps_queue(dev_maps, cpu, index))
2036                         active = true;
2037         }
2038 
2039         /* free map if not active */
2040         if (!active) {
2041                 RCU_INIT_POINTER(dev->xps_maps, NULL);
2042                 kfree_rcu(dev_maps, rcu);
2043         }
2044 
2045 out_no_maps:
2046         mutex_unlock(&xps_map_mutex);
2047 
2048         return 0;
2049 error:
2050         /* remove any maps that we added */
2051         for_each_possible_cpu(cpu) {
2052                 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2053                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2054                                  NULL;
2055                 if (new_map && new_map != map)
2056                         kfree(new_map);
2057         }
2058 
2059         mutex_unlock(&xps_map_mutex);
2060 
2061         kfree(new_dev_maps);
2062         return -ENOMEM;
2063 }
2064 EXPORT_SYMBOL(netif_set_xps_queue);
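/*
 * Editorial usage sketch, not part of the original file: a driver with one
 * TX queue per CPU could pin each queue to "its" CPU via XPS.  Error
 * handling is deliberately minimal; the function name is illustrative.
 */
static void example_set_default_xps(struct net_device *dev)
{
        u16 queue = 0;
        int cpu;

        for_each_online_cpu(cpu) {
                if (queue >= dev->real_num_tx_queues)
                        break;
                netif_set_xps_queue(dev, cpumask_of(cpu), queue++);
        }
}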
2065 
2066 #endif
2067 /*
2068  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2069  * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
2070  */
2071 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2072 {
2073         int rc;
2074 
2075         if (txq < 1 || txq > dev->num_tx_queues)
2076                 return -EINVAL;
2077 
2078         if (dev->reg_state == NETREG_REGISTERED ||
2079             dev->reg_state == NETREG_UNREGISTERING) {
2080                 ASSERT_RTNL();
2081 
2082                 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2083                                                   txq);
2084                 if (rc)
2085                         return rc;
2086 
2087                 if (dev->num_tc)
2088                         netif_setup_tc(dev, txq);
2089 
2090                 if (txq < dev->real_num_tx_queues) {
2091                         qdisc_reset_all_tx_gt(dev, txq);
2092 #ifdef CONFIG_XPS
2093                         netif_reset_xps_queues_gt(dev, txq);
2094 #endif
2095                 }
2096         }
2097 
2098         dev->real_num_tx_queues = txq;
2099         return 0;
2100 }
2101 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2102 
2103 #ifdef CONFIG_SYSFS
2104 /**
2105  *      netif_set_real_num_rx_queues - set actual number of RX queues used
2106  *      @dev: Network device
2107  *      @rxq: Actual number of RX queues
2108  *
2109  *      This must be called either with the rtnl_lock held or before
2110  *      registration of the net device.  Returns 0 on success, or a
2111  *      negative error code.  If called before registration, it always
2112  *      succeeds.
2113  */
2114 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2115 {
2116         int rc;
2117 
2118         if (rxq < 1 || rxq > dev->num_rx_queues)
2119                 return -EINVAL;
2120 
2121         if (dev->reg_state == NETREG_REGISTERED) {
2122                 ASSERT_RTNL();
2123 
2124                 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2125                                                   rxq);
2126                 if (rc)
2127                         return rc;
2128         }
2129 
2130         dev->real_num_rx_queues = rxq;
2131         return 0;
2132 }
2133 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2134 #endif
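/*
 * Editorial usage sketch, not part of the original file: a driver reacting
 * to an ethtool channel change would typically update both directions under
 * the RTNL.  "example_set_channels" and its arguments are illustrative.
 */
static int example_set_channels(struct net_device *dev,
                                unsigned int txq, unsigned int rxq)
{
        int err;

        ASSERT_RTNL();

        err = netif_set_real_num_tx_queues(dev, txq);
        if (err)
                return err;

        return netif_set_real_num_rx_queues(dev, rxq);
}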
2135 
2136 /**
2137  * netif_get_num_default_rss_queues - default number of RSS queues
2138  *
2139  * This routine should set an upper limit on the number of RSS queues
2140  * used by default by multiqueue devices.
2141  */
2142 int netif_get_num_default_rss_queues(void)
2143 {
2144         return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2145 }
2146 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
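/*
 * Editorial note, not part of the original file: drivers typically clamp
 * their queue count to this default instead of creating one queue per CPU
 * on very large machines, roughly:
 *
 *      queues = min_t(unsigned int, hw_max_queues,
 *                     netif_get_num_default_rss_queues());
 *
 * where "hw_max_queues" stands for a hypothetical hardware limit.
 */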
2147 
2148 static inline void __netif_reschedule(struct Qdisc *q)
2149 {
2150         struct softnet_data *sd;
2151         unsigned long flags;
2152 
2153         local_irq_save(flags);
2154         sd = this_cpu_ptr(&softnet_data);
2155         q->next_sched = NULL;
2156         *sd->output_queue_tailp = q;
2157         sd->output_queue_tailp = &q->next_sched;
2158         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2159         local_irq_restore(flags);
2160 }
2161 
2162 void __netif_schedule(struct Qdisc *q)
2163 {
2164         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2165                 __netif_reschedule(q);
2166 }
2167 EXPORT_SYMBOL(__netif_schedule);
2168 
2169 struct dev_kfree_skb_cb {
2170         enum skb_free_reason reason;
2171 };
2172 
2173 static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2174 {
2175         return (struct dev_kfree_skb_cb *)skb->cb;
2176 }
2177 
2178 void netif_schedule_queue(struct netdev_queue *txq)
2179 {
2180         rcu_read_lock();
2181         if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2182                 struct Qdisc *q = rcu_dereference(txq->qdisc);
2183 
2184                 __netif_schedule(q);
2185         }
2186         rcu_read_unlock();
2187 }
2188 EXPORT_SYMBOL(netif_schedule_queue);
2189 
2190 /**
2191  *      netif_wake_subqueue - allow sending packets on subqueue
2192  *      @dev: network device
2193  *      @queue_index: sub queue index
2194  *
2195  * Resume individual transmit queue of a device with multiple transmit queues.
2196  */
2197 void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
2198 {
2199         struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
2200 
2201         if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
2202                 struct Qdisc *q;
2203 
2204                 rcu_read_lock();
2205                 q = rcu_dereference(txq->qdisc);
2206                 __netif_schedule(q);
2207                 rcu_read_unlock();
2208         }
2209 }
2210 EXPORT_SYMBOL(netif_wake_subqueue);
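/*
 * Editorial usage sketch, not part of the original file: a TX completion
 * handler typically wakes a subqueue it had stopped once enough descriptors
 * have been reclaimed.  The threshold of 32 and the function name are
 * arbitrary choices for the sketch.
 */
static void example_tx_clean_done(struct net_device *dev, u16 qidx,
                                  unsigned int free_descs)
{
        if (free_descs > 32 && __netif_subqueue_stopped(dev, qidx))
                netif_wake_subqueue(dev, qidx);
}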
2211 
2212 void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2213 {
2214         if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2215                 struct Qdisc *q;
2216 
2217                 rcu_read_lock();
2218                 q = rcu_dereference(dev_queue->qdisc);
2219                 __netif_schedule(q);
2220                 rcu_read_unlock();
2221         }
2222 }
2223 EXPORT_SYMBOL(netif_tx_wake_queue);
2224 
2225 void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2226 {
2227         unsigned long flags;
2228 
2229         if (likely(atomic_read(&skb->users) == 1)) {
2230                 smp_rmb();
2231                 atomic_set(&skb->users, 0);
2232         } else if (likely(!atomic_dec_and_test(&skb->users))) {
2233                 return;
2234         }
2235         get_kfree_skb_cb(skb)->reason = reason;
2236         local_irq_save(flags);
2237         skb->next = __this_cpu_read(softnet_data.completion_queue);
2238         __this_cpu_write(softnet_data.completion_queue, skb);
2239         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2240         local_irq_restore(flags);
2241 }
2242 EXPORT_SYMBOL(__dev_kfree_skb_irq);
2243 
2244 void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2245 {
2246         if (in_irq() || irqs_disabled())
2247                 __dev_kfree_skb_irq(skb, reason);
2248         else
2249                 dev_kfree_skb(skb);
2250 }
2251 EXPORT_SYMBOL(__dev_kfree_skb_any);
2252 
2253 
2254 /**
2255  * netif_device_detach - mark device as removed
2256  * @dev: network device
2257  *
2258  * Mark device as removed from system and therefore no longer available.
2259  */
2260 void netif_device_detach(struct net_device *dev)
2261 {
2262         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2263             netif_running(dev)) {
2264                 netif_tx_stop_all_queues(dev);
2265         }
2266 }
2267 EXPORT_SYMBOL(netif_device_detach);
2268 
2269 /**
2270  * netif_device_attach - mark device as attached
2271  * @dev: network device
2272  *
2273  * Mark device as attached from system and restart if needed.
2274  */
2275 void netif_device_attach(struct net_device *dev)
2276 {
2277         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2278             netif_running(dev)) {
2279                 netif_tx_wake_all_queues(dev);
2280                 __netdev_watchdog_up(dev);
2281         }
2282 }
2283 EXPORT_SYMBOL(netif_device_attach);
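/*
 * Editorial usage sketch, not part of the original file: the classic pairing
 * of netif_device_detach()/netif_device_attach() is a driver's suspend and
 * resume path.  The function names are illustrative only.
 */
static int example_suspend(struct net_device *dev)
{
        netif_device_detach(dev);       /* stop the stack from using the device */
        /* ... put the hardware into a low-power state here ... */
        return 0;
}

static int example_resume(struct net_device *dev)
{
        /* ... bring the hardware back up here ... */
        netif_device_attach(dev);       /* restarts the queues if the device was up */
        return 0;
}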
2284 
2285 static void skb_warn_bad_offload(const struct sk_buff *skb)
2286 {
2287         static const netdev_features_t null_features = 0;
2288         struct net_device *dev = skb->dev;
2289         const char *driver = "";
2290 
2291         if (!net_ratelimit())
2292                 return;
2293 
2294         if (dev && dev->dev.parent)
2295                 driver = dev_driver_string(dev->dev.parent);
2296 
2297         WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2298              "gso_type=%d ip_summed=%d\n",
2299              driver, dev ? &dev->features : &null_features,
2300              skb->sk ? &skb->sk->sk_route_caps : &null_features,
2301              skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2302              skb_shinfo(skb)->gso_type, skb->ip_summed);
2303 }
2304 
2305 /*
2306  * Invalidate hardware checksum when packet is to be mangled, and
2307  * complete checksum manually on outgoing path.
2308  */
2309 int skb_checksum_help(struct sk_buff *skb)
2310 {
2311         __wsum csum;
2312         int ret = 0, offset;
2313 
2314         if (skb->ip_summed == CHECKSUM_COMPLETE)
2315                 goto out_set_summed;
2316 
2317         if (unlikely(skb_shinfo(skb)->gso_size)) {
2318                 skb_warn_bad_offload(skb);
2319                 return -EINVAL;
2320         }
2321 
2322         /* Before computing a checksum, we should make sure no frag could
2323          * be modified by an external entity: the checksum could otherwise be wrong.
2324          */
2325         if (skb_has_shared_frag(skb)) {
2326                 ret = __skb_linearize(skb);
2327                 if (ret)
2328                         goto out;
2329         }
2330 
2331         offset = skb_checksum_start_offset(skb);
2332         BUG_ON(offset >= skb_headlen(skb));
2333         csum = skb_checksum(skb, offset, skb->len - offset, 0);
2334 
2335         offset += skb->csum_offset;
2336         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2337 
2338         if (skb_cloned(skb) &&
2339             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2340                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2341                 if (ret)
2342                         goto out;
2343         }
2344 
2345         *(__sum16 *)(skb->data + offset) = csum_fold(csum);
2346 out_set_summed:
2347         skb->ip_summed = CHECKSUM_NONE;
2348 out:
2349         return ret;
2350 }
2351 EXPORT_SYMBOL(skb_checksum_help);
2352 
2353 __be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2354 {
2355         __be16 type = skb->protocol;
2356 
2357         /* Tunnel gso handlers can set protocol to ethernet. */
2358         if (type == htons(ETH_P_TEB)) {
2359                 struct ethhdr *eth;
2360 
2361                 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2362                         return 0;
2363 
2364                 eth = (struct ethhdr *)skb_mac_header(skb);
2365                 type = eth->h_proto;
2366         }
2367 
2368         return __vlan_get_protocol(skb, type, depth);
2369 }
2370 
2371 /**
2372  *      skb_mac_gso_segment - mac layer segmentation handler.
2373  *      @skb: buffer to segment
2374  *      @features: features for the output path (see dev->features)
2375  */
2376 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2377                                     netdev_features_t features)
2378 {
2379         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2380         struct packet_offload *ptype;
2381         int vlan_depth = skb->mac_len;
2382         __be16 type = skb_network_protocol(skb, &vlan_depth);
2383 
2384         if (unlikely(!type))
2385                 return ERR_PTR(-EINVAL);
2386 
2387         __skb_pull(skb, vlan_depth);
2388 
2389         rcu_read_lock();
2390         list_for_each_entry_rcu(ptype, &offload_base, list) {
2391                 if (ptype->type == type && ptype->callbacks.gso_segment) {
2392                         segs = ptype->callbacks.gso_segment(skb, features);
2393                         break;
2394                 }
2395         }
2396         rcu_read_unlock();
2397 
2398         __skb_push(skb, skb->data - skb_mac_header(skb));
2399 
2400         return segs;
2401 }
2402 EXPORT_SYMBOL(skb_mac_gso_segment);
2403 
2404 
2405 /* openvswitch calls this on rx path, so we need a different check.
2406  */
2407 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2408 {
2409         if (tx_path)
2410                 return skb->ip_summed != CHECKSUM_PARTIAL;
2411         else
2412                 return skb->ip_summed == CHECKSUM_NONE;
2413 }
2414 
2415 /**
2416  *      __skb_gso_segment - Perform segmentation on skb.
2417  *      @skb: buffer to segment
2418  *      @features: features for the output path (see dev->features)
2419  *      @tx_path: whether it is called in TX path
2420  *
2421  *      This function segments the given skb and returns a list of segments.
2422  *
2423  *      It may return NULL if the skb requires no segmentation.  This is
2424  *      only possible when GSO is used for verifying header integrity.
2425  */
2426 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2427                                   netdev_features_t features, bool tx_path)
2428 {
2429         if (unlikely(skb_needs_check(skb, tx_path))) {
2430                 int err;
2431 
2432                 skb_warn_bad_offload(skb);
2433 
2434                 err = skb_cow_head(skb, 0);
2435                 if (err < 0)
2436                         return ERR_PTR(err);
2437         }
2438 
2439         SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2440         SKB_GSO_CB(skb)->encap_level = 0;
2441 
2442         skb_reset_mac_header(skb);
2443         skb_reset_mac_len(skb);
2444 
2445         return skb_mac_gso_segment(skb, features);
2446 }
2447 EXPORT_SYMBOL(__skb_gso_segment);
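/*
 * Editorial usage sketch, not part of the original file: a caller that must
 * software-segment a GSO skb walks the returned list one segment at a time,
 * much as validate_xmit_skb() does above.  The transmit step is left as a
 * comment because it is device specific.
 */
static void example_segment_and_send(struct sk_buff *skb,
                                     netdev_features_t features)
{
        struct sk_buff *segs;

        segs = skb_gso_segment(skb, features);
        if (IS_ERR(segs)) {
                kfree_skb(skb);         /* could not segment, drop it */
                return;
        }
        if (!segs) {
                /* no segmentation was needed, send the original skb */
                /* ... hand skb to the hardware here ... */
                return;
        }

        consume_skb(skb);               /* the segments carry the data now */
        while (segs) {
                struct sk_buff *nskb = segs;

                segs = segs->next;
                nskb->next = NULL;
                /* ... hand nskb to the hardware here ... */
        }
}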
2448 
2449 /* Take action when hardware reception checksum errors are detected. */
2450 #ifdef CONFIG_BUG
2451 void netdev_rx_csum_fault(struct net_device *dev)
2452 {
2453         if (net_ratelimit()) {
2454                 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2455                 dump_stack();
2456         }
2457 }
2458 EXPORT_SYMBOL(netdev_rx_csum_fault);
2459 #endif
2460 
2461 /* Actually, we should eliminate this check as soon as we know that:
2462  * 1. An IOMMU is present and is able to map all of the memory.
2463  * 2. No high memory really exists on this machine.
2464  */
2465 
2466 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2467 {
2468 #ifdef CONFIG_HIGHMEM
2469         int i;
2470         if (!(dev->features & NETIF_F_HIGHDMA)) {
2471                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2472                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2473                         if (PageHighMem(skb_frag_page(frag)))
2474                                 return 1;
2475                 }
2476         }
2477 
2478         if (PCI_DMA_BUS_IS_PHYS) {
2479                 struct device *pdev = dev->dev.parent;
2480 
2481                 if (!pdev)
2482                         return 0;
2483                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2484                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2485                         dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2486                         if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2487                                 return 1;
2488                 }
2489         }
2490 #endif
2491         return 0;
2492 }
2493 
2494 /* If MPLS offload request, verify we are testing hardware MPLS features
2495  * instead of standard features for the netdev.
2496  */
2497 #if IS_ENABLED(CONFIG_NET_MPLS_GSO)
2498 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2499                                            netdev_features_t features,
2500                                            __be16 type)
2501 {
2502         if (eth_p_mpls(type))
2503                 features &= skb->dev->mpls_features;
2504 
2505         return features;
2506 }
2507 #else
2508 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2509                                            netdev_features_t features,
2510                                            __be16 type)
2511 {
2512         return features;
2513 }
2514 #endif
2515 
2516 static netdev_features_t harmonize_features(struct sk_buff *skb,
2517         netdev_features_t features)
2518 {
2519         int tmp;
2520         __be16 type;
2521 
2522         type = skb_network_protocol(skb, &tmp);
2523         features = net_mpls_features(skb, features, type);
2524 
2525         if (skb->ip_summed != CHECKSUM_NONE &&
2526             !can_checksum_protocol(features, type)) {
2527                 features &= ~NETIF_F_ALL_CSUM;
2528         } else if (illegal_highdma(skb->dev, skb)) {
2529                 features &= ~NETIF_F_SG;
2530         }
2531 
2532         return features;
2533 }
2534 
2535 netdev_features_t netif_skb_features(struct sk_buff *skb)
2536 {
2537         struct net_device *dev = skb->dev;
2538         netdev_features_t features = dev->features;
2539         u16 gso_segs = skb_shinfo(skb)->gso_segs;
2540         __be16 protocol = skb->protocol;
2541 
2542         if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs)
2543                 features &= ~NETIF_F_GSO_MASK;
2544 
2545         /* If encapsulation offload request, verify we are testing
2546          * hardware encapsulation features instead of standard
2547          * features for the netdev
2548          */
2549         if (skb->encapsulation)
2550                 features &= dev->hw_enc_features;
2551 
2552         if (!vlan_tx_tag_present(skb)) {
2553                 if (unlikely(protocol == htons(ETH_P_8021Q) ||
2554                              protocol == htons(ETH_P_8021AD))) {
2555                         struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2556                         protocol = veh->h_vlan_encapsulated_proto;
2557                 } else {
2558                         goto finalize;
2559                 }
2560         }
2561 
2562         features = netdev_intersect_features(features,
2563                                              dev->vlan_features |
2564                                              NETIF_F_HW_VLAN_CTAG_TX |
2565                                              NETIF_F_HW_VLAN_STAG_TX);
2566 
2567         if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD))
2568                 features = netdev_intersect_features(features,
2569                                                      NETIF_F_SG |
2570                                                      NETIF_F_HIGHDMA |
2571                                                      NETIF_F_FRAGLIST |
2572                                                      NETIF_F_GEN_CSUM |
2573                                                      NETIF_F_HW_VLAN_CTAG_TX |
2574                                                      NETIF_F_HW_VLAN_STAG_TX);
2575 
2576 finalize:
2577         if (dev->netdev_ops->ndo_features_check)
2578                 features &= dev->netdev_ops->ndo_features_check(skb, dev,
2579                                                                 features);
2580 
2581         return harmonize_features(skb, features);
2582 }
2583 EXPORT_SYMBOL(netif_skb_features);
2584 
2585 static int xmit_one(struct sk_buff *skb, struct net_device *dev,
2586                     struct netdev_queue *txq, bool more)
2587 {
2588         unsigned int len;
2589         int rc;
2590 
2591         if (!list_empty(&ptype_all))
2592                 dev_queue_xmit_nit(skb, dev);
2593 
2594         len = skb->len;
2595         trace_net_dev_start_xmit(skb, dev);
2596         rc = netdev_start_xmit(skb, dev, txq, more);
2597         trace_net_dev_xmit(skb, rc, dev, len);
2598 
2599         return rc;
2600 }
2601 
2602 struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
2603                                     struct netdev_queue *txq, int *ret)
2604 {
2605         struct sk_buff *skb = first;
2606         int rc = NETDEV_TX_OK;
2607 
2608         while (skb) {
2609                 struct sk_buff *next = skb->next;
2610 
2611                 skb->next = NULL;
2612                 rc = xmit_one(skb, dev, txq, next != NULL);
2613                 if (unlikely(!dev_xmit_complete(rc))) {
2614                         skb->next = next;
2615                         goto out;
2616                 }
2617 
2618                 skb = next;
2619                 if (netif_xmit_stopped(txq) && skb) {
2620                         rc = NETDEV_TX_BUSY;
2621                         break;
2622                 }
2623         }
2624 
2625 out:
2626         *ret = rc;
2627         return skb;
2628 }
2629 
2630 static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
2631                                           netdev_features_t features)
2632 {
2633         if (vlan_tx_tag_present(skb) &&
2634             !vlan_hw_offload_capable(features, skb->vlan_proto))
2635                 skb = __vlan_hwaccel_push_inside(skb);
2636         return skb;
2637 }
2638 
2639 static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
2640 {
2641         netdev_features_t features;
2642 
2643         if (skb->next)
2644                 return skb;
2645 
2646         features = netif_skb_features(skb);
2647         skb = validate_xmit_vlan(skb, features);
2648         if (unlikely(!skb))
2649                 goto out_null;
2650 
2651         if (netif_needs_gso(dev, skb, features)) {
2652                 struct sk_buff *segs;
2653 
2654                 segs = skb_gso_segment(skb, features);
2655                 if (IS_ERR(segs)) {
2656                         goto out_kfree_skb;
2657                 } else if (segs) {
2658                         consume_skb(skb);
2659                         skb = segs;
2660                 }
2661         } else {
2662                 if (skb_needs_linearize(skb, features) &&
2663                     __skb_linearize(skb))
2664                         goto out_kfree_skb;
2665 
2666                 /* If packet is not checksummed and device does not
2667                  * support checksumming for this protocol, complete
2668                  * checksumming here.
2669                  */
2670                 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2671                         if (skb->encapsulation)
2672                                 skb_set_inner_transport_header(skb,
2673                                                                skb_checksum_start_offset(skb));
2674                         else
2675                                 skb_set_transport_header(skb,
2676                                                          skb_checksum_start_offset(skb));
2677                         if (!(features & NETIF_F_ALL_CSUM) &&
2678                             skb_checksum_help(skb))
2679                                 goto out_kfree_skb;
2680                 }
2681         }
2682 
2683         return skb;
2684 
2685 out_kfree_skb:
2686         kfree_skb(skb);
2687 out_null:
2688         return NULL;
2689 }
2690 
2691 struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
2692 {
2693         struct sk_buff *next, *head = NULL, *tail;
2694 
2695         for (; skb != NULL; skb = next) {
2696                 next = skb->next;
2697                 skb->next = NULL;
2698 
2699                 /* in case the skb won't be segmented, point to itself */
2700                 skb->prev = skb;
2701 
2702                 skb = validate_xmit_skb(skb, dev);
2703                 if (!skb)
2704                         continue;
2705 
2706                 if (!head)
2707                         head = skb;
2708                 else
2709                         tail->next = skb;
2710                 /* If skb was segmented, skb->prev points to
2711                  * the last segment. If not, it still contains skb.
2712                  */
2713                 tail = skb->prev;
2714         }
2715         return head;
2716 }
2717 
2718 static void qdisc_pkt_len_init(struct sk_buff *skb)
2719 {
2720         const struct skb_shared_info *shinfo = skb_shinfo(skb);
2721 
2722         qdisc_skb_cb(skb)->pkt_len = skb->len;
2723 
2724         /* To get a more precise estimate of the bytes sent on the wire,
2725          * we add the header size of every segment to pkt_len
2726          */
2727         if (shinfo->gso_size)  {
2728                 unsigned int hdr_len;
2729                 u16 gso_segs = shinfo->gso_segs;
2730 
2731                 /* mac layer + network layer */
2732                 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2733 
2734                 /* + transport layer */
2735                 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2736                         hdr_len += tcp_hdrlen(skb);
2737                 else
2738                         hdr_len += sizeof(struct udphdr);
2739 
2740                 if (shinfo->gso_type & SKB_GSO_DODGY)
2741                         gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
2742                                                 shinfo->gso_size);
2743 
2744                 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
2745         }
2746 }
2747 
2748 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2749                                  struct net_device *dev,
2750                                  struct netdev_queue *txq)
2751 {
2752         spinlock_t *root_lock = qdisc_lock(q);
2753         bool contended;
2754         int rc;
2755 
2756         qdisc_pkt_len_init(skb);
2757         qdisc_calculate_pkt_len(skb, q);
2758         /*
2759          * Heuristic to force contended enqueues to serialize on a
2760          * separate lock before trying to get the qdisc main lock.
2761          * This permits the __QDISC___STATE_RUNNING owner to get the lock
2762          * more often and dequeue packets faster.
2763          */
2764         contended = qdisc_is_running(q);
2765         if (unlikely(contended))
2766                 spin_lock(&q->busylock);
2767 
2768         spin_lock(root_lock);
2769         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2770                 kfree_skb(skb);
2771                 rc = NET_XMIT_DROP;
2772         } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2773                    qdisc_run_begin(q)) {
2774                 /*
2775                  * This is a work-conserving queue; there are no old skbs
2776                  * waiting to be sent out; and the qdisc is not running -
2777                  * xmit the skb directly.
2778                  */
2779 
2780                 qdisc_bstats_update(q, skb);
2781 
2782                 if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
2783                         if (unlikely(contended)) {
2784                                 spin_unlock(&q->busylock);
2785                                 contended = false;
2786                         }
2787                         __qdisc_run(q);
2788                 } else
2789                         qdisc_run_end(q);
2790 
2791                 rc = NET_XMIT_SUCCESS;
2792         } else {
2793                 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2794                 if (qdisc_run_begin(q)) {
2795                         if (unlikely(contended)) {
2796                                 spin_unlock(&q->busylock);
2797                                 contended = false;
2798                         }
2799                         __qdisc_run(q);
2800                 }
2801         }
2802         spin_unlock(root_lock);
2803         if (unlikely(contended))
2804                 spin_unlock(&q->busylock);
2805         return rc;
2806 }
2807 
2808 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
2809 static void skb_update_prio(struct sk_buff *skb)
2810 {
2811         struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2812 
2813         if (!skb->priority && skb->sk && map) {
2814                 unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2815 
2816                 if (prioidx < map->priomap_len)
2817                         skb->priority = map->priomap[prioidx];
2818         }
2819 }
2820 #else
2821 #define skb_update_prio(skb)
2822 #endif
2823 
2824 DEFINE_PER_CPU(int, xmit_recursion);
2825 EXPORT_SYMBOL(xmit_recursion);
2826 
2827 #define RECURSION_LIMIT 10
2828 
2829 /**
2830  *      dev_loopback_xmit - loop back @skb
2831  *      @skb: buffer to transmit
2832  */
2833 int dev_loopback_xmit(struct sk_buff *skb)
2834 {
2835         skb_reset_mac_header(skb);
2836         __skb_pull(skb, skb_network_offset(skb));
2837         skb->pkt_type = PACKET_LOOPBACK;
2838         skb->ip_summed = CHECKSUM_UNNECESSARY;
2839         WARN_ON(!skb_dst(skb));
2840         skb_dst_force(skb);
2841         netif_rx_ni(skb);
2842         return 0;
2843 }
2844 EXPORT_SYMBOL(dev_loopback_xmit);
2845 
2846 /**
2847  *      __dev_queue_xmit - transmit a buffer
2848  *      @skb: buffer to transmit
2849  *      @accel_priv: private data used for L2 forwarding offload
2850  *
2851  *      Queue a buffer for transmission to a network device. The caller must
2852  *      have set the device and priority and built the buffer before calling
2853  *      this function. The function can be called from an interrupt.
2854  *
2855  *      A negative errno code is returned on a failure. A success does not
2856  *      guarantee the frame will be transmitted as it may be dropped due
2857  *      to congestion or traffic shaping.
2858  *
2859  * -----------------------------------------------------------------------------------
2860  *      I notice this method can also return errors from the queue disciplines,
2861  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2862  *      be positive.
2863  *
2864  *      Regardless of the return value, the skb is consumed, so it is currently
2865  *      difficult to retry a send to this method.  (You can bump the ref count
2866  *      before sending to hold a reference for retry if you are careful.)
2867  *
2868  *      When calling this method, interrupts MUST be enabled.  This is because
2869  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2870  *          --BLG
2871  */
2872 static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
2873 {
2874         struct net_device *dev = skb->dev;
2875         struct netdev_queue *txq;
2876         struct Qdisc *q;
2877         int rc = -ENOMEM;
2878 
2879         skb_reset_mac_header(skb);
2880 
2881         if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
2882                 __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
2883 
2884         /* Disable soft irqs for various locks below. Also
2885          * stops preemption for RCU.
2886          */
2887         rcu_read_lock_bh();
2888 
2889         skb_update_prio(skb);
2890 
2891         /* If device/qdisc don't need skb->dst, release it right now while
2892          * it's still hot in this CPU's cache.
2893          */
2894         if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2895                 skb_dst_drop(skb);
2896         else
2897                 skb_dst_force(skb);
2898 
2899         txq = netdev_pick_tx(dev, skb, accel_priv);
2900         q = rcu_dereference_bh(txq->qdisc);
2901 
2902 #ifdef CONFIG_NET_CLS_ACT
2903         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2904 #endif
2905         trace_net_dev_queue(skb);
2906         if (q->enqueue) {
2907                 rc = __dev_xmit_skb(skb, q, dev, txq);
2908                 goto out;
2909         }
2910 
2911         /* The device has no queue. Common case for software devices:
2912            loopback, all sorts of tunnels...
2913 
2914            Really, it is unlikely that netif_tx_lock protection is necessary
2915            here.  (e.g. loopback and IP tunnels are clean, ignoring statistics
2916            counters.)
2917            However, it is possible that they rely on the protection
2918            we provide here.
2919 
2920            Check this and remove the lock if possible; it is not prone to
2921            deadlocks. Or remove the noqueue qdisc - that is even simpler 8)
2922          */
2923         if (dev->flags & IFF_UP) {
2924                 int cpu = smp_processor_id(); /* ok because BHs are off */
2925 
2926                 if (txq->xmit_lock_owner != cpu) {
2927 
2928                         if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2929                                 goto recursion_alert;
2930 
2931                         skb = validate_xmit_skb(skb, dev);
2932                         if (!skb)
2933                                 goto drop;
2934 
2935                         HARD_TX_LOCK(dev, txq, cpu);
2936 
2937                         if (!netif_xmit_stopped(txq)) {
2938                                 __this_cpu_inc(xmit_recursion);
2939                                 skb = dev_hard_start_xmit(skb, dev, txq, &rc);
2940                                 __this_cpu_dec(xmit_recursion);
2941                                 if (dev_xmit_complete(rc)) {
2942                                         HARD_TX_UNLOCK(dev, txq);
2943                                         goto out;
2944                                 }
2945                         }
2946                         HARD_TX_UNLOCK(dev, txq);
2947                         net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2948                                              dev->name);
2949                 } else {
2950                         /* Recursion is detected! It is possible,
2951                          * unfortunately
2952                          */
2953 recursion_alert:
2954                         net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
2955                                              dev->name);
2956                 }
2957         }
2958 
2959         rc = -ENETDOWN;
2960 drop:
2961         rcu_read_unlock_bh();
2962 
2963         atomic_long_inc(&dev->tx_dropped);
2964         kfree_skb_list(skb);
2965         return rc;
2966 out:
2967         rcu_read_unlock_bh();
2968         return rc;
2969 }
2970 
2971 int dev_queue_xmit(struct sk_buff *skb)
2972 {
2973         return __dev_queue_xmit(skb, NULL);
2974 }
2975 EXPORT_SYMBOL(dev_queue_xmit);
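/*
 * Editorial usage sketch, not part of the original file: a protocol or
 * tunnel that has fully built an skb (device, priority, headers) hands it
 * to the qdisc layer like this.  Whatever the return value, the skb must
 * not be touched again afterwards; "stats" is a hypothetical counter set.
 *
 *      skb->dev = dev;
 *      skb->protocol = htons(ETH_P_IP);
 *      err = dev_queue_xmit(skb);
 *      if (net_xmit_eval(err))
 *              stats->tx_errors++;
 */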
2976 
2977 int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
2978 {
2979         return __dev_queue_xmit(skb, accel_priv);
2980 }
2981 EXPORT_SYMBOL(dev_queue_xmit_accel);
2982 
2983 
2984 /*=======================================================================
2985                         Receiver routines
2986   =======================================================================*/
2987 
2988 int netdev_max_backlog __read_mostly = 1000;
2989 EXPORT_SYMBOL(netdev_max_backlog);
2990 
2991 int netdev_tstamp_prequeue __read_mostly = 1;
2992 int netdev_budget __read_mostly = 300;
2993 int weight_p __read_mostly = 64;            /* old backlog weight */
2994 
2995 /* Called with irq disabled */
2996 static inline void ____napi_schedule(struct softnet_data *sd,
2997                                      struct napi_struct *napi)
2998 {
2999         list_add_tail(&napi->poll_list, &sd->poll_list);
3000         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3001 }
3002 
3003 #ifdef CONFIG_RPS
3004 
3005 /* One global table that all flow-based protocols share. */
3006 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
3007 EXPORT_SYMBOL(rps_sock_flow_table);
3008 
3009 struct static_key rps_needed __read_mostly;
3010 
3011 static struct rps_dev_flow *
3012 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3013             struct rps_dev_flow *rflow, u16 next_cpu)
3014 {
3015         if (next_cpu != RPS_NO_CPU) {
3016 #ifdef CONFIG_RFS_ACCEL
3017                 struct netdev_rx_queue *rxqueue;
3018                 struct rps_dev_flow_table *flow_table;
3019                 struct rps_dev_flow *old_rflow;
3020                 u32 flow_id;
3021                 u16 rxq_index;
3022                 int rc;
3023 
3024                 /* Should we steer this flow to a different hardware queue? */
3025                 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
3026                     !(dev->features & NETIF_F_NTUPLE))
3027                         goto out;
3028                 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
3029                 if (rxq_index == skb_get_rx_queue(skb))
3030                         goto out;
3031 
3032                 rxqueue = dev->_rx + rxq_index;
3033                 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3034                 if (!flow_table)
3035                         goto out;
3036                 flow_id = skb_get_hash(skb) & flow_table->mask;
3037                 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3038                                                         rxq_index, flow_id);
3039                 if (rc < 0)
3040                         goto out;
3041                 old_rflow = rflow;
3042                 rflow = &flow_table->flows[flow_id];
3043                 rflow->filter = rc;
3044                 if (old_rflow->filter == rflow->filter)
3045                         old_rflow->filter = RPS_NO_FILTER;
3046         out:
3047 #endif
3048                 rflow->last_qtail =
3049                         per_cpu(softnet_data, next_cpu).input_queue_head;
3050         }
3051 
3052         rflow->cpu = next_cpu;
3053         return rflow;
3054 }
3055 
3056 /*
3057  * get_rps_cpu is called from netif_receive_skb and returns the target
3058  * CPU from the RPS map of the receiving queue for a given skb.
3059  * rcu_read_lock must be held on entry.
3060  */
3061 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3062                        struct rps_dev_flow **rflowp)
3063 {
3064         struct netdev_rx_queue *rxqueue;
3065         struct rps_map *map;
3066         struct rps_dev_flow_table *flow_table;
3067         struct rps_sock_flow_table *sock_flow_table;
3068         int cpu = -1;
3069         u16 tcpu;
3070         u32 hash;
3071 
3072         if (skb_rx_queue_recorded(skb)) {
3073                 u16 index = skb_get_rx_queue(skb);
3074                 if (unlikely(index >= dev->real_num_rx_queues)) {
3075                         WARN_ONCE(dev->real_num_rx_queues > 1,
3076                                   "%s received packet on queue %u, but number "
3077                                   "of RX queues is %u\n",
3078                                   dev->name, index, dev->real_num_rx_queues);
3079                         goto done;
3080                 }
3081                 rxqueue = dev->_rx + index;
3082         } else
3083                 rxqueue = dev->_rx;
3084 
3085         map = rcu_dereference(rxqueue->rps_map);
3086         if (map) {
3087                 if (map->len == 1 &&
3088                     !rcu_access_pointer(rxqueue->rps_flow_table)) {
3089                         tcpu = map->cpus[0];
3090                         if (cpu_online(tcpu))
3091                                 cpu = tcpu;
3092                         goto done;
3093                 }
3094         } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
3095                 goto done;
3096         }
3097 
3098         skb_reset_network_header(skb);
3099         hash = skb_get_hash(skb);
3100         if (!hash)
3101                 goto done;
3102 
3103         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3104         sock_flow_table = rcu_dereference(rps_sock_flow_table);
3105         if (flow_table && sock_flow_table) {
3106                 u16 next_cpu;
3107                 struct rps_dev_flow *rflow;
3108 
3109                 rflow = &flow_table->flows[hash & flow_table->mask];
3110                 tcpu = rflow->cpu;
3111 
3112                 next_cpu = sock_flow_table->ents[hash & sock_flow_table->mask];
3113 
3114                 /*
3115                  * If the desired CPU (where last recvmsg was done) is
3116                  * different from current CPU (one in the rx-queue flow
3117                  * table entry), switch if one of the following holds:
3118                  *   - Current CPU is unset (equal to RPS_NO_CPU).
3119                  *   - Current CPU is offline.
3120                  *   - The current CPU's queue tail has advanced beyond the
3121                  *     last packet that was enqueued using this table entry.
3122                  *     This guarantees that all previous packets for the flow
3123                  *     have been dequeued, thus preserving in order delivery.
3124                  */
3125                 if (unlikely(tcpu != next_cpu) &&
3126                     (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
3127                      ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3128                       rflow->last_qtail)) >= 0)) {
3129                         tcpu = next_cpu;
3130                         rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3131                 }
3132 
3133                 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
3134                         *rflowp = rflow;
3135                         cpu = tcpu;
3136                         goto done;
3137                 }
3138         }
3139 
3140         if (map) {
3141                 tcpu = map->cpus[reciprocal_scale(hash, map->len)];
3142                 if (cpu_online(tcpu)) {
3143                         cpu = tcpu;
3144                         goto done;
3145                 }
3146         }
3147 
3148 done:
3149         return cpu;
3150 }
3151 
3152 #ifdef CONFIG_RFS_ACCEL
3153 
3154 /**
3155  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3156  * @dev: Device on which the filter was set
3157  * @rxq_index: RX queue index
3158  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3159  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3160  *
3161  * Drivers that implement ndo_rx_flow_steer() should periodically call
3162  * this function for each installed filter and remove the filters for
3163  * which it returns %true.
3164  */
3165 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3166                          u32 flow_id, u16 filter_id)
3167 {
3168         struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3169         struct rps_dev_flow_table *flow_table;
3170         struct rps_dev_flow *rflow;
3171         bool expire = true;
3172         int cpu;
3173 
3174         rcu_read_lock();
3175         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3176         if (flow_table && flow_id <= flow_table->mask) {
3177                 rflow = &flow_table->flows[flow_id];
3178                 cpu = ACCESS_ONCE(rflow->cpu);
3179                 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
3180                     ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3181                            rflow->last_qtail) <
3182                      (int)(10 * flow_table->mask)))
3183                         expire = false;
3184         }
3185         rcu_read_unlock();
3186         return expire;
3187 }
3188 EXPORT_SYMBOL(rps_may_expire_flow);
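/* Illustrative sketch (editor-added, not part of dev.c): how a driver that
 * implements ndo_rx_flow_steer() might periodically age out its hardware
 * filters, as the kernel-doc above suggests.  The my_* structures and helpers
 * are hypothetical; rps_may_expire_flow() is the only real API used here.
 */
static void my_expire_rx_filters(struct my_priv *priv)
{
	unsigned int i;

	for (i = 0; i < priv->n_rx_filters; i++) {
		struct my_rx_filter *f = &priv->rx_filters[i];

		if (!f->in_use)
			continue;
		/* Ask the stack whether this flow may have gone idle or moved */
		if (rps_may_expire_flow(priv->netdev, f->rxq_index,
					f->flow_id, f->filter_id)) {
			my_hw_remove_filter(priv, f);	/* hypothetical helper */
			f->in_use = false;
		}
	}
}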
3189 
3190 #endif /* CONFIG_RFS_ACCEL */
3191 
3192 /* Called from hardirq (IPI) context */
3193 static void rps_trigger_softirq(void *data)
3194 {
3195         struct softnet_data *sd = data;
3196 
3197         ____napi_schedule(sd, &sd->backlog);
3198         sd->received_rps++;
3199 }
3200 
3201 #endif /* CONFIG_RPS */
3202 
3203 /*
3204  * Check if this softnet_data structure belongs to another CPU.
3205  * If yes, queue it to our IPI list and return 1.
3206  * If no, return 0.
3207  */
3208 static int rps_ipi_queued(struct softnet_data *sd)
3209 {
3210 #ifdef CONFIG_RPS
3211         struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
3212 
3213         if (sd != mysd) {
3214                 sd->rps_ipi_next = mysd->rps_ipi_list;
3215                 mysd->rps_ipi_list = sd;
3216 
3217                 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3218                 return 1;
3219         }
3220 #endif /* CONFIG_RPS */
3221         return 0;
3222 }
3223 
3224 #ifdef CONFIG_NET_FLOW_LIMIT
3225 int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3226 #endif
3227 
3228 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3229 {
3230 #ifdef CONFIG_NET_FLOW_LIMIT
3231         struct sd_flow_limit *fl;
3232         struct softnet_data *sd;
3233         unsigned int old_flow, new_flow;
3234 
3235         if (qlen < (netdev_max_backlog >> 1))
3236                 return false;
3237 
3238         sd = this_cpu_ptr(&softnet_data);
3239 
3240         rcu_read_lock();
3241         fl = rcu_dereference(sd->flow_limit);
3242         if (fl) {
3243                 new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3244                 old_flow = fl->history[fl->history_head];
3245                 fl->history[fl->history_head] = new_flow;
3246 
3247                 fl->history_head++;
3248                 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3249 
3250                 if (likely(fl->buckets[old_flow]))
3251                         fl->buckets[old_flow]--;
3252 
3253                 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3254                         fl->count++;
3255                         rcu_read_unlock();
3256                         return true;
3257                 }
3258         }
3259         rcu_read_unlock();
3260 #endif
3261         return false;
3262 }
3263 
3264 /*
3265  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3266  * queue (may be a remote CPU queue).
3267  */
3268 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3269                               unsigned int *qtail)
3270 {
3271         struct softnet_data *sd;
3272         unsigned long flags;
3273         unsigned int qlen;
3274 
3275         sd = &per_cpu(softnet_data, cpu);
3276 
3277         local_irq_save(flags);
3278 
3279         rps_lock(sd);
3280         qlen = skb_queue_len(&sd->input_pkt_queue);
3281         if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3282                 if (qlen) {
3283 enqueue:
3284                         __skb_queue_tail(&sd->input_pkt_queue, skb);
3285                         input_queue_tail_incr_save(sd, qtail);
3286                         rps_unlock(sd);
3287                         local_irq_restore(flags);
3288                         return NET_RX_SUCCESS;
3289                 }
3290 
3291                 /* Schedule NAPI for the backlog device.
3292                  * We can use a non-atomic operation since we own the queue lock.
3293                  */
3294                 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3295                         if (!rps_ipi_queued(sd))
3296                                 ____napi_schedule(sd, &sd->backlog);
3297                 }
3298                 goto enqueue;
3299         }
3300 
3301         sd->dropped++;
3302         rps_unlock(sd);
3303 
3304         local_irq_restore(flags);
3305 
3306         atomic_long_inc(&skb->dev->rx_dropped);
3307         kfree_skb(skb);
3308         return NET_RX_DROP;
3309 }
3310 
3311 static int netif_rx_internal(struct sk_buff *skb)
3312 {
3313         int ret;
3314 
3315         net_timestamp_check(netdev_tstamp_prequeue, skb);
3316 
3317         trace_netif_rx(skb);
3318 #ifdef CONFIG_RPS
3319         if (static_key_false(&rps_needed)) {
3320                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3321                 int cpu;
3322 
3323                 preempt_disable();
3324                 rcu_read_lock();
3325 
3326                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3327                 if (cpu < 0)
3328                         cpu = smp_processor_id();
3329 
3330                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3331 
3332                 rcu_read_unlock();
3333                 preempt_enable();
3334         } else
3335 #endif
3336         {
3337                 unsigned int qtail;
3338                 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3339                 put_cpu();
3340         }
3341         return ret;
3342 }
3343 
3344 /**
3345  *      netif_rx        -       post buffer to the network code
3346  *      @skb: buffer to post
3347  *
3348  *      This function receives a packet from a device driver and queues it for
3349  *      the upper (protocol) levels to process.  It always succeeds. The buffer
3350  *      may be dropped during processing for congestion control or by the
3351  *      protocol layers.
3352  *
3353  *      return values:
3354  *      NET_RX_SUCCESS  (no congestion)
3355  *      NET_RX_DROP     (packet was dropped)
3356  *
3357  */
3358 
3359 int netif_rx(struct sk_buff *skb)
3360 {
3361         trace_netif_rx_entry(skb);
3362 
3363         return netif_rx_internal(skb);
3364 }
3365 EXPORT_SYMBOL(netif_rx);
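/* Illustrative sketch (editor-added, not part of dev.c): the classic non-NAPI
 * receive path.  A driver allocates an skb in its interrupt handler, copies
 * the frame in and hands it to the stack with netif_rx(); netif_rx_ni() below
 * is the variant for process context.  All my_* names are hypothetical.
 */
static void my_rx_one_frame(struct net_device *dev, const void *frame, int len)
{
	struct sk_buff *skb;

	skb = netdev_alloc_skb_ip_align(dev, len);
	if (!skb) {
		dev->stats.rx_dropped++;
		return;
	}
	memcpy(skb_put(skb, len), frame, len);	/* copy frame out of the NIC */
	skb->protocol = eth_type_trans(skb, dev);
	netif_rx(skb);				/* queue to a per-CPU backlog */
}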
3366 
3367 int netif_rx_ni(struct sk_buff *skb)
3368 {
3369         int err;
3370 
3371         trace_netif_rx_ni_entry(skb);
3372 
3373         preempt_disable();
3374         err = netif_rx_internal(skb);
3375         if (local_softirq_pending())
3376                 do_softirq();
3377         preempt_enable();
3378 
3379         return err;
3380 }
3381 EXPORT_SYMBOL(netif_rx_ni);
3382 
3383 static void net_tx_action(struct softirq_action *h)
3384 {
3385         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3386 
3387         if (sd->completion_queue) {
3388                 struct sk_buff *clist;
3389 
3390                 local_irq_disable();
3391                 clist = sd->completion_queue;
3392                 sd->completion_queue = NULL;
3393                 local_irq_enable();
3394 
3395                 while (clist) {
3396                         struct sk_buff *skb = clist;
3397                         clist = clist->next;
3398 
3399                         WARN_ON(atomic_read(&skb->users));
3400                         if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3401                                 trace_consume_skb(skb);
3402                         else
3403                                 trace_kfree_skb(skb, net_tx_action);
3404                         __kfree_skb(skb);
3405                 }
3406         }
3407 
3408         if (sd->output_queue) {
3409                 struct Qdisc *head;
3410 
3411                 local_irq_disable();
3412                 head = sd->output_queue;
3413                 sd->output_queue = NULL;
3414                 sd->output_queue_tailp = &sd->output_queue;
3415                 local_irq_enable();
3416 
3417                 while (head) {
3418                         struct Qdisc *q = head;
3419                         spinlock_t *root_lock;
3420 
3421                         head = head->next_sched;
3422 
3423                         root_lock = qdisc_lock(q);
3424                         if (spin_trylock(root_lock)) {
3425                                 smp_mb__before_atomic();
3426                                 clear_bit(__QDISC_STATE_SCHED,
3427                                           &q->state);
3428                                 qdisc_run(q);
3429                                 spin_unlock(root_lock);
3430                         } else {
3431                                 if (!test_bit(__QDISC_STATE_DEACTIVATED,
3432                                               &q->state)) {
3433                                         __netif_reschedule(q);
3434                                 } else {
3435                                         smp_mb__before_atomic();
3436                                         clear_bit(__QDISC_STATE_SCHED,
3437                                                   &q->state);
3438                                 }
3439                         }
3440                 }
3441         }
3442 }
3443 
3444 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3445     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3446 /* This hook is defined here for ATM LANE */
3447 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3448                              unsigned char *addr) __read_mostly;
3449 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3450 #endif
3451 
3452 #ifdef CONFIG_NET_CLS_ACT
3453 /* TODO: Maybe we should just force sch_ingress to be compiled in
3454  * whenever CONFIG_NET_CLS_ACT is?  Otherwise we pay for some useless
3455  * instructions (a compare and two extra stores) when sch_ingress is
3456  * not built but CONFIG_NET_CLS_ACT is.
3457  * NOTE: This doesn't stop any functionality; if you don't have
3458  * the ingress scheduler, you just can't add policies on ingress.
3459  *
3460  */
3461 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3462 {
3463         struct net_device *dev = skb->dev;
3464         u32 ttl = G_TC_RTTL(skb->tc_verd);
3465         int result = TC_ACT_OK;
3466         struct Qdisc *q;
3467 
3468         if (unlikely(MAX_RED_LOOP < ttl++)) {
3469                 net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3470                                      skb->skb_iif, dev->ifindex);
3471                 return TC_ACT_SHOT;
3472         }
3473 
3474         skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3475         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3476 
3477         q = rcu_dereference(rxq->qdisc);
3478         if (q != &noop_qdisc) {
3479                 spin_lock(qdisc_lock(q));
3480                 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3481                         result = qdisc_enqueue_root(skb, q);
3482                 spin_unlock(qdisc_lock(q));
3483         }
3484 
3485         return result;
3486 }
3487 
3488 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3489                                          struct packet_type **pt_prev,
3490                                          int *ret, struct net_device *orig_dev)
3491 {
3492         struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3493 
3494         if (!rxq || rcu_access_pointer(rxq->qdisc) == &noop_qdisc)
3495                 goto out;
3496 
3497         if (*pt_prev) {
3498                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3499                 *pt_prev = NULL;
3500         }
3501 
3502         switch (ing_filter(skb, rxq)) {
3503         case TC_ACT_SHOT:
3504         case TC_ACT_STOLEN:
3505                 kfree_skb(skb);
3506                 return NULL;
3507         }
3508 
3509 out:
3510         skb->tc_verd = 0;
3511         return skb;
3512 }
3513 #endif
3514 
3515 /**
3516  *      netdev_rx_handler_register - register receive handler
3517  *      @dev: device to register a handler for
3518  *      @rx_handler: receive handler to register
3519  *      @rx_handler_data: data pointer that is used by rx handler
3520  *
3521  *      Register a receive handler for a device. This handler will then be
3522  *      called from __netif_receive_skb. A negative errno code is returned
3523  *      on a failure.
3524  *
3525  *      The caller must hold the rtnl_mutex.
3526  *
3527  *      For a general description of rx_handler, see enum rx_handler_result.
3528  */
3529 int netdev_rx_handler_register(struct net_device *dev,
3530                                rx_handler_func_t *rx_handler,
3531                                void *rx_handler_data)
3532 {
3533         ASSERT_RTNL();
3534 
3535         if (dev->rx_handler)
3536                 return -EBUSY;
3537 
3538         /* Note: rx_handler_data must be set before rx_handler */
3539         rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3540         rcu_assign_pointer(dev->rx_handler, rx_handler);
3541 
3542         return 0;
3543 }
3544 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
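/* Illustrative sketch (editor-added, not part of dev.c): how a stacking driver
 * (bridge, bonding, macvlan, ...) might claim a lower device's receive path.
 * The my_* names are hypothetical; the rx_handler prototype, the RTNL
 * requirement and RX_HANDLER_ANOTHER are the real contract documented above.
 */
static rx_handler_result_t my_rx_handler(struct sk_buff **pskb)
{
	struct sk_buff *skb = *pskb;
	struct my_upper *upper = rcu_dereference(skb->dev->rx_handler_data);

	skb->dev = upper->dev;		/* redirect the packet to the upper device */
	return RX_HANDLER_ANOTHER;	/* re-run __netif_receive_skb_core() */
}

static int my_attach_lower_dev(struct my_upper *upper, struct net_device *lower)
{
	ASSERT_RTNL();			/* caller must hold rtnl_mutex */

	/* rx_handler_data (here: upper) is what my_rx_handler() dereferences */
	return netdev_rx_handler_register(lower, my_rx_handler, upper);
}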
3545 
3546 /**
3547  *      netdev_rx_handler_unregister - unregister receive handler
3548  *      @dev: device to unregister a handler from
3549  *
3550  *      Unregister a receive handler from a device.
3551  *
3552  *      The caller must hold the rtnl_mutex.
3553  */
3554 void netdev_rx_handler_unregister(struct net_device *dev)
3555 {
3556 
3557         ASSERT_RTNL();
3558         RCU_INIT_POINTER(dev->rx_handler, NULL);
3559         /* A reader seeing a non-NULL rx_handler in an rcu_read_lock()
3560          * section is guaranteed to see a non-NULL rx_handler_data
3561          * as well.
3562          */
3563         synchronize_net();
3564         RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3565 }
3566 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3567 
3568 /*
3569  * Limit the use of PFMEMALLOC reserves to those protocols that implement
3570  * the special handling of PFMEMALLOC skbs.
3571  */
3572 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3573 {
3574         switch (skb->protocol) {
3575         case htons(ETH_P_ARP):
3576         case htons(ETH_P_IP):
3577         case htons(ETH_P_IPV6):
3578         case htons(ETH_P_8021Q):
3579         case htons(ETH_P_8021AD):
3580                 return true;
3581         default:
3582                 return false;
3583         }
3584 }
3585 
3586 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
3587 {
3588         struct packet_type *ptype, *pt_prev;
3589         rx_handler_func_t *rx_handler;
3590         struct net_device *orig_dev;
3591         struct net_device *null_or_dev;
3592         bool deliver_exact = false;
3593         int ret = NET_RX_DROP;
3594         __be16 type;
3595 
3596         net_timestamp_check(!netdev_tstamp_prequeue, skb);
3597 
3598         trace_netif_receive_skb(skb);
3599 
3600         orig_dev = skb->dev;
3601 
3602         skb_reset_network_header(skb);
3603         if (!skb_transport_header_was_set(skb))
3604                 skb_reset_transport_header(skb);
3605         skb_reset_mac_len(skb);
3606 
3607         pt_prev = NULL;
3608 
3609         rcu_read_lock();
3610 
3611 another_round:
3612         skb->skb_iif = skb->dev->ifindex;
3613 
3614         __this_cpu_inc(softnet_data.processed);
3615 
3616         if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
3617             skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
3618                 skb = skb_vlan_untag(skb);
3619                 if (unlikely(!skb))
3620                         goto unlock;
3621         }
3622 
3623 #ifdef CONFIG_NET_CLS_ACT
3624         if (skb->tc_verd & TC_NCLS) {
3625                 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3626                 goto ncls;
3627         }
3628 #endif
3629 
3630         if (pfmemalloc)
3631                 goto skip_taps;
3632 
3633         list_for_each_entry_rcu(ptype, &ptype_all, list) {
3634                 if (!ptype->dev || ptype->dev == skb->dev) {
3635                         if (pt_prev)
3636                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3637                         pt_prev = ptype;
3638                 }
3639         }
3640 
3641 skip_taps:
3642 #ifdef CONFIG_NET_CLS_ACT
3643         skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3644         if (!skb)
3645                 goto unlock;
3646 ncls:
3647 #endif
3648 
3649         if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
3650                 goto drop;
3651 
3652         if (vlan_tx_tag_present(skb)) {
3653                 if (pt_prev) {
3654                         ret = deliver_skb(skb, pt_prev, orig_dev);
3655                         pt_prev = NULL;
3656                 }
3657                 if (vlan_do_receive(&skb))
3658                         goto another_round;
3659                 else if (unlikely(!skb))
3660                         goto unlock;
3661         }
3662 
3663         rx_handler = rcu_dereference(skb->dev->rx_handler);
3664         if (rx_handler) {
3665                 if (pt_prev) {
3666                         ret = deliver_skb(skb, pt_prev, orig_dev);
3667                         pt_prev = NULL;
3668                 }
3669                 switch (rx_handler(&skb)) {
3670                 case RX_HANDLER_CONSUMED:
3671                         ret = NET_RX_SUCCESS;
3672                         goto unlock;
3673                 case RX_HANDLER_ANOTHER:
3674                         goto another_round;
3675                 case RX_HANDLER_EXACT:
3676                         deliver_exact = true;
3677                 case RX_HANDLER_PASS:
3678                         break;
3679                 default:
3680                         BUG();
3681                 }
3682         }
3683 
3684         if (unlikely(vlan_tx_tag_present(skb))) {
3685                 if (vlan_tx_tag_get_id(skb))
3686                         skb->pkt_type = PACKET_OTHERHOST;
3687                 /* Note: we might in the future use the prio bits
3688                  * and set skb->priority like in vlan_do_receive().
3689                  * For the time being, just ignore the Priority Code Point.
3690                  */
3691                 skb->vlan_tci = 0;
3692         }
3693 
3694         /* deliver only exact match when indicated */
3695         null_or_dev = deliver_exact ? skb->dev : NULL;
3696 
3697         type = skb->protocol;
3698         list_for_each_entry_rcu(ptype,
3699                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3700                 if (ptype->type == type &&
3701                     (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3702                      ptype->dev == orig_dev)) {
3703                         if (pt_prev)
3704                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3705                         pt_prev = ptype;
3706                 }
3707         }
3708 
3709         if (pt_prev) {
3710                 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
3711                         goto drop;
3712                 else
3713                         ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3714         } else {
3715 drop:
3716                 atomic_long_inc(&skb->dev->rx_dropped);
3717                 kfree_skb(skb);
3718                 /* Jamal, now you will not be able to escape explaining
3719                  * to me how you were going to use this. :-)
3720                  */
3721                 ret = NET_RX_DROP;
3722         }
3723 
3724 unlock:
3725         rcu_read_unlock();
3726         return ret;
3727 }
3728 
3729 static int __netif_receive_skb(struct sk_buff *skb)
3730 {
3731         int ret;
3732 
3733         if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
3734                 unsigned long pflags = current->flags;
3735 
3736                 /*
3737                  * PFMEMALLOC skbs are special, they should
3738                  * - be delivered to SOCK_MEMALLOC sockets only
3739                  * - stay away from userspace
3740                  * - have bounded memory usage
3741                  *
3742                  * Use PF_MEMALLOC as this saves us from propagating the allocation
3743                  * context down to all allocation sites.
3744                  */
3745                 current->flags |= PF_MEMALLOC;
3746                 ret = __netif_receive_skb_core(skb, true);
3747                 tsk_restore_flags(current, pflags, PF_MEMALLOC);
3748         } else
3749                 ret = __netif_receive_skb_core(skb, false);
3750 
3751         return ret;
3752 }
3753 
3754 static int netif_receive_skb_internal(struct sk_buff *skb)
3755 {
3756         net_timestamp_check(netdev_tstamp_prequeue, skb);
3757 
3758         if (skb_defer_rx_timestamp(skb))
3759                 return NET_RX_SUCCESS;
3760 
3761 #ifdef CONFIG_RPS
3762         if (static_key_false(&rps_needed)) {
3763                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3764                 int cpu, ret;
3765 
3766                 rcu_read_lock();
3767 
3768                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3769 
3770                 if (cpu >= 0) {
3771                         ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3772                         rcu_read_unlock();
3773                         return ret;
3774                 }
3775                 rcu_read_unlock();
3776         }
3777 #endif
3778         return __netif_receive_skb(skb);
3779 }
3780 
3781 /**
3782  *      netif_receive_skb - process receive buffer from network
3783  *      @skb: buffer to process
3784  *
3785  *      netif_receive_skb() is the main receive data processing function.
3786  *      It always succeeds. The buffer may be dropped during processing
3787  *      for congestion control or by the protocol layers.
3788  *
3789  *      This function may only be called from softirq context and interrupts
3790  *      should be enabled.
3791  *
3792  *      Return values (usually ignored):
3793  *      NET_RX_SUCCESS: no congestion
3794  *      NET_RX_DROP: packet was dropped
3795  */
3796 int netif_receive_skb(struct sk_buff *skb)
3797 {
3798         trace_netif_receive_skb_entry(skb);
3799 
3800         return netif_receive_skb_internal(skb);
3801 }
3802 EXPORT_SYMBOL(netif_receive_skb);
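/* Illustrative sketch (editor-added, not part of dev.c): a NAPI driver
 * delivering a frame from its ->poll() routine.  Unlike netif_rx(),
 * netif_receive_skb() processes the packet synchronously, so it must run in
 * softirq context with interrupts enabled, which is exactly where a ->poll()
 * callback runs.  The my_rx_desc layout is hypothetical.
 */
static void my_deliver_one(struct napi_struct *napi, const struct my_rx_desc *d)
{
	struct sk_buff *skb;

	skb = napi_alloc_skb(napi, d->len);
	if (!skb)
		return;				/* drop silently in this sketch */
	memcpy(skb_put(skb, d->len), d->data, d->len);
	skb->protocol = eth_type_trans(skb, napi->dev);
	netif_receive_skb(skb);			/* or napi_gro_receive(napi, skb) */
}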
3803 
3804 /* Network device is going away, flush any packets still pending
3805  * Called with irqs disabled.
3806  */
3807 static void flush_backlog(void *arg)
3808 {
3809         struct net_device *dev = arg;
3810         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3811         struct sk_buff *skb, *tmp;
3812 
3813         rps_lock(sd);
3814         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3815                 if (skb->dev == dev) {
3816                         __skb_unlink(skb, &sd->input_pkt_queue);
3817                         kfree_skb(skb);
3818                         input_queue_head_incr(sd);
3819                 }
3820         }
3821         rps_unlock(sd);
3822 
3823         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3824                 if (skb->dev == dev) {
3825                         __skb_unlink(skb, &sd->process_queue);
3826                         kfree_skb(skb);
3827                         input_queue_head_incr(sd);
3828                 }
3829         }
3830 }
3831 
3832 static int napi_gro_complete(struct sk_buff *skb)
3833 {
3834         struct packet_offload *ptype;
3835         __be16 type = skb->protocol;
3836         struct list_head *head = &offload_base;
3837         int err = -ENOENT;
3838 
3839         BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
3840 
3841         if (NAPI_GRO_CB(skb)->count == 1) {
3842                 skb_shinfo(skb)->gso_size = 0;
3843                 goto out;
3844         }
3845 
3846         rcu_read_lock();
3847         list_for_each_entry_rcu(ptype, head, list) {
3848                 if (ptype->type != type || !ptype->callbacks.gro_complete)
3849                         continue;
3850 
3851                 err = ptype->callbacks.gro_complete(skb, 0);
3852                 break;
3853         }
3854         rcu_read_unlock();
3855 
3856         if (err) {
3857                 WARN_ON(&ptype->list == head);
3858                 kfree_skb(skb);
3859                 return NET_RX_SUCCESS;
3860         }
3861 
3862 out:
3863         return netif_receive_skb_internal(skb);
3864 }
3865 
3866 /* napi->gro_list contains packets ordered by age, with the
3867  * youngest packets at its head.
3868  * Complete skbs in reverse order to reduce latencies.
3869  */
3870 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
3871 {
3872         struct sk_buff *skb, *prev = NULL;
3873 
3874         /* scan list and build reverse chain */
3875         for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3876                 skb->prev = prev;
3877                 prev = skb;
3878         }
3879 
3880         for (skb = prev; skb; skb = prev) {
3881                 skb->next = NULL;
3882 
3883                 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3884                         return;
3885 
3886                 prev = skb->prev;
3887                 napi_gro_complete(skb);
3888                 napi->gro_count--;
3889         }
3890 
3891         napi->gro_list = NULL;
3892 }
3893 EXPORT_SYMBOL(napi_gro_flush);
3894 
3895 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
3896 {
3897         struct sk_buff *p;
3898         unsigned int maclen = skb->dev->hard_header_len;
3899         u32 hash = skb_get_hash_raw(skb);
3900 
3901         for (p = napi->gro_list; p; p = p->next) {
3902                 unsigned long diffs;
3903 
3904                 NAPI_GRO_CB(p)->flush = 0;
3905 
3906                 if (hash != skb_get_hash_raw(p)) {
3907                         NAPI_GRO_CB(p)->same_flow = 0;
3908                         continue;
3909                 }
3910 
3911                 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3912                 diffs |= p->vlan_tci ^ skb->vlan_tci;
3913                 if (maclen == ETH_HLEN)
3914                         diffs |= compare_ether_header(skb_mac_header(p),
3915                                                       skb_mac_header(skb));
3916                 else if (!diffs)
3917                         diffs = memcmp(skb_mac_header(p),
3918                                        skb_mac_header(skb),
3919                                        maclen);
3920                 NAPI_GRO_CB(p)->same_flow = !diffs;
3921         }
3922 }
3923 
3924 static void skb_gro_reset_offset(struct sk_buff *skb)
3925 {
3926         const struct skb_shared_info *pinfo = skb_shinfo(skb);
3927         const skb_frag_t *frag0 = &pinfo->frags[0];
3928 
3929         NAPI_GRO_CB(skb)->data_offset = 0;
3930         NAPI_GRO_CB(skb)->frag0 = NULL;
3931         NAPI_GRO_CB(skb)->frag0_len = 0;
3932 
3933         if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
3934             pinfo->nr_frags &&
3935             !PageHighMem(skb_frag_page(frag0))) {
3936                 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
3937                 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
3938         }
3939 }
3940 
3941 static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
3942 {
3943         struct skb_shared_info *pinfo = skb_shinfo(skb);
3944 
3945         BUG_ON(skb->end - skb->tail < grow);
3946 
3947         memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3948 
3949         skb->data_len -= grow;
3950         skb->tail += grow;
3951 
3952         pinfo->frags[0].page_offset += grow;
3953         skb_frag_size_sub(&pinfo->frags[0], grow);
3954 
3955         if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
3956                 skb_frag_unref(skb, 0);
3957                 memmove(pinfo->frags, pinfo->frags + 1,
3958                         --pinfo->nr_frags * sizeof(pinfo->frags[0]));
3959         }
3960 }
3961 
3962 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3963 {
3964         struct sk_buff **pp = NULL;
3965         struct packet_offload *ptype;
3966         __be16 type = skb->protocol;
3967         struct list_head *head = &offload_base;
3968         int same_flow;
3969         enum gro_result ret;
3970         int grow;
3971 
3972         if (!(skb->dev->features & NETIF_F_GRO))
3973                 goto normal;
3974 
3975         if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad)
3976                 goto normal;
3977 
3978         gro_list_prepare(napi, skb);
3979 
3980         rcu_read_lock();
3981         list_for_each_entry_rcu(ptype, head, list) {
3982                 if (ptype->type != type || !ptype->callbacks.gro_receive)
3983                         continue;
3984 
3985                 skb_set_network_header(skb, skb_gro_offset(skb));
3986                 skb_reset_mac_len(skb);
3987                 NAPI_GRO_CB(skb)->same_flow = 0;
3988                 NAPI_GRO_CB(skb)->flush = 0;
3989                 NAPI_GRO_CB(skb)->free = 0;
3990                 NAPI_GRO_CB(skb)->udp_mark = 0;
3991 
3992                 /* Setup for GRO checksum validation */
3993                 switch (skb->ip_summed) {
3994                 case CHECKSUM_COMPLETE:
3995                         NAPI_GRO_CB(skb)->csum = skb->csum;
3996                         NAPI_GRO_CB(skb)->csum_valid = 1;
3997                         NAPI_GRO_CB(skb)->csum_cnt = 0;
3998                         break;
3999                 case CHECKSUM_UNNECESSARY:
4000                         NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
4001                         NAPI_GRO_CB(skb)->csum_valid = 0;
4002                         break;
4003                 default:
4004                         NAPI_GRO_CB(skb)->csum_cnt = 0;
4005                         NAPI_GRO_CB(skb)->csum_valid = 0;
4006                 }
4007 
4008                 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
4009                 break;
4010         }
4011         rcu_read_unlock();
4012 
4013         if (&ptype->list == head)
4014                 goto normal;
4015 
4016         same_flow = NAPI_GRO_CB(skb)->same_flow;
4017         ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
4018 
4019         if (pp) {
4020                 struct sk_buff *nskb = *pp;
4021 
4022                 *pp = nskb->next;
4023                 nskb->next = NULL;
4024                 napi_gro_complete(nskb);
4025                 napi->gro_count--;
4026         }
4027 
4028         if (same_flow)
4029                 goto ok;
4030 
4031         if (NAPI_GRO_CB(skb)->flush)
4032                 goto normal;
4033 
4034         if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
4035                 struct sk_buff *nskb = napi->gro_list;
4036 
4037                 /* locate the end of the list to select the 'oldest' flow */
4038                 while (nskb->next) {
4039                         pp = &nskb->next;
4040                         nskb = *pp;
4041                 }
4042                 *pp = NULL;
4043                 nskb->next = NULL;
4044                 napi_gro_complete(nskb);
4045         } else {
4046                 napi->gro_count++;
4047         }
4048         NAPI_GRO_CB(skb)->count = 1;
4049         NAPI_GRO_CB(skb)->age = jiffies;
4050         NAPI_GRO_CB(skb)->last = skb;
4051         skb_shinfo(skb)->gso_size = skb_gro_len(skb);
4052         skb->next = napi->gro_list;
4053         napi->gro_list = skb;
4054         ret = GRO_HELD;
4055 
4056 pull:
4057         grow = skb_gro_offset(skb) - skb_headlen(skb);
4058         if (grow > 0)
4059                 gro_pull_from_frag0(skb, grow);
4060 ok:
4061         return ret;
4062 
4063 normal:
4064         ret = GRO_NORMAL;
4065         goto pull;
4066 }
4067 
4068 struct packet_offload *gro_find_receive_by_type(__be16 type)
4069 {
4070         struct list_head *offload_head = &offload_base;
4071         struct packet_offload *ptype;
4072 
4073         list_for_each_entry_rcu(ptype, offload_head, list) {
4074                 if (ptype->type != type || !ptype->callbacks.gro_receive)
4075                         continue;
4076                 return ptype;
4077         }
4078         return NULL;
4079 }
4080 EXPORT_SYMBOL(gro_find_receive_by_type);
4081 
4082 struct packet_offload *gro_find_complete_by_type(__be16 type)
4083 {
4084         struct list_head *offload_head = &offload_base;
4085         struct packet_offload *ptype;
4086 
4087         list_for_each_entry_rcu(ptype, offload_head, list) {
4088                 if (ptype->type != type || !ptype->callbacks.gro_complete)
4089                         continue;
4090                 return ptype;
4091         }
4092         return NULL;
4093 }
4094 EXPORT_SYMBOL(gro_find_complete_by_type);
4095 
4096 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4097 {
4098         switch (ret) {
4099         case GRO_NORMAL:
4100                 if (netif_receive_skb_internal(skb))
4101                         ret = GRO_DROP;
4102                 break;
4103 
4104         case GRO_DROP:
4105                 kfree_skb(skb);
4106                 break;
4107 
4108         case GRO_MERGED_FREE:
4109                 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
4110                         kmem_cache_free(skbuff_head_cache, skb);
4111                 else
4112                         __kfree_skb(skb);
4113                 break;
4114 
4115         case GRO_HELD:
4116         case GRO_MERGED:
4117                 break;
4118         }
4119 
4120         return ret;
4121 }
4122 
4123 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4124 {
4125         trace_napi_gro_receive_entry(skb);
4126 
4127         skb_gro_reset_offset(skb);
4128 
4129         return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4130 }
4131 EXPORT_SYMBOL(napi_gro_receive);
4132 
4133 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4134 {
4135         if (unlikely(skb->pfmemalloc)) {
4136                 consume_skb(skb);
4137                 return;
4138         }
4139         __skb_pull(skb, skb_headlen(skb));
4140         /* restore the reserve we had after netdev_alloc_skb_ip_align() */
4141         skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
4142         skb->vlan_tci = 0;
4143         skb->dev = napi->dev;
4144         skb->skb_iif = 0;
4145         skb->encapsulation = 0;
4146         skb_shinfo(skb)->gso_type = 0;
4147         skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4148 
4149         napi->skb = skb;
4150 }
4151 
4152 struct sk_buff *napi_get_frags(struct napi_struct *napi)
4153 {
4154         struct sk_buff *skb = napi->skb;
4155 
4156         if (!skb) {
4157                 skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
4158                 napi->skb = skb;
4159         }
4160         return skb;
4161 }
4162 EXPORT_SYMBOL(napi_get_frags);
4163 
4164 static gro_result_t napi_frags_finish(struct napi_struct *napi,
4165                                       struct sk_buff *skb,
4166                                       gro_result_t ret)
4167 {
4168         switch (ret) {
4169         case GRO_NORMAL:
4170         case GRO_HELD:
4171                 __skb_push(skb, ETH_HLEN);
4172                 skb->protocol = eth_type_trans(skb, skb->dev);
4173                 if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
4174                         ret = GRO_DROP;
4175                 break;
4176 
4177         case GRO_DROP:
4178         case GRO_MERGED_FREE:
4179                 napi_reuse_skb(napi, skb);
4180                 break;
4181 
4182         case GRO_MERGED:
4183                 break;
4184         }
4185 
4186         return ret;
4187 }
4188 
4189 /* The upper GRO stack assumes the network header starts at gro_offset=0.
4190  * Drivers could call both napi_gro_frags() and napi_gro_receive(),
4191  * so we copy the ethernet header into skb->data to have a common layout.
4192  */
4193 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4194 {
4195         struct sk_buff *skb = napi->skb;
4196         const struct ethhdr *eth;
4197         unsigned int hlen = sizeof(*eth);
4198 
4199         napi->skb = NULL;
4200 
4201         skb_reset_mac_header(skb);
4202         skb_gro_reset_offset(skb);
4203 
4204         eth = skb_gro_header_fast(skb, 0);
4205         if (unlikely(skb_gro_header_hard(skb, hlen))) {
4206                 eth = skb_gro_header_slow(skb, hlen, 0);
4207                 if (unlikely(!eth)) {
4208                         napi_reuse_skb(napi, skb);
4209                         return NULL;
4210                 }
4211         } else {
4212                 gro_pull_from_frag0(skb, hlen);
4213                 NAPI_GRO_CB(skb)->frag0 += hlen;
4214                 NAPI_GRO_CB(skb)->frag0_len -= hlen;
4215         }
4216         __skb_pull(skb, hlen);
4217 
4218         /*
4219          * This works because the only protocols we care about don't require
4220          * special handling.
4221          * We'll fix it up properly in napi_frags_finish()
4222          */
4223         skb->protocol = eth->h_proto;
4224 
4225         return skb;
4226 }
4227 
4228 gro_result_t napi_gro_frags(struct napi_struct *napi)
4229 {
4230         struct sk_buff *skb = napi_frags_skb(napi);
4231 
4232         if (!skb)
4233                 return GRO_DROP;
4234 
4235         trace_napi_gro_frags_entry(skb);
4236 
4237         return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4238 }
4239 EXPORT_SYMBOL(napi_gro_frags);
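/* Illustrative sketch (editor-added, not part of dev.c): the frag-based GRO
 * receive path.  A driver receiving into pages gets a "shell" skb from
 * napi_get_frags(), attaches the page as a fragment and hands it back with
 * napi_gro_frags(), which parses the ethernet header itself (see
 * napi_frags_skb() above).  The my_* naming and page layout are hypothetical.
 */
static void my_rx_one_page(struct napi_struct *napi, struct page *page,
			   unsigned int offset, unsigned int len)
{
	struct sk_buff *skb = napi_get_frags(napi);

	if (!skb) {
		put_page(page);			/* no shell skb: drop the frame */
		return;
	}
	/* skb_add_rx_frag() updates skb->len, data_len and truesize for us */
	skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page, offset, len,
			PAGE_SIZE);
	napi_gro_frags(napi);			/* consumes or recycles napi->skb */
}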
4240 
4241 /* Compute the checksum from gro_offset and return the folded value
4242  * after adding in any pseudo checksum.
4243  */
4244 __sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
4245 {
4246         __wsum wsum;
4247         __sum16 sum;
4248 
4249         wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
4250 
4251         /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
4252         sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
4253         if (likely(!sum)) {
4254                 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
4255                     !skb->csum_complete_sw)
4256                         netdev_rx_csum_fault(skb->dev);
4257         }
4258 
4259         NAPI_GRO_CB(skb)->csum = wsum;
4260         NAPI_GRO_CB(skb)->csum_valid = 1;
4261 
4262         return sum;
4263 }
4264 EXPORT_SYMBOL(__skb_gro_checksum_complete);
4265 
4266 /*
4267  * net_rps_action_and_irq_enable sends any pending IPIs for RPS.
4268  * Note: called with local irqs disabled, but exits with local irqs enabled.
4269  */
4270 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
4271 {
4272 #ifdef CONFIG_RPS
4273         struct softnet_data *remsd = sd->rps_ipi_list;
4274 
4275         if (remsd) {
4276                 sd->rps_ipi_list = NULL;
4277 
4278                 local_irq_enable();
4279 
4280                 /* Send pending IPIs to kick RPS processing on remote CPUs. */
4281                 while (remsd) {
4282                         struct softnet_data *next = remsd->rps_ipi_next;
4283 
4284                         if (cpu_online(remsd->cpu))
4285                                 smp_call_function_single_async(remsd->cpu,
4286                                                            &remsd->csd);
4287                         remsd = next;
4288                 }
4289         } else
4290 #endif
4291                 local_irq_enable();
4292 }
4293 
4294 static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
4295 {
4296 #ifdef CONFIG_RPS
4297         return sd->rps_ipi_list != NULL;
4298 #else
4299         return false;
4300 #endif
4301 }
4302 
4303 static int process_backlog(struct napi_struct *napi, int quota)
4304 {
4305         int work = 0;
4306         struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4307 
4308         /* Check if we have pending IPIs; it is better to send them now
4309          * rather than waiting for net_rx_action() to end.
4310          */
4311         if (sd_has_rps_ipi_waiting(sd)) {
4312                 local_irq_disable();
4313                 net_rps_action_and_irq_enable(sd);
4314         }
4315 
4316         napi->weight = weight_p;
4317         local_irq_disable();
4318         while (1) {
4319                 struct sk_buff *skb;
4320 
4321                 while ((skb = __skb_dequeue(&sd->process_queue))) {
4322                         local_irq_enable();
4323                         __netif_receive_skb(skb);
4324                         local_irq_disable();
4325                         input_queue_head_incr(sd);
4326                         if (++work >= quota) {
4327                                 local_irq_enable();
4328                                 return work;
4329                         }
4330                 }
4331 
4332                 rps_lock(sd);
4333                 if (skb_queue_empty(&sd->input_pkt_queue)) {
4334                         /*
4335                          * Inline a custom version of __napi_complete().
4336                          * Only the current CPU owns and manipulates this napi,
4337                          * and NAPI_STATE_SCHED is the only possible flag set
4338                          * on the backlog.
4339                          * We can use a plain write instead of clear_bit(),
4340                          * and we don't need an smp_mb() memory barrier.
4341                          */
4342                         napi->state = 0;
4343                         rps_unlock(sd);
4344 
4345                         break;
4346                 }
4347 
4348                 skb_queue_splice_tail_init(&sd->input_pkt_queue,
4349                                            &sd->process_queue);
4350                 rps_unlock(sd);
4351         }
4352         local_irq_enable();
4353 
4354         return work;
4355 }
4356 
4357 /**
4358  * __napi_schedule - schedule for receive
4359  * @n: entry to schedule
4360  *
4361  * The entry's receive function will be scheduled to run.
4362  * Consider using __napi_schedule_irqoff() if hard irqs are masked.
4363  */
4364 void __napi_schedule(struct napi_struct *n)
4365 {
4366         unsigned long flags;
4367 
4368         local_irq_save(flags);
4369         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4370         local_irq_restore(flags);
4371 }
4372 EXPORT_SYMBOL(__napi_schedule);
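/* Illustrative sketch (editor-added, not part of dev.c): the canonical
 * interrupt handler of a NAPI driver.  It masks further RX interrupts and
 * schedules the poll routine; napi_schedule() pairs napi_schedule_prep() with
 * __napi_schedule() above.  The my_* helpers are hypothetical.
 */
static irqreturn_t my_interrupt(int irq, void *data)
{
	struct my_priv *priv = data;

	if (!my_irq_pending(priv))		/* hypothetical status-register check */
		return IRQ_NONE;

	my_disable_rx_irq(priv);		/* hypothetical: mask RX interrupts */
	napi_schedule(&priv->napi);		/* run ->poll() from NET_RX_SOFTIRQ */
	return IRQ_HANDLED;
}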
4373 
4374 /**
4375  * __napi_schedule_irqoff - schedule for receive
4376  * @n: entry to schedule
4377  *
4378  * Variant of __napi_schedule() assuming hard irqs are masked
4379  */
4380 void __napi_schedule_irqoff(struct napi_struct *n)
4381 {
4382         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4383 }
4384 EXPORT_SYMBOL(__napi_schedule_irqoff);
4385 
4386 void __napi_complete(struct napi_struct *n)
4387 {
4388         BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4389 
4390         list_del_init(&n->poll_list);
4391         smp_mb__before_atomic();
4392         clear_bit(NAPI_STATE_SCHED, &n->state);
4393 }
4394 EXPORT_SYMBOL(__napi_complete);
4395 
4396 void napi_complete_done(struct napi_struct *n, int work_done)
4397 {
4398         unsigned long flags;
4399 
4400         /*
4401          * Don't let napi dequeue from the CPU poll list,
4402          * just in case it's running on a different CPU.
4403          */
4404         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4405                 return;
4406 
4407         if (n->gro_list) {
4408                 unsigned long timeout = 0;
4409 
4410                 if (work_done)
4411                         timeout = n->dev->gro_flush_timeout;
4412 
4413                 if (timeout)
4414                         hrtimer_start(&n->timer, ns_to_ktime(timeout),
4415                                       HRTIMER_MODE_REL_PINNED);
4416                 else
4417                         napi_gro_flush(n, false);
4418         }
4419         if (likely(list_empty(&n->poll_list))) {
4420                 WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
4421         } else {
4422                 /* If n->poll_list is not empty, we need to mask irqs */
4423                 local_irq_save(flags);
4424                 __napi_complete(n);
4425                 local_irq_restore(flags);
4426         }
4427 }
4428 EXPORT_SYMBOL(napi_complete_done);
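/* Illustrative sketch (editor-added, not part of dev.c): a ->poll() callback
 * cooperating with napi_complete_done().  When the budget is not exhausted the
 * driver completes NAPI (letting gro_flush_timeout defer the GRO flush, as
 * handled above) and re-enables its interrupts.  The my_* helpers are
 * hypothetical.
 */
static int my_poll(struct napi_struct *napi, int budget)
{
	struct my_priv *priv = container_of(napi, struct my_priv, napi);
	int work = my_clean_rx_ring(priv, napi, budget);	/* hypothetical */

	if (work < budget) {
		napi_complete_done(napi, work);
		my_enable_rx_irq(priv);		/* hypothetical: unmask RX interrupts */
	}
	return work;
}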
4429 
4430 /* must be called under rcu_read_lock(), as we dont take a reference */
4431 struct napi_struct *napi_by_id(unsigned int napi_id)
4432 {
4433         unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4434         struct napi_struct *napi;
4435 
4436         hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4437                 if (napi->napi_id == napi_id)
4438                         return napi;
4439 
4440         return NULL;
4441 }
4442 EXPORT_SYMBOL_GPL(napi_by_id);
4443 
4444 void napi_hash_add(struct napi_struct *napi)
4445 {
4446         if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) {
4447 
4448                 spin_lock(&napi_hash_lock);
4449 
4450                 /* 0 is not a valid id; we also skip an id that is already
4451                  * taken.  We expect both events to be extremely rare.
4452                  */
4453                 napi->napi_id = 0;
4454                 while (!napi->napi_id) {
4455                         napi->napi_id = ++napi_gen_id;
4456                         if (napi_by_id(napi->napi_id))
4457                                 napi->napi_id = 0;
4458                 }
4459 
4460                 hlist_add_head_rcu(&napi->napi_hash_node,
4461                         &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
4462 
4463                 spin_unlock(&napi_hash_lock);
4464         }
4465 }
4466 EXPORT_SYMBOL_GPL(napi_hash_add);
4467 
4468 /* Warning: the caller is responsible for making sure an RCU grace
4469  * period has elapsed before freeing the memory containing @napi.
4470  */
4471 void napi_hash_del(struct napi_struct *napi)
4472 {
4473         spin_lock(&napi_hash_lock);
4474 
4475         if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state))
4476                 hlist_del_rcu(&napi->napi_hash_node);
4477 
4478         spin_unlock(&napi_hash_lock);
4479 }
4480 EXPORT_SYMBOL_GPL(napi_hash_del);
4481 
4482 static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
4483 {
4484         struct napi_struct *napi;
4485 
4486         napi = container_of(timer, struct napi_struct, timer);
4487         if (napi->gro_list)
4488                 napi_schedule(napi);
4489 
4490         return HRTIMER_NORESTART;
4491 }
4492 
4493 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4494                     int (*poll)(struct napi_struct *, int), int weight)
4495 {
4496         INIT_LIST_HEAD(&napi->poll_list);
4497         hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
4498         napi->timer.function = napi_watchdog;
4499         napi->gro_count = 0;
4500         napi->gro_list = NULL;
4501         napi->skb = NULL;
4502         napi->poll = poll;
4503         if (weight > NAPI_POLL_WEIGHT)
4504                 pr_err_once("netif_napi_add() called with weight %d on device %s\n",
4505                             weight, dev->name);
4506         napi->weight = weight;
4507         list_add(&napi->dev_list, &dev->napi_list);
4508         napi->dev = dev;
4509 #ifdef CONFIG_NETPOLL
4510         spin_lock_init(&napi->poll_lock);
4511         napi->poll_owner = -1;
4512 #endif
4513         set_bit(NAPI_STATE_SCHED, &napi->state);
4514 }
4515 EXPORT_SYMBOL(netif_napi_add);
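/* Illustrative sketch (editor-added, not part of dev.c): wiring a NAPI
 * instance into a driver at probe time.  NAPI_POLL_WEIGHT (64) is the
 * recommended weight; larger values trigger the pr_err_once() above.  The
 * my_* names (including a my_poll callback like the sketch after
 * napi_complete_done()) are hypothetical.
 */
static void my_setup_napi(struct my_priv *priv, struct net_device *dev)
{
	netif_napi_add(dev, &priv->napi, my_poll, NAPI_POLL_WEIGHT);
	/* later: napi_enable() in ndo_open(), napi_disable() in ndo_stop(),
	 * and netif_napi_del() at teardown.
	 */
}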
4516 
4517 void napi_disable(struct napi_struct *n)
4518 {
4519         might_sleep();
4520         set_bit(NAPI_STATE_DISABLE, &n->state);
4521 
4522         while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
4523                 msleep(1);
4524 
4525         hrtimer_cancel(&n->timer);
4526 
4527         clear_bit(NAPI_STATE_DISABLE, &n->state);
4528 }
4529 EXPORT_SYMBOL(napi_disable);
4530 
4531 void netif_napi_del(struct napi_struct *napi)
4532 {
4533         list_del_init(&napi->dev_list);
4534         napi_free_frags(napi);
4535 
4536         kfree_skb_list(napi->gro_list);
4537         napi->gro_list = NULL;
4538         napi->gro_count = 0;
4539 }
4540 EXPORT_SYMBOL(netif_napi_del);
4541 
4542 static int napi_poll(struct napi_struct *n, struct list_head *repoll)
4543 {
4544         void *have;
4545         int work, weight;
4546 
4547         list_del_init(&n->poll_list);
4548 
4549         have = netpoll_poll_lock(n);
4550 
4551         weight = n->weight;
4552 
4553         /* This NAPI_STATE_SCHED test is for avoiding a race
4554          * with netpoll's poll_napi().  Only the entity which
4555          * obtains the lock and sees NAPI_STATE_SCHED set will
4556          * actually make the ->poll() call.  Therefore we avoid
4557          * accidentally calling ->poll() when NAPI is not scheduled.
4558          */
4559         work = 0;
4560         if (test_bit(NAPI_STATE_SCHED, &n->state)) {
4561                 work = n->poll(n, weight);
4562                 trace_napi_poll(n);
4563         }
4564 
4565         WARN_ON_ONCE(work > weight);
4566 
4567         if (likely(work < weight))
4568                 goto out_unlock;
4569 
4570         /* Drivers must not modify the NAPI state if they
4571          * consume the entire weight.  In such cases this code
4572          * still "owns" the NAPI instance and therefore can
4573          * move the instance around on the list at-will.
4574          */
4575         if (unlikely(napi_disable_pending(n))) {
4576                 napi_complete(n);
4577                 goto out_unlock;
4578         }
4579 
4580         if (n->gro_list) {
4581                 /* flush too old packets
4582                  * If HZ < 1000, flush all packets.
4583                  */
4584                 napi_gro_flush(n, HZ >= 1000);
4585         }
4586 
4587         /* Some drivers may have called napi_schedule
4588          * prior to exhausting their budget.
4589          */
4590         if (unlikely(!list_empty(&n->poll_list))) {
4591                 pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
4592                              n->dev ? n->dev->name : "backlog");
4593                 goto out_unlock;
4594         }
4595 
4596         list_add_tail(&n->poll_list, repoll);
4597 
4598 out_unlock:
4599         netpoll_poll_unlock(have);
4600 
4601         return work;
4602 }
4603 
4604 static void net_rx_action(struct softirq_action *h)
4605 {
4606         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
4607         unsigned long time_limit = jiffies + 2;
4608         int budget = netdev_budget;
4609         LIST_HEAD(list);
4610         LIST_HEAD(repoll);
4611 
4612         local_irq_disable();
4613         list_splice_init(&sd->poll_list, &list);
4614         local_irq_enable();
4615 
4616         for (;;) {
4617                 struct napi_struct *n;
4618 
4619                 if (list_empty(&list)) {
4620                         if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
4621                                 return;
4622                         break;
4623                 }
4624 
4625                 n = list_first_entry(&list, struct napi_struct, poll_list);
4626                 budget -= napi_poll(n, &repoll);
4627 
4628                 /* If softirq window is exhausted then punt.
4629                  * Allow this to run for 2 jiffies, which allows
4630                  * an average latency of 1.5/HZ.
4631                  */
4632                 if (unlikely(budget <= 0 ||
4633                              time_after_eq(jiffies, time_limit))) {
4634                         sd->time_squeeze++;
4635                         break;
4636                 }
4637         }
4638 
4639         local_irq_disable();
4640 
4641         list_splice_tail_init(&sd->poll_list, &list);
4642         list_splice_tail(&repoll, &list);
4643         list_splice(&list, &sd->poll_list);
4644         if (!list_empty(&sd->poll_list))
4645                 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4646 
4647         net_rps_action_and_irq_enable(sd);
4648 }
4649 
4650 struct netdev_adjacent {
4651         struct net_device *dev;
4652 
4653         /* upper master flag, there can only be one master device per list */
4654         bool master;
4655 
4656         /* counter for the number of times this device was added to us */
4657         u16 ref_nr;
4658 
4659         /* private field for the users */
4660         void *private;
4661 
4662         struct list_head list;
4663         struct rcu_head rcu;
4664 };
4665 
4666 static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev,
4667                                                  struct net_device *adj_dev,
4668                                                  struct list_head *adj_list)
4669 {
4670         struct netdev_adjacent *adj;
4671 
4672         list_for_each_entry(adj, adj_list, list) {
4673                 if (adj->dev == adj_dev)
4674                         return adj;
4675         }
4676         return NULL;
4677 }
4678 
4679 /**
4680  * netdev_has_upper_dev - Check if device is linked to an upper device
4681  * @dev: device
4682  * @upper_dev: upper device to check
4683  *
4684  * Find out if a device is linked to the specified upper device and return
4685  * true in case it is. Note that this searches the whole graph of upper
4686  * devices, not only the immediate upper device. The caller must hold the RTNL lock.
4687  */
4688 bool netdev_has_upper_dev(struct net_device *dev,
4689                           struct net_device *upper_dev)
4690 {
4691         ASSERT_RTNL();
4692 
4693         return __netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper);
4694 }
4695 EXPORT_SYMBOL(netdev_has_upper_dev);
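/* Illustrative sketch, not part of dev.c: using netdev_has_upper_dev()
 * under RTNL to verify that @dev is already stacked under @master before
 * acting on it. foo_check_enslaved() is hypothetical.
 */
#include <linux/errno.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>

static int foo_check_enslaved(struct net_device *dev,
                              struct net_device *master)
{
        int ret;

        rtnl_lock();
        ret = netdev_has_upper_dev(dev, master) ? 0 : -EINVAL;
        rtnl_unlock();

        return ret;
}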
4696 
4697 /**
4698  * netdev_has_any_upper_dev - Check if device is linked to some device
4699  * @dev: device
4700  *
4701  * Find out if a device is linked to an upper device and return true in case
4702  * it is. The caller must hold the RTNL lock.
4703  */
4704 static bool netdev_has_any_upper_dev(struct net_device *dev)
4705 {
4706         ASSERT_RTNL();
4707 
4708         return !list_empty(&dev->all_adj_list.upper);
4709 }
4710 
4711 /**
4712  * netdev_master_upper_dev_get - Get master upper device
4713  * @dev: device
4714  *
4715  * Find a master upper device and return pointer to it or NULL in case
4716  * it's not there. The caller must hold the RTNL lock.
4717  */
4718 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4719 {
4720         struct netdev_adjacent *upper;
4721 
4722         ASSERT_RTNL();
4723 
4724         if (list_empty(&dev->adj_list.upper))
4725                 return NULL;
4726 
4727         upper = list_first_entry(&dev->adj_list.upper,
4728                                  struct netdev_adjacent, list);
4729         if (likely(upper->master))
4730                 return upper->dev;
4731         return NULL;
4732 }
4733 EXPORT_SYMBOL(netdev_master_upper_dev_get);
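/* Illustrative sketch, not part of dev.c: looking up a device's master
 * (e.g. the bond or bridge it is enslaved to) with the helper above.
 * foo_print_master() is hypothetical; RTNL is required, as documented.
 */
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>

static void foo_print_master(struct net_device *dev)
{
        struct net_device *master;

        rtnl_lock();
        master = netdev_master_upper_dev_get(dev);
        netdev_info(dev, "master: %s\n", master ? master->name : "none");
        rtnl_unlock();
}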
4734 
4735 void *netdev_adjacent_get_private(struct list_head *adj_list)
4736 {
4737         struct netdev_adjacent *adj;
4738 
4739         adj = list_entry(adj_list, struct netdev_adjacent, list);
4740 
4741         return adj->private;
4742 }
4743 EXPORT_SYMBOL(netdev_adjacent_get_private);
4744 
4745 /**
4746  * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
4747  * @dev: device
4748  * @iter: list_head ** of the current position
4749  *
4750  * Gets the next device from the dev's upper list, starting from iter
4751  * position. The caller must hold the RCU read lock.
4752  */
4753 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
4754                                                  struct list_head **iter)
4755 {
4756         struct netdev_adjacent *upper;
4757 
4758         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4759 
4760         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4761 
4762         if (&upper->list == &dev->adj_list.upper)
4763                 return NULL;
4764 
4765         *iter = &upper->list;
4766 
4767         return upper->dev;
4768 }
4769 EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
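/* Illustrative sketch, not part of dev.c: walking the immediate upper
 * devices with netdev_upper_get_next_dev_rcu() under the RCU read lock,
 * starting the iterator at the list head, which is how the in-kernel
 * list-walking helpers use this function. foo_count_uppers() is
 * hypothetical.
 */
#include <linux/netdevice.h>
#include <linux/rcupdate.h>

static int foo_count_uppers(struct net_device *dev)
{
        struct list_head *iter = &dev->adj_list.upper;
        struct net_device *upper;
        int n = 0;

        rcu_read_lock();
        while ((upper = netdev_upper_get_next_dev_rcu(dev, &iter)))
                n++;
        rcu_read_unlock();

        return n;
}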
4770 
4771 /**
4772  * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
4773  * @dev: device
4774  * @iter: list_head ** of the current position
4775  *
4776  * Gets the next device from the dev's upper list, starting from iter
4777  * position. The caller must hold the RCU read lock.
4778  */
4779 struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
4780                                                      struct list_head **iter)
4781 {
4782         struct netdev_adjacent *upper;
4783 
4784         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4785 
4786         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4787 
4788         if (&upper->list == &dev->all_adj_list.upper)
4789                 return NULL;
4790 
4791         *iter = &upper->list;
4792 
4793         return upper->dev;
4794 }
4795 EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
4796 
4797 /**
4798  * netdev_lower_get_next_private - Get the next ->private from the
4799  *                                 lower neighbour list
4800  * @dev: device
4801  * @iter: list_head ** of the current position
4802  *
4803  * Gets the next netdev_adjacent->private from the dev's lower neighbour
4804  * list, starting from iter position. The caller must either hold the
4805  * RTNL lock or its own locking that guarantees that the neighbour lower
4806  * list will remain unchanged.
4807  */
4808 void *netdev_lower_get_next_private(struct net_device *dev,
4809                                     struct list_head **iter)
4810 {
4811         struct netdev_adjacent *lower;
4812 
4813         lower = list_entry(*iter, struct netdev_adjacent, list);
4814 
4815         if (&lower->list == &dev->adj_list.lower)
4816                 return NULL;
4817 
4818         *iter = lower->list.next;
4819 
4820         return lower->private;
4821 }
4822 EXPORT_SYMBOL(netdev_lower_get_next_private);
4823 
4824 /**
4825  * netdev_lower_get_next_private_rcu - Get the next ->private from the
4826  *                                     lower neighbour list, RCU
4827  *                                     variant
4828  * @dev: device
4829  * @iter: list_head ** of the current position
4830  *
4831  * Gets the next netdev_adjacent->private from the dev's lower neighbour
4832  * list, starting from iter position. The caller must hold the RCU read lock.
4833  */
4834 void *netdev_lower_get_next_private_rcu(struct net_device *dev,
4835                                         struct list_head **iter)
4836 {
4837         struct netdev_adjacent *lower;
4838 
4839         WARN_ON_ONCE(!rcu_read_lock_held());
4840 
4841         lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4842 
4843         if (&lower->list == &dev->adj_list.lower)
4844                 return NULL;
4845 
4846         *iter = &lower->list;
4847 
4848         return lower->private;
4849 }
4850 EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
4851 
4852 /**
4853  * netdev_lower_get_next - Get the next device from the lower neighbour
4854  *                         list
4855  * @dev: device
4856  * @iter: list_head ** of the current position
4857  *
4858  * Gets the next netdev_adjacent from the dev's lower neighbour
4859  * list, starting from iter position. The caller must hold the RTNL lock or
4860  * its own locking that guarantees that the neighbour lower
4861  * list will remain unchanged.
4862  */
4863 void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
4864 {
4865         struct netdev_adjacent *lower;
4866 
4867         lower = list_entry((*iter)->next, struct netdev_adjacent, list);
4868 
4869         if (&lower->list == &dev->adj_list.lower)
4870                 return NULL;
4871 
4872         *iter = &lower->list;
4873 
4874         return lower->dev;
4875 }
4876 EXPORT_SYMBOL(netdev_lower_get_next);
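/* Illustrative sketch, not part of dev.c: iterating the immediate lower
 * devices under RTNL with the netdev_for_each_lower_dev() helper from
 * <linux/netdevice.h>, which is built on netdev_lower_get_next() above
 * (dev_get_nest_level() further down uses the same pattern).
 * foo_dump_lowers() is hypothetical.
 */
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>

static void foo_dump_lowers(struct net_device *dev)
{
        struct net_device *lower;
        struct list_head *iter;

        ASSERT_RTNL();
        netdev_for_each_lower_dev(dev, lower, iter)
                netdev_info(dev, "lower: %s\n", lower->name);
}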
4877 
4878 /**
4879  * netdev_lower_get_first_private_rcu - Get the first ->private from the
4880  *                                     lower neighbour list, RCU
4881  *                                     variant
4882  * @dev: device
4883  *
4884  * Gets the first netdev_adjacent->private from the dev's lower neighbour
4885  * list. The caller must hold the RCU read lock.
4886  */
4887 void *netdev_lower_get_first_private_rcu(struct net_device *dev)
4888 {
4889         struct netdev_adjacent *lower;
4890 
4891         lower = list_first_or_null_rcu(&dev->adj_list.lower,
4892                         struct netdev_adjacent, list);
4893         if (lower)
4894                 return lower->private;
4895         return NULL;
4896 }
4897 EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
4898 
4899 /**
4900  * netdev_master_upper_dev_get_rcu - Get master upper device
4901  * @dev: device
4902  *
4903  * Find a master upper device and return pointer to it or NULL in case
4904  * it's not there. The caller must hold the RCU read lock.
4905  */
4906 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
4907 {
4908         struct netdev_adjacent *upper;
4909 
4910         upper = list_first_or_null_rcu(&dev->adj_list.upper,
4911                                        struct netdev_adjacent, list);
4912         if (upper && likely(upper->master))
4913                 return upper->dev;
4914         return NULL;
4915 }
4916 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
4917 
4918 static int netdev_adjacent_sysfs_add(struct net_device *dev,
4919                               struct net_device *adj_dev,
4920                               struct list_head *dev_list)
4921 {
4922         char linkname[IFNAMSIZ+7];
4923         sprintf(linkname, dev_list == &dev->adj_list.upper ?
4924                 "upper_%s" : "lower_%s", adj_dev->name);
4925         return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
4926                                  linkname);
4927 }
4928 static void netdev_adjacent_sysfs_del(struct net_device *dev,
4929                                char *name,
4930                                struct list_head *dev_list)
4931 {
4932         char linkname[IFNAMSIZ+7];
4933         sprintf(linkname, dev_list == &dev->adj_list.upper ?
4934                 "upper_%s" : "lower_%s", name);
4935         sysfs_remove_link(&(dev->dev.kobj), linkname);
4936 }
4937 
4938 static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
4939                                                  struct net_device *adj_dev,
4940                                                  struct list_head *dev_list)
4941 {
4942         return (dev_list == &dev->adj_list.upper ||
4943                 dev_list == &dev->adj_list.lower) &&
4944                 net_eq(dev_net(dev), dev_net(adj_dev));
4945 }
4946 
4947 static int __netdev_adjacent_dev_insert(struct net_device *dev,
4948                                         struct net_device *adj_dev,
4949                                         struct list_head *dev_list,
4950                                         void *private, bool master)
4951 {
4952         struct netdev_adjacent *adj;
4953         int ret;
4954 
4955         adj = __netdev_find_adj(dev, adj_dev, dev_list);
4956 
4957         if (adj) {
4958                 adj->ref_nr++;
4959                 return 0;
4960         }
4961 
4962         adj = kmalloc(sizeof(*adj), GFP_KERNEL);
4963         if (!adj)
4964                 return -ENOMEM;
4965 
4966         adj->dev = adj_dev;
4967         adj->master = master;
4968         adj->ref_nr = 1;
4969         adj->private = private;
4970         dev_hold(adj_dev);
4971 
4972         pr_debug("dev_hold for %s, because of link added from %s to %s\n",
4973                  adj_dev->name, dev->name, adj_dev->name);
4974 
4975         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
4976                 ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
4977                 if (ret)
4978                         goto free_adj;
4979         }
4980 
4981         /* Ensure that master link is always the first item in list. */
4982         if (master) {
4983                 ret = sysfs_create_link(&(dev->dev.kobj),
4984                                         &(adj_dev->dev.kobj), "master");
4985                 if (ret)
4986                         goto remove_symlinks;
4987 
4988                 list_add_rcu(&adj->list, dev_list);
4989         } else {
4990                 list_add_tail_rcu(&adj->list, dev_list);
4991         }
4992 
4993         return 0;
4994 
4995 remove_symlinks:
4996         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
4997                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
4998 free_adj:
4999         kfree(adj);
5000         dev_put(adj_dev);
5001 
5002         return ret;
5003 }
5004 
5005 static void __netdev_adjacent_dev_remove(struct net_device *dev,
5006                                          struct net_device *adj_dev,
5007                                          struct list_head *dev_list)
5008 {
5009         struct netdev_adjacent *adj;
5010 
5011         adj = __netdev_find_adj(dev, adj_dev, dev_list);
5012 
5013         if (!adj) {
5014                 pr_err("tried to remove device %s from %s\n",
5015                        dev->name, adj_dev->name);
5016                 BUG();
5017         }
5018 
5019         if (adj->ref_nr > 1) {
5020                 pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name,
5021                          adj->ref_nr-1);
5022                 adj->ref_nr--;
5023                 return;
5024         }
5025 
5026         if (adj->master)
5027                 sysfs_remove_link(&(dev->dev.kobj), "master");
5028 
5029         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5030                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5031 
5032         list_del_rcu(&adj->list);
5033         pr_debug("dev_put for %s, because link removed from %s to %s\n",
5034                  adj_dev->name, dev->name, adj_dev->name);
5035         dev_put(adj_dev);
5036         kfree_rcu(adj, rcu);
5037 }
5038 
5039 static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
5040                                             struct net_device *upper_dev,
5041                                             struct list_head *up_list,
5042                                             struct list_head *down_list,
5043                                             void *private, bool master)
5044 {
5045         int ret;
5046 
5047         ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private,
5048                                            master);
5049         if (ret)
5050                 return ret;
5051 
5052         ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private,
5053                                            false);
5054         if (ret) {
5055                 __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
5056                 return ret;
5057         }
5058 
5059         return 0;
5060 }
5061 
5062 static int __netdev_adjacent_dev_link(struct net_device *dev,
5063                                       struct net_device *upper_dev)
5064 {
5065         return __netdev_adjacent_dev_link_lists(dev, upper_dev,
5066                                                 &dev->all_adj_list.upper,
5067                                                 &upper_dev->all_adj_list.lower,
5068                                                 NULL, false);
5069 }
5070 
5071 static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
5072                                                struct net_device *upper_dev,
5073                                                struct list_head *up_list,
5074                                                struct list_head *down_list)
5075 {
5076         __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
5077         __netdev_adjacent_dev_remove(upper_dev, dev, down_list);
5078 }
5079 
5080 static void __netdev_adjacent_dev_unlink(struct net_device *dev,
5081                                          struct net_device *upper_dev)
5082 {
5083         __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5084                                            &dev->all_adj_list.upper,
5085                                            &upper_dev->all_adj_list.lower);
5086 }
5087 
5088 static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
5089                                                 struct net_device *upper_dev,
5090                                                 void *private, bool master)
5091 {
5092         int ret = __netdev_adjacent_dev_link(dev, upper_dev);
5093 
5094         if (ret)
5095                 return ret;
5096 
5097         ret = __netdev_adjacent_dev_link_lists(dev, upper_dev,
5098                                                &dev->adj_list.upper,
5099                                                &upper_dev->adj_list.lower,
5100                                                private, master);
5101         if (ret) {
5102                 __netdev_adjacent_dev_unlink(dev, upper_dev);
5103                 return ret;
5104         }
5105 
5106         return 0;
5107 }
5108 
5109 static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
5110                                                    struct net_device *upper_dev)
5111 {
5112         __netdev_adjacent_dev_unlink(dev, upper_dev);
5113         __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5114                                            &dev->adj_list.upper,
5115                                            &upper_dev->adj_list.lower);
5116 }
5117 
5118 static int __netdev_upper_dev_link(struct net_device *dev,
5119                                    struct net_device *upper_dev, bool master,
5120                                    void *private)
5121 {
5122         struct netdev_adjacent *i, *j, *to_i, *to_j;
5123         int ret = 0;
5124 
5125         ASSERT_RTNL();
5126 
5127         if (dev == upper_dev)
5128                 return -EBUSY;
5129 
5130         /* To prevent loops, check that dev is not an upper device of upper_dev. */
5131         if (__netdev_find_adj(upper_dev, dev, &upper_dev->all_adj_list.upper))
5132                 return -EBUSY;
5133 
5134         if (__netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper))
5135                 return -EEXIST;
5136 
5137         if (master && netdev_master_upper_dev_get(dev))
5138                 return -EBUSY;
5139 
5140         ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private,
5141                                                    master);
5142         if (ret)
5143                 return ret;
5144 
5145         /* Now that we linked these devs, make all the upper_dev's
5146          * all_adj_list.upper visible to every dev's all_adj_list.lower and
5147          * vice versa, and don't forget the devices themselves. All of these
5148          * links are non-neighbours.
5149          */
5150         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5151                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5152                         pr_debug("Interlinking %s with %s, non-neighbour\n",
5153                                  i->dev->name, j->dev->name);
5154                         ret = __netdev_adjacent_dev_link(i->dev, j->dev);
5155                         if (ret)
5156                                 goto rollback_mesh;
5157                 }
5158         }
5159 
5160         /* add dev to every upper_dev's upper device */
5161         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5162                 pr_debug("linking %s's upper device %s with %s\n",
5163                          upper_dev->name, i->dev->name, dev->name);
5164                 ret = __netdev_adjacent_dev_link(dev, i->dev);
5165                 if (ret)
5166                         goto rollback_upper_mesh;
5167         }
5168 
5169         /* add upper_dev to every dev's lower device */
5170         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5171                 pr_debug("linking %s's lower device %s with %s\n", dev->name,
5172                          i->dev->name, upper_dev->name);
5173                 ret = __netdev_adjacent_dev_link(i->dev, upper_dev);
5174                 if (ret)
5175                         goto rollback_lower_mesh;
5176         }
5177 
5178         call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
5179         return 0;
5180 
5181 rollback_lower_mesh:
5182         to_i = i;
5183         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5184                 if (i == to_i)
5185                         break;
5186                 __netdev_adjacent_dev_unlink(i->dev, upper_dev);
5187         }
5188 
5189         i = NULL;
5190 
5191 rollback_upper_mesh:
5192         to_i = i;
5193         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5194                 if (i == to_i)
5195                         break;
5196                 __netdev_adjacent_dev_unlink(dev, i->dev);
5197         }
5198 
5199         i = j = NULL;
5200 
5201 rollback_mesh:
5202         to_i = i;
5203         to_j = j;
5204         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5205                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5206                         if (i == to_i && j == to_j)
5207                                 break;
5208                         __netdev_adjacent_dev_unlink(i->dev, j->dev);
5209                 }
5210                 if (i == to_i)
5211                         break;
5212         }
5213 
5214         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5215 
5216         return ret;
5217 }
5218 
5219 /**
5220  * netdev_upper_dev_link - Add a link to the upper device
5221  * @dev: device
5222  * @upper_dev: new upper device
5223  *
5224  * Adds a link to device which is upper to this one. The caller must hold
5225  * the RTNL lock. On a failure a negative errno code is returned.
5226  * On success the reference counts are adjusted and the function
5227  * returns zero.
5228  */
5229 int netdev_upper_dev_link(struct net_device *dev,
5230                           struct net_device *upper_dev)
5231 {
5232         return __netdev_upper_dev_link(dev, upper_dev, false, NULL);
5233 }
5234 EXPORT_SYMBOL(netdev_upper_dev_link);
5235 
5236 /**
5237  * netdev_master_upper_dev_link - Add a master link to the upper device
5238  * @dev: device
5239  * @upper_dev: new upper device
5240  *
5241  * Adds a link to device which is upper to this one. In this case, only
5242  * one master upper device can be linked, although other non-master devices
5243  * might be linked as well. The caller must hold the RTNL lock.
5244  * On a failure a negative errno code is returned. On success the reference
5245  * counts are adjusted and the function returns zero.
5246  */
5247 int netdev_master_upper_dev_link(struct net_device *dev,
5248                                  struct net_device *upper_dev)
5249 {
5250         return __netdev_upper_dev_link(dev, upper_dev, true, NULL);
5251 }
5252 EXPORT_SYMBOL(netdev_master_upper_dev_link);
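/* Illustrative sketch, not part of dev.c: a bonding-style enslave/release
 * pair built on netdev_master_upper_dev_link() and netdev_upper_dev_unlink()
 * (defined just below). foo_enslave()/foo_release() are hypothetical; both
 * run under RTNL, as required.
 */
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>

static int foo_enslave(struct net_device *master, struct net_device *slave)
{
        int err;

        ASSERT_RTNL();
        err = netdev_master_upper_dev_link(slave, master);
        if (err)
                return err;     /* e.g. -EBUSY if slave already has a master */

        /* master-specific setup of the slave would go here */
        return 0;
}

static void foo_release(struct net_device *master, struct net_device *slave)
{
        ASSERT_RTNL();
        netdev_upper_dev_unlink(slave, master);
}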
5253 
5254 int netdev_master_upper_dev_link_private(struct net_device *dev,
5255                                          struct net_device *upper_dev,
5256                                          void *private)
5257 {
5258         return __netdev_upper_dev_link(dev, upper_dev, true, private);
5259 }
5260 EXPORT_SYMBOL(netdev_master_upper_dev_link_private);
5261 
5262 /**
5263  * netdev_upper_dev_unlink - Removes a link to upper device
5264  * @dev: device
5265  * @upper_dev: upper device to unlink
5266  *
5267  * Removes a link to device which is upper to this one. The caller must hold
5268  * the RTNL lock.
5269  */
5270 void netdev_upper_dev_unlink(struct net_device *dev,
5271                              struct net_device *upper_dev)
5272 {
5273         struct netdev_adjacent *i, *j;
5274         ASSERT_RTNL();
5275 
5276         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5277 
5278         /* Here is the tricky part. We must remove all dev's lower
5279          * devices from all upper_dev's upper devices and vice
5280          * versa, to maintain the graph relationship.
5281          */
5282         list_for_each_entry(i, &dev->all_adj_list.lower, list)
5283                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
5284                         __netdev_adjacent_dev_unlink(i->dev, j->dev);
5285 
5286         /* also remove the devices themselves from the lower/upper
5287          * device lists
5288          */
5289         list_for_each_entry(i, &dev->all_adj_list.lower, list)
5290                 __netdev_adjacent_dev_unlink(i->dev, upper_dev);
5291 
5292         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
5293                 __netdev_adjacent_dev_unlink(dev, i->dev);
5294 
5295         call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
5296 }
5297 EXPORT_SYMBOL(netdev_upper_dev_unlink);
5298 
5299 static void netdev_adjacent_add_links(struct net_device *dev)
5300 {
5301         struct netdev_adjacent *iter;
5302 
5303         struct net *net = dev_net(dev);
5304 
5305         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5306                 if (!net_eq(net, dev_net(iter->dev)))
5307                         continue;
5308                 netdev_adjacent_sysfs_add(iter->dev, dev,
5309                                           &iter->dev->adj_list.lower);
5310                 netdev_adjacent_sysfs_add(dev, iter->dev,
5311                                           &dev->adj_list.upper);
5312         }
5313 
5314         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5315                 if (!net_eq(net, dev_net(iter->dev)))
5316                         continue;
5317                 netdev_adjacent_sysfs_add(iter->dev, dev,
5318                                           &iter->dev->adj_list.upper);
5319                 netdev_adjacent_sysfs_add(dev, iter->dev,
5320                                           &dev->adj_list.lower);
5321         }
5322 }
5323 
5324 static void netdev_adjacent_del_links(struct net_device *dev)
5325 {
5326         struct netdev_adjacent *iter;
5327 
5328         struct net *net = dev_net(dev);
5329 
5330         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5331                 if (!net_eq(net, dev_net(iter->dev)))
5332                         continue;
5333                 netdev_adjacent_sysfs_del(iter->dev, dev->name,
5334                                           &iter->dev->adj_list.lower);
5335                 netdev_adjacent_sysfs_del(dev, iter->dev->name,
5336                                           &dev->adj_list.upper);
5337         }
5338 
5339         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5340                 if (!net_eq(net, dev_net(iter->dev)))
5341                         continue;
5342                 netdev_adjacent_sysfs_del(iter->dev, dev->name,
5343                                           &iter->dev->adj_list.upper);
5344                 netdev_adjacent_sysfs_del(dev, iter->dev->name,
5345                                           &dev->adj_list.lower);
5346         }
5347 }
5348 
5349 void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
5350 {
5351         struct netdev_adjacent *iter;
5352 
5353         struct net *net = dev_net(dev);
5354 
5355         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5356                 if (!net_eq(net, dev_net(iter->dev)))
5357                         continue;
5358                 netdev_adjacent_sysfs_del(iter->dev, oldname,
5359                                           &iter->dev->adj_list.lower);
5360                 netdev_adjacent_sysfs_add(iter->dev, dev,
5361                                           &iter->dev->adj_list.lower);
5362         }
5363 
5364         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5365                 if (!net_eq(net, dev_net(iter->dev)))
5366                         continue;
5367                 netdev_adjacent_sysfs_del(iter->dev, oldname,
5368                                           &iter->dev->adj_list.upper);
5369                 netdev_adjacent_sysfs_add(iter->dev, dev,
5370                                           &iter->dev->adj_list.upper);
5371         }
5372 }
5373 
5374 void *netdev_lower_dev_get_private(struct net_device *dev,
5375                                    struct net_device *lower_dev)
5376 {
5377         struct netdev_adjacent *lower;
5378 
5379         if (!lower_dev)
5380                 return NULL;
5381         lower = __netdev_find_adj(dev, lower_dev, &dev->adj_list.lower);
5382         if (!lower)
5383                 return NULL;
5384 
5385         return lower->private;
5386 }
5387 EXPORT_SYMBOL(netdev_lower_dev_get_private);
5388 
5389 
5390 int dev_get_nest_level(struct net_device *dev,
5391                        bool (*type_check)(struct net_device *dev))
5392 {
5393         struct net_device *lower = NULL;
5394         struct list_head *iter;
5395         int max_nest = -1;
5396         int nest;
5397 
5398         ASSERT_RTNL();
5399 
5400         netdev_for_each_lower_dev(dev, lower, iter) {
5401                 nest = dev_get_nest_level(lower, type_check);
5402                 if (max_nest < nest)
5403                         max_nest = nest;
5404         }
5405 
5406         if (type_check(dev))
5407                 max_nest++;
5408 
5409         return max_nest;
5410 }
5411 EXPORT_SYMBOL(dev_get_nest_level);
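/* Illustrative sketch, not part of dev.c: a stacked "foo" device type
 * deriving its nesting depth with dev_get_nest_level(), typically to feed
 * lockdep a subclass for per-device locks so stacked instances do not
 * trigger false deadlock reports. foo_netdev_ops, is_foo_dev() and
 * foo_nest_level() are hypothetical.
 */
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>

static const struct net_device_ops foo_netdev_ops;     /* fields omitted */

static bool is_foo_dev(struct net_device *dev)
{
        return dev->netdev_ops == &foo_netdev_ops;
}

static int foo_nest_level(struct net_device *dev)
{
        ASSERT_RTNL();
        return dev_get_nest_level(dev, is_foo_dev);
}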
5412 
5413 static void dev_change_rx_flags(struct net_device *dev, int flags)
5414 {
5415         const struct net_device_ops *ops = dev->netdev_ops;
5416 
5417         if (ops->ndo_change_rx_flags)
5418                 ops->ndo_change_rx_flags(dev, flags);
5419 }
5420 
5421 static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
5422 {
5423         unsigned int old_flags = dev->flags;
5424         kuid_t uid;
5425         kgid_t gid;
5426 
5427         ASSERT_RTNL();
5428 
5429         dev->flags |= IFF_PROMISC;
5430         dev->promiscuity += inc;
5431         if (dev->promiscuity == 0) {
5432                 /*
5433                  * Avoid overflow.
5434                  * If inc causes overflow, leave promiscuity untouched and return an error.
5435                  */
5436                 if (inc < 0)
5437                         dev->flags &= ~IFF_PROMISC;
5438                 else {
5439                         dev->promiscuity -= inc;
5440                         pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
5441                                 dev->name);
5442                         return -EOVERFLOW;
5443                 }
5444         }
5445         if (dev->flags != old_flags) {
5446                 pr_info("device %s %s promiscuous mode\n",
5447                         dev->name,
5448                         dev->flags & IFF_PROMISC ? "entered" : "left");
5449                 if (audit_enabled) {
5450                         current_uid_gid(&uid, &gid);
5451                         audit_log(current->audit_context, GFP_ATOMIC,
5452                                 AUDIT_ANOM_PROMISCUOUS,
5453                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
5454                                 dev->name, (dev->flags & IFF_PROMISC),
5455                                 (old_flags & IFF_PROMISC),
5456                                 from_kuid(&init_user_ns, audit_get_loginuid(current)),
5457                                 from_kuid(&init_user_ns, uid),
5458                                 from_kgid(&init_user_ns, gid),
5459                                 audit_get_sessionid(current));
5460                 }
5461 
5462                 dev_change_rx_flags(dev, IFF_PROMISC);
5463         }
5464         if (notify)
5465                 __dev_notify_flags(dev, old_flags, IFF_PROMISC);
5466         return 0;
5467 }
5468 
5469 /**
5470  *      dev_set_promiscuity     - update promiscuity count on a device
5471  *      @dev: device
5472  *      @inc: modifier
5473  *
5474  *      Add or remove promiscuity from a device. While the count in the device
5475  *      remains above zero the interface remains promiscuous. Once it hits zero
5476  *      the device reverts back to normal filtering operation. A negative inc
5477  *      value is used to drop promiscuity on the device.
5478  *      Return 0 if successful or a negative errno code on error.
5479  */
5480 int dev_set_promiscuity(struct net_device *dev, int inc)
5481 {
5482         unsigned int old_flags = dev->flags;
5483         int err;
5484 
5485         err = __dev_set_promiscuity(dev, inc, true);
5486         if (err < 0)
5487                 return err;
5488         if (dev->flags != old_flags)
5489                 dev_set_rx_mode(dev);
5490         return err;
5491 }
5492 EXPORT_SYMBOL(dev_set_promiscuity);
5493 
5494 static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
5495 {
5496         unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
5497 
5498         ASSERT_RTNL();
5499 
5500         dev->flags |= IFF_ALLMULTI;
5501         dev->allmulti += inc;
5502         if (dev->allmulti == 0) {
5503                 /*
5504                  * Avoid overflow.
5505                  * If inc causes overflow, leave allmulti untouched and return an error.
5506                  */
5507                 if (inc < 0)
5508                         dev->flags &= ~IFF_ALLMULTI;
5509                 else {
5510                         dev->allmulti -= inc;
5511                         pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
5512                                 dev->name);
5513                         return -EOVERFLOW;
5514                 }
5515         }
5516         if (dev->flags ^ old_flags) {
5517                 dev_change_rx_flags(dev, IFF_ALLMULTI);
5518                 dev_set_rx_mode(dev);
5519                 if (notify)
5520                         __dev_notify_flags(dev, old_flags,
5521                                            dev->gflags ^ old_gflags);
5522         }
5523         return 0;
5524 }
5525 
5526 /**
5527  *      dev_set_allmulti        - update allmulti count on a device
5528  *      @dev: device
5529  *      @inc: modifier
5530  *
5531  *      Add or remove reception of all multicast frames to a device. While the
5532  *      count in the device remains above zero the interface remains listening
5533  *      to all multicast frames. Once it hits zero the device reverts back to normal
5534  *      filtering operation. A negative @inc value is used to drop the counter
5535  *      when releasing a resource needing all multicasts.
5536  *      Return 0 if successful or a negative errno code on error.
5537  */
5538 
5539 int dev_set_allmulti(struct net_device *dev, int inc)
5540 {
5541         return __dev_set_allmulti(dev, inc, true);
5542 }
5543 EXPORT_SYMBOL(dev_set_allmulti);
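/* Illustrative sketch, not part of dev.c: a consumer that needs to see all
 * traffic on @dev bumps the promiscuity and allmulti counters while bound
 * and drops them with a negative increment on detach. foo_bind()/foo_unbind()
 * are hypothetical; both helpers require RTNL.
 */
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>

static int foo_bind(struct net_device *dev)
{
        int err;

        ASSERT_RTNL();
        err = dev_set_promiscuity(dev, 1);
        if (err)
                return err;

        err = dev_set_allmulti(dev, 1);
        if (err)
                dev_set_promiscuity(dev, -1);   /* roll back on failure */
        return err;
}

static void foo_unbind(struct net_device *dev)
{
        ASSERT_RTNL();
        dev_set_allmulti(dev, -1);
        dev_set_promiscuity(dev, -1);
}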
5544 
5545 /*
5546  *      Upload unicast and multicast address lists to device and
5547  *      configure RX filtering. When the device doesn't support unicast
5548  *      filtering it is put in promiscuous mode while unicast addresses
5549  *      are present.
5550  */
5551 void __dev_set_rx_mode(struct net_device *dev)
5552 {
5553         const struct net_device_ops *ops = dev->netdev_ops;
5554 
5555         /* dev_open will call this function so the list will stay sane. */
5556         if (!(dev->flags & IFF_UP))
5557                 return;
5558 
5559         if (!netif_device_present(dev))
5560                 return;
5561 
5562         if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
5563                 /* Unicast address changes may only happen under the rtnl,
5564                  * therefore calling __dev_set_promiscuity here is safe.
5565                  */
5566                 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
5567                         __dev_set_promiscuity(dev, 1, false);
5568                         dev->uc_promisc = true;
5569                 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
5570                         __dev_set_promiscuity(dev, -1, false);
5571                         dev->uc_promisc = false;
5572                 }
5573         }
5574 
5575         if (ops->ndo_set_rx_mode)
5576                 ops->ndo_set_rx_mode(dev);
5577 }
5578 
5579 void dev_set_rx_mode(struct net_device *dev)
5580 {
5581         netif_addr_lock_bh(dev);
5582         __dev_set_rx_mode(dev);
5583         netif_addr_unlock_bh(dev);
5584 }
5585 
5586 /**
5587  *      dev_get_flags - get flags reported to userspace
5588  *      @dev: device
5589  *
5590  *      Get the combination of flag bits exported through APIs to userspace.
5591  */
5592 unsigned int dev_get_flags(const struct net_device *dev)
5593 {
5594         unsigned int flags;
5595 
5596         flags = (dev->flags & ~(IFF_PROMISC |
5597                                 IFF_ALLMULTI |
5598                                 IFF_RUNNING |
5599                                 IFF_LOWER_UP |
5600                                 IFF_DORMANT)) |
5601                 (dev->gflags & (IFF_PROMISC |
5602                                 IFF_ALLMULTI));
5603 
5604         if (netif_running(dev)) {
5605                 if (netif_oper_up(dev))
5606                         flags |= IFF_RUNNING;
5607                 if (netif_carrier_ok(dev))
5608                         flags |= IFF_LOWER_UP;
5609                 if (netif_dormant(dev))
5610                         flags |= IFF_DORMANT;
5611         }
5612 
5613         return flags;
5614 }
5615 EXPORT_SYMBOL(dev_get_flags);
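/* Illustrative sketch, not part of dev.c: reading the userspace view of a
 * device's flags with dev_get_flags(), the same combination reported via
 * SIOCGIFFLAGS and rtnetlink. foo_show_flags() is hypothetical.
 */
#include <linux/netdevice.h>

static void foo_show_flags(struct net_device *dev)
{
        unsigned int flags = dev_get_flags(dev);

        netdev_info(dev, "flags 0x%x%s%s\n", flags,
                    flags & IFF_UP ? " up" : "",
                    flags & IFF_RUNNING ? " running" : "");
}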
5616 
5617 int __dev_change_flags(struct net_device *dev, unsigned int flags)
5618 {
5619         unsigned int old_flags = dev->flags;
5620         int ret;
5621 
5622         ASSERT_RTNL();
5623 
5624         /*
5625          *      Set the flags on our device.
5626          */
5627 
5628         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
5629                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
5630                                IFF_AUTOMEDIA)) |
5631                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
5632                                     IFF_ALLMULTI));
5633 
5634         /*
5635          *      Load in the correct multicast list now the flags have changed.
5636          */
5637 
5638         if ((old_flags ^ flags) & IFF_MULTICAST)
5639                 dev_change_rx_flags(dev, IFF_MULTICAST);
5640 
5641         dev_set_rx_mode(dev);
5642 
5643         /*
5644          *      Have we downed the interface? We handle IFF_UP ourselves
5645          *      according to user attempts to set it, rather than blindly
5646          *      setting it.
5647          */
5648 
5649         ret = 0;
5650         if ((old_flags ^ flags) & IFF_UP)
5651                 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
5652 
5653         if ((flags ^ dev->gflags) & IFF_PROMISC) {
5654                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
5655                 unsigned int old_flags = dev->flags;
5656 
5657                 dev->gflags ^= IFF_PROMISC;
5658 
5659                 if (__dev_set_promiscuity(dev, inc, false) >= 0)
5660                         if (dev->flags != old_flags)
5661                                 dev_set_rx_mode(dev);
5662         }
5663 
5664         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
5665            is important. Some (broken) drivers set IFF_PROMISC when
5666            IFF_ALLMULTI is requested, without asking us and without reporting it.
5667          */
5668         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
5669                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
5670 
5671                 dev->gflags ^= IFF_ALLMULTI;
5672                 __dev_set_allmulti(dev, inc, false);
5673         }
5674 
5675         return ret;
5676 }
5677 
5678 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
5679                         unsigned int gchanges)
5680 {
5681         unsigned int changes = dev->flags ^ old_flags;
5682 
5683         if (gchanges)
5684                 rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
5685 
5686         if (changes & IFF_UP) {
5687                 if (dev->flags & IFF_UP)
5688                         call_netdevice_notifiers(NETDEV_UP, dev);
5689                 else
5690                         call_netdevice_notifiers(NETDEV_DOWN, dev);
5691         }
5692 
5693         if (dev->flags & IFF_UP &&
5694             (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
5695                 struct netdev_notifier_change_info change_info;
5696 
5697                 change_info.flags_changed = changes;
5698                 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
5699                                               &change_info.info);
5700         }
5701 }
5702 
5703 /**
5704  *      dev_change_flags - change device settings
5705  *      @dev: device
5706  *      @flags: device state flags
5707  *
5708  *      Change settings on a device based on the state flags. The flags are
5709  *      in the userspace exported format.
5710  */
5711 int dev_change_flags(struct net_device *dev, unsigned int flags)
5712 {
5713         int ret;
5714         unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
5715 
5716         ret = __dev_change_flags(dev, flags);
5717         if (ret < 0)
5718                 return ret;
5719 
5720         changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
5721         __dev_notify_flags(dev, old_flags, changes);
5722         return ret;
5723 }
5724 EXPORT_SYMBOL(dev_change_flags);
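/* Illustrative sketch, not part of dev.c: administratively toggling IFF_UP
 * through dev_change_flags(), the same path the SIOCSIFFLAGS ioctl takes.
 * foo_set_up() is hypothetical; the flags are edited in the userspace
 * exported format under RTNL.
 */
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>

static int foo_set_up(struct net_device *dev, bool up)
{
        unsigned int flags;
        int err;

        rtnl_lock();
        flags = dev_get_flags(dev);
        err = dev_change_flags(dev, up ? flags | IFF_UP : flags & ~IFF_UP);
        rtnl_unlock();

        return err;
}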
5725 
5726 static int __dev_set_mtu(struct net_device *dev, int new_mtu)
5727 {
5728         const struct net_device_ops *ops = dev->netdev_ops;
5729 
5730         if (ops->ndo_change_mtu)
5731                 return ops->ndo_change_mtu(dev, new_mtu);
5732 
5733         dev->mtu = new_mtu;
5734         return 0;
5735 }
5736 
5737 /**
5738  *      dev_set_mtu - Change maximum transfer unit
5739  *      @dev: device
5740  *      @new_mtu: new transfer unit
5741  *
5742  *      Change the maximum transfer size of the network device.
5743  */
5744 int dev_set_mtu(struct net_device *dev, int new_mtu)
5745 {
5746         int err, orig_mtu;
5747 
5748         if (new_mtu == dev->mtu)
5749                 return 0;
5750 
5751         /*      MTU must not be negative.        */
5752         if (new_mtu < 0)
5753                 return -EINVAL;
5754 
5755         if (!netif_device_present(dev))
5756                 return -ENODEV;
5757 
5758         err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
5759         err = notifier_to_errno(err);
5760         if (err)
5761                 return err;
5762 
5763         orig_mtu = dev->mtu;
5764         err = __dev_set_mtu(dev, new_mtu);
5765 
5766         if (!err) {
5767                 err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5768                 err = notifier_to_errno(err);
5769                 if (err) {
5770                         /* setting mtu back and notifying everyone again,
5771                          * so that they have a chance to revert changes.
5772                          */
5773                         __dev_set_mtu(dev, orig_mtu);
5774                         call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5775                 }
5776         }
5777         return err;
5778 }
5779 EXPORT_SYMBOL(dev_set_mtu);
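/* Illustrative sketch, not part of dev.c: a tunnel-like upper device
 * shrinking its own MTU to leave room for @overhead bytes of encapsulation.
 * dev_set_mtu() runs the PRECHANGEMTU/CHANGEMTU notifiers on our behalf.
 * foo_fit_mtu() is hypothetical.
 */
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>

static int foo_fit_mtu(struct net_device *dev, struct net_device *lower,
                       int overhead)
{
        int err;

        rtnl_lock();
        err = dev_set_mtu(dev, lower->mtu - overhead);
        rtnl_unlock();

        return err;
}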
5780 
5781 /**
5782  *      dev_set_group - Change group this device belongs to
5783  *      @dev: device
5784  *      @new_group: group this device should belong to
5785  */
5786 void dev_set_group(struct net_device *dev, int new_group)
5787 {
5788         dev->group = new_group;
5789 }
5790 EXPORT_SYMBOL(dev_set_group);
5791 
5792 /**
5793  *      dev_set_mac_address - Change Media Access Control Address
5794  *      @dev: device
5795  *      @sa: new address
5796  *
5797  *      Change the hardware (MAC) address of the device
5798  */
5799 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
5800 {
5801         const struct net_device_ops *ops = dev->netdev_ops;
5802         int err;
5803 
5804         if (!ops->ndo_set_mac_address)
5805                 return -EOPNOTSUPP;
5806         if (sa->sa_family != dev->type)
5807                 return -EINVAL;
5808         if (!netif_device_present(dev))
5809                 return -ENODEV;
5810         err = ops->ndo_set_mac_address(dev, sa);
5811         if (err)
5812                 return err;
5813         dev->addr_assign_type = NET_ADDR_SET;
5814         call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
5815         add_device_randomness(dev->dev_addr, dev->addr_len);
5816         return 0;
5817 }
5818 EXPORT_SYMBOL(dev_set_mac_address);
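/* Illustrative sketch, not part of dev.c: setting a new Ethernet MAC via
 * dev_set_mac_address(). The sockaddr must carry the device's address
 * family in sa_family, as checked above. foo_set_mac() is hypothetical.
 */
#include <linux/if_ether.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/socket.h>
#include <linux/string.h>

static int foo_set_mac(struct net_device *dev, const u8 *mac)
{
        struct sockaddr sa;
        int err;

        sa.sa_family = dev->type;               /* e.g. ARPHRD_ETHER */
        memcpy(sa.sa_data, mac, ETH_ALEN);

        rtnl_lock();
        err = dev_set_mac_address(dev, &sa);
        rtnl_unlock();

        return err;
}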
5819 
5820 /**
5821  *      dev_change_carrier - Change device carrier
5822  *      @dev: device
5823  *      @new_carrier: new value
5824  *
5825  *      Change device carrier
5826  */
5827 int dev_change_carrier(struct net_device *dev, bool new_carrier)
5828 {
5829         const struct net_device_ops *ops = dev->netdev_ops;
5830 
5831         if (!ops->ndo_change_carrier)
5832                 return -EOPNOTSUPP;
5833         if (!netif_device_present(dev))
5834                 return -ENODEV;
5835         return ops->ndo_change_carrier(dev, new_carrier);
5836 }
5837 EXPORT_SYMBOL(dev_change_carrier);
5838 
5839 /**
5840  *      dev_get_phys_port_id - Get device physical port ID
5841  *      @dev: device
5842  *      @ppid: port ID
5843  *
5844  *      Get device physical port ID
5845  */
5846 int dev_get_phys_port_id(struct net_device *dev,
5847                          struct netdev_phys_item_id *ppid)
5848 {
5849         const struct net_device_ops *ops = dev->netdev_ops;
5850 
5851         if (!ops->ndo_get_phys_port_id)
5852                 return -EOPNOTSUPP;
5853         return ops->ndo_get_phys_port_id(dev, ppid);
5854 }
5855 EXPORT_SYMBOL(dev_get_phys_port_id);
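/* Illustrative sketch, not part of dev.c: querying the physical port ID of
 * a device; drivers that do not implement ndo_get_phys_port_id make the
 * helper return -EOPNOTSUPP, as above. foo_show_phys_port() is hypothetical.
 */
#include <linux/netdevice.h>

static void foo_show_phys_port(struct net_device *dev)
{
        struct netdev_phys_item_id ppid;

        if (!dev_get_phys_port_id(dev, &ppid))
                netdev_info(dev, "phys port id: %*phN\n",
                            ppid.id_len, ppid.id);
}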
5856 
5857 /**
5858  *      dev_new_index   -       allocate an ifindex
5859  *      @net: the applicable net namespace
5860  *
5861  *      Returns a suitable unique value for a new device interface
5862  *      number.  The caller must hold the rtnl semaphore or the
5863  *      dev_base_lock to be sure it remains unique.
5864  */
5865 static int dev_new_index(struct net *net)
5866 {
5867         int ifindex = net->ifindex;
5868         for (;;) {
5869                 if (++ifindex <= 0)
5870                         ifindex = 1;
5871                 if (!__dev_get_by_index(net, ifindex))
5872                         return net->ifindex = ifindex;
5873         }
5874 }
5875 
5876 /* Delayed registration/unregistration */
5877 static LIST_HEAD(net_todo_list);
5878 DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
5879 
5880 static void net_set_todo(struct net_device *dev)
5881 {
5882         list_add_tail(&dev->todo_list, &net_todo_list);
5883         dev_net(dev)->dev_unreg_count++;
5884 }
5885 
5886 static void rollback_registered_many(struct list_head *head)
5887 {
5888         struct net_device *dev, *tmp;
5889         LIST_HEAD(close_head);
5890 
5891         BUG_ON(dev_boot_phase);
5892         ASSERT_RTNL();
5893 
5894         list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5895                 /* Some devices call us without ever having registered,
5896                  * to unwind a failed initialization. Remove those
5897                  * devices and proceed with the remaining.
5898                  */
5899                 if (dev->reg_state == NETREG_UNINITIALIZED) {
5900                         pr_debug("unregister_netdevice: device %s/%p never was registered\n",
5901                                  dev->name, dev);
5902 
5903                         WARN_ON(1);
5904                         list_del(&dev->unreg_list);
5905                         continue;
5906                 }
5907                 dev->dismantle = true;
5908                 BUG_ON(dev->reg_state != NETREG_REGISTERED);
5909         }
5910 
5911         /* If device is running, close it first. */
5912         list_for_each_entry(dev, head, unreg_list)
5913                 list_add_tail(&dev->close_list, &close_head);
5914         dev_close_many(&close_head);
5915 
5916         list_for_each_entry(dev, head, unreg_list) {
5917                 /* And unlink it from device chain. */
5918                 unlist_netdevice(dev);
5919 
5920                 dev->reg_state = NETREG_UNREGISTERING;
5921         }
5922 
5923         synchronize_net();
5924 
5925         list_for_each_entry(dev, head, unreg_list) {
5926                 struct sk_buff *skb = NULL;
5927 
5928                 /* Shutdown queueing discipline. */
5929                 dev_shutdown(dev);
5930 
5931 
5932                 /* Notify protocols that we are about to destroy
5933                    this device. They should clean up all of their state.
5934                 */
5935                 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5936 
5937                 if (!dev->rtnl_link_ops ||
5938                     dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5939                         skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U,
5940                                                      GFP_KERNEL);
5941 
5942                 /*
5943                  *      Flush the unicast and multicast chains
5944                  */
5945                 dev_uc_flush(dev);
5946                 dev_mc_flush(dev);
5947 
5948                 if (dev->netdev_ops->ndo_uninit)
5949                         dev->netdev_ops->ndo_uninit(dev);
5950 
5951                 if (skb)
5952                         rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
5953 
5954                 /* The notifier chain MUST have detached all upper devices from us. */
5955                 WARN_ON(netdev_has_any_upper_dev(dev));
5956 
5957                 /* Remove entries from kobject tree */
5958                 netdev_unregister_kobject(dev);
5959 #ifdef CONFIG_XPS
5960                 /* Remove XPS queueing entries */
5961                 netif_reset_xps_queues_gt(dev, 0);
5962 #endif
5963         }
5964 
5965         synchronize_net();
5966 
5967         list_for_each_entry(dev, head, unreg_list)
5968                 dev_put(dev);
5969 }
5970 
5971 static void rollback_registered(struct net_device *dev)
5972 {
5973         LIST_HEAD(single);
5974 
5975         list_add(&dev->unreg_list, &single);
5976         rollback_registered_many(&single);
5977         list_del(&single);
5978 }
5979 
5980 static netdev_features_t netdev_fix_features(struct net_device *dev,
5981         netdev_features_t features)
5982 {
5983         /* Fix illegal checksum combinations */
5984         if ((features & NETIF_F_HW_CSUM) &&
5985             (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5986                 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5987                 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5988         }
5989 
5990         /* TSO requires that SG is present as well. */
5991         if ((features & NETIF_F_ALL_TSO) && !(