
TOMOYO Linux Cross Reference
Linux/net/core/dev.c


  1 // SPDX-License-Identifier: GPL-2.0-or-later
  2 /*
  3  *      NET3    Protocol independent device support routines.
  4  *
  5  *      Derived from the non IP parts of dev.c 1.0.19
  6  *              Authors:        Ross Biro
  7  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  8  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  9  *
 10  *      Additional Authors:
 11  *              Florian la Roche <rzsfl@rz.uni-sb.de>
 12  *              Alan Cox <gw4pts@gw4pts.ampr.org>
 13  *              David Hinds <dahinds@users.sourceforge.net>
 14  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
 15  *              Adam Sulmicki <adam@cfar.umd.edu>
 16  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
 17  *
 18  *      Changes:
 19  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
 20  *                                      to 2 if register_netdev gets called
 21  *                                      before net_dev_init & also removed a
 22  *                                      few lines of code in the process.
 23  *              Alan Cox        :       device private ioctl copies fields back.
 24  *              Alan Cox        :       Transmit queue code does relevant
 25  *                                      stunts to keep the queue safe.
 26  *              Alan Cox        :       Fixed double lock.
 27  *              Alan Cox        :       Fixed promisc NULL pointer trap
 28  *              ????????        :       Support the full private ioctl range
 29  *              Alan Cox        :       Moved ioctl permission check into
 30  *                                      drivers
 31  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
 32  *              Alan Cox        :       100 backlog just doesn't cut it when
 33  *                                      you start doing multicast video 8)
 34  *              Alan Cox        :       Rewrote net_bh and list manager.
 35  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
 36  *              Alan Cox        :       Took out transmit every packet pass
 37  *                                      Saved a few bytes in the ioctl handler
 38  *              Alan Cox        :       Network driver sets packet type before
 39  *                                      calling netif_rx. Saves a function
 40  *                                      call a packet.
 41  *              Alan Cox        :       Hashed net_bh()
 42  *              Richard Kooijman:       Timestamp fixes.
 43  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
 44  *              Alan Cox        :       Device lock protection.
 45  *              Alan Cox        :       Fixed nasty side effect of device close
 46  *                                      changes.
 47  *              Rudi Cilibrasi  :       Pass the right thing to
 48  *                                      set_mac_address()
 49  *              Dave Miller     :       32bit quantity for the device lock to
 50  *                                      make it work out on a Sparc.
 51  *              Bjorn Ekwall    :       Added KERNELD hack.
 52  *              Alan Cox        :       Cleaned up the backlog initialise.
 53  *              Craig Metz      :       SIOCGIFCONF fix if space for under
 54  *                                      1 device.
 55  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
 56  *                                      is no device open function.
 57  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
 58  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
 59  *              Cyrus Durgin    :       Cleaned for KMOD
 60  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
 61  *                                      A network device unload needs to purge
 62  *                                      the backlog queue.
 63  *      Paul Rusty Russell      :       SIOCSIFNAME
 64  *              Pekka Riikonen  :       Netdev boot-time settings code
 65  *              Andrew Morton   :       Make unregister_netdevice wait
 66  *                                      indefinitely on dev->refcnt
 67  *              J Hadi Salim    :       - Backlog queue sampling
 68  *                                      - netif_rx() feedback
 69  */
 70 
 71 #include <linux/uaccess.h>
 72 #include <linux/bitops.h>
 73 #include <linux/capability.h>
 74 #include <linux/cpu.h>
 75 #include <linux/types.h>
 76 #include <linux/kernel.h>
 77 #include <linux/hash.h>
 78 #include <linux/slab.h>
 79 #include <linux/sched.h>
 80 #include <linux/sched/mm.h>
 81 #include <linux/mutex.h>
 82 #include <linux/string.h>
 83 #include <linux/mm.h>
 84 #include <linux/socket.h>
 85 #include <linux/sockios.h>
 86 #include <linux/errno.h>
 87 #include <linux/interrupt.h>
 88 #include <linux/if_ether.h>
 89 #include <linux/netdevice.h>
 90 #include <linux/etherdevice.h>
 91 #include <linux/ethtool.h>
 92 #include <linux/skbuff.h>
 93 #include <linux/bpf.h>
 94 #include <linux/bpf_trace.h>
 95 #include <net/net_namespace.h>
 96 #include <net/sock.h>
 97 #include <net/busy_poll.h>
 98 #include <linux/rtnetlink.h>
 99 #include <linux/stat.h>
100 #include <net/dst.h>
101 #include <net/dst_metadata.h>
102 #include <net/pkt_sched.h>
103 #include <net/pkt_cls.h>
104 #include <net/checksum.h>
105 #include <net/xfrm.h>
106 #include <linux/highmem.h>
107 #include <linux/init.h>
108 #include <linux/module.h>
109 #include <linux/netpoll.h>
110 #include <linux/rcupdate.h>
111 #include <linux/delay.h>
112 #include <net/iw_handler.h>
113 #include <asm/current.h>
114 #include <linux/audit.h>
115 #include <linux/dmaengine.h>
116 #include <linux/err.h>
117 #include <linux/ctype.h>
118 #include <linux/if_arp.h>
119 #include <linux/if_vlan.h>
120 #include <linux/ip.h>
121 #include <net/ip.h>
122 #include <net/mpls.h>
123 #include <linux/ipv6.h>
124 #include <linux/in.h>
125 #include <linux/jhash.h>
126 #include <linux/random.h>
127 #include <trace/events/napi.h>
128 #include <trace/events/net.h>
129 #include <trace/events/skb.h>
130 #include <linux/inetdevice.h>
131 #include <linux/cpu_rmap.h>
132 #include <linux/static_key.h>
133 #include <linux/hashtable.h>
134 #include <linux/vmalloc.h>
135 #include <linux/if_macvlan.h>
136 #include <linux/errqueue.h>
137 #include <linux/hrtimer.h>
138 #include <linux/netfilter_ingress.h>
139 #include <linux/crash_dump.h>
140 #include <linux/sctp.h>
141 #include <net/udp_tunnel.h>
142 #include <linux/net_namespace.h>
143 #include <linux/indirect_call_wrapper.h>
144 #include <net/devlink.h>
145 
146 #include "net-sysfs.h"
147 
148 #define MAX_GRO_SKBS 8
149 
150 /* This should be increased if a protocol with a bigger head is added. */
151 #define GRO_MAX_HEAD (MAX_HEADER + 128)
152 
153 static DEFINE_SPINLOCK(ptype_lock);
154 static DEFINE_SPINLOCK(offload_lock);
155 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
156 struct list_head ptype_all __read_mostly;       /* Taps */
157 static struct list_head offload_base __read_mostly;
158 
159 static int netif_rx_internal(struct sk_buff *skb);
160 static int call_netdevice_notifiers_info(unsigned long val,
161                                          struct netdev_notifier_info *info);
162 static int call_netdevice_notifiers_extack(unsigned long val,
163                                            struct net_device *dev,
164                                            struct netlink_ext_ack *extack);
165 static struct napi_struct *napi_by_id(unsigned int napi_id);
166 
167 /*
168  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
169  * semaphore.
170  *
171  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
172  *
173  * Writers must hold the rtnl semaphore while they loop through the
174  * dev_base_head list, and hold dev_base_lock for writing when they do the
175  * actual updates.  This allows pure readers to access the list even
176  * while a writer is preparing to update it.
177  *
178  * To put it another way, dev_base_lock is held for writing only to
179  * protect against pure readers; the rtnl semaphore provides the
180  * protection against other writers.
181  *
 182  * For example usages, see register_netdevice() and
183  * unregister_netdevice(), which must be called with the rtnl
184  * semaphore held.
185  */
186 DEFINE_RWLOCK(dev_base_lock);
187 EXPORT_SYMBOL(dev_base_lock);
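/*
 * Illustrative sketch, not part of dev.c: the two pure-reader patterns that
 * the locking comment above describes. A reader either takes dev_base_lock
 * for reading or relies on RCU; writers additionally hold the RTNL
 * semaphore. inspect_dev() is a hypothetical callback used only for
 * illustration.
 */
static void example_walk_devices(struct net *net,
                                 void (*inspect_dev)(struct net_device *dev))
{
        struct net_device *dev;

        /* Pattern 1: classic reader under dev_base_lock */
        read_lock(&dev_base_lock);
        for_each_netdev(net, dev)
                inspect_dev(dev);
        read_unlock(&dev_base_lock);

        /* Pattern 2: lockless reader inside an RCU read-side section */
        rcu_read_lock();
        for_each_netdev_rcu(net, dev)
                inspect_dev(dev);
        rcu_read_unlock();
}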
188 
189 static DEFINE_MUTEX(ifalias_mutex);
190 
191 /* protects napi_hash addition/deletion and napi_gen_id */
192 static DEFINE_SPINLOCK(napi_hash_lock);
193 
194 static unsigned int napi_gen_id = NR_CPUS;
195 static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
196 
197 static seqcount_t devnet_rename_seq;
198 
199 static inline void dev_base_seq_inc(struct net *net)
200 {
201         while (++net->dev_base_seq == 0)
202                 ;
203 }
204 
205 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
206 {
207         unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ));
208 
209         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
210 }
211 
212 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
213 {
214         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
215 }
216 
217 static inline void rps_lock(struct softnet_data *sd)
218 {
219 #ifdef CONFIG_RPS
220         spin_lock(&sd->input_pkt_queue.lock);
221 #endif
222 }
223 
224 static inline void rps_unlock(struct softnet_data *sd)
225 {
226 #ifdef CONFIG_RPS
227         spin_unlock(&sd->input_pkt_queue.lock);
228 #endif
229 }
230 
231 static struct netdev_name_node *netdev_name_node_alloc(struct net_device *dev,
232                                                        const char *name)
233 {
234         struct netdev_name_node *name_node;
235 
236         name_node = kmalloc(sizeof(*name_node), GFP_KERNEL);
237         if (!name_node)
238                 return NULL;
239         INIT_HLIST_NODE(&name_node->hlist);
240         name_node->dev = dev;
241         name_node->name = name;
242         return name_node;
243 }
244 
245 static struct netdev_name_node *
246 netdev_name_node_head_alloc(struct net_device *dev)
247 {
248         struct netdev_name_node *name_node;
249 
250         name_node = netdev_name_node_alloc(dev, dev->name);
251         if (!name_node)
252                 return NULL;
253         INIT_LIST_HEAD(&name_node->list);
254         return name_node;
255 }
256 
257 static void netdev_name_node_free(struct netdev_name_node *name_node)
258 {
259         kfree(name_node);
260 }
261 
262 static void netdev_name_node_add(struct net *net,
263                                  struct netdev_name_node *name_node)
264 {
265         hlist_add_head_rcu(&name_node->hlist,
266                            dev_name_hash(net, name_node->name));
267 }
268 
269 static void netdev_name_node_del(struct netdev_name_node *name_node)
270 {
271         hlist_del_rcu(&name_node->hlist);
272 }
273 
274 static struct netdev_name_node *netdev_name_node_lookup(struct net *net,
275                                                         const char *name)
276 {
277         struct hlist_head *head = dev_name_hash(net, name);
278         struct netdev_name_node *name_node;
279 
280         hlist_for_each_entry(name_node, head, hlist)
281                 if (!strcmp(name_node->name, name))
282                         return name_node;
283         return NULL;
284 }
285 
286 static struct netdev_name_node *netdev_name_node_lookup_rcu(struct net *net,
287                                                             const char *name)
288 {
289         struct hlist_head *head = dev_name_hash(net, name);
290         struct netdev_name_node *name_node;
291 
292         hlist_for_each_entry_rcu(name_node, head, hlist)
293                 if (!strcmp(name_node->name, name))
294                         return name_node;
295         return NULL;
296 }
297 
298 int netdev_name_node_alt_create(struct net_device *dev, const char *name)
299 {
300         struct netdev_name_node *name_node;
301         struct net *net = dev_net(dev);
302 
303         name_node = netdev_name_node_lookup(net, name);
304         if (name_node)
305                 return -EEXIST;
306         name_node = netdev_name_node_alloc(dev, name);
307         if (!name_node)
308                 return -ENOMEM;
309         netdev_name_node_add(net, name_node);
310         /* The node that holds dev->name acts as a head of per-device list. */
311         list_add_tail(&name_node->list, &dev->name_node->list);
312 
313         return 0;
314 }
315 EXPORT_SYMBOL(netdev_name_node_alt_create);
316 
317 static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node)
318 {
319         list_del(&name_node->list);
320         netdev_name_node_del(name_node);
321         kfree(name_node->name);
322         netdev_name_node_free(name_node);
323 }
324 
325 int netdev_name_node_alt_destroy(struct net_device *dev, const char *name)
326 {
327         struct netdev_name_node *name_node;
328         struct net *net = dev_net(dev);
329 
330         name_node = netdev_name_node_lookup(net, name);
331         if (!name_node)
332                 return -ENOENT;
333         /* lookup might have found our primary name or a name belonging
334          * to another device.
335          */
336         if (name_node == dev->name_node || name_node->dev != dev)
337                 return -EINVAL;
338 
339         __netdev_name_node_alt_destroy(name_node);
340 
341         return 0;
342 }
343 EXPORT_SYMBOL(netdev_name_node_alt_destroy);
344 
345 static void netdev_name_node_alt_flush(struct net_device *dev)
346 {
347         struct netdev_name_node *name_node, *tmp;
348 
349         list_for_each_entry_safe(name_node, tmp, &dev->name_node->list, list)
350                 __netdev_name_node_alt_destroy(name_node);
351 }
352 
353 /* Device list insertion */
354 static void list_netdevice(struct net_device *dev)
355 {
356         struct net *net = dev_net(dev);
357 
358         ASSERT_RTNL();
359 
360         write_lock_bh(&dev_base_lock);
361         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
362         netdev_name_node_add(net, dev->name_node);
363         hlist_add_head_rcu(&dev->index_hlist,
364                            dev_index_hash(net, dev->ifindex));
365         write_unlock_bh(&dev_base_lock);
366 
367         dev_base_seq_inc(net);
368 }
369 
370 /* Device list removal
 371  * caller must respect an RCU grace period before freeing/reusing dev
372  */
373 static void unlist_netdevice(struct net_device *dev)
374 {
375         ASSERT_RTNL();
376 
377         /* Unlink dev from the device chain */
378         write_lock_bh(&dev_base_lock);
379         list_del_rcu(&dev->dev_list);
380         netdev_name_node_del(dev->name_node);
381         hlist_del_rcu(&dev->index_hlist);
382         write_unlock_bh(&dev_base_lock);
383 
384         dev_base_seq_inc(dev_net(dev));
385 }
386 
387 /*
388  *      Our notifier list
389  */
390 
391 static RAW_NOTIFIER_HEAD(netdev_chain);
392 
393 /*
394  *      Device drivers call our routines to queue packets here. We empty the
395  *      queue in the local softnet handler.
396  */
397 
398 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
399 EXPORT_PER_CPU_SYMBOL(softnet_data);
400 
401 /*******************************************************************************
402  *
403  *              Protocol management and registration routines
404  *
405  *******************************************************************************/
406 
407 
408 /*
409  *      Add a protocol ID to the list. Now that the input handler is
410  *      smarter we can dispense with all the messy stuff that used to be
411  *      here.
412  *
 413  *      BEWARE!!! Protocol handlers that mangle input packets
 414  *      MUST BE last in the hash buckets, and checking protocol handlers
 415  *      MUST start from the promiscuous ptype_all chain in net_bh.
 416  *      This is true now; do not change it.
 417  *      Explanation follows: if a protocol handler that mangles packets
 418  *      were first on the list, it could not sense that the packet
 419  *      is cloned and should be copied-on-write, so it would
 420  *      change it and subsequent readers would get a broken packet.
421  *                                                      --ANK (980803)
422  */
423 
424 static inline struct list_head *ptype_head(const struct packet_type *pt)
425 {
426         if (pt->type == htons(ETH_P_ALL))
427                 return pt->dev ? &pt->dev->ptype_all : &ptype_all;
428         else
429                 return pt->dev ? &pt->dev->ptype_specific :
430                                  &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
431 }
432 
433 /**
434  *      dev_add_pack - add packet handler
435  *      @pt: packet type declaration
436  *
437  *      Add a protocol handler to the networking stack. The passed &packet_type
438  *      is linked into kernel lists and may not be freed until it has been
439  *      removed from the kernel lists.
440  *
 441  *      This call does not sleep, therefore it cannot
 442  *      guarantee that all CPUs that are in the middle of receiving packets
 443  *      will see the new packet type (until the next received packet).
444  */
445 
446 void dev_add_pack(struct packet_type *pt)
447 {
448         struct list_head *head = ptype_head(pt);
449 
450         spin_lock(&ptype_lock);
451         list_add_rcu(&pt->list, head);
452         spin_unlock(&ptype_lock);
453 }
454 EXPORT_SYMBOL(dev_add_pack);
455 
456 /**
457  *      __dev_remove_pack        - remove packet handler
458  *      @pt: packet type declaration
459  *
460  *      Remove a protocol handler that was previously added to the kernel
461  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
462  *      from the kernel lists and can be freed or reused once this function
463  *      returns.
464  *
465  *      The packet type might still be in use by receivers
 466  *      and must not be freed until after all the CPUs have gone
467  *      through a quiescent state.
468  */
469 void __dev_remove_pack(struct packet_type *pt)
470 {
471         struct list_head *head = ptype_head(pt);
472         struct packet_type *pt1;
473 
474         spin_lock(&ptype_lock);
475 
476         list_for_each_entry(pt1, head, list) {
477                 if (pt == pt1) {
478                         list_del_rcu(&pt->list);
479                         goto out;
480                 }
481         }
482 
483         pr_warn("dev_remove_pack: %p not found\n", pt);
484 out:
485         spin_unlock(&ptype_lock);
486 }
487 EXPORT_SYMBOL(__dev_remove_pack);
488 
489 /**
490  *      dev_remove_pack  - remove packet handler
491  *      @pt: packet type declaration
492  *
493  *      Remove a protocol handler that was previously added to the kernel
494  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
495  *      from the kernel lists and can be freed or reused once this function
496  *      returns.
497  *
498  *      This call sleeps to guarantee that no CPU is looking at the packet
499  *      type after return.
500  */
501 void dev_remove_pack(struct packet_type *pt)
502 {
503         __dev_remove_pack(pt);
504 
505         synchronize_net();
506 }
507 EXPORT_SYMBOL(dev_remove_pack);
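/*
 * Illustrative sketch, not part of dev.c: how a protocol module typically
 * pairs dev_add_pack() with dev_remove_pack(). example_rcv() and the module
 * init/exit names are placeholders; the packet_type layout and the handler
 * signature follow <linux/netdevice.h>.
 */
static int example_rcv(struct sk_buff *skb, struct net_device *dev,
                       struct packet_type *pt, struct net_device *orig_dev)
{
        /* A tap owns the reference it is handed, so drop it when done. */
        kfree_skb(skb);
        return NET_RX_SUCCESS;
}

static struct packet_type example_packet_type __read_mostly = {
        .type = cpu_to_be16(ETH_P_ALL),  /* tap: see ptype_head() above */
        .func = example_rcv,
};

static int __init example_pack_init(void)
{
        dev_add_pack(&example_packet_type);
        return 0;
}

static void __exit example_pack_exit(void)
{
        /* dev_remove_pack() ends with synchronize_net(), so once it returns
         * no CPU can still be running example_rcv() for this packet_type.
         */
        dev_remove_pack(&example_packet_type);
}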
508 
509 
510 /**
511  *      dev_add_offload - register offload handlers
512  *      @po: protocol offload declaration
513  *
514  *      Add protocol offload handlers to the networking stack. The passed
515  *      &proto_offload is linked into kernel lists and may not be freed until
516  *      it has been removed from the kernel lists.
517  *
 518  *      This call does not sleep, therefore it cannot
 519  *      guarantee that all CPUs that are in the middle of receiving packets
 520  *      will see the new offload handlers (until the next received packet).
521  */
522 void dev_add_offload(struct packet_offload *po)
523 {
524         struct packet_offload *elem;
525 
526         spin_lock(&offload_lock);
527         list_for_each_entry(elem, &offload_base, list) {
528                 if (po->priority < elem->priority)
529                         break;
530         }
531         list_add_rcu(&po->list, elem->list.prev);
532         spin_unlock(&offload_lock);
533 }
534 EXPORT_SYMBOL(dev_add_offload);
535 
536 /**
537  *      __dev_remove_offload     - remove offload handler
538  *      @po: packet offload declaration
539  *
540  *      Remove a protocol offload handler that was previously added to the
541  *      kernel offload handlers by dev_add_offload(). The passed &offload_type
542  *      is removed from the kernel lists and can be freed or reused once this
543  *      function returns.
544  *
545  *      The packet type might still be in use by receivers
 546  *      and must not be freed until after all the CPUs have gone
547  *      through a quiescent state.
548  */
549 static void __dev_remove_offload(struct packet_offload *po)
550 {
551         struct list_head *head = &offload_base;
552         struct packet_offload *po1;
553 
554         spin_lock(&offload_lock);
555 
556         list_for_each_entry(po1, head, list) {
557                 if (po == po1) {
558                         list_del_rcu(&po->list);
559                         goto out;
560                 }
561         }
562 
563         pr_warn("dev_remove_offload: %p not found\n", po);
564 out:
565         spin_unlock(&offload_lock);
566 }
567 
568 /**
569  *      dev_remove_offload       - remove packet offload handler
570  *      @po: packet offload declaration
571  *
572  *      Remove a packet offload handler that was previously added to the kernel
573  *      offload handlers by dev_add_offload(). The passed &offload_type is
574  *      removed from the kernel lists and can be freed or reused once this
575  *      function returns.
576  *
577  *      This call sleeps to guarantee that no CPU is looking at the packet
578  *      type after return.
579  */
580 void dev_remove_offload(struct packet_offload *po)
581 {
582         __dev_remove_offload(po);
583 
584         synchronize_net();
585 }
586 EXPORT_SYMBOL(dev_remove_offload);
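/*
 * Illustrative sketch, not part of dev.c: the shape of a GRO offload
 * registration with dev_add_offload() (compare the real users in
 * net/ipv4/af_inet.c). All example_* names are placeholders, and the
 * callback prototypes are assumed to match struct offload_callbacks in this
 * kernel generation.
 */
static struct sk_buff *example_gro_receive(struct list_head *head,
                                           struct sk_buff *skb)
{
        return NULL;    /* nothing to flush in this stub */
}

static int example_gro_complete(struct sk_buff *skb, int nhoff)
{
        return 0;
}

static struct packet_offload example_offload __read_mostly = {
        .type = cpu_to_be16(ETH_P_IP),
        .callbacks = {
                .gro_receive  = example_gro_receive,
                .gro_complete = example_gro_complete,
        },
};

static int __init example_offload_init(void)
{
        dev_add_offload(&example_offload);
        return 0;
}

static void __exit example_offload_exit(void)
{
        dev_remove_offload(&example_offload);
}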
587 
588 /******************************************************************************
589  *
590  *                    Device Boot-time Settings Routines
591  *
592  ******************************************************************************/
593 
594 /* Boot time configuration table */
595 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
596 
597 /**
598  *      netdev_boot_setup_add   - add new setup entry
599  *      @name: name of the device
600  *      @map: configured settings for the device
601  *
 602  *      Adds a new setup entry to the dev_boot_setup list.  The function
 603  *      returns 0 on error and 1 on success.  This is a generic routine for
 604  *      all netdevices.
605  */
606 static int netdev_boot_setup_add(char *name, struct ifmap *map)
607 {
608         struct netdev_boot_setup *s;
609         int i;
610 
611         s = dev_boot_setup;
612         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
613                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
614                         memset(s[i].name, 0, sizeof(s[i].name));
615                         strlcpy(s[i].name, name, IFNAMSIZ);
616                         memcpy(&s[i].map, map, sizeof(s[i].map));
617                         break;
618                 }
619         }
620 
621         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
622 }
623 
624 /**
625  * netdev_boot_setup_check      - check boot time settings
626  * @dev: the netdevice
627  *
628  * Check boot time settings for the device.
 629  * Any settings found are applied to the device for use
 630  * later during device probing.
 631  * Returns 0 if no settings are found, 1 if they are.
632  */
633 int netdev_boot_setup_check(struct net_device *dev)
634 {
635         struct netdev_boot_setup *s = dev_boot_setup;
636         int i;
637 
638         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
639                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
640                     !strcmp(dev->name, s[i].name)) {
641                         dev->irq = s[i].map.irq;
642                         dev->base_addr = s[i].map.base_addr;
643                         dev->mem_start = s[i].map.mem_start;
644                         dev->mem_end = s[i].map.mem_end;
645                         return 1;
646                 }
647         }
648         return 0;
649 }
650 EXPORT_SYMBOL(netdev_boot_setup_check);
651 
652 
653 /**
654  * netdev_boot_base     - get address from boot time settings
655  * @prefix: prefix for network device
656  * @unit: id for network device
657  *
 658  * Check boot time settings for the base address of the device.
 659  * Any settings found are applied to the device for use
 660  * later during device probing.
 661  * Returns 0 if no settings are found.
662  */
663 unsigned long netdev_boot_base(const char *prefix, int unit)
664 {
665         const struct netdev_boot_setup *s = dev_boot_setup;
666         char name[IFNAMSIZ];
667         int i;
668 
669         sprintf(name, "%s%d", prefix, unit);
670 
671         /*
672          * If device already registered then return base of 1
673          * to indicate not to probe for this interface
674          */
675         if (__dev_get_by_name(&init_net, name))
676                 return 1;
677 
678         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
679                 if (!strcmp(name, s[i].name))
680                         return s[i].map.base_addr;
681         return 0;
682 }
683 
684 /*
685  * Saves at boot time configured settings for any netdevice.
686  */
687 int __init netdev_boot_setup(char *str)
688 {
689         int ints[5];
690         struct ifmap map;
691 
692         str = get_options(str, ARRAY_SIZE(ints), ints);
693         if (!str || !*str)
694                 return 0;
695 
696         /* Save settings */
697         memset(&map, 0, sizeof(map));
698         if (ints[0] > 0)
699                 map.irq = ints[1];
700         if (ints[0] > 1)
701                 map.base_addr = ints[2];
702         if (ints[0] > 2)
703                 map.mem_start = ints[3];
704         if (ints[0] > 3)
705                 map.mem_end = ints[4];
706 
707         /* Add new entry to the list */
708         return netdev_boot_setup_add(str, &map);
709 }
710 
711 __setup("netdev=", netdev_boot_setup);
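/*
 * Illustrative note, not part of dev.c: with the parsing above, the boot
 * parameter takes up to four integers followed by the device name, e.g.
 *
 *      netdev=9,0x300,0,0,eth0
 *
 * which records irq=9 and base_addr=0x300 (mem_start/mem_end left at 0)
 * for "eth0", to be picked up later by netdev_boot_setup_check().
 */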
712 
713 /*******************************************************************************
714  *
715  *                          Device Interface Subroutines
716  *
717  *******************************************************************************/
718 
719 /**
 720  *      dev_get_iflink  - get 'iflink' value of an interface
721  *      @dev: targeted interface
722  *
723  *      Indicates the ifindex the interface is linked to.
724  *      Physical interfaces have the same 'ifindex' and 'iflink' values.
725  */
726 
727 int dev_get_iflink(const struct net_device *dev)
728 {
729         if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
730                 return dev->netdev_ops->ndo_get_iflink(dev);
731 
732         return dev->ifindex;
733 }
734 EXPORT_SYMBOL(dev_get_iflink);
735 
736 /**
737  *      dev_fill_metadata_dst - Retrieve tunnel egress information.
738  *      @dev: targeted interface
739  *      @skb: The packet.
740  *
 741  *      For better visibility of tunnel traffic, OVS needs to retrieve
 742  *      egress tunnel information for a packet. The following API allows
 743  *      the user to get this info.
744  */
745 int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
746 {
747         struct ip_tunnel_info *info;
748 
749         if (!dev->netdev_ops  || !dev->netdev_ops->ndo_fill_metadata_dst)
750                 return -EINVAL;
751 
752         info = skb_tunnel_info_unclone(skb);
753         if (!info)
754                 return -ENOMEM;
755         if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
756                 return -EINVAL;
757 
758         return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
759 }
760 EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
761 
762 /**
763  *      __dev_get_by_name       - find a device by its name
764  *      @net: the applicable net namespace
765  *      @name: name to find
766  *
767  *      Find an interface by name. Must be called under RTNL semaphore
768  *      or @dev_base_lock. If the name is found a pointer to the device
769  *      is returned. If the name is not found then %NULL is returned. The
770  *      reference counters are not incremented so the caller must be
771  *      careful with locks.
772  */
773 
774 struct net_device *__dev_get_by_name(struct net *net, const char *name)
775 {
776         struct netdev_name_node *node_name;
777 
778         node_name = netdev_name_node_lookup(net, name);
779         return node_name ? node_name->dev : NULL;
780 }
781 EXPORT_SYMBOL(__dev_get_by_name);
782 
783 /**
784  * dev_get_by_name_rcu  - find a device by its name
785  * @net: the applicable net namespace
786  * @name: name to find
787  *
788  * Find an interface by name.
789  * If the name is found a pointer to the device is returned.
790  * If the name is not found then %NULL is returned.
791  * The reference counters are not incremented so the caller must be
792  * careful with locks. The caller must hold RCU lock.
793  */
794 
795 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
796 {
797         struct netdev_name_node *node_name;
798 
799         node_name = netdev_name_node_lookup_rcu(net, name);
800         return node_name ? node_name->dev : NULL;
801 }
802 EXPORT_SYMBOL(dev_get_by_name_rcu);
803 
804 /**
805  *      dev_get_by_name         - find a device by its name
806  *      @net: the applicable net namespace
807  *      @name: name to find
808  *
809  *      Find an interface by name. This can be called from any
810  *      context and does its own locking. The returned handle has
811  *      the usage count incremented and the caller must use dev_put() to
812  *      release it when it is no longer needed. %NULL is returned if no
813  *      matching device is found.
814  */
815 
816 struct net_device *dev_get_by_name(struct net *net, const char *name)
817 {
818         struct net_device *dev;
819 
820         rcu_read_lock();
821         dev = dev_get_by_name_rcu(net, name);
822         if (dev)
823                 dev_hold(dev);
824         rcu_read_unlock();
825         return dev;
826 }
827 EXPORT_SYMBOL(dev_get_by_name);
828 
829 /**
830  *      __dev_get_by_index - find a device by its ifindex
831  *      @net: the applicable net namespace
832  *      @ifindex: index of device
833  *
 834  *      Search for an interface by index. Returns a pointer to the device,
 835  *      or %NULL if it is not found. The device has not
 836  *      had its reference counter increased, so the caller must be careful
837  *      about locking. The caller must hold either the RTNL semaphore
838  *      or @dev_base_lock.
839  */
840 
841 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
842 {
843         struct net_device *dev;
844         struct hlist_head *head = dev_index_hash(net, ifindex);
845 
846         hlist_for_each_entry(dev, head, index_hlist)
847                 if (dev->ifindex == ifindex)
848                         return dev;
849 
850         return NULL;
851 }
852 EXPORT_SYMBOL(__dev_get_by_index);
853 
854 /**
855  *      dev_get_by_index_rcu - find a device by its ifindex
856  *      @net: the applicable net namespace
857  *      @ifindex: index of device
858  *
 859  *      Search for an interface by index. Returns a pointer to the device,
 860  *      or %NULL if it is not found. The device has not
 861  *      had its reference counter increased, so the caller must be careful
862  *      about locking. The caller must hold RCU lock.
863  */
864 
865 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
866 {
867         struct net_device *dev;
868         struct hlist_head *head = dev_index_hash(net, ifindex);
869 
870         hlist_for_each_entry_rcu(dev, head, index_hlist)
871                 if (dev->ifindex == ifindex)
872                         return dev;
873 
874         return NULL;
875 }
876 EXPORT_SYMBOL(dev_get_by_index_rcu);
877 
878 
879 /**
880  *      dev_get_by_index - find a device by its ifindex
881  *      @net: the applicable net namespace
882  *      @ifindex: index of device
883  *
 884  *      Search for an interface by index. Returns a pointer to the device,
 885  *      or NULL if it is not found. The device returned has
886  *      had a reference added and the pointer is safe until the user calls
887  *      dev_put to indicate they have finished with it.
888  */
889 
890 struct net_device *dev_get_by_index(struct net *net, int ifindex)
891 {
892         struct net_device *dev;
893 
894         rcu_read_lock();
895         dev = dev_get_by_index_rcu(net, ifindex);
896         if (dev)
897                 dev_hold(dev);
898         rcu_read_unlock();
899         return dev;
900 }
901 EXPORT_SYMBOL(dev_get_by_index);
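/*
 * Illustrative sketch, not part of dev.c: the two usual ways callers use the
 * lookup helpers above. The _rcu variants take no reference, so the result
 * may only be used inside the RCU read-side section; dev_get_by_name() and
 * dev_get_by_index() take a reference that must be dropped with dev_put().
 * The "eth0" name and example_lookups() itself are placeholders.
 */
static void example_lookups(struct net *net, int ifindex)
{
        struct net_device *dev;

        /* Short-lived, lockless access */
        rcu_read_lock();
        dev = dev_get_by_index_rcu(net, ifindex);
        if (dev)
                pr_info("ifindex %d is %s\n", ifindex, dev->name);
        rcu_read_unlock();

        /* Longer-lived access: hold a reference across sleeps */
        dev = dev_get_by_name(net, "eth0");
        if (dev) {
                /* ... dev stays valid here, even across sleeping calls ... */
                dev_put(dev);
        }
}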
902 
903 /**
904  *      dev_get_by_napi_id - find a device by napi_id
905  *      @napi_id: ID of the NAPI struct
906  *
 907  *      Search for an interface by NAPI ID. Returns a pointer to the device,
 908  *      or %NULL if it is not found. The device has not had
 909  *      its reference counter increased, so the caller must be careful
910  *      about locking. The caller must hold RCU lock.
911  */
912 
913 struct net_device *dev_get_by_napi_id(unsigned int napi_id)
914 {
915         struct napi_struct *napi;
916 
917         WARN_ON_ONCE(!rcu_read_lock_held());
918 
919         if (napi_id < MIN_NAPI_ID)
920                 return NULL;
921 
922         napi = napi_by_id(napi_id);
923 
924         return napi ? napi->dev : NULL;
925 }
926 EXPORT_SYMBOL(dev_get_by_napi_id);
927 
928 /**
929  *      netdev_get_name - get a netdevice name, knowing its ifindex.
930  *      @net: network namespace
931  *      @name: a pointer to the buffer where the name will be stored.
932  *      @ifindex: the ifindex of the interface to get the name from.
933  *
934  *      The use of raw_seqcount_begin() and cond_resched() before
935  *      retrying is required as we want to give the writers a chance
936  *      to complete when CONFIG_PREEMPT is not set.
937  */
938 int netdev_get_name(struct net *net, char *name, int ifindex)
939 {
940         struct net_device *dev;
941         unsigned int seq;
942 
943 retry:
944         seq = raw_seqcount_begin(&devnet_rename_seq);
945         rcu_read_lock();
946         dev = dev_get_by_index_rcu(net, ifindex);
947         if (!dev) {
948                 rcu_read_unlock();
949                 return -ENODEV;
950         }
951 
952         strcpy(name, dev->name);
953         rcu_read_unlock();
954         if (read_seqcount_retry(&devnet_rename_seq, seq)) {
955                 cond_resched();
956                 goto retry;
957         }
958 
959         return 0;
960 }
961 
962 /**
963  *      dev_getbyhwaddr_rcu - find a device by its hardware address
964  *      @net: the applicable net namespace
965  *      @type: media type of device
966  *      @ha: hardware address
967  *
 968  *      Search for an interface by MAC address. Returns a pointer to the
 969  *      device, or NULL if it is not found.
 970  *      The caller must hold RCU or RTNL.
 971  *      The returned device has not had its ref count increased,
 972  *      and the caller must therefore be careful about locking.
973  *
974  */
975 
976 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
977                                        const char *ha)
978 {
979         struct net_device *dev;
980 
981         for_each_netdev_rcu(net, dev)
982                 if (dev->type == type &&
983                     !memcmp(dev->dev_addr, ha, dev->addr_len))
984                         return dev;
985 
986         return NULL;
987 }
988 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
989 
990 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
991 {
992         struct net_device *dev;
993 
994         ASSERT_RTNL();
995         for_each_netdev(net, dev)
996                 if (dev->type == type)
997                         return dev;
998 
999         return NULL;
1000 }
1001 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
1002 
1003 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
1004 {
1005         struct net_device *dev, *ret = NULL;
1006 
1007         rcu_read_lock();
1008         for_each_netdev_rcu(net, dev)
1009                 if (dev->type == type) {
1010                         dev_hold(dev);
1011                         ret = dev;
1012                         break;
1013                 }
1014         rcu_read_unlock();
1015         return ret;
1016 }
1017 EXPORT_SYMBOL(dev_getfirstbyhwtype);
1018 
1019 /**
1020  *      __dev_get_by_flags - find any device with given flags
1021  *      @net: the applicable net namespace
1022  *      @if_flags: IFF_* values
1023  *      @mask: bitmask of bits in if_flags to check
1024  *
1025  *      Search for any interface with the given flags. Returns a pointer to
1026  *      the device, or NULL if none is found. Must be called inside
1027  *      rtnl_lock(), and the result's refcount is unchanged.
1028  */
1029 
1030 struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
1031                                       unsigned short mask)
1032 {
1033         struct net_device *dev, *ret;
1034 
1035         ASSERT_RTNL();
1036 
1037         ret = NULL;
1038         for_each_netdev(net, dev) {
1039                 if (((dev->flags ^ if_flags) & mask) == 0) {
1040                         ret = dev;
1041                         break;
1042                 }
1043         }
1044         return ret;
1045 }
1046 EXPORT_SYMBOL(__dev_get_by_flags);
1047 
1048 /**
1049  *      dev_valid_name - check if name is okay for network device
1050  *      @name: name string
1051  *
1052  *      Network device names need to be valid file names
1053  *      to allow sysfs to work.  We also disallow any kind of
1054  *      whitespace.
1055  */
1056 bool dev_valid_name(const char *name)
1057 {
1058         if (*name == '\0')
1059                 return false;
1060         if (strnlen(name, IFNAMSIZ) == IFNAMSIZ)
1061                 return false;
1062         if (!strcmp(name, ".") || !strcmp(name, ".."))
1063                 return false;
1064 
1065         while (*name) {
1066                 if (*name == '/' || *name == ':' || isspace(*name))
1067                         return false;
1068                 name++;
1069         }
1070         return true;
1071 }
1072 EXPORT_SYMBOL(dev_valid_name);
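/*
 * Illustrative note, not part of dev.c: per the checks above, names such as
 * "eth0" or "br-lan" are accepted, while "", ".", "..", names containing
 * '/', ':' or whitespace, and names of IFNAMSIZ characters or more are all
 * rejected.
 */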
1073 
1074 /**
1075  *      __dev_alloc_name - allocate a name for a device
1076  *      @net: network namespace to allocate the device name in
1077  *      @name: name format string
1078  *      @buf:  scratch buffer and result name string
1079  *
1080  *      Passed a format string - eg "lt%d" - it will try to find a suitable
1081  *      id. It scans list of devices to build up a free map, then chooses
1082  *      the first empty slot. The caller must hold the dev_base or rtnl lock
1083  *      while allocating the name and adding the device in order to avoid
1084  *      duplicates.
1085  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1086  *      Returns the number of the unit assigned or a negative errno code.
1087  */
1088 
1089 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
1090 {
1091         int i = 0;
1092         const char *p;
1093         const int max_netdevices = 8*PAGE_SIZE;
1094         unsigned long *inuse;
1095         struct net_device *d;
1096 
1097         if (!dev_valid_name(name))
1098                 return -EINVAL;
1099 
1100         p = strchr(name, '%');
1101         if (p) {
1102                 /*
1103                  * Verify the string as this thing may have come from
1104                  * the user.  There must be exactly one "%d" and no other "%"
1105                  * characters.
1106                  */
1107                 if (p[1] != 'd' || strchr(p + 2, '%'))
1108                         return -EINVAL;
1109 
1110                 /* Use one page as a bit array of possible slots */
1111                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
1112                 if (!inuse)
1113                         return -ENOMEM;
1114 
1115                 for_each_netdev(net, d) {
1116                         if (!sscanf(d->name, name, &i))
1117                                 continue;
1118                         if (i < 0 || i >= max_netdevices)
1119                                 continue;
1120 
1121                         /*  avoid cases where sscanf is not an exact inverse of printf */
1122                         snprintf(buf, IFNAMSIZ, name, i);
1123                         if (!strncmp(buf, d->name, IFNAMSIZ))
1124                                 set_bit(i, inuse);
1125                 }
1126 
1127                 i = find_first_zero_bit(inuse, max_netdevices);
1128                 free_page((unsigned long) inuse);
1129         }
1130 
1131         snprintf(buf, IFNAMSIZ, name, i);
1132         if (!__dev_get_by_name(net, buf))
1133                 return i;
1134 
1135         /* It is possible to run out of possible slots
1136          * when the name is long and there isn't enough space left
1137          * for the digits, or if all bits are used.
1138          */
1139         return -ENFILE;
1140 }
1141 
1142 static int dev_alloc_name_ns(struct net *net,
1143                              struct net_device *dev,
1144                              const char *name)
1145 {
1146         char buf[IFNAMSIZ];
1147         int ret;
1148 
1149         BUG_ON(!net);
1150         ret = __dev_alloc_name(net, name, buf);
1151         if (ret >= 0)
1152                 strlcpy(dev->name, buf, IFNAMSIZ);
1153         return ret;
1154 }
1155 
1156 /**
1157  *      dev_alloc_name - allocate a name for a device
1158  *      @dev: device
1159  *      @name: name format string
1160  *
1161  *      Passed a format string - eg "lt%d" - it will try to find a suitable
1162  *      id. It scans list of devices to build up a free map, then chooses
1163  *      the first empty slot. The caller must hold the dev_base or rtnl lock
1164  *      while allocating the name and adding the device in order to avoid
1165  *      duplicates.
1166  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1167  *      Returns the number of the unit assigned or a negative errno code.
1168  */
1169 
1170 int dev_alloc_name(struct net_device *dev, const char *name)
1171 {
1172         return dev_alloc_name_ns(dev_net(dev), dev, name);
1173 }
1174 EXPORT_SYMBOL(dev_alloc_name);
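/*
 * Illustrative sketch, not part of dev.c: direct use of dev_alloc_name()
 * with a "%d" wildcard. Most drivers never call it themselves because
 * register_netdev() performs this step; example_pick_name() and the
 * "dummy%d" pattern are placeholders. The caller is expected to hold RTNL
 * (or dev_base_lock) as the kernel-doc above requires.
 */
static int example_pick_name(struct net_device *dev)
{
        int unit;

        unit = dev_alloc_name(dev, "dummy%d");  /* fills in dev->name */
        if (unit < 0)
                return unit;    /* -EINVAL, -ENFILE, ... */

        pr_info("assigned %s (unit %d)\n", dev->name, unit);
        return 0;
}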
1175 
1176 static int dev_get_valid_name(struct net *net, struct net_device *dev,
1177                               const char *name)
1178 {
1179         BUG_ON(!net);
1180 
1181         if (!dev_valid_name(name))
1182                 return -EINVAL;
1183 
1184         if (strchr(name, '%'))
1185                 return dev_alloc_name_ns(net, dev, name);
1186         else if (__dev_get_by_name(net, name))
1187                 return -EEXIST;
1188         else if (dev->name != name)
1189                 strlcpy(dev->name, name, IFNAMSIZ);
1190 
1191         return 0;
1192 }
1193 
1194 /**
1195  *      dev_change_name - change name of a device
1196  *      @dev: device
1197  *      @newname: name (or format string) must be at least IFNAMSIZ
1198  *
1199  *      Change the name of a device. Format strings such as "eth%d" can
1200  *      be passed for wildcarding.
1201  */
1202 int dev_change_name(struct net_device *dev, const char *newname)
1203 {
1204         unsigned char old_assign_type;
1205         char oldname[IFNAMSIZ];
1206         int err = 0;
1207         int ret;
1208         struct net *net;
1209 
1210         ASSERT_RTNL();
1211         BUG_ON(!dev_net(dev));
1212 
1213         net = dev_net(dev);
1214 
1215         /* Some auto-enslaved devices, e.g. failover slaves, are
1216          * special, as userspace might rename the device after
1217          * the interface has been brought up and running since
1218          * the point the kernel initiated auto-enslavement. Allow
1219          * live name change even when these slave devices are
1220          * up and running.
1221          *
1222          * Typically, users of these auto-enslaving devices
1223          * don't actually care about slave name change, as
1224          * they are supposed to operate on master interface
1225          * directly.
1226          */
1227         if (dev->flags & IFF_UP &&
1228             likely(!(dev->priv_flags & IFF_LIVE_RENAME_OK)))
1229                 return -EBUSY;
1230 
1231         write_seqcount_begin(&devnet_rename_seq);
1232 
1233         if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1234                 write_seqcount_end(&devnet_rename_seq);
1235                 return 0;
1236         }
1237 
1238         memcpy(oldname, dev->name, IFNAMSIZ);
1239 
1240         err = dev_get_valid_name(net, dev, newname);
1241         if (err < 0) {
1242                 write_seqcount_end(&devnet_rename_seq);
1243                 return err;
1244         }
1245 
1246         if (oldname[0] && !strchr(oldname, '%'))
1247                 netdev_info(dev, "renamed from %s\n", oldname);
1248 
1249         old_assign_type = dev->name_assign_type;
1250         dev->name_assign_type = NET_NAME_RENAMED;
1251 
1252 rollback:
1253         ret = device_rename(&dev->dev, dev->name);
1254         if (ret) {
1255                 memcpy(dev->name, oldname, IFNAMSIZ);
1256                 dev->name_assign_type = old_assign_type;
1257                 write_seqcount_end(&devnet_rename_seq);
1258                 return ret;
1259         }
1260 
1261         write_seqcount_end(&devnet_rename_seq);
1262 
1263         netdev_adjacent_rename_links(dev, oldname);
1264 
1265         write_lock_bh(&dev_base_lock);
1266         netdev_name_node_del(dev->name_node);
1267         write_unlock_bh(&dev_base_lock);
1268 
1269         synchronize_rcu();
1270 
1271         write_lock_bh(&dev_base_lock);
1272         netdev_name_node_add(net, dev->name_node);
1273         write_unlock_bh(&dev_base_lock);
1274 
1275         ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1276         ret = notifier_to_errno(ret);
1277 
1278         if (ret) {
1279                 /* err >= 0 after dev_alloc_name() or stores the first errno */
1280                 if (err >= 0) {
1281                         err = ret;
1282                         write_seqcount_begin(&devnet_rename_seq);
1283                         memcpy(dev->name, oldname, IFNAMSIZ);
1284                         memcpy(oldname, newname, IFNAMSIZ);
1285                         dev->name_assign_type = old_assign_type;
1286                         old_assign_type = NET_NAME_RENAMED;
1287                         goto rollback;
1288                 } else {
1289                         pr_err("%s: name change rollback failed: %d\n",
1290                                dev->name, ret);
1291                 }
1292         }
1293 
1294         return err;
1295 }
1296 
1297 /**
1298  *      dev_set_alias - change ifalias of a device
1299  *      @dev: device
1300  *      @alias: name up to IFALIASZ
1301  *      @len: limit of bytes to copy from @alias
1302  *
1303  *      Set ifalias for a device.
1304  */
1305 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1306 {
1307         struct dev_ifalias *new_alias = NULL;
1308 
1309         if (len >= IFALIASZ)
1310                 return -EINVAL;
1311 
1312         if (len) {
1313                 new_alias = kmalloc(sizeof(*new_alias) + len + 1, GFP_KERNEL);
1314                 if (!new_alias)
1315                         return -ENOMEM;
1316 
1317                 memcpy(new_alias->ifalias, alias, len);
1318                 new_alias->ifalias[len] = 0;
1319         }
1320 
1321         mutex_lock(&ifalias_mutex);
1322         new_alias = rcu_replace_pointer(dev->ifalias, new_alias,
1323                                         mutex_is_locked(&ifalias_mutex));
1324         mutex_unlock(&ifalias_mutex);
1325 
1326         if (new_alias)
1327                 kfree_rcu(new_alias, rcuhead);
1328 
1329         return len;
1330 }
1331 EXPORT_SYMBOL(dev_set_alias);
1332 
1333 /**
1334  *      dev_get_alias - get ifalias of a device
1335  *      @dev: device
1336  *      @name: buffer to store name of ifalias
1337  *      @len: size of buffer
1338  *
1339  *      Get ifalias for a device.  The caller must make sure dev cannot go
1340  *      away, e.g. by holding the RCU read lock or a reference to the device.
1341  */
1342 int dev_get_alias(const struct net_device *dev, char *name, size_t len)
1343 {
1344         const struct dev_ifalias *alias;
1345         int ret = 0;
1346 
1347         rcu_read_lock();
1348         alias = rcu_dereference(dev->ifalias);
1349         if (alias)
1350                 ret = snprintf(name, len, "%s", alias->ifalias);
1351         rcu_read_unlock();
1352 
1353         return ret;
1354 }
1355 
1356 /**
1357  *      netdev_features_change - device changes features
1358  *      @dev: device to cause notification
1359  *
1360  *      Called to indicate a device has changed features.
1361  */
1362 void netdev_features_change(struct net_device *dev)
1363 {
1364         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1365 }
1366 EXPORT_SYMBOL(netdev_features_change);
1367 
1368 /**
1369  *      netdev_state_change - device changes state
1370  *      @dev: device to cause notification
1371  *
1372  *      Called to indicate a device has changed state. This function calls
1373  *      the notifier chains for netdev_chain and sends a NEWLINK message
1374  *      to the routing socket.
1375  */
1376 void netdev_state_change(struct net_device *dev)
1377 {
1378         if (dev->flags & IFF_UP) {
1379                 struct netdev_notifier_change_info change_info = {
1380                         .info.dev = dev,
1381                 };
1382 
1383                 call_netdevice_notifiers_info(NETDEV_CHANGE,
1384                                               &change_info.info);
1385                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1386         }
1387 }
1388 EXPORT_SYMBOL(netdev_state_change);
1389 
1390 /**
1391  * netdev_notify_peers - notify network peers about existence of @dev
1392  * @dev: network device
1393  *
1394  * Generate traffic such that interested network peers are aware of
1395  * @dev, such as by generating a gratuitous ARP. This may be used when
1396  * a device wants to inform the rest of the network about some sort of
1397  * reconfiguration such as a failover event or virtual machine
1398  * migration.
1399  */
1400 void netdev_notify_peers(struct net_device *dev)
1401 {
1402         rtnl_lock();
1403         call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1404         call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev);
1405         rtnl_unlock();
1406 }
1407 EXPORT_SYMBOL(netdev_notify_peers);
1408 
1409 static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
1410 {
1411         const struct net_device_ops *ops = dev->netdev_ops;
1412         int ret;
1413 
1414         ASSERT_RTNL();
1415 
1416         if (!netif_device_present(dev))
1417                 return -ENODEV;
1418 
1419         /* Block netpoll from trying to do any rx path servicing.
1420          * If we don't do this, there is a chance ndo_poll_controller
1421          * or ndo_poll may be running while we open the device.
1422          */
1423         netpoll_poll_disable(dev);
1424 
1425         ret = call_netdevice_notifiers_extack(NETDEV_PRE_UP, dev, extack);
1426         ret = notifier_to_errno(ret);
1427         if (ret)
1428                 return ret;
1429 
1430         set_bit(__LINK_STATE_START, &dev->state);
1431 
1432         if (ops->ndo_validate_addr)
1433                 ret = ops->ndo_validate_addr(dev);
1434 
1435         if (!ret && ops->ndo_open)
1436                 ret = ops->ndo_open(dev);
1437 
1438         netpoll_poll_enable(dev);
1439 
1440         if (ret)
1441                 clear_bit(__LINK_STATE_START, &dev->state);
1442         else {
1443                 dev->flags |= IFF_UP;
1444                 dev_set_rx_mode(dev);
1445                 dev_activate(dev);
1446                 add_device_randomness(dev->dev_addr, dev->addr_len);
1447         }
1448 
1449         return ret;
1450 }
1451 
1452 /**
1453  *      dev_open        - prepare an interface for use.
1454  *      @dev: device to open
1455  *      @extack: netlink extended ack
1456  *
1457  *      Takes a device from down to up state. The device's private open
1458  *      function is invoked and then the multicast lists are loaded. Finally
1459  *      the device is moved into the up state and a %NETDEV_UP message is
1460  *      sent to the netdev notifier chain.
1461  *
1462  *      Calling this function on an active interface is a nop. On a failure
1463  *      a negative errno code is returned.
1464  */
1465 int dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
1466 {
1467         int ret;
1468 
1469         if (dev->flags & IFF_UP)
1470                 return 0;
1471 
1472         ret = __dev_open(dev, extack);
1473         if (ret < 0)
1474                 return ret;
1475 
1476         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1477         call_netdevice_notifiers(NETDEV_UP, dev);
1478 
1479         return ret;
1480 }
1481 EXPORT_SYMBOL(dev_open);
1482 
1483 static void __dev_close_many(struct list_head *head)
1484 {
1485         struct net_device *dev;
1486 
1487         ASSERT_RTNL();
1488         might_sleep();
1489 
1490         list_for_each_entry(dev, head, close_list) {
1491                 /* Temporarily disable netpoll until the interface is down */
1492                 netpoll_poll_disable(dev);
1493 
1494                 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1495 
1496                 clear_bit(__LINK_STATE_START, &dev->state);
1497 
1498                 /* Synchronize to scheduled poll. We cannot touch the poll list; it
1499                  * can even be on a different CPU. So just clear netif_running().
1500                  *
1501                  * dev->stop() will invoke napi_disable() on all of its
1502                  * napi_struct instances on this device.
1503                  */
1504                 smp_mb__after_atomic(); /* Commit netif_running(). */
1505         }
1506 
1507         dev_deactivate_many(head);
1508 
1509         list_for_each_entry(dev, head, close_list) {
1510                 const struct net_device_ops *ops = dev->netdev_ops;
1511 
1512                 /*
1513                  *      Call the device-specific close. This cannot fail.
1514                  *      It is only called if the device is UP.
1515                  *
1516                  *      We allow it to be called even after a DETACH hot-plug
1517                  *      event.
1518                  */
1519                 if (ops->ndo_stop)
1520                         ops->ndo_stop(dev);
1521 
1522                 dev->flags &= ~IFF_UP;
1523                 netpoll_poll_enable(dev);
1524         }
1525 }
1526 
1527 static void __dev_close(struct net_device *dev)
1528 {
1529         LIST_HEAD(single);
1530 
1531         list_add(&dev->close_list, &single);
1532         __dev_close_many(&single);
1533         list_del(&single);
1534 }
1535 
1536 void dev_close_many(struct list_head *head, bool unlink)
1537 {
1538         struct net_device *dev, *tmp;
1539 
1540         /* Remove the devices that don't need to be closed */
1541         list_for_each_entry_safe(dev, tmp, head, close_list)
1542                 if (!(dev->flags & IFF_UP))
1543                         list_del_init(&dev->close_list);
1544 
1545         __dev_close_many(head);
1546 
1547         list_for_each_entry_safe(dev, tmp, head, close_list) {
1548                 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1549                 call_netdevice_notifiers(NETDEV_DOWN, dev);
1550                 if (unlink)
1551                         list_del_init(&dev->close_list);
1552         }
1553 }
1554 EXPORT_SYMBOL(dev_close_many);
1555 
1556 /**
1557  *      dev_close - shutdown an interface.
1558  *      @dev: device to shutdown
1559  *
1560  *      This function moves an active device into down state. A
1561  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1562  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1563  *      chain.
1564  */
1565 void dev_close(struct net_device *dev)
1566 {
1567         if (dev->flags & IFF_UP) {
1568                 LIST_HEAD(single);
1569 
1570                 list_add(&dev->close_list, &single);
1571                 dev_close_many(&single, true);
1572                 list_del(&single);
1573         }
1574 }
1575 EXPORT_SYMBOL(dev_close);
1576 
1577 
1578 /**
1579  *      dev_disable_lro - disable Large Receive Offload on a device
1580  *      @dev: device
1581  *
1582  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1583  *      called under RTNL.  This is needed if received packets may be
1584  *      forwarded to another interface.
1585  */
1586 void dev_disable_lro(struct net_device *dev)
1587 {
1588         struct net_device *lower_dev;
1589         struct list_head *iter;
1590 
1591         dev->wanted_features &= ~NETIF_F_LRO;
1592         netdev_update_features(dev);
1593 
1594         if (unlikely(dev->features & NETIF_F_LRO))
1595                 netdev_WARN(dev, "failed to disable LRO!\n");
1596 
1597         netdev_for_each_lower_dev(dev, lower_dev, iter)
1598                 dev_disable_lro(lower_dev);
1599 }
1600 EXPORT_SYMBOL(dev_disable_lro);
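
/*
 * Illustrative sketch (not part of this file): a forwarding setup path
 * (bridge/bond/router style) turning LRO off on a port before frames
 * received on it may be forwarded, as required by the comment above.
 * example_enslave_port() is a hypothetical name; the caller is assumed
 * to hold the RTNL lock.
 */
#if 0
static void example_enslave_port(struct net_device *port)
{
	ASSERT_RTNL();
	dev_disable_lro(port);
	/* ... link the port into the forwarding device ... */
}
#endif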
1601 
1602 /**
1603  *      dev_disable_gro_hw - disable HW Generic Receive Offload on a device
1604  *      @dev: device
1605  *
1606  *      Disable HW Generic Receive Offload (GRO_HW) on a net device.  Must be
1607  *      called under RTNL.  This is needed if Generic XDP is installed on
1608  *      the device.
1609  */
1610 static void dev_disable_gro_hw(struct net_device *dev)
1611 {
1612         dev->wanted_features &= ~NETIF_F_GRO_HW;
1613         netdev_update_features(dev);
1614 
1615         if (unlikely(dev->features & NETIF_F_GRO_HW))
1616                 netdev_WARN(dev, "failed to disable GRO_HW!\n");
1617 }
1618 
1619 const char *netdev_cmd_to_name(enum netdev_cmd cmd)
1620 {
1621 #define N(val)                                          \
1622         case NETDEV_##val:                              \
1623                 return "NETDEV_" __stringify(val);
1624         switch (cmd) {
1625         N(UP) N(DOWN) N(REBOOT) N(CHANGE) N(REGISTER) N(UNREGISTER)
1626         N(CHANGEMTU) N(CHANGEADDR) N(GOING_DOWN) N(CHANGENAME) N(FEAT_CHANGE)
1627         N(BONDING_FAILOVER) N(PRE_UP) N(PRE_TYPE_CHANGE) N(POST_TYPE_CHANGE)
1628         N(POST_INIT) N(RELEASE) N(NOTIFY_PEERS) N(JOIN) N(CHANGEUPPER)
1629         N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA) N(BONDING_INFO)
1630         N(PRECHANGEUPPER) N(CHANGELOWERSTATE) N(UDP_TUNNEL_PUSH_INFO)
1631         N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN)
1632         N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO)
1633         N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO)
1634         N(PRE_CHANGEADDR)
1635         }
1636 #undef N
1637         return "UNKNOWN_NETDEV_EVENT";
1638 }
1639 EXPORT_SYMBOL_GPL(netdev_cmd_to_name);
1640 
1641 static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1642                                    struct net_device *dev)
1643 {
1644         struct netdev_notifier_info info = {
1645                 .dev = dev,
1646         };
1647 
1648         return nb->notifier_call(nb, val, &info);
1649 }
1650 
1651 static int call_netdevice_register_notifiers(struct notifier_block *nb,
1652                                              struct net_device *dev)
1653 {
1654         int err;
1655 
1656         err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1657         err = notifier_to_errno(err);
1658         if (err)
1659                 return err;
1660 
1661         if (!(dev->flags & IFF_UP))
1662                 return 0;
1663 
1664         call_netdevice_notifier(nb, NETDEV_UP, dev);
1665         return 0;
1666 }
1667 
1668 static void call_netdevice_unregister_notifiers(struct notifier_block *nb,
1669                                                 struct net_device *dev)
1670 {
1671         if (dev->flags & IFF_UP) {
1672                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1673                                         dev);
1674                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1675         }
1676         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1677 }
1678 
1679 static int call_netdevice_register_net_notifiers(struct notifier_block *nb,
1680                                                  struct net *net)
1681 {
1682         struct net_device *dev;
1683         int err;
1684 
1685         for_each_netdev(net, dev) {
1686                 err = call_netdevice_register_notifiers(nb, dev);
1687                 if (err)
1688                         goto rollback;
1689         }
1690         return 0;
1691 
1692 rollback:
1693         for_each_netdev_continue_reverse(net, dev)
1694                 call_netdevice_unregister_notifiers(nb, dev);
1695         return err;
1696 }
1697 
1698 static void call_netdevice_unregister_net_notifiers(struct notifier_block *nb,
1699                                                     struct net *net)
1700 {
1701         struct net_device *dev;
1702 
1703         for_each_netdev(net, dev)
1704                 call_netdevice_unregister_notifiers(nb, dev);
1705 }
1706 
1707 static int dev_boot_phase = 1;
1708 
1709 /**
1710  * register_netdevice_notifier - register a network notifier block
1711  * @nb: notifier
1712  *
1713  * Register a notifier to be called when network device events occur.
1714  * The notifier passed is linked into the kernel structures and must
1715  * not be reused until it has been unregistered. A negative errno code
1716  * is returned on a failure.
1717  *
1718  * When registered, all registration and up events are replayed
1719  * to the new notifier to allow it to have a race-free
1720  * view of the network device list.
1721  */
1722 
1723 int register_netdevice_notifier(struct notifier_block *nb)
1724 {
1725         struct net *net;
1726         int err;
1727 
1728         /* Close race with setup_net() and cleanup_net() */
1729         down_write(&pernet_ops_rwsem);
1730         rtnl_lock();
1731         err = raw_notifier_chain_register(&netdev_chain, nb);
1732         if (err)
1733                 goto unlock;
1734         if (dev_boot_phase)
1735                 goto unlock;
1736         for_each_net(net) {
1737                 err = call_netdevice_register_net_notifiers(nb, net);
1738                 if (err)
1739                         goto rollback;
1740         }
1741 
1742 unlock:
1743         rtnl_unlock();
1744         up_write(&pernet_ops_rwsem);
1745         return err;
1746 
1747 rollback:
1748         for_each_net_continue_reverse(net)
1749                 call_netdevice_unregister_net_notifiers(nb, net);
1750 
1751         raw_notifier_chain_unregister(&netdev_chain, nb);
1752         goto unlock;
1753 }
1754 EXPORT_SYMBOL(register_netdevice_notifier);
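
/*
 * Illustrative sketch (not part of this file): a subsystem registering a
 * netdevice notifier.  The callback and notifier_block names are
 * hypothetical; netdev_notifier_info_to_dev() and netdev_cmd_to_name()
 * are the real helpers for decoding the callback arguments.
 */
#if 0
static int example_netdev_event(struct notifier_block *nb,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);

	pr_debug("%s: %s\n", dev->name, netdev_cmd_to_name(event));

	switch (event) {
	case NETDEV_UP:
	case NETDEV_DOWN:
		/* react to the device changing state */
		break;
	}
	return NOTIFY_DONE;
}

static struct notifier_block example_netdev_nb = {
	.notifier_call = example_netdev_event,
};

/* Module init:  register_netdevice_notifier(&example_netdev_nb);
 * Module exit:  unregister_netdevice_notifier(&example_netdev_nb);
 */
#endif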
1755 
1756 /**
1757  * unregister_netdevice_notifier - unregister a network notifier block
1758  * @nb: notifier
1759  *
1760  * Unregister a notifier previously registered by
1761  * register_netdevice_notifier(). The notifier is unlinked from the
1762  * kernel structures and may then be reused. A negative errno code
1763  * is returned on a failure.
1764  *
1765  * After unregistering unregister and down device events are synthesized
1766  * for all devices on the device list to the removed notifier to remove
1767  * the need for special case cleanup code.
1768  */
1769 
1770 int unregister_netdevice_notifier(struct notifier_block *nb)
1771 {
1772         struct net_device *dev;
1773         struct net *net;
1774         int err;
1775 
1776         /* Close race with setup_net() and cleanup_net() */
1777         down_write(&pernet_ops_rwsem);
1778         rtnl_lock();
1779         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1780         if (err)
1781                 goto unlock;
1782 
1783         for_each_net(net) {
1784                 for_each_netdev(net, dev) {
1785                         if (dev->flags & IFF_UP) {
1786                                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1787                                                         dev);
1788                                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1789                         }
1790                         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1791                 }
1792         }
1793 unlock:
1794         rtnl_unlock();
1795         up_write(&pernet_ops_rwsem);
1796         return err;
1797 }
1798 EXPORT_SYMBOL(unregister_netdevice_notifier);
1799 
1800 /**
1801  * register_netdevice_notifier_net - register a per-netns network notifier block
1802  * @net: network namespace
1803  * @nb: notifier
1804  *
1805  * Register a notifier to be called when network device events occur.
1806  * The notifier passed is linked into the kernel structures and must
1807  * not be reused until it has been unregistered. A negative errno code
1808  * is returned on a failure.
1809  *
1810  * When registered, all registration and up events are replayed
1811  * to the new notifier to allow it to have a race-free
1812  * view of the network device list.
1813  */
1814 
1815 int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb)
1816 {
1817         int err;
1818 
1819         rtnl_lock();
1820         err = raw_notifier_chain_register(&net->netdev_chain, nb);
1821         if (err)
1822                 goto unlock;
1823         if (dev_boot_phase)
1824                 goto unlock;
1825 
1826         err = call_netdevice_register_net_notifiers(nb, net);
1827         if (err)
1828                 goto chain_unregister;
1829 
1830 unlock:
1831         rtnl_unlock();
1832         return err;
1833 
1834 chain_unregister:
1835         raw_notifier_chain_unregister(&net->netdev_chain, nb);
1836         goto unlock;
1837 }
1838 EXPORT_SYMBOL(register_netdevice_notifier_net);
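
/*
 * Illustrative sketch (not part of this file): the per-netns variant is
 * the right choice when a notifier only cares about devices in a single
 * namespace.  The helper names are hypothetical and the notifier_block
 * is reused from the register_netdevice_notifier() sketch above.
 */
#if 0
static int example_attach_to_netns(struct net *net)
{
	return register_netdevice_notifier_net(net, &example_netdev_nb);
}

static void example_detach_from_netns(struct net *net)
{
	unregister_netdevice_notifier_net(net, &example_netdev_nb);
}
#endif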
1839 
1840 /**
1841  * unregister_netdevice_notifier_net - unregister a per-netns
1842  *                                     network notifier block
1843  * @net: network namespace
1844  * @nb: notifier
1845  *
1846  * Unregister a notifier previously registered by
1847  * register_netdevice_notifier_net(). The notifier is unlinked from the
1848  * kernel structures and may then be reused. A negative errno code
1849  * is returned on a failure.
1850  *
1851  * After unregistering unregister and down device events are synthesized
1852  * for all devices on the device list to the removed notifier to remove
1853  * the need for special case cleanup code.
1854  */
1855 
1856 int unregister_netdevice_notifier_net(struct net *net,
1857                                       struct notifier_block *nb)
1858 {
1859         int err;
1860 
1861         rtnl_lock();
1862         err = raw_notifier_chain_unregister(&net->netdev_chain, nb);
1863         if (err)
1864                 goto unlock;
1865 
1866         call_netdevice_unregister_net_notifiers(nb, net);
1867 
1868 unlock:
1869         rtnl_unlock();
1870         return err;
1871 }
1872 EXPORT_SYMBOL(unregister_netdevice_notifier_net);
1873 
1874 /**
1875  *      call_netdevice_notifiers_info - call all network notifier blocks
1876  *      @val: value passed unmodified to notifier function
1877  *      @info: notifier information data
1878  *
1879  *      Call all network notifier blocks.  Parameters and return value
1880  *      are as for raw_notifier_call_chain().
1881  */
1882 
1883 static int call_netdevice_notifiers_info(unsigned long val,
1884                                          struct netdev_notifier_info *info)
1885 {
1886         struct net *net = dev_net(info->dev);
1887         int ret;
1888 
1889         ASSERT_RTNL();
1890 
1891         /* Run per-netns notifier block chain first, then run the global one.
1892          * Hopefully, one day, the global one is going to be removed after
1893          * all notifier block registrants get converted to be per-netns.
1894          */
1895         ret = raw_notifier_call_chain(&net->netdev_chain, val, info);
1896         if (ret & NOTIFY_STOP_MASK)
1897                 return ret;
1898         return raw_notifier_call_chain(&netdev_chain, val, info);
1899 }
1900 
1901 static int call_netdevice_notifiers_extack(unsigned long val,
1902                                            struct net_device *dev,
1903                                            struct netlink_ext_ack *extack)
1904 {
1905         struct netdev_notifier_info info = {
1906                 .dev = dev,
1907                 .extack = extack,
1908         };
1909 
1910         return call_netdevice_notifiers_info(val, &info);
1911 }
1912 
1913 /**
1914  *      call_netdevice_notifiers - call all network notifier blocks
1915  *      @val: value passed unmodified to notifier function
1916  *      @dev: net_device pointer passed unmodified to notifier function
1917  *
1918  *      Call all network notifier blocks.  Parameters and return value
1919  *      are as for raw_notifier_call_chain().
1920  */
1921 
1922 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1923 {
1924         return call_netdevice_notifiers_extack(val, dev, NULL);
1925 }
1926 EXPORT_SYMBOL(call_netdevice_notifiers);
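
/*
 * Illustrative sketch (not part of this file): a caller holding the RTNL
 * lock asking interested subsystems to refresh knowledge of a device's
 * peers, e.g. after a VM migration.  This is essentially what the
 * netdev_notify_peers() helper does; example_notify_peers() is a
 * hypothetical wrapper shown only to demonstrate the calling convention.
 */
#if 0
static void example_notify_peers(struct net_device *dev)
{
	rtnl_lock();
	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
	rtnl_unlock();
}
#endif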
1927 
1928 /**
1929  *      call_netdevice_notifiers_mtu - call all network notifier blocks
1930  *      @val: value passed unmodified to notifier function
1931  *      @dev: net_device pointer passed unmodified to notifier function
1932  *      @arg: additional u32 argument passed to the notifier function
1933  *
1934  *      Call all network notifier blocks.  Parameters and return value
1935  *      are as for raw_notifier_call_chain().
1936  */
1937 static int call_netdevice_notifiers_mtu(unsigned long val,
1938                                         struct net_device *dev, u32 arg)
1939 {
1940         struct netdev_notifier_info_ext info = {
1941                 .info.dev = dev,
1942                 .ext.mtu = arg,
1943         };
1944 
1945         BUILD_BUG_ON(offsetof(struct netdev_notifier_info_ext, info) != 0);
1946 
1947         return call_netdevice_notifiers_info(val, &info.info);
1948 }
1949 
1950 #ifdef CONFIG_NET_INGRESS
1951 static DEFINE_STATIC_KEY_FALSE(ingress_needed_key);
1952 
1953 void net_inc_ingress_queue(void)
1954 {
1955         static_branch_inc(&ingress_needed_key);
1956 }
1957 EXPORT_SYMBOL_GPL(net_inc_ingress_queue);
1958 
1959 void net_dec_ingress_queue(void)
1960 {
1961         static_branch_dec(&ingress_needed_key);
1962 }
1963 EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
1964 #endif
1965 
1966 #ifdef CONFIG_NET_EGRESS
1967 static DEFINE_STATIC_KEY_FALSE(egress_needed_key);
1968 
1969 void net_inc_egress_queue(void)
1970 {
1971         static_branch_inc(&egress_needed_key);
1972 }
1973 EXPORT_SYMBOL_GPL(net_inc_egress_queue);
1974 
1975 void net_dec_egress_queue(void)
1976 {
1977         static_branch_dec(&egress_needed_key);
1978 }
1979 EXPORT_SYMBOL_GPL(net_dec_egress_queue);
1980 #endif
1981 
1982 static DEFINE_STATIC_KEY_FALSE(netstamp_needed_key);
1983 #ifdef CONFIG_JUMP_LABEL
1984 static atomic_t netstamp_needed_deferred;
1985 static atomic_t netstamp_wanted;
1986 static void netstamp_clear(struct work_struct *work)
1987 {
1988         int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1989         int wanted;
1990 
1991         wanted = atomic_add_return(deferred, &netstamp_wanted);
1992         if (wanted > 0)
1993                 static_branch_enable(&netstamp_needed_key);
1994         else
1995                 static_branch_disable(&netstamp_needed_key);
1996 }
1997 static DECLARE_WORK(netstamp_work, netstamp_clear);
1998 #endif
1999 
2000 void net_enable_timestamp(void)
2001 {
2002 #ifdef CONFIG_JUMP_LABEL
2003         int wanted;
2004 
2005         while (1) {
2006                 wanted = atomic_read(&netstamp_wanted);
2007                 if (wanted <= 0)
2008                         break;
2009                 if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted + 1) == wanted)
2010                         return;
2011         }
2012         atomic_inc(&netstamp_needed_deferred);
2013         schedule_work(&netstamp_work);
2014 #else
2015         static_branch_inc(&netstamp_needed_key);
2016 #endif
2017 }
2018 EXPORT_SYMBOL(net_enable_timestamp);
2019 
2020 void net_disable_timestamp(void)
2021 {
2022 #ifdef CONFIG_JUMP_LABEL
2023         int wanted;
2024 
2025         while (1) {
2026                 wanted = atomic_read(&netstamp_wanted);
2027                 if (wanted <= 1)
2028                         break;
2029                 if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted - 1) == wanted)
2030                         return;
2031         }
2032         atomic_dec(&netstamp_needed_deferred);
2033         schedule_work(&netstamp_work);
2034 #else
2035         static_branch_dec(&netstamp_needed_key);
2036 #endif
2037 }
2038 EXPORT_SYMBOL(net_disable_timestamp);
2039 
2040 static inline void net_timestamp_set(struct sk_buff *skb)
2041 {
2042         skb->tstamp = 0;
2043         if (static_branch_unlikely(&netstamp_needed_key))
2044                 __net_timestamp(skb);
2045 }
2046 
2047 #define net_timestamp_check(COND, SKB)                          \
2048         if (static_branch_unlikely(&netstamp_needed_key)) {     \
2049                 if ((COND) && !(SKB)->tstamp)                   \
2050                         __net_timestamp(SKB);                   \
2051         }                                                       \
2052 
2053 bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
2054 {
2055         unsigned int len;
2056 
2057         if (!(dev->flags & IFF_UP))
2058                 return false;
2059 
2060         len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
2061         if (skb->len <= len)
2062                 return true;
2063 
2064         /* if TSO is enabled, we don't care about the length as the packet
2065          * could be forwarded without being segmented before
2066          */
2067         if (skb_is_gso(skb))
2068                 return true;
2069 
2070         return false;
2071 }
2072 EXPORT_SYMBOL_GPL(is_skb_forwardable);
2073 
2074 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
2075 {
2076         int ret = ____dev_forward_skb(dev, skb);
2077 
2078         if (likely(!ret)) {
2079                 skb->protocol = eth_type_trans(skb, dev);
2080                 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
2081         }
2082 
2083         return ret;
2084 }
2085 EXPORT_SYMBOL_GPL(__dev_forward_skb);
2086 
2087 /**
2088  * dev_forward_skb - loopback an skb to another netif
2089  *
2090  * @dev: destination network device
2091  * @skb: buffer to forward
2092  *
2093  * return values:
2094  *      NET_RX_SUCCESS  (no congestion)
2095  *      NET_RX_DROP     (packet was dropped, but freed)
2096  *
2097  * dev_forward_skb can be used for injecting an skb from the
2098  * start_xmit function of one device into the receive queue
2099  * of another device.
2100  *
2101  * The receiving device may be in another namespace, so
2102  * we have to clear all information in the skb that could
2103  * impact namespace isolation.
2104  */
2105 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
2106 {
2107         return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
2108 }
2109 EXPORT_SYMBOL_GPL(dev_forward_skb);
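
/*
 * Illustrative sketch (not part of this file): a virtual device's
 * ndo_start_xmit() handing frames to a peer device's receive path, the
 * pattern veth-like drivers follow.  example_get_peer() is hypothetical;
 * dev_forward_skb() consumes the skb in both the success and drop cases,
 * so no explicit free is needed here.
 */
#if 0
static netdev_tx_t example_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct net_device *peer = example_get_peer(dev);	/* hypothetical */

	if (dev_forward_skb(peer, skb) == NET_RX_SUCCESS)
		dev->stats.tx_packets++;
	else
		dev->stats.tx_dropped++;

	return NETDEV_TX_OK;
}
#endif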
2110 
2111 static inline int deliver_skb(struct sk_buff *skb,
2112                               struct packet_type *pt_prev,
2113                               struct net_device *orig_dev)
2114 {
2115         if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
2116                 return -ENOMEM;
2117         refcount_inc(&skb->users);
2118         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2119 }
2120 
2121 static inline void deliver_ptype_list_skb(struct sk_buff *skb,
2122                                           struct packet_type **pt,
2123                                           struct net_device *orig_dev,
2124                                           __be16 type,
2125                                           struct list_head *ptype_list)
2126 {
2127         struct packet_type *ptype, *pt_prev = *pt;
2128 
2129         list_for_each_entry_rcu(ptype, ptype_list, list) {
2130                 if (ptype->type != type)
2131                         continue;
2132                 if (pt_prev)
2133                         deliver_skb(skb, pt_prev, orig_dev);
2134                 pt_prev = ptype;
2135         }
2136         *pt = pt_prev;
2137 }
2138 
2139 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
2140 {
2141         if (!ptype->af_packet_priv || !skb->sk)
2142                 return false;
2143 
2144         if (ptype->id_match)
2145                 return ptype->id_match(ptype, skb->sk);
2146         else if ((struct sock *)ptype->af_packet_priv == skb->sk)
2147                 return true;
2148 
2149         return false;
2150 }
2151 
2152 /**
2153  * dev_nit_active - return true if any network interface taps are in use
2154  *
2155  * @dev: network device to check for the presence of taps
2156  */
2157 bool dev_nit_active(struct net_device *dev)
2158 {
2159         return !list_empty(&ptype_all) || !list_empty(&dev->ptype_all);
2160 }
2161 EXPORT_SYMBOL_GPL(dev_nit_active);
2162 
2163 /*
2164  *      Support routine. Sends outgoing frames to any network
2165  *      taps currently in use.
2166  */
2167 
2168 void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
2169 {
2170         struct packet_type *ptype;
2171         struct sk_buff *skb2 = NULL;
2172         struct packet_type *pt_prev = NULL;
2173         struct list_head *ptype_list = &ptype_all;
2174 
2175         rcu_read_lock();
2176 again:
2177         list_for_each_entry_rcu(ptype, ptype_list, list) {
2178                 if (ptype->ignore_outgoing)
2179                         continue;
2180 
2181                 /* Never send packets back to the socket
2182                  * they originated from - MvS (miquels@drinkel.ow.org)
2183                  */
2184                 if (skb_loop_sk(ptype, skb))
2185                         continue;
2186 
2187                 if (pt_prev) {
2188                         deliver_skb(skb2, pt_prev, skb->dev);
2189                         pt_prev = ptype;
2190                         continue;
2191                 }
2192 
2193                 /* need to clone skb, done only once */
2194                 skb2 = skb_clone(skb, GFP_ATOMIC);
2195                 if (!skb2)
2196                         goto out_unlock;
2197 
2198                 net_timestamp_set(skb2);
2199 
2200                 /* The network header should already be set
2201                  * correctly by the sender; the check below is
2202                  * just protection against buggy protocols.
2203                  */
2204                 skb_reset_mac_header(skb2);
2205 
2206                 if (skb_network_header(skb2) < skb2->data ||
2207                     skb_network_header(skb2) > skb_tail_pointer(skb2)) {
2208                         net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
2209                                              ntohs(skb2->protocol),
2210                                              dev->name);
2211                         skb_reset_network_header(skb2);
2212                 }
2213 
2214                 skb2->transport_header = skb2->network_header;
2215                 skb2->pkt_type = PACKET_OUTGOING;
2216                 pt_prev = ptype;
2217         }
2218 
2219         if (ptype_list == &ptype_all) {
2220                 ptype_list = &dev->ptype_all;
2221                 goto again;
2222         }
2223 out_unlock:
2224         if (pt_prev) {
2225                 if (!skb_orphan_frags_rx(skb2, GFP_ATOMIC))
2226                         pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
2227                 else
2228                         kfree_skb(skb2);
2229         }
2230         rcu_read_unlock();
2231 }
2232 EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
2233 
2234 /**
2235  * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
2236  * @dev: Network device
2237  * @txq: number of queues available
2238  *
2239  * If real_num_tx_queues is changed the tc mappings may no longer be
2240  * valid. To resolve this verify the tc mapping remains valid and if
2241  * not, zero the mapping. With no priorities mapping to this
2242  * offset/count pair it will no longer be used. In the worst case, if
2243  * TC0 is invalid, nothing can be done, so priority mappings are
2244  * disabled entirely. It is expected that drivers will fix this mapping
2245  * if they can before calling netif_set_real_num_tx_queues.
2246  */
2247 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
2248 {
2249         int i;
2250         struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
2251 
2252         /* If TC0 is invalidated disable TC mapping */
2253         if (tc->offset + tc->count > txq) {
2254                 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
2255                 dev->num_tc = 0;
2256                 return;
2257         }
2258 
2259         /* Invalidated prio to tc mappings set to TC0 */
2260         for (i = 1; i < TC_BITMASK + 1; i++) {
2261                 int q = netdev_get_prio_tc_map(dev, i);
2262 
2263                 tc = &dev->tc_to_txq[q];
2264                 if (tc->offset + tc->count > txq) {
2265                         pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
2266                                 i, q);
2267                         netdev_set_prio_tc_map(dev, i, 0);
2268                 }
2269         }
2270 }
2271 
2272 int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
2273 {
2274         if (dev->num_tc) {
2275                 struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
2276                 int i;
2277 
2278                 /* walk through the TCs and see if it falls into any of them */
2279                 for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
2280                         if ((txq - tc->offset) < tc->count)
2281                                 return i;
2282                 }
2283 
2284                 /* didn't find it, just return -1 to indicate no match */
2285                 return -1;
2286         }
2287 
2288         return 0;
2289 }
2290 EXPORT_SYMBOL(netdev_txq_to_tc);
2291 
2292 #ifdef CONFIG_XPS
2293 struct static_key xps_needed __read_mostly;
2294 EXPORT_SYMBOL(xps_needed);
2295 struct static_key xps_rxqs_needed __read_mostly;
2296 EXPORT_SYMBOL(xps_rxqs_needed);
2297 static DEFINE_MUTEX(xps_map_mutex);
2298 #define xmap_dereference(P)             \
2299         rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
2300 
2301 static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
2302                              int tci, u16 index)
2303 {
2304         struct xps_map *map = NULL;
2305         int pos;
2306 
2307         if (dev_maps)
2308                 map = xmap_dereference(dev_maps->attr_map[tci]);
2309         if (!map)
2310                 return false;
2311 
2312         for (pos = map->len; pos--;) {
2313                 if (map->queues[pos] != index)
2314                         continue;
2315 
2316                 if (map->len > 1) {
2317                         map->queues[pos] = map->queues[--map->len];
2318                         break;
2319                 }
2320 
2321                 RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
2322                 kfree_rcu(map, rcu);
2323                 return false;
2324         }
2325 
2326         return true;
2327 }
2328 
2329 static bool remove_xps_queue_cpu(struct net_device *dev,
2330                                  struct xps_dev_maps *dev_maps,
2331                                  int cpu, u16 offset, u16 count)
2332 {
2333         int num_tc = dev->num_tc ? : 1;
2334         bool active = false;
2335         int tci;
2336 
2337         for (tci = cpu * num_tc; num_tc--; tci++) {
2338                 int i, j;
2339 
2340                 for (i = count, j = offset; i--; j++) {
2341                         if (!remove_xps_queue(dev_maps, tci, j))
2342                                 break;
2343                 }
2344 
2345                 active |= i < 0;
2346         }
2347 
2348         return active;
2349 }
2350 
2351 static void reset_xps_maps(struct net_device *dev,
2352                            struct xps_dev_maps *dev_maps,
2353                            bool is_rxqs_map)
2354 {
2355         if (is_rxqs_map) {
2356                 static_key_slow_dec_cpuslocked(&xps_rxqs_needed);
2357                 RCU_INIT_POINTER(dev->xps_rxqs_map, NULL);
2358         } else {
2359                 RCU_INIT_POINTER(dev->xps_cpus_map, NULL);
2360         }
2361         static_key_slow_dec_cpuslocked(&xps_needed);
2362         kfree_rcu(dev_maps, rcu);
2363 }
2364 
2365 static void clean_xps_maps(struct net_device *dev, const unsigned long *mask,
2366                            struct xps_dev_maps *dev_maps, unsigned int nr_ids,
2367                            u16 offset, u16 count, bool is_rxqs_map)
2368 {
2369         bool active = false;
2370         int i, j;
2371 
2372         for (j = -1; j = netif_attrmask_next(j, mask, nr_ids),
2373              j < nr_ids;)
2374                 active |= remove_xps_queue_cpu(dev, dev_maps, j, offset,
2375                                                count);
2376         if (!active)
2377                 reset_xps_maps(dev, dev_maps, is_rxqs_map);
2378 
2379         if (!is_rxqs_map) {
2380                 for (i = offset + (count - 1); count--; i--) {
2381                         netdev_queue_numa_node_write(
2382                                 netdev_get_tx_queue(dev, i),
2383                                 NUMA_NO_NODE);
2384                 }
2385         }
2386 }
2387 
2388 static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
2389                                    u16 count)
2390 {
2391         const unsigned long *possible_mask = NULL;
2392         struct xps_dev_maps *dev_maps;
2393         unsigned int nr_ids;
2394 
2395         if (!static_key_false(&xps_needed))
2396                 return;
2397 
2398         cpus_read_lock();
2399         mutex_lock(&xps_map_mutex);
2400 
2401         if (static_key_false(&xps_rxqs_needed)) {
2402                 dev_maps = xmap_dereference(dev->xps_rxqs_map);
2403                 if (dev_maps) {
2404                         nr_ids = dev->num_rx_queues;
2405                         clean_xps_maps(dev, possible_mask, dev_maps, nr_ids,
2406                                        offset, count, true);
2407                 }
2408         }
2409 
2410         dev_maps = xmap_dereference(dev->xps_cpus_map);
2411         if (!dev_maps)
2412                 goto out_no_maps;
2413 
2414         if (num_possible_cpus() > 1)
2415                 possible_mask = cpumask_bits(cpu_possible_mask);
2416         nr_ids = nr_cpu_ids;
2417         clean_xps_maps(dev, possible_mask, dev_maps, nr_ids, offset, count,
2418                        false);
2419 
2420 out_no_maps:
2421         mutex_unlock(&xps_map_mutex);
2422         cpus_read_unlock();
2423 }
2424 
2425 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
2426 {
2427         netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
2428 }
2429 
2430 static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index,
2431                                       u16 index, bool is_rxqs_map)
2432 {
2433         struct xps_map *new_map;
2434         int alloc_len = XPS_MIN_MAP_ALLOC;
2435         int i, pos;
2436 
2437         for (pos = 0; map && pos < map->len; pos++) {
2438                 if (map->queues[pos] != index)
2439                         continue;
2440                 return map;
2441         }
2442 
2443         /* Need to add tx-queue to this CPU's/rx-queue's existing map */
2444         if (map) {
2445                 if (pos < map->alloc_len)
2446                         return map;
2447 
2448                 alloc_len = map->alloc_len * 2;
2449         }
2450 
2451         /* Need to allocate new map to store tx-queue on this CPU's/rx-queue's
2452          *  map
2453          */
2454         if (is_rxqs_map)
2455                 new_map = kzalloc(XPS_MAP_SIZE(alloc_len), GFP_KERNEL);
2456         else
2457                 new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
2458                                        cpu_to_node(attr_index));
2459         if (!new_map)
2460                 return NULL;
2461 
2462         for (i = 0; i < pos; i++)
2463                 new_map->queues[i] = map->queues[i];
2464         new_map->alloc_len = alloc_len;
2465         new_map->len = pos;
2466 
2467         return new_map;
2468 }
2469 
2470 /* Must be called under cpus_read_lock */
2471 int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
2472                           u16 index, bool is_rxqs_map)
2473 {
2474         const unsigned long *online_mask = NULL, *possible_mask = NULL;
2475         struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
2476         int i, j, tci, numa_node_id = -2;
2477         int maps_sz, num_tc = 1, tc = 0;
2478         struct xps_map *map, *new_map;
2479         bool active = false;
2480         unsigned int nr_ids;
2481 
2482         if (dev->num_tc) {
2483                 /* Do not allow XPS on subordinate device directly */
2484                 num_tc = dev->num_tc;
2485                 if (num_tc < 0)
2486                         return -EINVAL;
2487 
2488                 /* If queue belongs to subordinate dev use its map */
2489                 dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
2490 
2491                 tc = netdev_txq_to_tc(dev, index);
2492                 if (tc < 0)
2493                         return -EINVAL;
2494         }
2495 
2496         mutex_lock(&xps_map_mutex);
2497         if (is_rxqs_map) {
2498                 maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues);
2499                 dev_maps = xmap_dereference(dev->xps_rxqs_map);
2500                 nr_ids = dev->num_rx_queues;
2501         } else {
2502                 maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc);
2503                 if (num_possible_cpus() > 1) {
2504                         online_mask = cpumask_bits(cpu_online_mask);
2505                         possible_mask = cpumask_bits(cpu_possible_mask);
2506                 }
2507                 dev_maps = xmap_dereference(dev->xps_cpus_map);
2508                 nr_ids = nr_cpu_ids;
2509         }
2510 
2511         if (maps_sz < L1_CACHE_BYTES)
2512                 maps_sz = L1_CACHE_BYTES;
2513 
2514         /* allocate memory for queue storage */
2515         for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids),
2516              j < nr_ids;) {
2517                 if (!new_dev_maps)
2518                         new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2519                 if (!new_dev_maps) {
2520                         mutex_unlock(&xps_map_mutex);
2521                         return -ENOMEM;
2522                 }
2523 
2524                 tci = j * num_tc + tc;
2525                 map = dev_maps ? xmap_dereference(dev_maps->attr_map[tci]) :
2526                                  NULL;
2527 
2528                 map = expand_xps_map(map, j, index, is_rxqs_map);
2529                 if (!map)
2530                         goto error;
2531 
2532                 RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
2533         }
2534 
2535         if (!new_dev_maps)
2536                 goto out_no_new_maps;
2537 
2538         if (!dev_maps) {
2539                 /* Increment static keys at most once per type */
2540                 static_key_slow_inc_cpuslocked(&xps_needed);
2541                 if (is_rxqs_map)
2542                         static_key_slow_inc_cpuslocked(&xps_rxqs_needed);
2543         }
2544 
2545         for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
2546              j < nr_ids;) {
2547                 /* copy maps belonging to foreign traffic classes */
2548                 for (i = tc, tci = j * num_tc; dev_maps && i--; tci++) {
2549                         /* fill in the new device map from the old device map */
2550                         map = xmap_dereference(dev_maps->attr_map[tci]);
2551                         RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
2552                 }
2553 
2554                 /* We need to explicitly update tci as the previous loop
2555                  * could break out early if dev_maps is NULL.
2556                  */
2557                 tci = j * num_tc + tc;
2558 
2559                 if (netif_attr_test_mask(j, mask, nr_ids) &&
2560                     netif_attr_test_online(j, online_mask, nr_ids)) {
2561                         /* add tx-queue to CPU/rx-queue maps */
2562                         int pos = 0;
2563 
2564                         map = xmap_dereference(new_dev_maps->attr_map[tci]);
2565                         while ((pos < map->len) && (map->queues[pos] != index))
2566                                 pos++;
2567 
2568                         if (pos == map->len)
2569                                 map->queues[map->len++] = index;
2570 #ifdef CONFIG_NUMA
2571                         if (!is_rxqs_map) {
2572                                 if (numa_node_id == -2)
2573                                         numa_node_id = cpu_to_node(j);
2574                                 else if (numa_node_id != cpu_to_node(j))
2575                                         numa_node_id = -1;
2576                         }
2577 #endif
2578                 } else if (dev_maps) {
2579                         /* fill in the new device map from the old device map */
2580                         map = xmap_dereference(dev_maps->attr_map[tci]);
2581                         RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
2582                 }
2583 
2584                 /* copy maps belonging to foreign traffic classes */
2585                 for (i = num_tc - tc, tci++; dev_maps && --i; tci++) {
2586                         /* fill in the new device map from the old device map */
2587                         map = xmap_dereference(dev_maps->attr_map[tci]);
2588                         RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
2589                 }
2590         }
2591 
2592         if (is_rxqs_map)
2593                 rcu_assign_pointer(dev->xps_rxqs_map, new_dev_maps);
2594         else
2595                 rcu_assign_pointer(dev->xps_cpus_map, new_dev_maps);
2596 
2597         /* Cleanup old maps */
2598         if (!dev_maps)
2599                 goto out_no_old_maps;
2600 
2601         for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
2602              j < nr_ids;) {
2603                 for (i = num_tc, tci = j * num_tc; i--; tci++) {
2604                         new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
2605                         map = xmap_dereference(dev_maps->attr_map[tci]);
2606                         if (map && map != new_map)
2607                                 kfree_rcu(map, rcu);
2608                 }
2609         }
2610 
2611         kfree_rcu(dev_maps, rcu);
2612 
2613 out_no_old_maps:
2614         dev_maps = new_dev_maps;
2615         active = true;
2616 
2617 out_no_new_maps:
2618         if (!is_rxqs_map) {
2619                 /* update Tx queue numa node */
2620                 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2621                                              (numa_node_id >= 0) ?
2622                                              numa_node_id : NUMA_NO_NODE);
2623         }
2624 
2625         if (!dev_maps)
2626                 goto out_no_maps;
2627 
2628         /* removes tx-queue from unused CPUs/rx-queues */
2629         for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
2630              j < nr_ids;) {
2631                 for (i = tc, tci = j * num_tc; i--; tci++)
2632                         active |= remove_xps_queue(dev_maps, tci, index);
2633                 if (!netif_attr_test_mask(j, mask, nr_ids) ||
2634                     !netif_attr_test_online(j, online_mask, nr_ids))
2635                         active |= remove_xps_queue(dev_maps, tci, index);
2636                 for (i = num_tc - tc, tci++; --i; tci++)
2637                         active |= remove_xps_queue(dev_maps, tci, index);
2638         }
2639 
2640         /* free map if not active */
2641         if (!active)
2642                 reset_xps_maps(dev, dev_maps, is_rxqs_map);
2643 
2644 out_no_maps:
2645         mutex_unlock(&xps_map_mutex);
2646 
2647         return 0;
2648 error:
2649         /* remove any maps that we added */
2650         for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
2651              j < nr_ids;) {
2652                 for (i = num_tc, tci = j * num_tc; i--; tci++) {
2653                         new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
2654                         map = dev_maps ?
2655                               xmap_dereference(dev_maps->attr_map[tci]) :
2656                               NULL;
2657                         if (new_map && new_map != map)
2658                                 kfree(new_map);
2659                 }
2660         }
2661 
2662         mutex_unlock(&xps_map_mutex);
2663 
2664         kfree(new_dev_maps);
2665         return -ENOMEM;
2666 }
2667 EXPORT_SYMBOL_GPL(__netif_set_xps_queue);
2668 
2669 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
2670                         u16 index)
2671 {
2672         int ret;
2673 
2674         cpus_read_lock();
2675         ret =  __netif_set_xps_queue(dev, cpumask_bits(mask), index, false);
2676         cpus_read_unlock();
2677 
2678         return ret;
2679 }
2680 EXPORT_SYMBOL(netif_set_xps_queue);
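
/*
 * Illustrative sketch (not part of this file): a multiqueue driver pinning
 * its TX queues to CPUs via XPS, one queue per online CPU.  The 1:1
 * queue-to-CPU layout and the helper name are assumptions of the example;
 * error handling is elided.
 */
#if 0
static void example_setup_xps(struct net_device *dev)
{
	unsigned int q = 0;
	int cpu;

	for_each_online_cpu(cpu) {
		if (q >= dev->real_num_tx_queues)
			break;
		netif_set_xps_queue(dev, cpumask_of(cpu), q);
		q++;
	}
}
#endif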
2681 
2682 #endif
2683 static void netdev_unbind_all_sb_channels(struct net_device *dev)
2684 {
2685         struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
2686 
2687         /* Unbind any subordinate channels */
2688         while (txq-- != &dev->_tx[0]) {
2689                 if (txq->sb_dev)
2690                         netdev_unbind_sb_channel(dev, txq->sb_dev);
2691         }
2692 }
2693 
2694 void netdev_reset_tc(struct net_device *dev)
2695 {
2696 #ifdef CONFIG_XPS
2697         netif_reset_xps_queues_gt(dev, 0);
2698 #endif
2699         netdev_unbind_all_sb_channels(dev);
2700 
2701         /* Reset TC configuration of device */
2702         dev->num_tc = 0;
2703         memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
2704         memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
2705 }
2706 EXPORT_SYMBOL(netdev_reset_tc);
2707 
2708 int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset)
2709 {
2710         if (tc >= dev->num_tc)
2711                 return -EINVAL;
2712 
2713 #ifdef CONFIG_XPS
2714         netif_reset_xps_queues(dev, offset, count);
2715 #endif
2716         dev->tc_to_txq[tc].count = count;
2717         dev->tc_to_txq[tc].offset = offset;
2718         return 0;
2719 }
2720 EXPORT_SYMBOL(netdev_set_tc_queue);
2721 
2722 int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
2723 {
2724         if (num_tc > TC_MAX_QUEUE)
2725                 return -EINVAL;
2726 
2727 #ifdef CONFIG_XPS
2728         netif_reset_xps_queues_gt(dev, 0);
2729 #endif
2730         netdev_unbind_all_sb_channels(dev);
2731 
2732         dev->num_tc = num_tc;
2733         return 0;
2734 }
2735 EXPORT_SYMBOL(netdev_set_num_tc);
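
/*
 * Illustrative sketch (not part of this file): carving eight TX queues
 * into two traffic classes and mapping skb priorities onto them.  The
 * particular layout (four queues per TC, priorities 0-7 to TC0 and 8-15
 * to TC1) and the helper name are assumptions of the example; error
 * handling is elided.
 */
#if 0
static void example_setup_tc(struct net_device *dev)
{
	int prio;

	netdev_set_num_tc(dev, 2);
	netdev_set_tc_queue(dev, 0, 4, 0);	/* TC0: queues 0-3 */
	netdev_set_tc_queue(dev, 1, 4, 4);	/* TC1: queues 4-7 */

	for (prio = 0; prio <= TC_BITMASK; prio++)
		netdev_set_prio_tc_map(dev, prio, prio < 8 ? 0 : 1);
}
#endif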
2736 
2737 void netdev_unbind_sb_channel(struct net_device *dev,
2738                               struct net_device *sb_dev)
2739 {
2740         struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
2741 
2742 #ifdef CONFIG_XPS
2743         netif_reset_xps_queues_gt(sb_dev, 0);
2744 #endif
2745         memset(sb_dev->tc_to_txq, 0, sizeof(sb_dev->tc_to_txq));
2746         memset(sb_dev->prio_tc_map, 0, sizeof(sb_dev->prio_tc_map));
2747 
2748         while (txq-- != &dev->_tx[0]) {
2749                 if (txq->sb_dev == sb_dev)
2750                         txq->sb_dev = NULL;
2751         }
2752 }
2753 EXPORT_SYMBOL(netdev_unbind_sb_channel);
2754 
2755 int netdev_bind_sb_channel_queue(struct net_device *dev,
2756                                  struct net_device *sb_dev,
2757                                  u8 tc, u16 count, u16 offset)
2758 {
2759         /* Make certain the sb_dev and dev are already configured */
2760         if (sb_dev->num_tc >= 0 || tc >= dev->num_tc)
2761                 return -EINVAL;
2762 
2763         /* We cannot hand out queues we don't have */
2764         if ((offset + count) > dev->real_num_tx_queues)
2765                 return -EINVAL;
2766 
2767         /* Record the mapping */
2768         sb_dev->tc_to_txq[tc].count = count;
2769         sb_dev->tc_to_txq[tc].offset = offset;
2770 
2771         /* Provide a way for Tx queue to find the tc_to_txq map or
2772          * XPS map for itself.
2773          */
2774         while (count--)
2775                 netdev_get_tx_queue(dev, count + offset)->sb_dev = sb_dev;
2776 
2777         return 0;
2778 }
2779 EXPORT_SYMBOL(netdev_bind_sb_channel_queue);
2780 
2781 int netdev_set_sb_channel(struct net_device *dev, u16 channel)
2782 {
2783         /* Do not use a multiqueue device to represent a subordinate channel */
2784         if (netif_is_multiqueue(dev))
2785                 return -ENODEV;
2786 
2787         /* We allow channels 1 - 32767 to be used for subordinate channels.
2788          * Channel 0 is meant to be "native" mode and used only to represent
2789          * the main root device. We allow writing 0 to reset the device back
2790          * to normal mode after being used as a subordinate channel.
2791          */
2792         if (channel > S16_MAX)
2793                 return -EINVAL;
2794 
2795         dev->num_tc = -channel;
2796 
2797         return 0;
2798 }
2799 EXPORT_SYMBOL(netdev_set_sb_channel);
2800 
2801 /*
2802  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2803  * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
2804  */
2805 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2806 {
2807         bool disabling;
2808         int rc;
2809 
2810         disabling = txq < dev->real_num_tx_queues;
2811 
2812         if (txq < 1 || txq > dev->num_tx_queues)
2813                 return -EINVAL;
2814 
2815         if (dev->reg_state == NETREG_REGISTERED ||
2816             dev->reg_state == NETREG_UNREGISTERING) {
2817                 ASSERT_RTNL();
2818 
2819                 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2820                                                   txq);
2821                 if (rc)
2822                         return rc;
2823 
2824                 if (dev->num_tc)
2825                         netif_setup_tc(dev, txq);
2826 
2827                 dev->real_num_tx_queues = txq;
2828 
2829                 if (disabling) {
2830                         synchronize_net();
2831                         qdisc_reset_all_tx_gt(dev, txq);
2832 #ifdef CONFIG_XPS
2833                         netif_reset_xps_queues_gt(dev, txq);
2834 #endif
2835                 }
2836         } else {
2837                 dev->real_num_tx_queues = txq;
2838         }
2839 
2840         return 0;
2841 }
2842 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2843 
2844 #ifdef CONFIG_SYSFS
2845 /**
2846  *      netif_set_real_num_rx_queues - set actual number of RX queues used
2847  *      @dev: Network device
2848  *      @rxq: Actual number of RX queues
2849  *
2850  *      This must be called either with the rtnl_lock held or before
2851  *      registration of the net device.  Returns 0 on success, or a
2852  *      negative error code.  If called before registration, it always
2853  *      succeeds.
2854  */
2855 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2856 {
2857         int rc;
2858 
2859         if (rxq < 1 || rxq > dev->num_rx_queues)
2860                 return -EINVAL;
2861 
2862         if (dev->reg_state == NETREG_REGISTERED) {
2863                 ASSERT_RTNL();
2864 
2865                 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2866                                                   rxq);
2867                 if (rc)
2868                         return rc;
2869         }
2870 
2871         dev->real_num_rx_queues = rxq;
2872         return 0;
2873 }
2874 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2875 #endif
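
/*
 * Illustrative sketch (not part of this file): a driver resizing its
 * active queue set at runtime, e.g. from an ethtool set_channels handler,
 * with the RTNL lock held.  The helper name and the symmetric TX/RX
 * channel count are assumptions of the example.
 */
#if 0
static int example_set_channels(struct net_device *dev, unsigned int n)
{
	int err;

	ASSERT_RTNL();

	err = netif_set_real_num_tx_queues(dev, n);
	if (err)
		return err;

	return netif_set_real_num_rx_queues(dev, n);
}
#endif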
2876 
2877 /**
2878  * netif_get_num_default_rss_queues - default number of RSS queues
2879  *
2880  * This routine should set an upper limit on the number of RSS queues
2881  * used by default by multiqueue devices.
2882  */
2883 int netif_get_num_default_rss_queues(void)
2884 {
2885         return is_kdump_kernel() ?
2886                 1 : min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2887 }
2888 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2889 
2890 static void __netif_reschedule(struct Qdisc *q)
2891 {
2892         struct softnet_data *sd;
2893         unsigned long flags;
2894 
2895         local_irq_save(flags);
2896         sd = this_cpu_ptr(&softnet_data);
2897         q->next_sched = NULL;
2898         *sd->output_queue_tailp = q;
2899         sd->output_queue_tailp = &q->next_sched;
2900         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2901         local_irq_restore(flags);
2902 }
2903 
2904 void __netif_schedule(struct Qdisc *q)
2905 {
2906         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2907                 __netif_reschedule(q);
2908 }
2909 EXPORT_SYMBOL(__netif_schedule);
2910 
2911 struct dev_kfree_skb_cb {
2912         enum skb_free_reason reason;
2913 };
2914 
2915 static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2916 {
2917         return (struct dev_kfree_skb_cb *)skb->cb;
2918 }
2919 
2920 void netif_schedule_queue(struct netdev_queue *txq)
2921 {
2922         rcu_read_lock();
2923         if (!netif_xmit_stopped(txq)) {
2924                 struct Qdisc *q = rcu_dereference(txq->qdisc);
2925 
2926                 __netif_schedule(q);
2927         }
2928         rcu_read_unlock();
2929 }
2930 EXPORT_SYMBOL(netif_schedule_queue);
2931 
2932 void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2933 {
2934         if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2935                 struct Qdisc *q;
2936 
2937                 rcu_read_lock();
2938                 q = rcu_dereference(dev_queue->qdisc);
2939                 __netif_schedule(q);
2940                 rcu_read_unlock();
2941         }
2942 }
2943 EXPORT_SYMBOL(netif_tx_wake_queue);
2944 
2945 void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2946 {
2947         unsigned long flags;
2948 
2949         if (unlikely(!skb))
2950                 return;
2951 
2952         if (likely(refcount_read(&skb->users) == 1)) {
2953                 smp_rmb();
2954                 refcount_set(&skb->users, 0);
2955         } else if (likely(!refcount_dec_and_test(&skb->users))) {
2956                 return;
2957         }
2958         get_kfree_skb_cb(skb)->reason = reason;
2959         local_irq_save(flags);
2960         skb->next = __this_cpu_read(softnet_data.completion_queue);
2961         __this_cpu_write(softnet_data.completion_queue, skb);
2962         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2963         local_irq_restore(flags);
2964 }
2965 EXPORT_SYMBOL(__dev_kfree_skb_irq);
2966 
2967 void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2968 {
2969         if (in_irq() || irqs_disabled())
2970                 __dev_kfree_skb_irq(skb, reason);
2971         else
2972                 dev_kfree_skb(skb);
2973 }
2974 EXPORT_SYMBOL(__dev_kfree_skb_any);
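
/*
 * Illustrative sketch (not part of this file): freeing completed TX skbs
 * from a hard-irq handler, where plain consume_skb()/dev_kfree_skb() must
 * not be used.  example_fetch_completed_skb() is hypothetical;
 * dev_kfree_skb_any() is the matching call for frames that were dropped
 * rather than transmitted.
 */
#if 0
static irqreturn_t example_tx_irq(int irq, void *data)
{
	struct sk_buff *skb;

	while ((skb = example_fetch_completed_skb(data)))	/* hypothetical */
		dev_consume_skb_any(skb);	/* safe in any context */

	return IRQ_HANDLED;
}
#endif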
2975 
2976 
2977 /**
2978  * netif_device_detach - mark device as removed
2979  * @dev: network device
2980  *
2981  * Mark device as removed from system and therefore no longer available.
2982  */
2983 void netif_device_detach(struct net_device *dev)
2984 {
2985         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2986             netif_running(dev)) {
2987                 netif_tx_stop_all_queues(dev);
2988         }
2989 }
2990 EXPORT_SYMBOL(netif_device_detach);
2991 
2992 /**
2993  * netif_device_attach - mark device as attached
2994  * @dev: network device
2995  *
2996  * Mark device as attached from system and restart if needed.
2997  */
2998 void netif_device_attach(struct net_device *dev)
2999 {
3000         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
3001             netif_running(dev)) {
3002                 netif_tx_wake_all_queues(dev);
3003                 __netdev_watchdog_up(dev);
3004         }
3005 }
3006 EXPORT_SYMBOL(netif_device_attach);
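
/*
 * Illustrative sketch (not part of this file): the usual detach/attach
 * pairing in a driver's suspend and resume paths.  The hardware
 * quiesce/restart helpers are hypothetical.
 */
#if 0
static int example_suspend(struct net_device *dev)
{
	netif_device_detach(dev);	/* stop the stack queuing to us */
	example_hw_quiesce(dev);	/* hypothetical */
	return 0;
}

static int example_resume(struct net_device *dev)
{
	example_hw_restart(dev);	/* hypothetical */
	netif_device_attach(dev);	/* wake queues if device was running */
	return 0;
}
#endif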
3007 
3008 /*
3009  * Returns a Tx hash based on the given packet descriptor and a Tx queue
3010  * count to be used as a distribution range.
3011  */
3012 static u16 skb_tx_hash(const struct net_device *dev,
3013                        const struct net_device *sb_dev,
3014                        struct sk_buff *skb)
3015 {
3016         u32 hash;
3017         u16 qoffset = 0;
3018         u16 qcount = dev->real_num_tx_queues;
3019 
3020         if (dev->num_tc) {
3021                 u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
3022 
3023                 qoffset = sb_dev->tc_to_txq[tc].offset;
3024                 qcount = sb_dev->tc_to_txq[tc].count;
3025         }
3026 
3027         if (skb_rx_queue_recorded(skb)) {
3028                 hash = skb_get_rx_queue(skb);
3029                 if (hash >= qoffset)
3030                         hash -= qoffset;
3031                 while (unlikely(hash >= qcount))
3032                         hash -= qcount;
3033                 return hash + qoffset;
3034         }
3035 
3036         return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
3037 }
3038 
3039 static void skb_warn_bad_offload(const struct sk_buff *skb)
3040 {
3041         static const netdev_features_t null_features;
3042         struct net_device *dev = skb->dev;
3043         const char *name = "";
3044 
3045         if (!net_ratelimit())
3046                 return;
3047 
3048         if (dev) {
3049                 if (dev->dev.parent)
3050                         name = dev_driver_string(dev->dev.parent);
3051                 else
3052                         name = netdev_name(dev);
3053         }
3054         skb_dump(KERN_WARNING, skb, false);
3055         WARN(1, "%s: caps=(%pNF, %pNF)\n",
3056              name, dev ? &dev->features : &null_features,
3057              skb->sk ? &skb->sk->sk_route_caps : &null_features);
3058 }
3059 
3060 /*
3061  * Invalidate hardware checksum when packet is to be mangled, and
3062  * complete checksum manually on outgoing path.
3063  */
3064 int skb_checksum_help(struct sk_buff *skb)
3065 {
3066         __wsum csum;
3067         int ret = 0, offset;
3068 
3069         if (skb->ip_summed == CHECKSUM_COMPLETE)
3070                 goto out_set_summed;
3071 
3072         if (unlikely(skb_shinfo(skb)->gso_size)) {
3073                 skb_warn_bad_offload(skb);
3074                 return -EINVAL;
3075         }
3076 
3077         /* Before computing a checksum, we should make sure no frag could
3078          * be modified by an external entity: the checksum could be wrong.
3079          */
3080         if (skb_has_shared_frag(skb)) {
3081                 ret = __skb_linearize(skb);
3082                 if (ret)
3083                         goto out;
3084         }
3085 
3086         offset = skb_checksum_start_offset(skb);
3087         BUG_ON(offset >= skb_headlen(skb));
3088         csum = skb_checksum(skb, offset, skb->len - offset, 0);
3089 
3090         offset += skb->csum_offset;
3091         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
3092 
3093         ret = skb_ensure_writable(skb, offset + sizeof(__sum16));
3094         if (ret)
3095                 goto out;
3096 
3097         *(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
3098 out_set_summed:
3099         skb->ip_summed = CHECKSUM_NONE;
3100 out:
3101         return ret;
3102 }
3103 EXPORT_SYMBOL(skb_checksum_help);
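/*
 * Usage sketch (illustrative only, not part of the kernel sources):
 * a hypothetical ndo_start_xmit falling back to skb_checksum_help()
 * when the NIC cannot offload the checksum for this particular packet.
 * mydrv_can_offload_csum() and mydrv_post_to_ring() are assumptions.
 */
static netdev_tx_t mydrv_start_xmit(struct sk_buff *skb,
                                    struct net_device *dev)
{
        if (skb->ip_summed == CHECKSUM_PARTIAL &&
            !mydrv_can_offload_csum(skb) &&
            skb_checksum_help(skb))
                goto drop;

        mydrv_post_to_ring(dev, skb);   /* skb now carries a full checksum */
        return NETDEV_TX_OK;

drop:
        dev_kfree_skb_any(skb);
        return NETDEV_TX_OK;
}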
3104 
3105 int skb_crc32c_csum_help(struct sk_buff *skb)
3106 {
3107         __le32 crc32c_csum;
3108         int ret = 0, offset, start;
3109 
3110         if (skb->ip_summed != CHECKSUM_PARTIAL)
3111                 goto out;
3112 
3113         if (unlikely(skb_is_gso(skb)))
3114                 goto out;
3115 
3116         /* Before computing a checksum, we should make sure no frag could
3117          * be modified by an external entity: the checksum could be wrong.
3118          */
3119         if (unlikely(skb_has_shared_frag(skb))) {
3120                 ret = __skb_linearize(skb);
3121                 if (ret)
3122                         goto out;
3123         }
3124         start = skb_checksum_start_offset(skb);
3125         offset = start + offsetof(struct sctphdr, checksum);
3126         if (WARN_ON_ONCE(offset >= skb_headlen(skb))) {
3127                 ret = -EINVAL;
3128                 goto out;
3129         }
3130 
3131         ret = skb_ensure_writable(skb, offset + sizeof(__le32));
3132         if (ret)
3133                 goto out;
3134 
3135         crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start,
3136                                                   skb->len - start, ~(__u32)0,
3137                                                   crc32c_csum_stub));
3138         *(__le32 *)(skb->data + offset) = crc32c_csum;
3139         skb->ip_summed = CHECKSUM_NONE;
3140         skb->csum_not_inet = 0;
3141 out:
3142         return ret;
3143 }
3144 
3145 __be16 skb_network_protocol(struct sk_buff *skb, int *depth)
3146 {
3147         __be16 type = skb->protocol;
3148 
3149         /* Tunnel gso handlers can set protocol to ethernet. */
3150         if (type == htons(ETH_P_TEB)) {
3151                 struct ethhdr *eth;
3152 
3153                 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
3154                         return 0;
3155 
3156                 eth = (struct ethhdr *)skb->data;
3157                 type = eth->h_proto;
3158         }
3159 
3160         return __vlan_get_protocol(skb, type, depth);
3161 }
3162 
3163 /**
3164  *      skb_mac_gso_segment - mac layer segmentation handler.
3165  *      @skb: buffer to segment
3166  *      @features: features for the output path (see dev->features)
3167  */
3168 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
3169                                     netdev_features_t features)
3170 {
3171         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
3172         struct packet_offload *ptype;
3173         int vlan_depth = skb->mac_len;
3174         __be16 type = skb_network_protocol(skb, &vlan_depth);
3175 
3176         if (unlikely(!type))
3177                 return ERR_PTR(-EINVAL);
3178 
3179         __skb_pull(skb, vlan_depth);
3180 
3181         rcu_read_lock();
3182         list_for_each_entry_rcu(ptype, &offload_base, list) {
3183                 if (ptype->type == type && ptype->callbacks.gso_segment) {
3184                         segs = ptype->callbacks.gso_segment(skb, features);
3185                         break;
3186                 }
3187         }
3188         rcu_read_unlock();
3189 
3190         __skb_push(skb, skb->data - skb_mac_header(skb));
3191 
3192         return segs;
3193 }
3194 EXPORT_SYMBOL(skb_mac_gso_segment);
3195 
3196 
3197 /* openvswitch calls this on the rx path, so we need a different check.
3198  */
3199 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
3200 {
3201         if (tx_path)
3202                 return skb->ip_summed != CHECKSUM_PARTIAL &&
3203                        skb->ip_summed != CHECKSUM_UNNECESSARY;
3204 
3205         return skb->ip_summed == CHECKSUM_NONE;
3206 }
3207 
3208 /**
3209  *      __skb_gso_segment - Perform segmentation on skb.
3210  *      @skb: buffer to segment
3211  *      @features: features for the output path (see dev->features)
3212  *      @tx_path: whether it is called in TX path
3213  *
3214  *      This function segments the given skb and returns a list of segments.
3215  *
3216  *      It may return NULL if the skb requires no segmentation.  This is
3217  *      only possible when GSO is used for verifying header integrity.
3218  *
3219  *      Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb.
3220  */
3221 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
3222                                   netdev_features_t features, bool tx_path)
3223 {
3224         struct sk_buff *segs;
3225 
3226         if (unlikely(skb_needs_check(skb, tx_path))) {
3227                 int err;
3228 
3229                 /* We're going to init ->check field in TCP or UDP header */
3230                 err = skb_cow_head(skb, 0);
3231                 if (err < 0)
3232                         return ERR_PTR(err);
3233         }
3234 
3235         /* Only report GSO partial support if it will enable us to
3236          * support segmentation on this frame without needing additional
3237          * work.
3238          */
3239         if (features & NETIF_F_GSO_PARTIAL) {
3240                 netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
3241                 struct net_device *dev = skb->dev;
3242 
3243                 partial_features |= dev->features & dev->gso_partial_features;
3244                 if (!skb_gso_ok(skb, features | partial_features))
3245                         features &= ~NETIF_F_GSO_PARTIAL;
3246         }
3247 
3248         BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
3249                      sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
3250 
3251         SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
3252         SKB_GSO_CB(skb)->encap_level = 0;
3253 
3254         skb_reset_mac_header(skb);
3255         skb_reset_mac_len(skb);
3256 
3257         segs = skb_mac_gso_segment(skb, features);
3258 
3259         if (unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs)))
3260                 skb_warn_bad_offload(skb);
3261 
3262         return segs;
3263 }
3264 EXPORT_SYMBOL(__skb_gso_segment);
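/*
 * Usage sketch (illustrative only, not part of the kernel sources):
 * software GSO fallback as a caller would typically do it through the
 * skb_gso_segment() wrapper (tx_path == true).  mydrv_xmit_one() is an
 * assumption; error handling is condensed.
 */
static int mydrv_sw_gso(struct sk_buff *skb, netdev_features_t features)
{
        struct sk_buff *segs, *seg, *next;

        segs = skb_gso_segment(skb, features);
        if (IS_ERR(segs))
                return PTR_ERR(segs);
        if (!segs)                      /* no segmentation was needed */
                return mydrv_xmit_one(skb);

        consume_skb(skb);               /* the original is replaced by the list */
        for (seg = segs; seg; seg = next) {
                next = seg->next;
                skb_mark_not_on_list(seg);
                mydrv_xmit_one(seg);
        }
        return 0;
}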
3265 
3266 /* Take action when hardware reception checksum errors are detected. */
3267 #ifdef CONFIG_BUG
3268 void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb)
3269 {
3270         if (net_ratelimit()) {
3271                 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
3272                 skb_dump(KERN_ERR, skb, true);
3273                 dump_stack();
3274         }
3275 }
3276 EXPORT_SYMBOL(netdev_rx_csum_fault);
3277 #endif
3278 
3279 /* XXX: check that highmem exists at all on the given machine. */
3280 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
3281 {
3282 #ifdef CONFIG_HIGHMEM
3283         int i;
3284 
3285         if (!(dev->features & NETIF_F_HIGHDMA)) {
3286                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
3287                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
3288 
3289                         if (PageHighMem(skb_frag_page(frag)))
3290                                 return 1;
3291                 }
3292         }
3293 #endif
3294         return 0;
3295 }
3296 
3297 /* If MPLS offload request, verify we are testing hardware MPLS features
3298  * instead of standard features for the netdev.
3299  */
3300 #if IS_ENABLED(CONFIG_NET_MPLS_GSO)
3301 static netdev_features_t net_mpls_features(struct sk_buff *skb,
3302                                            netdev_features_t features,
3303                                            __be16 type)
3304 {
3305         if (eth_p_mpls(type))
3306                 features &= skb->dev->mpls_features;
3307 
3308         return features;
3309 }
3310 #else
3311 static netdev_features_t net_mpls_features(struct sk_buff *skb,
3312                                            netdev_features_t features,
3313                                            __be16 type)
3314 {
3315         return features;
3316 }
3317 #endif
3318 
3319 static netdev_features_t harmonize_features(struct sk_buff *skb,
3320         netdev_features_t features)
3321 {
3322         int tmp;
3323         __be16 type;
3324 
3325         type = skb_network_protocol(skb, &tmp);
3326         features = net_mpls_features(skb, features, type);
3327 
3328         if (skb->ip_summed != CHECKSUM_NONE &&
3329             !can_checksum_protocol(features, type)) {
3330                 features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
3331         }
3332         if (illegal_highdma(skb->dev, skb))
3333                 features &= ~NETIF_F_SG;
3334 
3335         return features;
3336 }
3337 
3338 netdev_features_t passthru_features_check(struct sk_buff *skb,
3339                                           struct net_device *dev,
3340                                           netdev_features_t features)
3341 {
3342         return features;
3343 }
3344 EXPORT_SYMBOL(passthru_features_check);
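/*
 * Usage sketch (illustrative only, not part of the kernel sources):
 * a stacked or pass-through device that does not want the default VLAN
 * restriction can plug the helper above straight into its ops, the way
 * some team/bond style drivers do.  The xmit handler is an assumption.
 */
static netdev_tx_t mydrv_passthru_xmit(struct sk_buff *skb,
                                       struct net_device *dev);

static const struct net_device_ops mydrv_passthru_ops = {
        .ndo_start_xmit         = mydrv_passthru_xmit,
        .ndo_features_check     = passthru_features_check,
};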
3345 
3346 static netdev_features_t dflt_features_check(struct sk_buff *skb,
3347                                              struct net_device *dev,
3348                                              netdev_features_t features)
3349 {
3350         return vlan_features_check(skb, features);
3351 }
3352 
3353 static netdev_features_t gso_features_check(const struct sk_buff *skb,
3354                                             struct net_device *dev,
3355                                             netdev_features_t features)
3356 {
3357         u16 gso_segs = skb_shinfo(skb)->gso_segs;
3358 
3359         if (gso_segs > dev->gso_max_segs)
3360                 return features & ~NETIF_F_GSO_MASK;
3361 
3362         /* Support for GSO partial features requires software
3363          * intervention before we can actually process the packets,
3364          * so we need to strip support for any partial features now
3365          * and pull them back in after we have partially segmented
3366          * the frame.
3367          */
3368         if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
3369                 features &= ~dev->gso_partial_features;
3370 
3371         /* Make sure to clear the IPv4 ID mangling feature if the
3372          * IPv4 header has the potential to be fragmented.
3373          */
3374         if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
3375                 struct iphdr *iph = skb->encapsulation ?
3376                                     inner_ip_hdr(skb) : ip_hdr(skb);
3377 
3378                 if (!(iph->frag_off & htons(IP_DF)))
3379                         features &= ~NETIF_F_TSO_MANGLEID;
3380         }
3381 
3382         return features;
3383 }
3384 
3385 netdev_features_t netif_skb_features(struct sk_buff *skb)
3386 {
3387         struct net_device *dev = skb->dev;
3388         netdev_features_t features = dev->features;
3389 
3390         if (skb_is_gso(skb))
3391                 features = gso_features_check(skb, dev, features);
3392 
3393         /* If encapsulation offload request, verify we are testing
3394          * hardware encapsulation features instead of standard
3395          * features for the netdev
3396          */
3397         if (skb->encapsulation)
3398                 features &= dev->hw_enc_features;
3399 
3400         if (skb_vlan_tagged(skb))
3401                 features = netdev_intersect_features(features,
3402                                                      dev->vlan_features |
3403                                                      NETIF_F_HW_VLAN_CTAG_TX |
3404                                                      NETIF_F_HW_VLAN_STAG_TX);
3405 
3406         if (dev->netdev_ops->ndo_features_check)
3407                 features &= dev->netdev_ops->ndo_features_check(skb, dev,
3408                                                                 features);
3409         else
3410                 features &= dflt_features_check(skb, dev, features);
3411 
3412         return harmonize_features(skb, features);
3413 }
3414 EXPORT_SYMBOL(netif_skb_features);
3415 
3416 static int xmit_one(struct sk_buff *skb, struct net_device *dev,
3417                     struct netdev_queue *txq, bool more)
3418 {
3419         unsigned int len;
3420         int rc;
3421 
3422         if (dev_nit_active(dev))
3423                 dev_queue_xmit_nit(skb, dev);
3424 
3425         len = skb->len;
3426         trace_net_dev_start_xmit(skb, dev);
3427         rc = netdev_start_xmit(skb, dev, txq, more);
3428         trace_net_dev_xmit(skb, rc, dev, len);
3429 
3430         return rc;
3431 }
3432 
3433 struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
3434                                     struct netdev_queue *txq, int *ret)
3435 {
3436         struct sk_buff *skb = first;
3437         int rc = NETDEV_TX_OK;
3438 
3439         while (skb) {
3440                 struct sk_buff *next = skb->next;
3441 
3442                 skb_mark_not_on_list(skb);
3443                 rc = xmit_one(skb, dev, txq, next != NULL);
3444                 if (unlikely(!dev_xmit_complete(rc))) {
3445                         skb->next = next;
3446                         goto out;
3447                 }
3448 
3449                 skb = next;
3450                 if (netif_tx_queue_stopped(txq) && skb) {
3451                         rc = NETDEV_TX_BUSY;
3452                         break;
3453                 }
3454         }
3455 
3456 out:
3457         *ret = rc;
3458         return skb;
3459 }
3460 
3461 static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
3462                                           netdev_features_t features)
3463 {
3464         if (skb_vlan_tag_present(skb) &&
3465             !vlan_hw_offload_capable(features, skb->vlan_proto))
3466                 skb = __vlan_hwaccel_push_inside(skb);
3467         return skb;
3468 }
3469 
3470 int skb_csum_hwoffload_help(struct sk_buff *skb,
3471                             const netdev_features_t features)
3472 {
3473         if (unlikely(skb->csum_not_inet))
3474                 return !!(features & NETIF_F_SCTP_CRC) ? 0 :
3475                         skb_crc32c_csum_help(skb);
3476 
3477         return !!(features & NETIF_F_CSUM_MASK) ? 0 : skb_checksum_help(skb);
3478 }
3479 EXPORT_SYMBOL(skb_csum_hwoffload_help);
3480 
3481 static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev, bool *again)
3482 {
3483         netdev_features_t features;
3484 
3485         features = netif_skb_features(skb);
3486         skb = validate_xmit_vlan(skb, features);
3487         if (unlikely(!skb))
3488                 goto out_null;
3489 
3490         skb = sk_validate_xmit_skb(skb, dev);
3491         if (unlikely(!skb))
3492                 goto out_null;
3493 
3494         if (netif_needs_gso(skb, features)) {
3495                 struct sk_buff *segs;
3496 
3497                 segs = skb_gso_segment(skb, features);
3498                 if (IS_ERR(segs)) {
3499                         goto out_kfree_skb;
3500                 } else if (segs) {
3501                         consume_skb(skb);
3502                         skb = segs;
3503                 }
3504         } else {
3505                 if (skb_needs_linearize(skb, features) &&
3506                     __skb_linearize(skb))
3507                         goto out_kfree_skb;
3508 
3509                 /* If packet is not checksummed and device does not
3510                  * support checksumming for this protocol, complete
3511                  * checksumming here.
3512                  */
3513                 if (skb->ip_summed == CHECKSUM_PARTIAL) {
3514                         if (skb->encapsulation)
3515                                 skb_set_inner_transport_header(skb,
3516                                                                skb_checksum_start_offset(skb));
3517                         else
3518                                 skb_set_transport_header(skb,
3519                                                          skb_checksum_start_offset(skb));
3520                         if (skb_csum_hwoffload_help(skb, features))
3521                                 goto out_kfree_skb;
3522                 }
3523         }
3524 
3525         skb = validate_xmit_xfrm(skb, features, again);
3526 
3527         return skb;
3528 
3529 out_kfree_skb:
3530         kfree_skb(skb);
3531 out_null:
3532         atomic_long_inc(&dev->tx_dropped);
3533         return NULL;
3534 }
3535 
3536 struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again)
3537 {
3538         struct sk_buff *next, *head = NULL, *tail;
3539 
3540         for (; skb != NULL; skb = next) {
3541                 next = skb->next;
3542                 skb_mark_not_on_list(skb);
3543 
3544                 /* In case the skb won't be segmented, point it to itself. */
3545                 skb->prev = skb;
3546 
3547                 skb = validate_xmit_skb(skb, dev, again);
3548                 if (!skb)
3549                         continue;
3550 
3551                 if (!head)
3552                         head = skb;
3553                 else
3554                         tail->next = skb;
3555                 /* If the skb was segmented, skb->prev points to
3556                  * the last segment. If not, it still points to the skb itself.
3557                  */
3558                 tail = skb->prev;
3559         }
3560         return head;
3561 }
3562 EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
3563 
3564 static void qdisc_pkt_len_init(struct sk_buff *skb)
3565 {
3566         const struct skb_shared_info *shinfo = skb_shinfo(skb);
3567 
3568         qdisc_skb_cb(skb)->pkt_len = skb->len;
3569 
3570         /* To get a more precise estimate of bytes sent on the wire,
3571          * we add the header size of all segments to pkt_len.
3572          */
3573         if (shinfo->gso_size && skb_transport_header_was_set(skb)) {
3574                 unsigned int hdr_len;
3575                 u16 gso_segs = shinfo->gso_segs;
3576 
3577                 /* mac layer + network layer */
3578                 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
3579 
3580                 /* + transport layer */
3581                 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
3582                         const struct tcphdr *th;
3583                         struct tcphdr _tcphdr;
3584 
3585                         th = skb_header_pointer(skb, skb_transport_offset(skb),
3586                                                 sizeof(_tcphdr), &_tcphdr);
3587                         if (likely(th))
3588                                 hdr_len += __tcp_hdrlen(th);
3589                 } else {
3590                         struct udphdr _udphdr;
3591 
3592                         if (skb_header_pointer(skb, skb_transport_offset(skb),
3593                                                sizeof(_udphdr), &_udphdr))
3594                                 hdr_len += sizeof(struct udphdr);
3595                 }
3596 
3597                 if (shinfo->gso_type & SKB_GSO_DODGY)
3598                         gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
3599                                                 shinfo->gso_size);
3600 
3601                 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
3602         }
3603 }
3604 
3605 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
3606                                  struct net_device *dev,
3607                                  struct netdev_queue *txq)
3608 {
3609         spinlock_t *root_lock = qdisc_lock(q);
3610         struct sk_buff *to_free = NULL;
3611         bool contended;
3612         int rc;
3613 
3614         qdisc_calculate_pkt_len(skb, q);
3615 
3616         if (q->flags & TCQ_F_NOLOCK) {
3617                 rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
3618                 qdisc_run(q);
3619 
3620                 if (unlikely(to_free))
3621                         kfree_skb_list(to_free);
3622                 return rc;
3623         }
3624 
3625         /*
3626          * Heuristic to force contended enqueues to serialize on a
3627          * separate lock before trying to get the qdisc main lock.
3628          * This permits the qdisc->running owner to get the lock more
3629          * often and dequeue packets faster.
3630          */
3631         contended = qdisc_is_running(q);
3632         if (unlikely(contended))
3633                 spin_lock(&q->busylock);
3634 
3635         spin_lock(root_lock);
3636         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
3637                 __qdisc_drop(skb, &to_free);
3638                 rc = NET_XMIT_DROP;
3639         } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
3640                    qdisc_run_begin(q)) {
3641                 /*
3642                  * This is a work-conserving queue; there are no old skbs
3643                  * waiting to be sent out; and the qdisc is not running -
3644                  * xmit the skb directly.
3645                  */
3646 
3647                 qdisc_bstats_update(q, skb);
3648 
3649                 if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
3650                         if (unlikely(contended)) {
3651                                 spin_unlock(&q->busylock);
3652                                 contended = false;
3653                         }
3654                         __qdisc_run(q);
3655                 }
3656 
3657                 qdisc_run_end(q);
3658                 rc = NET_XMIT_SUCCESS;
3659         } else {
3660                 rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
3661                 if (qdisc_run_begin(q)) {
3662                         if (unlikely(contended)) {
3663                                 spin_unlock(&q->busylock);
3664                                 contended = false;
3665                         }
3666                         __qdisc_run(q);
3667                         qdisc_run_end(q);
3668                 }
3669         }
3670         spin_unlock(root_lock);
3671         if (unlikely(to_free))
3672                 kfree_skb_list(to_free);
3673         if (unlikely(contended))
3674                 spin_unlock(&q->busylock);
3675         return rc;
3676 }
3677 
3678 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
3679 static void skb_update_prio(struct sk_buff *skb)
3680 {
3681         const struct netprio_map *map;
3682         const struct sock *sk;
3683         unsigned int prioidx;
3684 
3685         if (skb->priority)
3686                 return;
3687         map = rcu_dereference_bh(skb->dev->priomap);
3688         if (!map)
3689                 return;
3690         sk = skb_to_full_sk(skb);
3691         if (!sk)
3692                 return;
3693 
3694         prioidx = sock_cgroup_prioidx(&sk->sk_cgrp_data);
3695 
3696         if (prioidx < map->priomap_len)
3697                 skb->priority = map->priomap[prioidx];
3698 }
3699 #else
3700 #define skb_update_prio(skb)
3701 #endif
3702 
3703 /**
3704  *      dev_loopback_xmit - loop back @skb
3705  *      @net: network namespace this loopback is happening in
3706  *      @sk:  sk needed to be a netfilter okfn
3707  *      @skb: buffer to transmit
3708  */
3709 int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
3710 {
3711         skb_reset_mac_header(skb);
3712         __skb_pull(skb, skb_network_offset(skb));
3713         skb->pkt_type = PACKET_LOOPBACK;
3714         skb->ip_summed = CHECKSUM_UNNECESSARY;
3715         WARN_ON(!skb_dst(skb));
3716         skb_dst_force(skb);
3717         netif_rx_ni(skb);
3718         return 0;
3719 }
3720 EXPORT_SYMBOL(dev_loopback_xmit);
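/*
 * Usage sketch (illustrative only, not part of the kernel sources):
 * dev_loopback_xmit() is meant to be used as the okfn of an NF_HOOK
 * invocation, roughly the way the IPv4 multicast output path loops a
 * copy of a frame back to local listeners.  The surrounding clone and
 * error handling is condensed here.
 */
static void mymod_loop_back_copy(struct net *net, struct sock *sk,
                                 struct sk_buff *skb)
{
        struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

        if (newskb)
                NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
                        net, sk, newskb, NULL, newskb->dev,
                        dev_loopback_xmit);
}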
3721 
3722 #ifdef CONFIG_NET_EGRESS
3723 static struct sk_buff *
3724 sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
3725 {
3726         struct mini_Qdisc *miniq = rcu_dereference_bh(dev->miniq_egress);
3727         struct tcf_result cl_res;
3728 
3729         if (!miniq)
3730                 return skb;
3731 
3732         /* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */
3733         mini_qdisc_bstats_cpu_update(miniq, skb);
3734 
3735         switch (tcf_classify(skb, miniq->filter_list, &cl_res, false)) {
3736         case TC_ACT_OK:
3737         case TC_ACT_RECLASSIFY:
3738                 skb->tc_index = TC_H_MIN(cl_res.classid);
3739                 break;
3740         case TC_ACT_SHOT:
3741                 mini_qdisc_qstats_cpu_drop(miniq);
3742                 *ret = NET_XMIT_DROP;
3743                 kfree_skb(skb);
3744                 return NULL;
3745         case TC_ACT_STOLEN:
3746         case TC_ACT_QUEUED:
3747         case TC_ACT_TRAP:
3748                 *ret = NET_XMIT_SUCCESS;
3749                 consume_skb(skb);
3750                 return NULL;
3751         case TC_ACT_REDIRECT:
3752                 /* No need to push/pop skb's mac_header here on egress! */
3753                 skb_do_redirect(skb);
3754                 *ret = NET_XMIT_SUCCESS;
3755                 return NULL;
3756         default:
3757                 break;
3758         }
3759 
3760         return skb;
3761 }
3762 #endif /* CONFIG_NET_EGRESS */
3763 
3764 #ifdef CONFIG_XPS
3765 static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb,
3766                                struct xps_dev_maps *dev_maps, unsigned int tci)
3767 {
3768         struct xps_map *map;
3769         int queue_index = -1;
3770 
3771         if (dev->num_tc) {
3772                 tci *= dev->num_tc;
3773                 tci += netdev_get_prio_tc_map(dev, skb->priority);
3774         }
3775 
3776         map = rcu_dereference(dev_maps->attr_map[tci]);
3777         if (map) {
3778                 if (map->len == 1)
3779                         queue_index = map->queues[0];
3780                 else
3781                         queue_index = map->queues[reciprocal_scale(
3782                                                 skb_get_hash(skb), map->len)];
3783                 if (unlikely(queue_index >= dev->real_num_tx_queues))
3784                         queue_index = -1;
3785         }
3786         return queue_index;
3787 }
3788 #endif
3789 
3790 static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev,
3791                          struct sk_buff *skb)
3792 {
3793 #ifdef CONFIG_XPS
3794         struct xps_dev_maps *dev_maps;
3795         struct sock *sk = skb->sk;
3796         int queue_index = -1;
3797 
3798         if (!static_key_false(&xps_needed))
3799                 return -1;
3800 
3801         rcu_read_lock();
3802         if (!static_key_false(&xps_rxqs_needed))
3803                 goto get_cpus_map;
3804 
3805         dev_maps = rcu_dereference(sb_dev->xps_rxqs_map);
3806         if (dev_maps) {
3807                 int tci = sk_rx_queue_get(sk);
3808 
3809                 if (tci >= 0 && tci < dev->num_rx_queues)
3810                         queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
3811                                                           tci);
3812         }
3813 
3814 get_cpus_map:
3815         if (queue_index < 0) {
3816                 dev_maps = rcu_dereference(sb_dev->xps_cpus_map);
3817                 if (dev_maps) {
3818                         unsigned int tci = skb->sender_cpu - 1;
3819 
3820                         queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
3821                                                           tci);
3822                 }
3823         }
3824         rcu_read_unlock();
3825 
3826         return queue_index;
3827 #else
3828         return -1;
3829 #endif
3830 }
3831 
3832 u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb,
3833                      struct net_device *sb_dev)
3834 {
3835         return 0;
3836 }
3837 EXPORT_SYMBOL(dev_pick_tx_zero);
3838 
3839 u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb,
3840                        struct net_device *sb_dev)
3841 {
3842         return (u16)raw_smp_processor_id() % dev->real_num_tx_queues;
3843 }
3844 EXPORT_SYMBOL(dev_pick_tx_cpu_id);
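/*
 * Usage sketch (illustrative only, not part of the kernel sources):
 * a multi-queue driver that simply wants to spread flows by submitting
 * CPU can plug the helper above into its ops as ndo_select_queue.
 * The xmit handler name is an assumption.
 */
static netdev_tx_t mydrv_mq_xmit(struct sk_buff *skb,
                                 struct net_device *dev);

static const struct net_device_ops mydrv_mq_ops = {
        .ndo_start_xmit         = mydrv_mq_xmit,
        .ndo_select_queue       = dev_pick_tx_cpu_id,
};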
3845 
3846 u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
3847                      struct net_device *sb_dev)
3848 {
3849         struct sock *sk = skb->sk;
3850         int queue_index = sk_tx_queue_get(sk);
3851 
3852         sb_dev = sb_dev ? : dev;
3853 
3854         if (queue_index < 0 || skb->ooo_okay ||
3855             queue_index >= dev->real_num_tx_queues) {
3856                 int new_index = get_xps_queue(dev, sb_dev, skb);
3857 
3858                 if (new_index < 0)
3859                         new_index = skb_tx_hash(dev, sb_dev, skb);
3860 
3861                 if (queue_index != new_index && sk &&
3862                     sk_fullsock(sk) &&
3863                     rcu_access_pointer(sk->sk_dst_cache))
3864                         sk_tx_queue_set(sk, new_index);
3865 
3866                 queue_index = new_index;
3867         }
3868 
3869         return queue_index;
3870 }
3871 EXPORT_SYMBOL(netdev_pick_tx);
3872 
3873 struct netdev_queue *netdev_core_pick_tx(struct net_device *dev,
3874                                          struct sk_buff *skb,
3875                                          struct net_device *sb_dev)
3876 {
3877         int queue_index = 0;
3878 
3879 #ifdef CONFIG_XPS
3880         u32 sender_cpu = skb->sender_cpu - 1;
3881 
3882         if (sender_cpu >= (u32)NR_CPUS)
3883                 skb->sender_cpu = raw_smp_processor_id() + 1;
3884 #endif
3885 
3886         if (dev->real_num_tx_queues != 1) {
3887                 const struct net_device_ops *ops = dev->netdev_ops;
3888 
3889                 if (ops->ndo_select_queue)
3890                         queue_index = ops->ndo_select_queue(dev, skb, sb_dev);
3891                 else
3892                         queue_index = netdev_pick_tx(dev, skb, sb_dev);
3893 
3894                 queue_index = netdev_cap_txqueue(dev, queue_index);
3895         }
3896 
3897         skb_set_queue_mapping(skb, queue_index);
3898         return netdev_get_tx_queue(dev, queue_index);
3899 }
3900 
3901 /**
3902  *      __dev_queue_xmit - transmit a buffer
3903  *      @skb: buffer to transmit
3904  *      @sb_dev: subordinate device used for L2 forwarding offload
3905  *
3906  *      Queue a buffer for transmission to a network device. The caller must
3907  *      have set the device and priority and built the buffer before calling
3908  *      this function. The function can be called from an interrupt.
3909  *
3910  *      A negative errno code is returned on a failure. A success does not
3911  *      guarantee the frame will be transmitted as it may be dropped due
3912  *      to congestion or traffic shaping.
3913  *
3914  * -----------------------------------------------------------------------------------
3915  *      I notice this method can also return errors from the queue disciplines,
3916  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
3917  *      be positive.
3918  *
3919  *      Regardless of the return value, the skb is consumed, so it is currently
3920  *      difficult to retry a send to this method.  (You can bump the ref count
3921  *      before sending to hold a reference for retry if you are careful.)
3922  *
3923  *      When calling this method, interrupts MUST be enabled.  This is because
3924  *      the BH enable code must have IRQs enabled so that it will not deadlock.
3925  *          --BLG
3926  */
3927 static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
3928 {
3929         struct net_device *dev = skb->dev;
3930         struct netdev_queue *txq;
3931         struct Qdisc *q;
3932         int rc = -ENOMEM;
3933         bool again = false;
3934 
3935         skb_reset_mac_header(skb);
3936 
3937         if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
3938                 __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
3939 
3940         /* Disable soft irqs for various locks below. This also
3941          * stops preemption for RCU.
3942          */
3943         rcu_read_lock_bh();
3944 
3945         skb_update_prio(skb);
3946 
3947         qdisc_pkt_len_init(skb);
3948 #ifdef CONFIG_NET_CLS_ACT
3949         skb->tc_at_ingress = 0;
3950 # ifdef CONFIG_NET_EGRESS
3951         if (static_branch_unlikely(&egress_needed_key)) {
3952                 skb = sch_handle_egress(skb, &rc, dev);
3953                 if (!skb)
3954                         goto out;
3955         }
3956 # endif
3957 #endif
3958         /* If the device/qdisc don't need skb->dst, release it right now
3959          * while it's hot in this CPU's cache.
3960          */
3961         if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
3962                 skb_dst_drop(skb);
3963         else
3964                 skb_dst_force(skb);
3965 
3966         txq = netdev_core_pick_tx(dev, skb, sb_dev);
3967         q = rcu_dereference_bh(txq->qdisc);
3968 
3969         trace_net_dev_queue(skb);
3970         if (q->enqueue) {
3971                 rc = __dev_xmit_skb(skb, q, dev, txq);
3972                 goto out;
3973         }
3974 
3975         /* The device has no queue. Common case for software devices:
3976          * loopback, all sorts of tunnels...
3977          *
3978          * Really, it is unlikely that netif_tx_lock protection is necessary
3979          * here.  (f.e. loopback and IP tunnels are clean, ignoring statistics
3980          * counters.)
3981          * However, it is possible that they rely on the protection
3982          * we provide here.
3983          *
3984          * Check this and take the lock.  It is not prone to deadlocks.
3985          * Either that, or use the noqueue qdisc; it is even simpler 8)
3986          */
3987         if (dev->flags & IFF_UP) {
3988                 int cpu = smp_processor_id(); /* ok because BHs are off */
3989 
3990                 if (txq->xmit_lock_owner != cpu) {
3991                         if (dev_xmit_recursion())
3992                                 goto recursion_alert;
3993 
3994                         skb = validate_xmit_skb(skb, dev, &again);
3995                         if (!skb)
3996                                 goto out;
3997 
3998                         HARD_TX_LOCK(dev, txq, cpu);
3999 
4000                         if (!netif_xmit_stopped(txq)) {
4001                                 dev_xmit_recursion_inc();
4002                                 skb = dev_hard_start_xmit(skb, dev, txq, &rc);
4003                                 dev_xmit_recursion_dec();
4004                                 if (dev_xmit_complete(rc)) {
4005                                         HARD_TX_UNLOCK(dev, txq);
4006                                         goto out;
4007                                 }
4008                         }
4009                         HARD_TX_UNLOCK(dev, txq);
4010                         net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
4011                                              dev->name);
4012                 } else {
4013                         /* Recursion is detected! It is possible,
4014                          * unfortunately
4015                          */
4016 recursion_alert:
4017                         net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
4018                                              dev->name);
4019                 }
4020         }
4021 
4022         rc = -ENETDOWN;
4023         rcu_read_unlock_bh();
4024 
4025         atomic_long_inc(&dev->tx_dropped);
4026         kfree_skb_list(skb);
4027         return rc;
4028 out:
4029         rcu_read_unlock_bh();
4030         return rc;
4031 }
4032 
4033 int dev_queue_xmit(struct sk_buff *skb)
4034 {
4035         return __dev_queue_xmit(skb, NULL);
4036 }
4037 EXPORT_SYMBOL(dev_queue_xmit);
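/*
 * Usage sketch (illustrative only, not part of the kernel sources):
 * minimal transmit of a pre-built Ethernet frame from a module through
 * dev_queue_xmit().  That "buf"/"len" hold a complete L2 frame and that
 * the device is Ethernet are assumptions; the important points are the
 * link-layer headroom, setting skb->dev, and that the skb is always
 * consumed regardless of the return value.
 */
static int mymod_send_frame(struct net_device *dev,
                            const void *buf, unsigned int len)
{
        struct sk_buff *skb;

        skb = alloc_skb(LL_RESERVED_SPACE(dev) + len, GFP_ATOMIC);
        if (!skb)
                return -ENOMEM;

        skb_reserve(skb, LL_RESERVED_SPACE(dev));
        skb_put_data(skb, buf, len);            /* complete Ethernet frame */
        skb_reset_mac_header(skb);
        skb->dev = dev;
        skb->protocol = eth_hdr(skb)->h_proto;

        /* May return NET_XMIT_* codes as well as negative errnos. */
        return dev_queue_xmit(skb);
}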
4038 
4039 int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev)
4040 {
4041         return __dev_queue_xmit(skb, sb_dev);
4042 }
4043 EXPORT_SYMBOL(dev_queue_xmit_accel);
4044 
4045 int dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
4046 {
4047         struct net_device *dev = skb->dev;
4048         struct sk_buff *orig_skb = skb;
4049         struct netdev_queue *txq;
4050         int ret = NETDEV_TX_BUSY;
4051         bool again = false;
4052 
4053         if (unlikely(!netif_running(dev) ||
4054                      !netif_carrier_ok(dev)))
4055                 goto drop;
4056 
4057         skb = validate_xmit_skb_list(skb, dev, &again);
4058         if (skb != orig_skb)
4059                 goto drop;
4060 
4061         skb_set_queue_mapping(skb, queue_id);
4062         txq = skb_get_tx_queue(dev, skb);
4063 
4064         local_bh_disable();
4065 
4066         HARD_TX_LOCK(dev, txq, smp_processor_id());
4067         if (!netif_xmit_frozen_or_drv_stopped(txq))
4068                 ret = netdev_start_xmit(skb, dev, txq, false);
4069         HARD_TX_UNLOCK(dev, txq);
4070 
4071         local_bh_enable();
4072 
4073         if (!dev_xmit_complete(ret))
4074                 kfree_skb(skb);
4075 
4076         return ret;
4077 drop:
4078         atomic_long_inc(&dev->tx_dropped);
4079         kfree_skb_list(skb);
4080         return NET_XMIT_DROP;
4081 }
4082 EXPORT_SYMBOL(dev_direct_xmit);
4083 
4084 /*************************************************************************
4085  *                      Receiver routines
4086  *************************************************************************/
4087 
4088 int netdev_max_backlog __read_mostly = 1000;
4089 EXPORT_SYMBOL(netdev_max_backlog);
4090 
4091 int netdev_tstamp_prequeue __read_mostly = 1;
4092 int netdev_budget __read_mostly = 300;
4093 /* Must be at least 2 jiffies to guarantee 1 jiffy timeout */
4094 unsigned int __read_mostly netdev_budget_usecs = 2 * USEC_PER_SEC / HZ;
4095 int weight_p __read_mostly = 64;           /* old backlog weight */
4096 int dev_weight_rx_bias __read_mostly = 1;  /* bias for backlog weight */
4097 int dev_weight_tx_bias __read_mostly = 1;  /* bias for output_queue quota */
4098 int dev_rx_weight __read_mostly = 64;
4099 int dev_tx_weight __read_mostly = 64;
4100 /* Maximum number of GRO_NORMAL skbs to batch up for list-RX */
4101 int gro_normal_batch __read_mostly = 8;
4102 
4103 /* Called with irq disabled */
4104 static inline void ____napi_schedule(struct softnet_data *sd,
4105                                      struct napi_struct *napi)
4106 {
4107         list_add_tail(&napi->poll_list, &sd->poll_list);
4108         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4109 }
4110 
4111 #ifdef CONFIG_RPS
4112 
4113 /* One global table that all flow-based protocols share. */
4114 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
4115 EXPORT_SYMBOL(rps_sock_flow_table);
4116 u32 rps_cpu_mask __read_mostly;
4117 EXPORT_SYMBOL(rps_cpu_mask);
4118 
4119 struct static_key_false rps_needed __read_mostly;
4120 EXPORT_SYMBOL(rps_needed);
4121 struct static_key_false rfs_needed __read_mostly;
4122 EXPORT_SYMBOL(rfs_needed);
4123 
4124 static struct rps_dev_flow *
4125 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
4126             struct rps_dev_flow *rflow, u16 next_cpu)
4127 {
4128         if (next_cpu < nr_cpu_ids) {
4129 #ifdef CONFIG_RFS_ACCEL
4130                 struct netdev_rx_queue *rxqueue;
4131                 struct rps_dev_flow_table *flow_table;
4132                 struct rps_dev_flow *old_rflow;
4133                 u32 flow_id;
4134                 u16 rxq_index;
4135                 int rc;
4136 
4137                 /* Should we steer this flow to a different hardware queue? */
4138                 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
4139                     !(dev->features & NETIF_F_NTUPLE))
4140                         goto out;
4141                 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
4142                 if (rxq_index == skb_get_rx_queue(skb))
4143                         goto out;
4144 
4145                 rxqueue = dev->_rx + rxq_index;
4146                 flow_table = rcu_dereference(rxqueue->rps_flow_table);
4147                 if (!flow_table)
4148                         goto out;
4149                 flow_id = skb_get_hash(skb) & flow_table->mask;
4150                 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
4151                                                         rxq_index, flow_id);
4152                 if (rc < 0)
4153                         goto out;
4154                 old_rflow = rflow;
4155                 rflow = &flow_table->flows[flow_id];
4156                 rflow->filter = rc;
4157                 if (old_rflow->filter == rflow->filter)
4158                         old_rflow->filter = RPS_NO_FILTER;
4159         out:
4160 #endif
4161                 rflow->last_qtail =
4162                         per_cpu(softnet_data, next_cpu).input_queue_head;
4163         }
4164 
4165         rflow->cpu = next_cpu;
4166         return rflow;
4167 }
4168 
4169 /*
4170  * get_rps_cpu is called from netif_receive_skb and returns the target
4171  * CPU from the RPS map of the receiving queue for a given skb.
4172  * rcu_read_lock must be held on entry.
4173  */
4174 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
4175                        struct rps_dev_flow **rflowp)
4176 {
4177         const struct rps_sock_flow_table *sock_flow_table;
4178         struct netdev_rx_queue *rxqueue = dev->_rx;
4179         struct rps_dev_flow_table *flow_table;
4180         struct rps_map *map;
4181         int cpu = -1;
4182         u32 tcpu;
4183         u32 hash;
4184 
4185         if (skb_rx_queue_recorded(skb)) {
4186                 u16 index = skb_get_rx_queue(skb);
4187 
4188                 if (unlikely(index >= dev->real_num_rx_queues)) {
4189                         WARN_ONCE(dev->real_num_rx_queues > 1,
4190                                   "%s received packet on queue %u, but number "
4191                                   "of RX queues is %u\n",
4192                                   dev->name, index, dev->real_num_rx_queues);
4193                         goto done;
4194                 }
4195                 rxqueue += index;
4196         }
4197 
4198         /* Avoid computing hash if RFS/RPS is not active for this rxqueue */
4199 
4200         flow_table = rcu_dereference(rxqueue->rps_flow_table);
4201         map = rcu_dereference(rxqueue->rps_map);
4202         if (!flow_table && !map)
4203                 goto done;
4204 
4205         skb_reset_network_header(skb);
4206         hash = skb_get_hash(skb);
4207         if (!hash)
4208                 goto done;
4209 
4210         sock_flow_table = rcu_dereference(rps_sock_flow_table);
4211         if (flow_table && sock_flow_table) {
4212                 struct rps_dev_flow *rflow;
4213                 u32 next_cpu;
4214                 u32 ident;
4215 
4216                 /* First check the global flow table for a match */
4217                 ident = sock_flow_table->ents[hash & sock_flow_table->mask];
4218                 if ((ident ^ hash) & ~rps_cpu_mask)
4219                         goto try_rps;
4220 
4221                 next_cpu = ident & rps_cpu_mask;
4222 
4223                 /* OK, now we know there is a match,
4224                  * we can look at the local (per receive queue) flow table
4225                  */
4226                 rflow = &flow_table->flows[hash & flow_table->mask];
4227                 tcpu = rflow->cpu;
4228 
4229                 /*
4230                  * If the desired CPU (where last recvmsg was done) is
4231                  * different from current CPU (one in the rx-queue flow
4232                  * table entry), switch if one of the following holds:
4233                  *   - Current CPU is unset (>= nr_cpu_ids).
4234                  *   - Current CPU is offline.
4235                  *   - The current CPU's queue tail has advanced beyond the
4236                  *     last packet that was enqueued using this table entry.
4237                  *     This guarantees that all previous packets for the flow
4238                  *     have been dequeued, thus preserving in order delivery.
4239                  */
4240                 if (unlikely(tcpu != next_cpu) &&
4241                     (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
4242                      ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
4243                       rflow->last_qtail)) >= 0)) {
4244                         tcpu = next_cpu;
4245                         rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
4246                 }
4247 
4248                 if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
4249                         *rflowp = rflow;
4250                         cpu = tcpu;
4251                         goto done;
4252                 }
4253         }
4254 
4255 try_rps:
4256 
4257         if (map) {
4258                 tcpu = map->cpus[reciprocal_scale(hash, map->len)];
4259                 if (cpu_online(tcpu)) {
4260                         cpu = tcpu;
4261                         goto done;
4262                 }
4263         }
4264 
4265 done:
4266         return cpu;
4267 }
4268 
4269 #ifdef CONFIG_RFS_ACCEL
4270 
4271 /**
4272  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
4273  * @dev: Device on which the filter was set
4274  * @rxq_index: RX queue index
4275  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
4276  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
4277  *
4278  * Drivers that implement ndo_rx_flow_steer() should periodically call
4279  * this function for each installed filter and remove the filters for
4280  * which it returns %true.
4281  */
4282 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
4283                          u32 flow_id, u16 filter_id)
4284 {
4285         struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
4286         struct rps_dev_flow_table *flow_table;
4287         struct rps_dev_flow *rflow;
4288         bool expire = true;
4289         unsigned int cpu;
4290 
4291         rcu_read_lock();
4292         flow_table = rcu_dereference(rxqueue->rps_flow_table);
4293         if (flow_table && flow_id <= flow_table->mask) {
4294                 rflow = &flow_table->flows[flow_id];
4295                 cpu = READ_ONCE(rflow->cpu);
4296                 if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
4297                     ((int)(per_cpu(softnet_data, cpu).input_queue_head -
4298                            rflow->last_qtail) <
4299                      (int)(10 * flow_table->mask)))
4300                         expire = false;
4301         }
4302         rcu_read_unlock();
4303         return expire;
4304 }
4305 EXPORT_SYMBOL(rps_may_expire_flow);
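/*
 * Usage sketch (illustrative only, not part of the kernel sources):
 * the periodic expiry scan a driver implementing ndo_rx_flow_steer()
 * is expected to run, per the kerneldoc above.  The filter bookkeeping
 * structure is an assumption; only the rps_may_expire_flow() contract
 * comes from this file.
 */
struct mydrv_rfs_filter {
        bool    in_use;
        u16     rxq_index;
        u32     flow_id;        /* flow_id passed to ndo_rx_flow_steer() */
        u16     filter_id;      /* value returned by ndo_rx_flow_steer() */
};

static void mydrv_expire_rfs_filters(struct net_device *netdev,
                                     struct mydrv_rfs_filter *filters,
                                     unsigned int n)
{
        unsigned int i;

        for (i = 0; i < n; i++) {
                struct mydrv_rfs_filter *f = &filters[i];

                if (!f->in_use)
                        continue;
                if (rps_may_expire_flow(netdev, f->rxq_index,
                                        f->flow_id, f->filter_id)) {
                        /* ... remove the hardware filter, then ... */
                        f->in_use = false;
                }
        }
}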
4306 
4307 #endif /* CONFIG_RFS_ACCEL */
4308 
4309 /* Called from hardirq (IPI) context */
4310 static void rps_trigger_softirq(void *data)
4311 {
4312         struct softnet_data *sd = data;
4313 
4314         ____napi_schedule(sd, &sd->backlog);
4315         sd->received_rps++;
4316 }
4317 
4318 #endif /* CONFIG_RPS */
4319 
4320 /*
4321  * Check whether this softnet_data structure belongs to another CPU.
4322  * If so, queue it on our IPI list and return 1.
4323  * If not, return 0.
4324  */
4325 static int rps_ipi_queued(struct softnet_data *sd)
4326 {
4327 #ifdef CONFIG_RPS
4328         struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
4329 
4330         if (sd != mysd) {
4331                 sd->rps_ipi_next = mysd->rps_ipi_list;
4332                 mysd->rps_ipi_list = sd;
4333 
4334                 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4335                 return 1;
4336         }
4337 #endif /* CONFIG_RPS */
4338         return 0;
4339 }
4340 
4341 #ifdef CONFIG_NET_FLOW_LIMIT
4342 int netdev_flow_limit_table_len __read_mostly = (1 << 12);
4343 #endif
4344 
4345 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
4346 {
4347 #ifdef CONFIG_NET_FLOW_LIMIT
4348         struct sd_flow_limit *fl;
4349         struct softnet_data *sd;
4350         unsigned int old_flow, new_flow;
4351 
4352         if (qlen < (netdev_max_backlog >> 1))
4353                 return false;
4354 
4355         sd = this_cpu_ptr(&softnet_data);
4356 
4357         rcu_read_lock();
4358         fl = rcu_dereference(sd->flow_limit);
4359         if (fl) {
4360                 new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
4361                 old_flow = fl->history[fl->history_head];
4362                 fl->history[fl->history_head] = new_flow;
4363 
4364                 fl->history_head++;
4365                 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
4366 
4367                 if (likely(fl->buckets[old_flow]))
4368                         fl->buckets[old_flow]--;
4369 
4370                 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
4371                         fl->count++;
4372                         rcu_read_unlock();
4373                         return true;
4374                 }
4375         }
4376         rcu_read_unlock();
4377 #endif
4378         return false;
4379 }
4380 
4381 /*
4382  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
4383  * queue (may be a remote CPU queue).
4384  */
4385 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
4386                               unsigned int *qtail)
4387 {
4388         struct softnet_data *sd;
4389         unsigned long flags;
4390         unsigned int qlen;
4391 
4392         sd = &per_cpu(softnet_data, cpu);
4393 
4394         local_irq_save(flags);
4395 
4396         rps_lock(sd);
4397         if (!netif_running(skb->dev))
4398                 goto drop;
4399         qlen = skb_queue_len(&sd->input_pkt_queue);
4400         if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
4401                 if (qlen) {
4402 enqueue:
4403                         __skb_queue_tail(&sd->input_pkt_queue, skb);
4404                         input_queue_tail_incr_save(sd, qtail);
4405                         rps_unlock(sd);
4406                         local_irq_restore(flags);
4407                         return NET_RX_SUCCESS;
4408                 }
4409 
4410                 /* Schedule NAPI for the backlog device.
4411                  * We can use a non-atomic operation since we own the queue lock.
4412                  */
4413                 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
4414                         if (!rps_ipi_queued(sd))
4415                                 ____napi_schedule(sd, &sd->backlog);
4416                 }
4417                 goto enqueue;
4418         }
4419 
4420 drop:
4421         sd->dropped++;
4422         rps_unlock(sd);
4423 
4424         local_irq_restore(flags);
4425 
4426         atomic_long_inc(&skb->dev->rx_dropped);
4427         kfree_skb(skb);
4428         return NET_RX_DROP;
4429 }
4430 
4431 static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb)
4432 {
4433         struct net_device *dev = skb->dev;
4434         struct netdev_rx_queue *rxqueue;
4435 
4436         rxqueue = dev->_rx;
4437 
4438         if (skb_rx_queue_recorded(skb)) {
4439                 u16 index = skb_get_rx_queue(skb);
4440 
4441                 if (unlikely(index >= dev->real_num_rx_queues)) {
4442                         WARN_ONCE(dev->real_num_rx_queues > 1,
4443                                   "%s received packet on queue %u, but number "
4444                                   "of RX queues is %u\n",
4445                                   dev->name, index, dev->real_num_rx_queues);
4446 
4447                         return rxqueue; /* Return first rxqueue */
4448                 }
4449                 rxqueue += index;
4450         }
4451         return rxqueue;
4452 }
4453 
4454 static u32 netif_receive_generic_xdp(struct sk_buff *skb,
4455                                      struct xdp_buff *xdp,
4456                                      struct bpf_prog *xdp_prog)
4457 {
4458         struct netdev_rx_queue *rxqueue;
4459         void *orig_data, *orig_data_end;
4460         u32 metalen, act = XDP_DROP;
4461         __be16 orig_eth_type;
4462         struct ethhdr *eth;
4463         bool orig_bcast;
4464         int hlen, off;
4465         u32 mac_len;
4466 
4467         /* Reinjected packets coming from act_mirred or similar should
4468          * not get XDP generic processing.
4469          */
4470         if (skb_is_redirected(skb))
4471                 return XDP_PASS;
4472 
4473         /* XDP packets must be linear and must have sufficient headroom
4474          * of XDP_PACKET_HEADROOM bytes. This is the guarantee that native
4475          * XDP also provides, so we need to do it here as well.
4476          */
4477         if (skb_cloned(skb) || skb_is_nonlinear(skb) ||
4478             skb_headroom(skb) < XDP_PACKET_HEADROOM) {
4479                 int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
4480                 int troom = skb->tail + skb->data_len - skb->end;
4481 
4482                 /* In case we have to go down this path and also linearize,
4483                  * let's do the pskb_expand_head() work just once here.
4484                  */
4485                 if (pskb_expand_head(skb,
4486                                      hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
4487                                      troom > 0 ? troom + 128 : 0, GFP_ATOMIC))
4488                         goto do_drop;
4489                 if (skb_linearize(skb))
4490                         goto do_drop;
4491         }
4492 
4493         /* The XDP program wants to see the packet starting at the MAC
4494          * header.
4495          */
4496         mac_len = skb->data - skb_mac_header(skb);
4497         hlen = skb_headlen(skb) + mac_len;
4498         xdp->data = skb->data - mac_len;
4499         xdp->data_meta = xdp->data;
4500         xdp->data_end = xdp->data + hlen;
4501         xdp->data_hard_start = skb->data - skb_headroom(skb);
4502         orig_data_end = xdp->data_end;
4503         orig_data = xdp->data;
4504         eth = (struct ethhdr *)xdp->data;
4505         orig_bcast = is_multicast_ether_addr_64bits(eth->h_dest);
4506         orig_eth_type = eth->h_proto;
4507 
4508         rxqueue = netif_get_rxqueue(skb);
4509         xdp->rxq = &rxqueue->xdp_rxq;
4510 
4511         act = bpf_prog_run_xdp(xdp_prog, xdp);
4512 
4513         /* check if bpf_xdp_adjust_head was used */
4514         off = xdp->data - orig_data;
4515         if (off) {
4516                 if (off > 0)
4517                         __skb_pull(skb, off);
4518                 else if (off < 0)
4519                         __skb_push(skb, -off);
4520 
4521                 skb->mac_header += off;
4522                 skb_reset_network_header(skb);
4523         }
4524 
4525         /* Check if bpf_xdp_adjust_tail was used; it can only "shrink"
4526          * the packet.
4527          */
4528         off = orig_data_end - xdp->data_end;
4529         if (off != 0) {
4530                 skb_set_tail_pointer(skb, xdp->data_end - xdp->data);
4531                 skb->len -= off;
4532 
4533         }
4534 
4535         /* Check if XDP changed the eth hdr such that the skb needs an update */
4536         eth = (struct ethhdr *)xdp->data;
4537         if ((orig_eth_type != eth->h_proto) ||
4538             (orig_bcast != is_multicast_ether_addr_64bits(eth->h_dest))) {
4539                 __skb_push(skb, ETH_HLEN);
4540                 skb->protocol = eth_type_trans(skb, skb->dev);
4541         }
4542 
4543         switch (act) {
4544         case XDP_REDIRECT:
4545         case XDP_TX:
4546                 __skb_push(skb, mac_len);
4547                 break;
4548         case XDP_PASS:
4549                 metalen = xdp->data - xdp->data_meta;
4550                 if (metalen)
4551                         skb_metadata_set(skb, metalen);
4552                 break;
4553         default:
4554                 bpf_warn_invalid_xdp_action(act);
4555                 /* fall through */
4556         case XDP_ABORTED:
4557                 trace_xdp_exception(skb->dev, xdp_prog, act);
4558                 /* fall through */
4559         case XDP_DROP:
4560         do_drop:
4561                 kfree_skb(skb);
4562                 break;
4563         }
4564 
4565         return act;
4566 }
4567 
4568 /* When doing generic XDP we have to bypass the qdisc layer and the
4569  * network taps in order to match in-driver-XDP behavior.
4570  */
4571 void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
4572 {
4573         struct net_device *dev = skb->dev;
4574         struct netdev_queue *txq;
4575         bool free_skb = true;
4576         int cpu, rc;
4577 
4578         txq = netdev_core_pick_tx(dev, skb, NULL);
4579         cpu = smp_processor_id();
4580         HARD_TX_LOCK(dev, txq, cpu);
4581         if (!netif_xmit_stopped(txq)) {
4582                 rc = netdev_start_xmit(skb, dev, txq, 0);
4583                 if (dev_xmit_complete(rc))
4584                         free_skb = false;
4585         }
4586         HARD_TX_UNLOCK(dev, txq);
4587         if (free_skb) {
4588                 trace_xdp_exception(dev, xdp_prog, XDP_TX);
4589                 kfree_skb(skb);
4590         }
4591 }
4592 EXPORT_SYMBOL_GPL(generic_xdp_tx);
4593 
4594 static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key);
4595 
4596 int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb)
4597 {
4598         if (xdp_prog) {
4599                 struct xdp_buff xdp;
4600                 u32 act;
4601                 int err;
4602 
4603                 act = netif_receive_generic_xdp(skb, &xdp, xdp_prog);
4604                 if (act != XDP_PASS) {
4605                         switch (act) {
4606                         case XDP_REDIRECT:
4607                                 err = xdp_do_generic_redirect(skb->dev, skb,
4608                                                               &xdp, xdp_prog);
4609                                 if (err)
4610                                         goto out_redir;
4611                                 break;
4612                         case XDP_TX:
4613                                 generic_xdp_tx(skb, xdp_prog);
4614                                 break;
4615                         }
4616                         return XDP_DROP;
4617                 }
4618         }
4619         return XDP_PASS;
4620 out_redir:
4621         kfree_skb(skb);
4622         return XDP_DROP;
4623 }
4624 EXPORT_SYMBOL_GPL(do_xdp_generic);
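
/* Example (sketch, not part of dev.c): a caller outside the core receive path
 * consulting the generic XDP program through the exported do_xdp_generic()
 * above.  The locking sketched here (RCU read side plus disabled bottom
 * halves) follows the pattern of in-tree callers such as tun; the wrapper
 * name is hypothetical.  On any verdict other than XDP_PASS the skb has
 * already been consumed.
 */
#include <linux/filter.h>
#include <linux/netdevice.h>

static int my_run_generic_xdp(struct net_device *dev, struct sk_buff *skb)
{
        struct bpf_prog *xdp_prog;
        int act = XDP_PASS;

        local_bh_disable();
        rcu_read_lock();
        xdp_prog = rcu_dereference(dev->xdp_prog);
        if (xdp_prog)
                act = do_xdp_generic(xdp_prog, skb);
        rcu_read_unlock();
        local_bh_enable();

        /* XDP_PASS: the skb is still ours; anything else: already consumed. */
        return act;
}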
4625 
4626 static int netif_rx_internal(struct sk_buff *skb)
4627 {
4628         int ret;
4629 
4630         net_timestamp_check(netdev_tstamp_prequeue, skb);
4631 
4632         trace_netif_rx(skb);
4633 
4634 #ifdef CONFIG_RPS
4635         if (static_branch_unlikely(&rps_needed)) {
4636                 struct rps_dev_flow voidflow, *rflow = &voidflow;
4637                 int cpu;
4638 
4639                 preempt_disable();
4640                 rcu_read_lock();
4641 
4642                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
4643                 if (cpu < 0)
4644                         cpu = smp_processor_id();
4645 
4646                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
4647 
4648                 rcu_read_unlock();
4649                 preempt_enable();
4650         } else
4651 #endif
4652         {
4653                 unsigned int qtail;
4654 
4655                 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
4656                 put_cpu();
4657         }
4658         return ret;
4659 }
4660 
4661 /**
4662  *      netif_rx        -       post buffer to the network code
4663  *      @skb: buffer to post
4664  *
4665  *      This function receives a packet from a device driver and queues it for
4666  *      the upper (protocol) levels to process.  It always succeeds. The buffer
4667  *      may be dropped during processing for congestion control or by the
4668  *      protocol layers.
4669  *
4670  *      return values:
4671  *      NET_RX_SUCCESS  (no congestion)
4672  *      NET_RX_DROP     (packet was dropped)
4673  *
4674  */
4675 
4676 int netif_rx(struct sk_buff *skb)
4677 {
4678         int ret;
4679 
4680         trace_netif_rx_entry(skb);
4681 
4682         ret = netif_rx_internal(skb);
4683         trace_netif_rx_exit(ret);
4684 
4685         return ret;
4686 }
4687 EXPORT_SYMBOL(netif_rx);
4688 
4689 int netif_rx_ni(struct sk_buff *skb)
4690 {
4691         int err;
4692 
4693         trace_netif_rx_ni_entry(skb);
4694 
4695         preempt_disable();
4696         err = netif_rx_internal(skb);
4697         if (local_softirq_pending())
4698                 do_softirq();
4699         preempt_enable();
4700         trace_netif_rx_ni_exit(err);
4701 
4702         return err;
4703 }
4704 EXPORT_SYMBOL(netif_rx_ni);
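
/* Example (sketch, not part of dev.c): classic non-NAPI usage of the two entry
 * points above - netif_rx() from hard-irq context, netif_rx_ni() from process
 * context.  The "my_*" driver plumbing is hypothetical.
 */
#include <linux/etherdevice.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>

static void my_deliver_frame(struct net_device *dev, const void *frame,
                             unsigned int len, bool in_irq_context)
{
        struct sk_buff *skb = netdev_alloc_skb_ip_align(dev, len);

        if (!skb) {
                dev->stats.rx_dropped++;
                return;
        }
        skb_put_data(skb, frame, len);
        skb->protocol = eth_type_trans(skb, dev);       /* also sets skb->dev */

        if (in_irq_context)
                netif_rx(skb);          /* enqueue to the per-cpu backlog */
        else
                netif_rx_ni(skb);       /* same, but may run pending softirqs */
}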
4705 
4706 static __latent_entropy void net_tx_action(struct softirq_action *h)
4707 {
4708         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
4709 
4710         if (sd->completion_queue) {
4711                 struct sk_buff *clist;
4712 
4713                 local_irq_disable();
4714                 clist = sd->completion_queue;
4715                 sd->completion_queue = NULL;
4716                 local_irq_enable();
4717 
4718                 while (clist) {
4719                         struct sk_buff *skb = clist;
4720 
4721                         clist = clist->next;
4722 
4723                         WARN_ON(refcount_read(&skb->users));
4724                         if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
4725                                 trace_consume_skb(skb);
4726                         else
4727                                 trace_kfree_skb(skb, net_tx_action);
4728 
4729                         if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
4730                                 __kfree_skb(skb);
4731                         else
4732                                 __kfree_skb_defer(skb);
4733                 }
4734 
4735                 __kfree_skb_flush();
4736         }
4737 
4738         if (sd->output_queue) {
4739                 struct Qdisc *head;
4740 
4741                 local_irq_disable();
4742                 head = sd->output_queue;
4743                 sd->output_queue = NULL;
4744                 sd->output_queue_tailp = &sd->output_queue;
4745                 local_irq_enable();
4746 
4747                 while (head) {
4748                         struct Qdisc *q = head;
4749                         spinlock_t *root_lock = NULL;
4750 
4751                         head = head->next_sched;
4752 
4753                         if (!(q->flags & TCQ_F_NOLOCK)) {
4754                                 root_lock = qdisc_lock(q);
4755                                 spin_lock(root_lock);
4756                         }
4757                         /* We need to make sure head->next_sched is read
4758                          * before clearing __QDISC_STATE_SCHED
4759                          */
4760                         smp_mb__before_atomic();
4761                         clear_bit(__QDISC_STATE_SCHED, &q->state);
4762                         qdisc_run(q);
4763                         if (root_lock)
4764                                 spin_unlock(root_lock);
4765                 }
4766         }
4767 
4768         xfrm_dev_backlog(sd);
4769 }
4770 
4771 #if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
4772 /* This hook is defined here for ATM LANE */
4773 int (*br_fdb_test_addr_hook)(struct net_device *dev,
4774                              unsigned char *addr) __read_mostly;
4775 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
4776 #endif
4777 
4778 static inline struct sk_buff *
4779 sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
4780                    struct net_device *orig_dev)
4781 {
4782 #ifdef CONFIG_NET_CLS_ACT
4783         struct mini_Qdisc *miniq = rcu_dereference_bh(skb->dev->miniq_ingress);
4784         struct tcf_result cl_res;
4785 
4786         /* If there's at least one ingress present somewhere (so
4787          * we get here via enabled static key), remaining devices
4788          * that are not configured with an ingress qdisc will bail
4789          * out here.
4790          */
4791         if (!miniq)
4792                 return skb;
4793 
4794         if (*pt_prev) {
4795                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
4796                 *pt_prev = NULL;
4797         }
4798 
4799         qdisc_skb_cb(skb)->pkt_len = skb->len;
4800         skb->tc_at_ingress = 1;
4801         mini_qdisc_bstats_cpu_update(miniq, skb);
4802 
4803         switch (tcf_classify(skb, miniq->filter_list, &cl_res, false)) {
4804         case TC_ACT_OK:
4805         case TC_ACT_RECLASSIFY:
4806                 skb->tc_index = TC_H_MIN(cl_res.classid);
4807                 break;
4808         case TC_ACT_SHOT:
4809                 mini_qdisc_qstats_cpu_drop(miniq);
4810                 kfree_skb(skb);
4811                 return NULL;
4812         case TC_ACT_STOLEN:
4813         case TC_ACT_QUEUED:
4814         case TC_ACT_TRAP:
4815                 consume_skb(skb);
4816                 return NULL;
4817         case TC_ACT_REDIRECT:
4818                 /* skb_mac_header check was done by cls/act_bpf, so
4819                  * we can safely push the L2 header back before
4820                  * redirecting to another netdev
4821                  */
4822                 __skb_push(skb, skb->mac_len);
4823                 skb_do_redirect(skb);
4824                 return NULL;
4825         case TC_ACT_CONSUMED:
4826                 return NULL;
4827         default:
4828                 break;
4829         }
4830 #endif /* CONFIG_NET_CLS_ACT */
4831         return skb;
4832 }
4833 
4834 /**
4835  *      netdev_is_rx_handler_busy - check if receive handler is registered
4836  *      @dev: device to check
4837  *
4838  *      Check if a receive handler is already registered for a given device.
4839  *      Return true if there is one.
4840  *
4841  *      The caller must hold the rtnl_mutex.
4842  */
4843 bool netdev_is_rx_handler_busy(struct net_device *dev)
4844 {
4845         ASSERT_RTNL();
4846         return dev && rtnl_dereference(dev->rx_handler);
4847 }
4848 EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);
4849 
4850 /**
4851  *      netdev_rx_handler_register - register receive handler
4852  *      @dev: device to register a handler for
4853  *      @rx_handler: receive handler to register
4854  *      @rx_handler_data: data pointer that is used by rx handler
4855  *
4856  *      Register a receive handler for a device. This handler will then be
4857  *      called from __netif_receive_skb. A negative errno code is returned
4858  *      on a failure.
4859  *
4860  *      The caller must hold the rtnl_mutex.
4861  *
4862  *      For a general description of rx_handler, see enum rx_handler_result.
4863  */
4864 int netdev_rx_handler_register(struct net_device *dev,
4865                                rx_handler_func_t *rx_handler,
4866                                void *rx_handler_data)
4867 {
4868         if (netdev_is_rx_handler_busy(dev))
4869                 return -EBUSY;
4870 
4871         if (dev->priv_flags & IFF_NO_RX_HANDLER)
4872                 return -EINVAL;
4873 
4874         /* Note: rx_handler_data must be set before rx_handler */
4875         rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
4876         rcu_assign_pointer(dev->rx_handler, rx_handler);
4877 
4878         return 0;
4879 }
4880 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
4881 
4882 /**
4883  *      netdev_rx_handler_unregister - unregister receive handler
4884  *      @dev: device to unregister a handler from
4885  *
4886  *      Unregister a receive handler from a device.
4887  *
4888  *      The caller must hold the rtnl_mutex.
4889  */
4890 void netdev_rx_handler_unregister(struct net_device *dev)
4891 {
4892 
4893         ASSERT_RTNL();
4894         RCU_INIT_POINTER(dev->rx_handler, NULL);
4895         /* a reader seeing a non-NULL rx_handler in an rcu_read_lock()
4896          * section is guaranteed to see a non-NULL rx_handler_data
4897          * as well.
4898          */
4899         synchronize_net();
4900         RCU_INIT_POINTER(dev->rx_handler_data, NULL);
4901 }
4902 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
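
/* Example (sketch, not part of dev.c): the rx_handler API documented above, in
 * the style of in-tree users such as bridge or bonding but with a purely
 * hypothetical "myupper" device, showing the register/unregister pairing under
 * RTNL and the RCU access to rx_handler_data.
 */
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>

struct myupper_port {
        struct net_device *upper_dev;
};

static rx_handler_result_t myupper_handle_frame(struct sk_buff **pskb)
{
        struct sk_buff *skb = *pskb;
        struct myupper_port *port = rcu_dereference(skb->dev->rx_handler_data);

        skb->dev = port->upper_dev;     /* steer the frame to the upper device */
        return RX_HANDLER_ANOTHER;      /* re-run __netif_receive_skb_core() */
}

static int myupper_enslave(struct net_device *upper, struct net_device *lower,
                           struct myupper_port *port)
{
        ASSERT_RTNL();
        port->upper_dev = upper;
        return netdev_rx_handler_register(lower, myupper_handle_frame, port);
}

static void myupper_release(struct net_device *lower)
{
        ASSERT_RTNL();
        netdev_rx_handler_unregister(lower);    /* synchronizes RCU internally */
}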
4903 
4904 /*
4905  * Limit the use of PFMEMALLOC reserves to those protocols that implement
4906  * the special handling of PFMEMALLOC skbs.
4907  */
4908 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
4909 {
4910         switch (skb->protocol) {
4911         case htons(ETH_P_ARP):
4912         case htons(ETH_P_IP):
4913         case htons(ETH_P_IPV6):
4914         case htons(ETH_P_8021Q):
4915         case htons(ETH_P_8021AD):
4916                 return true;
4917         default:
4918                 return false;
4919         }
4920 }
4921 
4922 static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
4923                              int *ret, struct net_device *orig_dev)
4924 {
4925 #ifdef CONFIG_NETFILTER_INGRESS
4926         if (nf_hook_ingress_active(skb)) {
4927                 int ingress_retval;
4928 
4929                 if (*pt_prev) {
4930                         *ret = deliver_skb(skb, *pt_prev, orig_dev);
4931                         *pt_prev = NULL;
4932                 }
4933 
4934                 rcu_read_lock();
4935                 ingress_retval = nf_hook_ingress(skb);
4936                 rcu_read_unlock();
4937                 return ingress_retval;
4938         }
4939 #endif /* CONFIG_NETFILTER_INGRESS */
4940         return 0;
4941 }
4942 
4943 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc,
4944                                     struct packet_type **ppt_prev)
4945 {
4946         struct packet_type *ptype, *pt_prev;
4947         rx_handler_func_t *rx_handler;
4948         struct net_device *orig_dev;
4949         bool deliver_exact = false;
4950         int ret = NET_RX_DROP;
4951         __be16 type;
4952 
4953         net_timestamp_check(!netdev_tstamp_prequeue, skb);
4954 
4955         trace_netif_receive_skb(skb);
4956 
4957         orig_dev = skb->dev;
4958 
4959         skb_reset_network_header(skb);
4960         if (!skb_transport_header_was_set(skb))
4961                 skb_reset_transport_header(skb);
4962         skb_reset_mac_len(skb);
4963 
4964         pt_prev = NULL;
4965 
4966 another_round:
4967         skb->skb_iif = skb->dev->ifindex;
4968 
4969         __this_cpu_inc(softnet_data.processed);
4970 
4971         if (static_branch_unlikely(&generic_xdp_needed_key)) {
4972                 int ret2;
4973 
4974                 preempt_disable();
4975                 ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb);
4976                 preempt_enable();
4977 
4978                 if (ret2 != XDP_PASS)
4979                         return NET_RX_DROP;
4980                 skb_reset_mac_len(skb);
4981         }
4982 
4983         if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
4984             skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
4985                 skb = skb_vlan_untag(skb);
4986                 if (unlikely(!skb))
4987                         goto out;
4988         }
4989 
4990         if (skb_skip_tc_classify(skb))
4991                 goto skip_classify;
4992 
4993         if (pfmemalloc)
4994                 goto skip_taps;
4995 
4996         list_for_each_entry_rcu(ptype, &ptype_all, list) {
4997                 if (pt_prev)
4998                         ret = deliver_skb(skb, pt_prev, orig_dev);
4999                 pt_prev = ptype;
5000         }
5001 
5002         list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
5003                 if (pt_prev)
5004                         ret = deliver_skb(skb, pt_prev, orig_dev);
5005                 pt_prev = ptype;
5006         }
5007 
5008 skip_taps:
5009 #ifdef CONFIG_NET_INGRESS
5010         if (static_branch_unlikely(&ingress_needed_key)) {
5011                 skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
5012                 if (!skb)
5013                         goto out;
5014 
5015                 if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
5016                         goto out;
5017         }
5018 #endif
5019         skb_reset_redirect(skb);
5020 skip_classify:
5021         if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
5022                 goto drop;
5023 
5024         if (skb_vlan_tag_present(skb)) {
5025                 if (pt_prev) {
5026                         ret = deliver_skb(skb, pt_prev, orig_dev);
5027                         pt_prev = NULL;
5028                 }
5029                 if (vlan_do_receive(&skb))
5030                         goto another_round;
5031                 else if (unlikely(!skb))
5032                         goto out;
5033         }
5034 
5035         rx_handler = rcu_dereference(skb->dev->rx_handler);
5036         if (rx_handler) {
5037                 if (pt_prev) {
5038                         ret = deliver_skb(skb, pt_prev, orig_dev);
5039                         pt_prev = NULL;
5040                 }
5041                 switch (rx_handler(&skb)) {
5042                 case RX_HANDLER_CONSUMED:
5043                         ret = NET_RX_SUCCESS;
5044                         goto out;
5045                 case RX_HANDLER_ANOTHER:
5046                         goto another_round;
5047                 case RX_HANDLER_EXACT:
5048                         deliver_exact = true;
5049                 case RX_HANDLER_PASS:
5050                         break;
5051                 default:
5052                         BUG();
5053                 }
5054         }
5055 
5056         if (unlikely(skb_vlan_tag_present(skb))) {
5057 check_vlan_id:
5058                 if (skb_vlan_tag_get_id(skb)) {
5059                         /* Vlan id is non-zero and vlan_do_receive() above couldn't
5060                          * find the vlan device.
5061                          */
5062                         skb->pkt_type = PACKET_OTHERHOST;
5063                 } else if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
5064                            skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
5065                         /* Outer header is 802.1P with vlan 0, inner header is
5066                          * 802.1Q or 802.1AD and vlan_do_receive() above could
5067                          * not find vlan dev for vlan id 0.
5068                          */
5069                         __vlan_hwaccel_clear_tag(skb);
5070                         skb = skb_vlan_untag(skb);
5071                         if (unlikely(!skb))
5072                                 goto out;
5073                         if (vlan_do_receive(&skb))
5074                                 /* After stripping off the 802.1P header with vlan 0,
5075                                  * a vlan dev was found for the inner header.
5076                                  */
5077                                 goto another_round;
5078                         else if (unlikely(!skb))
5079                                 goto out;
5080                         else
5081                                 /* We have stripped the outer 802.1P vlan 0 header
5082                                  * but could not find a vlan dev.
5083                                  * Check the vlan id again to set OTHERHOST.
5084                                  */
5085                                 goto check_vlan_id;
5086                 }
5087                 /* Note: we might in the future use prio bits
5088                  * and set skb->priority like in vlan_do_receive().
5089                  * For the time being, just ignore the Priority Code Point.
5090                  */
5091                 __vlan_hwaccel_clear_tag(skb);
5092         }
5093 
5094         type = skb->protocol;
5095 
5096         /* deliver only exact match when indicated */
5097         if (likely(!deliver_exact)) {
5098                 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
5099                                        &ptype_base[ntohs(type) &
5100                                                    PTYPE_HASH_MASK]);
5101         }
5102 
5103         deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
5104                                &orig_dev->ptype_specific);
5105 
5106         if (unlikely(skb->dev != orig_dev)) {
5107                 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
5108                                        &skb->dev->ptype_specific);
5109         }
5110 
5111         if (pt_prev) {
5112                 if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
5113                         goto drop;
5114                 *ppt_prev = pt_prev;
5115         } else {
5116 drop:
5117                 if (!deliver_exact)
5118                         atomic_long_inc(&skb->dev->rx_dropped);
5119                 else
5120                         atomic_long_inc(&skb->dev->rx_nohandler);
5121                 kfree_skb(skb);
5122                 /* Jamal, now you will not be able to escape explaining
5123                  * to me how you were going to use this. :-)
5124                  */
5125                 ret = NET_RX_DROP;
5126         }
5127 
5128 out:
5129         return ret;
5130 }
5131 
5132 static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc)
5133 {
5134         struct net_device *orig_dev = skb->dev;
5135         struct packet_type *pt_prev = NULL;
5136         int ret;
5137 
5138         ret = __netif_receive_skb_core(skb, pfmemalloc, &pt_prev);
5139         if (pt_prev)
5140                 ret = INDIRECT_CALL_INET(pt_prev->func, ipv6_rcv, ip_rcv, skb,
5141                                          skb->dev, pt_prev, orig_dev);
5142         return ret;
5143 }
5144 
5145 /**
5146  *      netif_receive_skb_core - special purpose version of netif_receive_skb
5147  *      @skb: buffer to process
5148  *
5149  *      More direct receive version of netif_receive_skb().  It should
5150  *      only be used by callers that have a need to skip RPS and Generic XDP.
5151  *      The caller must also take care of handling (page_is_)pfmemalloc skbs.
5152  *
5153  *      This function may only be called from softirq context and interrupts
5154  *      should be enabled.
5155  *
5156  *      Return values (usually ignored):
5157  *      NET_RX_SUCCESS: no congestion
5158  *      NET_RX_DROP: packet was dropped
5159  */
5160 int netif_receive_skb_core(struct sk_buff *skb)
5161 {
5162         int ret;
5163 
5164         rcu_read_lock();
5165         ret = __netif_receive_skb_one_core(skb, false);
5166         rcu_read_unlock();
5167 
5168         return ret;
5169 }
5170 EXPORT_SYMBOL(netif_receive_skb_core);
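
/* Example (sketch, not part of dev.c): a synthetic/virtual device delivering a
 * frame through netif_receive_skb_core() under the constraints listed above -
 * softirq context, RPS and generic XDP deliberately skipped, no pfmemalloc
 * handling required.  The "my_*" helper is illustrative only.
 */
#include <linux/etherdevice.h>
#include <linux/netdevice.h>

static void my_synthetic_rx(struct napi_struct *napi, struct sk_buff *skb)
{
        skb->protocol = eth_type_trans(skb, napi->dev);
        netif_receive_skb_core(skb);    /* bypasses RPS and generic XDP */
}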
5171 
5172 static inline void __netif_receive_skb_list_ptype(struct list_head *head,
5173                                                   struct packet_type *pt_prev,
5174                                                   struct net_device *orig_dev)
5175 {
5176         struct sk_buff *skb, *next;
5177 
5178         if (!pt_prev)
5179                 return;
5180         if (list_empty(head))
5181                 return;
5182         if (pt_prev->list_func != NULL)
5183                 INDIRECT_CALL_INET(pt_prev->list_func, ipv6_list_rcv,
5184                                    ip_list_rcv, head, pt_prev, orig_dev);
5185         else
5186                 list_for_each_entry_safe(skb, next, head, list) {
5187                         skb_list_del_init(skb);
5188                         pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
5189                 }
5190 }
5191 
5192 static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc)
5193 {
5194         /* Fast-path assumptions:
5195          * - There is no RX handler.
5196          * - Only one packet_type matches.
5197          * If either of these fails, we will end up doing some per-packet
5198          * processing in-line, then handling the 'last ptype' for the whole
5199          * sublist.  This can't cause out-of-order delivery to any single ptype,
5200          * because the 'last ptype' must be constant across the sublist, and all
5201          * other ptypes are handled per-packet.
5202          */
5203         /* Current (common) ptype of sublist */
5204         struct packet_type *pt_curr = NULL;
5205         /* Current (common) orig_dev of sublist */
5206         struct net_device *od_curr = NULL;
5207         struct list_head sublist;
5208         struct sk_buff *skb, *next;
5209 
5210         INIT_LIST_HEAD(&sublist);
5211         list_for_each_entry_safe(skb, next, head, list) {
5212                 struct net_device *orig_dev = skb->dev;
5213                 struct packet_type *pt_prev = NULL;
5214 
5215                 skb_list_del_init(skb);
5216                 __netif_receive_skb_core(skb, pfmemalloc, &pt_prev);
5217                 if (!pt_prev)
5218                         continue;
5219                 if (pt_curr != pt_prev || od_curr != orig_dev) {
5220                         /* dispatch old sublist */
5221                         __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
5222                         /* start new sublist */
5223                         INIT_LIST_HEAD(&sublist);
5224                         pt_curr = pt_prev;
5225                         od_curr = orig_dev;
5226                 }
5227                 list_add_tail(&skb->list, &sublist);
5228         }
5229 
5230         /* dispatch final sublist */
5231         __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
5232 }
5233 
5234 static int __netif_receive_skb(struct sk_buff *skb)
5235 {
5236         int ret;
5237 
5238         if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
5239                 unsigned int noreclaim_flag;
5240 
5241                 /*
5242                  * PFMEMALLOC skbs are special, they should
5243                  * - be delivered to SOCK_MEMALLOC sockets only
5244                  * - stay away from userspace
5245                  * - have bounded memory usage
5246                  *
5247                  * Use PF_MEMALLOC as this saves us from propagating the allocation
5248                  * context down to all allocation sites.
5249                  */
5250                 noreclaim_flag = memalloc_noreclaim_save();
5251                 ret = __netif_receive_skb_one_core(skb, true);
5252                 memalloc_noreclaim_restore(noreclaim_flag);
5253         } else
5254                 ret = __netif_receive_skb_one_core(skb, false);
5255 
5256         return ret;
5257 }
5258 
5259 static void __netif_receive_skb_list(struct list_head *head)
5260 {
5261         unsigned long noreclaim_flag = 0;
5262         struct sk_buff *skb, *next;
5263         bool pfmemalloc = false; /* Is current sublist PF_MEMALLOC? */
5264 
5265         list_for_each_entry_safe(skb, next, head, list) {
5266                 if ((sk_memalloc_socks() && skb_pfmemalloc(skb)) != pfmemalloc) {
5267                         struct list_head sublist;
5268 
5269                         /* Handle the previous sublist */
5270                         list_cut_before(&sublist, head, &skb->list);
5271                         if (!list_empty(&sublist))
5272                                 __netif_receive_skb_list_core(&sublist, pfmemalloc);
5273                         pfmemalloc = !pfmemalloc;
5274                         /* See comments in __netif_receive_skb */
5275                         if (pfmemalloc)
5276                                 noreclaim_flag = memalloc_noreclaim_save();
5277                         else
5278                                 memalloc_noreclaim_restore(noreclaim_flag);
5279                 }
5280         }
5281         /* Handle the remaining sublist */
5282         if (!list_empty(head))
5283                 __netif_receive_skb_list_core(head, pfmemalloc);
5284         /* Restore pflags */
5285         if (pfmemalloc)
5286                 memalloc_noreclaim_restore(noreclaim_flag);
5287 }
5288 
5289 static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
5290 {
5291         struct bpf_prog *old = rtnl_dereference(dev->xdp_prog);
5292         struct bpf_prog *new = xdp->prog;
5293         int ret = 0;
5294 
5295         switch (xdp->command) {
5296         case XDP_SETUP_PROG:
5297                 rcu_assign_pointer(dev->xdp_prog, new);
5298                 if (old)
5299                         bpf_prog_put(old);
5300 
5301                 if (old && !new) {
5302                         static_branch_dec(&generic_xdp_needed_key);
5303                 } else if (new && !old) {
5304                         static_branch_inc(&generic_xdp_needed_key);
5305                         dev_disable_lro(dev);
5306                         dev_disable_gro_hw(dev);
5307                 }
5308                 break;
5309 
5310         case XDP_QUERY_PROG:
5311                 xdp->prog_id = old ? old->aux->id : 0;
5312                 break;
5313 
5314         default:
5315                 ret = -EINVAL;
5316                 break;
5317         }
5318 
5319         return ret;
5320 }
5321 
5322 static int netif_receive_skb_internal(struct sk_buff *skb)
5323 {
5324         int ret;
5325 
5326         net_timestamp_check(netdev_tstamp_prequeue, skb);
5327 
5328         if (skb_defer_rx_timestamp(skb))
5329                 return NET_RX_SUCCESS;
5330 
5331         rcu_read_lock();
5332 #ifdef CONFIG_RPS
5333         if (static_branch_unlikely(&rps_needed)) {
5334                 struct rps_dev_flow voidflow, *rflow = &voidflow;
5335                 int cpu = get_rps_cpu(skb->dev, skb, &rflow);
5336 
5337                 if (cpu >= 0) {
5338                         ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
5339                         rcu_read_unlock();
5340                         return ret;
5341                 }
5342         }
5343 #endif
5344         ret = __netif_receive_skb(skb);
5345         rcu_read_unlock();
5346         return ret;
5347 }
5348 
5349 static void netif_receive_skb_list_internal(struct list_head *head)
5350 {
5351         struct sk_buff *skb, *next;
5352         struct list_head sublist;
5353 
5354         INIT_LIST_HEAD(&sublist);
5355         list_for_each_entry_safe(skb, next, head, list) {
5356                 net_timestamp_check(netdev_tstamp_prequeue, skb);
5357                 skb_list_del_init(skb);
5358                 if (!skb_defer_rx_timestamp(skb))
5359                         list_add_tail(&skb->list, &sublist);
5360         }
5361         list_splice_init(&sublist, head);
5362 
5363         rcu_read_lock();
5364 #ifdef CONFIG_RPS
5365         if (static_branch_unlikely(&rps_needed)) {
5366                 list_for_each_entry_safe(skb, next, head, list) {
5367                         struct rps_dev_flow voidflow, *rflow = &voidflow;
5368                         int cpu = get_rps_cpu(skb->dev, skb, &rflow);
5369 
5370                         if (cpu >= 0) {
5371                                 /* Will be handled, remove from list */
5372                                 skb_list_del_init(skb);
5373                                 enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
5374                         }
5375                 }
5376         }
5377 #endif
5378         __netif_receive_skb_list(head);
5379         rcu_read_unlock();
5380 }
5381 
5382 /**
5383  *      netif_receive_skb - process receive buffer from network
5384  *      @skb: buffer to process
5385  *
5386  *      netif_receive_skb() is the main receive data processing function.
5387  *      It always succeeds. The buffer may be dropped during processing
5388  *      for congestion control or by the protocol layers.
5389  *
5390  *      This function may only be called from softirq context and interrupts
5391  *      should be enabled.
5392  *
5393  *      Return values (usually ignored):
5394  *      NET_RX_SUCCESS: no congestion
5395  *      NET_RX_DROP: packet was dropped
5396  */
5397 int netif_receive_skb(struct sk_buff *skb)
5398 {
5399         int ret;
5400 
5401         trace_netif_receive_skb_entry(skb);
5402 
5403         ret = netif_receive_skb_internal(skb);
5404         trace_netif_receive_skb_exit(ret);
5405 
5406         return ret;
5407 }
5408 EXPORT_SYMBOL(netif_receive_skb);
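
/* Example (sketch, not part of dev.c): the documented usage - a NAPI poll
 * routine feeding frames into netif_receive_skb() from softirq context.  The
 * "my_fetch_rx_skb" stub stands in for the driver's real descriptor handling.
 */
#include <linux/etherdevice.h>
#include <linux/netdevice.h>

/* Driver-specific descriptor handling; stubbed out for this sketch. */
static struct sk_buff *my_fetch_rx_skb(struct napi_struct *napi)
{
        return NULL;
}

static int my_napi_poll(struct napi_struct *napi, int budget)
{
        struct sk_buff *skb;
        int work_done = 0;

        while (work_done < budget && (skb = my_fetch_rx_skb(napi)) != NULL) {
                skb->protocol = eth_type_trans(skb, napi->dev);
                netif_receive_skb(skb);         /* return value usually ignored */
                work_done++;
        }

        if (work_done < budget)
                napi_complete_done(napi, work_done);

        return work_done;
}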
5409 
5410 /**
5411  *      netif_receive_skb_list - process many receive buffers from network
5412  *      @head: list of skbs to process.
5413  *
5414  *      Since the return value of netif_receive_skb() is normally ignored, and
5415  *      wouldn't be meaningful for a list, this function returns void.
5416  *
5417  *      This function may only be called from softirq context and interrupts
5418  *      should be enabled.
5419  */
5420 void netif_receive_skb_list(struct list_head *head)
5421 {
5422         struct sk_buff *skb;
5423 
5424         if (list_empty(head))
5425                 return;
5426         if (trace_netif_receive_skb_list_entry_enabled()) {
5427                 list_for_each_entry(skb, head, list)
5428                         trace_netif_receive_skb_list_entry(skb);
5429         }
5430         netif_receive_skb_list_internal(head);
5431         trace_netif_receive_skb_list_exit(0);
5432 }
5433 EXPORT_SYMBOL(netif_receive_skb_list);
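
/* Example (sketch, not part of dev.c): batched delivery through
 * netif_receive_skb_list(), which amortizes per-packet costs across the whole
 * list.  "my_fetch_rx_skb" is again a hypothetical stub for descriptor
 * handling.
 */
#include <linux/etherdevice.h>
#include <linux/list.h>
#include <linux/netdevice.h>

/* Driver-specific descriptor handling; stubbed out for this sketch. */
static struct sk_buff *my_fetch_rx_skb(struct napi_struct *napi)
{
        return NULL;
}

static void my_napi_poll_batched(struct napi_struct *napi, int budget)
{
        struct list_head rx_list;
        struct sk_buff *skb;
        int n = 0;

        INIT_LIST_HEAD(&rx_list);
        while (n < budget && (skb = my_fetch_rx_skb(napi)) != NULL) {
                skb->protocol = eth_type_trans(skb, napi->dev);
                list_add_tail(&skb->list, &rx_list);
                n++;
        }
        netif_receive_skb_list(&rx_list);       /* an empty list is handled */
}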
5434 
5435 DEFINE_PER_CPU(struct work_struct, flush_works);
5436 
5437 /* Network device is going away, flush any packets still pending */
5438 static void flush_backlog(struct work_struct *work)
5439 {
5440         struct sk_buff *skb, *tmp;
5441         struct softnet_data *sd;
5442 
5443         local_bh_disable();
5444         sd = this_cpu_ptr(&softnet_data);
5445 
5446         local_irq_disable();
5447         rps_lock(sd);
5448         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
5449                 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
5450                         __skb_unlink(skb, &sd->input_pkt_queue);
5451                         kfree_skb(skb);
5452                         input_queue_head_incr(sd);
5453                 }
5454         }
5455         rps_unlock(sd);
5456         local_irq_enable();
5457 
5458         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
5459                 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
5460                         __skb_unlink(skb, &sd->process_queue);
5461                         kfree_skb(skb);
5462                         input_queue_head_incr(sd);
5463                 }
5464         }
5465         local_bh_enable();
5466 }
5467 
5468 static void flush_all_backlogs(void)
5469 {
5470         unsigned int cpu;
5471 
5472         get_online_cpus();
5473 
5474         for_each_online_cpu(cpu)
5475                 queue_work_on(cpu, system_highpri_wq,
5476                               per_cpu_ptr(&flush_works, cpu));
5477 
5478         for_each_online_cpu(cpu)
5479                 flush_work(per_cpu_ptr(&flush_works, cpu));
5480 
5481         put_online_cpus();
5482 }
5483 
5484 /* Pass the currently batched GRO_NORMAL SKBs up to the stack. */
5485 static void gro_normal_list(struct napi_struct *napi)
5486 {
5487         if (!napi->rx_count)
5488                 return;
5489         netif_receive_skb_list_internal(&napi->rx_list);
5490         INIT_LIST_HEAD(&napi->rx_list);
5491         napi->rx_count = 0;
5492 }
5493 
5494 /* Queue one GRO_NORMAL SKB up for list processing. If the batch size is exceeded,
5495  * pass the whole batch up to the stack.
5496  */
5497 static void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb)
5498 {
5499         list_add_tail(&skb->list, &napi->rx_list);
5500         if (++napi->rx_count >= gro_normal_batch)
5501                 gro_normal_list(napi);
5502 }
5503 
5504 INDIRECT_CALLABLE_DECLARE(int inet_gro_complete(struct sk_buff *, int));
5505 INDIRECT_CALLABLE_DECLARE(int ipv6_gro_complete(struct sk_buff *, int));
5506 static int napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb)
5507 {
5508         struct packet_offload *ptype;
5509         __be16 type = skb->protocol;
5510         struct list_head *head = &offload_base;
5511         int err = -ENOENT;
5512 
5513         BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
5514 
5515         if (NAPI_GRO_CB(skb)->count == 1) {
5516                 skb_shinfo(skb)->gso_size = 0;
5517                 goto out;
5518         }
5519 
5520         rcu_read_lock();
5521         list_for_each_entry_rcu(ptype, head, list) {
5522                 if (ptype->type != type || !ptype->callbacks.gro_complete)
5523                         continue;
5524 
5525                 err = INDIRECT_CALL_INET(ptype->callbacks.gro_complete,
5526                                          ipv6_gro_complete, inet_gro_complete,
5527                                          skb, 0);
5528                 break;
5529         }
5530         rcu_read_unlock();
5531 
5532         if (err) {
5533                 WARN_ON(&ptype->list == head);
5534                 kfree_skb(skb);
5535                 return NET_RX_SUCCESS;
5536         }
5537 
5538 out:
5539         gro_normal_one(napi, skb);
5540         return NET_RX_SUCCESS;
5541 }
5542 
5543 static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index,
5544                                    bool flush_old)
5545 {
5546         struct list_head *head = &napi->gro_hash[index].list;
5547         struct sk_buff *skb, *p;
5548 
5549         list_for_each_entry_safe_reverse(skb, p, head, list) {
5550                 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
5551                         return;
5552                 skb_list_del_init(skb);
5553                 napi_gro_complete(napi, skb);
5554                 napi->gro_hash[index].count--;
5555         }
5556 
5557         if (!napi->gro_hash[index].count)
5558                 __clear_bit(index, &napi->gro_bitmask);
5559 }
5560 
5561 /* napi->gro_hash[].list contains packets ordered by age,
5562  * with the youngest packets at the head of it.
5563  * Complete skbs in reverse order to reduce latencies.
5564  */
5565 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
5566 {
5567         unsigned long bitmask = napi->gro_bitmask;
5568         unsigned int i, base = ~0U;
5569 
5570         while ((i = ffs(bitmask)) != 0) {
5571                 bitmask >>= i;
5572                 base += i;
5573                 __napi_gro_flush_chain(napi, base, flush_old);
5574         }
5575 }
5576 EXPORT_SYMBOL(napi_gro_flush);
5577 
5578 static struct list_head *gro_list_prepare(struct napi_struct *napi,
5579                                           struct sk_buff *skb)
5580 {
5581         unsigned int maclen = skb->dev->hard_header_len;
5582         u32 hash = skb_get_hash_raw(skb);
5583         struct list_head *head;
5584         struct sk_buff *p;
5585 
5586         head = &napi->gro_hash[hash & (GRO_HASH_BUCKETS - 1)].list;
5587         list_for_each_entry(p, head, list) {
5588                 unsigned long diffs;
5589 
5590                 NAPI_GRO_CB(p)->flush = 0;
5591 
5592                 if (hash != skb_get_hash_raw(p)) {
5593                         NAPI_GRO_CB(p)->same_flow = 0;
5594                         continue;
5595                 }
5596 
5597                 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
5598                 diffs |= skb_vlan_tag_present(p) ^ skb_vlan_tag_present(skb);
5599                 if (skb_vlan_tag_present(p))
5600                         diffs |= skb_vlan_tag_get(p) ^ skb_vlan_tag_get(skb);
5601                 diffs |= skb_metadata_dst_cmp(p, skb);
5602                 diffs |= skb_metadata_differs(p, skb);
5603                 if (maclen == ETH_HLEN)
5604                         diffs |= compare_ether_header(skb_mac_header(p),
5605                                                       skb_mac_header(skb));
5606                 else if (!diffs)
5607                         diffs = memcmp(skb_mac_header(p),
5608                                        skb_mac_header(skb),
5609                                        maclen);
5610                 NAPI_GRO_CB(p)->same_flow = !diffs;
5611         }
5612 
5613         return head;
5614 }
5615 
5616 static void skb_gro_reset_offset(struct sk_buff *skb)
5617 {
5618         const struct skb_shared_info *pinfo = skb_shinfo(skb);
5619         const skb_frag_t *frag0 = &pinfo->frags[0];
5620 
5621         NAPI_GRO_CB(skb)->data_offset = 0;
5622         NAPI_GRO_CB(skb)->frag0 = NULL;
5623         NAPI_GRO_CB(skb)->frag0_len = 0;
5624 
5625         if (!skb_headlen(skb) && pinfo->nr_frags &&
5626             !PageHighMem(skb_frag_page(frag0))) {
5627                 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
5628                 NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int,
5629                                                     skb_frag_size(frag0),
5630                                                     skb->end - skb->tail);
5631         }
5632 }
5633 
5634 static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
5635 {
5636         struct skb_shared_info *pinfo = skb_shinfo(skb);
5637 
5638         BUG_ON(skb->end - skb->tail < grow);
5639 
5640         memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
5641 
5642         skb->data_len -= grow;
5643         skb->tail += grow;
5644 
5645         skb_frag_off_add(&pinfo->frags[0], grow);
5646         skb_frag_size_sub(&pinfo->frags[0], grow);
5647 
5648         if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
5649                 skb_frag_unref(skb, 0);
5650                 memmove(pinfo->frags, pinfo->frags + 1,
5651                         --pinfo->nr_frags * sizeof(pinfo->frags[0]));
5652         }
5653 }
5654 
5655 static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head)
5656 {
5657         struct sk_buff *oldest;
5658 
5659         oldest = list_last_entry(head, struct sk_buff, list);
5660 
5661         /* We are called with a chain of length >= MAX_GRO_SKBS, so it is
5662          * impossible for oldest to be NULL.
5663          */
5664         if (WARN_ON_ONCE(!oldest))
5665                 return;
5666 
5667         /* Do not adjust napi->gro_hash[].count, caller is adding a new
5668          * SKB to the chain.
5669          */
5670         skb_list_del_init(oldest);
5671         napi_gro_complete(napi, oldest);
5672 }
5673 
5674 INDIRECT_CALLABLE_DECLARE(struct sk_buff *inet_gro_receive(struct list_head *,
5675                                                            struct sk_buff *));
5676 INDIRECT_CALLABLE_DECLARE(struct sk_buff *ipv6_gro_receive(struct list_head *,
5677                                                            struct sk_buff *));
5678 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
5679 {
5680         u32 hash = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1);
5681         struct list_head *head = &offload_base;
5682         struct packet_offload *ptype;
5683         __be16 type = skb->protocol;
5684         struct list_head *gro_head;
5685         struct sk_buff *pp = NULL;
5686         enum gro_result ret;
5687         int same_flow;
5688         int grow;
5689 
5690         if (netif_elide_gro(skb->dev))
5691                 goto normal;
5692 
5693         gro_head = gro_list_prepare(napi, skb);
5694 
5695         rcu_read_lock();
5696         list_for_each_entry_rcu(ptype, head, list) {
5697                 if (ptype->type != type || !ptype->callbacks.gro_receive)
5698                         continue;
5699 
5700                 skb_set_network_header(skb, skb_gro_offset(skb));
5701                 skb_reset_mac_len(skb);
5702                 NAPI_GRO_CB(skb)->same_flow = 0;
5703                 NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb);
5704                 NAPI_GRO_CB(skb)->free = 0;
5705                 NAPI_GRO_CB(skb)->encap_mark = 0;
5706                 NAPI_GRO_CB(skb)->recursion_counter = 0;
5707                 NAPI_GRO_CB(skb)->is_fou = 0;
5708                 NAPI_GRO_CB(skb)->is_atomic = 1;
5709                 NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
5710 
5711                 /* Setup for GRO checksum validation */
5712                 switch (skb->ip_summed) {
5713                 case CHECKSUM_COMPLETE:
5714                         NAPI_GRO_CB(skb)->csum = skb->csum;
5715                         NAPI_GRO_CB(skb)->csum_valid = 1;
5716                         NAPI_GRO_CB(skb)->csum_cnt = 0;
5717                         break;
5718                 case CHECKSUM_UNNECESSARY:
5719                         NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
5720                         NAPI_GRO_CB(skb)->csum_valid = 0;
5721                         break;
5722                 default:
5723                         NAPI_GRO_CB(skb)->csum_cnt = 0;
5724                         NAPI_GRO_CB(skb)->csum_valid = 0;
5725                 }
5726 
5727                 pp = INDIRECT_CALL_INET(ptype->callbacks.gro_receive,
5728                                         ipv6_gro_receive, inet_gro_receive,
5729                                         gro_head, skb);
5730                 break;
5731         }
5732         rcu_read_unlock();
5733 
5734         if (&ptype->list == head)
5735                 goto normal;
5736 
5737         if (IS_ERR(pp) && PTR_ERR(pp) == -EINPROGRESS) {
5738                 ret = GRO_CONSUMED;
5739                 goto ok;
5740         }
5741 
5742         same_flow = NAPI_GRO_CB(skb)->same_flow;
5743         ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
5744 
5745         if (pp) {
5746                 skb_list_del_init(pp);
5747                 napi_gro_complete(napi, pp);
5748                 napi->gro_hash[hash].count--;
5749         }
5750 
5751         if (same_flow)
5752                 goto ok;
5753 
5754         if (NAPI_GRO_CB(skb)->flush)
5755                 goto normal;
5756 
5757         if (unlikely(napi->gro_hash[hash].count >= MAX_GRO_SKBS)) {
5758                 gro_flush_oldest(napi, gro_head);
5759         } else {
5760                 napi->gro_hash[hash].count++;
5761         }
5762         NAPI_GRO_CB(skb)->count = 1;
5763         NAPI_GRO_CB(skb)->age = jiffies;
5764         NAPI_GRO_CB(skb)->last = skb;
5765         skb_shinfo(skb)->gso_size = skb_gro_len(skb);
5766         list_add(&skb->list, gro_head);
5767         ret = GRO_HELD;
5768 
5769 pull:
5770         grow = skb_gro_offset(skb) - skb_headlen(skb);
5771         if (grow > 0)
5772                 gro_pull_from_frag0(skb, grow);
5773 ok:
5774         if (napi->gro_hash[hash].count) {
5775                 if (!test_bit(hash, &napi->gro_bitmask))
5776                         __set_bit(hash, &napi->gro_bitmask);
5777         } else if (test_bit(hash, &napi->gro_bitmask)) {
5778                 __clear_bit(hash, &napi->gro_bitmask);
5779         }
5780 
5781         return ret;
5782 
5783 normal:
5784         ret = GRO_NORMAL;
5785         goto pull;
5786 }
5787 
5788 struct packet_offload *gro_find_receive_by_type(__be16 type)
5789 {
5790         struct list_head *offload_head = &offload_base;
5791         struct packet_offload *ptype;
5792 
5793         list_for_each_entry_rcu(ptype, offload_head, list) {
5794                 if (ptype->type != type || !ptype->callbacks.gro_receive)
5795                         continue;
5796                 return ptype;
5797         }
5798         return NULL;
5799 }
5800 EXPORT_SYMBOL(gro_find_receive_by_type);
5801 
5802 struct packet_offload *gro_find_complete_by_type(__be16 type)
5803 {
5804         struct list_head *offload_head = &offload_base;
5805         struct packet_offload *ptype;
5806 
5807         list_for_each_entry_rcu(ptype, offload_head, list) {
5808                 if (ptype->type != type || !ptype->callbacks.gro_complete)
5809                         continue;
5810                 return ptype;
5811         }
5812         return NULL;
5813 }
5814 EXPORT_SYMBOL(gro_find_complete_by_type);
5815 
5816 static void napi_skb_free_stolen_head(struct sk_buff *skb)
5817 {
5818         skb_dst_drop(skb);
5819         skb_ext_put(skb);
5820         kmem_cache_free(skbuff_head_cache, skb);
5821 }
5822 
5823 static gro_result_t napi_skb_finish(struct napi_struct *napi,
5824                                     struct sk_buff *skb,
5825                                     gro_result_t ret)
5826 {
5827         switch (ret) {
5828         case GRO_NORMAL:
5829                 gro_normal_one(napi, skb);
5830                 break;
5831 
5832         case GRO_DROP:
5833                 kfree_skb(skb);
5834                 break;
5835 
5836         case GRO_MERGED_FREE:
5837                 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
5838                         napi_skb_free_stolen_head(skb);
5839                 else
5840                         __kfree_skb(skb);
5841                 break;
5842 
5843         case GRO_HELD:
5844         case GRO_MERGED:
5845         case GRO_CONSUMED:
5846                 break;
5847         }
5848 
5849         return ret;
5850 }
5851 
5852 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
5853 {
5854         gro_result_t ret;
5855 
5856         skb_mark_napi_id(skb, napi);
5857         trace_napi_gro_receive_entry(skb);
5858 
5859         skb_gro_reset_offset(skb);
5860 
5861         ret = napi_skb_finish(napi, skb, dev_gro_receive(napi, skb));
5862         trace_napi_gro_receive_exit(ret);
5863 
5864         return ret;
5865 }
5866 EXPORT_SYMBOL(napi_gro_receive);
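
/* Example (sketch, not part of dev.c): the usual driver-side pattern around
 * napi_gro_receive() - set the protocol and, if the hardware validated the L4
 * checksum, ip_summed before handing the skb to GRO.  Checksum and hash
 * details vary per device and are assumptions here.
 */
#include <linux/etherdevice.h>
#include <linux/netdevice.h>

static void my_gro_rx_one(struct napi_struct *napi, struct sk_buff *skb,
                          bool hw_csum_ok)
{
        skb->protocol = eth_type_trans(skb, napi->dev);
        if (hw_csum_ok)
                skb->ip_summed = CHECKSUM_UNNECESSARY;
        napi_gro_receive(napi, skb);    /* may merge, hold, or pass the skb up */
}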
5867 
5868 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
5869 {
5870         if (unlikely(skb->pfmemalloc)) {
5871                 consume_skb(skb);
5872                 return;
5873         }
5874         __skb_pull(skb, skb_headlen(skb));
5875         /* restore the reserve we had after netdev_alloc_skb_ip_align() */
5876         skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
5877         __vlan_hwaccel_clear_tag(skb);
5878         skb->dev = napi->dev;
5879         skb->skb_iif = 0;
5880 
5881         /* eth_type_trans() assumes pkt_type is PACKET_HOST */
5882         skb->pkt_type = PACKET_HOST;
5883 
5884         skb->encapsulation = 0;
5885         skb_shinfo(skb)->gso_type = 0;
5886         skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
5887         skb_ext_reset(skb);
5888 
5889         napi->skb = skb;
5890 }
5891 
5892 struct sk_buff *napi_get_frags(struct napi_struct *napi)
5893 {
5894         struct sk_buff *skb = napi->skb;
5895 
5896         if (!skb) {
5897                 skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
5898                 if (skb) {
5899                         napi->skb = skb;
5900                         skb_mark_napi_id(skb, napi);
5901                 }
5902         }
5903         return skb;
5904 }
5905 EXPORT_SYMBOL(napi_get_frags);
5906 
5907 static gro_result_t napi_frags_finish(struct napi_struct *napi,
5908                                       struct sk_buff *skb,
5909                                       gro_result_t ret)
5910 {
5911         switch (ret) {
5912         case GRO_NORMAL:
5913         case GRO_HELD:
5914                 __skb_push(skb, ETH_HLEN);
5915                 skb->protocol = eth_type_trans(skb, skb->dev);
5916                 if (ret == GRO_NORMAL)
5917                         gro_normal_one(napi, skb);
5918                 break;
5919 
5920         case GRO_DROP:
5921                 napi_reuse_skb(napi, skb);
5922                 break;
5923 
5924         case GRO_MERGED_FREE:
5925                 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
5926                         napi_skb_free_stolen_head(skb);
5927                 else
5928                         napi_reuse_skb(napi, skb);
5929                 break;
5930 
5931         case GRO_MERGED:
5932         case GRO_CONSUMED:
5933                 break;
5934         }
5935 
5936         return ret;
5937 }
5938 
5939 /* The upper GRO stack assumes the network header starts at gro_offset=0.
5940  * Drivers could call both napi_gro_frags() and napi_gro_receive(),
5941  * so we copy the ethernet header into skb->data to have a common layout.
5942  */
5943 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
5944 {
5945         struct sk_buff *skb = napi->skb;
5946         const struct ethhdr *eth;
5947         unsigned int hlen = sizeof(*eth);
5948 
5949         napi->skb = NULL;
5950 
5951         skb_reset_mac_header(skb);
5952         skb_gro_reset_offset(skb);
5953 
5954         if (unlikely(skb_gro_header_hard(skb, hlen))) {
5955                 eth = skb_gro_header_slow(skb, hlen, 0);
5956                 if (unlikely(!eth)) {
5957                         net_warn_ratelimited("%s: dropping impossible skb from %s\n",
5958                                              __func__, napi->dev->name);
5959                         napi_reuse_skb(napi, skb);
5960                         return NULL;
5961                 }
5962         } else {
5963                 eth = (const struct ethhdr *)skb->data;
5964                 gro_pull_from_frag0(skb, hlen);
5965                 NAPI_GRO_CB(skb)->frag0 += hlen;
5966                 NAPI_GRO_CB(skb)->frag0_len -= hlen;
5967         }
5968         __skb_pull(skb, hlen);
5969 
5970         /*
5971          * This works because the only protocols we care about don't require
5972          * special handling.
5973          * We'll fix it up properly in napi_frags_finish()
5974          */
5975         skb->protocol = eth->h_proto;
5976 
5977         return skb;
5978 }
5979 
5980 gro_result_t napi_gro_frags(struct napi_struct *napi)
5981 {
5982         gro_result_t ret;
5983         struct sk_buff *skb = napi_frags_skb(napi);
5984 
5985         if (!skb)
5986                 return GRO_DROP;
5987 
5988         trace_napi_gro_frags_entry(skb);
5989 
5990         ret = napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
5991         trace_napi_gro_frags_exit(ret);
5992 
5993         return ret;
5994 }
5995 EXPORT_SYMBOL(napi_gro_frags);
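
/* Example (sketch, not part of dev.c): the header-less RX path built on
 * napi_get_frags() and napi_gro_frags() - the driver attaches page fragments
 * only, and napi_frags_skb() above pulls the ethernet header itself.  The
 * page/offset bookkeeping shown is illustrative.
 */
#include <linux/netdevice.h>
#include <linux/skbuff.h>

static void my_rx_page_frag(struct napi_struct *napi, struct page *page,
                            unsigned int offset, unsigned int len,
                            unsigned int truesize)
{
        struct sk_buff *skb = napi_get_frags(napi);

        if (!skb)
                return;         /* allocation failure; caller drops or recycles */

        skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page, offset, len,
                        truesize);
        napi_gro_frags(napi);   /* consumes napi->skb */
}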
5996 
5997 /* Compute the checksum from gro_offset and return the folded value
5998  * after adding in any pseudo checksum.
5999  */
6000 __sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
6001 {
6002         __wsum wsum;
6003         __sum16 sum;
6004 
6005         wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
6006 
6007         /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
6008         sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
6009         /* See comments in __skb_checksum_complete(). */
6010         if (likely(!sum)) {
6011                 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
6012                     !skb->csum_complete_sw)
6013                         netdev_rx_csum_fault(skb->dev, skb);
6014         }
6015 
6016         NAPI_GRO_CB(skb)->csum = wsum;
6017         NAPI_GRO_CB(skb)->csum_valid = 1;
6018 
6019         return sum;
6020 }
6021 EXPORT_SYMBOL(__skb_gro_checksum_complete);
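
/* Example (sketch, not part of dev.c): how a protocol's gro_receive callback
 * typically reaches the helper above via the skb_gro_checksum_validate()
 * wrappers, in the same way the in-tree TCP offload does.  "my_proto" is
 * hypothetical and the flow matching against 'head' is elided.
 */
#include <linux/netdevice.h>
#include <net/ip.h>

static struct sk_buff *my_proto_gro_receive(struct list_head *head,
                                            struct sk_buff *skb)
{
        /* Falls back to __skb_gro_checksum_complete() when the device has not
         * already validated the checksum.
         */
        if (skb_gro_checksum_validate(skb, IPPROTO_TCP,
                                      inet_gro_compute_pseudo)) {
                NAPI_GRO_CB(skb)->flush = 1;
                return NULL;
        }

        /* ... flow matching and merge decisions would go here ... */
        return NULL;
}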
6022 
6023 static void net_rps_send_ipi(struct softnet_data *remsd)
6024 {
6025 #ifdef CONFIG_RPS
6026         while (remsd) {
6027                 struct softnet_data *next = remsd->rps_ipi_next;
6028 
6029                 if (cpu_online(remsd->cpu))
6030                         smp_call_function_single_async(remsd->cpu, &remsd->csd);
6031                 remsd = next;
6032         }
6033 #endif
6034 }
6035 
6036 /*
6037  * net_rps_action_and_irq_enable sends any pending IPIs for RPS.
6038  * Note: called with local irq disabled, but exits with local irq enabled.
6039  */
6040 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
6041 {
6042 #ifdef CONFIG_RPS
6043         struct softnet_data *remsd = sd->rps_ipi_list;
6044 
6045         if (remsd) {
6046                 sd->rps_ipi_list = NULL;
6047 
6048                 local_irq_enable();
6049 
6050                 /* Send pending IPIs to kick RPS processing on remote cpus. */
6051                 net_rps_send_ipi(remsd);
6052         } else
6053 #endif
6054                 local_irq_enable();
6055 }
6056 
6057 static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
6058 {
6059 #ifdef CONFIG_RPS
6060         return sd->rps_ipi_list != NULL;
6061 #else
6062         return false;
6063 #endif
6064 }
6065 
6066 static int process_backlog(struct napi_struct *napi, int quota)
6067 {
6068         struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
6069         bool again = true;
6070         int work = 0;
6071 
6072         /* Check if we have pending IPIs; it's better to send them now
6073          * rather than waiting for net_rx_action() to end.
6074          */
6075         if (sd_has_rps_ipi_waiting(sd)) {
6076                 local_irq_disable();
6077                 net_rps_action_and_irq_enable(sd);
6078         }
6079 
6080         napi->weight = dev_rx_weight;
6081         while (again) {
6082                 struct sk_buff *skb;
6083 
6084                 while ((skb = __skb_dequeue(&sd->process_queue))) {
6085                         rcu_read_lock();
6086                         __netif_receive_skb(skb);
6087                         rcu_read_unlock();
6088                         input_queue_head_incr(sd);
6089                         if (++work >= quota)
6090                                 return work;
6091 
6092                 }
6093 
6094                 local_irq_disable();
6095                 rps_lock(sd);
6096                 if (skb_queue_empty(&sd->input_pkt_queue)) {
6097                         /*
6098                          * Inline a custom version of __napi_complete().
6099                          * Only the current cpu owns and manipulates this napi,
6100                          * and NAPI_STATE_SCHED is the only possible flag set
6101                          * on backlog.
6102                          * We can use a plain write instead of clear_bit(),
6103                          * and we don't need an smp_mb() memory barrier.
6104                          */
6105                         napi->state = 0;
6106                         again = false;
6107                 } else {
6108                         skb_queue_splice_tail_init(&sd->input_pkt_queue,
6109                                                    &sd->process_queue);
6110                 }
6111                 rps_unlock(sd);
6112                 local_irq_enable();
6113         }
6114 
6115         return work;
6116 }
6117 
6118 /**
6119  * __napi_schedule - schedule for receive
6120  * @n: entry to schedule
6121  *
6122  * The entry's receive function will be scheduled to run.
6123  * Consider using __napi_schedule_irqoff() if hard irqs are masked.
6124  */
6125 void __napi_schedule(struct napi_struct *n)
6126 {
6127         unsigned long flags;
6128 
6129         local_irq_save(flags);
6130         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
6131         local_irq_restore(flags);
6132 }
6133 EXPORT_SYMBOL(__napi_schedule);
6134 
6135 /**
6136  *      napi_schedule_prep - check if napi can be scheduled
6137  *      @n: napi context
6138  *
6139  * Test if NAPI routine is already running, and if not mark
6140  * it as running.  This is used as a condition variable to
6141  * ensure that only one NAPI poll instance runs.  We also make
6142  * sure there is no pending NAPI disable.
6143  */
6144 bool napi_schedule_prep(struct napi_struct *n)
6145 {
6146         unsigned long val, new;
6147 
6148         do {
6149                 val = READ_ONCE(n->state);
6150                 if (unlikely(val & NAPIF_STATE_DISABLE))
6151                         return false;
6152                 new = val | NAPIF_STATE_SCHED;
6153 
6154                 /* Sets the STATE_MISSED bit if STATE_SCHED was already set.
6155                  * This was suggested by Alexander Duyck, as the compiler
6156                  * emits better code than:
6157                  * if (val & NAPIF_STATE_SCHED)
6158                  *     new |= NAPIF_STATE_MISSED;
6159                  */
6160                 new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED *
6161                                                    NAPIF_STATE_MISSED;
6162         } while (cmpxchg(&n->state, val, new) != val);
6163 
6164         return !(val & NAPIF_STATE_SCHED);
6165 }
6166 EXPORT_SYMBOL(napi_schedule_prep);
6167 
6168 /**
6169  * __napi_schedule_irqoff - schedule for receive
6170  * @n: entry to schedule
6171  *
6172  * Variant of __napi_schedule() assuming hard irqs are masked
6173  */
6174 void __napi_schedule_irqoff(struct napi_struct *n)
6175 {
6176         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
6177 }
6178 EXPORT_SYMBOL(__napi_schedule_irqoff);
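
/* A minimal sketch of the usual hard-irq pattern built from the helpers
 * above: the handler claims the instance with napi_schedule_prep() and,
 * because hard irqs are masked in this context, schedules it with
 * __napi_schedule_irqoff().  The mydrv_* names are hypothetical; the
 * irqreturn_t bits come from <linux/interrupt.h>.
 */
struct mydrv_example_priv {
        struct napi_struct napi;
        /* ... hypothetical device state ... */
};

static irqreturn_t mydrv_example_isr(int irq, void *dev_id)
{
        struct mydrv_example_priv *priv = dev_id;

        if (napi_schedule_prep(&priv->napi)) {
                /* A real driver would also mask its RX interrupt here. */
                __napi_schedule_irqoff(&priv->napi);
        }

        return IRQ_HANDLED;
}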
6179 
6180 bool napi_complete_done(struct napi_struct *n, int work_done)
6181 {
6182         unsigned long flags, val, new;
6183 
6184         /*
6185          * 1) Don't let napi dequeue from the cpu poll list
6186          *    just in case it's running on a different cpu.
6187          * 2) If we are busy polling, do nothing here, we have
6188          *    the guarantee we will be called later.
6189          */
6190         if (unlikely(n->state & (NAPIF_STATE_NPSVC |
6191                                  NAPIF_STATE_IN_BUSY_POLL)))
6192                 return false;
6193 
6194         if (n->gro_bitmask) {
6195                 unsigned long timeout = 0;
6196 
6197                 if (work_done)
6198                         timeout = n->dev->gro_flush_timeout;
6199 
6200                 /* When the NAPI instance uses a timeout and keeps postponing
6201                  * it, we need to somehow bound the time packets are kept in
6202                  * the GRO layer.
6203                  */
6204                 napi_gro_flush(n, !!timeout);
6205                 if (timeout)
6206                         hrtimer_start(&n->timer, ns_to_ktime(timeout),
6207                                       HRTIMER_MODE_REL_PINNED);
6208         }
6209 
6210         gro_normal_list(n);
6211 
6212         if (unlikely(!list_empty(&n->poll_list))) {
6213                 /* If n->poll_list is not empty, we need to mask irqs */
6214                 local_irq_save(flags);
6215                 list_del_init(&n->poll_list);
6216                 local_irq_restore(flags);
6217         }
6218 
6219         do {
6220                 val = READ_ONCE(n->state);
6221 
6222                 WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));
6223 
6224                 new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED);
6225 
6226                 /* If STATE_MISSED was set, leave STATE_SCHED set,
6227                  * because we will call napi->poll() one more time.
6228                  * This C code was suggested by Alexander Duyck to help gcc.
6229                  */
6230                 new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED *
6231                                                     NAPIF_STATE_SCHED;
6232         } while (cmpxchg(&n->state, val, new) != val);
6233 
6234         if (unlikely(val & NAPIF_STATE_MISSED)) {
6235                 __napi_schedule(n);
6236                 return false;
6237         }
6238 
6239         return true;
6240 }
6241 EXPORT_SYMBOL(napi_complete_done);
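
/* A minimal sketch of the poll callback that napi_complete_done() pairs
 * with: the driver reports how much work it actually did and only completes
 * (and re-enables its interrupt) when it did not exhaust the budget.  The
 * mydrv_* helpers are hypothetical.
 */
static int mydrv_example_poll(struct napi_struct *napi, int budget)
{
        int work_done = mydrv_clean_rx(napi, budget);   /* hypothetical RX work */

        if (work_done < budget &&
            napi_complete_done(napi, work_done))
                mydrv_enable_rx_irq(napi->dev);         /* hypothetical re-arm */

        return work_done;
}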
6242 
6243 /* must be called under rcu_read_lock(), as we don't take a reference */
6244 static struct napi_struct *napi_by_id(unsigned int napi_id)
6245 {
6246         unsigned int hash = napi_id % HASH_SIZE(napi_hash);
6247         struct napi_struct *napi;
6248 
6249         hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
6250                 if (napi->napi_id == napi_id)
6251                         return napi;
6252 
6253         return NULL;
6254 }
6255 
6256 #if defined(CONFIG_NET_RX_BUSY_POLL)
6257 
6258 #define BUSY_POLL_BUDGET 8
6259 
6260 static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
6261 {
6262         int rc;
6263 
6264         /* Busy polling means there is a high chance device driver hard irq
6265          * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was
6266          * set in napi_schedule_prep().
6267          * Since we are about to call napi->poll() once more, we can safely
6268          * clear NAPI_STATE_MISSED.
6269          *
6270          * Note: x86 could use a single "lock and ..." instruction
6271          * to perform these two clear_bit() calls.
6272          */
6273         clear_bit(NAPI_STATE_MISSED, &napi->state);
6274         clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
6275 
6276         local_bh_disable();
6277 
6278         /* All we really want here is to re-enable device interrupts.
6279          * Ideally, a new ndo_busy_poll_stop() could avoid another round.
6280          */
6281         rc = napi->poll(napi, BUSY_POLL_BUDGET);
6282         /* We can't gro_normal_list() here, because napi->poll() might have
6283          * rearmed the napi (napi_complete_done()) in which case it could
6284          * already be running on another CPU.
6285          */
6286         trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
6287         netpoll_poll_unlock(have_poll_lock);
6288         if (rc == BUSY_POLL_BUDGET) {
6289                 /* As the whole budget was spent, we still own the napi so can
6290                  * safely handle the rx_list.
6291                  */
6292                 gro_normal_list(napi);
6293                 __napi_schedule(napi);
6294         }
6295         local_bh_enable();
6296 }
6297 
6298 void napi_busy_loop(unsigned int napi_id,
6299                     bool (*loop_end)(void *, unsigned long),
6300                     void *loop_end_arg)
6301 {
6302         unsigned long start_time = loop_end ? busy_loop_current_time() : 0;
6303         int (*napi_poll)(struct napi_struct *napi, int budget);
6304         void *have_poll_lock = NULL;
6305         struct napi_struct *napi;
6306 
6307 restart:
6308         napi_poll = NULL;
6309 
6310         rcu_read_lock();
6311 
6312         napi = napi_by_id(napi_id);
6313         if (!napi)
6314                 goto out;
6315 
6316         preempt_disable();
6317         for (;;) {
6318                 int work = 0;
6319 
6320                 local_bh_disable();
6321                 if (!napi_poll) {
6322                         unsigned long val = READ_ONCE(napi->state);
6323 
6324                         /* If multiple threads are competing for this napi,
6325                          * we avoid dirtying napi->state as much as we can.
6326                          */
6327                         if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
6328                                    NAPIF_STATE_IN_BUSY_POLL))
6329                                 goto count;
6330                         if (cmpxchg(&napi->state, val,
6331                                     val | NAPIF_STATE_IN_BUSY_POLL |
6332                                           NAPIF_STATE_SCHED) != val)
6333                                 goto count;
6334                         have_poll_lock = netpoll_poll_lock(napi);
6335                         napi_poll = napi->poll;
6336                 }
6337                 work = napi_poll(napi, BUSY_POLL_BUDGET);
6338                 trace_napi_poll(napi, work, BUSY_POLL_BUDGET);
6339                 gro_normal_list(napi);
6340 count:
6341                 if (work > 0)
6342                         __NET_ADD_STATS(dev_net(napi->dev),
6343                                         LINUX_MIB_BUSYPOLLRXPACKETS, work);
6344                 local_bh_enable();
6345 
6346                 if (!loop_end || loop_end(loop_end_arg, start_time))
6347                         break;
6348 
6349                 if (unlikely(need_resched())) {
6350                         if (napi_poll)
6351                                 busy_poll_stop(napi, have_poll_lock);
6352                         preempt_enable();
6353                         rcu_read_unlock();
6354                         cond_resched();
6355                         if (loop_end(loop_end_arg, start_time))
6356                                 return;
6357                         goto restart;
6358                 }
6359                 cpu_relax();
6360         }
6361         if (napi_poll)
6362                 busy_poll_stop(napi, have_poll_lock);
6363         preempt_enable();
6364 out:
6365         rcu_read_unlock();
6366 }
6367 EXPORT_SYMBOL(napi_busy_loop);
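
/* A minimal sketch of a napi_busy_loop() caller: it supplies the napi_id it
 * is interested in, an end-condition callback and a cookie for it; passing a
 * NULL loop_end makes the helper poll exactly once.  Everything here except
 * napi_busy_loop() itself is hypothetical.
 */
static bool example_loop_end(void *arg, unsigned long start_time)
{
        unsigned long *deadline = arg;          /* cookie passed below */

        return time_after(jiffies, *deadline);
}

static void example_busy_poll(unsigned int napi_id)
{
        unsigned long deadline = jiffies + usecs_to_jiffies(50);

        napi_busy_loop(napi_id, example_loop_end, &deadline);
}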
6368 
6369 #endif /* CONFIG_NET_RX_BUSY_POLL */
6370 
6371 static void napi_hash_add(struct napi_struct *napi)
6372 {
6373         if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) ||
6374             test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
6375                 return;
6376 
6377         spin_lock(&napi_hash_lock);
6378 
6379         /* 0..NR_CPUS range is reserved for sender_cpu use */
6380         do {
6381                 if (unlikely(++napi_gen_id < MIN_NAPI_ID))
6382                         napi_gen_id = MIN_NAPI_ID;
6383         } while (napi_by_id(napi_gen_id));
6384         napi->napi_id = napi_gen_id;
6385 
6386         hlist_add_head_rcu(&napi->napi_hash_node,
6387                            &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
6388 
6389         spin_unlock(&napi_hash_lock);
6390 }
6391 
6392 /* Warning: the caller is responsible for making sure an rcu grace period
6393  * is respected before freeing the memory containing @napi.
6394  */
6395 bool napi_hash_del(struct napi_struct *napi)
6396 {
6397         bool rcu_sync_needed = false;
6398 
6399         spin_lock(&napi_hash_lock);
6400 
6401         if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) {
6402                 rcu_sync_needed = true;
6403                 hlist_del_rcu(&napi->napi_hash_node);
6404         }
6405         spin_unlock(&napi_hash_lock);
6406         return rcu_sync_needed;
6407 }
6408 EXPORT_SYMBOL_GPL(napi_hash_del);
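
/* A minimal sketch honouring the warning above: a caller that unhashes a
 * napi directly (rather than via netif_napi_del(), which follows the same
 * pattern further below) must let an RCU grace period elapse before freeing
 * the structure that contains it.  This reuses the hypothetical
 * mydrv_example_priv wrapper from the earlier sketch.
 */
static void example_release_napi(struct mydrv_example_priv *priv)
{
        if (napi_hash_del(&priv->napi))
                synchronize_net();      /* readers may still walk napi_hash */

        kfree(priv);                    /* now safe to free the container */
}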
6409 
6410 static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
6411 {
6412         struct napi_struct *napi;
6413 
6414         napi = container_of(timer, struct napi_struct, timer);
6415 
6416         /* Note: we use a relaxed variant of napi_schedule_prep(), not setting
6417          * NAPI_STATE_MISSED, since we do not react to a device IRQ.
6418          */
6419         if (napi->gro_bitmask && !napi_disable_pending(napi) &&
6420             !test_and_set_bit(NAPI_STATE_SCHED, &napi->state))
6421                 __napi_schedule_irqoff(napi);
6422 
6423         return HRTIMER_NORESTART;
6424 }
6425 
6426 static void init_gro_hash(struct napi_struct *napi)
6427 {
6428         int i;
6429 
6430         for (i = 0; i < GRO_HASH_BUCKETS; i++) {
6431                 INIT_LIST_HEAD(&napi->gro_hash[i].list);
6432                 napi->gro_hash[i].count = 0;
6433         }
6434         napi->gro_bitmask = 0;
6435 }
6436 
6437 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
6438                     int (*poll)(struct napi_struct *, int), int weight)
6439 {
6440         INIT_LIST_HEAD(&napi->poll_list);
6441         hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
6442         napi->timer.function = napi_watchdog;
6443         init_gro_hash(napi);
6444         napi->skb = NULL;
6445         INIT_LIST_HEAD(&napi->rx_list);
6446         napi->rx_count = 0;
6447         napi->poll = poll;
6448         if (weight > NAPI_POLL_WEIGHT)
6449                 netdev_err_once(dev, "%s() called with weight %d\n", __func__,
6450                                 weight);
6451         napi->weight = weight;
6452         list_add(&napi->dev_list, &dev->napi_list);
6453         napi->dev = dev;
6454 #ifdef CONFIG_NETPOLL
6455         napi->poll_owner = -1;
6456 #endif
6457         set_bit(NAPI_STATE_SCHED, &napi->state);
6458         napi_hash_add(napi);
6459 }
6460 EXPORT_SYMBOL(netif_napi_add);
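
/* A minimal registration sketch for the probe path, reusing the hypothetical
 * mydrv_example_poll() from the earlier sketch; NAPI_POLL_WEIGHT (64) is the
 * conventional weight.  Note that netif_napi_add() leaves NAPI_STATE_SCHED
 * set, so the instance must be napi_enable()d before first use.
 */
static void mydrv_example_register(struct net_device *dev,
                                   struct mydrv_example_priv *priv)
{
        netif_napi_add(dev, &priv->napi, mydrv_example_poll, NAPI_POLL_WEIGHT);
        napi_enable(&priv->napi);       /* clears NAPI_STATE_SCHED */
}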
6461 
6462 void napi_disable(struct napi_struct *n)
6463 {
6464         might_sleep();
6465         set_bit(NAPI_STATE_DISABLE, &n->state);
6466 
6467         while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
6468                 msleep(1);
6469         while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
6470                 msleep(1);
6471 
6472         hrtimer_cancel(&n->timer);
6473 
6474         clear_bit(NAPI_STATE_DISABLE, &n->state);
6475 }
6476 EXPORT_SYMBOL(napi_disable);
6477 
6478 static void flush_gro_hash(struct napi_struct *napi)
6479 {
6480         int i;
6481 
6482         for (i = 0; i < GRO_HASH_BUCKETS; i++) {
6483                 struct sk_buff *skb, *n;
6484 
6485                 list_for_each_entry_safe(skb, n, &napi->gro_hash[i].list, list)
6486                         kfree_skb(skb);
6487                 napi->gro_hash[i].count = 0;
6488         }
6489 }
6490 
6491 /* Must be called in process context */
6492 void netif_napi_del(struct napi_struct *napi)
6493 {
6494         might_sleep();
6495         if (napi_hash_del(napi))
6496                 synchronize_net();
6497         list_del_init(&napi->dev_list);
6498         napi_free_frags(napi);
6499 
6500         flush_gro_hash(napi);
6501         napi->gro_bitmask = 0;
6502 }
6503 EXPORT_SYMBOL(netif_napi_del);
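
/* A minimal teardown sketch matching the two helpers above: quiesce the
 * instance first, then remove it.  Both steps may sleep (msleep() and, via
 * napi_hash_del(), synchronize_net()), so this must run in process context,
 * typically before unregister_netdev()/free_netdev().
 */
static void mydrv_example_unregister(struct mydrv_example_priv *priv)
{
        napi_disable(&priv->napi);      /* waits for an in-flight poll */
        netif_napi_del(&priv->napi);    /* unhashes, frees pending GRO skbs */
}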
6504 
6505 static int napi_poll(struct napi_struct *n, struct list_head *repoll)
6506 {
6507         void *have;
6508         int work, weight;
6509 
6510         list_del_init(&n->poll_list);
6511 
6512         have = netpoll_poll_lock(n);
6513 
6514         weight = n->weight;
6515 
6516         /* This NAPI_STATE_SCHED test is for avoiding a race
6517          * with netpoll's poll_napi().  Only the entity which
6518          * obtains the lock and sees NAPI_STATE_SCHED set will
6519          * actually make the ->poll() call.  Therefore we avoid
6520          * accidentally calling ->poll() when NAPI is not scheduled.
6521          */
6522         work = 0;
6523         if (test_bit(NAPI_STATE_SCHED, &n->state)) {
6524                 work = n->poll(n, weight);
6525                 trace_napi_poll(n, work, weight);
6526         }
6527 
6528         WARN_ON_ONCE(work > weight);
6529 
6530         if (likely(work < weight))
6531                 goto out_unlock;
6532 
6533         /* Drivers must not modify the NAPI state if they
6534          * consume the entire weight.  In such cases this code
6535          * still "owns" the NAPI instance and therefore can
6536          * move the instance around on the list at-will.
6537          */
6538         if (unlikely(napi_disable_pending(n))) {
6539                 napi_complete(n);
6540                 goto out_unlock;
6541         }
6542 
6543         if (n->gro_bitmask) {
6544                 /* Flush packets that are too old.
6545                  * If HZ < 1000, flush all packets.
6546                  */
6547                 napi_gro_flush(n, HZ >= 1000);
6548         }
6549 
6550         gro_normal_list(n);
6551 
6552         /* Some drivers may have called napi_schedule
6553          * prior to exhausting their budget.
6554          */
6555         if (unlikely(!list_empty(&n->poll_list))) {
6556                 pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
6557                              n->dev ? n->dev->name : "backlog");
6558                 goto out_unlock;
6559         }
6560 
6561         list_add_tail(&n->poll_list, repoll);
6562 
6563 out_unlock:
6564         netpoll_poll_unlock(have);
6565 
6566         return work;
6567 }
6568 
6569 static __latent_entropy void net_rx_action(struct softirq_action *h)
6570 {
6571         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
6572         unsigned long time_limit = jiffies +
6573                 usecs_to_jiffies(netdev_budget_usecs);
6574         int budget = netdev_budget;
6575         LIST_HEAD(list);
6576         LIST_HEAD(repoll);
6577 
6578         local_irq_disable();
6579         list_splice_init(&sd->poll_list, &list);
6580         local_irq_enable();
6581 
6582         for (;;) {
6583                 struct napi_struct *n;
6584 
6585                 if (list_empty(&list)) {
6586                         if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
6587                                 goto out;
6588                         break;
6589                 }
6590 
6591                 n = list_first_entry(&list, struct napi_struct, poll_list);
6592                 budget -= napi_poll(n, &repoll);
6593 
6594                 /* If softirq window is exhausted then punt.
6595                  * Allow this to run for 2 jiffies, since that will allow
6596                  * an average latency of 1.5/HZ.
6597                  */
6598                 if (unlikely(budget <= 0 ||
6599                              time_after_eq(jiffies, time_limit))) {
6600                         sd->time_squeeze++;
6601                         break;
6602                 }
6603         }
6604 
6605         local_irq_disable();
6606 
6607         list_splice_tail_init(&sd->poll_list, &list);
6608         list_splice_tail(&repoll, &list);
6609         list_splice(&list, &sd->poll_list);
6610         if (!list_empty(&sd->poll_list))
6611                 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
6612 
6613         net_rps_action_and_irq_enable(sd);
6614 out:
6615         __kfree_skb_flush();
6616 }
6617 
6618 struct netdev_adjacent {
6619         struct net_device *dev;
6620 
6621         /* upper master flag, there can only be one master device per list */
6622         bool master;
6623 
6624         /* lookup ignore flag */
6625         bool ignore;
6626 
6627         /* counter for the number of times this device was added to us */
6628         u16 ref_nr;
6629 
6630         /* private field for the users */
6631         void *private;
6632 
6633         struct list_head list;
6634         struct rcu_head rcu;
6635 };
6636 
6637 static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
6638                                                  struct list_head *adj_list)
6639 {
6640         struct netdev_adjacent *adj;
6641 
6642         list_for_each_entry(adj, adj_list, list) {
6643                 if (adj->dev == adj_dev)
6644                         return adj;
6645         }
6646         return NULL;
6647 }
6648 
6649 static int ____netdev_has_upper_dev(struct net_device *upper_dev, void *data)
6650 {
6651         struct net_device *dev = data;
6652 
6653         return upper_dev == dev;
6654 }
6655 
6656 /**
6657  * netdev_has_upper_dev - Check if device is linked to an upper device
6658  * @dev: device
6659  * @upper_dev: upper device to check
6660  *
6661  * Find out if a device is linked to the specified upper device and return true
6662  * in case it is. Note that this checks the entire upper device chain, not
6663  * only the immediate upper device. The caller must hold the RTNL lock.
6664  */
6665 bool netdev_has_upper_dev(struct net_device *dev,
6666                           struct net_device *upper_dev)
6667 {
6668         ASSERT_RTNL();
6669 
6670         return netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
6671                                              upper_dev);
6672 }
6673 EXPORT_SYMBOL(netdev_has_upper_dev);
6674 
6675 /**
6676  * netdev_has_upper_dev_all_rcu - Check if device is linked to an upper device
6677  * @dev: device
6678  * @upper_dev: upper device to check
6679  *
6680  * Find out if a device is linked to the specified upper device and return true
6681  * in case it is. Note that this checks the entire upper device chain.
6682  * The caller must hold the RCU read lock.
6683  */
6684 
6685 bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
6686                                   struct net_device *upper_dev)
6687 {
6688         return !!netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
6689                                                upper_dev);
6690 }
6691 EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu);
6692 
6693 /**
6694  * netdev_has_any_upper_dev - Check if device is linked to some device
6695  * @dev: device
6696  *
6697  * Find out if a device is linked to an upper device and return true in case
6698  * it is. The caller must hold the RTNL lock.
6699  */
6700 bool netdev_has_any_upper_dev(struct net_device *dev)
6701 {
6702         ASSERT_RTNL();
6703 
6704         return !list_empty(&dev->adj_list.upper);
6705 }
6706 EXPORT_SYMBOL(netdev_has_any_upper_dev);
6707 
6708 /**
6709  * netdev_master_upper_dev_get - Get master upper device
6710  * @dev: device
6711  *
6712  * Find a master upper device and return a pointer to it, or NULL if
6713  * there is none. The caller must hold the RTNL lock.
6714  */
6715 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
6716 {
6717         struct netdev_adjacent *upper;
6718 
6719         ASSERT_RTNL();
6720 
6721         if (list_empty(&dev->adj_list.upper))
6722                 return NULL;
6723 
6724         upper = list_first_entry(&dev->adj_list.upper,
6725                                  struct netdev_adjacent, list);
6726         if (likely(upper->master))
6727                 return upper->dev;
6728         return NULL;
6729 }
6730 EXPORT_SYMBOL(netdev_master_upper_dev_get);
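
/* A minimal sketch of how the adjacency queries above are typically used
 * from control-path code under RTNL: check whether dev is stacked under
 * candidate at all, and whether candidate is in fact its master.  The
 * function itself is hypothetical.
 */
static bool example_is_mastered_by(struct net_device *dev,
                                   struct net_device *candidate)
{
        ASSERT_RTNL();

        if (!netdev_has_upper_dev(dev, candidate))
                return false;

        return netdev_master_upper_dev_get(dev) == candidate;
}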
6731 
6732 static struct net_device *__netdev_master_upper_dev_get(struct net_device *dev)
6733 {
6734         struct netdev_adjacent *upper;
6735 
6736         ASSERT_RTNL();
6737 
6738         if (list_empty(&dev->adj_list.upper))
6739                 return NULL;
6740 
6741         upper = list_first_entry(&dev->adj_list.upper,
6742                                  struct netdev_adjacent, list);
6743         if (likely(upper->master) && !upper->ignore)
6744                 return upper->dev;
6745         return NULL;
6746 }
6747 
6748 /**
6749  * netdev_has_any_lower_dev - Check if device is linked to some device
6750  * @dev: device
6751  *
6752  * Find out if a device is linked to a lower device and return true in case
6753  * it is. The caller must hold the RTNL lock.
6754  */
6755 static bool netdev_has_any_lower_dev(struct net_device *dev)
6756 {
6757         ASSERT_RTNL();
6758 
6759         return !list_empty(&dev->adj_list.lower);
6760 }
6761 
6762 void *netdev_adjacent_get_private(struct list_head *adj_list)
6763 {
6764         struct netdev_adjacent *adj;
6765 
6766         adj = list_entry(adj_list, struct netdev_adjacent, list);
6767 
6768         return adj->private;
6769 }
6770 EXPORT_SYMBOL(netdev_adjacent_get_private);
6771 
6772 /**
6773  * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
6774  * @dev: device
6775  * @iter: list_head ** of the current position
6776  *
6777  * Gets the next device from the