/*
 *	NET3	Protocol independent device support routines.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 *	Derived from the non IP parts of dev.c 1.0.19
 *	Authors:	Ross Biro
 *			Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *			Mark Evans, <evansmp@uhura.aston.ac.uk>
 *
 *	Additional Authors:
 *		Florian la Roche <rzsfl@rz.uni-sb.de>
 *		Alan Cox <gw4pts@gw4pts.ampr.org>
 *		David Hinds <dahinds@users.sourceforge.net>
 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
 *		Adam Sulmicki <adam@cfar.umd.edu>
 *		Pekka Riikonen <priikone@poesidon.pspt.fi>
 *
 *	Changes:
 *		D.J. Barrow	:	Fixed bug where dev->refcnt gets set
 *					to 2 if register_netdev gets called
 *					before net_dev_init & also removed a
 *					few lines of code in the process.
 *		Alan Cox	:	device private ioctl copies fields back.
 *		Alan Cox	:	Transmit queue code does relevant
 *					stunts to keep the queue safe.
 *		Alan Cox	:	Fixed double lock.
 *		Alan Cox	:	Fixed promisc NULL pointer trap
 *		????????	:	Support the full private ioctl range
 *		Alan Cox	:	Moved ioctl permission check into
 *					drivers
 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
 *		Alan Cox	:	100 backlog just doesn't cut it when
 *					you start doing multicast video 8)
 *		Alan Cox	:	Rewrote net_bh and list manager.
 *		Alan Cox	:	Fix ETH_P_ALL echoback lengths.
 *		Alan Cox	:	Took out transmit every packet pass
 *					Saved a few bytes in the ioctl handler
 *		Alan Cox	:	Network driver sets packet type before
 *					calling netif_rx. Saves a function
 *					call a packet.
 *		Alan Cox	:	Hashed net_bh()
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
 *		Alan Cox	:	Device lock protection.
 *		Alan Cox	:	Fixed nasty side effect of device close
 *					changes.
 *		Rudi Cilibrasi	:	Pass the right thing to
 *					set_mac_address()
 *		Dave Miller	:	32bit quantity for the device lock to
 *					make it work out on a Sparc.
 *		Bjorn Ekwall	:	Added KERNELD hack.
 *		Alan Cox	:	Cleaned up the backlog initialise.
 *		Craig Metz	:	SIOCGIFCONF fix if space for under
 *					1 device.
 *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
 *					is no device open function.
 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
 *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
 *		Cyrus Durgin	:	Cleaned for KMOD
 *		Adam Sulmicki	:	Bug Fix : Network Device Unload
 *					A network device unload needs to purge
 *					the backlog queue.
 *	Paul Rusty Russell	:	SIOCSIFNAME
 *		Pekka Riikonen	:	Netdev boot-time settings code
 *		Andrew Morton	:	Make unregister_netdevice wait
 *					indefinitely on dev->refcnt
 *		J Hadi Salim	:	- Backlog queue sampling
 *					- netif_rx() feedback
 */

#include <asm/uaccess.h>
#include <linux/bitops.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/mutex.h>
#include <linux/rwsem.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/notifier.h>
#include <linux/skbuff.h>
#include <linux/bpf.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/busy_poll.h>
#include <linux/rtnetlink.h>
#include <linux/stat.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/pkt_sched.h>
#include <net/checksum.h>
#include <net/xfrm.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/netpoll.h>
#include <linux/rcupdate.h>
#include <linux/delay.h>
#include <net/iw_handler.h>
#include <asm/current.h>
#include <linux/audit.h>
#include <linux/dmaengine.h>
#include <linux/err.h>
#include <linux/ctype.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <linux/ip.h>
#include <net/ip.h>
#include <net/mpls.h>
#include <linux/ipv6.h>
#include <linux/in.h>
#include <linux/jhash.h>
#include <linux/random.h>
#include <trace/events/napi.h>
#include <trace/events/net.h>
#include <trace/events/skb.h>
#include <linux/pci.h>
#include <linux/inetdevice.h>
#include <linux/cpu_rmap.h>
#include <linux/static_key.h>
#include <linux/hashtable.h>
#include <linux/vmalloc.h>
#include <linux/if_macvlan.h>
#include <linux/errqueue.h>
#include <linux/hrtimer.h>
#include <linux/netfilter_ingress.h>
#include <linux/sctp.h>
#include <linux/crash_dump.h>

#include "net-sysfs.h"

/* Instead of increasing this, you should create a hash table. */
#define MAX_GRO_SKBS 8

/* This should be increased if a protocol with a bigger head is added. */
#define GRO_MAX_HEAD (MAX_HEADER + 128)

static DEFINE_SPINLOCK(ptype_lock);
static DEFINE_SPINLOCK(offload_lock);
struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
struct list_head ptype_all __read_mostly;	/* Taps */
static struct list_head offload_base __read_mostly;

static int netif_rx_internal(struct sk_buff *skb);
static int call_netdevice_notifiers_info(unsigned long val,
					 struct net_device *dev,
					 struct netdev_notifier_info *info);

/*
 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 * semaphore.
 *
 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 *
 * Writers must hold the rtnl semaphore while they loop through the
 * dev_base_head list, and hold dev_base_lock for writing when they do the
 * actual updates.  This allows pure readers to access the list even
 * while a writer is preparing to update it.
 *
 * To put it another way, dev_base_lock is held for writing only to
 * protect against pure readers; the rtnl semaphore provides the
 * protection against other writers.
 *
 * See, for example usages, register_netdevice() and
 * unregister_netdevice(), which must be called with the rtnl
 * semaphore held.
 */
DEFINE_RWLOCK(dev_base_lock);
EXPORT_SYMBOL(dev_base_lock);

/* protects napi_hash addition/deletion and napi_gen_id */
static DEFINE_SPINLOCK(napi_hash_lock);

static unsigned int napi_gen_id = NR_CPUS;
static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);

static DECLARE_RWSEM(devnet_rename_sem);

static inline void dev_base_seq_inc(struct net *net)
{
	while (++net->dev_base_seq == 0);
}

static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
{
	unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ));

	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
}

static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
{
	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
}

static inline void rps_lock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spin_lock(&sd->input_pkt_queue.lock);
#endif
}

static inline void rps_unlock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spin_unlock(&sd->input_pkt_queue.lock);
#endif
}

/* Device list insertion */
static void list_netdevice(struct net_device *dev)
{
	struct net *net = dev_net(dev);

	ASSERT_RTNL();

	write_lock_bh(&dev_base_lock);
	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	hlist_add_head_rcu(&dev->index_hlist,
			   dev_index_hash(net, dev->ifindex));
	write_unlock_bh(&dev_base_lock);

	dev_base_seq_inc(net);
}

/* Device list removal
 * caller must respect a RCU grace period before freeing/reusing dev
 */
static void unlist_netdevice(struct net_device *dev)
{
	ASSERT_RTNL();

	/* Unlink dev from the device chain */
	write_lock_bh(&dev_base_lock);
	list_del_rcu(&dev->dev_list);
	hlist_del_rcu(&dev->name_hlist);
	hlist_del_rcu(&dev->index_hlist);
	write_unlock_bh(&dev_base_lock);

	dev_base_seq_inc(dev_net(dev));
}

/*
 *	Our notifier list
 */

static RAW_NOTIFIER_HEAD(netdev_chain);

/*
 *	Device drivers call our routines to queue packets here. We empty the
 *	queue in the local softnet handler.
 */

DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
EXPORT_PER_CPU_SYMBOL(softnet_data);

#ifdef CONFIG_LOCKDEP
/*
 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 * according to dev->type
 */
static const unsigned short netdev_lock_type[] =
	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
	 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};

static const char *const netdev_lock_name[] =
	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
	 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
	 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
	 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};

static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];

static inline unsigned short netdev_lock_pos(unsigned short dev_type)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
		if (netdev_lock_type[i] == dev_type)
			return i;
	/* the last key is used by default */
	return ARRAY_SIZE(netdev_lock_type) - 1;
}

static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
	int i;

	i = netdev_lock_pos(dev_type);
	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
				   netdev_lock_name[i]);
}

static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
	int i;

	i = netdev_lock_pos(dev->type);
	lockdep_set_class_and_name(&dev->addr_list_lock,
				   &netdev_addr_lock_key[i],
				   netdev_lock_name[i]);
}
#else
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
}
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
#endif

/*******************************************************************************

		Protocol management and registration routines

*******************************************************************************/

/*
 *	Add a protocol ID to the list. Now that the input handler is
 *	smarter we can dispense with all the messy stuff that used to be
 *	here.
 *
 *	BEWARE!!! Protocol handlers, mangling input packets,
 *	MUST BE last in hash buckets and checking protocol handlers
 *	MUST start from promiscuous ptype_all chain in net_bh.
 *	It is true now, do not change it.
 *	Explanation follows: if a protocol handler that mangles the packet
 *	is first on the list, it cannot sense that the packet is cloned
 *	and should be copied-on-write, so it will change it and subsequent
 *	readers will get a broken packet.
 *							--ANK (980803)
 */

static inline struct list_head *ptype_head(const struct packet_type *pt)
{
	if (pt->type == htons(ETH_P_ALL))
		return pt->dev ? &pt->dev->ptype_all : &ptype_all;
	else
		return pt->dev ? &pt->dev->ptype_specific :
				 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
}

/**
 *	dev_add_pack - add packet handler
 *	@pt: packet type declaration
 *
 *	Add a protocol handler to the networking stack. The passed &packet_type
 *	is linked into kernel lists and may not be freed until it has been
 *	removed from the kernel lists.
 *
 *	This call does not sleep, therefore it cannot guarantee that all CPUs
 *	that are in the middle of receiving packets will see the new packet
 *	type (until the next received packet).
 */

void dev_add_pack(struct packet_type *pt)
{
	struct list_head *head = ptype_head(pt);

	spin_lock(&ptype_lock);
	list_add_rcu(&pt->list, head);
	spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(dev_add_pack);

/**
 *	__dev_remove_pack - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *	The packet type might still be in use by receivers
 *	and must not be freed until after all the CPUs have gone
 *	through a quiescent state.
 */
void __dev_remove_pack(struct packet_type *pt)
{
	struct list_head *head = ptype_head(pt);
	struct packet_type *pt1;

	spin_lock(&ptype_lock);

	list_for_each_entry(pt1, head, list) {
		if (pt == pt1) {
			list_del_rcu(&pt->list);
			goto out;
		}
	}

	pr_warn("dev_remove_pack: %p not found\n", pt);
out:
	spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(__dev_remove_pack);

/**
 *	dev_remove_pack - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *	This call sleeps to guarantee that no CPU is looking at the packet
 *	type after return.
 */
void dev_remove_pack(struct packet_type *pt)
{
	__dev_remove_pack(pt);

	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_pack);
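
/* Illustrative sketch (not part of this file): a protocol module would
 * typically embed its receive hook in a statically allocated packet_type and
 * hand it to dev_add_pack()/dev_remove_pack().  The names my_proto_rcv and
 * my_packet_type, and the use of the local-experimental ETH_P_802_EX1
 * ethertype, are placeholders for this example only.
 *
 *	static int my_proto_rcv(struct sk_buff *skb, struct net_device *dev,
 *				struct packet_type *pt,
 *				struct net_device *orig_dev)
 *	{
 *		// consume or drop the skb here
 *		kfree_skb(skb);
 *		return NET_RX_SUCCESS;
 *	}
 *
 *	static struct packet_type my_packet_type __read_mostly = {
 *		.type = cpu_to_be16(ETH_P_802_EX1),
 *		.func = my_proto_rcv,
 *	};
 *
 *	dev_add_pack(&my_packet_type);		// module init
 *	dev_remove_pack(&my_packet_type);	// module exit; may sleep
 */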

/**
 *	dev_add_offload - register offload handlers
 *	@po: protocol offload declaration
 *
 *	Add protocol offload handlers to the networking stack. The passed
 *	&packet_offload is linked into kernel lists and may not be freed until
 *	it has been removed from the kernel lists.
 *
 *	This call does not sleep, therefore it cannot guarantee that all CPUs
 *	that are in the middle of receiving packets will see the new offload
 *	handlers (until the next received packet).
 */
void dev_add_offload(struct packet_offload *po)
{
	struct packet_offload *elem;

	spin_lock(&offload_lock);
	list_for_each_entry(elem, &offload_base, list) {
		if (po->priority < elem->priority)
			break;
	}
	list_add_rcu(&po->list, elem->list.prev);
	spin_unlock(&offload_lock);
}
EXPORT_SYMBOL(dev_add_offload);

/**
 *	__dev_remove_offload - remove offload handler
 *	@po: packet offload declaration
 *
 *	Remove a protocol offload handler that was previously added to the
 *	kernel offload handlers by dev_add_offload(). The passed
 *	&packet_offload is removed from the kernel lists and can be freed or
 *	reused once this function returns.
 *
 *	The packet type might still be in use by receivers
 *	and must not be freed until after all the CPUs have gone
 *	through a quiescent state.
 */
static void __dev_remove_offload(struct packet_offload *po)
{
	struct list_head *head = &offload_base;
	struct packet_offload *po1;

	spin_lock(&offload_lock);

	list_for_each_entry(po1, head, list) {
		if (po == po1) {
			list_del_rcu(&po->list);
			goto out;
		}
	}

	pr_warn("dev_remove_offload: %p not found\n", po);
out:
	spin_unlock(&offload_lock);
}

/**
 *	dev_remove_offload - remove packet offload handler
 *	@po: packet offload declaration
 *
 *	Remove a packet offload handler that was previously added to the kernel
 *	offload handlers by dev_add_offload(). The passed &packet_offload is
 *	removed from the kernel lists and can be freed or reused once this
 *	function returns.
 *
 *	This call sleeps to guarantee that no CPU is looking at the packet
 *	type after return.
 */
void dev_remove_offload(struct packet_offload *po)
{
	__dev_remove_offload(po);

	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_offload);
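
/* Illustrative sketch (not part of this file): GRO/GSO providers register a
 * packet_offload keyed by ethertype; dev_add_offload() above keeps the list
 * ordered by .priority (lower values are walked first).  The callback names
 * below are placeholders; real users are e.g. the IPv4/IPv6 offload code.
 *
 *	static struct packet_offload my_offload __read_mostly = {
 *		.type = cpu_to_be16(ETH_P_802_EX1),
 *		.priority = 10,
 *		.callbacks = {
 *			.gro_receive	= my_gro_receive,
 *			.gro_complete	= my_gro_complete,
 *		},
 *	};
 *
 *	dev_add_offload(&my_offload);
 *	dev_remove_offload(&my_offload);	// may sleep (synchronize_net)
 */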

/******************************************************************************

		      Device Boot-time Settings Routines

*******************************************************************************/

/* Boot time configuration table */
static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];

/**
 *	netdev_boot_setup_add	- add new setup entry
 *	@name: name of the device
 *	@map: configured settings for the device
 *
 *	Adds new setup entry to the dev_boot_setup list.  The function
 *	returns 0 on error and 1 on success.  This is a generic routine for
 *	all netdevices.
 */
static int netdev_boot_setup_add(char *name, struct ifmap *map)
{
	struct netdev_boot_setup *s;
	int i;

	s = dev_boot_setup;
	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
			memset(s[i].name, 0, sizeof(s[i].name));
			strlcpy(s[i].name, name, IFNAMSIZ);
			memcpy(&s[i].map, map, sizeof(s[i].map));
			break;
		}
	}

	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
}

/**
 *	netdev_boot_setup_check	- check boot time settings
 *	@dev: the netdevice
 *
 *	Check boot time settings for the device.
 *	The found settings are set for the device to be used
 *	later in the device probing.
 *	Returns 0 if no settings found, 1 if they are.
 */
int netdev_boot_setup_check(struct net_device *dev)
{
	struct netdev_boot_setup *s = dev_boot_setup;
	int i;

	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
		    !strcmp(dev->name, s[i].name)) {
			dev->irq = s[i].map.irq;
			dev->base_addr = s[i].map.base_addr;
			dev->mem_start = s[i].map.mem_start;
			dev->mem_end = s[i].map.mem_end;
			return 1;
		}
	}
	return 0;
}
EXPORT_SYMBOL(netdev_boot_setup_check);

/**
 *	netdev_boot_base	- get address from boot time settings
 *	@prefix: prefix for network device
 *	@unit: id for network device
 *
 *	Check boot time settings for the base address of device.
 *	The found settings are set for the device to be used
 *	later in the device probing.
 *	Returns 0 if no settings found.
 */
unsigned long netdev_boot_base(const char *prefix, int unit)
{
	const struct netdev_boot_setup *s = dev_boot_setup;
	char name[IFNAMSIZ];
	int i;

	sprintf(name, "%s%d", prefix, unit);

	/*
	 * If device already registered then return base of 1
	 * to indicate not to probe for this interface
	 */
	if (__dev_get_by_name(&init_net, name))
		return 1;

	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
		if (!strcmp(name, s[i].name))
			return s[i].map.base_addr;
	return 0;
}

/*
 * Saves at boot time configured settings for any netdevice.
 */
int __init netdev_boot_setup(char *str)
{
	int ints[5];
	struct ifmap map;

	str = get_options(str, ARRAY_SIZE(ints), ints);
	if (!str || !*str)
		return 0;

	/* Save settings */
	memset(&map, 0, sizeof(map));
	if (ints[0] > 0)
		map.irq = ints[1];
	if (ints[0] > 1)
		map.base_addr = ints[2];
	if (ints[0] > 2)
		map.mem_start = ints[3];
	if (ints[0] > 3)
		map.mem_end = ints[4];

	/* Add new entry to the list */
	return netdev_boot_setup_add(str, &map);
}

__setup("netdev=", netdev_boot_setup);
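
/* Illustrative sketch (not part of this file): the "netdev=" parameter parsed
 * above takes up to four integers (irq, base_addr, mem_start, mem_end)
 * followed by the interface name, which netdev_boot_setup_check() later
 * matches against the probing driver.  A kernel command line might therefore
 * carry something like:
 *
 *	netdev=9,0x300,0,0,eth0
 *
 * The specific values are only an example; unused fields can be left as 0.
 */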

/*******************************************************************************

			Device Interface Subroutines

*******************************************************************************/

/**
 *	dev_get_iflink	- get 'iflink' value of an interface
 *	@dev: targeted interface
 *
 *	Indicates the ifindex the interface is linked to.
 *	Physical interfaces have the same 'ifindex' and 'iflink' values.
 */

int dev_get_iflink(const struct net_device *dev)
{
	if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
		return dev->netdev_ops->ndo_get_iflink(dev);

	return dev->ifindex;
}
EXPORT_SYMBOL(dev_get_iflink);

/**
 *	dev_fill_metadata_dst - Retrieve tunnel egress information.
 *	@dev: targeted interface
 *	@skb: The packet.
 *
 *	For better visibility of tunnel traffic OVS needs to retrieve
 *	egress tunnel information for a packet. The following API allows
 *	a user to get this info.
 */
int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
{
	struct ip_tunnel_info *info;

	if (!dev->netdev_ops || !dev->netdev_ops->ndo_fill_metadata_dst)
		return -EINVAL;

	info = skb_tunnel_info_unclone(skb);
	if (!info)
		return -ENOMEM;
	if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
		return -EINVAL;

	return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
}
EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);

/**
 *	__dev_get_by_name	- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name. Must be called under RTNL semaphore
 *	or @dev_base_lock. If the name is found a pointer to the device
 *	is returned. If the name is not found then %NULL is returned. The
 *	reference counters are not incremented so the caller must be
 *	careful with locks.
 */

struct net_device *__dev_get_by_name(struct net *net, const char *name)
{
	struct net_device *dev;
	struct hlist_head *head = dev_name_hash(net, name);

	hlist_for_each_entry(dev, head, name_hlist)
		if (!strncmp(dev->name, name, IFNAMSIZ))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_get_by_name);

/**
 *	dev_get_by_name_rcu	- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name.
 *	If the name is found a pointer to the device is returned.
 *	If the name is not found then %NULL is returned.
 *	The reference counters are not incremented so the caller must be
 *	careful with locks. The caller must hold the RCU lock.
 */

struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
{
	struct net_device *dev;
	struct hlist_head *head = dev_name_hash(net, name);

	hlist_for_each_entry_rcu(dev, head, name_hlist)
		if (!strncmp(dev->name, name, IFNAMSIZ))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_get_by_name_rcu);

/**
 *	dev_get_by_name		- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name. This can be called from any
 *	context and does its own locking. The returned handle has
 *	the usage count incremented and the caller must use dev_put() to
 *	release it when it is no longer needed. %NULL is returned if no
 *	matching device is found.
 */

struct net_device *dev_get_by_name(struct net *net, const char *name)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_name_rcu(net, name);
	if (dev)
		dev_hold(dev);
	rcu_read_unlock();
	return dev;
}
EXPORT_SYMBOL(dev_get_by_name);
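
/* Illustrative sketch (not part of this file): a caller that cannot keep RCU
 * or the RTNL held across its work takes a real reference and drops it with
 * dev_put() when done.  The namespace and error handling are placeholders.
 *
 *	struct net_device *dev = dev_get_by_name(&init_net, "eth0");
 *
 *	if (!dev)
 *		return -ENODEV;
 *	// ... use dev ...
 *	dev_put(dev);
 */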

/**
 *	__dev_get_by_index - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns %NULL if the device
 *	is not found or a pointer to the device. The device has not
 *	had its reference counter increased so the caller must be careful
 *	about locking. The caller must hold either the RTNL semaphore
 *	or @dev_base_lock.
 */

struct net_device *__dev_get_by_index(struct net *net, int ifindex)
{
	struct net_device *dev;
	struct hlist_head *head = dev_index_hash(net, ifindex);

	hlist_for_each_entry(dev, head, index_hlist)
		if (dev->ifindex == ifindex)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_get_by_index);

/**
 *	dev_get_by_index_rcu - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns %NULL if the device
 *	is not found or a pointer to the device. The device has not
 *	had its reference counter increased so the caller must be careful
 *	about locking. The caller must hold the RCU lock.
 */

struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
{
	struct net_device *dev;
	struct hlist_head *head = dev_index_hash(net, ifindex);

	hlist_for_each_entry_rcu(dev, head, index_hlist)
		if (dev->ifindex == ifindex)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_get_by_index_rcu);

/**
 *	dev_get_by_index - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns NULL if the device
 *	is not found or a pointer to the device. The device returned has
 *	had a reference added and the pointer is safe until the user calls
 *	dev_put to indicate they have finished with it.
 */

struct net_device *dev_get_by_index(struct net *net, int ifindex)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_index_rcu(net, ifindex);
	if (dev)
		dev_hold(dev);
	rcu_read_unlock();
	return dev;
}
EXPORT_SYMBOL(dev_get_by_index);

/**
 *	netdev_get_name - get a netdevice name, knowing its ifindex.
 *	@net: network namespace
 *	@name: a pointer to the buffer where the name will be stored.
 *	@ifindex: the ifindex of the interface to get the name from.
 */
int netdev_get_name(struct net *net, char *name, int ifindex)
{
	struct net_device *dev;
	int ret;

	down_read(&devnet_rename_sem);
	rcu_read_lock();

	dev = dev_get_by_index_rcu(net, ifindex);
	if (!dev) {
		ret = -ENODEV;
		goto out;
	}

	strcpy(name, dev->name);

	ret = 0;
out:
	rcu_read_unlock();
	up_read(&devnet_rename_sem);
	return ret;
}

/**
 *	dev_getbyhwaddr_rcu - find a device by its hardware address
 *	@net: the applicable net namespace
 *	@type: media type of device
 *	@ha: hardware address
 *
 *	Search for an interface by MAC address. Returns NULL if the device
 *	is not found or a pointer to the device.
 *	The caller must hold RCU or RTNL.
 *	The returned device has not had its ref count increased
 *	and the caller must therefore be careful about locking
 *
 */

struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
				       const char *ha)
{
	struct net_device *dev;

	for_each_netdev_rcu(net, dev)
		if (dev->type == type &&
		    !memcmp(dev->dev_addr, ha, dev->addr_len))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
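
/* Illustrative sketch (not part of this file): since dev_getbyhwaddr_rcu()
 * takes no reference, callers bracket the lookup and any use of the result
 * with rcu_read_lock()/rcu_read_unlock() (or hold the RTNL instead); "mac" is
 * a placeholder buffer of addr_len bytes.
 *
 *	rcu_read_lock();
 *	dev = dev_getbyhwaddr_rcu(net, ARPHRD_ETHER, mac);
 *	if (dev)
 *		dev_hold(dev);	// only if dev is needed past the unlock
 *	rcu_read_unlock();
 */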

struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
	struct net_device *dev;

	ASSERT_RTNL();
	for_each_netdev(net, dev)
		if (dev->type == type)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_getfirstbyhwtype);

struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
	struct net_device *dev, *ret = NULL;

	rcu_read_lock();
	for_each_netdev_rcu(net, dev)
		if (dev->type == type) {
			dev_hold(dev);
			ret = dev;
			break;
		}
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL(dev_getfirstbyhwtype);

/**
 *	__dev_get_by_flags - find any device with given flags
 *	@net: the applicable net namespace
 *	@if_flags: IFF_* values
 *	@mask: bitmask of bits in if_flags to check
 *
 *	Search for any interface with the given flags. Returns NULL if a device
 *	is not found or a pointer to the device. Must be called inside
 *	rtnl_lock(), and result refcount is unchanged.
 */

struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
				      unsigned short mask)
{
	struct net_device *dev, *ret;

	ASSERT_RTNL();

	ret = NULL;
	for_each_netdev(net, dev) {
		if (((dev->flags ^ if_flags) & mask) == 0) {
			ret = dev;
			break;
		}
	}
	return ret;
}
EXPORT_SYMBOL(__dev_get_by_flags);

/**
 *	dev_valid_name - check if name is okay for network device
 *	@name: name string
 *
 *	Network device names need to be valid file names to
 *	allow sysfs to work.  We also disallow any kind of
 *	whitespace.
 */
bool dev_valid_name(const char *name)
{
	if (*name == '\0')
		return false;
	if (strnlen(name, IFNAMSIZ) == IFNAMSIZ)
		return false;
	if (!strcmp(name, ".") || !strcmp(name, ".."))
		return false;

	while (*name) {
		if (*name == '/' || *name == ':' || isspace(*name))
			return false;
		name++;
	}
	return true;
}
EXPORT_SYMBOL(dev_valid_name);

/**
 *	__dev_alloc_name - allocate a name for a device
 *	@net: network namespace to allocate the device name in
 *	@name: name format string
 *	@buf:  scratch buffer and result name string
 *
 *	Passed a format string - eg "lt%d" it will try and find a suitable
 *	id. It scans list of devices to build up a free map, then chooses
 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 *	while allocating the name and adding the device in order to avoid
 *	duplicates.
 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *	Returns the number of the unit assigned or a negative errno code.
 */

static int __dev_alloc_name(struct net *net, const char *name, char *buf)
{
	int i = 0;
	const char *p;
	const int max_netdevices = 8*PAGE_SIZE;
	unsigned long *inuse;
	struct net_device *d;

	p = strnchr(name, IFNAMSIZ-1, '%');
	if (p) {
		/*
		 * Verify the string as this thing may have come from
		 * the user.  There must be either one "%d" and no other "%"
		 * characters.
		 */
		if (p[1] != 'd' || strchr(p + 2, '%'))
			return -EINVAL;

		/* Use one page as a bit array of possible slots */
		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
		if (!inuse)
			return -ENOMEM;

		for_each_netdev(net, d) {
			if (!sscanf(d->name, name, &i))
				continue;
			if (i < 0 || i >= max_netdevices)
				continue;

			/* avoid cases where sscanf is not exact inverse of printf */
			snprintf(buf, IFNAMSIZ, name, i);
			if (!strncmp(buf, d->name, IFNAMSIZ))
				set_bit(i, inuse);
		}

		i = find_first_zero_bit(inuse, max_netdevices);
		free_page((unsigned long) inuse);
	}

	if (buf != name)
		snprintf(buf, IFNAMSIZ, name, i);
	if (!__dev_get_by_name(net, buf))
		return i;

	/* It is possible to run out of possible slots
	 * when the name is long and there isn't enough space left
	 * for the digits, or if all bits are used.
	 */
	return -ENFILE;
}

/**
 *	dev_alloc_name - allocate a name for a device
 *	@dev: device
 *	@name: name format string
 *
 *	Passed a format string - eg "lt%d" it will try and find a suitable
 *	id. It scans list of devices to build up a free map, then chooses
 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 *	while allocating the name and adding the device in order to avoid
 *	duplicates.
 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *	Returns the number of the unit assigned or a negative errno code.
 */

int dev_alloc_name(struct net_device *dev, const char *name)
{
	char buf[IFNAMSIZ];
	struct net *net;
	int ret;

	BUG_ON(!dev_net(dev));
	net = dev_net(dev);
	ret = __dev_alloc_name(net, name, buf);
	if (ret >= 0)
		strlcpy(dev->name, buf, IFNAMSIZ);
	return ret;
}
EXPORT_SYMBOL(dev_alloc_name);
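
/* Illustrative sketch (not part of this file): a driver that does not care
 * about the unit number lets the core pick the first free slot for a given
 * format, typically before register_netdev().  The "dummy%d" pattern is only
 * an example.
 *
 *	err = dev_alloc_name(dev, "dummy%d");
 *	if (err < 0)
 *		goto fail;
 *	// dev->name is now e.g. "dummy0"; err holds the assigned unit number
 */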

static int dev_alloc_name_ns(struct net *net,
			     struct net_device *dev,
			     const char *name)
{
	char buf[IFNAMSIZ];
	int ret;

	ret = __dev_alloc_name(net, name, buf);
	if (ret >= 0)
		strlcpy(dev->name, buf, IFNAMSIZ);
	return ret;
}

int dev_get_valid_name(struct net *net, struct net_device *dev,
		       const char *name)
{
	BUG_ON(!net);

	if (!dev_valid_name(name))
		return -EINVAL;

	if (strchr(name, '%'))
		return dev_alloc_name_ns(net, dev, name);
	else if (__dev_get_by_name(net, name))
		return -EEXIST;
	else if (dev->name != name)
		strlcpy(dev->name, name, IFNAMSIZ);

	return 0;
}
EXPORT_SYMBOL(dev_get_valid_name);

/**
 *	dev_change_name - change name of a device
 *	@dev: device
 *	@newname: name (or format string) must be at least IFNAMSIZ
 *
 *	Change the name of a device. A format string such as "eth%d"
 *	can be passed for wildcarding.
 */
int dev_change_name(struct net_device *dev, const char *newname)
{
	unsigned char old_assign_type;
	char oldname[IFNAMSIZ];
	int err = 0;
	int ret;
	struct net *net;

	ASSERT_RTNL();
	BUG_ON(!dev_net(dev));

	net = dev_net(dev);
	if (dev->flags & IFF_UP)
		return -EBUSY;

	down_write(&devnet_rename_sem);

	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
		up_write(&devnet_rename_sem);
		return 0;
	}

	memcpy(oldname, dev->name, IFNAMSIZ);

	err = dev_get_valid_name(net, dev, newname);
	if (err < 0) {
		up_write(&devnet_rename_sem);
		return err;
	}

	if (oldname[0] && !strchr(oldname, '%'))
		netdev_info(dev, "renamed from %s\n", oldname);

	old_assign_type = dev->name_assign_type;
	dev->name_assign_type = NET_NAME_RENAMED;

rollback:
	ret = device_rename(&dev->dev, dev->name);
	if (ret) {
		memcpy(dev->name, oldname, IFNAMSIZ);
		dev->name_assign_type = old_assign_type;
		up_write(&devnet_rename_sem);
		return ret;
	}

	up_write(&devnet_rename_sem);

	netdev_adjacent_rename_links(dev, oldname);

	write_lock_bh(&dev_base_lock);
	hlist_del_rcu(&dev->name_hlist);
	write_unlock_bh(&dev_base_lock);

	synchronize_rcu();

	write_lock_bh(&dev_base_lock);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	write_unlock_bh(&dev_base_lock);

	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
	ret = notifier_to_errno(ret);

	if (ret) {
		/* err >= 0 after dev_alloc_name() or stores the first errno */
		if (err >= 0) {
			err = ret;
			down_write(&devnet_rename_sem);
			memcpy(dev->name, oldname, IFNAMSIZ);
			memcpy(oldname, newname, IFNAMSIZ);
			dev->name_assign_type = old_assign_type;
			old_assign_type = NET_NAME_RENAMED;
			goto rollback;
		} else {
			pr_err("%s: name change rollback failed: %d\n",
			       dev->name, ret);
		}
	}

	return err;
}
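
/* Illustrative sketch (not part of this file): dev_change_name() asserts the
 * RTNL, so a caller outside the rtnetlink path takes the lock itself.  The
 * new name may again be a wildcard pattern such as "net%d".
 *
 *	rtnl_lock();
 *	err = dev_change_name(dev, "net%d");
 *	rtnl_unlock();
 */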

/**
 *	dev_set_alias - change ifalias of a device
 *	@dev: device
 *	@alias: name up to IFALIASZ
 *	@len: limit of bytes to copy from info
 *
 *	Set ifalias for a device.
 */
int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
{
	char *new_ifalias;

	ASSERT_RTNL();

	if (len >= IFALIASZ)
		return -EINVAL;

	if (!len) {
		kfree(dev->ifalias);
		dev->ifalias = NULL;
		return 0;
	}

	new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
	if (!new_ifalias)
		return -ENOMEM;
	dev->ifalias = new_ifalias;
	memcpy(dev->ifalias, alias, len);
	dev->ifalias[len] = 0;

	return len;
}

/**
 *	netdev_features_change - device changes features
 *	@dev: device to cause notification
 *
 *	Called to indicate a device has changed features.
 */
void netdev_features_change(struct net_device *dev)
{
	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
}
EXPORT_SYMBOL(netdev_features_change);

/**
 *	netdev_state_change - device changes state
 *	@dev: device to cause notification
 *
 *	Called to indicate a device has changed state. This function calls
 *	the notifier chains for netdev_chain and sends a NEWLINK message
 *	to the routing socket.
 */
void netdev_state_change(struct net_device *dev)
{
	if (dev->flags & IFF_UP) {
		struct netdev_notifier_change_info change_info;

		change_info.flags_changed = 0;
		call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
					      &change_info.info);
		rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
	}
}
EXPORT_SYMBOL(netdev_state_change);

/**
 *	netdev_notify_peers - notify network peers about existence of @dev
 *	@dev: network device
 *
 *	Generate traffic such that interested network peers are aware of
 *	@dev, such as by generating a gratuitous ARP. This may be used when
 *	a device wants to inform the rest of the network about some sort of
 *	reconfiguration such as a failover event or virtual machine
 *	migration.
 */
void netdev_notify_peers(struct net_device *dev)
{
	rtnl_lock();
	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
	call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(netdev_notify_peers);

static int __dev_open(struct net_device *dev)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	int ret;

	ASSERT_RTNL();

	if (!netif_device_present(dev))
		return -ENODEV;

	/* Block netpoll from trying to do any rx path servicing.
	 * If we don't do this there is a chance ndo_poll_controller
	 * or ndo_poll may be running while we open the device
	 */
	netpoll_poll_disable(dev);

	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
	ret = notifier_to_errno(ret);
	if (ret)
		return ret;

	set_bit(__LINK_STATE_START, &dev->state);

	if (ops->ndo_validate_addr)
		ret = ops->ndo_validate_addr(dev);

	if (!ret && ops->ndo_open)
		ret = ops->ndo_open(dev);

	netpoll_poll_enable(dev);

	if (ret)
		clear_bit(__LINK_STATE_START, &dev->state);
	else {
		dev->flags |= IFF_UP;
		dev_set_rx_mode(dev);
		dev_activate(dev);
		add_device_randomness(dev->dev_addr, dev->addr_len);
	}

	return ret;
}

/**
 *	dev_open	- prepare an interface for use.
 *	@dev:	device to open
 *
 *	Takes a device from down to up state. The device's private open
 *	function is invoked and then the multicast lists are loaded. Finally
 *	the device is moved into the up state and a %NETDEV_UP message is
 *	sent to the netdev notifier chain.
 *
 *	Calling this function on an active interface is a nop. On a failure
 *	a negative errno code is returned.
 */
int dev_open(struct net_device *dev)
{
	int ret;

	if (dev->flags & IFF_UP)
		return 0;

	ret = __dev_open(dev);
	if (ret < 0)
		return ret;

	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
	call_netdevice_notifiers(NETDEV_UP, dev);

	return ret;
}
EXPORT_SYMBOL(dev_open);

static int __dev_close_many(struct list_head *head)
{
	struct net_device *dev;

	ASSERT_RTNL();
	might_sleep();

	list_for_each_entry(dev, head, close_list) {
		/* Temporarily disable netpoll until the interface is down */
		netpoll_poll_disable(dev);

		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);

		clear_bit(__LINK_STATE_START, &dev->state);

		/* Synchronize to scheduled poll. We cannot touch poll list, it
		 * can be even on different cpu. So just clear netif_running().
		 *
		 * dev->stop() will invoke napi_disable() on all of its
		 * napi_struct instances on this device.
		 */
		smp_mb__after_atomic(); /* Commit netif_running(). */
	}

	dev_deactivate_many(head);

	list_for_each_entry(dev, head, close_list) {
		const struct net_device_ops *ops = dev->netdev_ops;

		/*
		 *	Call the device specific close. This cannot fail.
		 *	Only if device is UP
		 *
		 *	We allow it to be called even after a DETACH hot-plug
		 *	event.
		 */
		if (ops->ndo_stop)
			ops->ndo_stop(dev);

		dev->flags &= ~IFF_UP;
		netpoll_poll_enable(dev);
	}

	return 0;
}

static int __dev_close(struct net_device *dev)
{
	int retval;
	LIST_HEAD(single);

	list_add(&dev->close_list, &single);
	retval = __dev_close_many(&single);
	list_del(&single);

	return retval;
}

int dev_close_many(struct list_head *head, bool unlink)
{
	struct net_device *dev, *tmp;

	/* Remove the devices that don't need to be closed */
	list_for_each_entry_safe(dev, tmp, head, close_list)
		if (!(dev->flags & IFF_UP))
			list_del_init(&dev->close_list);

	__dev_close_many(head);

	list_for_each_entry_safe(dev, tmp, head, close_list) {
		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
		call_netdevice_notifiers(NETDEV_DOWN, dev);
		if (unlink)
			list_del_init(&dev->close_list);
	}

	return 0;
}
EXPORT_SYMBOL(dev_close_many);

/**
 *	dev_close - shutdown an interface.
 *	@dev: device to shutdown
 *
 *	This function moves an active device into down state. A
 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
 *	chain.
 */
int dev_close(struct net_device *dev)
{
	if (dev->flags & IFF_UP) {
		LIST_HEAD(single);

		list_add(&dev->close_list, &single);
		dev_close_many(&single, true);
		list_del(&single);
	}
	return 0;
}
EXPORT_SYMBOL(dev_close);
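
/* Illustrative sketch (not part of this file): like most configuration entry
 * points here, dev_open() and dev_close() expect the RTNL to be held (see the
 * ASSERT_RTNL() in __dev_open() and __dev_close_many()).
 *
 *	rtnl_lock();
 *	err = dev_open(dev);	// NETDEV_UP is sent on success
 *	...
 *	dev_close(dev);		// NETDEV_GOING_DOWN, then NETDEV_DOWN
 *	rtnl_unlock();
 */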

/**
 *	dev_disable_lro - disable Large Receive Offload on a device
 *	@dev: device
 *
 *	Disable Large Receive Offload (LRO) on a net device.  Must be
 *	called under RTNL.  This is needed if received packets may be
 *	forwarded to another interface.
 */
void dev_disable_lro(struct net_device *dev)
{
	struct net_device *lower_dev;
	struct list_head *iter;

	dev->wanted_features &= ~NETIF_F_LRO;
	netdev_update_features(dev);

	if (unlikely(dev->features & NETIF_F_LRO))
		netdev_WARN(dev, "failed to disable LRO!\n");

	netdev_for_each_lower_dev(dev, lower_dev, iter)
		dev_disable_lro(lower_dev);
}
EXPORT_SYMBOL(dev_disable_lro);

static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
				   struct net_device *dev)
{
	struct netdev_notifier_info info;

	netdev_notifier_info_init(&info, dev);
	return nb->notifier_call(nb, val, &info);
}

static int dev_boot_phase = 1;

/**
 *	register_netdevice_notifier - register a network notifier block
 *	@nb: notifier
 *
 *	Register a notifier to be called when network device events occur.
 *	The notifier passed is linked into the kernel structures and must
 *	not be reused until it has been unregistered. A negative errno code
 *	is returned on a failure.
 *
 *	When registered, all registration and up events are replayed
 *	to the new notifier to allow it to have a race-free view
 *	of the network device list.
 */

int register_netdevice_notifier(struct notifier_block *nb)
{
	struct net_device *dev;
	struct net_device *last;
	struct net *net;
	int err;

	rtnl_lock();
	err = raw_notifier_chain_register(&netdev_chain, nb);
	if (err)
		goto unlock;
	if (dev_boot_phase)
		goto unlock;
	for_each_net(net) {
		for_each_netdev(net, dev) {
			err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
			err = notifier_to_errno(err);
			if (err)
				goto rollback;

			if (!(dev->flags & IFF_UP))
				continue;

			call_netdevice_notifier(nb, NETDEV_UP, dev);
		}
	}

unlock:
	rtnl_unlock();
	return err;

rollback:
	last = dev;
	for_each_net(net) {
		for_each_netdev(net, dev) {
			if (dev == last)
				goto outroll;

			if (dev->flags & IFF_UP) {
				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
							dev);
				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
			}
			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
		}
	}

outroll:
	raw_notifier_chain_unregister(&netdev_chain, nb);
	goto unlock;
}
EXPORT_SYMBOL(register_netdevice_notifier);

/**
 *	unregister_netdevice_notifier - unregister a network notifier block
 *	@nb: notifier
 *
 *	Unregister a notifier previously registered by
 *	register_netdevice_notifier(). The notifier is unlinked from the
 *	kernel structures and may then be reused. A negative errno code
 *	is returned on a failure.
 *
 *	After unregistering, unregister and down device events are synthesized
 *	for all devices on the device list to the removed notifier to remove
 *	the need for special case cleanup code.
 */

int unregister_netdevice_notifier(struct notifier_block *nb)
{
	struct net_device *dev;
	struct net *net;
	int err;

	rtnl_lock();
	err = raw_notifier_chain_unregister(&netdev_chain, nb);
	if (err)
		goto unlock;

	for_each_net(net) {
		for_each_netdev(net, dev) {
			if (dev->flags & IFF_UP) {
				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
							dev);
				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
			}
			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
		}
	}
unlock:
	rtnl_unlock();
	return err;
}
EXPORT_SYMBOL(unregister_netdevice_notifier);
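
/* Illustrative sketch (not part of this file): a subsystem interested in
 * device lifetime events registers a notifier_block; the callback receives
 * the event code and extracts the device with netdev_notifier_info_to_dev().
 * Names prefixed with "my_" are placeholders for this example.
 *
 *	static int my_netdev_event(struct notifier_block *nb,
 *				   unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 *
 *		switch (event) {
 *		case NETDEV_UP:
 *		case NETDEV_DOWN:
 *			// react to dev coming up / going down
 *			break;
 *		}
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block my_netdev_nb = {
 *		.notifier_call = my_netdev_event,
 *	};
 *
 *	register_netdevice_notifier(&my_netdev_nb);
 *	...
 *	unregister_netdevice_notifier(&my_netdev_nb);
 */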

/**
 *	call_netdevice_notifiers_info - call all network notifier blocks
 *	@val: value passed unmodified to notifier function
 *	@dev: net_device pointer passed unmodified to notifier function
 *	@info: notifier information data
 *
 *	Call all network notifier blocks.  Parameters and return value
 *	are as for raw_notifier_call_chain().
 */

static int call_netdevice_notifiers_info(unsigned long val,
					 struct net_device *dev,
					 struct netdev_notifier_info *info)
{
	ASSERT_RTNL();
	netdev_notifier_info_init(info, dev);
	return raw_notifier_call_chain(&netdev_chain, val, info);
}

/**
 *	call_netdevice_notifiers - call all network notifier blocks
 *	@val: value passed unmodified to notifier function
 *	@dev: net_device pointer passed unmodified to notifier function
 *
 *	Call all network notifier blocks.  Parameters and return value
 *	are as for raw_notifier_call_chain().
 */

int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
{
	struct netdev_notifier_info info;

	return call_netdevice_notifiers_info(val, dev, &info);
}
EXPORT_SYMBOL(call_netdevice_notifiers);

/**
 *	call_netdevice_notifiers_mtu - call all network notifier blocks
 *	@val: value passed unmodified to notifier function
 *	@dev: net_device pointer passed unmodified to notifier function
 *	@arg: additional u32 argument passed to the notifier function
 *
 *	Call all network notifier blocks.  Parameters and return value
 *	are as for raw_notifier_call_chain().
 */
static int call_netdevice_notifiers_mtu(unsigned long val,
					struct net_device *dev, u32 arg)
{
	struct netdev_notifier_info_ext info = {
		.info.dev = dev,
		.ext.mtu = arg,
	};

	BUILD_BUG_ON(offsetof(struct netdev_notifier_info_ext, info) != 0);

	return call_netdevice_notifiers_info(val, dev, &info.info);
}

#ifdef CONFIG_NET_INGRESS
static struct static_key ingress_needed __read_mostly;

void net_inc_ingress_queue(void)
{
	static_key_slow_inc(&ingress_needed);
}
EXPORT_SYMBOL_GPL(net_inc_ingress_queue);

void net_dec_ingress_queue(void)
{
	static_key_slow_dec(&ingress_needed);
}
EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
#endif

#ifdef CONFIG_NET_EGRESS
static struct static_key egress_needed __read_mostly;

void net_inc_egress_queue(void)
{
	static_key_slow_inc(&egress_needed);
}
EXPORT_SYMBOL_GPL(net_inc_egress_queue);

void net_dec_egress_queue(void)
{
	static_key_slow_dec(&egress_needed);
}
EXPORT_SYMBOL_GPL(net_dec_egress_queue);
#endif

static struct static_key netstamp_needed __read_mostly;
#ifdef HAVE_JUMP_LABEL
static atomic_t netstamp_needed_deferred;
static atomic_t netstamp_wanted;
static void netstamp_clear(struct work_struct *work)
{
	int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
	int wanted;

	wanted = atomic_add_return(deferred, &netstamp_wanted);
	if (wanted > 0)
		static_key_enable(&netstamp_needed);
	else
		static_key_disable(&netstamp_needed);
}
static DECLARE_WORK(netstamp_work, netstamp_clear);
#endif

void net_enable_timestamp(void)
{
#ifdef HAVE_JUMP_LABEL
	int wanted;

	while (1) {
		wanted = atomic_read(&netstamp_wanted);
		if (wanted <= 0)
			break;
		if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted + 1) == wanted)
			return;
	}
	atomic_inc(&netstamp_needed_deferred);
	schedule_work(&netstamp_work);
#else
	static_key_slow_inc(&netstamp_needed);
#endif
}
EXPORT_SYMBOL(net_enable_timestamp);

void net_disable_timestamp(void)
{
#ifdef HAVE_JUMP_LABEL
	int wanted;

	while (1) {
		wanted = atomic_read(&netstamp_wanted);
		if (wanted <= 1)
			break;
		if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted - 1) == wanted)
			return;
	}
	atomic_dec(&netstamp_needed_deferred);
	schedule_work(&netstamp_work);
#else
	static_key_slow_dec(&netstamp_needed);
#endif
}
EXPORT_SYMBOL(net_disable_timestamp);
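
/* Illustrative sketch (not part of this file): users of packet RX timestamps
 * (for instance the socket timestamping code) keep the static key above
 * balanced by pairing the calls from their setup and teardown paths:
 *
 *	net_enable_timestamp();		// skbs start being stamped
 *	...
 *	net_disable_timestamp();	// drop our vote again
 */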

static inline void net_timestamp_set(struct sk_buff *skb)
{
	skb->tstamp.tv64 = 0;
	if (static_key_false(&netstamp_needed))
		__net_timestamp(skb);
}

#define net_timestamp_check(COND, SKB)				\
	if (static_key_false(&netstamp_needed)) {		\
		if ((COND) && !(SKB)->tstamp.tv64)		\
			__net_timestamp(SKB);			\
	}							\

bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
{
	unsigned int len;

	if (!(dev->flags & IFF_UP))
		return false;

	len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
	if (skb->len <= len)
		return true;

	/* if TSO is enabled, we don't care about the length as the packet
	 * could be forwarded without being segmented before
	 */
	if (skb_is_gso(skb))
		return true;

	return false;
}
EXPORT_SYMBOL_GPL(is_skb_forwardable);

int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
	int ret = ____dev_forward_skb(dev, skb);

	if (likely(!ret)) {
		skb->protocol = eth_type_trans(skb, dev);
		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
	}

	return ret;
}
EXPORT_SYMBOL_GPL(__dev_forward_skb);

/**
 * dev_forward_skb - loopback an skb to another netif
 *
 * @dev: destination network device
 * @skb: buffer to forward
 *
 * return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP	(packet was dropped, but freed)
 *
 * dev_forward_skb can be used for injecting an skb from the
 * start_xmit function of one device into the receive queue
 * of another device.
 *
 * The receiving device may be in another namespace, so
 * we have to clear all information in the skb that could
 * impact namespace isolation.
 */
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
	return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
}
EXPORT_SYMBOL_GPL(dev_forward_skb);
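
/* Illustrative sketch (not part of this file): a paired virtual device (in
 * the spirit of veth) can hand a packet from its ndo_start_xmit() straight to
 * the peer's receive path.  my_xmit, my_get_peer and the stats handling are
 * placeholders.
 *
 *	static netdev_tx_t my_xmit(struct sk_buff *skb, struct net_device *dev)
 *	{
 *		struct net_device *peer = my_get_peer(dev);	// hypothetical helper
 *
 *		if (dev_forward_skb(peer, skb) != NET_RX_SUCCESS)
 *			dev->stats.tx_dropped++;
 *		return NETDEV_TX_OK;
 *	}
 */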

static inline int deliver_skb(struct sk_buff *skb,
			      struct packet_type *pt_prev,
			      struct net_device *orig_dev)
{
	if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
		return -ENOMEM;
	atomic_inc(&skb->users);
	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
}

static inline void deliver_ptype_list_skb(struct sk_buff *skb,
					  struct packet_type **pt,
					  struct net_device *orig_dev,
					  __be16 type,
					  struct list_head *ptype_list)
{
	struct packet_type *ptype, *pt_prev = *pt;

	list_for_each_entry_rcu(ptype, ptype_list, list) {
		if (ptype->type != type)
			continue;
		if (pt_prev)
			deliver_skb(skb, pt_prev, orig_dev);
		pt_prev = ptype;
	}
	*pt = pt_prev;
}

static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
{
	if (!ptype->af_packet_priv || !skb->sk)
		return false;

	if (ptype->id_match)
		return ptype->id_match(ptype, skb->sk);
	else if ((struct sock *)ptype->af_packet_priv == skb->sk)
		return true;

	return false;
}

/*
 *	Support routine. Sends outgoing frames to any network
 *	taps currently in use.
 */

void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
{
	struct packet_type *ptype;
	struct sk_buff *skb2 = NULL;
	struct packet_type *pt_prev = NULL;
	struct list_head *ptype_list = &ptype_all;

	rcu_read_lock();
again:
	list_for_each_entry_rcu(ptype, ptype_list, list) {
		/* Never send packets back to the socket
		 * they originated from - MvS (miquels@drinkel.ow.org)
		 */
		if (skb_loop_sk(ptype, skb))
			continue;

		if (pt_prev) {
			deliver_skb(skb2, pt_prev, skb->dev);
			pt_prev = ptype;
			continue;
		}

		/* need to clone skb, done only once */
		skb2 = skb_clone(skb, GFP_ATOMIC);
		if (!skb2)
			goto out_unlock;

		net_timestamp_set(skb2);

		/* skb->nh should be correctly
		 * set by sender, so that the second statement is
		 * just protection against buggy protocols.
		 */
		skb_reset_mac_header(skb2);

		if (skb_network_header(skb2) < skb2->data ||
		    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
			net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
					     ntohs(skb2->protocol),
					     dev->name);
			skb_reset_network_header(skb2);
		}

		skb2->transport_header = skb2->network_header;
		skb2->pkt_type = PACKET_OUTGOING;
		pt_prev = ptype;
	}

	if (ptype_list == &ptype_all) {
		ptype_list = &dev->ptype_all;
		goto again;
	}
out_unlock:
	if (pt_prev)
		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
	rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);

/**
 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
 * @dev: Network device
 * @txq: number of queues available
 *
 * If real_num_tx_queues is changed the tc mappings may no longer be
 * valid. To resolve this, verify that the tc mapping remains valid and
 * if not, NULL the mapping. With no priorities mapping to this
 * offset/count pair it will no longer be used. In the worst case, when
 * TC0 is invalid, nothing can be done, so priority mappings are
 * disabled. It is expected that drivers will fix this mapping if they
 * can before calling netif_set_real_num_tx_queues.
 */
static void netif_setup_tc(struct net_device *dev, unsigned int txq)
{
	int i;
	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];

	/* If TC0 is invalidated disable TC mapping */
	if (tc->offset + tc->count > txq) {
		pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
		dev->num_tc = 0;
		return;
	}

	/* Invalidated prio to tc mappings set to TC0 */
	for (i = 1; i < TC_BITMASK + 1; i++) {
		int q = netdev_get_prio_tc_map(dev, i);

		tc = &dev->tc_to_txq[q];
		if (tc->offset + tc->count > txq) {
			pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
				i, q);
			netdev_set_prio_tc_map(dev, i, 0);
		}
	}
}
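
/* Illustrative sketch (not part of this file): the tc_to_txq and prio-to-tc
 * tables checked above are normally filled in by the driver (or by mqprio)
 * using the helpers from netdevice.h, e.g. two traffic classes splitting four
 * tx queues, with priority 0 mapped to TC1:
 *
 *	netdev_set_num_tc(dev, 2);
 *	netdev_set_tc_queue(dev, 0, 2, 0);	// TC0: queues 0-1
 *	netdev_set_tc_queue(dev, 1, 2, 2);	// TC1: queues 2-3
 *	netdev_set_prio_tc_map(dev, 0, 1);
 */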
In the worst case TC0 1960 * is invalid nothing can be done so disable priority mappings. If is 1961 * expected that drivers will fix this mapping if they can before 1962 * calling netif_set_real_num_tx_queues. 1963 */ 1964 static void netif_setup_tc(struct net_device *dev, unsigned int txq) 1965 { 1966 int i; 1967 struct netdev_tc_txq *tc = &dev->tc_to_txq[0]; 1968 1969 /* If TC0 is invalidated disable TC mapping */ 1970 if (tc->offset + tc->count > txq) { 1971 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n"); 1972 dev->num_tc = 0; 1973 return; 1974 } 1975 1976 /* Invalidated prio to tc mappings set to TC0 */ 1977 for (i = 1; i < TC_BITMASK + 1; i++) { 1978 int q = netdev_get_prio_tc_map(dev, i); 1979 1980 tc = &dev->tc_to_txq[q]; 1981 if (tc->offset + tc->count > txq) { 1982 pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n", 1983 i, q); 1984 netdev_set_prio_tc_map(dev, i, 0); 1985 } 1986 } 1987 } 1988 1989 #ifdef CONFIG_XPS 1990 static DEFINE_MUTEX(xps_map_mutex); 1991 #define xmap_dereference(P) \ 1992 rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex)) 1993 1994 static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps, 1995 int cpu, u16 index) 1996 { 1997 struct xps_map *map = NULL; 1998 int pos; 1999 2000 if (dev_maps) 2001 map = xmap_dereference(dev_maps->cpu_map[cpu]); 2002 2003 for (pos = 0; map && pos < map->len; pos++) { 2004 if (map->queues[pos] == index) { 2005 if (map->len > 1) { 2006 map->queues[pos] = map->queues[--map->len]; 2007 } else { 2008 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL); 2009 kfree_rcu(map, rcu); 2010 map = NULL; 2011 } 2012 break; 2013 } 2014 } 2015 2016 return map; 2017 } 2018 2019 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index) 2020 { 2021 struct xps_dev_maps *dev_maps; 2022 int cpu, i; 2023 bool active = false; 2024 2025 mutex_lock(&xps_map_mutex); 2026 dev_maps = xmap_dereference(dev->xps_maps); 2027 2028 if (!dev_maps) 2029 goto out_no_maps; 2030 2031 for_each_possible_cpu(cpu) { 2032 for (i = index; i < dev->num_tx_queues; i++) { 2033 if (!remove_xps_queue(dev_maps, cpu, i)) 2034 break; 2035 } 2036 if (i == dev->num_tx_queues) 2037 active = true; 2038 } 2039 2040 if (!active) { 2041 RCU_INIT_POINTER(dev->xps_maps, NULL); 2042 kfree_rcu(dev_maps, rcu); 2043 } 2044 2045 for (i = index; i < dev->num_tx_queues; i++) 2046 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i), 2047 NUMA_NO_NODE); 2048 2049 out_no_maps: 2050 mutex_unlock(&xps_map_mutex); 2051 } 2052 2053 static struct xps_map *expand_xps_map(struct xps_map *map, 2054 int cpu, u16 index) 2055 { 2056 struct xps_map *new_map; 2057 int alloc_len = XPS_MIN_MAP_ALLOC; 2058 int i, pos; 2059 2060 for (pos = 0; map && pos < map->len; pos++) { 2061 if (map->queues[pos] != index) 2062 continue; 2063 return map; 2064 } 2065 2066 /* Need to add queue to this CPU's existing map */ 2067 if (map) { 2068 if (pos < map->alloc_len) 2069 return map; 2070 2071 alloc_len = map->alloc_len * 2; 2072 } 2073 2074 /* Need to allocate new map to store queue on this CPU's map */ 2075 new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL, 2076 cpu_to_node(cpu)); 2077 if (!new_map) 2078 return NULL; 2079 2080 for (i = 0; i < pos; i++) 2081 new_map->queues[i] = map->queues[i]; 2082 new_map->alloc_len = alloc_len; 2083 new_map->len = pos; 2084 2085 return new_map; 2086 } 2087 2088 int netif_set_xps_queue(struct net_device *dev, const 
struct cpumask *mask, 2089 u16 index) 2090 { 2091 struct xps_dev_maps *dev_maps, *new_dev_maps = NULL; 2092 struct xps_map *map, *new_map; 2093 int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES); 2094 int cpu, numa_node_id = -2; 2095 bool active = false; 2096 2097 mutex_lock(&xps_map_mutex); 2098 2099 dev_maps = xmap_dereference(dev->xps_maps); 2100 2101 /* allocate memory for queue storage */ 2102 for_each_online_cpu(cpu) { 2103 if (!cpumask_test_cpu(cpu, mask)) 2104 continue; 2105 2106 if (!new_dev_maps) 2107 new_dev_maps = kzalloc(maps_sz, GFP_KERNEL); 2108 if (!new_dev_maps) { 2109 mutex_unlock(&xps_map_mutex); 2110 return -ENOMEM; 2111 } 2112 2113 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) : 2114 NULL; 2115 2116 map = expand_xps_map(map, cpu, index); 2117 if (!map) 2118 goto error; 2119 2120 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map); 2121 } 2122 2123 if (!new_dev_maps) 2124 goto out_no_new_maps; 2125 2126 for_each_possible_cpu(cpu) { 2127 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) { 2128 /* add queue to CPU maps */ 2129 int pos = 0; 2130 2131 map = xmap_dereference(new_dev_maps->cpu_map[cpu]); 2132 while ((pos < map->len) && (map->queues[pos] != index)) 2133 pos++; 2134 2135 if (pos == map->len) 2136 map->queues[map->len++] = index; 2137 #ifdef CONFIG_NUMA 2138 if (numa_node_id == -2) 2139 numa_node_id = cpu_to_node(cpu); 2140 else if (numa_node_id != cpu_to_node(cpu)) 2141 numa_node_id = -1; 2142 #endif 2143 } else if (dev_maps) { 2144 /* fill in the new device map from the old device map */ 2145 map = xmap_dereference(dev_maps->cpu_map[cpu]); 2146 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map); 2147 } 2148 2149 } 2150 2151 rcu_assign_pointer(dev->xps_maps, new_dev_maps); 2152 2153 /* Cleanup old maps */ 2154 if (dev_maps) { 2155 for_each_possible_cpu(cpu) { 2156 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]); 2157 map = xmap_dereference(dev_maps->cpu_map[cpu]); 2158 if (map && map != new_map) 2159 kfree_rcu(map, rcu); 2160 } 2161 2162 kfree_rcu(dev_maps, rcu); 2163 } 2164 2165 dev_maps = new_dev_maps; 2166 active = true; 2167 2168 out_no_new_maps: 2169 /* update Tx queue numa node */ 2170 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index), 2171 (numa_node_id >= 0) ? numa_node_id : 2172 NUMA_NO_NODE); 2173 2174 if (!dev_maps) 2175 goto out_no_maps; 2176 2177 /* removes queue from unused CPUs */ 2178 for_each_possible_cpu(cpu) { 2179 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) 2180 continue; 2181 2182 if (remove_xps_queue(dev_maps, cpu, index)) 2183 active = true; 2184 } 2185 2186 /* free map if not active */ 2187 if (!active) { 2188 RCU_INIT_POINTER(dev->xps_maps, NULL); 2189 kfree_rcu(dev_maps, rcu); 2190 } 2191 2192 out_no_maps: 2193 mutex_unlock(&xps_map_mutex); 2194 2195 return 0; 2196 error: 2197 /* remove any maps that we added */ 2198 for_each_possible_cpu(cpu) { 2199 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]); 2200 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) : 2201 NULL; 2202 if (new_map && new_map != map) 2203 kfree(new_map); 2204 } 2205 2206 mutex_unlock(&xps_map_mutex); 2207 2208 kfree(new_dev_maps); 2209 return -ENOMEM; 2210 } 2211 EXPORT_SYMBOL(netif_set_xps_queue); 2212 2213 #endif 2214 /* 2215 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues 2216 * greater then real_num_tx_queues stale skbs on the qdisc must be flushed. 
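 *
 * Editorial illustration (not part of the original source): a driver
 * resizing its channels under rtnl typically pairs this with the RX
 * variant further below. The ethtool-style callback name here is
 * hypothetical:
 *
 *	static int example_set_channel_count(struct net_device *dev,
 *					     unsigned int count)
 *	{
 *		int err;
 *
 *		ASSERT_RTNL();
 *		err = netif_set_real_num_tx_queues(dev, count);
 *		if (err)
 *			return err;
 *		return netif_set_real_num_rx_queues(dev, count);
 *	}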
2217 */ 2218 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq) 2219 { 2220 bool disabling; 2221 int rc; 2222 2223 disabling = txq < dev->real_num_tx_queues; 2224 2225 if (txq < 1 || txq > dev->num_tx_queues) 2226 return -EINVAL; 2227 2228 if (dev->reg_state == NETREG_REGISTERED || 2229 dev->reg_state == NETREG_UNREGISTERING) { 2230 ASSERT_RTNL(); 2231 2232 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues, 2233 txq); 2234 if (rc) 2235 return rc; 2236 2237 if (dev->num_tc) 2238 netif_setup_tc(dev, txq); 2239 2240 dev->real_num_tx_queues = txq; 2241 2242 if (disabling) { 2243 synchronize_net(); 2244 qdisc_reset_all_tx_gt(dev, txq); 2245 #ifdef CONFIG_XPS 2246 netif_reset_xps_queues_gt(dev, txq); 2247 #endif 2248 } 2249 } else { 2250 dev->real_num_tx_queues = txq; 2251 } 2252 2253 return 0; 2254 } 2255 EXPORT_SYMBOL(netif_set_real_num_tx_queues); 2256 2257 #ifdef CONFIG_SYSFS 2258 /** 2259 * netif_set_real_num_rx_queues - set actual number of RX queues used 2260 * @dev: Network device 2261 * @rxq: Actual number of RX queues 2262 * 2263 * This must be called either with the rtnl_lock held or before 2264 * registration of the net device. Returns 0 on success, or a 2265 * negative error code. If called before registration, it always 2266 * succeeds. 2267 */ 2268 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq) 2269 { 2270 int rc; 2271 2272 if (rxq < 1 || rxq > dev->num_rx_queues) 2273 return -EINVAL; 2274 2275 if (dev->reg_state == NETREG_REGISTERED) { 2276 ASSERT_RTNL(); 2277 2278 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues, 2279 rxq); 2280 if (rc) 2281 return rc; 2282 } 2283 2284 dev->real_num_rx_queues = rxq; 2285 return 0; 2286 } 2287 EXPORT_SYMBOL(netif_set_real_num_rx_queues); 2288 #endif 2289 2290 /** 2291 * netif_get_num_default_rss_queues - default number of RSS queues 2292 * 2293 * This routine should set an upper limit on the number of RSS queues 2294 * used by default by multiqueue devices. 2295 */ 2296 int netif_get_num_default_rss_queues(void) 2297 { 2298 return is_kdump_kernel() ? 
2299 1 : min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus()); 2300 } 2301 EXPORT_SYMBOL(netif_get_num_default_rss_queues); 2302 2303 static void __netif_reschedule(struct Qdisc *q) 2304 { 2305 struct softnet_data *sd; 2306 unsigned long flags; 2307 2308 local_irq_save(flags); 2309 sd = this_cpu_ptr(&softnet_data); 2310 q->next_sched = NULL; 2311 *sd->output_queue_tailp = q; 2312 sd->output_queue_tailp = &q->next_sched; 2313 raise_softirq_irqoff(NET_TX_SOFTIRQ); 2314 local_irq_restore(flags); 2315 } 2316 2317 void __netif_schedule(struct Qdisc *q) 2318 { 2319 if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state)) 2320 __netif_reschedule(q); 2321 } 2322 EXPORT_SYMBOL(__netif_schedule); 2323 2324 struct dev_kfree_skb_cb { 2325 enum skb_free_reason reason; 2326 }; 2327 2328 static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb) 2329 { 2330 return (struct dev_kfree_skb_cb *)skb->cb; 2331 } 2332 2333 void netif_schedule_queue(struct netdev_queue *txq) 2334 { 2335 rcu_read_lock(); 2336 if (!(txq->state & QUEUE_STATE_ANY_XOFF)) { 2337 struct Qdisc *q = rcu_dereference(txq->qdisc); 2338 2339 __netif_schedule(q); 2340 } 2341 rcu_read_unlock(); 2342 } 2343 EXPORT_SYMBOL(netif_schedule_queue); 2344 2345 /** 2346 * netif_wake_subqueue - allow sending packets on subqueue 2347 * @dev: network device 2348 * @queue_index: sub queue index 2349 * 2350 * Resume individual transmit queue of a device with multiple transmit queues. 2351 */ 2352 void netif_wake_subqueue(struct net_device *dev, u16 queue_index) 2353 { 2354 struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index); 2355 2356 if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) { 2357 struct Qdisc *q; 2358 2359 rcu_read_lock(); 2360 q = rcu_dereference(txq->qdisc); 2361 __netif_schedule(q); 2362 rcu_read_unlock(); 2363 } 2364 } 2365 EXPORT_SYMBOL(netif_wake_subqueue); 2366 2367 void netif_tx_wake_queue(struct netdev_queue *dev_queue) 2368 { 2369 if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) { 2370 struct Qdisc *q; 2371 2372 rcu_read_lock(); 2373 q = rcu_dereference(dev_queue->qdisc); 2374 __netif_schedule(q); 2375 rcu_read_unlock(); 2376 } 2377 } 2378 EXPORT_SYMBOL(netif_tx_wake_queue); 2379 2380 void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason) 2381 { 2382 unsigned long flags; 2383 2384 if (unlikely(!skb)) 2385 return; 2386 2387 if (likely(atomic_read(&skb->users) == 1)) { 2388 smp_rmb(); 2389 atomic_set(&skb->users, 0); 2390 } else if (likely(!atomic_dec_and_test(&skb->users))) { 2391 return; 2392 } 2393 get_kfree_skb_cb(skb)->reason = reason; 2394 local_irq_save(flags); 2395 skb->next = __this_cpu_read(softnet_data.completion_queue); 2396 __this_cpu_write(softnet_data.completion_queue, skb); 2397 raise_softirq_irqoff(NET_TX_SOFTIRQ); 2398 local_irq_restore(flags); 2399 } 2400 EXPORT_SYMBOL(__dev_kfree_skb_irq); 2401 2402 void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason) 2403 { 2404 if (in_irq() || irqs_disabled()) 2405 __dev_kfree_skb_irq(skb, reason); 2406 else 2407 dev_kfree_skb(skb); 2408 } 2409 EXPORT_SYMBOL(__dev_kfree_skb_any); 2410 2411 2412 /** 2413 * netif_device_detach - mark device as removed 2414 * @dev: network device 2415 * 2416 * Mark device as removed from system and therefore no longer available. 
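 *
 * Editorial illustration (not part of the original source): the classic
 * users are driver suspend/resume handlers. The PCI callback names and
 * the use of pci_get_drvdata() are a hypothetical sketch:
 *
 *	static int example_suspend(struct pci_dev *pdev, pm_message_t state)
 *	{
 *		struct net_device *dev = pci_get_drvdata(pdev);
 *
 *		netif_device_detach(dev);
 *		return 0;
 *	}
 *
 *	static int example_resume(struct pci_dev *pdev)
 *	{
 *		struct net_device *dev = pci_get_drvdata(pdev);
 *
 *		netif_device_attach(dev);
 *		return 0;
 *	}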
2417 */ 2418 void netif_device_detach(struct net_device *dev) 2419 { 2420 if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) && 2421 netif_running(dev)) { 2422 netif_tx_stop_all_queues(dev); 2423 } 2424 } 2425 EXPORT_SYMBOL(netif_device_detach); 2426 2427 /** 2428 * netif_device_attach - mark device as attached 2429 * @dev: network device 2430 * 2431 * Mark device as attached from system and restart if needed. 2432 */ 2433 void netif_device_attach(struct net_device *dev) 2434 { 2435 if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) && 2436 netif_running(dev)) { 2437 netif_tx_wake_all_queues(dev); 2438 __netdev_watchdog_up(dev); 2439 } 2440 } 2441 EXPORT_SYMBOL(netif_device_attach); 2442 2443 /* 2444 * Returns a Tx hash based on the given packet descriptor a Tx queues' number 2445 * to be used as a distribution range. 2446 */ 2447 u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb, 2448 unsigned int num_tx_queues) 2449 { 2450 u32 hash; 2451 u16 qoffset = 0; 2452 u16 qcount = num_tx_queues; 2453 2454 if (skb_rx_queue_recorded(skb)) { 2455 hash = skb_get_rx_queue(skb); 2456 while (unlikely(hash >= num_tx_queues)) 2457 hash -= num_tx_queues; 2458 return hash; 2459 } 2460 2461 if (dev->num_tc) { 2462 u8 tc = netdev_get_prio_tc_map(dev, skb->priority); 2463 qoffset = dev->tc_to_txq[tc].offset; 2464 qcount = dev->tc_to_txq[tc].count; 2465 } 2466 2467 return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset; 2468 } 2469 EXPORT_SYMBOL(__skb_tx_hash); 2470 2471 static void skb_warn_bad_offload(const struct sk_buff *skb) 2472 { 2473 static const netdev_features_t null_features; 2474 struct net_device *dev = skb->dev; 2475 const char *name = ""; 2476 2477 if (!net_ratelimit()) 2478 return; 2479 2480 if (dev) { 2481 if (dev->dev.parent) 2482 name = dev_driver_string(dev->dev.parent); 2483 else 2484 name = netdev_name(dev); 2485 } 2486 WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d " 2487 "gso_type=%d ip_summed=%d\n", 2488 name, dev ? &dev->features : &null_features, 2489 skb->sk ? &skb->sk->sk_route_caps : &null_features, 2490 skb->len, skb->data_len, skb_shinfo(skb)->gso_size, 2491 skb_shinfo(skb)->gso_type, skb->ip_summed); 2492 } 2493 2494 /* 2495 * Invalidate hardware checksum when packet is to be mangled, and 2496 * complete checksum manually on outgoing path. 2497 */ 2498 int skb_checksum_help(struct sk_buff *skb) 2499 { 2500 __wsum csum; 2501 int ret = 0, offset; 2502 2503 if (skb->ip_summed == CHECKSUM_COMPLETE) 2504 goto out_set_summed; 2505 2506 if (unlikely(skb_shinfo(skb)->gso_size)) { 2507 skb_warn_bad_offload(skb); 2508 return -EINVAL; 2509 } 2510 2511 /* Before computing a checksum, we should make sure no frag could 2512 * be modified by an external entity : checksum could be wrong. 
2513 */ 2514 if (skb_has_shared_frag(skb)) { 2515 ret = __skb_linearize(skb); 2516 if (ret) 2517 goto out; 2518 } 2519 2520 offset = skb_checksum_start_offset(skb); 2521 BUG_ON(offset >= skb_headlen(skb)); 2522 csum = skb_checksum(skb, offset, skb->len - offset, 0); 2523 2524 offset += skb->csum_offset; 2525 BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb)); 2526 2527 if (skb_cloned(skb) && 2528 !skb_clone_writable(skb, offset + sizeof(__sum16))) { 2529 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC); 2530 if (ret) 2531 goto out; 2532 } 2533 2534 *(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0; 2535 out_set_summed: 2536 skb->ip_summed = CHECKSUM_NONE; 2537 out: 2538 return ret; 2539 } 2540 EXPORT_SYMBOL(skb_checksum_help); 2541 2542 /* skb_csum_offload_check - Driver helper function to determine if a device 2543 * with limited checksum offload capabilities is able to offload the checksum 2544 * for a given packet. 2545 * 2546 * Arguments: 2547 * skb - sk_buff for the packet in question 2548 * spec - contains the description of what device can offload 2549 * csum_encapped - returns true if the checksum being offloaded is 2550 * encpasulated. That is it is checksum for the transport header 2551 * in the inner headers. 2552 * checksum_help - when set indicates that helper function should 2553 * call skb_checksum_help if offload checks fail 2554 * 2555 * Returns: 2556 * true: Packet has passed the checksum checks and should be offloadable to 2557 * the device (a driver may still need to check for additional 2558 * restrictions of its device) 2559 * false: Checksum is not offloadable. If checksum_help was set then 2560 * skb_checksum_help was called to resolve checksum for non-GSO 2561 * packets and when IP protocol is not SCTP 2562 */ 2563 bool __skb_csum_offload_chk(struct sk_buff *skb, 2564 const struct skb_csum_offl_spec *spec, 2565 bool *csum_encapped, 2566 bool csum_help) 2567 { 2568 struct iphdr *iph; 2569 struct ipv6hdr *ipv6; 2570 void *nhdr; 2571 int protocol; 2572 u8 ip_proto; 2573 2574 if (skb->protocol == htons(ETH_P_8021Q) || 2575 skb->protocol == htons(ETH_P_8021AD)) { 2576 if (!spec->vlan_okay) 2577 goto need_help; 2578 } 2579 2580 /* We check whether the checksum refers to a transport layer checksum in 2581 * the outermost header or an encapsulated transport layer checksum that 2582 * corresponds to the inner headers of the skb. If the checksum is for 2583 * something else in the packet we need help. 
2584 */ 2585 if (skb_checksum_start_offset(skb) == skb_transport_offset(skb)) { 2586 /* Non-encapsulated checksum */ 2587 protocol = eproto_to_ipproto(vlan_get_protocol(skb)); 2588 nhdr = skb_network_header(skb); 2589 *csum_encapped = false; 2590 if (spec->no_not_encapped) 2591 goto need_help; 2592 } else if (skb->encapsulation && spec->encap_okay && 2593 skb_checksum_start_offset(skb) == 2594 skb_inner_transport_offset(skb)) { 2595 /* Encapsulated checksum */ 2596 *csum_encapped = true; 2597 switch (skb->inner_protocol_type) { 2598 case ENCAP_TYPE_ETHER: 2599 protocol = eproto_to_ipproto(skb->inner_protocol); 2600 break; 2601 case ENCAP_TYPE_IPPROTO: 2602 protocol = skb->inner_protocol; 2603 break; 2604 } 2605 nhdr = skb_inner_network_header(skb); 2606 } else { 2607 goto need_help; 2608 } 2609 2610 switch (protocol) { 2611 case IPPROTO_IP: 2612 if (!spec->ipv4_okay) 2613 goto need_help; 2614 iph = nhdr; 2615 ip_proto = iph->protocol; 2616 if (iph->ihl != 5 && !spec->ip_options_okay) 2617 goto need_help; 2618 break; 2619 case IPPROTO_IPV6: 2620 if (!spec->ipv6_okay) 2621 goto need_help; 2622 if (spec->no_encapped_ipv6 && *csum_encapped) 2623 goto need_help; 2624 ipv6 = nhdr; 2625 nhdr += sizeof(*ipv6); 2626 ip_proto = ipv6->nexthdr; 2627 break; 2628 default: 2629 goto need_help; 2630 } 2631 2632 ip_proto_again: 2633 switch (ip_proto) { 2634 case IPPROTO_TCP: 2635 if (!spec->tcp_okay || 2636 skb->csum_offset != offsetof(struct tcphdr, check)) 2637 goto need_help; 2638 break; 2639 case IPPROTO_UDP: 2640 if (!spec->udp_okay || 2641 skb->csum_offset != offsetof(struct udphdr, check)) 2642 goto need_help; 2643 break; 2644 case IPPROTO_SCTP: 2645 if (!spec->sctp_okay || 2646 skb->csum_offset != offsetof(struct sctphdr, checksum)) 2647 goto cant_help; 2648 break; 2649 case NEXTHDR_HOP: 2650 case NEXTHDR_ROUTING: 2651 case NEXTHDR_DEST: { 2652 u8 *opthdr = nhdr; 2653 2654 if (protocol != IPPROTO_IPV6 || !spec->ext_hdrs_okay) 2655 goto need_help; 2656 2657 ip_proto = opthdr[0]; 2658 nhdr += (opthdr[1] + 1) << 3; 2659 2660 goto ip_proto_again; 2661 } 2662 default: 2663 goto need_help; 2664 } 2665 2666 /* Passed the tests for offloading checksum */ 2667 return true; 2668 2669 need_help: 2670 if (csum_help && !skb_shinfo(skb)->gso_size) 2671 skb_checksum_help(skb); 2672 cant_help: 2673 return false; 2674 } 2675 EXPORT_SYMBOL(__skb_csum_offload_chk); 2676 2677 __be16 skb_network_protocol(struct sk_buff *skb, int *depth) 2678 { 2679 __be16 type = skb->protocol; 2680 2681 /* Tunnel gso handlers can set protocol to ethernet. */ 2682 if (type == htons(ETH_P_TEB)) { 2683 struct ethhdr *eth; 2684 2685 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr)))) 2686 return 0; 2687 2688 eth = (struct ethhdr *)skb->data; 2689 type = eth->h_proto; 2690 } 2691 2692 return __vlan_get_protocol(skb, type, depth); 2693 } 2694 2695 /** 2696 * skb_mac_gso_segment - mac layer segmentation handler. 
2697 * @skb: buffer to segment 2698 * @features: features for the output path (see dev->features) 2699 */ 2700 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb, 2701 netdev_features_t features) 2702 { 2703 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); 2704 struct packet_offload *ptype; 2705 int vlan_depth = skb->mac_len; 2706 __be16 type = skb_network_protocol(skb, &vlan_depth); 2707 2708 if (unlikely(!type)) 2709 return ERR_PTR(-EINVAL); 2710 2711 __skb_pull(skb, vlan_depth); 2712 2713 rcu_read_lock(); 2714 list_for_each_entry_rcu(ptype, &offload_base, list) { 2715 if (ptype->type == type && ptype->callbacks.gso_segment) { 2716 segs = ptype->callbacks.gso_segment(skb, features); 2717 break; 2718 } 2719 } 2720 rcu_read_unlock(); 2721 2722 __skb_push(skb, skb->data - skb_mac_header(skb)); 2723 2724 return segs; 2725 } 2726 EXPORT_SYMBOL(skb_mac_gso_segment); 2727 2728 2729 /* openvswitch calls this on rx path, so we need a different check. 2730 */ 2731 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path) 2732 { 2733 if (tx_path) 2734 return skb->ip_summed != CHECKSUM_PARTIAL && 2735 skb->ip_summed != CHECKSUM_UNNECESSARY; 2736 2737 return skb->ip_summed == CHECKSUM_NONE; 2738 } 2739 2740 /** 2741 * __skb_gso_segment - Perform segmentation on skb. 2742 * @skb: buffer to segment 2743 * @features: features for the output path (see dev->features) 2744 * @tx_path: whether it is called in TX path 2745 * 2746 * This function segments the given skb and returns a list of segments. 2747 * 2748 * It may return NULL if the skb requires no segmentation. This is 2749 * only possible when GSO is used for verifying header integrity. 2750 * 2751 * Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb. 2752 */ 2753 struct sk_buff *__skb_gso_segment(struct sk_buff *skb, 2754 netdev_features_t features, bool tx_path) 2755 { 2756 struct sk_buff *segs; 2757 2758 if (unlikely(skb_needs_check(skb, tx_path))) { 2759 int err; 2760 2761 /* We're going to init ->check field in TCP or UDP header */ 2762 err = skb_cow_head(skb, 0); 2763 if (err < 0) 2764 return ERR_PTR(err); 2765 } 2766 2767 /* Only report GSO partial support if it will enable us to 2768 * support segmentation on this frame without needing additional 2769 * work. 2770 */ 2771 if (features & NETIF_F_GSO_PARTIAL) { 2772 netdev_features_t partial_features = NETIF_F_GSO_ROBUST; 2773 struct net_device *dev = skb->dev; 2774 2775 partial_features |= dev->features & dev->gso_partial_features; 2776 if (!skb_gso_ok(skb, features | partial_features)) 2777 features &= ~NETIF_F_GSO_PARTIAL; 2778 } 2779 2780 BUILD_BUG_ON(SKB_SGO_CB_OFFSET + 2781 sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb)); 2782 2783 SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb); 2784 SKB_GSO_CB(skb)->encap_level = 0; 2785 2786 skb_reset_mac_header(skb); 2787 skb_reset_mac_len(skb); 2788 2789 segs = skb_mac_gso_segment(skb, features); 2790 2791 if (unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs))) 2792 skb_warn_bad_offload(skb); 2793 2794 return segs; 2795 } 2796 EXPORT_SYMBOL(__skb_gso_segment); 2797 2798 /* Take action when hardware reception checksum errors are detected. */ 2799 #ifdef CONFIG_BUG 2800 void netdev_rx_csum_fault(struct net_device *dev) 2801 { 2802 if (net_ratelimit()) { 2803 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>"); 2804 dump_stack(); 2805 } 2806 } 2807 EXPORT_SYMBOL(netdev_rx_csum_fault); 2808 #endif 2809 2810 /* Actually, we should eliminate this check as soon as we know, that: 2811 * 1. 
IOMMU is present and allows to map all the memory. 2812 * 2. No high memory really exists on this machine. 2813 */ 2814 2815 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb) 2816 { 2817 #ifdef CONFIG_HIGHMEM 2818 int i; 2819 if (!(dev->features & NETIF_F_HIGHDMA)) { 2820 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 2821 skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; 2822 if (PageHighMem(skb_frag_page(frag))) 2823 return 1; 2824 } 2825 } 2826 2827 if (PCI_DMA_BUS_IS_PHYS) { 2828 struct device *pdev = dev->dev.parent; 2829 2830 if (!pdev) 2831 return 0; 2832 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 2833 skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; 2834 dma_addr_t addr = page_to_phys(skb_frag_page(frag)); 2835 if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask) 2836 return 1; 2837 } 2838 } 2839 #endif 2840 return 0; 2841 } 2842 2843 /* If MPLS offload request, verify we are testing hardware MPLS features 2844 * instead of standard features for the netdev. 2845 */ 2846 #if IS_ENABLED(CONFIG_NET_MPLS_GSO) 2847 static netdev_features_t net_mpls_features(struct sk_buff *skb, 2848 netdev_features_t features, 2849 __be16 type) 2850 { 2851 if (eth_p_mpls(type)) 2852 features &= skb->dev->mpls_features; 2853 2854 return features; 2855 } 2856 #else 2857 static netdev_features_t net_mpls_features(struct sk_buff *skb, 2858 netdev_features_t features, 2859 __be16 type) 2860 { 2861 return features; 2862 } 2863 #endif 2864 2865 static netdev_features_t harmonize_features(struct sk_buff *skb, 2866 netdev_features_t features) 2867 { 2868 int tmp; 2869 __be16 type; 2870 2871 type = skb_network_protocol(skb, &tmp); 2872 features = net_mpls_features(skb, features, type); 2873 2874 if (skb->ip_summed != CHECKSUM_NONE && 2875 !can_checksum_protocol(features, type)) { 2876 features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK); 2877 } 2878 if (illegal_highdma(skb->dev, skb)) 2879 features &= ~NETIF_F_SG; 2880 2881 return features; 2882 } 2883 2884 netdev_features_t passthru_features_check(struct sk_buff *skb, 2885 struct net_device *dev, 2886 netdev_features_t features) 2887 { 2888 return features; 2889 } 2890 EXPORT_SYMBOL(passthru_features_check); 2891 2892 static netdev_features_t dflt_features_check(struct sk_buff *skb, 2893 struct net_device *dev, 2894 netdev_features_t features) 2895 { 2896 return vlan_features_check(skb, features); 2897 } 2898 2899 static netdev_features_t gso_features_check(const struct sk_buff *skb, 2900 struct net_device *dev, 2901 netdev_features_t features) 2902 { 2903 u16 gso_segs = skb_shinfo(skb)->gso_segs; 2904 2905 if (gso_segs > dev->gso_max_segs) 2906 return features & ~NETIF_F_GSO_MASK; 2907 2908 /* Support for GSO partial features requires software 2909 * intervention before we can actually process the packets 2910 * so we need to strip support for any partial features now 2911 * and we can pull them back in after we have partially 2912 * segmented the frame. 2913 */ 2914 if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL)) 2915 features &= ~dev->gso_partial_features; 2916 2917 /* Make sure to clear the IPv4 ID mangling feature if the 2918 * IPv4 header has the potential to be fragmented. 2919 */ 2920 if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) { 2921 struct iphdr *iph = skb->encapsulation ? 
2922 inner_ip_hdr(skb) : ip_hdr(skb); 2923 2924 if (!(iph->frag_off & htons(IP_DF))) 2925 features &= ~NETIF_F_TSO_MANGLEID; 2926 } 2927 2928 return features; 2929 } 2930 2931 netdev_features_t netif_skb_features(struct sk_buff *skb) 2932 { 2933 struct net_device *dev = skb->dev; 2934 netdev_features_t features = dev->features; 2935 2936 if (skb_is_gso(skb)) 2937 features = gso_features_check(skb, dev, features); 2938 2939 /* If encapsulation offload request, verify we are testing 2940 * hardware encapsulation features instead of standard 2941 * features for the netdev 2942 */ 2943 if (skb->encapsulation) 2944 features &= dev->hw_enc_features; 2945 2946 if (skb_vlan_tagged(skb)) 2947 features = netdev_intersect_features(features, 2948 dev->vlan_features | 2949 NETIF_F_HW_VLAN_CTAG_TX | 2950 NETIF_F_HW_VLAN_STAG_TX); 2951 2952 if (dev->netdev_ops->ndo_features_check) 2953 features &= dev->netdev_ops->ndo_features_check(skb, dev, 2954 features); 2955 else 2956 features &= dflt_features_check(skb, dev, features); 2957 2958 return harmonize_features(skb, features); 2959 } 2960 EXPORT_SYMBOL(netif_skb_features); 2961 2962 static int xmit_one(struct sk_buff *skb, struct net_device *dev, 2963 struct netdev_queue *txq, bool more) 2964 { 2965 unsigned int len; 2966 int rc; 2967 2968 if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all)) 2969 dev_queue_xmit_nit(skb, dev); 2970 2971 len = skb->len; 2972 trace_net_dev_start_xmit(skb, dev); 2973 rc = netdev_start_xmit(skb, dev, txq, more); 2974 trace_net_dev_xmit(skb, rc, dev, len); 2975 2976 return rc; 2977 } 2978 2979 struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev, 2980 struct netdev_queue *txq, int *ret) 2981 { 2982 struct sk_buff *skb = first; 2983 int rc = NETDEV_TX_OK; 2984 2985 while (skb) { 2986 struct sk_buff *next = skb->next; 2987 2988 skb->next = NULL; 2989 rc = xmit_one(skb, dev, txq, next != NULL); 2990 if (unlikely(!dev_xmit_complete(rc))) { 2991 skb->next = next; 2992 goto out; 2993 } 2994 2995 skb = next; 2996 if (netif_tx_queue_stopped(txq) && skb) { 2997 rc = NETDEV_TX_BUSY; 2998 break; 2999 } 3000 } 3001 3002 out: 3003 *ret = rc; 3004 return skb; 3005 } 3006 3007 static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb, 3008 netdev_features_t features) 3009 { 3010 if (skb_vlan_tag_present(skb) && 3011 !vlan_hw_offload_capable(features, skb->vlan_proto)) 3012 skb = __vlan_hwaccel_push_inside(skb); 3013 return skb; 3014 } 3015 3016 static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev) 3017 { 3018 netdev_features_t features; 3019 3020 features = netif_skb_features(skb); 3021 skb = validate_xmit_vlan(skb, features); 3022 if (unlikely(!skb)) 3023 goto out_null; 3024 3025 if (netif_needs_gso(skb, features)) { 3026 struct sk_buff *segs; 3027 3028 segs = skb_gso_segment(skb, features); 3029 if (IS_ERR(segs)) { 3030 goto out_kfree_skb; 3031 } else if (segs) { 3032 consume_skb(skb); 3033 skb = segs; 3034 } 3035 } else { 3036 if (skb_needs_linearize(skb, features) && 3037 __skb_linearize(skb)) 3038 goto out_kfree_skb; 3039 3040 /* If packet is not checksummed and device does not 3041 * support checksumming for this protocol, complete 3042 * checksumming here. 
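 * (Editorial note: CHECKSUM_PARTIAL means the stack has left the
 * checksum for the device to fill in at csum_start + csum_offset; when
 * the device lacks the required NETIF_F_*_CSUM bit in 'features', we
 * fall back to skb_checksum_help() just below.)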
3043 */ 3044 if (skb->ip_summed == CHECKSUM_PARTIAL) { 3045 if (skb->encapsulation) 3046 skb_set_inner_transport_header(skb, 3047 skb_checksum_start_offset(skb)); 3048 else 3049 skb_set_transport_header(skb, 3050 skb_checksum_start_offset(skb)); 3051 if (!(features & NETIF_F_CSUM_MASK) && 3052 skb_checksum_help(skb)) 3053 goto out_kfree_skb; 3054 } 3055 } 3056 3057 return skb; 3058 3059 out_kfree_skb: 3060 kfree_skb(skb); 3061 out_null: 3062 atomic_long_inc(&dev->tx_dropped); 3063 return NULL; 3064 } 3065 3066 struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev) 3067 { 3068 struct sk_buff *next, *head = NULL, *tail; 3069 3070 for (; skb != NULL; skb = next) { 3071 next = skb->next; 3072 skb->next = NULL; 3073 3074 /* in case skb wont be segmented, point to itself */ 3075 skb->prev = skb; 3076 3077 skb = validate_xmit_skb(skb, dev); 3078 if (!skb) 3079 continue; 3080 3081 if (!head) 3082 head = skb; 3083 else 3084 tail->next = skb; 3085 /* If skb was segmented, skb->prev points to 3086 * the last segment. If not, it still contains skb. 3087 */ 3088 tail = skb->prev; 3089 } 3090 return head; 3091 } 3092 EXPORT_SYMBOL_GPL(validate_xmit_skb_list); 3093 3094 static void qdisc_pkt_len_init(struct sk_buff *skb) 3095 { 3096 const struct skb_shared_info *shinfo = skb_shinfo(skb); 3097 3098 qdisc_skb_cb(skb)->pkt_len = skb->len; 3099 3100 /* To get more precise estimation of bytes sent on wire, 3101 * we add to pkt_len the headers size of all segments 3102 */ 3103 if (shinfo->gso_size) { 3104 unsigned int hdr_len; 3105 u16 gso_segs = shinfo->gso_segs; 3106 3107 /* mac layer + network layer */ 3108 hdr_len = skb_transport_header(skb) - skb_mac_header(skb); 3109 3110 /* + transport layer */ 3111 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) { 3112 const struct tcphdr *th; 3113 struct tcphdr _tcphdr; 3114 3115 th = skb_header_pointer(skb, skb_transport_offset(skb), 3116 sizeof(_tcphdr), &_tcphdr); 3117 if (likely(th)) 3118 hdr_len += __tcp_hdrlen(th); 3119 } else { 3120 struct udphdr _udphdr; 3121 3122 if (skb_header_pointer(skb, skb_transport_offset(skb), 3123 sizeof(_udphdr), &_udphdr)) 3124 hdr_len += sizeof(struct udphdr); 3125 } 3126 3127 if (shinfo->gso_type & SKB_GSO_DODGY) 3128 gso_segs = DIV_ROUND_UP(skb->len - hdr_len, 3129 shinfo->gso_size); 3130 3131 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len; 3132 } 3133 } 3134 3135 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, 3136 struct net_device *dev, 3137 struct netdev_queue *txq) 3138 { 3139 spinlock_t *root_lock = qdisc_lock(q); 3140 struct sk_buff *to_free = NULL; 3141 bool contended; 3142 int rc; 3143 3144 qdisc_calculate_pkt_len(skb, q); 3145 /* 3146 * Heuristic to force contended enqueues to serialize on a 3147 * separate lock before trying to get qdisc main lock. 3148 * This permits qdisc->running owner to get the lock more 3149 * often and dequeue packets faster. 3150 */ 3151 contended = qdisc_is_running(q); 3152 if (unlikely(contended)) 3153 spin_lock(&q->busylock); 3154 3155 spin_lock(root_lock); 3156 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) { 3157 __qdisc_drop(skb, &to_free); 3158 rc = NET_XMIT_DROP; 3159 } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) && 3160 qdisc_run_begin(q)) { 3161 /* 3162 * This is a work-conserving queue; there are no old skbs 3163 * waiting to be sent out; and the qdisc is not running - 3164 * xmit the skb directly. 
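 * (Editorial note: this is the TCQ_F_CAN_BYPASS fast path; the packet
 * never goes through enqueue()/dequeue(), which is why the qdisc's
 * byte/packet stats are updated by hand right below before calling
 * sch_direct_xmit().)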
3165 */ 3166 3167 qdisc_bstats_update(q, skb); 3168 3169 if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) { 3170 if (unlikely(contended)) { 3171 spin_unlock(&q->busylock); 3172 contended = false; 3173 } 3174 __qdisc_run(q); 3175 } else 3176 qdisc_run_end(q); 3177 3178 rc = NET_XMIT_SUCCESS; 3179 } else { 3180 rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK; 3181 if (qdisc_run_begin(q)) { 3182 if (unlikely(contended)) { 3183 spin_unlock(&q->busylock); 3184 contended = false; 3185 } 3186 __qdisc_run(q); 3187 } 3188 } 3189 spin_unlock(root_lock); 3190 if (unlikely(to_free)) 3191 kfree_skb_list(to_free); 3192 if (unlikely(contended)) 3193 spin_unlock(&q->busylock); 3194 return rc; 3195 } 3196 3197 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO) 3198 static void skb_update_prio(struct sk_buff *skb) 3199 { 3200 const struct netprio_map *map; 3201 const struct sock *sk; 3202 unsigned int prioidx; 3203 3204 if (skb->priority) 3205 return; 3206 map = rcu_dereference_bh(skb->dev->priomap); 3207 if (!map) 3208 return; 3209 sk = skb_to_full_sk(skb); 3210 if (!sk) 3211 return; 3212 3213 prioidx = sock_cgroup_prioidx(&sk->sk_cgrp_data); 3214 3215 if (prioidx < map->priomap_len) 3216 skb->priority = map->priomap[prioidx]; 3217 } 3218 #else 3219 #define skb_update_prio(skb) 3220 #endif 3221 3222 DEFINE_PER_CPU(int, xmit_recursion); 3223 EXPORT_SYMBOL(xmit_recursion); 3224 3225 /** 3226 * dev_loopback_xmit - loop back @skb 3227 * @net: network namespace this loopback is happening in 3228 * @sk: sk needed to be a netfilter okfn 3229 * @skb: buffer to transmit 3230 */ 3231 int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) 3232 { 3233 skb_reset_mac_header(skb); 3234 __skb_pull(skb, skb_network_offset(skb)); 3235 skb->pkt_type = PACKET_LOOPBACK; 3236 skb->ip_summed = CHECKSUM_UNNECESSARY; 3237 WARN_ON(!skb_dst(skb)); 3238 skb_dst_force(skb); 3239 netif_rx_ni(skb); 3240 return 0; 3241 } 3242 EXPORT_SYMBOL(dev_loopback_xmit); 3243 3244 #ifdef CONFIG_NET_EGRESS 3245 static struct sk_buff * 3246 sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) 3247 { 3248 struct tcf_proto *cl = rcu_dereference_bh(dev->egress_cl_list); 3249 struct tcf_result cl_res; 3250 3251 if (!cl) 3252 return skb; 3253 3254 /* skb->tc_verd and qdisc_skb_cb(skb)->pkt_len were already set 3255 * earlier by the caller. 3256 */ 3257 qdisc_bstats_cpu_update(cl->q, skb); 3258 3259 switch (tc_classify(skb, cl, &cl_res, false)) { 3260 case TC_ACT_OK: 3261 case TC_ACT_RECLASSIFY: 3262 skb->tc_index = TC_H_MIN(cl_res.classid); 3263 break; 3264 case TC_ACT_SHOT: 3265 qdisc_qstats_cpu_drop(cl->q); 3266 *ret = NET_XMIT_DROP; 3267 kfree_skb(skb); 3268 return NULL; 3269 case TC_ACT_STOLEN: 3270 case TC_ACT_QUEUED: 3271 *ret = NET_XMIT_SUCCESS; 3272 consume_skb(skb); 3273 return NULL; 3274 case TC_ACT_REDIRECT: 3275 /* No need to push/pop skb's mac_header here on egress! 
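 * (Editorial note: contrast with sch_handle_ingress(), which must
 * __skb_push() the L2 header back before skb_do_redirect(); on the
 * egress path skb->data still points at the MAC header.)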
*/ 3276 skb_do_redirect(skb); 3277 *ret = NET_XMIT_SUCCESS; 3278 return NULL; 3279 default: 3280 break; 3281 } 3282 3283 return skb; 3284 } 3285 #endif /* CONFIG_NET_EGRESS */ 3286 3287 static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) 3288 { 3289 #ifdef CONFIG_XPS 3290 struct xps_dev_maps *dev_maps; 3291 struct xps_map *map; 3292 int queue_index = -1; 3293 3294 rcu_read_lock(); 3295 dev_maps = rcu_dereference(dev->xps_maps); 3296 if (dev_maps) { 3297 map = rcu_dereference( 3298 dev_maps->cpu_map[skb->sender_cpu - 1]); 3299 if (map) { 3300 if (map->len == 1) 3301 queue_index = map->queues[0]; 3302 else 3303 queue_index = map->queues[reciprocal_scale(skb_get_hash(skb), 3304 map->len)]; 3305 if (unlikely(queue_index >= dev->real_num_tx_queues)) 3306 queue_index = -1; 3307 } 3308 } 3309 rcu_read_unlock(); 3310 3311 return queue_index; 3312 #else 3313 return -1; 3314 #endif 3315 } 3316 3317 static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb) 3318 { 3319 struct sock *sk = skb->sk; 3320 int queue_index = sk_tx_queue_get(sk); 3321 3322 if (queue_index < 0 || skb->ooo_okay || 3323 queue_index >= dev->real_num_tx_queues) { 3324 int new_index = get_xps_queue(dev, skb); 3325 if (new_index < 0) 3326 new_index = skb_tx_hash(dev, skb); 3327 3328 if (queue_index != new_index && sk && 3329 sk_fullsock(sk) && 3330 rcu_access_pointer(sk->sk_dst_cache)) 3331 sk_tx_queue_set(sk, new_index); 3332 3333 queue_index = new_index; 3334 } 3335 3336 return queue_index; 3337 } 3338 3339 struct netdev_queue *netdev_pick_tx(struct net_device *dev, 3340 struct sk_buff *skb, 3341 void *accel_priv) 3342 { 3343 int queue_index = 0; 3344 3345 #ifdef CONFIG_XPS 3346 u32 sender_cpu = skb->sender_cpu - 1; 3347 3348 if (sender_cpu >= (u32)NR_CPUS) 3349 skb->sender_cpu = raw_smp_processor_id() + 1; 3350 #endif 3351 3352 if (dev->real_num_tx_queues != 1) { 3353 const struct net_device_ops *ops = dev->netdev_ops; 3354 if (ops->ndo_select_queue) 3355 queue_index = ops->ndo_select_queue(dev, skb, accel_priv, 3356 __netdev_pick_tx); 3357 else 3358 queue_index = __netdev_pick_tx(dev, skb); 3359 3360 if (!accel_priv) 3361 queue_index = netdev_cap_txqueue(dev, queue_index); 3362 } 3363 3364 skb_set_queue_mapping(skb, queue_index); 3365 return netdev_get_tx_queue(dev, queue_index); 3366 } 3367 3368 /** 3369 * __dev_queue_xmit - transmit a buffer 3370 * @skb: buffer to transmit 3371 * @accel_priv: private data used for L2 forwarding offload 3372 * 3373 * Queue a buffer for transmission to a network device. The caller must 3374 * have set the device and priority and built the buffer before calling 3375 * this function. The function can be called from an interrupt. 3376 * 3377 * A negative errno code is returned on a failure. A success does not 3378 * guarantee the frame will be transmitted as it may be dropped due 3379 * to congestion or traffic shaping. 3380 * 3381 * ----------------------------------------------------------------------------------- 3382 * I notice this method can also return errors from the queue disciplines, 3383 * including NET_XMIT_DROP, which is a positive value. So, errors can also 3384 * be positive. 3385 * 3386 * Regardless of the return value, the skb is consumed, so it is currently 3387 * difficult to retry a send to this method. (You can bump the ref count 3388 * before sending to hold a reference for retry if you are careful.) 3389 * 3390 * When calling this method, interrupts MUST be enabled. 
This is because 3391 * the BH enable code must have IRQs enabled so that it will not deadlock. 3392 * --BLG 3393 */ 3394 static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) 3395 { 3396 struct net_device *dev = skb->dev; 3397 struct netdev_queue *txq; 3398 struct Qdisc *q; 3399 int rc = -ENOMEM; 3400 3401 skb_reset_mac_header(skb); 3402 3403 if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP)) 3404 __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED); 3405 3406 /* Disable soft irqs for various locks below. Also 3407 * stops preemption for RCU. 3408 */ 3409 rcu_read_lock_bh(); 3410 3411 skb_update_prio(skb); 3412 3413 qdisc_pkt_len_init(skb); 3414 #ifdef CONFIG_NET_CLS_ACT 3415 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS); 3416 # ifdef CONFIG_NET_EGRESS 3417 if (static_key_false(&egress_needed)) { 3418 skb = sch_handle_egress(skb, &rc, dev); 3419 if (!skb) 3420 goto out; 3421 } 3422 # endif 3423 #endif 3424 /* If device/qdisc don't need skb->dst, release it right now while 3425 * its hot in this cpu cache. 3426 */ 3427 if (dev->priv_flags & IFF_XMIT_DST_RELEASE) 3428 skb_dst_drop(skb); 3429 else 3430 skb_dst_force(skb); 3431 3432 txq = netdev_pick_tx(dev, skb, accel_priv); 3433 q = rcu_dereference_bh(txq->qdisc); 3434 3435 trace_net_dev_queue(skb); 3436 if (q->enqueue) { 3437 rc = __dev_xmit_skb(skb, q, dev, txq); 3438 goto out; 3439 } 3440 3441 /* The device has no queue. Common case for software devices: 3442 loopback, all the sorts of tunnels... 3443 3444 Really, it is unlikely that netif_tx_lock protection is necessary 3445 here. (f.e. loopback and IP tunnels are clean ignoring statistics 3446 counters.) 3447 However, it is possible, that they rely on protection 3448 made by us here. 3449 3450 Check this and shot the lock. It is not prone from deadlocks. 3451 Either shot noqueue qdisc, it is even simpler 8) 3452 */ 3453 if (dev->flags & IFF_UP) { 3454 int cpu = smp_processor_id(); /* ok because BHs are off */ 3455 3456 if (txq->xmit_lock_owner != cpu) { 3457 if (unlikely(__this_cpu_read(xmit_recursion) > 3458 XMIT_RECURSION_LIMIT)) 3459 goto recursion_alert; 3460 3461 skb = validate_xmit_skb(skb, dev); 3462 if (!skb) 3463 goto out; 3464 3465 HARD_TX_LOCK(dev, txq, cpu); 3466 3467 if (!netif_xmit_stopped(txq)) { 3468 __this_cpu_inc(xmit_recursion); 3469 skb = dev_hard_start_xmit(skb, dev, txq, &rc); 3470 __this_cpu_dec(xmit_recursion); 3471 if (dev_xmit_complete(rc)) { 3472 HARD_TX_UNLOCK(dev, txq); 3473 goto out; 3474 } 3475 } 3476 HARD_TX_UNLOCK(dev, txq); 3477 net_crit_ratelimited("Virtual device %s asks to queue packet!\n", 3478 dev->name); 3479 } else { 3480 /* Recursion is detected! 
It is possible, 3481 * unfortunately 3482 */ 3483 recursion_alert: 3484 net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n", 3485 dev->name); 3486 } 3487 } 3488 3489 rc = -ENETDOWN; 3490 rcu_read_unlock_bh(); 3491 3492 atomic_long_inc(&dev->tx_dropped); 3493 kfree_skb_list(skb); 3494 return rc; 3495 out: 3496 rcu_read_unlock_bh(); 3497 return rc; 3498 } 3499 3500 int dev_queue_xmit(struct sk_buff *skb) 3501 { 3502 return __dev_queue_xmit(skb, NULL); 3503 } 3504 EXPORT_SYMBOL(dev_queue_xmit); 3505 3506 int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv) 3507 { 3508 return __dev_queue_xmit(skb, accel_priv); 3509 } 3510 EXPORT_SYMBOL(dev_queue_xmit_accel); 3511 3512 3513 /*======================================================================= 3514 Receiver routines 3515 =======================================================================*/ 3516 3517 int netdev_max_backlog __read_mostly = 1000; 3518 EXPORT_SYMBOL(netdev_max_backlog); 3519 3520 int netdev_tstamp_prequeue __read_mostly = 1; 3521 int netdev_budget __read_mostly = 300; 3522 int weight_p __read_mostly = 64; /* old backlog weight */ 3523 3524 /* Called with irq disabled */ 3525 static inline void ____napi_schedule(struct softnet_data *sd, 3526 struct napi_struct *napi) 3527 { 3528 list_add_tail(&napi->poll_list, &sd->poll_list); 3529 __raise_softirq_irqoff(NET_RX_SOFTIRQ); 3530 } 3531 3532 #ifdef CONFIG_RPS 3533 3534 /* One global table that all flow-based protocols share. */ 3535 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly; 3536 EXPORT_SYMBOL(rps_sock_flow_table); 3537 u32 rps_cpu_mask __read_mostly; 3538 EXPORT_SYMBOL(rps_cpu_mask); 3539 3540 struct static_key rps_needed __read_mostly; 3541 EXPORT_SYMBOL(rps_needed); 3542 3543 static struct rps_dev_flow * 3544 set_rps_cpu(struct net_device *dev, struct sk_buff *skb, 3545 struct rps_dev_flow *rflow, u16 next_cpu) 3546 { 3547 if (next_cpu < nr_cpu_ids) { 3548 #ifdef CONFIG_RFS_ACCEL 3549 struct netdev_rx_queue *rxqueue; 3550 struct rps_dev_flow_table *flow_table; 3551 struct rps_dev_flow *old_rflow; 3552 u32 flow_id; 3553 u16 rxq_index; 3554 int rc; 3555 3556 /* Should we steer this flow to a different hardware queue? */ 3557 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap || 3558 !(dev->features & NETIF_F_NTUPLE)) 3559 goto out; 3560 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu); 3561 if (rxq_index == skb_get_rx_queue(skb)) 3562 goto out; 3563 3564 rxqueue = dev->_rx + rxq_index; 3565 flow_table = rcu_dereference(rxqueue->rps_flow_table); 3566 if (!flow_table) 3567 goto out; 3568 flow_id = skb_get_hash(skb) & flow_table->mask; 3569 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb, 3570 rxq_index, flow_id); 3571 if (rc < 0) 3572 goto out; 3573 old_rflow = rflow; 3574 rflow = &flow_table->flows[flow_id]; 3575 rflow->filter = rc; 3576 if (old_rflow->filter == rflow->filter) 3577 old_rflow->filter = RPS_NO_FILTER; 3578 out: 3579 #endif 3580 rflow->last_qtail = 3581 per_cpu(softnet_data, next_cpu).input_queue_head; 3582 } 3583 3584 rflow->cpu = next_cpu; 3585 return rflow; 3586 } 3587 3588 /* 3589 * get_rps_cpu is called from netif_receive_skb and returns the target 3590 * CPU from the RPS map of the receiving queue for a given skb. 3591 * rcu_read_lock must be held on entry. 
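 * Returns the CPU to steer the packet to, or -1 when no RPS/RFS
 * steering applies, in which case the caller processes the skb on the
 * local CPU. (Editorial clarification, derived from the code below.)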
3592 */ 3593 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, 3594 struct rps_dev_flow **rflowp) 3595 { 3596 const struct rps_sock_flow_table *sock_flow_table; 3597 struct netdev_rx_queue *rxqueue = dev->_rx; 3598 struct rps_dev_flow_table *flow_table; 3599 struct rps_map *map; 3600 int cpu = -1; 3601 u32 tcpu; 3602 u32 hash; 3603 3604 if (skb_rx_queue_recorded(skb)) { 3605 u16 index = skb_get_rx_queue(skb); 3606 3607 if (unlikely(index >= dev->real_num_rx_queues)) { 3608 WARN_ONCE(dev->real_num_rx_queues > 1, 3609 "%s received packet on queue %u, but number " 3610 "of RX queues is %u\n", 3611 dev->name, index, dev->real_num_rx_queues); 3612 goto done; 3613 } 3614 rxqueue += index; 3615 } 3616 3617 /* Avoid computing hash if RFS/RPS is not active for this rxqueue */ 3618 3619 flow_table = rcu_dereference(rxqueue->rps_flow_table); 3620 map = rcu_dereference(rxqueue->rps_map); 3621 if (!flow_table && !map) 3622 goto done; 3623 3624 skb_reset_network_header(skb); 3625 hash = skb_get_hash(skb); 3626 if (!hash) 3627 goto done; 3628 3629 sock_flow_table = rcu_dereference(rps_sock_flow_table); 3630 if (flow_table && sock_flow_table) { 3631 struct rps_dev_flow *rflow; 3632 u32 next_cpu; 3633 u32 ident; 3634 3635 /* First check into global flow table if there is a match */ 3636 ident = sock_flow_table->ents[hash & sock_flow_table->mask]; 3637 if ((ident ^ hash) & ~rps_cpu_mask) 3638 goto try_rps; 3639 3640 next_cpu = ident & rps_cpu_mask; 3641 3642 /* OK, now we know there is a match, 3643 * we can look at the local (per receive queue) flow table 3644 */ 3645 rflow = &flow_table->flows[hash & flow_table->mask]; 3646 tcpu = rflow->cpu; 3647 3648 /* 3649 * If the desired CPU (where last recvmsg was done) is 3650 * different from current CPU (one in the rx-queue flow 3651 * table entry), switch if one of the following holds: 3652 * - Current CPU is unset (>= nr_cpu_ids). 3653 * - Current CPU is offline. 3654 * - The current CPU's queue tail has advanced beyond the 3655 * last packet that was enqueued using this table entry. 3656 * This guarantees that all previous packets for the flow 3657 * have been dequeued, thus preserving in order delivery. 3658 */ 3659 if (unlikely(tcpu != next_cpu) && 3660 (tcpu >= nr_cpu_ids || !cpu_online(tcpu) || 3661 ((int)(per_cpu(softnet_data, tcpu).input_queue_head - 3662 rflow->last_qtail)) >= 0)) { 3663 tcpu = next_cpu; 3664 rflow = set_rps_cpu(dev, skb, rflow, next_cpu); 3665 } 3666 3667 if (tcpu < nr_cpu_ids && cpu_online(tcpu)) { 3668 *rflowp = rflow; 3669 cpu = tcpu; 3670 goto done; 3671 } 3672 } 3673 3674 try_rps: 3675 3676 if (map) { 3677 tcpu = map->cpus[reciprocal_scale(hash, map->len)]; 3678 if (cpu_online(tcpu)) { 3679 cpu = tcpu; 3680 goto done; 3681 } 3682 } 3683 3684 done: 3685 return cpu; 3686 } 3687 3688 #ifdef CONFIG_RFS_ACCEL 3689 3690 /** 3691 * rps_may_expire_flow - check whether an RFS hardware filter may be removed 3692 * @dev: Device on which the filter was set 3693 * @rxq_index: RX queue index 3694 * @flow_id: Flow ID passed to ndo_rx_flow_steer() 3695 * @filter_id: Filter ID returned by ndo_rx_flow_steer() 3696 * 3697 * Drivers that implement ndo_rx_flow_steer() should periodically call 3698 * this function for each installed filter and remove the filters for 3699 * which it returns %true. 
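 *
 * Editorial illustration (not part of the original source): a periodic
 * cleanup routine in a driver might look like the sketch below; the
 * example_nic structure, its fields and example_remove_filter() are
 * hypothetical:
 *
 *	static void example_expire_filters(struct example_nic *nic)
 *	{
 *		unsigned int i;
 *
 *		for (i = 0; i < nic->filter_count; i++) {
 *			if (!nic->filter_installed[i])
 *				continue;
 *			if (rps_may_expire_flow(nic->netdev, nic->rxq_index[i],
 *						nic->flow_id[i], i))
 *				example_remove_filter(nic, i);
 *		}
 *	}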
3700 */ 3701 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, 3702 u32 flow_id, u16 filter_id) 3703 { 3704 struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index; 3705 struct rps_dev_flow_table *flow_table; 3706 struct rps_dev_flow *rflow; 3707 bool expire = true; 3708 unsigned int cpu; 3709 3710 rcu_read_lock(); 3711 flow_table = rcu_dereference(rxqueue->rps_flow_table); 3712 if (flow_table && flow_id <= flow_table->mask) { 3713 rflow = &flow_table->flows[flow_id]; 3714 cpu = ACCESS_ONCE(rflow->cpu); 3715 if (rflow->filter == filter_id && cpu < nr_cpu_ids && 3716 ((int)(per_cpu(softnet_data, cpu).input_queue_head - 3717 rflow->last_qtail) < 3718 (int)(10 * flow_table->mask))) 3719 expire = false; 3720 } 3721 rcu_read_unlock(); 3722 return expire; 3723 } 3724 EXPORT_SYMBOL(rps_may_expire_flow); 3725 3726 #endif /* CONFIG_RFS_ACCEL */ 3727 3728 /* Called from hardirq (IPI) context */ 3729 static void rps_trigger_softirq(void *data) 3730 { 3731 struct softnet_data *sd = data; 3732 3733 ____napi_schedule(sd, &sd->backlog); 3734 sd->received_rps++; 3735 } 3736 3737 #endif /* CONFIG_RPS */ 3738 3739 /* 3740 * Check if this softnet_data structure is another cpu one 3741 * If yes, queue it to our IPI list and return 1 3742 * If no, return 0 3743 */ 3744 static int rps_ipi_queued(struct softnet_data *sd) 3745 { 3746 #ifdef CONFIG_RPS 3747 struct softnet_data *mysd = this_cpu_ptr(&softnet_data); 3748 3749 if (sd != mysd) { 3750 sd->rps_ipi_next = mysd->rps_ipi_list; 3751 mysd->rps_ipi_list = sd; 3752 3753 __raise_softirq_irqoff(NET_RX_SOFTIRQ); 3754 return 1; 3755 } 3756 #endif /* CONFIG_RPS */ 3757 return 0; 3758 } 3759 3760 #ifdef CONFIG_NET_FLOW_LIMIT 3761 int netdev_flow_limit_table_len __read_mostly = (1 << 12); 3762 #endif 3763 3764 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen) 3765 { 3766 #ifdef CONFIG_NET_FLOW_LIMIT 3767 struct sd_flow_limit *fl; 3768 struct softnet_data *sd; 3769 unsigned int old_flow, new_flow; 3770 3771 if (qlen < (netdev_max_backlog >> 1)) 3772 return false; 3773 3774 sd = this_cpu_ptr(&softnet_data); 3775 3776 rcu_read_lock(); 3777 fl = rcu_dereference(sd->flow_limit); 3778 if (fl) { 3779 new_flow = skb_get_hash(skb) & (fl->num_buckets - 1); 3780 old_flow = fl->history[fl->history_head]; 3781 fl->history[fl->history_head] = new_flow; 3782 3783 fl->history_head++; 3784 fl->history_head &= FLOW_LIMIT_HISTORY - 1; 3785 3786 if (likely(fl->buckets[old_flow])) 3787 fl->buckets[old_flow]--; 3788 3789 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) { 3790 fl->count++; 3791 rcu_read_unlock(); 3792 return true; 3793 } 3794 } 3795 rcu_read_unlock(); 3796 #endif 3797 return false; 3798 } 3799 3800 /* 3801 * enqueue_to_backlog is called to queue an skb to a per CPU backlog 3802 * queue (may be a remote CPU queue). 
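 * Returns NET_RX_SUCCESS if the skb was queued (scheduling the backlog
 * NAPI, or queueing an IPI for a remote CPU, when the backlog was
 * empty), or NET_RX_DROP if the device is down, the backlog is full or
 * the flow limit triggered; on drop the skb is freed here. (Editorial
 * clarification, derived from the code below.)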
3803 */ 3804 static int enqueue_to_backlog(struct sk_buff *skb, int cpu, 3805 unsigned int *qtail) 3806 { 3807 struct softnet_data *sd; 3808 unsigned long flags; 3809 unsigned int qlen; 3810 3811 sd = &per_cpu(softnet_data, cpu); 3812 3813 local_irq_save(flags); 3814 3815 rps_lock(sd); 3816 if (!netif_running(skb->dev)) 3817 goto drop; 3818 qlen = skb_queue_len(&sd->input_pkt_queue); 3819 if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) { 3820 if (qlen) { 3821 enqueue: 3822 __skb_queue_tail(&sd->input_pkt_queue, skb); 3823 input_queue_tail_incr_save(sd, qtail); 3824 rps_unlock(sd); 3825 local_irq_restore(flags); 3826 return NET_RX_SUCCESS; 3827 } 3828 3829 /* Schedule NAPI for backlog device 3830 * We can use non atomic operation since we own the queue lock 3831 */ 3832 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) { 3833 if (!rps_ipi_queued(sd)) 3834 ____napi_schedule(sd, &sd->backlog); 3835 } 3836 goto enqueue; 3837 } 3838 3839 drop: 3840 sd->dropped++; 3841 rps_unlock(sd); 3842 3843 local_irq_restore(flags); 3844 3845 atomic_long_inc(&skb->dev->rx_dropped); 3846 kfree_skb(skb); 3847 return NET_RX_DROP; 3848 } 3849 3850 static int netif_rx_internal(struct sk_buff *skb) 3851 { 3852 int ret; 3853 3854 net_timestamp_check(netdev_tstamp_prequeue, skb); 3855 3856 trace_netif_rx(skb); 3857 #ifdef CONFIG_RPS 3858 if (static_key_false(&rps_needed)) { 3859 struct rps_dev_flow voidflow, *rflow = &voidflow; 3860 int cpu; 3861 3862 preempt_disable(); 3863 rcu_read_lock(); 3864 3865 cpu = get_rps_cpu(skb->dev, skb, &rflow); 3866 if (cpu < 0) 3867 cpu = smp_processor_id(); 3868 3869 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); 3870 3871 rcu_read_unlock(); 3872 preempt_enable(); 3873 } else 3874 #endif 3875 { 3876 unsigned int qtail; 3877 ret = enqueue_to_backlog(skb, get_cpu(), &qtail); 3878 put_cpu(); 3879 } 3880 return ret; 3881 } 3882 3883 /** 3884 * netif_rx - post buffer to the network code 3885 * @skb: buffer to post 3886 * 3887 * This function receives a packet from a device driver and queues it for 3888 * the upper (protocol) levels to process. It always succeeds. The buffer 3889 * may be dropped during processing for congestion control or by the 3890 * protocol layers. 
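 *
 * Editorial illustration (not part of the original source): the classic
 * non-NAPI receive path in a driver's interrupt handler; how the skb is
 * built from the hardware buffer is hypothetical here:
 *
 *	struct sk_buff *skb = example_build_rx_skb(dev, ring_entry);
 *
 *	skb->protocol = eth_type_trans(skb, dev);
 *	netif_rx(skb);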
3891 * 3892 * return values: 3893 * NET_RX_SUCCESS (no congestion) 3894 * NET_RX_DROP (packet was dropped) 3895 * 3896 */ 3897 3898 int netif_rx(struct sk_buff *skb) 3899 { 3900 trace_netif_rx_entry(skb); 3901 3902 return netif_rx_internal(skb); 3903 } 3904 EXPORT_SYMBOL(netif_rx); 3905 3906 int netif_rx_ni(struct sk_buff *skb) 3907 { 3908 int err; 3909 3910 trace_netif_rx_ni_entry(skb); 3911 3912 preempt_disable(); 3913 err = netif_rx_internal(skb); 3914 if (local_softirq_pending()) 3915 do_softirq(); 3916 preempt_enable(); 3917 3918 return err; 3919 } 3920 EXPORT_SYMBOL(netif_rx_ni); 3921 3922 static __latent_entropy void net_tx_action(struct softirq_action *h) 3923 { 3924 struct softnet_data *sd = this_cpu_ptr(&softnet_data); 3925 3926 if (sd->completion_queue) { 3927 struct sk_buff *clist; 3928 3929 local_irq_disable(); 3930 clist = sd->completion_queue; 3931 sd->completion_queue = NULL; 3932 local_irq_enable(); 3933 3934 while (clist) { 3935 struct sk_buff *skb = clist; 3936 clist = clist->next; 3937 3938 WARN_ON(atomic_read(&skb->users)); 3939 if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED)) 3940 trace_consume_skb(skb); 3941 else 3942 trace_kfree_skb(skb, net_tx_action); 3943 3944 if (skb->fclone != SKB_FCLONE_UNAVAILABLE) 3945 __kfree_skb(skb); 3946 else 3947 __kfree_skb_defer(skb); 3948 } 3949 3950 __kfree_skb_flush(); 3951 } 3952 3953 if (sd->output_queue) { 3954 struct Qdisc *head; 3955 3956 local_irq_disable(); 3957 head = sd->output_queue; 3958 sd->output_queue = NULL; 3959 sd->output_queue_tailp = &sd->output_queue; 3960 local_irq_enable(); 3961 3962 while (head) { 3963 struct Qdisc *q = head; 3964 spinlock_t *root_lock; 3965 3966 head = head->next_sched; 3967 3968 root_lock = qdisc_lock(q); 3969 spin_lock(root_lock); 3970 /* We need to make sure head->next_sched is read 3971 * before clearing __QDISC_STATE_SCHED 3972 */ 3973 smp_mb__before_atomic(); 3974 clear_bit(__QDISC_STATE_SCHED, &q->state); 3975 qdisc_run(q); 3976 spin_unlock(root_lock); 3977 } 3978 } 3979 } 3980 3981 #if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE) 3982 /* This hook is defined here for ATM LANE */ 3983 int (*br_fdb_test_addr_hook)(struct net_device *dev, 3984 unsigned char *addr) __read_mostly; 3985 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook); 3986 #endif 3987 3988 static inline struct sk_buff * 3989 sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, 3990 struct net_device *orig_dev) 3991 { 3992 #ifdef CONFIG_NET_CLS_ACT 3993 struct tcf_proto *cl = rcu_dereference_bh(skb->dev->ingress_cl_list); 3994 struct tcf_result cl_res; 3995 3996 /* If there's at least one ingress present somewhere (so 3997 * we get here via enabled static key), remaining devices 3998 * that are not configured with an ingress qdisc will bail 3999 * out here. 
4000 */ 4001 if (!cl) 4002 return skb; 4003 if (*pt_prev) { 4004 *ret = deliver_skb(skb, *pt_prev, orig_dev); 4005 *pt_prev = NULL; 4006 } 4007 4008 qdisc_skb_cb(skb)->pkt_len = skb->len; 4009 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS); 4010 qdisc_bstats_cpu_update(cl->q, skb); 4011 4012 switch (tc_classify(skb, cl, &cl_res, false)) { 4013 case TC_ACT_OK: 4014 case TC_ACT_RECLASSIFY: 4015 skb->tc_index = TC_H_MIN(cl_res.classid); 4016 break; 4017 case TC_ACT_SHOT: 4018 qdisc_qstats_cpu_drop(cl->q); 4019 kfree_skb(skb); 4020 return NULL; 4021 case TC_ACT_STOLEN: 4022 case TC_ACT_QUEUED: 4023 consume_skb(skb); 4024 return NULL; 4025 case TC_ACT_REDIRECT: 4026 /* skb_mac_header check was done by cls/act_bpf, so 4027 * we can safely push the L2 header back before 4028 * redirecting to another netdev 4029 */ 4030 __skb_push(skb, skb->mac_len); 4031 skb_do_redirect(skb); 4032 return NULL; 4033 default: 4034 break; 4035 } 4036 #endif /* CONFIG_NET_CLS_ACT */ 4037 return skb; 4038 } 4039 4040 /** 4041 * netdev_is_rx_handler_busy - check if receive handler is registered 4042 * @dev: device to check 4043 * 4044 * Check if a receive handler is already registered for a given device. 4045 * Return true if there one. 4046 * 4047 * The caller must hold the rtnl_mutex. 4048 */ 4049 bool netdev_is_rx_handler_busy(struct net_device *dev) 4050 { 4051 ASSERT_RTNL(); 4052 return dev && rtnl_dereference(dev->rx_handler); 4053 } 4054 EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy); 4055 4056 /** 4057 * netdev_rx_handler_register - register receive handler 4058 * @dev: device to register a handler for 4059 * @rx_handler: receive handler to register 4060 * @rx_handler_data: data pointer that is used by rx handler 4061 * 4062 * Register a receive handler for a device. This handler will then be 4063 * called from __netif_receive_skb. A negative errno code is returned 4064 * on a failure. 4065 * 4066 * The caller must hold the rtnl_mutex. 4067 * 4068 * For a general description of rx_handler, see enum rx_handler_result. 4069 */ 4070 int netdev_rx_handler_register(struct net_device *dev, 4071 rx_handler_func_t *rx_handler, 4072 void *rx_handler_data) 4073 { 4074 ASSERT_RTNL(); 4075 4076 if (dev->rx_handler) 4077 return -EBUSY; 4078 4079 /* Note: rx_handler_data must be set before rx_handler */ 4080 rcu_assign_pointer(dev->rx_handler_data, rx_handler_data); 4081 rcu_assign_pointer(dev->rx_handler, rx_handler); 4082 4083 return 0; 4084 } 4085 EXPORT_SYMBOL_GPL(netdev_rx_handler_register); 4086 4087 /** 4088 * netdev_rx_handler_unregister - unregister receive handler 4089 * @dev: device to unregister a handler from 4090 * 4091 * Unregister a receive handler from a device. 4092 * 4093 * The caller must hold the rtnl_mutex. 4094 */ 4095 void netdev_rx_handler_unregister(struct net_device *dev) 4096 { 4097 4098 ASSERT_RTNL(); 4099 RCU_INIT_POINTER(dev->rx_handler, NULL); 4100 /* a reader seeing a non NULL rx_handler in a rcu_read_lock() 4101 * section has a guarantee to see a non NULL rx_handler_data 4102 * as well. 4103 */ 4104 synchronize_net(); 4105 RCU_INIT_POINTER(dev->rx_handler_data, NULL); 4106 } 4107 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister); 4108 4109 /* 4110 * Limit the use of PFMEMALLOC reserves to those protocols that implement 4111 * the special handling of PFMEMALLOC skbs. 
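 * (In practice this means ARP, IPv4, IPv6 and their 802.1Q/802.1AD
 * VLAN-tagged forms, as enumerated in the switch statement below.)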
4112 */ 4113 static bool skb_pfmemalloc_protocol(struct sk_buff *skb) 4114 { 4115 switch (skb->protocol) { 4116 case htons(ETH_P_ARP): 4117 case htons(ETH_P_IP): 4118 case htons(ETH_P_IPV6): 4119 case htons(ETH_P_8021Q): 4120 case htons(ETH_P_8021AD): 4121 return true; 4122 default: 4123 return false; 4124 } 4125 } 4126 4127 static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev, 4128 int *ret, struct net_device *orig_dev) 4129 { 4130 #ifdef CONFIG_NETFILTER_INGRESS 4131 if (nf_hook_ingress_active(skb)) { 4132 int ingress_retval; 4133 4134 if (*pt_prev) { 4135 *ret = deliver_skb(skb, *pt_prev, orig_dev); 4136 *pt_prev = NULL; 4137 } 4138 4139 rcu_read_lock(); 4140 ingress_retval = nf_hook_ingress(skb); 4141 rcu_read_unlock(); 4142 return ingress_retval; 4143 } 4144 #endif /* CONFIG_NETFILTER_INGRESS */ 4145 return 0; 4146 } 4147 4148 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc) 4149 { 4150 struct packet_type *ptype, *pt_prev; 4151 rx_handler_func_t *rx_handler; 4152 struct net_device *orig_dev; 4153 bool deliver_exact = false; 4154 int ret = NET_RX_DROP; 4155 __be16 type; 4156 4157 net_timestamp_check(!netdev_tstamp_prequeue, skb); 4158 4159 trace_netif_receive_skb(skb); 4160 4161 orig_dev = skb->dev; 4162 4163 skb_reset_network_header(skb); 4164 if (!skb_transport_header_was_set(skb)) 4165 skb_reset_transport_header(skb); 4166 skb_reset_mac_len(skb); 4167 4168 pt_prev = NULL; 4169 4170 another_round: 4171 skb->skb_iif = skb->dev->ifindex; 4172 4173 __this_cpu_inc(softnet_data.processed); 4174 4175 if (skb->protocol == cpu_to_be16(ETH_P_8021Q) || 4176 skb->protocol == cpu_to_be16(ETH_P_8021AD)) { 4177 skb = skb_vlan_untag(skb); 4178 if (unlikely(!skb)) 4179 goto out; 4180 } 4181 4182 #ifdef CONFIG_NET_CLS_ACT 4183 if (skb->tc_verd & TC_NCLS) { 4184 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd); 4185 goto ncls; 4186 } 4187 #endif 4188 4189 if (pfmemalloc) 4190 goto skip_taps; 4191 4192 list_for_each_entry_rcu(ptype, &ptype_all, list) { 4193 if (pt_prev) 4194 ret = deliver_skb(skb, pt_prev, orig_dev); 4195 pt_prev = ptype; 4196 } 4197 4198 list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) { 4199 if (pt_prev) 4200 ret = deliver_skb(skb, pt_prev, orig_dev); 4201 pt_prev = ptype; 4202 } 4203 4204 skip_taps: 4205 #ifdef CONFIG_NET_INGRESS 4206 if (static_key_false(&ingress_needed)) { 4207 skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev); 4208 if (!skb) 4209 goto out; 4210 4211 if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0) 4212 goto out; 4213 } 4214 #endif 4215 #ifdef CONFIG_NET_CLS_ACT 4216 skb->tc_verd = 0; 4217 ncls: 4218 #endif 4219 if (pfmemalloc && !skb_pfmemalloc_protocol(skb)) 4220 goto drop; 4221 4222 if (skb_vlan_tag_present(skb)) { 4223 if (pt_prev) { 4224 ret = deliver_skb(skb, pt_prev, orig_dev); 4225 pt_prev = NULL; 4226 } 4227 if (vlan_do_receive(&skb)) 4228 goto another_round; 4229 else if (unlikely(!skb)) 4230 goto out; 4231 } 4232 4233 rx_handler = rcu_dereference(skb->dev->rx_handler); 4234 if (rx_handler) { 4235 if (pt_prev) { 4236 ret = deliver_skb(skb, pt_prev, orig_dev); 4237 pt_prev = NULL; 4238 } 4239 switch (rx_handler(&skb)) { 4240 case RX_HANDLER_CONSUMED: 4241 ret = NET_RX_SUCCESS; 4242 goto out; 4243 case RX_HANDLER_ANOTHER: 4244 goto another_round; 4245 case RX_HANDLER_EXACT: 4246 deliver_exact = true; 4247 case RX_HANDLER_PASS: 4248 break; 4249 default: 4250 BUG(); 4251 } 4252 } 4253 4254 if (unlikely(skb_vlan_tag_present(skb))) { 4255 if (skb_vlan_tag_get_id(skb)) 4256 skb->pkt_type = 
PACKET_OTHERHOST; 4257 /* Note: we might in the future use prio bits 4258 * and set skb->priority like in vlan_do_receive() 4259 * For the time being, just ignore Priority Code Point 4260 */ 4261 skb->vlan_tci = 0; 4262 } 4263 4264 type = skb->protocol; 4265 4266 /* deliver only exact match when indicated */ 4267 if (likely(!deliver_exact)) { 4268 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, 4269 &ptype_base[ntohs(type) & 4270 PTYPE_HASH_MASK]); 4271 } 4272 4273 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, 4274 &orig_dev->ptype_specific); 4275 4276 if (unlikely(skb->dev != orig_dev)) { 4277 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, 4278 &skb->dev->ptype_specific); 4279 } 4280 4281 if (pt_prev) { 4282 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC))) 4283 goto drop; 4284 else 4285 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); 4286 } else { 4287 drop: 4288 if (!deliver_exact) 4289 atomic_long_inc(&skb->dev->rx_dropped); 4290 else 4291 atomic_long_inc(&skb->dev->rx_nohandler); 4292 kfree_skb(skb); 4293 /* Jamal, now you will not able to escape explaining 4294 * me how you were going to use this. :-) 4295 */ 4296 ret = NET_RX_DROP; 4297 } 4298 4299 out: 4300 return ret; 4301 } 4302 4303 static int __netif_receive_skb(struct sk_buff *skb) 4304 { 4305 int ret; 4306 4307 if (sk_memalloc_socks() && skb_pfmemalloc(skb)) { 4308 unsigned long pflags = current->flags; 4309 4310 /* 4311 * PFMEMALLOC skbs are special, they should 4312 * - be delivered to SOCK_MEMALLOC sockets only 4313 * - stay away from userspace 4314 * - have bounded memory usage 4315 * 4316 * Use PF_MEMALLOC as this saves us from propagating the allocation 4317 * context down to all allocation sites. 4318 */ 4319 current->flags |= PF_MEMALLOC; 4320 ret = __netif_receive_skb_core(skb, true); 4321 tsk_restore_flags(current, pflags, PF_MEMALLOC); 4322 } else 4323 ret = __netif_receive_skb_core(skb, false); 4324 4325 return ret; 4326 } 4327 4328 static int netif_receive_skb_internal(struct sk_buff *skb) 4329 { 4330 int ret; 4331 4332 net_timestamp_check(netdev_tstamp_prequeue, skb); 4333 4334 if (skb_defer_rx_timestamp(skb)) 4335 return NET_RX_SUCCESS; 4336 4337 rcu_read_lock(); 4338 4339 #ifdef CONFIG_RPS 4340 if (static_key_false(&rps_needed)) { 4341 struct rps_dev_flow voidflow, *rflow = &voidflow; 4342 int cpu = get_rps_cpu(skb->dev, skb, &rflow); 4343 4344 if (cpu >= 0) { 4345 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); 4346 rcu_read_unlock(); 4347 return ret; 4348 } 4349 } 4350 #endif 4351 ret = __netif_receive_skb(skb); 4352 rcu_read_unlock(); 4353 return ret; 4354 } 4355 4356 /** 4357 * netif_receive_skb - process receive buffer from network 4358 * @skb: buffer to process 4359 * 4360 * netif_receive_skb() is the main receive data processing function. 4361 * It always succeeds. The buffer may be dropped during processing 4362 * for congestion control or by the protocol layers. 4363 * 4364 * This function may only be called from softirq context and interrupts 4365 * should be enabled. 
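 *
 * Illustrative sketch (not taken from this file; skb and netdev are the
 * driver's own variables): a NAPI driver's poll routine would typically
 * submit each completed frame with:
 *
 *	skb->protocol = eth_type_trans(skb, netdev);
 *	netif_receive_skb(skb);
 *
 * though most NAPI drivers call napi_gro_receive() instead so that GRO
 * can coalesce packets first.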
4366 * 4367 * Return values (usually ignored): 4368 * NET_RX_SUCCESS: no congestion 4369 * NET_RX_DROP: packet was dropped 4370 */ 4371 int netif_receive_skb(struct sk_buff *skb) 4372 { 4373 trace_netif_receive_skb_entry(skb); 4374 4375 return netif_receive_skb_internal(skb); 4376 } 4377 EXPORT_SYMBOL(netif_receive_skb); 4378 4379 DEFINE_PER_CPU(struct work_struct, flush_works); 4380 4381 /* Network device is going away, flush any packets still pending */ 4382 static void flush_backlog(struct work_struct *work) 4383 { 4384 struct sk_buff *skb, *tmp; 4385 struct softnet_data *sd; 4386 4387 local_bh_disable(); 4388 sd = this_cpu_ptr(&softnet_data); 4389 4390 local_irq_disable(); 4391 rps_lock(sd); 4392 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) { 4393 if (skb->dev->reg_state == NETREG_UNREGISTERING) { 4394 __skb_unlink(skb, &sd->input_pkt_queue); 4395 dev_kfree_skb_irq(skb); 4396 input_queue_head_incr(sd); 4397 } 4398 } 4399 rps_unlock(sd); 4400 local_irq_enable(); 4401 4402 skb_queue_walk_safe(&sd->process_queue, skb, tmp) { 4403 if (skb->dev->reg_state == NETREG_UNREGISTERING) { 4404 __skb_unlink(skb, &sd->process_queue); 4405 kfree_skb(skb); 4406 input_queue_head_incr(sd); 4407 } 4408 } 4409 local_bh_enable(); 4410 } 4411 4412 static void flush_all_backlogs(void) 4413 { 4414 unsigned int cpu; 4415 4416 get_online_cpus(); 4417 4418 for_each_online_cpu(cpu) 4419 queue_work_on(cpu, system_highpri_wq, 4420 per_cpu_ptr(&flush_works, cpu)); 4421 4422 for_each_online_cpu(cpu) 4423 flush_work(per_cpu_ptr(&flush_works, cpu)); 4424 4425 put_online_cpus(); 4426 } 4427 4428 static int napi_gro_complete(struct sk_buff *skb) 4429 { 4430 struct packet_offload *ptype; 4431 __be16 type = skb->protocol; 4432 struct list_head *head = &offload_base; 4433 int err = -ENOENT; 4434 4435 BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb)); 4436 4437 if (NAPI_GRO_CB(skb)->count == 1) { 4438 skb_shinfo(skb)->gso_size = 0; 4439 goto out; 4440 } 4441 4442 rcu_read_lock(); 4443 list_for_each_entry_rcu(ptype, head, list) { 4444 if (ptype->type != type || !ptype->callbacks.gro_complete) 4445 continue; 4446 4447 err = ptype->callbacks.gro_complete(skb, 0); 4448 break; 4449 } 4450 rcu_read_unlock(); 4451 4452 if (err) { 4453 WARN_ON(&ptype->list == head); 4454 kfree_skb(skb); 4455 return NET_RX_SUCCESS; 4456 } 4457 4458 out: 4459 return netif_receive_skb_internal(skb); 4460 } 4461 4462 /* napi->gro_list contains packets ordered by age. 4463 * youngest packets at the head of it. 4464 * Complete skbs in reverse order to reduce latencies. 
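 * (So the oldest packet sits at the tail; with flush_old the walk stops
 * once it reaches a packet that was queued during the current jiffy.)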
4465 */ 4466 void napi_gro_flush(struct napi_struct *napi, bool flush_old) 4467 { 4468 struct sk_buff *skb, *prev = NULL; 4469 4470 /* scan list and build reverse chain */ 4471 for (skb = napi->gro_list; skb != NULL; skb = skb->next) { 4472 skb->prev = prev; 4473 prev = skb; 4474 } 4475 4476 for (skb = prev; skb; skb = prev) { 4477 skb->next = NULL; 4478 4479 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies) 4480 return; 4481 4482 prev = skb->prev; 4483 napi_gro_complete(skb); 4484 napi->gro_count--; 4485 } 4486 4487 napi->gro_list = NULL; 4488 } 4489 EXPORT_SYMBOL(napi_gro_flush); 4490 4491 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb) 4492 { 4493 struct sk_buff *p; 4494 unsigned int maclen = skb->dev->hard_header_len; 4495 u32 hash = skb_get_hash_raw(skb); 4496 4497 for (p = napi->gro_list; p; p = p->next) { 4498 unsigned long diffs; 4499 4500 NAPI_GRO_CB(p)->flush = 0; 4501 4502 if (hash != skb_get_hash_raw(p)) { 4503 NAPI_GRO_CB(p)->same_flow = 0; 4504 continue; 4505 } 4506 4507 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; 4508 diffs |= p->vlan_tci ^ skb->vlan_tci; 4509 diffs |= skb_metadata_dst_cmp(p, skb); 4510 if (maclen == ETH_HLEN) 4511 diffs |= compare_ether_header(skb_mac_header(p), 4512 skb_mac_header(skb)); 4513 else if (!diffs) 4514 diffs = memcmp(skb_mac_header(p), 4515 skb_mac_header(skb), 4516 maclen); 4517 NAPI_GRO_CB(p)->same_flow = !diffs; 4518 } 4519 } 4520 4521 static void skb_gro_reset_offset(struct sk_buff *skb) 4522 { 4523 const struct skb_shared_info *pinfo = skb_shinfo(skb); 4524 const skb_frag_t *frag0 = &pinfo->frags[0]; 4525 4526 NAPI_GRO_CB(skb)->data_offset = 0; 4527 NAPI_GRO_CB(skb)->frag0 = NULL; 4528 NAPI_GRO_CB(skb)->frag0_len = 0; 4529 4530 if (skb_mac_header(skb) == skb_tail_pointer(skb) && 4531 pinfo->nr_frags && 4532 !PageHighMem(skb_frag_page(frag0))) { 4533 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0); 4534 NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int, 4535 skb_frag_size(frag0), 4536 skb->end - skb->tail); 4537 } 4538 } 4539 4540 static void gro_pull_from_frag0(struct sk_buff *skb, int grow) 4541 { 4542 struct skb_shared_info *pinfo = skb_shinfo(skb); 4543 4544 BUG_ON(skb->end - skb->tail < grow); 4545 4546 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow); 4547 4548 skb->data_len -= grow; 4549 skb->tail += grow; 4550 4551 pinfo->frags[0].page_offset += grow; 4552 skb_frag_size_sub(&pinfo->frags[0], grow); 4553 4554 if (unlikely(!skb_frag_size(&pinfo->frags[0]))) { 4555 skb_frag_unref(skb, 0); 4556 memmove(pinfo->frags, pinfo->frags + 1, 4557 --pinfo->nr_frags * sizeof(pinfo->frags[0])); 4558 } 4559 } 4560 4561 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) 4562 { 4563 struct sk_buff **pp = NULL; 4564 struct packet_offload *ptype; 4565 __be16 type = skb->protocol; 4566 struct list_head *head = &offload_base; 4567 int same_flow; 4568 enum gro_result ret; 4569 int grow; 4570 4571 if (!(skb->dev->features & NETIF_F_GRO)) 4572 goto normal; 4573 4574 if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad) 4575 goto normal; 4576 4577 gro_list_prepare(napi, skb); 4578 4579 rcu_read_lock(); 4580 list_for_each_entry_rcu(ptype, head, list) { 4581 if (ptype->type != type || !ptype->callbacks.gro_receive) 4582 continue; 4583 4584 skb_set_network_header(skb, skb_gro_offset(skb)); 4585 skb_reset_mac_len(skb); 4586 NAPI_GRO_CB(skb)->same_flow = 0; 4587 NAPI_GRO_CB(skb)->flush = 0; 4588 NAPI_GRO_CB(skb)->free = 0; 4589 NAPI_GRO_CB(skb)->encap_mark = 0; 
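		/* Per-packet GRO state lives in skb->cb (NAPI_GRO_CB); the
		 * remaining fields are reset here before the matching
		 * protocol's ->gro_receive() callback is invoked below.
		 */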
4590 NAPI_GRO_CB(skb)->recursion_counter = 0; 4591 NAPI_GRO_CB(skb)->is_fou = 0; 4592 NAPI_GRO_CB(skb)->is_atomic = 1; 4593 NAPI_GRO_CB(skb)->gro_remcsum_start = 0; 4594 4595 /* Setup for GRO checksum validation */ 4596 switch (skb->ip_summed) { 4597 case CHECKSUM_COMPLETE: 4598 NAPI_GRO_CB(skb)->csum = skb->csum; 4599 NAPI_GRO_CB(skb)->csum_valid = 1; 4600 NAPI_GRO_CB(skb)->csum_cnt = 0; 4601 break; 4602 case CHECKSUM_UNNECESSARY: 4603 NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1; 4604 NAPI_GRO_CB(skb)->csum_valid = 0; 4605 break; 4606 default: 4607 NAPI_GRO_CB(skb)->csum_cnt = 0; 4608 NAPI_GRO_CB(skb)->csum_valid = 0; 4609 } 4610 4611 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb); 4612 break; 4613 } 4614 rcu_read_unlock(); 4615 4616 if (&ptype->list == head) 4617 goto normal; 4618 4619 same_flow = NAPI_GRO_CB(skb)->same_flow; 4620 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED; 4621 4622 if (pp) { 4623 struct sk_buff *nskb = *pp; 4624 4625 *pp = nskb->next; 4626 nskb->next = NULL; 4627 napi_gro_complete(nskb); 4628 napi->gro_count--; 4629 } 4630 4631 if (same_flow) 4632 goto ok; 4633 4634 if (NAPI_GRO_CB(skb)->flush) 4635 goto normal; 4636 4637 if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) { 4638 struct sk_buff *nskb = napi->gro_list; 4639 4640 /* locate the end of the list to select the 'oldest' flow */ 4641 while (nskb->next) { 4642 pp = &nskb->next; 4643 nskb = *pp; 4644 } 4645 *pp = NULL; 4646 nskb->next = NULL; 4647 napi_gro_complete(nskb); 4648 } else { 4649 napi->gro_count++; 4650 } 4651 NAPI_GRO_CB(skb)->count = 1; 4652 NAPI_GRO_CB(skb)->age = jiffies; 4653 NAPI_GRO_CB(skb)->last = skb; 4654 skb_shinfo(skb)->gso_size = skb_gro_len(skb); 4655 skb->next = napi->gro_list; 4656 napi->gro_list = skb; 4657 ret = GRO_HELD; 4658 4659 pull: 4660 grow = skb_gro_offset(skb) - skb_headlen(skb); 4661 if (grow > 0) 4662 gro_pull_from_frag0(skb, grow); 4663 ok: 4664 return ret; 4665 4666 normal: 4667 ret = GRO_NORMAL; 4668 goto pull; 4669 } 4670 4671 struct packet_offload *gro_find_receive_by_type(__be16 type) 4672 { 4673 struct list_head *offload_head = &offload_base; 4674 struct packet_offload *ptype; 4675 4676 list_for_each_entry_rcu(ptype, offload_head, list) { 4677 if (ptype->type != type || !ptype->callbacks.gro_receive) 4678 continue; 4679 return ptype; 4680 } 4681 return NULL; 4682 } 4683 EXPORT_SYMBOL(gro_find_receive_by_type); 4684 4685 struct packet_offload *gro_find_complete_by_type(__be16 type) 4686 { 4687 struct list_head *offload_head = &offload_base; 4688 struct packet_offload *ptype; 4689 4690 list_for_each_entry_rcu(ptype, offload_head, list) { 4691 if (ptype->type != type || !ptype->callbacks.gro_complete) 4692 continue; 4693 return ptype; 4694 } 4695 return NULL; 4696 } 4697 EXPORT_SYMBOL(gro_find_complete_by_type); 4698 4699 static void napi_skb_free_stolen_head(struct sk_buff *skb) 4700 { 4701 skb_dst_drop(skb); 4702 kmem_cache_free(skbuff_head_cache, skb); 4703 } 4704 4705 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb) 4706 { 4707 switch (ret) { 4708 case GRO_NORMAL: 4709 if (netif_receive_skb_internal(skb)) 4710 ret = GRO_DROP; 4711 break; 4712 4713 case GRO_DROP: 4714 kfree_skb(skb); 4715 break; 4716 4717 case GRO_MERGED_FREE: 4718 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) 4719 napi_skb_free_stolen_head(skb); 4720 else 4721 __kfree_skb(skb); 4722 break; 4723 4724 case GRO_HELD: 4725 case GRO_MERGED: 4726 break; 4727 } 4728 4729 return ret; 4730 } 4731 4732 gro_result_t napi_gro_receive(struct 
napi_struct *napi, struct sk_buff *skb) 4733 { 4734 skb_mark_napi_id(skb, napi); 4735 trace_napi_gro_receive_entry(skb); 4736 4737 skb_gro_reset_offset(skb); 4738 4739 return napi_skb_finish(dev_gro_receive(napi, skb), skb); 4740 } 4741 EXPORT_SYMBOL(napi_gro_receive); 4742 4743 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) 4744 { 4745 if (unlikely(skb->pfmemalloc)) { 4746 consume_skb(skb); 4747 return; 4748 } 4749 __skb_pull(skb, skb_headlen(skb)); 4750 /* restore the reserve we had after netdev_alloc_skb_ip_align() */ 4751 skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb)); 4752 skb->vlan_tci = 0; 4753 skb->dev = napi->dev; 4754 skb->skb_iif = 0; 4755 4756 /* eth_type_trans() assumes pkt_type is PACKET_HOST */ 4757 skb->pkt_type = PACKET_HOST; 4758 4759 skb->encapsulation = 0; 4760 skb_shinfo(skb)->gso_type = 0; 4761 skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); 4762 4763 napi->skb = skb; 4764 } 4765 4766 struct sk_buff *napi_get_frags(struct napi_struct *napi) 4767 { 4768 struct sk_buff *skb = napi->skb; 4769 4770 if (!skb) { 4771 skb = napi_alloc_skb(napi, GRO_MAX_HEAD); 4772 if (skb) { 4773 napi->skb = skb; 4774 skb_mark_napi_id(skb, napi); 4775 } 4776 } 4777 return skb; 4778 } 4779 EXPORT_SYMBOL(napi_get_frags); 4780 4781 static gro_result_t napi_frags_finish(struct napi_struct *napi, 4782 struct sk_buff *skb, 4783 gro_result_t ret) 4784 { 4785 switch (ret) { 4786 case GRO_NORMAL: 4787 case GRO_HELD: 4788 __skb_push(skb, ETH_HLEN); 4789 skb->protocol = eth_type_trans(skb, skb->dev); 4790 if (ret == GRO_NORMAL && netif_receive_skb_internal(skb)) 4791 ret = GRO_DROP; 4792 break; 4793 4794 case GRO_DROP: 4795 napi_reuse_skb(napi, skb); 4796 break; 4797 4798 case GRO_MERGED_FREE: 4799 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) 4800 napi_skb_free_stolen_head(skb); 4801 else 4802 napi_reuse_skb(napi, skb); 4803 break; 4804 4805 case GRO_MERGED: 4806 break; 4807 } 4808 4809 return ret; 4810 } 4811 4812 /* Upper GRO stack assumes network header starts at gro_offset=0 4813 * Drivers could call both napi_gro_frags() and napi_gro_receive() 4814 * We copy ethernet header into skb->data to have a common layout. 4815 */ 4816 static struct sk_buff *napi_frags_skb(struct napi_struct *napi) 4817 { 4818 struct sk_buff *skb = napi->skb; 4819 const struct ethhdr *eth; 4820 unsigned int hlen = sizeof(*eth); 4821 4822 napi->skb = NULL; 4823 4824 skb_reset_mac_header(skb); 4825 skb_gro_reset_offset(skb); 4826 4827 if (unlikely(skb_gro_header_hard(skb, hlen))) { 4828 eth = skb_gro_header_slow(skb, hlen, 0); 4829 if (unlikely(!eth)) { 4830 net_warn_ratelimited("%s: dropping impossible skb from %s\n", 4831 __func__, napi->dev->name); 4832 napi_reuse_skb(napi, skb); 4833 return NULL; 4834 } 4835 } else { 4836 eth = (const struct ethhdr *)skb->data; 4837 gro_pull_from_frag0(skb, hlen); 4838 NAPI_GRO_CB(skb)->frag0 += hlen; 4839 NAPI_GRO_CB(skb)->frag0_len -= hlen; 4840 } 4841 __skb_pull(skb, hlen); 4842 4843 /* 4844 * This works because the only protocols we care about don't require 4845 * special handling. 
4846 * We'll fix it up properly in napi_frags_finish() 4847 */ 4848 skb->protocol = eth->h_proto; 4849 4850 return skb; 4851 } 4852 4853 gro_result_t napi_gro_frags(struct napi_struct *napi) 4854 { 4855 struct sk_buff *skb = napi_frags_skb(napi); 4856 4857 if (!skb) 4858 return GRO_DROP; 4859 4860 trace_napi_gro_frags_entry(skb); 4861 4862 return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb)); 4863 } 4864 EXPORT_SYMBOL(napi_gro_frags); 4865 4866 /* Compute the checksum from gro_offset and return the folded value 4867 * after adding in any pseudo checksum. 4868 */ 4869 __sum16 __skb_gro_checksum_complete(struct sk_buff *skb) 4870 { 4871 __wsum wsum; 4872 __sum16 sum; 4873 4874 wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0); 4875 4876 /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */ 4877 sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum)); 4878 if (likely(!sum)) { 4879 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && 4880 !skb->csum_complete_sw) 4881 netdev_rx_csum_fault(skb->dev); 4882 } 4883 4884 NAPI_GRO_CB(skb)->csum = wsum; 4885 NAPI_GRO_CB(skb)->csum_valid = 1; 4886 4887 return sum; 4888 } 4889 EXPORT_SYMBOL(__skb_gro_checksum_complete); 4890 4891 /* 4892 * net_rps_action_and_irq_enable sends any pending IPI's for rps. 4893 * Note: called with local irq disabled, but exits with local irq enabled. 4894 */ 4895 static void net_rps_action_and_irq_enable(struct softnet_data *sd) 4896 { 4897 #ifdef CONFIG_RPS 4898 struct softnet_data *remsd = sd->rps_ipi_list; 4899 4900 if (remsd) { 4901 sd->rps_ipi_list = NULL; 4902 4903 local_irq_enable(); 4904 4905 /* Send pending IPI's to kick RPS processing on remote cpus. */ 4906 while (remsd) { 4907 struct softnet_data *next = remsd->rps_ipi_next; 4908 4909 if (cpu_online(remsd->cpu)) 4910 smp_call_function_single_async(remsd->cpu, 4911 &remsd->csd); 4912 remsd = next; 4913 } 4914 } else 4915 #endif 4916 local_irq_enable(); 4917 } 4918 4919 static bool sd_has_rps_ipi_waiting(struct softnet_data *sd) 4920 { 4921 #ifdef CONFIG_RPS 4922 return sd->rps_ipi_list != NULL; 4923 #else 4924 return false; 4925 #endif 4926 } 4927 4928 static int process_backlog(struct napi_struct *napi, int quota) 4929 { 4930 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog); 4931 bool again = true; 4932 int work = 0; 4933 4934 /* Check if we have pending ipi, its better to send them now, 4935 * not waiting net_rx_action() end. 4936 */ 4937 if (sd_has_rps_ipi_waiting(sd)) { 4938 local_irq_disable(); 4939 net_rps_action_and_irq_enable(sd); 4940 } 4941 4942 napi->weight = weight_p; 4943 while (again) { 4944 struct sk_buff *skb; 4945 4946 while ((skb = __skb_dequeue(&sd->process_queue))) { 4947 rcu_read_lock(); 4948 __netif_receive_skb(skb); 4949 rcu_read_unlock(); 4950 input_queue_head_incr(sd); 4951 if (++work >= quota) 4952 return work; 4953 4954 } 4955 4956 local_irq_disable(); 4957 rps_lock(sd); 4958 if (skb_queue_empty(&sd->input_pkt_queue)) { 4959 /* 4960 * Inline a custom version of __napi_complete(). 4961 * only current cpu owns and manipulates this napi, 4962 * and NAPI_STATE_SCHED is the only possible flag set 4963 * on backlog. 4964 * We can use a plain write instead of clear_bit(), 4965 * and we dont need an smp_mb() memory barrier. 
4966 */ 4967 napi->state = 0; 4968 again = false; 4969 } else { 4970 skb_queue_splice_tail_init(&sd->input_pkt_queue, 4971 &sd->process_queue); 4972 } 4973 rps_unlock(sd); 4974 local_irq_enable(); 4975 } 4976 4977 return work; 4978 } 4979 4980 /** 4981 * __napi_schedule - schedule for receive 4982 * @n: entry to schedule 4983 * 4984 * The entry's receive function will be scheduled to run. 4985 * Consider using __napi_schedule_irqoff() if hard irqs are masked. 4986 */ 4987 void __napi_schedule(struct napi_struct *n) 4988 { 4989 unsigned long flags; 4990 4991 local_irq_save(flags); 4992 ____napi_schedule(this_cpu_ptr(&softnet_data), n); 4993 local_irq_restore(flags); 4994 } 4995 EXPORT_SYMBOL(__napi_schedule); 4996 4997 /** 4998 * __napi_schedule_irqoff - schedule for receive 4999 * @n: entry to schedule 5000 * 5001 * Variant of __napi_schedule() assuming hard irqs are masked. 5002 * 5003 * On PREEMPT_RT enabled kernels this maps to __napi_schedule() 5004 * because the interrupt disabled assumption might not be true 5005 * due to force-threaded interrupts and spinlock substitution. 5006 */ 5007 void __napi_schedule_irqoff(struct napi_struct *n) 5008 { 5009 if (!IS_ENABLED(CONFIG_PREEMPT_RT)) 5010 ____napi_schedule(this_cpu_ptr(&softnet_data), n); 5011 else 5012 __napi_schedule(n); 5013 } 5014 EXPORT_SYMBOL(__napi_schedule_irqoff); 5015 5016 void __napi_complete(struct napi_struct *n) 5017 { 5018 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state)); 5019 5020 list_del_init(&n->poll_list); 5021 smp_mb__before_atomic(); 5022 clear_bit(NAPI_STATE_SCHED, &n->state); 5023 } 5024 EXPORT_SYMBOL(__napi_complete); 5025 5026 void napi_complete_done(struct napi_struct *n, int work_done) 5027 { 5028 unsigned long flags; 5029 5030 /* 5031 * don't let napi dequeue from the cpu poll list 5032 * just in case its running on a different cpu 5033 */ 5034 if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state))) 5035 return; 5036 5037 if (n->gro_list) { 5038 unsigned long timeout = 0; 5039 5040 if (work_done) 5041 timeout = n->dev->gro_flush_timeout; 5042 5043 if (timeout) 5044 hrtimer_start(&n->timer, ns_to_ktime(timeout), 5045 HRTIMER_MODE_REL_PINNED); 5046 else 5047 napi_gro_flush(n, false); 5048 } 5049 if (likely(list_empty(&n->poll_list))) { 5050 WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state)); 5051 } else { 5052 /* If n->poll_list is not empty, we need to mask irqs */ 5053 local_irq_save(flags); 5054 __napi_complete(n); 5055 local_irq_restore(flags); 5056 } 5057 } 5058 EXPORT_SYMBOL(napi_complete_done); 5059 5060 /* must be called under rcu_read_lock(), as we dont take a reference */ 5061 static struct napi_struct *napi_by_id(unsigned int napi_id) 5062 { 5063 unsigned int hash = napi_id % HASH_SIZE(napi_hash); 5064 struct napi_struct *napi; 5065 5066 hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node) 5067 if (napi->napi_id == napi_id) 5068 return napi; 5069 5070 return NULL; 5071 } 5072 5073 #if defined(CONFIG_NET_RX_BUSY_POLL) 5074 #define BUSY_POLL_BUDGET 8 5075 bool sk_busy_loop(struct sock *sk, int nonblock) 5076 { 5077 unsigned long end_time = !nonblock ? 
sk_busy_loop_end_time(sk) : 0; 5078 int (*busy_poll)(struct napi_struct *dev); 5079 struct napi_struct *napi; 5080 int rc = false; 5081 5082 rcu_read_lock(); 5083 5084 napi = napi_by_id(sk->sk_napi_id); 5085 if (!napi) 5086 goto out; 5087 5088 /* Note: ndo_busy_poll method is optional in linux-4.5 */ 5089 if (napi->dev->netdev_ops) 5090 busy_poll = napi->dev->netdev_ops->ndo_busy_poll; 5091 else 5092 busy_poll = NULL; 5093 5094 do { 5095 rc = 0; 5096 local_bh_disable(); 5097 if (busy_poll) { 5098 rc = busy_poll(napi); 5099 } else if (napi_schedule_prep(napi)) { 5100 void *have = netpoll_poll_lock(napi); 5101 5102 if (test_bit(NAPI_STATE_SCHED, &napi->state)) { 5103 rc = napi->poll(napi, BUSY_POLL_BUDGET); 5104 trace_napi_poll(napi, rc, BUSY_POLL_BUDGET); 5105 if (rc == BUSY_POLL_BUDGET) { 5106 napi_complete_done(napi, rc); 5107 napi_schedule(napi); 5108 } 5109 } 5110 netpoll_poll_unlock(have); 5111 } 5112 if (rc > 0) 5113 __NET_ADD_STATS(sock_net(sk), 5114 LINUX_MIB_BUSYPOLLRXPACKETS, rc); 5115 local_bh_enable(); 5116 5117 if (rc == LL_FLUSH_FAILED) 5118 break; /* permanent failure */ 5119 5120 cpu_relax(); 5121 } while (!nonblock && skb_queue_empty(&sk->sk_receive_queue) && 5122 !need_resched() && !busy_loop_timeout(end_time)); 5123 5124 rc = !skb_queue_empty(&sk->sk_receive_queue); 5125 out: 5126 rcu_read_unlock(); 5127 return rc; 5128 } 5129 EXPORT_SYMBOL(sk_busy_loop); 5130 5131 #endif /* CONFIG_NET_RX_BUSY_POLL */ 5132 5133 void napi_hash_add(struct napi_struct *napi) 5134 { 5135 if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) || 5136 test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) 5137 return; 5138 5139 spin_lock(&napi_hash_lock); 5140 5141 /* 0..NR_CPUS+1 range is reserved for sender_cpu use */ 5142 do { 5143 if (unlikely(++napi_gen_id < NR_CPUS + 1)) 5144 napi_gen_id = NR_CPUS + 1; 5145 } while (napi_by_id(napi_gen_id)); 5146 napi->napi_id = napi_gen_id; 5147 5148 hlist_add_head_rcu(&napi->napi_hash_node, 5149 &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]); 5150 5151 spin_unlock(&napi_hash_lock); 5152 } 5153 EXPORT_SYMBOL_GPL(napi_hash_add); 5154 5155 /* Warning : caller is responsible to make sure rcu grace period 5156 * is respected before freeing memory containing @napi 5157 */ 5158 bool napi_hash_del(struct napi_struct *napi) 5159 { 5160 bool rcu_sync_needed = false; 5161 5162 spin_lock(&napi_hash_lock); 5163 5164 if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) { 5165 rcu_sync_needed = true; 5166 hlist_del_rcu(&napi->napi_hash_node); 5167 } 5168 spin_unlock(&napi_hash_lock); 5169 return rcu_sync_needed; 5170 } 5171 EXPORT_SYMBOL_GPL(napi_hash_del); 5172 5173 static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) 5174 { 5175 struct napi_struct *napi; 5176 5177 napi = container_of(timer, struct napi_struct, timer); 5178 if (napi->gro_list) 5179 napi_schedule(napi); 5180 5181 return HRTIMER_NORESTART; 5182 } 5183 5184 void netif_napi_add(struct net_device *dev, struct napi_struct *napi, 5185 int (*poll)(struct napi_struct *, int), int weight) 5186 { 5187 INIT_LIST_HEAD(&napi->poll_list); 5188 hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); 5189 napi->timer.function = napi_watchdog; 5190 napi->gro_count = 0; 5191 napi->gro_list = NULL; 5192 napi->skb = NULL; 5193 napi->poll = poll; 5194 if (weight > NAPI_POLL_WEIGHT) 5195 pr_err_once("netif_napi_add() called with weight %d on device %s\n", 5196 weight, dev->name); 5197 napi->weight = weight; 5198 napi->dev = dev; 5199 #ifdef CONFIG_NETPOLL 5200 spin_lock_init(&napi->poll_lock); 
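	/* poll_owner records the CPU currently polling this NAPI instance
	 * on behalf of netpoll (see netpoll_poll_lock()); -1 means unowned.
	 */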
5201 napi->poll_owner = -1; 5202 #endif 5203 set_bit(NAPI_STATE_SCHED, &napi->state); 5204 set_bit(NAPI_STATE_NPSVC, &napi->state); 5205 list_add_rcu(&napi->dev_list, &dev->napi_list); 5206 napi_hash_add(napi); 5207 } 5208 EXPORT_SYMBOL(netif_napi_add); 5209 5210 void napi_disable(struct napi_struct *n) 5211 { 5212 might_sleep(); 5213 set_bit(NAPI_STATE_DISABLE, &n->state); 5214 5215 while (test_and_set_bit(NAPI_STATE_SCHED, &n->state)) 5216 msleep(1); 5217 while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state)) 5218 msleep(1); 5219 5220 hrtimer_cancel(&n->timer); 5221 5222 clear_bit(NAPI_STATE_DISABLE, &n->state); 5223 } 5224 EXPORT_SYMBOL(napi_disable); 5225 5226 /* Must be called in process context */ 5227 void netif_napi_del(struct napi_struct *napi) 5228 { 5229 might_sleep(); 5230 if (napi_hash_del(napi)) 5231 synchronize_net(); 5232 list_del_init(&napi->dev_list); 5233 napi_free_frags(napi); 5234 5235 kfree_skb_list(napi->gro_list); 5236 napi->gro_list = NULL; 5237 napi->gro_count = 0; 5238 } 5239 EXPORT_SYMBOL(netif_napi_del); 5240 5241 static int napi_poll(struct napi_struct *n, struct list_head *repoll) 5242 { 5243 void *have; 5244 int work, weight; 5245 5246 list_del_init(&n->poll_list); 5247 5248 have = netpoll_poll_lock(n); 5249 5250 weight = n->weight; 5251 5252 /* This NAPI_STATE_SCHED test is for avoiding a race 5253 * with netpoll's poll_napi(). Only the entity which 5254 * obtains the lock and sees NAPI_STATE_SCHED set will 5255 * actually make the ->poll() call. Therefore we avoid 5256 * accidentally calling ->poll() when NAPI is not scheduled. 5257 */ 5258 work = 0; 5259 if (test_bit(NAPI_STATE_SCHED, &n->state)) { 5260 work = n->poll(n, weight); 5261 trace_napi_poll(n, work, weight); 5262 } 5263 5264 WARN_ON_ONCE(work > weight); 5265 5266 if (likely(work < weight)) 5267 goto out_unlock; 5268 5269 /* Drivers must not modify the NAPI state if they 5270 * consume the entire weight. In such cases this code 5271 * still "owns" the NAPI instance and therefore can 5272 * move the instance around on the list at-will. 5273 */ 5274 if (unlikely(napi_disable_pending(n))) { 5275 napi_complete(n); 5276 goto out_unlock; 5277 } 5278 5279 if (n->gro_list) { 5280 /* flush too old packets 5281 * If HZ < 1000, flush all packets. 5282 */ 5283 napi_gro_flush(n, HZ >= 1000); 5284 } 5285 5286 /* Some drivers may have called napi_schedule 5287 * prior to exhausting their budget. 5288 */ 5289 if (unlikely(!list_empty(&n->poll_list))) { 5290 pr_warn_once("%s: Budget exhausted after napi rescheduled\n", 5291 n->dev ? n->dev->name : "backlog"); 5292 goto out_unlock; 5293 } 5294 5295 list_add_tail(&n->poll_list, repoll); 5296 5297 out_unlock: 5298 netpoll_poll_unlock(have); 5299 5300 return work; 5301 } 5302 5303 static __latent_entropy void net_rx_action(struct softirq_action *h) 5304 { 5305 struct softnet_data *sd = this_cpu_ptr(&softnet_data); 5306 unsigned long time_limit = jiffies + 2; 5307 int budget = netdev_budget; 5308 LIST_HEAD(list); 5309 LIST_HEAD(repoll); 5310 5311 local_irq_disable(); 5312 list_splice_init(&sd->poll_list, &list); 5313 local_irq_enable(); 5314 5315 for (;;) { 5316 struct napi_struct *n; 5317 5318 if (list_empty(&list)) { 5319 if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll)) 5320 return; 5321 break; 5322 } 5323 5324 n = list_first_entry(&list, struct napi_struct, poll_list); 5325 budget -= napi_poll(n, &repoll); 5326 5327 /* If softirq window is exhausted then punt. 5328 * Allow this to run for 2 jiffies since which will allow 5329 * an average latency of 1.5/HZ. 
5330 */ 5331 if (unlikely(budget <= 0 || 5332 time_after_eq(jiffies, time_limit))) { 5333 sd->time_squeeze++; 5334 break; 5335 } 5336 } 5337 5338 __kfree_skb_flush(); 5339 local_irq_disable(); 5340 5341 list_splice_tail_init(&sd->poll_list, &list); 5342 list_splice_tail(&repoll, &list); 5343 list_splice(&list, &sd->poll_list); 5344 if (!list_empty(&sd->poll_list)) 5345 __raise_softirq_irqoff(NET_RX_SOFTIRQ); 5346 5347 net_rps_action_and_irq_enable(sd); 5348 } 5349 5350 struct netdev_adjacent { 5351 struct net_device *dev; 5352 5353 /* upper master flag, there can only be one master device per list */ 5354 bool master; 5355 5356 /* counter for the number of times this device was added to us */ 5357 u16 ref_nr; 5358 5359 /* private field for the users */ 5360 void *private; 5361 5362 struct list_head list; 5363 struct rcu_head rcu; 5364 }; 5365 5366 static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev, 5367 struct list_head *adj_list) 5368 { 5369 struct netdev_adjacent *adj; 5370 5371 list_for_each_entry(adj, adj_list, list) { 5372 if (adj->dev == adj_dev) 5373 return adj; 5374 } 5375 return NULL; 5376 } 5377 5378 /** 5379 * netdev_has_upper_dev - Check if device is linked to an upper device 5380 * @dev: device 5381 * @upper_dev: upper device to check 5382 * 5383 * Find out if a device is linked to specified upper device and return true 5384 * in case it is. Note that this checks only immediate upper device, 5385 * not through a complete stack of devices. The caller must hold the RTNL lock. 5386 */ 5387 bool netdev_has_upper_dev(struct net_device *dev, 5388 struct net_device *upper_dev) 5389 { 5390 ASSERT_RTNL(); 5391 5392 return __netdev_find_adj(upper_dev, &dev->all_adj_list.upper); 5393 } 5394 EXPORT_SYMBOL(netdev_has_upper_dev); 5395 5396 /** 5397 * netdev_has_any_upper_dev - Check if device is linked to some device 5398 * @dev: device 5399 * 5400 * Find out if a device is linked to an upper device and return true in case 5401 * it is. The caller must hold the RTNL lock. 5402 */ 5403 bool netdev_has_any_upper_dev(struct net_device *dev) 5404 { 5405 ASSERT_RTNL(); 5406 5407 return !list_empty(&dev->all_adj_list.upper); 5408 } 5409 EXPORT_SYMBOL(netdev_has_any_upper_dev); 5410 5411 /** 5412 * netdev_master_upper_dev_get - Get master upper device 5413 * @dev: device 5414 * 5415 * Find a master upper device and return pointer to it or NULL in case 5416 * it's not there. The caller must hold the RTNL lock. 5417 */ 5418 struct net_device *netdev_master_upper_dev_get(struct net_device *dev) 5419 { 5420 struct netdev_adjacent *upper; 5421 5422 ASSERT_RTNL(); 5423 5424 if (list_empty(&dev->adj_list.upper)) 5425 return NULL; 5426 5427 upper = list_first_entry(&dev->adj_list.upper, 5428 struct netdev_adjacent, list); 5429 if (likely(upper->master)) 5430 return upper->dev; 5431 return NULL; 5432 } 5433 EXPORT_SYMBOL(netdev_master_upper_dev_get); 5434 5435 void *netdev_adjacent_get_private(struct list_head *adj_list) 5436 { 5437 struct netdev_adjacent *adj; 5438 5439 adj = list_entry(adj_list, struct netdev_adjacent, list); 5440 5441 return adj->private; 5442 } 5443 EXPORT_SYMBOL(netdev_adjacent_get_private); 5444 5445 /** 5446 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list 5447 * @dev: device 5448 * @iter: list_head ** of the current position 5449 * 5450 * Gets the next device from the dev's upper list, starting from iter 5451 * position. The caller must hold RCU read lock. 
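 *
 * Typical iteration (sketch; do_something() is a placeholder, and the
 * netdev_for_each_upper_dev_rcu() helper in <linux/netdevice.h> wraps
 * this same pattern):
 *
 *	struct net_device *upper;
 *	struct list_head *iter;
 *
 *	rcu_read_lock();
 *	netdev_for_each_upper_dev_rcu(dev, upper, iter)
 *		do_something(upper);
 *	rcu_read_unlock();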
5452 */ 5453 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev, 5454 struct list_head **iter) 5455 { 5456 struct netdev_adjacent *upper; 5457 5458 WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held()); 5459 5460 upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); 5461 5462 if (&upper->list == &dev->adj_list.upper) 5463 return NULL; 5464 5465 *iter = &upper->list; 5466 5467 return upper->dev; 5468 } 5469 EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu); 5470 5471 /** 5472 * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list 5473 * @dev: device 5474 * @iter: list_head ** of the current position 5475 * 5476 * Gets the next device from the dev's upper list, starting from iter 5477 * position. The caller must hold RCU read lock. 5478 */ 5479 struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev, 5480 struct list_head **iter) 5481 { 5482 struct netdev_adjacent *upper; 5483 5484 WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held()); 5485 5486 upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); 5487 5488 if (&upper->list == &dev->all_adj_list.upper) 5489 return NULL; 5490 5491 *iter = &upper->list; 5492 5493 return upper->dev; 5494 } 5495 EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu); 5496 5497 /** 5498 * netdev_lower_get_next_private - Get the next ->private from the 5499 * lower neighbour list 5500 * @dev: device 5501 * @iter: list_head ** of the current position 5502 * 5503 * Gets the next netdev_adjacent->private from the dev's lower neighbour 5504 * list, starting from iter position. The caller must hold either hold the 5505 * RTNL lock or its own locking that guarantees that the neighbour lower 5506 * list will remain unchanged. 5507 */ 5508 void *netdev_lower_get_next_private(struct net_device *dev, 5509 struct list_head **iter) 5510 { 5511 struct netdev_adjacent *lower; 5512 5513 lower = list_entry(*iter, struct netdev_adjacent, list); 5514 5515 if (&lower->list == &dev->adj_list.lower) 5516 return NULL; 5517 5518 *iter = lower->list.next; 5519 5520 return lower->private; 5521 } 5522 EXPORT_SYMBOL(netdev_lower_get_next_private); 5523 5524 /** 5525 * netdev_lower_get_next_private_rcu - Get the next ->private from the 5526 * lower neighbour list, RCU 5527 * variant 5528 * @dev: device 5529 * @iter: list_head ** of the current position 5530 * 5531 * Gets the next netdev_adjacent->private from the dev's lower neighbour 5532 * list, starting from iter position. The caller must hold RCU read lock. 5533 */ 5534 void *netdev_lower_get_next_private_rcu(struct net_device *dev, 5535 struct list_head **iter) 5536 { 5537 struct netdev_adjacent *lower; 5538 5539 WARN_ON_ONCE(!rcu_read_lock_held()); 5540 5541 lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); 5542 5543 if (&lower->list == &dev->adj_list.lower) 5544 return NULL; 5545 5546 *iter = &lower->list; 5547 5548 return lower->private; 5549 } 5550 EXPORT_SYMBOL(netdev_lower_get_next_private_rcu); 5551 5552 /** 5553 * netdev_lower_get_next - Get the next device from the lower neighbour 5554 * list 5555 * @dev: device 5556 * @iter: list_head ** of the current position 5557 * 5558 * Gets the next netdev_adjacent from the dev's lower neighbour 5559 * list, starting from iter position. The caller must hold RTNL lock or 5560 * its own locking that guarantees that the neighbour lower 5561 * list will remain unchanged. 
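 *
 * Typical iteration (sketch; do_something() is a placeholder, and the
 * netdev_for_each_lower_dev() helper in <linux/netdevice.h> wraps this
 * same pattern):
 *
 *	struct net_device *lower;
 *	struct list_head *iter;
 *
 *	netdev_for_each_lower_dev(dev, lower, iter)
 *		do_something(lower);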
5562 */ 5563 void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter) 5564 { 5565 struct netdev_adjacent *lower; 5566 5567 lower = list_entry(*iter, struct netdev_adjacent, list); 5568 5569 if (&lower->list == &dev->adj_list.lower) 5570 return NULL; 5571 5572 *iter = lower->list.next; 5573 5574 return lower->dev; 5575 } 5576 EXPORT_SYMBOL(netdev_lower_get_next); 5577 5578 /** 5579 * netdev_all_lower_get_next - Get the next device from all lower neighbour list 5580 * @dev: device 5581 * @iter: list_head ** of the current position 5582 * 5583 * Gets the next netdev_adjacent from the dev's all lower neighbour 5584 * list, starting from iter position. The caller must hold RTNL lock or 5585 * its own locking that guarantees that the neighbour all lower 5586 * list will remain unchanged. 5587 */ 5588 struct net_device *netdev_all_lower_get_next(struct net_device *dev, struct list_head **iter) 5589 { 5590 struct netdev_adjacent *lower; 5591 5592 lower = list_entry(*iter, struct netdev_adjacent, list); 5593 5594 if (&lower->list == &dev->all_adj_list.lower) 5595 return NULL; 5596 5597 *iter = lower->list.next; 5598 5599 return lower->dev; 5600 } 5601 EXPORT_SYMBOL(netdev_all_lower_get_next); 5602 5603 /** 5604 * netdev_all_lower_get_next_rcu - Get the next device from all 5605 * lower neighbour list, RCU variant 5606 * @dev: device 5607 * @iter: list_head ** of the current position 5608 * 5609 * Gets the next netdev_adjacent from the dev's all lower neighbour 5610 * list, starting from iter position. The caller must hold RCU read lock. 5611 */ 5612 struct net_device *netdev_all_lower_get_next_rcu(struct net_device *dev, 5613 struct list_head **iter) 5614 { 5615 struct netdev_adjacent *lower; 5616 5617 lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); 5618 5619 if (&lower->list == &dev->all_adj_list.lower) 5620 return NULL; 5621 5622 *iter = &lower->list; 5623 5624 return lower->dev; 5625 } 5626 EXPORT_SYMBOL(netdev_all_lower_get_next_rcu); 5627 5628 /** 5629 * netdev_lower_get_first_private_rcu - Get the first ->private from the 5630 * lower neighbour list, RCU 5631 * variant 5632 * @dev: device 5633 * 5634 * Gets the first netdev_adjacent->private from the dev's lower neighbour 5635 * list. The caller must hold RCU read lock. 5636 */ 5637 void *netdev_lower_get_first_private_rcu(struct net_device *dev) 5638 { 5639 struct netdev_adjacent *lower; 5640 5641 lower = list_first_or_null_rcu(&dev->adj_list.lower, 5642 struct netdev_adjacent, list); 5643 if (lower) 5644 return lower->private; 5645 return NULL; 5646 } 5647 EXPORT_SYMBOL(netdev_lower_get_first_private_rcu); 5648 5649 /** 5650 * netdev_master_upper_dev_get_rcu - Get master upper device 5651 * @dev: device 5652 * 5653 * Find a master upper device and return pointer to it or NULL in case 5654 * it's not there. The caller must hold the RCU read lock. 5655 */ 5656 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev) 5657 { 5658 struct netdev_adjacent *upper; 5659 5660 upper = list_first_or_null_rcu(&dev->adj_list.upper, 5661 struct netdev_adjacent, list); 5662 if (upper && likely(upper->master)) 5663 return upper->dev; 5664 return NULL; 5665 } 5666 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu); 5667 5668 static int netdev_adjacent_sysfs_add(struct net_device *dev, 5669 struct net_device *adj_dev, 5670 struct list_head *dev_list) 5671 { 5672 char linkname[IFNAMSIZ+7]; 5673 sprintf(linkname, dev_list == &dev->adj_list.upper ? 
5674 "upper_%s" : "lower_%s", adj_dev->name); 5675 return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj), 5676 linkname); 5677 } 5678 static void netdev_adjacent_sysfs_del(struct net_device *dev, 5679 char *name, 5680 struct list_head *dev_list) 5681 { 5682 char linkname[IFNAMSIZ+7]; 5683 sprintf(linkname, dev_list == &dev->adj_list.upper ? 5684 "upper_%s" : "lower_%s", name); 5685 sysfs_remove_link(&(dev->dev.kobj), linkname); 5686 } 5687 5688 static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev, 5689 struct net_device *adj_dev, 5690 struct list_head *dev_list) 5691 { 5692 return (dev_list == &dev->adj_list.upper || 5693 dev_list == &dev->adj_list.lower) && 5694 net_eq(dev_net(dev), dev_net(adj_dev)); 5695 } 5696 5697 static int __netdev_adjacent_dev_insert(struct net_device *dev, 5698 struct net_device *adj_dev, 5699 u16 ref_nr, 5700 struct list_head *dev_list, 5701 void *private, bool master) 5702 { 5703 struct netdev_adjacent *adj; 5704 int ret; 5705 5706 adj = __netdev_find_adj(adj_dev, dev_list); 5707 5708 if (adj) { 5709 adj->ref_nr += ref_nr; 5710 return 0; 5711 } 5712 5713 adj = kmalloc(sizeof(*adj), GFP_KERNEL); 5714 if (!adj) 5715 return -ENOMEM; 5716 5717 adj->dev = adj_dev; 5718 adj->master = master; 5719 adj->ref_nr = ref_nr; 5720 adj->private = private; 5721 dev_hold(adj_dev); 5722 5723 pr_debug("dev_hold for %s, because of link added from %s to %s\n", 5724 adj_dev->name, dev->name, adj_dev->name); 5725 5726 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) { 5727 ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list); 5728 if (ret) 5729 goto free_adj; 5730 } 5731 5732 /* Ensure that master link is always the first item in list. */ 5733 if (master) { 5734 ret = sysfs_create_link(&(dev->dev.kobj), 5735 &(adj_dev->dev.kobj), "master"); 5736 if (ret) 5737 goto remove_symlinks; 5738 5739 list_add_rcu(&adj->list, dev_list); 5740 } else { 5741 list_add_tail_rcu(&adj->list, dev_list); 5742 } 5743 5744 return 0; 5745 5746 remove_symlinks: 5747 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) 5748 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list); 5749 free_adj: 5750 kfree(adj); 5751 dev_put(adj_dev); 5752 5753 return ret; 5754 } 5755 5756 static void __netdev_adjacent_dev_remove(struct net_device *dev, 5757 struct net_device *adj_dev, 5758 u16 ref_nr, 5759 struct list_head *dev_list) 5760 { 5761 struct netdev_adjacent *adj; 5762 5763 adj = __netdev_find_adj(adj_dev, dev_list); 5764 5765 if (!adj) { 5766 pr_err("tried to remove device %s from %s\n", 5767 dev->name, adj_dev->name); 5768 BUG(); 5769 } 5770 5771 if (adj->ref_nr > ref_nr) { 5772 pr_debug("%s to %s ref_nr-%d = %d\n", dev->name, adj_dev->name, 5773 ref_nr, adj->ref_nr-ref_nr); 5774 adj->ref_nr -= ref_nr; 5775 return; 5776 } 5777 5778 if (adj->master) 5779 sysfs_remove_link(&(dev->dev.kobj), "master"); 5780 5781 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) 5782 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list); 5783 5784 list_del_rcu(&adj->list); 5785 pr_debug("dev_put for %s, because link removed from %s to %s\n", 5786 adj_dev->name, dev->name, adj_dev->name); 5787 dev_put(adj_dev); 5788 kfree_rcu(adj, rcu); 5789 } 5790 5791 static int __netdev_adjacent_dev_link_lists(struct net_device *dev, 5792 struct net_device *upper_dev, 5793 u16 ref_nr, 5794 struct list_head *up_list, 5795 struct list_head *down_list, 5796 void *private, bool master) 5797 { 5798 int ret; 5799 5800 ret = __netdev_adjacent_dev_insert(dev, upper_dev, ref_nr, up_list, 5801 
private, master); 5802 if (ret) 5803 return ret; 5804 5805 ret = __netdev_adjacent_dev_insert(upper_dev, dev, ref_nr, down_list, 5806 private, false); 5807 if (ret) { 5808 __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list); 5809 return ret; 5810 } 5811 5812 return 0; 5813 } 5814 5815 static int __netdev_adjacent_dev_link(struct net_device *dev, 5816 struct net_device *upper_dev, 5817 u16 ref_nr) 5818 { 5819 return __netdev_adjacent_dev_link_lists(dev, upper_dev, ref_nr, 5820 &dev->all_adj_list.upper, 5821 &upper_dev->all_adj_list.lower, 5822 NULL, false); 5823 } 5824 5825 static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev, 5826 struct net_device *upper_dev, 5827 u16 ref_nr, 5828 struct list_head *up_list, 5829 struct list_head *down_list) 5830 { 5831 __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list); 5832 __netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list); 5833 } 5834 5835 static void __netdev_adjacent_dev_unlink(struct net_device *dev, 5836 struct net_device *upper_dev, 5837 u16 ref_nr) 5838 { 5839 __netdev_adjacent_dev_unlink_lists(dev, upper_dev, ref_nr, 5840 &dev->all_adj_list.upper, 5841 &upper_dev->all_adj_list.lower); 5842 } 5843 5844 static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev, 5845 struct net_device *upper_dev, 5846 void *private, bool master) 5847 { 5848 int ret = __netdev_adjacent_dev_link(dev, upper_dev, 1); 5849 5850 if (ret) 5851 return ret; 5852 5853 ret = __netdev_adjacent_dev_link_lists(dev, upper_dev, 1, 5854 &dev->adj_list.upper, 5855 &upper_dev->adj_list.lower, 5856 private, master); 5857 if (ret) { 5858 __netdev_adjacent_dev_unlink(dev, upper_dev, 1); 5859 return ret; 5860 } 5861 5862 return 0; 5863 } 5864 5865 static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev, 5866 struct net_device *upper_dev) 5867 { 5868 __netdev_adjacent_dev_unlink(dev, upper_dev, 1); 5869 __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1, 5870 &dev->adj_list.upper, 5871 &upper_dev->adj_list.lower); 5872 } 5873 5874 static int __netdev_upper_dev_link(struct net_device *dev, 5875 struct net_device *upper_dev, bool master, 5876 void *upper_priv, void *upper_info) 5877 { 5878 struct netdev_notifier_changeupper_info changeupper_info; 5879 struct netdev_adjacent *i, *j, *to_i, *to_j; 5880 int ret = 0; 5881 5882 ASSERT_RTNL(); 5883 5884 if (dev == upper_dev) 5885 return -EBUSY; 5886 5887 /* To prevent loops, check if dev is not upper device to upper_dev. */ 5888 if (__netdev_find_adj(dev, &upper_dev->all_adj_list.upper)) 5889 return -EBUSY; 5890 5891 if (__netdev_find_adj(upper_dev, &dev->adj_list.upper)) 5892 return -EEXIST; 5893 5894 if (master && netdev_master_upper_dev_get(dev)) 5895 return -EBUSY; 5896 5897 changeupper_info.upper_dev = upper_dev; 5898 changeupper_info.master = master; 5899 changeupper_info.linking = true; 5900 changeupper_info.upper_info = upper_info; 5901 5902 ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev, 5903 &changeupper_info.info); 5904 ret = notifier_to_errno(ret); 5905 if (ret) 5906 return ret; 5907 5908 ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv, 5909 master); 5910 if (ret) 5911 return ret; 5912 5913 /* Now that we linked these devs, make all the upper_dev's 5914 * all_adj_list.upper visible to every dev's all_adj_list.lower an 5915 * versa, and don't forget the devices itself. All of these 5916 * links are non-neighbours. 
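	 * (In other words, the all_adj_list sets are maintained as the
	 * transitive closure of the upper/lower adjacency graph.)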
5917 */ 5918 list_for_each_entry(i, &dev->all_adj_list.lower, list) { 5919 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) { 5920 pr_debug("Interlinking %s with %s, non-neighbour\n", 5921 i->dev->name, j->dev->name); 5922 ret = __netdev_adjacent_dev_link(i->dev, j->dev, i->ref_nr); 5923 if (ret) 5924 goto rollback_mesh; 5925 } 5926 } 5927 5928 /* add dev to every upper_dev's upper device */ 5929 list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) { 5930 pr_debug("linking %s's upper device %s with %s\n", 5931 upper_dev->name, i->dev->name, dev->name); 5932 ret = __netdev_adjacent_dev_link(dev, i->dev, i->ref_nr); 5933 if (ret) 5934 goto rollback_upper_mesh; 5935 } 5936 5937 /* add upper_dev to every dev's lower device */ 5938 list_for_each_entry(i, &dev->all_adj_list.lower, list) { 5939 pr_debug("linking %s's lower device %s with %s\n", dev->name, 5940 i->dev->name, upper_dev->name); 5941 ret = __netdev_adjacent_dev_link(i->dev, upper_dev, i->ref_nr); 5942 if (ret) 5943 goto rollback_lower_mesh; 5944 } 5945 5946 ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev, 5947 &changeupper_info.info); 5948 ret = notifier_to_errno(ret); 5949 if (ret) 5950 goto rollback_lower_mesh; 5951 5952 return 0; 5953 5954 rollback_lower_mesh: 5955 to_i = i; 5956 list_for_each_entry(i, &dev->all_adj_list.lower, list) { 5957 if (i == to_i) 5958 break; 5959 __netdev_adjacent_dev_unlink(i->dev, upper_dev, i->ref_nr); 5960 } 5961 5962 i = NULL; 5963 5964 rollback_upper_mesh: 5965 to_i = i; 5966 list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) { 5967 if (i == to_i) 5968 break; 5969 __netdev_adjacent_dev_unlink(dev, i->dev, i->ref_nr); 5970 } 5971 5972 i = j = NULL; 5973 5974 rollback_mesh: 5975 to_i = i; 5976 to_j = j; 5977 list_for_each_entry(i, &dev->all_adj_list.lower, list) { 5978 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) { 5979 if (i == to_i && j == to_j) 5980 break; 5981 __netdev_adjacent_dev_unlink(i->dev, j->dev, i->ref_nr); 5982 } 5983 if (i == to_i) 5984 break; 5985 } 5986 5987 __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev); 5988 5989 return ret; 5990 } 5991 5992 /** 5993 * netdev_upper_dev_link - Add a link to the upper device 5994 * @dev: device 5995 * @upper_dev: new upper device 5996 * 5997 * Adds a link to device which is upper to this one. The caller must hold 5998 * the RTNL lock. On a failure a negative errno code is returned. 5999 * On success the reference counts are adjusted and the function 6000 * returns zero. 6001 */ 6002 int netdev_upper_dev_link(struct net_device *dev, 6003 struct net_device *upper_dev) 6004 { 6005 return __netdev_upper_dev_link(dev, upper_dev, false, NULL, NULL); 6006 } 6007 EXPORT_SYMBOL(netdev_upper_dev_link); 6008 6009 /** 6010 * netdev_master_upper_dev_link - Add a master link to the upper device 6011 * @dev: device 6012 * @upper_dev: new upper device 6013 * @upper_priv: upper device private 6014 * @upper_info: upper info to be passed down via notifier 6015 * 6016 * Adds a link to device which is upper to this one. In this case, only 6017 * one master upper device can be linked, although other non-master devices 6018 * might be linked as well. The caller must hold the RTNL lock. 6019 * On a failure a negative errno code is returned. On success the reference 6020 * counts are adjusted and the function returns zero. 
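 *
 * Illustrative sketch (not taken from this file; port_dev, master_dev and
 * the private/info pointers are placeholders): a bonding/team style master
 * would typically enslave a port with:
 *
 *	err = netdev_master_upper_dev_link(port_dev, master_dev,
 *					   port_priv, port_info);
 *	if (err)
 *		goto err_unwind;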
 */
int netdev_master_upper_dev_link(struct net_device *dev,
				 struct net_device *upper_dev,
				 void *upper_priv, void *upper_info)
{
	return __netdev_upper_dev_link(dev, upper_dev, true,
				       upper_priv, upper_info);
}
EXPORT_SYMBOL(netdev_master_upper_dev_link);

/**
 * netdev_upper_dev_unlink - Removes a link to upper device
 * @dev: device
 * @upper_dev: upper device to unlink
 *
 * Removes a link to device which is upper to this one. The caller must hold
 * the RTNL lock.
 */
void netdev_upper_dev_unlink(struct net_device *dev,
			     struct net_device *upper_dev)
{
	struct netdev_notifier_changeupper_info changeupper_info;
	struct netdev_adjacent *i, *j;

	ASSERT_RTNL();

	changeupper_info.upper_dev = upper_dev;
	changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
	changeupper_info.linking = false;

	call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
				      &changeupper_info.info);

	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);

	/* Here is the tricky part. We must remove all dev's lower
	 * devices from all upper_dev's upper devices and vice
	 * versa, to maintain the graph relationship.
	 */
	list_for_each_entry(i, &dev->all_adj_list.lower, list)
		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
			__netdev_adjacent_dev_unlink(i->dev, j->dev, i->ref_nr);

	/* also remove the devices themselves from the lower/upper device
	 * lists
	 */
	list_for_each_entry(i, &dev->all_adj_list.lower, list)
		__netdev_adjacent_dev_unlink(i->dev, upper_dev, i->ref_nr);

	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
		__netdev_adjacent_dev_unlink(dev, i->dev, i->ref_nr);

	call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
				      &changeupper_info.info);
}
EXPORT_SYMBOL(netdev_upper_dev_unlink);

/**
 * netdev_bonding_info_change - Dispatch event about slave change
 * @dev: device
 * @bonding_info: info to dispatch
 *
 * Send NETDEV_BONDING_INFO to netdev notifiers with info.
 * The caller must hold the RTNL lock.
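 *
 * A bonding-style caller would typically fill in a struct
 * netdev_bonding_info and then dispatch it, roughly as in this sketch
 * (the slave_dev variable and the fill helper are illustrative only):
 *
 *	struct netdev_bonding_info info;
 *
 *	memset(&info, 0, sizeof(info));
 *	example_fill_bonding_info(slave_dev, &info);
 *	netdev_bonding_info_change(slave_dev, &info);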
 */
void netdev_bonding_info_change(struct net_device *dev,
				struct netdev_bonding_info *bonding_info)
{
	struct netdev_notifier_bonding_info info;

	memcpy(&info.bonding_info, bonding_info,
	       sizeof(struct netdev_bonding_info));
	call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev,
				      &info.info);
}
EXPORT_SYMBOL(netdev_bonding_info_change);

static void netdev_adjacent_add_links(struct net_device *dev)
{
	struct netdev_adjacent *iter;

	struct net *net = dev_net(dev);

	list_for_each_entry(iter, &dev->adj_list.upper, list) {
		if (!net_eq(net, dev_net(iter->dev)))
			continue;
		netdev_adjacent_sysfs_add(iter->dev, dev,
					  &iter->dev->adj_list.lower);
		netdev_adjacent_sysfs_add(dev, iter->dev,
					  &dev->adj_list.upper);
	}

	list_for_each_entry(iter, &dev->adj_list.lower, list) {
		if (!net_eq(net, dev_net(iter->dev)))
			continue;
		netdev_adjacent_sysfs_add(iter->dev, dev,
					  &iter->dev->adj_list.upper);
		netdev_adjacent_sysfs_add(dev, iter->dev,
					  &dev->adj_list.lower);
	}
}

static void netdev_adjacent_del_links(struct net_device *dev)
{
	struct netdev_adjacent *iter;

	struct net *net = dev_net(dev);

	list_for_each_entry(iter, &dev->adj_list.upper, list) {
		if (!net_eq(net, dev_net(iter->dev)))
			continue;
		netdev_adjacent_sysfs_del(iter->dev, dev->name,
					  &iter->dev->adj_list.lower);
		netdev_adjacent_sysfs_del(dev, iter->dev->name,
					  &dev->adj_list.upper);
	}

	list_for_each_entry(iter, &dev->adj_list.lower, list) {
		if (!net_eq(net, dev_net(iter->dev)))
			continue;
		netdev_adjacent_sysfs_del(iter->dev, dev->name,
					  &iter->dev->adj_list.upper);
		netdev_adjacent_sysfs_del(dev, iter->dev->name,
					  &dev->adj_list.lower);
	}
}

void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
{
	struct netdev_adjacent *iter;

	struct net *net = dev_net(dev);

	list_for_each_entry(iter, &dev->adj_list.upper, list) {
		if (!net_eq(net, dev_net(iter->dev)))
			continue;
		netdev_adjacent_sysfs_del(iter->dev, oldname,
					  &iter->dev->adj_list.lower);
		netdev_adjacent_sysfs_add(iter->dev, dev,
					  &iter->dev->adj_list.lower);
	}

	list_for_each_entry(iter, &dev->adj_list.lower, list) {
		if (!net_eq(net, dev_net(iter->dev)))
			continue;
		netdev_adjacent_sysfs_del(iter->