~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

TOMOYO Linux Cross Reference
Linux/net/core/sock.c

Version: ~ [ linux-5.13-rc7 ] ~ [ linux-5.12.12 ] ~ [ linux-5.11.22 ] ~ [ linux-5.10.45 ] ~ [ linux-5.9.16 ] ~ [ linux-5.8.18 ] ~ [ linux-5.7.19 ] ~ [ linux-5.6.19 ] ~ [ linux-5.5.19 ] ~ [ linux-5.4.127 ] ~ [ linux-5.3.18 ] ~ [ linux-5.2.21 ] ~ [ linux-5.1.21 ] ~ [ linux-5.0.21 ] ~ [ linux-4.20.17 ] ~ [ linux-4.19.195 ] ~ [ linux-4.18.20 ] ~ [ linux-4.17.19 ] ~ [ linux-4.16.18 ] ~ [ linux-4.15.18 ] ~ [ linux-4.14.237 ] ~ [ linux-4.13.16 ] ~ [ linux-4.12.14 ] ~ [ linux-4.11.12 ] ~ [ linux-4.10.17 ] ~ [ linux-4.9.273 ] ~ [ linux-4.8.17 ] ~ [ linux-4.7.10 ] ~ [ linux-4.6.7 ] ~ [ linux-4.5.7 ] ~ [ linux-4.4.273 ] ~ [ linux-4.3.6 ] ~ [ linux-4.2.8 ] ~ [ linux-4.1.52 ] ~ [ linux-4.0.9 ] ~ [ linux-3.18.140 ] ~ [ linux-3.16.85 ] ~ [ linux-3.14.79 ] ~ [ linux-3.12.74 ] ~ [ linux-3.10.108 ] ~ [ linux-2.6.32.71 ] ~ [ linux-2.6.0 ] ~ [ linux-2.4.37.11 ] ~ [ unix-v6-master ] ~ [ ccs-tools-1.8.5 ] ~ [ policy-sample ] ~
Architecture: ~ [ i386 ] ~ [ alpha ] ~ [ m68k ] ~ [ mips ] ~ [ ppc ] ~ [ sparc ] ~ [ sparc64 ] ~

  1 /*
  2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
  3  *              operating system.  INET is implemented using the  BSD Socket
  4  *              interface as the means of communication with the user level.
  5  *
  6  *              Generic socket support routines. Memory allocators, socket lock/release
  7  *              handler for protocols to use and generic option handler.
  8  *
  9  *
 10  * Authors:     Ross Biro
 11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 12  *              Florian La Roche, <flla@stud.uni-sb.de>
 13  *              Alan Cox, <A.Cox@swansea.ac.uk>
 14  *
 15  * Fixes:
 16  *              Alan Cox        :       Numerous verify_area() problems
 17  *              Alan Cox        :       Connecting on a connecting socket
 18  *                                      now returns an error for tcp.
 19  *              Alan Cox        :       sock->protocol is set correctly.
 20  *                                      and is not sometimes left as 0.
 21  *              Alan Cox        :       connect handles icmp errors on a
 22  *                                      connect properly. Unfortunately there
 23  *                                      is a restart syscall nasty there. I
 24  *                                      can't match BSD without hacking the C
 25  *                                      library. Ideas urgently sought!
 26  *              Alan Cox        :       Disallow bind() to addresses that are
 27  *                                      not ours - especially broadcast ones!!
 28  *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
 29  *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
 30  *                                      instead they leave that for the DESTROY timer.
 31  *              Alan Cox        :       Clean up error flag in accept
 32  *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
 33  *                                      was buggy. Put a remove_sock() in the handler
 34  *                                      for memory when we hit 0. Also altered the timer
 35  *                                      code. The ACK stuff can wait and needs major
 36  *                                      TCP layer surgery.
 37  *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
 38  *                                      and fixed timer/inet_bh race.
 39  *              Alan Cox        :       Added zapped flag for TCP
 40  *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
 41  *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
 42  *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
 43  *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
 44  *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
 45  *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
 46  *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
 47  *      Pauline Middelink       :       identd support
 48  *              Alan Cox        :       Fixed connect() taking signals I think.
 49  *              Alan Cox        :       SO_LINGER supported
 50  *              Alan Cox        :       Error reporting fixes
 51  *              Anonymous       :       inet_create tidied up (sk->reuse setting)
 52  *              Alan Cox        :       inet sockets don't set sk->type!
 53  *              Alan Cox        :       Split socket option code
 54  *              Alan Cox        :       Callbacks
 55  *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
 56  *              Alex            :       Removed restriction on inet fioctl
 57  *              Alan Cox        :       Splitting INET from NET core
 58  *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
 59  *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
 60  *              Alan Cox        :       Split IP from generic code
 61  *              Alan Cox        :       New kfree_skbmem()
 62  *              Alan Cox        :       Make SO_DEBUG superuser only.
 63  *              Alan Cox        :       Allow anyone to clear SO_DEBUG
 64  *                                      (compatibility fix)
 65  *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
 66  *              Alan Cox        :       Allocator for a socket is settable.
 67  *              Alan Cox        :       SO_ERROR includes soft errors.
 68  *              Alan Cox        :       Allow NULL arguments on some SO_ opts
 69  *              Alan Cox        :       Generic socket allocation to make hooks
 70  *                                      easier (suggested by Craig Metz).
 71  *              Michael Pall    :       SO_ERROR returns positive errno again
 72  *              Steve Whitehouse:       Added default destructor to free
 73  *                                      protocol private data.
 74  *              Steve Whitehouse:       Added various other default routines
 75  *                                      common to several socket families.
 76  *              Chris Evans     :       Call suser() check last on F_SETOWN
 77  *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
 78  *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
 79  *              Andi Kleen      :       Fix write_space callback
 80  *              Chris Evans     :       Security fixes - signedness again
 81  *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
 82  *
 83  * To Fix:
 84  *
 85  *
 86  *              This program is free software; you can redistribute it and/or
 87  *              modify it under the terms of the GNU General Public License
 88  *              as published by the Free Software Foundation; either version
 89  *              2 of the License, or (at your option) any later version.
 90  */
 91 
 92 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 93 
 94 #include <linux/capability.h>
 95 #include <linux/errno.h>
 96 #include <linux/errqueue.h>
 97 #include <linux/types.h>
 98 #include <linux/socket.h>
 99 #include <linux/in.h>
100 #include <linux/kernel.h>
101 #include <linux/module.h>
102 #include <linux/proc_fs.h>
103 #include <linux/seq_file.h>
104 #include <linux/sched.h>
105 #include <linux/timer.h>
106 #include <linux/string.h>
107 #include <linux/sockios.h>
108 #include <linux/net.h>
109 #include <linux/mm.h>
110 #include <linux/slab.h>
111 #include <linux/interrupt.h>
112 #include <linux/poll.h>
113 #include <linux/tcp.h>
114 #include <linux/init.h>
115 #include <linux/highmem.h>
116 #include <linux/user_namespace.h>
117 #include <linux/static_key.h>
118 #include <linux/memcontrol.h>
119 #include <linux/prefetch.h>
120 
121 #include <asm/uaccess.h>
122 
123 #include <linux/netdevice.h>
124 #include <net/protocol.h>
125 #include <linux/skbuff.h>
126 #include <net/net_namespace.h>
127 #include <net/request_sock.h>
128 #include <net/sock.h>
129 #include <linux/net_tstamp.h>
130 #include <net/xfrm.h>
131 #include <linux/ipsec.h>
132 #include <net/cls_cgroup.h>
133 #include <net/netprio_cgroup.h>
134 #include <linux/sock_diag.h>
135 
136 #include <linux/filter.h>
137 
138 #include <trace/events/sock.h>
139 
140 #ifdef CONFIG_INET
141 #include <net/tcp.h>
142 #endif
143 
144 #include <net/busy_poll.h>
145 
146 static DEFINE_MUTEX(proto_list_mutex);
147 static LIST_HEAD(proto_list);
148 
149 /**
150  * sk_ns_capable - General socket capability test
151  * @sk: Socket to use a capability on or through
152  * @user_ns: The user namespace of the capability to use
153  * @cap: The capability to use
154  *
155  * Test to see if the opener of the socket had when the socket was
156  * created and the current process has the capability @cap in the user
157  * namespace @user_ns.
158  */
159 bool sk_ns_capable(const struct sock *sk,
160                    struct user_namespace *user_ns, int cap)
161 {
162         return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
163                 ns_capable(user_ns, cap);
164 }
165 EXPORT_SYMBOL(sk_ns_capable);
166 
167 /**
168  * sk_capable - Socket global capability test
169  * @sk: Socket to use a capability on or through
170  * @cap: The global capability to use
171  *
172  * Test to see if the opener of the socket had when the socket was
173  * created and the current process has the capability @cap in all user
174  * namespaces.
175  */
176 bool sk_capable(const struct sock *sk, int cap)
177 {
178         return sk_ns_capable(sk, &init_user_ns, cap);
179 }
180 EXPORT_SYMBOL(sk_capable);
181 
182 /**
183  * sk_net_capable - Network namespace socket capability test
184  * @sk: Socket to use a capability on or through
185  * @cap: The capability to use
186  *
187  * Test to see if the opener of the socket had when the socket was created
188  * and the current process has the capability @cap over the network namespace
189  * the socket is a member of.
190  */
191 bool sk_net_capable(const struct sock *sk, int cap)
192 {
193         return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
194 }
195 EXPORT_SYMBOL(sk_net_capable);
196 
197 
198 #ifdef CONFIG_MEMCG_KMEM
199 int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
200 {
201         struct proto *proto;
202         int ret = 0;
203 
204         mutex_lock(&proto_list_mutex);
205         list_for_each_entry(proto, &proto_list, node) {
206                 if (proto->init_cgroup) {
207                         ret = proto->init_cgroup(memcg, ss);
208                         if (ret)
209                                 goto out;
210                 }
211         }
212 
213         mutex_unlock(&proto_list_mutex);
214         return ret;
215 out:
216         list_for_each_entry_continue_reverse(proto, &proto_list, node)
217                 if (proto->destroy_cgroup)
218                         proto->destroy_cgroup(memcg);
219         mutex_unlock(&proto_list_mutex);
220         return ret;
221 }
222 
223 void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg)
224 {
225         struct proto *proto;
226 
227         mutex_lock(&proto_list_mutex);
228         list_for_each_entry_reverse(proto, &proto_list, node)
229                 if (proto->destroy_cgroup)
230                         proto->destroy_cgroup(memcg);
231         mutex_unlock(&proto_list_mutex);
232 }
233 #endif
234 
235 /*
236  * Each address family might have different locking rules, so we have
237  * one slock key per address family:
238  */
239 static struct lock_class_key af_family_keys[AF_MAX];
240 static struct lock_class_key af_family_slock_keys[AF_MAX];
241 
242 #if defined(CONFIG_MEMCG_KMEM)
243 struct static_key memcg_socket_limit_enabled;
244 EXPORT_SYMBOL(memcg_socket_limit_enabled);
245 #endif
246 
247 /*
248  * Make lock validator output more readable. (we pre-construct these
249  * strings build-time, so that runtime initialization of socket
250  * locks is fast):
251  */
252 static const char *const af_family_key_strings[AF_MAX+1] = {
253   "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
254   "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
255   "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
256   "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
257   "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
258   "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
259   "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
260   "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
261   "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
262   "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
263   "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV"        ,
264   "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
265   "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
266   "sk_lock-AF_NFC"   , "sk_lock-AF_VSOCK"    , "sk_lock-AF_MAX"
267 };
268 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
269   "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
270   "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
271   "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
272   "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
273   "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
274   "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
275   "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
276   "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
277   "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
278   "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
279   "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
280   "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
281   "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
282   "slock-AF_NFC"   , "slock-AF_VSOCK"    ,"slock-AF_MAX"
283 };
284 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
285   "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
286   "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
287   "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
288   "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
289   "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
290   "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
291   "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
292   "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
293   "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
294   "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
295   "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
296   "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
297   "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
298   "clock-AF_NFC"   , "clock-AF_VSOCK"    , "clock-AF_MAX"
299 };
300 
301 /*
302  * sk_callback_lock locking rules are per-address-family,
303  * so split the lock classes by using a per-AF key:
304  */
305 static struct lock_class_key af_callback_keys[AF_MAX];
306 
307 /* Take into consideration the size of the struct sk_buff overhead in the
308  * determination of these values, since that is non-constant across
309  * platforms.  This makes socket queueing behavior and performance
310  * not depend upon such differences.
311  */
312 #define _SK_MEM_PACKETS         256
313 #define _SK_MEM_OVERHEAD        SKB_TRUESIZE(256)
314 #define SK_WMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
315 #define SK_RMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
316 
317 /* Run time adjustable parameters. */
318 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
319 EXPORT_SYMBOL(sysctl_wmem_max);
320 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
321 EXPORT_SYMBOL(sysctl_rmem_max);
322 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
323 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
324 
325 /* Maximal space eaten by iovec or ancillary data plus some space */
326 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
327 EXPORT_SYMBOL(sysctl_optmem_max);
328 
329 int sysctl_tstamp_allow_data __read_mostly = 1;
330 
331 struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
332 EXPORT_SYMBOL_GPL(memalloc_socks);
333 
334 /**
335  * sk_set_memalloc - sets %SOCK_MEMALLOC
336  * @sk: socket to set it on
337  *
338  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
339  * It's the responsibility of the admin to adjust min_free_kbytes
340  * to meet the requirements
341  */
342 void sk_set_memalloc(struct sock *sk)
343 {
344         sock_set_flag(sk, SOCK_MEMALLOC);
345         sk->sk_allocation |= __GFP_MEMALLOC;
346         static_key_slow_inc(&memalloc_socks);
347 }
348 EXPORT_SYMBOL_GPL(sk_set_memalloc);
349 
350 void sk_clear_memalloc(struct sock *sk)
351 {
352         sock_reset_flag(sk, SOCK_MEMALLOC);
353         sk->sk_allocation &= ~__GFP_MEMALLOC;
354         static_key_slow_dec(&memalloc_socks);
355 
356         /*
357          * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
358          * progress of swapping. SOCK_MEMALLOC may be cleared while
359          * it has rmem allocations due to the last swapfile being deactivated
360          * but there is a risk that the socket is unusable due to exceeding
361          * the rmem limits. Reclaim the reserves and obey rmem limits again.
362          */
363         sk_mem_reclaim(sk);
364 }
365 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
366 
367 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
368 {
369         int ret;
370         unsigned long pflags = current->flags;
371 
372         /* these should have been dropped before queueing */
373         BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
374 
375         current->flags |= PF_MEMALLOC;
376         ret = sk->sk_backlog_rcv(sk, skb);
377         tsk_restore_flags(current, pflags, PF_MEMALLOC);
378 
379         return ret;
380 }
381 EXPORT_SYMBOL(__sk_backlog_rcv);
382 
383 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
384 {
385         struct timeval tv;
386 
387         if (optlen < sizeof(tv))
388                 return -EINVAL;
389         if (copy_from_user(&tv, optval, sizeof(tv)))
390                 return -EFAULT;
391         if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
392                 return -EDOM;
393 
394         if (tv.tv_sec < 0) {
395                 static int warned __read_mostly;
396 
397                 *timeo_p = 0;
398                 if (warned < 10 && net_ratelimit()) {
399                         warned++;
400                         pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
401                                 __func__, current->comm, task_pid_nr(current));
402                 }
403                 return 0;
404         }
405         *timeo_p = MAX_SCHEDULE_TIMEOUT;
406         if (tv.tv_sec == 0 && tv.tv_usec == 0)
407                 return 0;
408         if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
409                 *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
410         return 0;
411 }
412 
413 static void sock_warn_obsolete_bsdism(const char *name)
414 {
415         static int warned;
416         static char warncomm[TASK_COMM_LEN];
417         if (strcmp(warncomm, current->comm) && warned < 5) {
418                 strcpy(warncomm,  current->comm);
419                 pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
420                         warncomm, name);
421                 warned++;
422         }
423 }
424 
425 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
426 {
427         if (sk->sk_flags & flags) {
428                 sk->sk_flags &= ~flags;
429                 if (!(sk->sk_flags & SK_FLAGS_TIMESTAMP))
430                         net_disable_timestamp();
431         }
432 }
433 
434 
435 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
436 {
437         int err;
438         unsigned long flags;
439         struct sk_buff_head *list = &sk->sk_receive_queue;
440 
441         if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
442                 atomic_inc(&sk->sk_drops);
443                 trace_sock_rcvqueue_full(sk, skb);
444                 return -ENOMEM;
445         }
446 
447         err = sk_filter(sk, skb);
448         if (err)
449                 return err;
450 
451         if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
452                 atomic_inc(&sk->sk_drops);
453                 return -ENOBUFS;
454         }
455 
456         skb->dev = NULL;
457         skb_set_owner_r(skb, sk);
458 
459         /* we escape from rcu protected region, make sure we dont leak
460          * a norefcounted dst
461          */
462         skb_dst_force(skb);
463 
464         spin_lock_irqsave(&list->lock, flags);
465         sock_skb_set_dropcount(sk, skb);
466         __skb_queue_tail(list, skb);
467         spin_unlock_irqrestore(&list->lock, flags);
468 
469         if (!sock_flag(sk, SOCK_DEAD))
470                 sk->sk_data_ready(sk);
471         return 0;
472 }
473 EXPORT_SYMBOL(sock_queue_rcv_skb);
474 
475 int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
476 {
477         int rc = NET_RX_SUCCESS;
478 
479         if (sk_filter(sk, skb))
480                 goto discard_and_relse;
481 
482         skb->dev = NULL;
483 
484         if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
485                 atomic_inc(&sk->sk_drops);
486                 goto discard_and_relse;
487         }
488         if (nested)
489                 bh_lock_sock_nested(sk);
490         else
491                 bh_lock_sock(sk);
492         if (!sock_owned_by_user(sk)) {
493                 /*
494                  * trylock + unlock semantics:
495                  */
496                 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
497 
498                 rc = sk_backlog_rcv(sk, skb);
499 
500                 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
501         } else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
502                 bh_unlock_sock(sk);
503                 atomic_inc(&sk->sk_drops);
504                 goto discard_and_relse;
505         }
506 
507         bh_unlock_sock(sk);
508 out:
509         sock_put(sk);
510         return rc;
511 discard_and_relse:
512         kfree_skb(skb);
513         goto out;
514 }
515 EXPORT_SYMBOL(sk_receive_skb);
516 
517 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
518 {
519         struct dst_entry *dst = __sk_dst_get(sk);
520 
521         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
522                 sk_tx_queue_clear(sk);
523                 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
524                 dst_release(dst);
525                 return NULL;
526         }
527 
528         return dst;
529 }
530 EXPORT_SYMBOL(__sk_dst_check);
531 
532 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
533 {
534         struct dst_entry *dst = sk_dst_get(sk);
535 
536         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
537                 sk_dst_reset(sk);
538                 dst_release(dst);
539                 return NULL;
540         }
541 
542         return dst;
543 }
544 EXPORT_SYMBOL(sk_dst_check);
545 
546 static int sock_setbindtodevice(struct sock *sk, char __user *optval,
547                                 int optlen)
548 {
549         int ret = -ENOPROTOOPT;
550 #ifdef CONFIG_NETDEVICES
551         struct net *net = sock_net(sk);
552         char devname[IFNAMSIZ];
553         int index;
554 
555         /* Sorry... */
556         ret = -EPERM;
557         if (!ns_capable(net->user_ns, CAP_NET_RAW))
558                 goto out;
559 
560         ret = -EINVAL;
561         if (optlen < 0)
562                 goto out;
563 
564         /* Bind this socket to a particular device like "eth0",
565          * as specified in the passed interface name. If the
566          * name is "" or the option length is zero the socket
567          * is not bound.
568          */
569         if (optlen > IFNAMSIZ - 1)
570                 optlen = IFNAMSIZ - 1;
571         memset(devname, 0, sizeof(devname));
572 
573         ret = -EFAULT;
574         if (copy_from_user(devname, optval, optlen))
575                 goto out;
576 
577         index = 0;
578         if (devname[0] != '\0') {
579                 struct net_device *dev;
580 
581                 rcu_read_lock();
582                 dev = dev_get_by_name_rcu(net, devname);
583                 if (dev)
584                         index = dev->ifindex;
585                 rcu_read_unlock();
586                 ret = -ENODEV;
587                 if (!dev)
588                         goto out;
589         }
590 
591         lock_sock(sk);
592         sk->sk_bound_dev_if = index;
593         sk_dst_reset(sk);
594         release_sock(sk);
595 
596         ret = 0;
597 
598 out:
599 #endif
600 
601         return ret;
602 }
603 
604 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
605                                 int __user *optlen, int len)
606 {
607         int ret = -ENOPROTOOPT;
608 #ifdef CONFIG_NETDEVICES
609         struct net *net = sock_net(sk);
610         char devname[IFNAMSIZ];
611 
612         if (sk->sk_bound_dev_if == 0) {
613                 len = 0;
614                 goto zero;
615         }
616 
617         ret = -EINVAL;
618         if (len < IFNAMSIZ)
619                 goto out;
620 
621         ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
622         if (ret)
623                 goto out;
624 
625         len = strlen(devname) + 1;
626 
627         ret = -EFAULT;
628         if (copy_to_user(optval, devname, len))
629                 goto out;
630 
631 zero:
632         ret = -EFAULT;
633         if (put_user(len, optlen))
634                 goto out;
635 
636         ret = 0;
637 
638 out:
639 #endif
640 
641         return ret;
642 }
643 
644 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
645 {
646         if (valbool)
647                 sock_set_flag(sk, bit);
648         else
649                 sock_reset_flag(sk, bit);
650 }
651 
652 bool sk_mc_loop(struct sock *sk)
653 {
654         if (dev_recursion_level())
655                 return false;
656         if (!sk)
657                 return true;
658         switch (sk->sk_family) {
659         case AF_INET:
660                 return inet_sk(sk)->mc_loop;
661 #if IS_ENABLED(CONFIG_IPV6)
662         case AF_INET6:
663                 return inet6_sk(sk)->mc_loop;
664 #endif
665         }
666         WARN_ON(1);
667         return true;
668 }
669 EXPORT_SYMBOL(sk_mc_loop);
670 
671 /*
672  *      This is meant for all protocols to use and covers goings on
673  *      at the socket level. Everything here is generic.
674  */
675 
676 int sock_setsockopt(struct socket *sock, int level, int optname,
677                     char __user *optval, unsigned int optlen)
678 {
679         struct sock *sk = sock->sk;
680         int val;
681         int valbool;
682         struct linger ling;
683         int ret = 0;
684 
685         /*
686          *      Options without arguments
687          */
688 
689         if (optname == SO_BINDTODEVICE)
690                 return sock_setbindtodevice(sk, optval, optlen);
691 
692         if (optlen < sizeof(int))
693                 return -EINVAL;
694 
695         if (get_user(val, (int __user *)optval))
696                 return -EFAULT;
697 
698         valbool = val ? 1 : 0;
699 
700         lock_sock(sk);
701 
702         switch (optname) {
703         case SO_DEBUG:
704                 if (val && !capable(CAP_NET_ADMIN))
705                         ret = -EACCES;
706                 else
707                         sock_valbool_flag(sk, SOCK_DBG, valbool);
708                 break;
709         case SO_REUSEADDR:
710                 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
711                 break;
712         case SO_REUSEPORT:
713                 sk->sk_reuseport = valbool;
714                 break;
715         case SO_TYPE:
716         case SO_PROTOCOL:
717         case SO_DOMAIN:
718         case SO_ERROR:
719                 ret = -ENOPROTOOPT;
720                 break;
721         case SO_DONTROUTE:
722                 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
723                 break;
724         case SO_BROADCAST:
725                 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
726                 break;
727         case SO_SNDBUF:
728                 /* Don't error on this BSD doesn't and if you think
729                  * about it this is right. Otherwise apps have to
730                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
731                  * are treated in BSD as hints
732                  */
733                 val = min_t(u32, val, sysctl_wmem_max);
734 set_sndbuf:
735                 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
736                 sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF);
737                 /* Wake up sending tasks if we upped the value. */
738                 sk->sk_write_space(sk);
739                 break;
740 
741         case SO_SNDBUFFORCE:
742                 if (!capable(CAP_NET_ADMIN)) {
743                         ret = -EPERM;
744                         break;
745                 }
746                 goto set_sndbuf;
747 
748         case SO_RCVBUF:
749                 /* Don't error on this BSD doesn't and if you think
750                  * about it this is right. Otherwise apps have to
751                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
752                  * are treated in BSD as hints
753                  */
754                 val = min_t(u32, val, sysctl_rmem_max);
755 set_rcvbuf:
756                 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
757                 /*
758                  * We double it on the way in to account for
759                  * "struct sk_buff" etc. overhead.   Applications
760                  * assume that the SO_RCVBUF setting they make will
761                  * allow that much actual data to be received on that
762                  * socket.
763                  *
764                  * Applications are unaware that "struct sk_buff" and
765                  * other overheads allocate from the receive buffer
766                  * during socket buffer allocation.
767                  *
768                  * And after considering the possible alternatives,
769                  * returning the value we actually used in getsockopt
770                  * is the most desirable behavior.
771                  */
772                 sk->sk_rcvbuf = max_t(u32, val * 2, SOCK_MIN_RCVBUF);
773                 break;
774 
775         case SO_RCVBUFFORCE:
776                 if (!capable(CAP_NET_ADMIN)) {
777                         ret = -EPERM;
778                         break;
779                 }
780                 goto set_rcvbuf;
781 
782         case SO_KEEPALIVE:
783 #ifdef CONFIG_INET
784                 if (sk->sk_protocol == IPPROTO_TCP &&
785                     sk->sk_type == SOCK_STREAM)
786                         tcp_set_keepalive(sk, valbool);
787 #endif
788                 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
789                 break;
790 
791         case SO_OOBINLINE:
792                 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
793                 break;
794 
795         case SO_NO_CHECK:
796                 sk->sk_no_check_tx = valbool;
797                 break;
798 
799         case SO_PRIORITY:
800                 if ((val >= 0 && val <= 6) ||
801                     ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
802                         sk->sk_priority = val;
803                 else
804                         ret = -EPERM;
805                 break;
806 
807         case SO_LINGER:
808                 if (optlen < sizeof(ling)) {
809                         ret = -EINVAL;  /* 1003.1g */
810                         break;
811                 }
812                 if (copy_from_user(&ling, optval, sizeof(ling))) {
813                         ret = -EFAULT;
814                         break;
815                 }
816                 if (!ling.l_onoff)
817                         sock_reset_flag(sk, SOCK_LINGER);
818                 else {
819 #if (BITS_PER_LONG == 32)
820                         if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
821                                 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
822                         else
823 #endif
824                                 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
825                         sock_set_flag(sk, SOCK_LINGER);
826                 }
827                 break;
828 
829         case SO_BSDCOMPAT:
830                 sock_warn_obsolete_bsdism("setsockopt");
831                 break;
832 
833         case SO_PASSCRED:
834                 if (valbool)
835                         set_bit(SOCK_PASSCRED, &sock->flags);
836                 else
837                         clear_bit(SOCK_PASSCRED, &sock->flags);
838                 break;
839 
840         case SO_TIMESTAMP:
841         case SO_TIMESTAMPNS:
842                 if (valbool)  {
843                         if (optname == SO_TIMESTAMP)
844                                 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
845                         else
846                                 sock_set_flag(sk, SOCK_RCVTSTAMPNS);
847                         sock_set_flag(sk, SOCK_RCVTSTAMP);
848                         sock_enable_timestamp(sk, SOCK_TIMESTAMP);
849                 } else {
850                         sock_reset_flag(sk, SOCK_RCVTSTAMP);
851                         sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
852                 }
853                 break;
854 
855         case SO_TIMESTAMPING:
856                 if (val & ~SOF_TIMESTAMPING_MASK) {
857                         ret = -EINVAL;
858                         break;
859                 }
860 
861                 if (val & SOF_TIMESTAMPING_OPT_ID &&
862                     !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
863                         if (sk->sk_protocol == IPPROTO_TCP &&
864                             sk->sk_type == SOCK_STREAM) {
865                                 if (sk->sk_state != TCP_ESTABLISHED) {
866                                         ret = -EINVAL;
867                                         break;
868                                 }
869                                 sk->sk_tskey = tcp_sk(sk)->snd_una;
870                         } else {
871                                 sk->sk_tskey = 0;
872                         }
873                 }
874                 sk->sk_tsflags = val;
875                 if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
876                         sock_enable_timestamp(sk,
877                                               SOCK_TIMESTAMPING_RX_SOFTWARE);
878                 else
879                         sock_disable_timestamp(sk,
880                                                (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
881                 break;
882 
883         case SO_RCVLOWAT:
884                 if (val < 0)
885                         val = INT_MAX;
886                 sk->sk_rcvlowat = val ? : 1;
887                 break;
888 
889         case SO_RCVTIMEO:
890                 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
891                 break;
892 
893         case SO_SNDTIMEO:
894                 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
895                 break;
896 
897         case SO_ATTACH_FILTER:
898                 ret = -EINVAL;
899                 if (optlen == sizeof(struct sock_fprog)) {
900                         struct sock_fprog fprog;
901 
902                         ret = -EFAULT;
903                         if (copy_from_user(&fprog, optval, sizeof(fprog)))
904                                 break;
905 
906                         ret = sk_attach_filter(&fprog, sk);
907                 }
908                 break;
909 
910         case SO_ATTACH_BPF:
911                 ret = -EINVAL;
912                 if (optlen == sizeof(u32)) {
913                         u32 ufd;
914 
915                         ret = -EFAULT;
916                         if (copy_from_user(&ufd, optval, sizeof(ufd)))
917                                 break;
918 
919                         ret = sk_attach_bpf(ufd, sk);
920                 }
921                 break;
922 
923         case SO_DETACH_FILTER:
924                 ret = sk_detach_filter(sk);
925                 break;
926 
927         case SO_LOCK_FILTER:
928                 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
929                         ret = -EPERM;
930                 else
931                         sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
932                 break;
933 
934         case SO_PASSSEC:
935                 if (valbool)
936                         set_bit(SOCK_PASSSEC, &sock->flags);
937                 else
938                         clear_bit(SOCK_PASSSEC, &sock->flags);
939                 break;
940         case SO_MARK:
941                 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
942                         ret = -EPERM;
943                 else
944                         sk->sk_mark = val;
945                 break;
946 
947         case SO_RXQ_OVFL:
948                 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
949                 break;
950 
951         case SO_WIFI_STATUS:
952                 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
953                 break;
954 
955         case SO_PEEK_OFF:
956                 if (sock->ops->set_peek_off)
957                         ret = sock->ops->set_peek_off(sk, val);
958                 else
959                         ret = -EOPNOTSUPP;
960                 break;
961 
962         case SO_NOFCS:
963                 sock_valbool_flag(sk, SOCK_NOFCS, valbool);
964                 break;
965 
966         case SO_SELECT_ERR_QUEUE:
967                 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
968                 break;
969 
970 #ifdef CONFIG_NET_RX_BUSY_POLL
971         case SO_BUSY_POLL:
972                 /* allow unprivileged users to decrease the value */
973                 if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
974                         ret = -EPERM;
975                 else {
976                         if (val < 0)
977                                 ret = -EINVAL;
978                         else
979                                 sk->sk_ll_usec = val;
980                 }
981                 break;
982 #endif
983 
984         case SO_MAX_PACING_RATE:
985                 sk->sk_max_pacing_rate = val;
986                 sk->sk_pacing_rate = min(sk->sk_pacing_rate,
987                                          sk->sk_max_pacing_rate);
988                 break;
989 
990         default:
991                 ret = -ENOPROTOOPT;
992                 break;
993         }
994         release_sock(sk);
995         return ret;
996 }
997 EXPORT_SYMBOL(sock_setsockopt);
998 
999 
1000 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1001                           struct ucred *ucred)
1002 {
1003         ucred->pid = pid_vnr(pid);
1004         ucred->uid = ucred->gid = -1;
1005         if (cred) {
1006                 struct user_namespace *current_ns = current_user_ns();
1007 
1008                 ucred->uid = from_kuid_munged(current_ns, cred->euid);
1009                 ucred->gid = from_kgid_munged(current_ns, cred->egid);
1010         }
1011 }
1012 
1013 int sock_getsockopt(struct socket *sock, int level, int optname,
1014                     char __user *optval, int __user *optlen)
1015 {
1016         struct sock *sk = sock->sk;
1017 
1018         union {
1019                 int val;
1020                 struct linger ling;
1021                 struct timeval tm;
1022         } v;
1023 
1024         int lv = sizeof(int);
1025         int len;
1026 
1027         if (get_user(len, optlen))
1028                 return -EFAULT;
1029         if (len < 0)
1030                 return -EINVAL;
1031 
1032         memset(&v, 0, sizeof(v));
1033 
1034         switch (optname) {
1035         case SO_DEBUG:
1036                 v.val = sock_flag(sk, SOCK_DBG);
1037                 break;
1038 
1039         case SO_DONTROUTE:
1040                 v.val = sock_flag(sk, SOCK_LOCALROUTE);
1041                 break;
1042 
1043         case SO_BROADCAST:
1044                 v.val = sock_flag(sk, SOCK_BROADCAST);
1045                 break;
1046 
1047         case SO_SNDBUF:
1048                 v.val = sk->sk_sndbuf;
1049                 break;
1050 
1051         case SO_RCVBUF:
1052                 v.val = sk->sk_rcvbuf;
1053                 break;
1054 
1055         case SO_REUSEADDR:
1056                 v.val = sk->sk_reuse;
1057                 break;
1058 
1059         case SO_REUSEPORT:
1060                 v.val = sk->sk_reuseport;
1061                 break;
1062 
1063         case SO_KEEPALIVE:
1064                 v.val = sock_flag(sk, SOCK_KEEPOPEN);
1065                 break;
1066 
1067         case SO_TYPE:
1068                 v.val = sk->sk_type;
1069                 break;
1070 
1071         case SO_PROTOCOL:
1072                 v.val = sk->sk_protocol;
1073                 break;
1074 
1075         case SO_DOMAIN:
1076                 v.val = sk->sk_family;
1077                 break;
1078 
1079         case SO_ERROR:
1080                 v.val = -sock_error(sk);
1081                 if (v.val == 0)
1082                         v.val = xchg(&sk->sk_err_soft, 0);
1083                 break;
1084 
1085         case SO_OOBINLINE:
1086                 v.val = sock_flag(sk, SOCK_URGINLINE);
1087                 break;
1088 
1089         case SO_NO_CHECK:
1090                 v.val = sk->sk_no_check_tx;
1091                 break;
1092 
1093         case SO_PRIORITY:
1094                 v.val = sk->sk_priority;
1095                 break;
1096 
1097         case SO_LINGER:
1098                 lv              = sizeof(v.ling);
1099                 v.ling.l_onoff  = sock_flag(sk, SOCK_LINGER);
1100                 v.ling.l_linger = sk->sk_lingertime / HZ;
1101                 break;
1102 
1103         case SO_BSDCOMPAT:
1104                 sock_warn_obsolete_bsdism("getsockopt");
1105                 break;
1106 
1107         case SO_TIMESTAMP:
1108                 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1109                                 !sock_flag(sk, SOCK_RCVTSTAMPNS);
1110                 break;
1111 
1112         case SO_TIMESTAMPNS:
1113                 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
1114                 break;
1115 
1116         case SO_TIMESTAMPING:
1117                 v.val = sk->sk_tsflags;
1118                 break;
1119 
1120         case SO_RCVTIMEO:
1121                 lv = sizeof(struct timeval);
1122                 if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
1123                         v.tm.tv_sec = 0;
1124                         v.tm.tv_usec = 0;
1125                 } else {
1126                         v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1127                         v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
1128                 }
1129                 break;
1130 
1131         case SO_SNDTIMEO:
1132                 lv = sizeof(struct timeval);
1133                 if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1134                         v.tm.tv_sec = 0;
1135                         v.tm.tv_usec = 0;
1136                 } else {
1137                         v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1138                         v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
1139                 }
1140                 break;
1141 
1142         case SO_RCVLOWAT:
1143                 v.val = sk->sk_rcvlowat;
1144                 break;
1145 
1146         case SO_SNDLOWAT:
1147                 v.val = 1;
1148                 break;
1149 
1150         case SO_PASSCRED:
1151                 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1152                 break;
1153 
1154         case SO_PEERCRED:
1155         {
1156                 struct ucred peercred;
1157                 if (len > sizeof(peercred))
1158                         len = sizeof(peercred);
1159                 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1160                 if (copy_to_user(optval, &peercred, len))
1161                         return -EFAULT;
1162                 goto lenout;
1163         }
1164 
1165         case SO_PEERNAME:
1166         {
1167                 char address[128];
1168 
1169                 if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
1170                         return -ENOTCONN;
1171                 if (lv < len)
1172                         return -EINVAL;
1173                 if (copy_to_user(optval, address, len))
1174                         return -EFAULT;
1175                 goto lenout;
1176         }
1177 
1178         /* Dubious BSD thing... Probably nobody even uses it, but
1179          * the UNIX standard wants it for whatever reason... -DaveM
1180          */
1181         case SO_ACCEPTCONN:
1182                 v.val = sk->sk_state == TCP_LISTEN;
1183                 break;
1184 
1185         case SO_PASSSEC:
1186                 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1187                 break;
1188 
1189         case SO_PEERSEC:
1190                 return security_socket_getpeersec_stream(sock, optval, optlen, len);
1191 
1192         case SO_MARK:
1193                 v.val = sk->sk_mark;
1194                 break;
1195 
1196         case SO_RXQ_OVFL:
1197                 v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1198                 break;
1199 
1200         case SO_WIFI_STATUS:
1201                 v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1202                 break;
1203 
1204         case SO_PEEK_OFF:
1205                 if (!sock->ops->set_peek_off)
1206                         return -EOPNOTSUPP;
1207 
1208                 v.val = sk->sk_peek_off;
1209                 break;
1210         case SO_NOFCS:
1211                 v.val = sock_flag(sk, SOCK_NOFCS);
1212                 break;
1213 
1214         case SO_BINDTODEVICE:
1215                 return sock_getbindtodevice(sk, optval, optlen, len);
1216 
1217         case SO_GET_FILTER:
1218                 len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1219                 if (len < 0)
1220                         return len;
1221 
1222                 goto lenout;
1223 
1224         case SO_LOCK_FILTER:
1225                 v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1226                 break;
1227 
1228         case SO_BPF_EXTENSIONS:
1229                 v.val = bpf_tell_extensions();
1230                 break;
1231 
1232         case SO_SELECT_ERR_QUEUE:
1233                 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1234                 break;
1235 
1236 #ifdef CONFIG_NET_RX_BUSY_POLL
1237         case SO_BUSY_POLL:
1238                 v.val = sk->sk_ll_usec;
1239                 break;
1240 #endif
1241 
1242         case SO_MAX_PACING_RATE:
1243                 v.val = sk->sk_max_pacing_rate;
1244                 break;
1245 
1246         case SO_INCOMING_CPU:
1247                 v.val = sk->sk_incoming_cpu;
1248                 break;
1249 
1250         default:
1251                 /* We implement the SO_SNDLOWAT etc to not be settable
1252                  * (1003.1g 7).
1253                  */
1254                 return -ENOPROTOOPT;
1255         }
1256 
1257         if (len > lv)
1258                 len = lv;
1259         if (copy_to_user(optval, &v, len))
1260                 return -EFAULT;
1261 lenout:
1262         if (put_user(len, optlen))
1263                 return -EFAULT;
1264         return 0;
1265 }
1266 
1267 /*
1268  * Initialize an sk_lock.
1269  *
1270  * (We also register the sk_lock with the lock validator.)
1271  */
1272 static inline void sock_lock_init(struct sock *sk)
1273 {
1274         sock_lock_init_class_and_name(sk,
1275                         af_family_slock_key_strings[sk->sk_family],
1276                         af_family_slock_keys + sk->sk_family,
1277                         af_family_key_strings[sk->sk_family],
1278                         af_family_keys + sk->sk_family);
1279 }
1280 
1281 /*
1282  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1283  * even temporarly, because of RCU lookups. sk_node should also be left as is.
1284  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1285  */
1286 static void sock_copy(struct sock *nsk, const struct sock *osk)
1287 {
1288 #ifdef CONFIG_SECURITY_NETWORK
1289         void *sptr = nsk->sk_security;
1290 #endif
1291         memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1292 
1293         memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1294                osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1295 
1296 #ifdef CONFIG_SECURITY_NETWORK
1297         nsk->sk_security = sptr;
1298         security_sk_clone(osk, nsk);
1299 #endif
1300 }
1301 
1302 void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
1303 {
1304         unsigned long nulls1, nulls2;
1305 
1306         nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
1307         nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
1308         if (nulls1 > nulls2)
1309                 swap(nulls1, nulls2);
1310 
1311         if (nulls1 != 0)
1312                 memset((char *)sk, 0, nulls1);
1313         memset((char *)sk + nulls1 + sizeof(void *), 0,
1314                nulls2 - nulls1 - sizeof(void *));
1315         memset((char *)sk + nulls2 + sizeof(void *), 0,
1316                size - nulls2 - sizeof(void *));
1317 }
1318 EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);
1319 
1320 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1321                 int family)
1322 {
1323         struct sock *sk;
1324         struct kmem_cache *slab;
1325 
1326         slab = prot->slab;
1327         if (slab != NULL) {
1328                 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1329                 if (!sk)
1330                         return sk;
1331                 if (priority & __GFP_ZERO) {
1332                         if (prot->clear_sk)
1333                                 prot->clear_sk(sk, prot->obj_size);
1334                         else
1335                                 sk_prot_clear_nulls(sk, prot->obj_size);
1336                 }
1337         } else
1338                 sk = kmalloc(prot->obj_size, priority);
1339 
1340         if (sk != NULL) {
1341                 kmemcheck_annotate_bitfield(sk, flags);
1342 
1343                 if (security_sk_alloc(sk, family, priority))
1344                         goto out_free;
1345 
1346                 if (!try_module_get(prot->owner))
1347                         goto out_free_sec;
1348                 sk_tx_queue_clear(sk);
1349         }
1350 
1351         return sk;
1352 
1353 out_free_sec:
1354         security_sk_free(sk);
1355 out_free:
1356         if (slab != NULL)
1357                 kmem_cache_free(slab, sk);
1358         else
1359                 kfree(sk);
1360         return NULL;
1361 }
1362 
1363 static void sk_prot_free(struct proto *prot, struct sock *sk)
1364 {
1365         struct kmem_cache *slab;
1366         struct module *owner;
1367 
1368         owner = prot->owner;
1369         slab = prot->slab;
1370 
1371         security_sk_free(sk);
1372         if (slab != NULL)
1373                 kmem_cache_free(slab, sk);
1374         else
1375                 kfree(sk);
1376         module_put(owner);
1377 }
1378 
1379 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
1380 void sock_update_netprioidx(struct sock *sk)
1381 {
1382         if (in_interrupt())
1383                 return;
1384 
1385         sk->sk_cgrp_prioidx = task_netprioidx(current);
1386 }
1387 EXPORT_SYMBOL_GPL(sock_update_netprioidx);
1388 #endif
1389 
1390 /**
1391  *      sk_alloc - All socket objects are allocated here
1392  *      @net: the applicable net namespace
1393  *      @family: protocol family
1394  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1395  *      @prot: struct proto associated with this new sock instance
1396  *      @kern: is this to be a kernel socket?
1397  */
1398 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1399                       struct proto *prot, int kern)
1400 {
1401         struct sock *sk;
1402 
1403         sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1404         if (sk) {
1405                 sk->sk_family = family;
1406                 /*
1407                  * See comment in struct sock definition to understand
1408                  * why we need sk_prot_creator -acme
1409                  */
1410                 sk->sk_prot = sk->sk_prot_creator = prot;
1411                 sock_lock_init(sk);
1412                 sk->sk_net_refcnt = kern ? 0 : 1;
1413                 if (likely(sk->sk_net_refcnt))
1414                         get_net(net);
1415                 sock_net_set(sk, net);
1416                 atomic_set(&sk->sk_wmem_alloc, 1);
1417 
1418                 sock_update_classid(sk);
1419                 sock_update_netprioidx(sk);
1420         }
1421 
1422         return sk;
1423 }
1424 EXPORT_SYMBOL(sk_alloc);
1425 
1426 void sk_destruct(struct sock *sk)
1427 {
1428         struct sk_filter *filter;
1429 
1430         if (sk->sk_destruct)
1431                 sk->sk_destruct(sk);
1432 
1433         filter = rcu_dereference_check(sk->sk_filter,
1434                                        atomic_read(&sk->sk_wmem_alloc) == 0);
1435         if (filter) {
1436                 sk_filter_uncharge(sk, filter);
1437                 RCU_INIT_POINTER(sk->sk_filter, NULL);
1438         }
1439 
1440         sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1441 
1442         if (atomic_read(&sk->sk_omem_alloc))
1443                 pr_debug("%s: optmem leakage (%d bytes) detected\n",
1444                          __func__, atomic_read(&sk->sk_omem_alloc));
1445 
1446         if (sk->sk_peer_cred)
1447                 put_cred(sk->sk_peer_cred);
1448         put_pid(sk->sk_peer_pid);
1449         if (likely(sk->sk_net_refcnt))
1450                 put_net(sock_net(sk));
1451         sk_prot_free(sk->sk_prot_creator, sk);
1452 }
1453 
1454 static void __sk_free(struct sock *sk)
1455 {
1456         if (unlikely(sock_diag_has_destroy_listeners(sk) && sk->sk_net_refcnt))
1457                 sock_diag_broadcast_destroy(sk);
1458         else
1459                 sk_destruct(sk);
1460 }
1461 
1462 void sk_free(struct sock *sk)
1463 {
1464         /*
1465          * We subtract one from sk_wmem_alloc and can know if
1466          * some packets are still in some tx queue.
1467          * If not null, sock_wfree() will call __sk_free(sk) later
1468          */
1469         if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1470                 __sk_free(sk);
1471 }
1472 EXPORT_SYMBOL(sk_free);
1473 
1474 static void sk_update_clone(const struct sock *sk, struct sock *newsk)
1475 {
1476         if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
1477                 sock_update_memcg(newsk);
1478 }
1479 
1480 /**
1481  *      sk_clone_lock - clone a socket, and lock its clone
1482  *      @sk: the socket to clone
1483  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1484  *
1485  *      Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1486  */
1487 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1488 {
1489         struct sock *newsk;
1490         bool is_charged = true;
1491 
1492         newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1493         if (newsk != NULL) {
1494                 struct sk_filter *filter;
1495 
1496                 sock_copy(newsk, sk);
1497 
1498                 /* SANITY */
1499                 if (likely(newsk->sk_net_refcnt))
1500                         get_net(sock_net(newsk));
1501                 sk_node_init(&newsk->sk_node);
1502                 sock_lock_init(newsk);
1503                 bh_lock_sock(newsk);
1504                 newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
1505                 newsk->sk_backlog.len = 0;
1506 
1507                 atomic_set(&newsk->sk_rmem_alloc, 0);
1508                 /*
1509                  * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1510                  */
1511                 atomic_set(&newsk->sk_wmem_alloc, 1);
1512                 atomic_set(&newsk->sk_omem_alloc, 0);
1513                 skb_queue_head_init(&newsk->sk_receive_queue);
1514                 skb_queue_head_init(&newsk->sk_write_queue);
1515 
1516                 spin_lock_init(&newsk->sk_dst_lock);
1517                 rwlock_init(&newsk->sk_callback_lock);
1518                 lockdep_set_class_and_name(&newsk->sk_callback_lock,
1519                                 af_callback_keys + newsk->sk_family,
1520                                 af_family_clock_key_strings[newsk->sk_family]);
1521 
1522                 newsk->sk_dst_cache     = NULL;
1523                 newsk->sk_wmem_queued   = 0;
1524                 newsk->sk_forward_alloc = 0;
1525                 newsk->sk_send_head     = NULL;
1526                 newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1527 
1528                 sock_reset_flag(newsk, SOCK_DONE);
1529                 skb_queue_head_init(&newsk->sk_error_queue);
1530 
1531                 filter = rcu_dereference_protected(newsk->sk_filter, 1);
1532                 if (filter != NULL)
1533                         /* though it's an empty new sock, the charging may fail
1534                          * if sysctl_optmem_max was changed between creation of
1535                          * original socket and cloning
1536                          */
1537                         is_charged = sk_filter_charge(newsk, filter);
1538 
1539                 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk))) {
1540                         /* It is still raw copy of parent, so invalidate
1541                          * destructor and make plain sk_free() */
1542                         newsk->sk_destruct = NULL;
1543                         bh_unlock_sock(newsk);
1544                         sk_free(newsk);
1545                         newsk = NULL;
1546                         goto out;
1547                 }
1548 
1549                 newsk->sk_err      = 0;
1550                 newsk->sk_priority = 0;
1551                 newsk->sk_incoming_cpu = raw_smp_processor_id();
1552                 atomic64_set(&newsk->sk_cookie, 0);
1553                 /*
1554                  * Before updating sk_refcnt, we must commit prior changes to memory
1555                  * (Documentation/RCU/rculist_nulls.txt for details)
1556                  */
1557                 smp_wmb();
1558                 atomic_set(&newsk->sk_refcnt, 2);
1559 
1560                 /*
1561                  * Increment the counter in the same struct proto as the master
1562                  * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1563                  * is the same as sk->sk_prot->socks, as this field was copied
1564                  * with memcpy).
1565                  *
1566                  * This _changes_ the previous behaviour, where
1567                  * tcp_create_openreq_child always was incrementing the
1568                  * equivalent to tcp_prot->socks (inet_sock_nr), so this have
1569                  * to be taken into account in all callers. -acme
1570                  */
1571                 sk_refcnt_debug_inc(newsk);
1572                 sk_set_socket(newsk, NULL);
1573                 newsk->sk_wq = NULL;
1574 
1575                 sk_update_clone(sk, newsk);
1576 
1577                 if (newsk->sk_prot->sockets_allocated)
1578                         sk_sockets_allocated_inc(newsk);
1579 
1580                 if (newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1581                         net_enable_timestamp();
1582         }
1583 out:
1584         return newsk;
1585 }
1586 EXPORT_SYMBOL_GPL(sk_clone_lock);
1587 
1588 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1589 {
1590         u32 max_segs = 1;
1591 
1592         __sk_dst_set(sk, dst);
1593         sk->sk_route_caps = dst->dev->features;
1594         if (sk->sk_route_caps & NETIF_F_GSO)
1595                 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1596         sk->sk_route_caps &= ~sk->sk_route_nocaps;
1597         if (sk_can_gso(sk)) {
1598                 if (dst->header_len) {
1599                         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1600                 } else {
1601                         sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1602                         sk->sk_gso_max_size = dst->dev->gso_max_size;
1603                         max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
1604                 }
1605         }
1606         sk->sk_gso_max_segs = max_segs;
1607 }
1608 EXPORT_SYMBOL_GPL(sk_setup_caps);
1609 
1610 /*
1611  *      Simple resource managers for sockets.
1612  */
1613 
1614 
1615 /*
1616  * Write buffer destructor automatically called from kfree_skb.
1617  */
1618 void sock_wfree(struct sk_buff *skb)
1619 {
1620         struct sock *sk = skb->sk;
1621         unsigned int len = skb->truesize;
1622 
1623         if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1624                 /*
1625                  * Keep a reference on sk_wmem_alloc, this will be released
1626                  * after sk_write_space() call
1627                  */
1628                 atomic_sub(len - 1, &sk->sk_wmem_alloc);
1629                 sk->sk_write_space(sk);
1630                 len = 1;
1631         }
1632         /*
1633          * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1634          * could not do because of in-flight packets
1635          */
1636         if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1637                 __sk_free(sk);
1638 }
1639 EXPORT_SYMBOL(sock_wfree);
1640 
1641 void skb_orphan_partial(struct sk_buff *skb)
1642 {
1643         /* TCP stack sets skb->ooo_okay based on sk_wmem_alloc,
1644          * so we do not completely orphan skb, but transfert all
1645          * accounted bytes but one, to avoid unexpected reorders.
1646          */
1647         if (skb->destructor == sock_wfree
1648 #ifdef CONFIG_INET
1649             || skb->destructor == tcp_wfree
1650 #endif
1651                 ) {
1652                 atomic_sub(skb->truesize - 1, &skb->sk->sk_wmem_alloc);
1653                 skb->truesize = 1;
1654         } else {
1655                 skb_orphan(skb);
1656         }
1657 }
1658 EXPORT_SYMBOL(skb_orphan_partial);
1659 
1660 /*
1661  * Read buffer destructor automatically called from kfree_skb.
1662  */
1663 void sock_rfree(struct sk_buff *skb)
1664 {
1665         struct sock *sk = skb->sk;
1666         unsigned int len = skb->truesize;
1667 
1668         atomic_sub(len, &sk->sk_rmem_alloc);
1669         sk_mem_uncharge(sk, len);
1670 }
1671 EXPORT_SYMBOL(sock_rfree);
1672 
1673 /*
1674  * Buffer destructor for skbs that are not used directly in read or write
1675  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
1676  */
1677 void sock_efree(struct sk_buff *skb)
1678 {
1679         sock_put(skb->sk);
1680 }
1681 EXPORT_SYMBOL(sock_efree);
1682 
1683 kuid_t sock_i_uid(struct sock *sk)
1684 {
1685         kuid_t uid;
1686 
1687         read_lock_bh(&sk->sk_callback_lock);
1688         uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1689         read_unlock_bh(&sk->sk_callback_lock);
1690         return uid;
1691 }
1692 EXPORT_SYMBOL(sock_i_uid);
1693 
1694 unsigned long sock_i_ino(struct sock *sk)
1695 {
1696         unsigned long ino;
1697 
1698         read_lock_bh(&sk->sk_callback_lock);
1699         ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1700         read_unlock_bh(&sk->sk_callback_lock);
1701         return ino;
1702 }
1703 EXPORT_SYMBOL(sock_i_ino);
1704 
1705 /*
1706  * Allocate a skb from the socket's send buffer.
1707  */
1708 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1709                              gfp_t priority)
1710 {
1711         if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1712                 struct sk_buff *skb = alloc_skb(size, priority);
1713                 if (skb) {
1714                         skb_set_owner_w(skb, sk);
1715                         return skb;
1716                 }
1717         }
1718         return NULL;
1719 }
1720 EXPORT_SYMBOL(sock_wmalloc);
1721 
1722 /*
1723  * Allocate a memory block from the socket's option memory buffer.
1724  */
1725 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1726 {
1727         if ((unsigned int)size <= sysctl_optmem_max &&
1728             atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1729                 void *mem;
1730                 /* First do the add, to avoid the race if kmalloc
1731                  * might sleep.
1732                  */
1733                 atomic_add(size, &sk->sk_omem_alloc);
1734                 mem = kmalloc(size, priority);
1735                 if (mem)
1736                         return mem;
1737                 atomic_sub(size, &sk->sk_omem_alloc);
1738         }
1739         return NULL;
1740 }
1741 EXPORT_SYMBOL(sock_kmalloc);
1742 
1743 /* Free an option memory block. Note, we actually want the inline
1744  * here as this allows gcc to detect the nullify and fold away the
1745  * condition entirely.
1746  */
1747 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
1748                                   const bool nullify)
1749 {
1750         if (WARN_ON_ONCE(!mem))
1751                 return;
1752         if (nullify)
1753                 kzfree(mem);
1754         else
1755                 kfree(mem);
1756         atomic_sub(size, &sk->sk_omem_alloc);
1757 }
1758 
1759 void sock_kfree_s(struct sock *sk, void *mem, int size)
1760 {
1761         __sock_kfree_s(sk, mem, size, false);
1762 }
1763 EXPORT_SYMBOL(sock_kfree_s);
1764 
1765 void sock_kzfree_s(struct sock *sk, void *mem, int size)
1766 {
1767         __sock_kfree_s(sk, mem, size, true);
1768 }
1769 EXPORT_SYMBOL(sock_kzfree_s);
1770 
1771 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1772    I think, these locks should be removed for datagram sockets.
1773  */
1774 static long sock_wait_for_wmem(struct sock *sk, long timeo)
1775 {
1776         DEFINE_WAIT(wait);
1777 
1778         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1779         for (;;) {
1780                 if (!timeo)
1781                         break;
1782                 if (signal_pending(current))
1783                         break;
1784                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1785                 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1786                 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1787                         break;
1788                 if (sk->sk_shutdown & SEND_SHUTDOWN)
1789                         break;
1790                 if (sk->sk_err)
1791                         break;
1792                 timeo = schedule_timeout(timeo);
1793         }
1794         finish_wait(sk_sleep(sk), &wait);
1795         return timeo;
1796 }
1797 
1798 
1799 /*
1800  *      Generic send/receive buffer handlers
1801  */
1802 
1803 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1804                                      unsigned long data_len, int noblock,
1805                                      int *errcode, int max_page_order)
1806 {
1807         struct sk_buff *skb;
1808         long timeo;
1809         int err;
1810 
1811         timeo = sock_sndtimeo(sk, noblock);
1812         for (;;) {
1813                 err = sock_error(sk);
1814                 if (err != 0)
1815                         goto failure;
1816 
1817                 err = -EPIPE;
1818                 if (sk->sk_shutdown & SEND_SHUTDOWN)
1819                         goto failure;
1820 
1821                 if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
1822                         break;
1823 
1824                 set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1825                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1826                 err = -EAGAIN;
1827                 if (!timeo)
1828                         goto failure;
1829                 if (signal_pending(current))
1830                         goto interrupted;
1831                 timeo = sock_wait_for_wmem(sk, timeo);
1832         }
1833         skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
1834                                    errcode, sk->sk_allocation);
1835         if (skb)
1836                 skb_set_owner_w(skb, sk);
1837         return skb;
1838 
1839 interrupted:
1840         err = sock_intr_errno(timeo);
1841 failure:
1842         *errcode = err;
1843         return NULL;
1844 }
1845 EXPORT_SYMBOL(sock_alloc_send_pskb);
1846 
1847 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1848                                     int noblock, int *errcode)
1849 {
1850         return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
1851 }
1852 EXPORT_SYMBOL(sock_alloc_send_skb);
1853 
1854 /* On 32bit arches, an skb frag is limited to 2^15 */
1855 #define SKB_FRAG_PAGE_ORDER     get_order(32768)
1856 
1857 /**
1858  * skb_page_frag_refill - check that a page_frag contains enough room
1859  * @sz: minimum size of the fragment we want to get
1860  * @pfrag: pointer to page_frag
1861  * @gfp: priority for memory allocation
1862  *
1863  * Note: While this allocator tries to use high order pages, there is
1864  * no guarantee that allocations succeed. Therefore, @sz MUST be
1865  * less or equal than PAGE_SIZE.
1866  */
1867 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
1868 {
1869         if (pfrag->page) {
1870                 if (atomic_read(&pfrag->page->_count) == 1) {
1871                         pfrag->offset = 0;
1872                         return true;
1873                 }
1874                 if (pfrag->offset + sz <= pfrag->size)
1875                         return true;
1876                 put_page(pfrag->page);
1877         }
1878 
1879         pfrag->offset = 0;
1880         if (SKB_FRAG_PAGE_ORDER) {
1881                 pfrag->page = alloc_pages((gfp & ~__GFP_WAIT) | __GFP_COMP |
1882                                           __GFP_NOWARN | __GFP_NORETRY,
1883                                           SKB_FRAG_PAGE_ORDER);
1884                 if (likely(pfrag->page)) {
1885                         pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
1886                         return true;
1887                 }
1888         }
1889         pfrag->page = alloc_page(gfp);
1890         if (likely(pfrag->page)) {
1891                 pfrag->size = PAGE_SIZE;
1892                 return true;
1893         }
1894         return false;
1895 }
1896 EXPORT_SYMBOL(skb_page_frag_refill);
1897 
1898 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
1899 {
1900         if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
1901                 return true;
1902 
1903         sk_enter_memory_pressure(sk);
1904         sk_stream_moderate_sndbuf(sk);
1905         return false;
1906 }
1907 EXPORT_SYMBOL(sk_page_frag_refill);
1908 
1909 static void __lock_sock(struct sock *sk)
1910         __releases(&sk->sk_lock.slock)
1911         __acquires(&sk->sk_lock.slock)
1912 {
1913         DEFINE_WAIT(wait);
1914 
1915         for (;;) {
1916                 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1917                                         TASK_UNINTERRUPTIBLE);
1918                 spin_unlock_bh(&sk->sk_lock.slock);
1919                 schedule();
1920                 spin_lock_bh(&sk->sk_lock.slock);
1921                 if (!sock_owned_by_user(sk))
1922                         break;
1923         }
1924         finish_wait(&sk->sk_lock.wq, &wait);
1925 }
1926 
1927 static void __release_sock(struct sock *sk)
1928         __releases(&sk->sk_lock.slock)
1929         __acquires(&sk->sk_lock.slock)
1930 {
1931         struct sk_buff *skb = sk->sk_backlog.head;
1932 
1933         do {
1934                 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1935                 bh_unlock_sock(sk);
1936 
1937                 do {
1938                         struct sk_buff *next = skb->next;
1939 
1940                         prefetch(next);
1941                         WARN_ON_ONCE(skb_dst_is_noref(skb));
1942                         skb->next = NULL;
1943                         sk_backlog_rcv(sk, skb);
1944 
1945                         /*
1946                          * We are in process context here with softirqs
1947                          * disabled, use cond_resched_softirq() to preempt.
1948                          * This is safe to do because we've taken the backlog
1949                          * queue private:
1950                          */
1951                         cond_resched_softirq();
1952 
1953                         skb = next;
1954                 } while (skb != NULL);
1955 
1956                 bh_lock_sock(sk);
1957         } while ((skb = sk->sk_backlog.head) != NULL);
1958 
1959         /*
1960          * Doing the zeroing here guarantee we can not loop forever
1961          * while a wild producer attempts to flood us.
1962          */
1963         sk->sk_backlog.len = 0;
1964 }
1965 
1966 /**
1967  * sk_wait_data - wait for data to arrive at sk_receive_queue
1968  * @sk:    sock to wait on
1969  * @timeo: for how long
1970  * @skb:   last skb seen on sk_receive_queue
1971  *
1972  * Now socket state including sk->sk_err is changed only under lock,
1973  * hence we may omit checks after joining wait queue.
1974  * We check receive queue before schedule() only as optimization;
1975  * it is very likely that release_sock() added new data.
1976  */
1977 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
1978 {
1979         int rc;
1980         DEFINE_WAIT(wait);
1981 
1982         prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1983         set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1984         rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb);
1985         clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1986         finish_wait(sk_sleep(sk), &wait);
1987         return rc;
1988 }
1989 EXPORT_SYMBOL(sk_wait_data);
1990 
1991 /**
1992  *      __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
1993  *      @sk: socket
1994  *      @size: memory size to allocate
1995  *      @kind: allocation type
1996  *
1997  *      If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
1998  *      rmem allocation. This function assumes that protocols which have
1999  *      memory_pressure use sk_wmem_queued as write buffer accounting.
2000  */
2001 int __sk_mem_schedule(struct sock *sk, int size, int kind)
2002 {
2003         struct proto *prot = sk->sk_prot;
2004         int amt = sk_mem_pages(size);
2005         long allocated;
2006         int parent_status = UNDER_LIMIT;
2007 
2008         sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
2009 
2010         allocated = sk_memory_allocated_add(sk, amt, &parent_status);
2011 
2012         /* Under limit. */
2013         if (parent_status == UNDER_LIMIT &&
2014                         allocated <= sk_prot_mem_limits(sk, 0)) {
2015                 sk_leave_memory_pressure(sk);
2016                 return 1;
2017         }
2018 
2019         /* Under pressure. (we or our parents) */
2020         if ((parent_status > SOFT_LIMIT) ||
2021                         allocated > sk_prot_mem_limits(sk, 1))
2022                 sk_enter_memory_pressure(sk);
2023 
2024         /* Over hard limit (we or our parents) */
2025         if ((parent_status == OVER_LIMIT) ||
2026                         (allocated > sk_prot_mem_limits(sk, 2)))
2027                 goto suppress_allocation;
2028 
2029         /* guarantee minimum buffer size under pressure */
2030         if (kind == SK_MEM_RECV) {
2031                 if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
2032                         return 1;
2033 
2034         } else { /* SK_MEM_SEND */
2035                 if (sk->sk_type == SOCK_STREAM) {
2036                         if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
2037                                 return 1;
2038                 } else if (atomic_read(&sk->sk_wmem_alloc) <
2039                            prot->sysctl_wmem[0])
2040                                 return 1;
2041         }
2042 
2043         if (sk_has_memory_pressure(sk)) {
2044                 int alloc;
2045 
2046                 if (!sk_under_memory_pressure(sk))
2047                         return 1;
2048                 alloc = sk_sockets_allocated_read_positive(sk);
2049                 if (sk_prot_mem_limits(sk, 2) > alloc *
2050                     sk_mem_pages(sk->sk_wmem_queued +
2051                                  atomic_read(&sk->sk_rmem_alloc) +
2052                                  sk->sk_forward_alloc))
2053                         return 1;
2054         }
2055 
2056 suppress_allocation:
2057 
2058         if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2059                 sk_stream_moderate_sndbuf(sk);
2060 
2061                 /* Fail only if socket is _under_ its sndbuf.
2062                  * In this case we cannot block, so that we have to fail.
2063                  */
2064                 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2065                         return 1;
2066         }
2067 
2068         trace_sock_exceed_buf_limit(sk, prot, allocated);
2069 
2070         /* Alas. Undo changes. */
2071         sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
2072 
2073         sk_memory_allocated_sub(sk, amt);
2074 
2075         return 0;
2076 }
2077 EXPORT_SYMBOL(__sk_mem_schedule);
2078 
2079 /**
2080  *      __sk_mem_reclaim - reclaim memory_allocated
2081  *      @sk: socket
2082  *      @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2083  */
2084 void __sk_mem_reclaim(struct sock *sk, int amount)
2085 {
2086         amount >>= SK_MEM_QUANTUM_SHIFT;
2087         sk_memory_allocated_sub(sk, amount);
2088         sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2089 
2090         if (sk_under_memory_pressure(sk) &&
2091             (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2092                 sk_leave_memory_pressure(sk);
2093 }
2094 EXPORT_SYMBOL(__sk_mem_reclaim);
2095 
2096 
2097 /*
2098  * Set of default routines for initialising struct proto_ops when
2099  * the protocol does not support a particular function. In certain
2100  * cases where it makes no sense for a protocol to have a "do nothing"
2101  * function, some default processing is provided.
2102  */
2103 
2104 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2105 {
2106         return -EOPNOTSUPP;
2107 }
2108 EXPORT_SYMBOL(sock_no_bind);
2109 
2110 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2111                     int len, int flags)
2112 {
2113         return -EOPNOTSUPP;
2114 }
2115 EXPORT_SYMBOL(sock_no_connect);
2116 
2117 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2118 {
2119         return -EOPNOTSUPP;
2120 }
2121 EXPORT_SYMBOL(sock_no_socketpair);
2122 
2123 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
2124 {
2125         return -EOPNOTSUPP;
2126 }
2127 EXPORT_SYMBOL(sock_no_accept);
2128 
2129 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2130                     int *len, int peer)
2131 {
2132         return -EOPNOTSUPP;
2133 }
2134 EXPORT_SYMBOL(sock_no_getname);
2135 
2136 unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
2137 {
2138         return 0;
2139 }
2140 EXPORT_SYMBOL(sock_no_poll);
2141 
2142 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2143 {
2144         return -EOPNOTSUPP;
2145 }
2146 EXPORT_SYMBOL(sock_no_ioctl);
2147 
2148 int sock_no_listen(struct socket *sock, int backlog)
2149 {
2150         return -EOPNOTSUPP;
2151 }
2152 EXPORT_SYMBOL(sock_no_listen);
2153 
2154 int sock_no_shutdown(struct socket *sock, int how)
2155 {
2156         return -EOPNOTSUPP;
2157 }
2158 EXPORT_SYMBOL(sock_no_shutdown);
2159 
2160 int sock_no_setsockopt(struct socket *sock, int level, int optname,
2161                     char __user *optval, unsigned int optlen)
2162 {
2163         return -EOPNOTSUPP;
2164 }
2165 EXPORT_SYMBOL(sock_no_setsockopt);
2166 
2167 int sock_no_getsockopt(struct socket *sock, int level, int optname,
2168                     char __user *optval, int __user *optlen)
2169 {
2170         return -EOPNOTSUPP;
2171 }
2172 EXPORT_SYMBOL(sock_no_getsockopt);
2173 
2174 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
2175 {
2176         return -EOPNOTSUPP;
2177 }
2178 EXPORT_SYMBOL(sock_no_sendmsg);
2179 
2180 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
2181                     int flags)
2182 {
2183         return -EOPNOTSUPP;
2184 }
2185 EXPORT_SYMBOL(sock_no_recvmsg);
2186 
2187 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2188 {
2189         /* Mirror missing mmap method error code */
2190         return -ENODEV;
2191 }
2192 EXPORT_SYMBOL(sock_no_mmap);
2193 
2194 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2195 {
2196         ssize_t res;
2197         struct msghdr msg = {.msg_flags = flags};
2198         struct kvec iov;
2199         char *kaddr = kmap(page);
2200         iov.iov_base = kaddr + offset;
2201         iov.iov_len = size;
2202         res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2203         kunmap(page);
2204         return res;
2205 }
2206 EXPORT_SYMBOL(sock_no_sendpage);
2207 
2208 /*
2209  *      Default Socket Callbacks
2210  */
2211 
2212 static void sock_def_wakeup(struct sock *sk)
2213 {
2214         struct socket_wq *wq;
2215 
2216         rcu_read_lock();
2217         wq = rcu_dereference(sk->sk_wq);
2218         if (wq_has_sleeper(wq))
2219                 wake_up_interruptible_all(&wq->wait);
2220         rcu_read_unlock();
2221 }
2222 
2223 static void sock_def_error_report(struct sock *sk)
2224 {
2225         struct socket_wq *wq;
2226 
2227         rcu_read_lock();
2228         wq = rcu_dereference(sk->sk_wq);
2229         if (wq_has_sleeper(wq))
2230                 wake_up_interruptible_poll(&wq->wait, POLLERR);
2231         sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2232         rcu_read_unlock();
2233 }
2234 
2235 static void sock_def_readable(struct sock *sk)
2236 {
2237         struct socket_wq *wq;
2238 
2239         rcu_read_lock();
2240         wq = rcu_dereference(sk->sk_wq);
2241         if (wq_has_sleeper(wq))
2242                 wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
2243                                                 POLLRDNORM | POLLRDBAND);
2244         sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2245         rcu_read_unlock();
2246 }
2247 
2248 static void sock_def_write_space(struct sock *sk)
2249 {
2250         struct socket_wq *wq;
2251 
2252         rcu_read_lock();
2253 
2254         /* Do not wake up a writer until he can make "significant"
2255          * progress.  --DaveM
2256          */
2257         if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2258                 wq = rcu_dereference(sk->sk_wq);
2259                 if (wq_has_sleeper(wq))
2260                         wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
2261                                                 POLLWRNORM | POLLWRBAND);
2262 
2263                 /* Should agree with poll, otherwise some programs break */
2264                 if (sock_writeable(sk))
2265                         sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2266         }
2267 
2268         rcu_read_unlock();
2269 }
2270 
2271 static void sock_def_destruct(struct sock *sk)
2272 {
2273 }
2274 
2275 void sk_send_sigurg(struct sock *sk)
2276 {
2277         if (sk->sk_socket && sk->sk_socket->file)
2278                 if (send_sigurg(&sk->sk_socket->file->f_owner))
2279                         sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2280 }
2281 EXPORT_SYMBOL(sk_send_sigurg);
2282 
2283 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2284                     unsigned long expires)
2285 {
2286         if (!mod_timer(timer, expires))
2287                 sock_hold(sk);
2288 }
2289 EXPORT_SYMBOL(sk_reset_timer);
2290 
2291 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2292 {
2293         if (del_timer(timer))
2294                 __sock_put(sk);
2295 }
2296 EXPORT_SYMBOL(sk_stop_timer);
2297 
2298 void sock_init_data(struct socket *sock, struct sock *sk)
2299 {
2300         skb_queue_head_init(&sk->sk_receive_queue);
2301         skb_queue_head_init(&sk->sk_write_queue);
2302         skb_queue_head_init(&sk->sk_error_queue);
2303 
2304         sk->sk_send_head        =       NULL;
2305 
2306         init_timer(&sk->sk_timer);
2307 
2308         sk->sk_allocation       =       GFP_KERNEL;
2309         sk->sk_rcvbuf           =       sysctl_rmem_default;
2310         sk->sk_sndbuf           =       sysctl_wmem_default;
2311         sk->sk_state            =       TCP_CLOSE;
2312         sk_set_socket(sk, sock);
2313 
2314         sock_set_flag(sk, SOCK_ZAPPED);
2315 
2316         if (sock) {
2317                 sk->sk_type     =       sock->type;
2318                 sk->sk_wq       =       sock->wq;
2319                 sock->sk        =       sk;
2320         } else
2321                 sk->sk_wq       =       NULL;
2322 
2323         spin_lock_init(&sk->sk_dst_lock);
2324         rwlock_init(&sk->sk_callback_lock);
2325         lockdep_set_class_and_name(&sk->sk_callback_lock,
2326                         af_callback_keys + sk->sk_family,
2327                         af_family_clock_key_strings[sk->sk_family]);
2328 
2329         sk->sk_state_change     =       sock_def_wakeup;
2330         sk->sk_data_ready       =       sock_def_readable;
2331         sk->sk_write_space      =       sock_def_write_space;
2332         sk->sk_error_report     =       sock_def_error_report;
2333         sk->sk_destruct         =       sock_def_destruct;
2334 
2335         sk->sk_frag.page        =       NULL;
2336         sk->sk_frag.offset      =       0;
2337         sk->sk_peek_off         =       -1;
2338 
2339         sk->sk_peer_pid         =       NULL;
2340         sk->sk_peer_cred        =       NULL;
2341         sk->sk_write_pending    =       0;
2342         sk->sk_rcvlowat         =       1;
2343         sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
2344         sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;
2345 
2346         sk->sk_stamp = ktime_set(-1L, 0);
2347 
2348 #ifdef CONFIG_NET_RX_BUSY_POLL
2349         sk->sk_napi_id          =       0;
2350         sk->sk_ll_usec          =       sysctl_net_busy_read;
2351 #endif
2352 
2353         sk->sk_max_pacing_rate = ~0U;
2354         sk->sk_pacing_rate = ~0U;
2355         /*
2356          * Before updating sk_refcnt, we must commit prior changes to memory
2357          * (Documentation/RCU/rculist_nulls.txt for details)
2358          */
2359         smp_wmb();
2360         atomic_set(&sk->sk_refcnt, 1);
2361         atomic_set(&sk->sk_drops, 0);
2362 }
2363 EXPORT_SYMBOL(sock_init_data);
2364 
2365 void lock_sock_nested(struct sock *sk, int subclass)
2366 {
2367         might_sleep();
2368         spin_lock_bh(&sk->sk_lock.slock);
2369         if (sk->sk_lock.owned)
2370                 __lock_sock(sk);
2371         sk->sk_lock.owned = 1;
2372         spin_unlock(&sk->sk_lock.slock);
2373         /*
2374          * The sk_lock has mutex_lock() semantics here:
2375          */
2376         mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2377         local_bh_enable();
2378 }
2379 EXPORT_SYMBOL(lock_sock_nested);
2380 
2381 void release_sock(struct sock *sk)
2382 {
2383         /*
2384          * The sk_lock has mutex_unlock() semantics:
2385          */
2386         mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
2387 
2388         spin_lock_bh(&sk->sk_lock.slock);
2389         if (sk->sk_backlog.tail)
2390                 __release_sock(sk);
2391 
2392         /* Warning : release_cb() might need to release sk ownership,
2393          * ie call sock_release_ownership(sk) before us.
2394          */
2395         if (sk->sk_prot->release_cb)
2396                 sk->sk_prot->release_cb(sk);
2397 
2398         sock_release_ownership(sk);
2399         if (waitqueue_active(&sk->sk_lock.wq))
2400                 wake_up(&sk->sk_lock.wq);
2401         spin_unlock_bh(&sk->sk_lock.slock);
2402 }
2403 EXPORT_SYMBOL(release_sock);
2404 
2405 /**
2406  * lock_sock_fast - fast version of lock_sock
2407  * @sk: socket
2408  *
2409  * This version should be used for very small section, where process wont block
2410  * return false if fast path is taken
2411  *   sk_lock.slock locked, owned = 0, BH disabled
2412  * return true if slow path is taken
2413  *   sk_lock.slock unlocked, owned = 1, BH enabled
2414  */
2415 bool lock_sock_fast(struct sock *sk)
2416 {
2417         might_sleep();
2418         spin_lock_bh(&sk->sk_lock.slock);
2419 
2420         if (!sk->sk_lock.owned)
2421                 /*
2422                  * Note : We must disable BH
2423                  */
2424                 return false;
2425 
2426         __lock_sock(sk);
2427         sk->sk_lock.owned = 1;
2428         spin_unlock(&sk->sk_lock.slock);
2429         /*
2430          * The sk_lock has mutex_lock() semantics here:
2431          */
2432         mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2433         local_bh_enable();
2434         return true;
2435 }
2436 EXPORT_SYMBOL(lock_sock_fast);
2437 
2438 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2439 {
2440         struct timeval tv;
2441         if (!sock_flag(sk, SOCK_TIMESTAMP))
2442                 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2443         tv = ktime_to_timeval(sk->sk_stamp);
2444         if (tv.tv_sec == -1)
2445                 return -ENOENT;
2446         if (tv.tv_sec == 0) {
2447                 sk->sk_stamp = ktime_get_real();
2448                 tv = ktime_to_timeval(sk->sk_stamp);
2449         }
2450         return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2451 }
2452 EXPORT_SYMBOL(sock_get_timestamp);
2453 
2454 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2455 {
2456         struct timespec ts;
2457         if (!sock_flag(sk, SOCK_TIMESTAMP))
2458                 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2459         ts = ktime_to_timespec(sk->sk_stamp);
2460         if (ts.tv_sec == -1)
2461                 return -ENOENT;
2462         if (ts.tv_sec == 0) {
2463                 sk->sk_stamp = ktime_get_real();
2464                 ts = ktime_to_timespec(sk->sk_stamp);
2465         }
2466         return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2467 }
2468 EXPORT_SYMBOL(sock_get_timestampns);
2469 
2470 void sock_enable_timestamp(struct sock *sk, int flag)
2471 {
2472         if (!sock_flag(sk, flag)) {
2473                 unsigned long previous_flags = sk->sk_flags;
2474 
2475                 sock_set_flag(sk, flag);
2476                 /*
2477                  * we just set one of the two flags which require net
2478                  * time stamping, but time stamping might have been on
2479                  * already because of the other one
2480                  */
2481                 if (!(previous_flags & SK_FLAGS_TIMESTAMP))
2482                         net_enable_timestamp();
2483         }
2484 }
2485 
2486 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
2487                        int level, int type)
2488 {
2489         struct sock_exterr_skb *serr;
2490         struct sk_buff *skb;
2491         int copied, err;
2492 
2493         err = -EAGAIN;
2494         skb = sock_dequeue_err_skb(sk);
2495         if (skb == NULL)
2496                 goto out;
2497 
2498         copied = skb->len;
2499         if (copied > len) {
2500                 msg->msg_flags |= MSG_TRUNC;
2501                 copied = len;
2502         }
2503         err = skb_copy_datagram_msg(skb, 0, msg, copied);
2504         if (err)
2505                 goto out_free_skb;
2506 
2507         sock_recv_timestamp(msg, sk, skb);
2508 
2509         serr = SKB_EXT_ERR(skb);
2510         put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
2511 
2512         msg->msg_flags |= MSG_ERRQUEUE;
2513         err = copied;
2514 
2515 out_free_skb:
2516         kfree_skb(skb);
2517 out:
2518         return err;
2519 }
2520 EXPORT_SYMBOL(sock_recv_errqueue);
2521 
2522 /*
2523  *      Get a socket option on an socket.
2524  *
2525  *      FIX: POSIX 1003.1g is very ambiguous here. It states that
2526  *      asynchronous errors should be reported by getsockopt. We assume
2527  *      this means if you specify SO_ERROR (otherwise whats the point of it).
2528  */
2529 int sock_common_getsockopt(struct socket *sock, int level, int optname,
2530                            char __user *optval, int __user *optlen)
2531 {
2532         struct sock *sk = sock->sk;
2533 
2534         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2535 }
2536 EXPORT_SYMBOL(sock_common_getsockopt);
2537 
2538 #ifdef CONFIG_COMPAT
2539 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2540                                   char __user *optval, int __user *optlen)
2541 {
2542         struct sock *sk = sock->sk;
2543 
2544         if (sk->sk_prot->compat_getsockopt != NULL)
2545                 return sk->sk_prot->compat_getsockopt(sk, level, optname,
2546                                                       optval, optlen);
2547         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2548 }
2549 EXPORT_SYMBOL(compat_sock_common_getsockopt);
2550 #endif
2551 
2552 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2553                         int flags)
2554 {
2555         struct sock *sk = sock->sk;
2556         int addr_len = 0;
2557         int err;
2558 
2559         err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
2560                                    flags & ~MSG_DONTWAIT, &addr_len);
2561         if (err >= 0)
2562                 msg->msg_namelen = addr_len;
2563         return err;
2564 }
2565 EXPORT_SYMBOL(sock_common_recvmsg);
2566 
2567 /*
2568  *      Set socket options on an inet socket.
2569  */
2570 int sock_common_setsockopt(struct socket *sock, int level, int optname,
2571                            char __user *optval, unsigned int optlen)
2572 {
2573         struct sock *sk = sock->sk;
2574 
2575         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2576 }
2577 EXPORT_SYMBOL(sock_common_setsockopt);
2578 
2579 #ifdef CONFIG_COMPAT
2580 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2581                                   char __user *optval, unsigned int optlen)
2582 {
2583         struct sock *sk = sock->sk;
2584 
2585         if (sk->sk_prot->compat_setsockopt != NULL)
2586                 return sk->sk_prot->compat_setsockopt(sk, level, optname,
2587                                                       optval, optlen);
2588         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2589 }
2590 EXPORT_SYMBOL(compat_sock_common_setsockopt);
2591 #endif
2592 
2593 void sk_common_release(struct sock *sk)
2594 {
2595         if (sk->sk_prot->destroy)
2596                 sk->sk_prot->destroy(sk);
2597 
2598         /*
2599          * Observation: when sock_common_release is called, processes have
2600          * no access to socket. But net still has.
2601          * Step one, detach it from networking:
2602          *
2603          * A. Remove from hash tables.
2604          */
2605 
2606         sk->sk_prot->unhash(sk);
2607 
2608         /*
2609          * In this point socket cannot receive new packets, but it is possible
2610          * that some packets are in flight because some CPU runs receiver and
2611          * did hash table lookup before we unhashed socket. They will achieve
2612          * receive queue and will be purged by socket destructor.
2613          *
2614          * Also we still have packets pending on receive queue and probably,
2615          * our own packets waiting in device queues. sock_destroy will drain
2616          * receive queue, but transmitted packets will delay socket destruction
2617          * until the last reference will be released.
2618          */
2619 
2620         sock_orphan(sk);
2621 
2622         xfrm_sk_free_policy(sk);
2623 
2624         sk_refcnt_debug_release(sk);
2625 
2626         if (sk->sk_frag.page) {
2627                 put_page(sk->sk_frag.page);
2628                 sk->sk_frag.page = NULL;
2629         }
2630 
2631         sock_put(sk);
2632 }
2633 EXPORT_SYMBOL(sk_common_release);
2634 
2635 #ifdef CONFIG_PROC_FS
2636 #define PROTO_INUSE_NR  64      /* should be enough for the first time */
2637 struct prot_inuse {
2638         int val[PROTO_INUSE_NR];
2639 };
2640 
2641 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2642 
2643 #ifdef CONFIG_NET_NS
2644 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2645 {
2646         __this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
2647 }
2648 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2649 
2650 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2651 {
2652         int cpu, idx = prot->inuse_idx;
2653         int res = 0;
2654 
2655         for_each_possible_cpu(cpu)
2656                 res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2657 
2658         return res >= 0 ? res : 0;
2659 }
2660 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2661 
2662 static int __net_init sock_inuse_init_net(struct net *net)
2663 {
2664         net->core.inuse = alloc_percpu(struct prot_inuse);
2665         return net->core.inuse ? 0 : -ENOMEM;
2666 }
2667 
2668 static void __net_exit sock_inuse_exit_net(struct net *net)
2669 {
2670         free_percpu(net->core.inuse);
2671 }
2672 
2673 static struct pernet_operations net_inuse_ops = {
2674         .init = sock_inuse_init_net,
2675         .exit = sock_inuse_exit_net,
2676 };
2677 
2678 static __init int net_inuse_init(void)
2679 {
2680         if (register_pernet_subsys(&net_inuse_ops))
2681                 panic("Cannot initialize net inuse counters");
2682 
2683         return 0;
2684 }
2685 
2686 core_initcall(net_inuse_init);
2687 #else
2688 static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2689 
2690 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2691 {
2692         __this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
2693 }
2694 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2695 
2696 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2697 {
2698         int cpu, idx = prot->inuse_idx;
2699         int res = 0;
2700 
2701         for_each_possible_cpu(cpu)
2702                 res += per_cpu(prot_inuse, cpu).val[idx];
2703 
2704         return res >= 0 ? res : 0;
2705 }
2706 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2707 #endif
2708 
2709 static void assign_proto_idx(struct proto *prot)
2710 {
2711         prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2712 
2713         if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2714                 pr_err("PROTO_INUSE_NR exhausted\n");
2715                 return;
2716         }
2717 
2718         set_bit(prot->inuse_idx, proto_inuse_idx);
2719 }
2720 
2721 static void release_proto_idx(struct proto *prot)
2722 {
2723         if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2724                 clear_bit(prot->inuse_idx, proto_inuse_idx);
2725 }
2726 #else
2727 static inline void assign_proto_idx(struct proto *prot)
2728 {
2729 }
2730 
2731 static inline void release_proto_idx(struct proto *prot)
2732 {
2733 }
2734 #endif
2735 
2736 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
2737 {
2738         if (!rsk_prot)
2739                 return;
2740         kfree(rsk_prot->slab_name);
2741         rsk_prot->slab_name = NULL;
2742         kmem_cache_destroy(rsk_prot->slab);
2743         rsk_prot->slab = NULL;
2744 }
2745 
2746 static int req_prot_init(const struct proto *prot)
2747 {
2748         struct request_sock_ops *rsk_prot = prot->rsk_prot;
2749 
2750         if (!rsk_prot)
2751                 return 0;
2752 
2753         rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
2754                                         prot->name);
2755         if (!rsk_prot->slab_name)
2756                 return -ENOMEM;
2757 
2758         rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
2759                                            rsk_prot->obj_size, 0,
2760                                            0, NULL);
2761 
2762         if (!rsk_prot->slab) {
2763                 pr_crit("%s: Can't create request sock SLAB cache!\n",
2764                         prot->name);
2765                 return -ENOMEM;
2766         }
2767         return 0;
2768 }
2769 
2770 int proto_register(struct proto *prot, int alloc_slab)
2771 {
2772         if (alloc_slab) {
2773                 prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2774                                         SLAB_HWCACHE_ALIGN | prot->slab_flags,
2775                                         NULL);
2776 
2777                 if (prot->slab == NULL) {
2778                         pr_crit("%s: Can't create sock SLAB cache!\n",
2779                                 prot->name);
2780                         goto out;
2781                 }
2782 
2783                 if (req_prot_init(prot))
2784                         goto out_free_request_sock_slab;
2785 
2786                 if (prot->twsk_prot != NULL) {
2787                         prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
2788 
2789                         if (prot->twsk_prot->twsk_slab_name == NULL)
2790                                 goto out_free_request_sock_slab;
2791 
2792                         prot->twsk_prot->twsk_slab =
2793                                 kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2794                                                   prot->twsk_prot->twsk_obj_size,
2795                                                   0,
2796                                                   prot->slab_flags,
2797                                                   NULL);
2798                         if (prot->twsk_prot->twsk_slab == NULL)
2799                                 goto out_free_timewait_sock_slab_name;
2800                 }
2801         }
2802 
2803         mutex_lock(&proto_list_mutex);
2804         list_add(&prot->node, &proto_list);
2805         assign_proto_idx(prot);
2806         mutex_unlock(&proto_list_mutex);
2807         return 0;
2808 
2809 out_free_timewait_sock_slab_name:
2810         kfree(prot->twsk_prot->twsk_slab_name);
2811 out_free_request_sock_slab:
2812         req_prot_cleanup(prot->rsk_prot);
2813 
2814         kmem_cache_destroy(prot->slab);
2815         prot->slab = NULL;
2816 out:
2817         return -ENOBUFS;
2818 }
2819 EXPORT_SYMBOL(proto_register);
2820 
2821 void proto_unregister(struct proto *prot)
2822 {
2823         mutex_lock(&proto_list_mutex);
2824         release_proto_idx(prot);
2825         list_del(&prot->node);
2826         mutex_unlock(&proto_list_mutex);
2827 
2828         kmem_cache_destroy(prot->slab);
2829         prot->slab = NULL;
2830 
2831         req_prot_cleanup(prot->rsk_prot);
2832 
2833         if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2834                 kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2835                 kfree(prot->twsk_prot->twsk_slab_name);
2836                 prot->twsk_prot->twsk_slab = NULL;
2837         }
2838 }
2839 EXPORT_SYMBOL(proto_unregister);
2840 
2841 #ifdef CONFIG_PROC_FS
2842 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2843         __acquires(proto_list_mutex)
2844 {
2845         mutex_lock(&proto_list_mutex);
2846         return seq_list_start_head(&proto_list, *pos);
2847 }
2848 
2849 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2850 {
2851         return seq_list_next(v, &proto_list, pos);
2852 }
2853 
2854 static void proto_seq_stop(struct seq_file *seq, void *v)
2855         __releases(proto_list_mutex)
2856 {
2857         mutex_unlock(&proto_list_mutex);
2858 }
2859 
2860 static char proto_method_implemented(const void *method)
2861 {
2862         return method == NULL ? 'n' : 'y';
2863 }
2864 static long sock_prot_memory_allocated(struct proto *proto)
2865 {
2866         return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
2867 }
2868 
2869 static char *sock_prot_memory_pressure(struct proto *proto)
2870 {
2871         return proto->memory_pressure != NULL ?
2872         proto_memory_pressure(proto) ? "yes" : "no" : "NI";
2873 }
2874 
2875 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2876 {
2877 
2878         seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
2879                         "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2880                    proto->name,
2881                    proto->obj_size,
2882                    sock_prot_inuse_get(seq_file_net(seq), proto),
2883                    sock_prot_memory_allocated(proto),
2884                    sock_prot_memory_pressure(proto),
2885                    proto->max_header,
2886                    proto->slab == NULL ? "no" : "yes",
2887                    module_name(proto->owner),
2888                    proto_method_implemented(proto->close),
2889                    proto_method_implemented(proto->connect),
2890                    proto_method_implemented(proto->disconnect),
2891                    proto_method_implemented(proto->accept),
2892                    proto_method_implemented(proto->ioctl),
2893                    proto_method_implemented(proto->init),
2894                    proto_method_implemented(proto->destroy),
2895                    proto_method_implemented(proto->shutdown),
2896                    proto_method_implemented(proto->setsockopt),
2897                    proto_method_implemented(proto->getsockopt),
2898                    proto_method_implemented(proto->sendmsg),
2899                    proto_method_implemented(proto->recvmsg),
2900                    proto_method_implemented(proto->sendpage),
2901                    proto_method_implemented(proto->bind),
2902                    proto_method_implemented(proto->backlog_rcv),
2903                    proto_method_implemented(proto->hash),
2904                    proto_method_implemented(proto->unhash),
2905                    proto_method_implemented(proto->get_port),
2906                    proto_method_implemented(proto->enter_memory_pressure));
2907 }
2908 
2909 static int proto_seq_show(struct seq_file *seq, void *v)
2910 {
2911         if (v == &proto_list)
2912                 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2913                            "protocol",
2914                            "size",
2915                            "sockets",
2916                            "memory",
2917                            "press",
2918                            "maxhdr",
2919                            "slab",
2920                            "module",
2921                            "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2922         else
2923                 proto_seq_printf(seq, list_entry(v, struct proto, node));
2924         return 0;
2925 }
2926 
2927 static const struct seq_operations proto_seq_ops = {
2928         .start  = proto_seq_start,
2929         .next   = proto_seq_next,
2930         .stop   = proto_seq_stop,
2931         .show   = proto_seq_show,
2932 };
2933 
2934 static int proto_seq_open(struct inode *inode, struct file *file)
2935 {
2936         return seq_open_net(inode, file, &proto_seq_ops,
2937                             sizeof(struct seq_net_private));
2938 }
2939 
2940 static const struct file_operations proto_seq_fops = {
2941         .owner          = THIS_MODULE,
2942         .open           = proto_seq_open,
2943         .read           = seq_read,
2944         .llseek         = seq_lseek,
2945         .release        = seq_release_net,
2946 };
2947 
2948 static __net_init int proto_init_net(struct net *net)
2949 {
2950         if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))
2951                 return -ENOMEM;
2952 
2953         return 0;
2954 }
2955 
2956 static __net_exit void proto_exit_net(struct net *net)
2957 {
2958         remove_proc_entry("protocols", net->proc_net);
2959 }
2960 
2961 
2962 static __net_initdata struct pernet_operations proto_net_ops = {
2963         .init = proto_init_net,
2964         .exit = proto_exit_net,
2965 };
2966 
2967 static int __init proto_init(void)
2968 {
2969         return register_pernet_subsys(&proto_net_ops);
2970 }
2971 
2972 subsys_initcall(proto_init);
2973 
2974 #endif /* PROC_FS */
2975 

~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

kernel.org | git.kernel.org | LWN.net | Project Home | Wiki (Japanese) | Wiki (English) | SVN repository | Mail admin

Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.

osdn.jp