
TOMOYO Linux Cross Reference
Linux/net/core/sock.c


  1 /*
  2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
  3  *              operating system.  INET is implemented using the  BSD Socket
  4  *              interface as the means of communication with the user level.
  5  *
  6  *              Generic socket support routines. Memory allocators, socket lock/release
  7  *              handler for protocols to use and generic option handler.
  8  *
  9  *
 10  * Authors:     Ross Biro
 11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 12  *              Florian La Roche, <flla@stud.uni-sb.de>
 13  *              Alan Cox, <A.Cox@swansea.ac.uk>
 14  *
 15  * Fixes:
 16  *              Alan Cox        :       Numerous verify_area() problems
 17  *              Alan Cox        :       Connecting on a connecting socket
 18  *                                      now returns an error for tcp.
  19  *              Alan Cox        :       sock->protocol is set correctly
  20  *                                      and is not sometimes left as 0.
 21  *              Alan Cox        :       connect handles icmp errors on a
 22  *                                      connect properly. Unfortunately there
 23  *                                      is a restart syscall nasty there. I
 24  *                                      can't match BSD without hacking the C
 25  *                                      library. Ideas urgently sought!
 26  *              Alan Cox        :       Disallow bind() to addresses that are
 27  *                                      not ours - especially broadcast ones!!
 28  *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
 29  *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
 30  *                                      instead they leave that for the DESTROY timer.
 31  *              Alan Cox        :       Clean up error flag in accept
 32  *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
 33  *                                      was buggy. Put a remove_sock() in the handler
 34  *                                      for memory when we hit 0. Also altered the timer
 35  *                                      code. The ACK stuff can wait and needs major
 36  *                                      TCP layer surgery.
 37  *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
 38  *                                      and fixed timer/inet_bh race.
 39  *              Alan Cox        :       Added zapped flag for TCP
 40  *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
 41  *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
 42  *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
 43  *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
 44  *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
 45  *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
 46  *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
 47  *      Pauline Middelink       :       identd support
 48  *              Alan Cox        :       Fixed connect() taking signals I think.
 49  *              Alan Cox        :       SO_LINGER supported
 50  *              Alan Cox        :       Error reporting fixes
 51  *              Anonymous       :       inet_create tidied up (sk->reuse setting)
 52  *              Alan Cox        :       inet sockets don't set sk->type!
 53  *              Alan Cox        :       Split socket option code
 54  *              Alan Cox        :       Callbacks
 55  *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
 56  *              Alex            :       Removed restriction on inet fioctl
 57  *              Alan Cox        :       Splitting INET from NET core
 58  *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
 59  *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
 60  *              Alan Cox        :       Split IP from generic code
 61  *              Alan Cox        :       New kfree_skbmem()
 62  *              Alan Cox        :       Make SO_DEBUG superuser only.
 63  *              Alan Cox        :       Allow anyone to clear SO_DEBUG
 64  *                                      (compatibility fix)
 65  *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
 66  *              Alan Cox        :       Allocator for a socket is settable.
 67  *              Alan Cox        :       SO_ERROR includes soft errors.
 68  *              Alan Cox        :       Allow NULL arguments on some SO_ opts
 69  *              Alan Cox        :       Generic socket allocation to make hooks
 70  *                                      easier (suggested by Craig Metz).
 71  *              Michael Pall    :       SO_ERROR returns positive errno again
 72  *              Steve Whitehouse:       Added default destructor to free
 73  *                                      protocol private data.
 74  *              Steve Whitehouse:       Added various other default routines
 75  *                                      common to several socket families.
 76  *              Chris Evans     :       Call suser() check last on F_SETOWN
 77  *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
 78  *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
 79  *              Andi Kleen      :       Fix write_space callback
 80  *              Chris Evans     :       Security fixes - signedness again
 81  *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
 82  *
 83  * To Fix:
 84  *
 85  *
 86  *              This program is free software; you can redistribute it and/or
 87  *              modify it under the terms of the GNU General Public License
 88  *              as published by the Free Software Foundation; either version
 89  *              2 of the License, or (at your option) any later version.
 90  */
 91 
 92 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 93 
 94 #include <linux/capability.h>
 95 #include <linux/errno.h>
 96 #include <linux/errqueue.h>
 97 #include <linux/types.h>
 98 #include <linux/socket.h>
 99 #include <linux/in.h>
100 #include <linux/kernel.h>
101 #include <linux/module.h>
102 #include <linux/proc_fs.h>
103 #include <linux/seq_file.h>
104 #include <linux/sched.h>
105 #include <linux/sched/mm.h>
106 #include <linux/timer.h>
107 #include <linux/string.h>
108 #include <linux/sockios.h>
109 #include <linux/net.h>
110 #include <linux/mm.h>
111 #include <linux/slab.h>
112 #include <linux/interrupt.h>
113 #include <linux/poll.h>
114 #include <linux/tcp.h>
115 #include <linux/init.h>
116 #include <linux/highmem.h>
117 #include <linux/user_namespace.h>
118 #include <linux/static_key.h>
119 #include <linux/memcontrol.h>
120 #include <linux/prefetch.h>
121 
122 #include <linux/uaccess.h>
123 
124 #include <linux/netdevice.h>
125 #include <net/protocol.h>
126 #include <linux/skbuff.h>
127 #include <net/net_namespace.h>
128 #include <net/request_sock.h>
129 #include <net/sock.h>
130 #include <linux/net_tstamp.h>
131 #include <net/xfrm.h>
132 #include <linux/ipsec.h>
133 #include <net/cls_cgroup.h>
134 #include <net/netprio_cgroup.h>
135 #include <linux/sock_diag.h>
136 
137 #include <linux/filter.h>
138 #include <net/sock_reuseport.h>
139 
140 #include <trace/events/sock.h>
141 
142 #include <net/tcp.h>
143 #include <net/busy_poll.h>
144 
145 static DEFINE_MUTEX(proto_list_mutex);
146 static LIST_HEAD(proto_list);
147 
148 /**
149  * sk_ns_capable - General socket capability test
150  * @sk: Socket to use a capability on or through
151  * @user_ns: The user namespace of the capability to use
152  * @cap: The capability to use
153  *
 154  * Test to see if the opener of the socket had the capability @cap in the
 155  * user namespace @user_ns when the socket was created and if the current
 156  * process has that capability as well.
157  */
158 bool sk_ns_capable(const struct sock *sk,
159                    struct user_namespace *user_ns, int cap)
160 {
161         return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
162                 ns_capable(user_ns, cap);
163 }
164 EXPORT_SYMBOL(sk_ns_capable);
165 
166 /**
167  * sk_capable - Socket global capability test
168  * @sk: Socket to use a capability on or through
169  * @cap: The global capability to use
170  *
 171  * Test to see if the opener of the socket had the capability @cap in all
 172  * user namespaces when the socket was created and if the current process
 173  * has that capability as well.
174  */
175 bool sk_capable(const struct sock *sk, int cap)
176 {
177         return sk_ns_capable(sk, &init_user_ns, cap);
178 }
179 EXPORT_SYMBOL(sk_capable);
180 
181 /**
182  * sk_net_capable - Network namespace socket capability test
183  * @sk: Socket to use a capability on or through
184  * @cap: The capability to use
185  *
 186  * Test to see if the opener of the socket had the capability @cap over the
 187  * network namespace the socket is a member of when the socket was created
 188  * and if the current process has that capability as well.
189  */
190 bool sk_net_capable(const struct sock *sk, int cap)
191 {
192         return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
193 }
194 EXPORT_SYMBOL(sk_net_capable);
195 
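/*
 * A minimal, hypothetical sketch of how a protocol might use the helpers
 * above to gate a privileged option; the function name and the action it
 * takes are illustrative, not part of this file.
 */
#if 0	/* example only, not compiled */
static int example_set_privileged_option(struct sock *sk, int val)
{
	/* require CAP_NET_ADMIN in the socket's network namespace */
	if (!sk_net_capable(sk, CAP_NET_ADMIN))
		return -EPERM;

	sk->sk_priority = val;
	return 0;
}
#endif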
196 /*
197  * Each address family might have different locking rules, so we have
198  * one slock key per address family and separate keys for internal and
199  * userspace sockets.
200  */
201 static struct lock_class_key af_family_keys[AF_MAX];
202 static struct lock_class_key af_family_kern_keys[AF_MAX];
203 static struct lock_class_key af_family_slock_keys[AF_MAX];
204 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
205 
206 /*
207  * Make lock validator output more readable. (we pre-construct these
208  * strings build-time, so that runtime initialization of socket
209  * locks is fast):
210  */
211 
212 #define _sock_locks(x)                                            \
213   x "AF_UNSPEC",        x "AF_UNIX"     ,       x "AF_INET"     , \
214   x "AF_AX25"  ,        x "AF_IPX"      ,       x "AF_APPLETALK", \
215   x "AF_NETROM",        x "AF_BRIDGE"   ,       x "AF_ATMPVC"   , \
216   x "AF_X25"   ,        x "AF_INET6"    ,       x "AF_ROSE"     , \
217   x "AF_DECnet",        x "AF_NETBEUI"  ,       x "AF_SECURITY" , \
218   x "AF_KEY"   ,        x "AF_NETLINK"  ,       x "AF_PACKET"   , \
219   x "AF_ASH"   ,        x "AF_ECONET"   ,       x "AF_ATMSVC"   , \
220   x "AF_RDS"   ,        x "AF_SNA"      ,       x "AF_IRDA"     , \
221   x "AF_PPPOX" ,        x "AF_WANPIPE"  ,       x "AF_LLC"      , \
222   x "27"       ,        x "28"          ,       x "AF_CAN"      , \
223   x "AF_TIPC"  ,        x "AF_BLUETOOTH",       x "IUCV"        , \
224   x "AF_RXRPC" ,        x "AF_ISDN"     ,       x "AF_PHONET"   , \
225   x "AF_IEEE802154",    x "AF_CAIF"     ,       x "AF_ALG"      , \
226   x "AF_NFC"   ,        x "AF_VSOCK"    ,       x "AF_KCM"      , \
227   x "AF_QIPCRTR",       x "AF_SMC"      ,       x "AF_MAX"
228 
229 static const char *const af_family_key_strings[AF_MAX+1] = {
230         _sock_locks("sk_lock-")
231 };
232 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
233         _sock_locks("slock-")
234 };
235 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
236         _sock_locks("clock-")
237 };
238 
239 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
240         _sock_locks("k-sk_lock-")
241 };
242 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
243         _sock_locks("k-slock-")
244 };
245 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
246         _sock_locks("k-clock-")
247 };
248 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
249   "rlock-AF_UNSPEC", "rlock-AF_UNIX"     , "rlock-AF_INET"     ,
250   "rlock-AF_AX25"  , "rlock-AF_IPX"      , "rlock-AF_APPLETALK",
251   "rlock-AF_NETROM", "rlock-AF_BRIDGE"   , "rlock-AF_ATMPVC"   ,
252   "rlock-AF_X25"   , "rlock-AF_INET6"    , "rlock-AF_ROSE"     ,
253   "rlock-AF_DECnet", "rlock-AF_NETBEUI"  , "rlock-AF_SECURITY" ,
254   "rlock-AF_KEY"   , "rlock-AF_NETLINK"  , "rlock-AF_PACKET"   ,
255   "rlock-AF_ASH"   , "rlock-AF_ECONET"   , "rlock-AF_ATMSVC"   ,
256   "rlock-AF_RDS"   , "rlock-AF_SNA"      , "rlock-AF_IRDA"     ,
257   "rlock-AF_PPPOX" , "rlock-AF_WANPIPE"  , "rlock-AF_LLC"      ,
258   "rlock-27"       , "rlock-28"          , "rlock-AF_CAN"      ,
259   "rlock-AF_TIPC"  , "rlock-AF_BLUETOOTH", "rlock-AF_IUCV"     ,
260   "rlock-AF_RXRPC" , "rlock-AF_ISDN"     , "rlock-AF_PHONET"   ,
261   "rlock-AF_IEEE802154", "rlock-AF_CAIF" , "rlock-AF_ALG"      ,
262   "rlock-AF_NFC"   , "rlock-AF_VSOCK"    , "rlock-AF_KCM"      ,
263   "rlock-AF_QIPCRTR", "rlock-AF_SMC"     , "rlock-AF_MAX"
264 };
265 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
266   "wlock-AF_UNSPEC", "wlock-AF_UNIX"     , "wlock-AF_INET"     ,
267   "wlock-AF_AX25"  , "wlock-AF_IPX"      , "wlock-AF_APPLETALK",
268   "wlock-AF_NETROM", "wlock-AF_BRIDGE"   , "wlock-AF_ATMPVC"   ,
269   "wlock-AF_X25"   , "wlock-AF_INET6"    , "wlock-AF_ROSE"     ,
270   "wlock-AF_DECnet", "wlock-AF_NETBEUI"  , "wlock-AF_SECURITY" ,
271   "wlock-AF_KEY"   , "wlock-AF_NETLINK"  , "wlock-AF_PACKET"   ,
272   "wlock-AF_ASH"   , "wlock-AF_ECONET"   , "wlock-AF_ATMSVC"   ,
273   "wlock-AF_RDS"   , "wlock-AF_SNA"      , "wlock-AF_IRDA"     ,
274   "wlock-AF_PPPOX" , "wlock-AF_WANPIPE"  , "wlock-AF_LLC"      ,
275   "wlock-27"       , "wlock-28"          , "wlock-AF_CAN"      ,
276   "wlock-AF_TIPC"  , "wlock-AF_BLUETOOTH", "wlock-AF_IUCV"     ,
277   "wlock-AF_RXRPC" , "wlock-AF_ISDN"     , "wlock-AF_PHONET"   ,
278   "wlock-AF_IEEE802154", "wlock-AF_CAIF" , "wlock-AF_ALG"      ,
279   "wlock-AF_NFC"   , "wlock-AF_VSOCK"    , "wlock-AF_KCM"      ,
280   "wlock-AF_QIPCRTR", "wlock-AF_SMC"     , "wlock-AF_MAX"
281 };
282 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
283   "elock-AF_UNSPEC", "elock-AF_UNIX"     , "elock-AF_INET"     ,
284   "elock-AF_AX25"  , "elock-AF_IPX"      , "elock-AF_APPLETALK",
285   "elock-AF_NETROM", "elock-AF_BRIDGE"   , "elock-AF_ATMPVC"   ,
286   "elock-AF_X25"   , "elock-AF_INET6"    , "elock-AF_ROSE"     ,
287   "elock-AF_DECnet", "elock-AF_NETBEUI"  , "elock-AF_SECURITY" ,
288   "elock-AF_KEY"   , "elock-AF_NETLINK"  , "elock-AF_PACKET"   ,
289   "elock-AF_ASH"   , "elock-AF_ECONET"   , "elock-AF_ATMSVC"   ,
290   "elock-AF_RDS"   , "elock-AF_SNA"      , "elock-AF_IRDA"     ,
291   "elock-AF_PPPOX" , "elock-AF_WANPIPE"  , "elock-AF_LLC"      ,
292   "elock-27"       , "elock-28"          , "elock-AF_CAN"      ,
293   "elock-AF_TIPC"  , "elock-AF_BLUETOOTH", "elock-AF_IUCV"     ,
294   "elock-AF_RXRPC" , "elock-AF_ISDN"     , "elock-AF_PHONET"   ,
295   "elock-AF_IEEE802154", "elock-AF_CAIF" , "elock-AF_ALG"      ,
296   "elock-AF_NFC"   , "elock-AF_VSOCK"    , "elock-AF_KCM"      ,
297   "elock-AF_QIPCRTR", "elock-AF_SMC"     , "elock-AF_MAX"
298 };
299 
300 /*
301  * sk_callback_lock and sk queues locking rules are per-address-family,
302  * so split the lock classes by using a per-AF key:
303  */
304 static struct lock_class_key af_callback_keys[AF_MAX];
305 static struct lock_class_key af_rlock_keys[AF_MAX];
306 static struct lock_class_key af_wlock_keys[AF_MAX];
307 static struct lock_class_key af_elock_keys[AF_MAX];
308 static struct lock_class_key af_kern_callback_keys[AF_MAX];
309 
310 /* Take into consideration the size of the struct sk_buff overhead in the
311  * determination of these values, since that is non-constant across
312  * platforms.  This makes socket queueing behavior and performance
313  * not depend upon such differences.
314  */
315 #define _SK_MEM_PACKETS         256
316 #define _SK_MEM_OVERHEAD        SKB_TRUESIZE(256)
317 #define SK_WMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
318 #define SK_RMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
319 
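/* A worked example of the limits above, under an assumed build: if
 * SKB_TRUESIZE(256) evaluates to 768 bytes, SK_WMEM_MAX and SK_RMEM_MAX
 * both come to 768 * 256 = 196608 bytes (192 KiB). The exact figure
 * depends on sizeof(struct sk_buff) and struct skb_shared_info, which is
 * why the limits are derived rather than hard-coded.
 */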
320 /* Run time adjustable parameters. */
321 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
322 EXPORT_SYMBOL(sysctl_wmem_max);
323 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
324 EXPORT_SYMBOL(sysctl_rmem_max);
325 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
326 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
327 
328 /* Maximal space eaten by iovec or ancillary data plus some space */
329 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
330 EXPORT_SYMBOL(sysctl_optmem_max);
331 
332 int sysctl_tstamp_allow_data __read_mostly = 1;
333 
334 struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
335 EXPORT_SYMBOL_GPL(memalloc_socks);
336 
337 /**
338  * sk_set_memalloc - sets %SOCK_MEMALLOC
339  * @sk: socket to set it on
340  *
341  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
342  * It's the responsibility of the admin to adjust min_free_kbytes
343  * to meet the requirements
344  */
345 void sk_set_memalloc(struct sock *sk)
346 {
347         sock_set_flag(sk, SOCK_MEMALLOC);
348         sk->sk_allocation |= __GFP_MEMALLOC;
349         static_key_slow_inc(&memalloc_socks);
350 }
351 EXPORT_SYMBOL_GPL(sk_set_memalloc);
352 
353 void sk_clear_memalloc(struct sock *sk)
354 {
355         sock_reset_flag(sk, SOCK_MEMALLOC);
356         sk->sk_allocation &= ~__GFP_MEMALLOC;
357         static_key_slow_dec(&memalloc_socks);
358 
359         /*
360          * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
361          * progress of swapping. SOCK_MEMALLOC may be cleared while
362          * it has rmem allocations due to the last swapfile being deactivated
363          * but there is a risk that the socket is unusable due to exceeding
364          * the rmem limits. Reclaim the reserves and obey rmem limits again.
365          */
366         sk_mem_reclaim(sk);
367 }
368 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
369 
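/*
 * A minimal sketch, assuming a swap-over-network style transport that owns
 * a kernel socket, of how the two helpers above are typically paired; the
 * function names are hypothetical.
 */
#if 0	/* example only, not compiled */
static void example_enable_emergency_reserves(struct socket *sock)
{
	sk_set_memalloc(sock->sk);	/* SOCK_MEMALLOC + __GFP_MEMALLOC */
}

static void example_last_swapfile_gone(struct socket *sock)
{
	sk_clear_memalloc(sock->sk);	/* also reclaims rmem via sk_mem_reclaim() */
}
#endif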
370 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
371 {
372         int ret;
373         unsigned int noreclaim_flag;
374 
375         /* these should have been dropped before queueing */
376         BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
377 
378         noreclaim_flag = memalloc_noreclaim_save();
379         ret = sk->sk_backlog_rcv(sk, skb);
380         memalloc_noreclaim_restore(noreclaim_flag);
381 
382         return ret;
383 }
384 EXPORT_SYMBOL(__sk_backlog_rcv);
385 
386 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
387 {
388         struct timeval tv;
389 
390         if (optlen < sizeof(tv))
391                 return -EINVAL;
392         if (copy_from_user(&tv, optval, sizeof(tv)))
393                 return -EFAULT;
394         if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
395                 return -EDOM;
396 
397         if (tv.tv_sec < 0) {
398                 static int warned __read_mostly;
399 
400                 *timeo_p = 0;
401                 if (warned < 10 && net_ratelimit()) {
402                         warned++;
403                         pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
404                                 __func__, current->comm, task_pid_nr(current));
405                 }
406                 return 0;
407         }
408         *timeo_p = MAX_SCHEDULE_TIMEOUT;
409         if (tv.tv_sec == 0 && tv.tv_usec == 0)
410                 return 0;
411         if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
412                 *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC / HZ);
413         return 0;
414 }
415 
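/* A minimal userspace sketch of what feeds sock_set_timeout() via
 * SO_RCVTIMEO/SO_SNDTIMEO; a {0, 0} timeval keeps the default
 * "wait forever" behaviour (MAX_SCHEDULE_TIMEOUT):
 *
 *	struct timeval tv = { .tv_sec = 2, .tv_usec = 500000 };
 *	setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
 */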
416 static void sock_warn_obsolete_bsdism(const char *name)
417 {
418         static int warned;
419         static char warncomm[TASK_COMM_LEN];
420         if (strcmp(warncomm, current->comm) && warned < 5) {
421                 strcpy(warncomm,  current->comm);
422                 pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
423                         warncomm, name);
424                 warned++;
425         }
426 }
427 
428 static bool sock_needs_netstamp(const struct sock *sk)
429 {
430         switch (sk->sk_family) {
431         case AF_UNSPEC:
432         case AF_UNIX:
433                 return false;
434         default:
435                 return true;
436         }
437 }
438 
439 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
440 {
441         if (sk->sk_flags & flags) {
442                 sk->sk_flags &= ~flags;
443                 if (sock_needs_netstamp(sk) &&
444                     !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
445                         net_disable_timestamp();
446         }
447 }
448 
449 
450 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
451 {
452         unsigned long flags;
453         struct sk_buff_head *list = &sk->sk_receive_queue;
454 
455         if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
456                 atomic_inc(&sk->sk_drops);
457                 trace_sock_rcvqueue_full(sk, skb);
458                 return -ENOMEM;
459         }
460 
461         if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
462                 atomic_inc(&sk->sk_drops);
463                 return -ENOBUFS;
464         }
465 
466         skb->dev = NULL;
467         skb_set_owner_r(skb, sk);
468 
 469         /* we escape from the rcu protected region, make sure we don't leak
 470          * a non-refcounted dst
 471          */
472         skb_dst_force(skb);
473 
474         spin_lock_irqsave(&list->lock, flags);
475         sock_skb_set_dropcount(sk, skb);
476         __skb_queue_tail(list, skb);
477         spin_unlock_irqrestore(&list->lock, flags);
478 
479         if (!sock_flag(sk, SOCK_DEAD))
480                 sk->sk_data_ready(sk);
481         return 0;
482 }
483 EXPORT_SYMBOL(__sock_queue_rcv_skb);
484 
485 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
486 {
487         int err;
488 
489         err = sk_filter(sk, skb);
490         if (err)
491                 return err;
492 
493         return __sock_queue_rcv_skb(sk, skb);
494 }
495 EXPORT_SYMBOL(sock_queue_rcv_skb);
496 
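/*
 * A minimal, hypothetical sketch of the usual calling convention for a
 * protocol's delivery path: sock_queue_rcv_skb() runs the socket filter
 * and charges receive memory, and on failure the caller still owns (and
 * must free) the skb.
 */
#if 0	/* example only, not compiled */
static int example_deliver(struct sock *sk, struct sk_buff *skb)
{
	int err = sock_queue_rcv_skb(sk, skb);

	if (err < 0)
		kfree_skb(skb);
	return err;
}
#endif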
497 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
498                      const int nested, unsigned int trim_cap, bool refcounted)
499 {
500         int rc = NET_RX_SUCCESS;
501 
502         if (sk_filter_trim_cap(sk, skb, trim_cap))
503                 goto discard_and_relse;
504 
505         skb->dev = NULL;
506 
507         if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
508                 atomic_inc(&sk->sk_drops);
509                 goto discard_and_relse;
510         }
511         if (nested)
512                 bh_lock_sock_nested(sk);
513         else
514                 bh_lock_sock(sk);
515         if (!sock_owned_by_user(sk)) {
516                 /*
517                  * trylock + unlock semantics:
518                  */
519                 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
520 
521                 rc = sk_backlog_rcv(sk, skb);
522 
523                 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
524         } else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
525                 bh_unlock_sock(sk);
526                 atomic_inc(&sk->sk_drops);
527                 goto discard_and_relse;
528         }
529 
530         bh_unlock_sock(sk);
531 out:
532         if (refcounted)
533                 sock_put(sk);
534         return rc;
535 discard_and_relse:
536         kfree_skb(skb);
537         goto out;
538 }
539 EXPORT_SYMBOL(__sk_receive_skb);
540 
541 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
542 {
543         struct dst_entry *dst = __sk_dst_get(sk);
544 
545         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
546                 sk_tx_queue_clear(sk);
547                 sk->sk_dst_pending_confirm = 0;
548                 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
549                 dst_release(dst);
550                 return NULL;
551         }
552 
553         return dst;
554 }
555 EXPORT_SYMBOL(__sk_dst_check);
556 
557 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
558 {
559         struct dst_entry *dst = sk_dst_get(sk);
560 
561         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
562                 sk_dst_reset(sk);
563                 dst_release(dst);
564                 return NULL;
565         }
566 
567         return dst;
568 }
569 EXPORT_SYMBOL(sk_dst_check);
570 
571 static int sock_setbindtodevice(struct sock *sk, char __user *optval,
572                                 int optlen)
573 {
574         int ret = -ENOPROTOOPT;
575 #ifdef CONFIG_NETDEVICES
576         struct net *net = sock_net(sk);
577         char devname[IFNAMSIZ];
578         int index;
579 
580         /* Sorry... */
581         ret = -EPERM;
582         if (!ns_capable(net->user_ns, CAP_NET_RAW))
583                 goto out;
584 
585         ret = -EINVAL;
586         if (optlen < 0)
587                 goto out;
588 
589         /* Bind this socket to a particular device like "eth0",
590          * as specified in the passed interface name. If the
591          * name is "" or the option length is zero the socket
592          * is not bound.
593          */
594         if (optlen > IFNAMSIZ - 1)
595                 optlen = IFNAMSIZ - 1;
596         memset(devname, 0, sizeof(devname));
597 
598         ret = -EFAULT;
599         if (copy_from_user(devname, optval, optlen))
600                 goto out;
601 
602         index = 0;
603         if (devname[0] != '\0') {
604                 struct net_device *dev;
605 
606                 rcu_read_lock();
607                 dev = dev_get_by_name_rcu(net, devname);
608                 if (dev)
609                         index = dev->ifindex;
610                 rcu_read_unlock();
611                 ret = -ENODEV;
612                 if (!dev)
613                         goto out;
614         }
615 
616         lock_sock(sk);
617         sk->sk_bound_dev_if = index;
618         sk_dst_reset(sk);
619         release_sock(sk);
620 
621         ret = 0;
622 
623 out:
624 #endif
625 
626         return ret;
627 }
628 
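/* The matching userspace call passes the interface name itself, not an
 * index, and an empty name (or zero length) unbinds the socket; a minimal
 * sketch, assuming the caller has CAP_NET_RAW in the socket's network
 * namespace:
 *
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "eth0", sizeof("eth0"));
 */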
629 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
630                                 int __user *optlen, int len)
631 {
632         int ret = -ENOPROTOOPT;
633 #ifdef CONFIG_NETDEVICES
634         struct net *net = sock_net(sk);
635         char devname[IFNAMSIZ];
636 
637         if (sk->sk_bound_dev_if == 0) {
638                 len = 0;
639                 goto zero;
640         }
641 
642         ret = -EINVAL;
643         if (len < IFNAMSIZ)
644                 goto out;
645 
646         ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
647         if (ret)
648                 goto out;
649 
650         len = strlen(devname) + 1;
651 
652         ret = -EFAULT;
653         if (copy_to_user(optval, devname, len))
654                 goto out;
655 
656 zero:
657         ret = -EFAULT;
658         if (put_user(len, optlen))
659                 goto out;
660 
661         ret = 0;
662 
663 out:
664 #endif
665 
666         return ret;
667 }
668 
669 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
670 {
671         if (valbool)
672                 sock_set_flag(sk, bit);
673         else
674                 sock_reset_flag(sk, bit);
675 }
676 
677 bool sk_mc_loop(struct sock *sk)
678 {
679         if (dev_recursion_level())
680                 return false;
681         if (!sk)
682                 return true;
683         switch (sk->sk_family) {
684         case AF_INET:
685                 return inet_sk(sk)->mc_loop;
686 #if IS_ENABLED(CONFIG_IPV6)
687         case AF_INET6:
688                 return inet6_sk(sk)->mc_loop;
689 #endif
690         }
691         WARN_ON(1);
692         return true;
693 }
694 EXPORT_SYMBOL(sk_mc_loop);
695 
696 /*
697  *      This is meant for all protocols to use and covers goings on
698  *      at the socket level. Everything here is generic.
699  */
700 
701 int sock_setsockopt(struct socket *sock, int level, int optname,
702                     char __user *optval, unsigned int optlen)
703 {
704         struct sock *sk = sock->sk;
705         int val;
706         int valbool;
707         struct linger ling;
708         int ret = 0;
709 
710         /*
711          *      Options without arguments
712          */
713 
714         if (optname == SO_BINDTODEVICE)
715                 return sock_setbindtodevice(sk, optval, optlen);
716 
717         if (optlen < sizeof(int))
718                 return -EINVAL;
719 
720         if (get_user(val, (int __user *)optval))
721                 return -EFAULT;
722 
723         valbool = val ? 1 : 0;
724 
725         lock_sock(sk);
726 
727         switch (optname) {
728         case SO_DEBUG:
729                 if (val && !capable(CAP_NET_ADMIN))
730                         ret = -EACCES;
731                 else
732                         sock_valbool_flag(sk, SOCK_DBG, valbool);
733                 break;
734         case SO_REUSEADDR:
735                 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
736                 break;
737         case SO_REUSEPORT:
738                 sk->sk_reuseport = valbool;
739                 break;
740         case SO_TYPE:
741         case SO_PROTOCOL:
742         case SO_DOMAIN:
743         case SO_ERROR:
744                 ret = -ENOPROTOOPT;
745                 break;
746         case SO_DONTROUTE:
747                 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
748                 break;
749         case SO_BROADCAST:
750                 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
751                 break;
752         case SO_SNDBUF:
 753                 /* Don't error on this; BSD doesn't, and if you think
 754                  * about it this is right. Otherwise apps have to
 755                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
 756                  * are treated in BSD as hints.
 757                  */
758                 val = min_t(u32, val, sysctl_wmem_max);
759 set_sndbuf:
760                 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
761                 sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
762                 /* Wake up sending tasks if we upped the value. */
763                 sk->sk_write_space(sk);
764                 break;
765 
766         case SO_SNDBUFFORCE:
767                 if (!capable(CAP_NET_ADMIN)) {
768                         ret = -EPERM;
769                         break;
770                 }
771                 goto set_sndbuf;
772 
773         case SO_RCVBUF:
774                 /* Don't error on this BSD doesn't and if you think
775                  * about it this is right. Otherwise apps have to
776                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
777                  * are treated in BSD as hints
778                  */
779                 val = min_t(u32, val, sysctl_rmem_max);
780 set_rcvbuf:
781                 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
782                 /*
783                  * We double it on the way in to account for
784                  * "struct sk_buff" etc. overhead.   Applications
785                  * assume that the SO_RCVBUF setting they make will
786                  * allow that much actual data to be received on that
787                  * socket.
788                  *
789                  * Applications are unaware that "struct sk_buff" and
790                  * other overheads allocate from the receive buffer
791                  * during socket buffer allocation.
792                  *
793                  * And after considering the possible alternatives,
794                  * returning the value we actually used in getsockopt
795                  * is the most desirable behavior.
796                  */
797                 sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
798                 break;
799 
800         case SO_RCVBUFFORCE:
801                 if (!capable(CAP_NET_ADMIN)) {
802                         ret = -EPERM;
803                         break;
804                 }
805                 goto set_rcvbuf;
806 
807         case SO_KEEPALIVE:
808                 if (sk->sk_prot->keepalive)
809                         sk->sk_prot->keepalive(sk, valbool);
810                 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
811                 break;
812 
813         case SO_OOBINLINE:
814                 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
815                 break;
816 
817         case SO_NO_CHECK:
818                 sk->sk_no_check_tx = valbool;
819                 break;
820 
821         case SO_PRIORITY:
822                 if ((val >= 0 && val <= 6) ||
823                     ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
824                         sk->sk_priority = val;
825                 else
826                         ret = -EPERM;
827                 break;
828 
829         case SO_LINGER:
830                 if (optlen < sizeof(ling)) {
831                         ret = -EINVAL;  /* 1003.1g */
832                         break;
833                 }
834                 if (copy_from_user(&ling, optval, sizeof(ling))) {
835                         ret = -EFAULT;
836                         break;
837                 }
838                 if (!ling.l_onoff)
839                         sock_reset_flag(sk, SOCK_LINGER);
840                 else {
841 #if (BITS_PER_LONG == 32)
842                         if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
843                                 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
844                         else
845 #endif
846                                 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
847                         sock_set_flag(sk, SOCK_LINGER);
848                 }
849                 break;
850 
851         case SO_BSDCOMPAT:
852                 sock_warn_obsolete_bsdism("setsockopt");
853                 break;
854 
855         case SO_PASSCRED:
856                 if (valbool)
857                         set_bit(SOCK_PASSCRED, &sock->flags);
858                 else
859                         clear_bit(SOCK_PASSCRED, &sock->flags);
860                 break;
861 
862         case SO_TIMESTAMP:
863         case SO_TIMESTAMPNS:
864                 if (valbool)  {
865                         if (optname == SO_TIMESTAMP)
866                                 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
867                         else
868                                 sock_set_flag(sk, SOCK_RCVTSTAMPNS);
869                         sock_set_flag(sk, SOCK_RCVTSTAMP);
870                         sock_enable_timestamp(sk, SOCK_TIMESTAMP);
871                 } else {
872                         sock_reset_flag(sk, SOCK_RCVTSTAMP);
873                         sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
874                 }
875                 break;
876 
877         case SO_TIMESTAMPING:
878                 if (val & ~SOF_TIMESTAMPING_MASK) {
879                         ret = -EINVAL;
880                         break;
881                 }
882 
883                 if (val & SOF_TIMESTAMPING_OPT_ID &&
884                     !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
885                         if (sk->sk_protocol == IPPROTO_TCP &&
886                             sk->sk_type == SOCK_STREAM) {
887                                 if ((1 << sk->sk_state) &
888                                     (TCPF_CLOSE | TCPF_LISTEN)) {
889                                         ret = -EINVAL;
890                                         break;
891                                 }
892                                 sk->sk_tskey = tcp_sk(sk)->snd_una;
893                         } else {
894                                 sk->sk_tskey = 0;
895                         }
896                 }
897 
898                 if (val & SOF_TIMESTAMPING_OPT_STATS &&
899                     !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
900                         ret = -EINVAL;
901                         break;
902                 }
903 
904                 sk->sk_tsflags = val;
905                 if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
906                         sock_enable_timestamp(sk,
907                                               SOCK_TIMESTAMPING_RX_SOFTWARE);
908                 else
909                         sock_disable_timestamp(sk,
910                                                (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
911                 break;
912 
913         case SO_RCVLOWAT:
914                 if (val < 0)
915                         val = INT_MAX;
916                 sk->sk_rcvlowat = val ? : 1;
917                 break;
918 
919         case SO_RCVTIMEO:
920                 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
921                 break;
922 
923         case SO_SNDTIMEO:
924                 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
925                 break;
926 
927         case SO_ATTACH_FILTER:
928                 ret = -EINVAL;
929                 if (optlen == sizeof(struct sock_fprog)) {
930                         struct sock_fprog fprog;
931 
932                         ret = -EFAULT;
933                         if (copy_from_user(&fprog, optval, sizeof(fprog)))
934                                 break;
935 
936                         ret = sk_attach_filter(&fprog, sk);
937                 }
938                 break;
939 
940         case SO_ATTACH_BPF:
941                 ret = -EINVAL;
942                 if (optlen == sizeof(u32)) {
943                         u32 ufd;
944 
945                         ret = -EFAULT;
946                         if (copy_from_user(&ufd, optval, sizeof(ufd)))
947                                 break;
948 
949                         ret = sk_attach_bpf(ufd, sk);
950                 }
951                 break;
952 
953         case SO_ATTACH_REUSEPORT_CBPF:
954                 ret = -EINVAL;
955                 if (optlen == sizeof(struct sock_fprog)) {
956                         struct sock_fprog fprog;
957 
958                         ret = -EFAULT;
959                         if (copy_from_user(&fprog, optval, sizeof(fprog)))
960                                 break;
961 
962                         ret = sk_reuseport_attach_filter(&fprog, sk);
963                 }
964                 break;
965 
966         case SO_ATTACH_REUSEPORT_EBPF:
967                 ret = -EINVAL;
968                 if (optlen == sizeof(u32)) {
969                         u32 ufd;
970 
971                         ret = -EFAULT;
972                         if (copy_from_user(&ufd, optval, sizeof(ufd)))
973                                 break;
974 
975                         ret = sk_reuseport_attach_bpf(ufd, sk);
976                 }
977                 break;
978 
979         case SO_DETACH_FILTER:
980                 ret = sk_detach_filter(sk);
981                 break;
982 
983         case SO_LOCK_FILTER:
984                 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
985                         ret = -EPERM;
986                 else
987                         sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
988                 break;
989 
990         case SO_PASSSEC:
991                 if (valbool)
992                         set_bit(SOCK_PASSSEC, &sock->flags);
993                 else
994                         clear_bit(SOCK_PASSSEC, &sock->flags);
995                 break;
996         case SO_MARK:
997                 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
998                         ret = -EPERM;
999                 else
1000                         sk->sk_mark = val;
1001                 break;
1002 
1003         case SO_RXQ_OVFL:
1004                 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1005                 break;
1006 
1007         case SO_WIFI_STATUS:
1008                 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1009                 break;
1010 
1011         case SO_PEEK_OFF:
1012                 if (sock->ops->set_peek_off)
1013                         ret = sock->ops->set_peek_off(sk, val);
1014                 else
1015                         ret = -EOPNOTSUPP;
1016                 break;
1017 
1018         case SO_NOFCS:
1019                 sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1020                 break;
1021 
1022         case SO_SELECT_ERR_QUEUE:
1023                 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1024                 break;
1025 
1026 #ifdef CONFIG_NET_RX_BUSY_POLL
1027         case SO_BUSY_POLL:
1028                 /* allow unprivileged users to decrease the value */
1029                 if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
1030                         ret = -EPERM;
1031                 else {
1032                         if (val < 0)
1033                                 ret = -EINVAL;
1034                         else
1035                                 sk->sk_ll_usec = val;
1036                 }
1037                 break;
1038 #endif
1039 
1040         case SO_MAX_PACING_RATE:
1041                 if (val != ~0U)
1042                         cmpxchg(&sk->sk_pacing_status,
1043                                 SK_PACING_NONE,
1044                                 SK_PACING_NEEDED);
1045                 sk->sk_max_pacing_rate = val;
1046                 sk->sk_pacing_rate = min(sk->sk_pacing_rate,
1047                                          sk->sk_max_pacing_rate);
1048                 break;
1049 
1050         case SO_INCOMING_CPU:
1051                 sk->sk_incoming_cpu = val;
1052                 break;
1053 
1054         case SO_CNX_ADVICE:
1055                 if (val == 1)
1056                         dst_negative_advice(sk);
1057                 break;
1058         default:
1059                 ret = -ENOPROTOOPT;
1060                 break;
1061         }
1062         release_sock(sk);
1063         return ret;
1064 }
1065 EXPORT_SYMBOL(sock_setsockopt);
1066 
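/* The SO_RCVBUF/SO_SNDBUF doubling above is visible from userspace: the
 * value passed in is clamped by sysctl_rmem_max/sysctl_wmem_max and then
 * doubled, and getsockopt() returns that doubled figure. A minimal
 * userspace sketch:
 *
 *	int val = 65536, out;
 *	socklen_t len = sizeof(out);
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
 *	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &out, &len);
 *	// out is typically 131072 here, and never below SOCK_MIN_RCVBUF
 */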
1067 
1068 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1069                           struct ucred *ucred)
1070 {
1071         ucred->pid = pid_vnr(pid);
1072         ucred->uid = ucred->gid = -1;
1073         if (cred) {
1074                 struct user_namespace *current_ns = current_user_ns();
1075 
1076                 ucred->uid = from_kuid_munged(current_ns, cred->euid);
1077                 ucred->gid = from_kgid_munged(current_ns, cred->egid);
1078         }
1079 }
1080 
1081 static int groups_to_user(gid_t __user *dst, const struct group_info *src)
1082 {
1083         struct user_namespace *user_ns = current_user_ns();
1084         int i;
1085 
1086         for (i = 0; i < src->ngroups; i++)
1087                 if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
1088                         return -EFAULT;
1089 
1090         return 0;
1091 }
1092 
1093 int sock_getsockopt(struct socket *sock, int level, int optname,
1094                     char __user *optval, int __user *optlen)
1095 {
1096         struct sock *sk = sock->sk;
1097 
1098         union {
1099                 int val;
1100                 u64 val64;
1101                 struct linger ling;
1102                 struct timeval tm;
1103         } v;
1104 
1105         int lv = sizeof(int);
1106         int len;
1107 
1108         if (get_user(len, optlen))
1109                 return -EFAULT;
1110         if (len < 0)
1111                 return -EINVAL;
1112 
1113         memset(&v, 0, sizeof(v));
1114 
1115         switch (optname) {
1116         case SO_DEBUG:
1117                 v.val = sock_flag(sk, SOCK_DBG);
1118                 break;
1119 
1120         case SO_DONTROUTE:
1121                 v.val = sock_flag(sk, SOCK_LOCALROUTE);
1122                 break;
1123 
1124         case SO_BROADCAST:
1125                 v.val = sock_flag(sk, SOCK_BROADCAST);
1126                 break;
1127 
1128         case SO_SNDBUF:
1129                 v.val = sk->sk_sndbuf;
1130                 break;
1131 
1132         case SO_RCVBUF:
1133                 v.val = sk->sk_rcvbuf;
1134                 break;
1135 
1136         case SO_REUSEADDR:
1137                 v.val = sk->sk_reuse;
1138                 break;
1139 
1140         case SO_REUSEPORT:
1141                 v.val = sk->sk_reuseport;
1142                 break;
1143 
1144         case SO_KEEPALIVE:
1145                 v.val = sock_flag(sk, SOCK_KEEPOPEN);
1146                 break;
1147 
1148         case SO_TYPE:
1149                 v.val = sk->sk_type;
1150                 break;
1151 
1152         case SO_PROTOCOL:
1153                 v.val = sk->sk_protocol;
1154                 break;
1155 
1156         case SO_DOMAIN:
1157                 v.val = sk->sk_family;
1158                 break;
1159 
1160         case SO_ERROR:
1161                 v.val = -sock_error(sk);
1162                 if (v.val == 0)
1163                         v.val = xchg(&sk->sk_err_soft, 0);
1164                 break;
1165 
1166         case SO_OOBINLINE:
1167                 v.val = sock_flag(sk, SOCK_URGINLINE);
1168                 break;
1169 
1170         case SO_NO_CHECK:
1171                 v.val = sk->sk_no_check_tx;
1172                 break;
1173 
1174         case SO_PRIORITY:
1175                 v.val = sk->sk_priority;
1176                 break;
1177 
1178         case SO_LINGER:
1179                 lv              = sizeof(v.ling);
1180                 v.ling.l_onoff  = sock_flag(sk, SOCK_LINGER);
1181                 v.ling.l_linger = sk->sk_lingertime / HZ;
1182                 break;
1183 
1184         case SO_BSDCOMPAT:
1185                 sock_warn_obsolete_bsdism("getsockopt");
1186                 break;
1187 
1188         case SO_TIMESTAMP:
1189                 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1190                                 !sock_flag(sk, SOCK_RCVTSTAMPNS);
1191                 break;
1192 
1193         case SO_TIMESTAMPNS:
1194                 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
1195                 break;
1196 
1197         case SO_TIMESTAMPING:
1198                 v.val = sk->sk_tsflags;
1199                 break;
1200 
1201         case SO_RCVTIMEO:
1202                 lv = sizeof(struct timeval);
1203                 if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
1204                         v.tm.tv_sec = 0;
1205                         v.tm.tv_usec = 0;
1206                 } else {
1207                         v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1208                         v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * USEC_PER_SEC) / HZ;
1209                 }
1210                 break;
1211 
1212         case SO_SNDTIMEO:
1213                 lv = sizeof(struct timeval);
1214                 if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1215                         v.tm.tv_sec = 0;
1216                         v.tm.tv_usec = 0;
1217                 } else {
1218                         v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1219                         v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * USEC_PER_SEC) / HZ;
1220                 }
1221                 break;
1222 
1223         case SO_RCVLOWAT:
1224                 v.val = sk->sk_rcvlowat;
1225                 break;
1226 
1227         case SO_SNDLOWAT:
1228                 v.val = 1;
1229                 break;
1230 
1231         case SO_PASSCRED:
1232                 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1233                 break;
1234 
1235         case SO_PEERCRED:
1236         {
1237                 struct ucred peercred;
1238                 if (len > sizeof(peercred))
1239                         len = sizeof(peercred);
1240                 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1241                 if (copy_to_user(optval, &peercred, len))
1242                         return -EFAULT;
1243                 goto lenout;
1244         }
1245 
1246         case SO_PEERGROUPS:
1247         {
1248                 int ret, n;
1249 
1250                 if (!sk->sk_peer_cred)
1251                         return -ENODATA;
1252 
1253                 n = sk->sk_peer_cred->group_info->ngroups;
1254                 if (len < n * sizeof(gid_t)) {
1255                         len = n * sizeof(gid_t);
1256                         return put_user(len, optlen) ? -EFAULT : -ERANGE;
1257                 }
1258                 len = n * sizeof(gid_t);
1259 
1260                 ret = groups_to_user((gid_t __user *)optval,
1261                                      sk->sk_peer_cred->group_info);
1262                 if (ret)
1263                         return ret;
1264                 goto lenout;
1265         }
1266 
1267         case SO_PEERNAME:
1268         {
1269                 char address[128];
1270 
1271                 if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
1272                         return -ENOTCONN;
1273                 if (lv < len)
1274                         return -EINVAL;
1275                 if (copy_to_user(optval, address, len))
1276                         return -EFAULT;
1277                 goto lenout;
1278         }
1279 
1280         /* Dubious BSD thing... Probably nobody even uses it, but
1281          * the UNIX standard wants it for whatever reason... -DaveM
1282          */
1283         case SO_ACCEPTCONN:
1284                 v.val = sk->sk_state == TCP_LISTEN;
1285                 break;
1286 
1287         case SO_PASSSEC:
1288                 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1289                 break;
1290 
1291         case SO_PEERSEC:
1292                 return security_socket_getpeersec_stream(sock, optval, optlen, len);
1293 
1294         case SO_MARK:
1295                 v.val = sk->sk_mark;
1296                 break;
1297 
1298         case SO_RXQ_OVFL:
1299                 v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1300                 break;
1301 
1302         case SO_WIFI_STATUS:
1303                 v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1304                 break;
1305 
1306         case SO_PEEK_OFF:
1307                 if (!sock->ops->set_peek_off)
1308                         return -EOPNOTSUPP;
1309 
1310                 v.val = sk->sk_peek_off;
1311                 break;
1312         case SO_NOFCS:
1313                 v.val = sock_flag(sk, SOCK_NOFCS);
1314                 break;
1315 
1316         case SO_BINDTODEVICE:
1317                 return sock_getbindtodevice(sk, optval, optlen, len);
1318 
1319         case SO_GET_FILTER:
1320                 len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1321                 if (len < 0)
1322                         return len;
1323 
1324                 goto lenout;
1325 
1326         case SO_LOCK_FILTER:
1327                 v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1328                 break;
1329 
1330         case SO_BPF_EXTENSIONS:
1331                 v.val = bpf_tell_extensions();
1332                 break;
1333 
1334         case SO_SELECT_ERR_QUEUE:
1335                 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1336                 break;
1337 
1338 #ifdef CONFIG_NET_RX_BUSY_POLL
1339         case SO_BUSY_POLL:
1340                 v.val = sk->sk_ll_usec;
1341                 break;
1342 #endif
1343 
1344         case SO_MAX_PACING_RATE:
1345                 v.val = sk->sk_max_pacing_rate;
1346                 break;
1347 
1348         case SO_INCOMING_CPU:
1349                 v.val = sk->sk_incoming_cpu;
1350                 break;
1351 
1352         case SO_MEMINFO:
1353         {
1354                 u32 meminfo[SK_MEMINFO_VARS];
1355 
1356                 if (get_user(len, optlen))
1357                         return -EFAULT;
1358 
1359                 sk_get_meminfo(sk, meminfo);
1360 
1361                 len = min_t(unsigned int, len, sizeof(meminfo));
1362                 if (copy_to_user(optval, &meminfo, len))
1363                         return -EFAULT;
1364 
1365                 goto lenout;
1366         }
1367 
1368 #ifdef CONFIG_NET_RX_BUSY_POLL
1369         case SO_INCOMING_NAPI_ID:
1370                 v.val = READ_ONCE(sk->sk_napi_id);
1371 
1372                 /* aggregate non-NAPI IDs down to 0 */
1373                 if (v.val < MIN_NAPI_ID)
1374                         v.val = 0;
1375 
1376                 break;
1377 #endif
1378 
1379         case SO_COOKIE:
1380                 lv = sizeof(u64);
1381                 if (len < lv)
1382                         return -EINVAL;
1383                 v.val64 = sock_gen_cookie(sk);
1384                 break;
1385 
1386         default:
1387                 /* We implement the SO_SNDLOWAT etc to not be settable
1388                  * (1003.1g 7).
1389                  */
1390                 return -ENOPROTOOPT;
1391         }
1392 
1393         if (len > lv)
1394                 len = lv;
1395         if (copy_to_user(optval, &v, len))
1396                 return -EFAULT;
1397 lenout:
1398         if (put_user(len, optlen))
1399                 return -EFAULT;
1400         return 0;
1401 }
1402 
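/* SO_PEERCRED above reports the credentials recorded for the peer socket;
 * a minimal userspace sketch, assuming a connected AF_UNIX stream socket:
 *
 *	struct ucred peer;
 *	socklen_t len = sizeof(peer);
 *	getsockopt(fd, SOL_SOCKET, SO_PEERCRED, &peer, &len);
 *	// peer.pid, peer.uid and peer.gid identify the other endpoint
 */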
1403 /*
1404  * Initialize an sk_lock.
1405  *
1406  * (We also register the sk_lock with the lock validator.)
1407  */
1408 static inline void sock_lock_init(struct sock *sk)
1409 {
1410         if (sk->sk_kern_sock)
1411                 sock_lock_init_class_and_name(
1412                         sk,
1413                         af_family_kern_slock_key_strings[sk->sk_family],
1414                         af_family_kern_slock_keys + sk->sk_family,
1415                         af_family_kern_key_strings[sk->sk_family],
1416                         af_family_kern_keys + sk->sk_family);
1417         else
1418                 sock_lock_init_class_and_name(
1419                         sk,
1420                         af_family_slock_key_strings[sk->sk_family],
1421                         af_family_slock_keys + sk->sk_family,
1422                         af_family_key_strings[sk->sk_family],
1423                         af_family_keys + sk->sk_family);
1424 }
1425 
1426 /*
1427  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1428  * even temporarily, because of RCU lookups. sk_node should also be left as is.
1429  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1430  */
1431 static void sock_copy(struct sock *nsk, const struct sock *osk)
1432 {
1433 #ifdef CONFIG_SECURITY_NETWORK
1434         void *sptr = nsk->sk_security;
1435 #endif
1436         memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1437 
1438         memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1439                osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1440 
1441 #ifdef CONFIG_SECURITY_NETWORK
1442         nsk->sk_security = sptr;
1443         security_sk_clone(osk, nsk);
1444 #endif
1445 }
1446 
1447 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1448                 int family)
1449 {
1450         struct sock *sk;
1451         struct kmem_cache *slab;
1452 
1453         slab = prot->slab;
1454         if (slab != NULL) {
1455                 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1456                 if (!sk)
1457                         return sk;
1458                 if (priority & __GFP_ZERO)
1459                         sk_prot_clear_nulls(sk, prot->obj_size);
1460         } else
1461                 sk = kmalloc(prot->obj_size, priority);
1462 
1463         if (sk != NULL) {
1464                 kmemcheck_annotate_bitfield(sk, flags);
1465 
1466                 if (security_sk_alloc(sk, family, priority))
1467                         goto out_free;
1468 
1469                 if (!try_module_get(prot->owner))
1470                         goto out_free_sec;
1471                 sk_tx_queue_clear(sk);
1472         }
1473 
1474         return sk;
1475 
1476 out_free_sec:
1477         security_sk_free(sk);
1478 out_free:
1479         if (slab != NULL)
1480                 kmem_cache_free(slab, sk);
1481         else
1482                 kfree(sk);
1483         return NULL;
1484 }
1485 
1486 static void sk_prot_free(struct proto *prot, struct sock *sk)
1487 {
1488         struct kmem_cache *slab;
1489         struct module *owner;
1490 
1491         owner = prot->owner;
1492         slab = prot->slab;
1493 
1494         cgroup_sk_free(&sk->sk_cgrp_data);
1495         mem_cgroup_sk_free(sk);
1496         security_sk_free(sk);
1497         if (slab != NULL)
1498                 kmem_cache_free(slab, sk);
1499         else
1500                 kfree(sk);
1501         module_put(owner);
1502 }
1503 
1504 /**
1505  *      sk_alloc - All socket objects are allocated here
1506  *      @net: the applicable net namespace
1507  *      @family: protocol family
1508  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1509  *      @prot: struct proto associated with this new sock instance
1510  *      @kern: is this to be a kernel socket?
1511  */
1512 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1513                       struct proto *prot, int kern)
1514 {
1515         struct sock *sk;
1516 
1517         sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1518         if (sk) {
1519                 sk->sk_family = family;
1520                 /*
1521                  * See comment in struct sock definition to understand
1522                  * why we need sk_prot_creator -acme
1523                  */
1524                 sk->sk_prot = sk->sk_prot_creator = prot;
1525                 sk->sk_kern_sock = kern;
1526                 sock_lock_init(sk);
1527                 sk->sk_net_refcnt = kern ? 0 : 1;
1528                 if (likely(sk->sk_net_refcnt))
1529                         get_net(net);
1530                 sock_net_set(sk, net);
1531                 refcount_set(&sk->sk_wmem_alloc, 1);
1532 
1533                 mem_cgroup_sk_alloc(sk);
1534                 cgroup_sk_alloc(&sk->sk_cgrp_data);
1535                 sock_update_classid(&sk->sk_cgrp_data);
1536                 sock_update_netprioidx(&sk->sk_cgrp_data);
1537         }
1538 
1539         return sk;
1540 }
1541 EXPORT_SYMBOL(sk_alloc);
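For illustration only (nothing below is part of sock.c; my_proto and my_create are placeholder names): a protocol family's ->create() handler typically pairs sk_alloc() with sock_init_data() and undoes a failed setup with sk_free().

static int my_create(struct net *net, struct socket *sock, int protocol,
		     int kern)
{
	struct sock *sk;

	sk = sk_alloc(net, PF_INET, GFP_KERNEL, &my_proto, kern);
	if (!sk)
		return -ENOBUFS;

	sock_init_data(sock, sk);	/* attach sk to sock and set defaults */
	sk->sk_protocol = protocol;
	return 0;
}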
1542 
1543 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
1544  * grace period. This is the case for UDP sockets and TCP listeners.
1545  */
1546 static void __sk_destruct(struct rcu_head *head)
1547 {
1548         struct sock *sk = container_of(head, struct sock, sk_rcu);
1549         struct sk_filter *filter;
1550 
1551         if (sk->sk_destruct)
1552                 sk->sk_destruct(sk);
1553 
1554         filter = rcu_dereference_check(sk->sk_filter,
1555                                        refcount_read(&sk->sk_wmem_alloc) == 0);
1556         if (filter) {
1557                 sk_filter_uncharge(sk, filter);
1558                 RCU_INIT_POINTER(sk->sk_filter, NULL);
1559         }
1560         if (rcu_access_pointer(sk->sk_reuseport_cb))
1561                 reuseport_detach_sock(sk);
1562 
1563         sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1564 
1565         if (atomic_read(&sk->sk_omem_alloc))
1566                 pr_debug("%s: optmem leakage (%d bytes) detected\n",
1567                          __func__, atomic_read(&sk->sk_omem_alloc));
1568 
1569         if (sk->sk_frag.page) {
1570                 put_page(sk->sk_frag.page);
1571                 sk->sk_frag.page = NULL;
1572         }
1573 
1574         if (sk->sk_peer_cred)
1575                 put_cred(sk->sk_peer_cred);
1576         put_pid(sk->sk_peer_pid);
1577         if (likely(sk->sk_net_refcnt))
1578                 put_net(sock_net(sk));
1579         sk_prot_free(sk->sk_prot_creator, sk);
1580 }
1581 
1582 void sk_destruct(struct sock *sk)
1583 {
1584         if (sock_flag(sk, SOCK_RCU_FREE))
1585                 call_rcu(&sk->sk_rcu, __sk_destruct);
1586         else
1587                 __sk_destruct(&sk->sk_rcu);
1588 }
1589 
1590 static void __sk_free(struct sock *sk)
1591 {
1592         if (unlikely(sock_diag_has_destroy_listeners(sk) && sk->sk_net_refcnt))
1593                 sock_diag_broadcast_destroy(sk);
1594         else
1595                 sk_destruct(sk);
1596 }
1597 
1598 void sk_free(struct sock *sk)
1599 {
1600         /*
1601          * We subtract one from sk_wmem_alloc; if the result is not zero,
1602          * some packets are still in a tx queue and sock_wfree()
1603          * will call __sk_free(sk) later, once they are freed.
1604          */
1605         if (refcount_dec_and_test(&sk->sk_wmem_alloc))
1606                 __sk_free(sk);
1607 }
1608 EXPORT_SYMBOL(sk_free);
1609 
1610 static void sk_init_common(struct sock *sk)
1611 {
1612         skb_queue_head_init(&sk->sk_receive_queue);
1613         skb_queue_head_init(&sk->sk_write_queue);
1614         skb_queue_head_init(&sk->sk_error_queue);
1615 
1616         rwlock_init(&sk->sk_callback_lock);
1617         lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
1618                         af_rlock_keys + sk->sk_family,
1619                         af_family_rlock_key_strings[sk->sk_family]);
1620         lockdep_set_class_and_name(&sk->sk_write_queue.lock,
1621                         af_wlock_keys + sk->sk_family,
1622                         af_family_wlock_key_strings[sk->sk_family]);
1623         lockdep_set_class_and_name(&sk->sk_error_queue.lock,
1624                         af_elock_keys + sk->sk_family,
1625                         af_family_elock_key_strings[sk->sk_family]);
1626         lockdep_set_class_and_name(&sk->sk_callback_lock,
1627                         af_callback_keys + sk->sk_family,
1628                         af_family_clock_key_strings[sk->sk_family]);
1629 }
1630 
1631 /**
1632  *      sk_clone_lock - clone a socket, and lock its clone
1633  *      @sk: the socket to clone
1634  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1635  *
1636  *      Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1637  */
1638 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1639 {
1640         struct sock *newsk;
1641         bool is_charged = true;
1642 
1643         newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1644         if (newsk != NULL) {
1645                 struct sk_filter *filter;
1646 
1647                 sock_copy(newsk, sk);
1648 
1649                 newsk->sk_prot_creator = sk->sk_prot;
1650 
1651                 /* SANITY */
1652                 if (likely(newsk->sk_net_refcnt))
1653                         get_net(sock_net(newsk));
1654                 sk_node_init(&newsk->sk_node);
1655                 sock_lock_init(newsk);
1656                 bh_lock_sock(newsk);
1657                 newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
1658                 newsk->sk_backlog.len = 0;
1659 
1660                 atomic_set(&newsk->sk_rmem_alloc, 0);
1661                 /*
1662                  * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1663                  */
1664                 refcount_set(&newsk->sk_wmem_alloc, 1);
1665                 atomic_set(&newsk->sk_omem_alloc, 0);
1666                 sk_init_common(newsk);
1667 
1668                 newsk->sk_dst_cache     = NULL;
1669                 newsk->sk_dst_pending_confirm = 0;
1670                 newsk->sk_wmem_queued   = 0;
1671                 newsk->sk_forward_alloc = 0;
1672                 atomic_set(&newsk->sk_drops, 0);
1673                 newsk->sk_send_head     = NULL;
1674                 newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1675 
1676                 sock_reset_flag(newsk, SOCK_DONE);
1677                 cgroup_sk_alloc(&newsk->sk_cgrp_data);
1678 
1679                 rcu_read_lock();
1680                 filter = rcu_dereference(sk->sk_filter);
1681                 if (filter != NULL)
1682                         /* though it's an empty new sock, the charging may fail
1683                          * if sysctl_optmem_max was changed between the creation of
1684                          * the original socket and the cloning
1685                          */
1686                         is_charged = sk_filter_charge(newsk, filter);
1687                 RCU_INIT_POINTER(newsk->sk_filter, filter);
1688                 rcu_read_unlock();
1689 
1690                 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
1691                         /* We need to make sure that we don't uncharge the new
1692                          * socket if we couldn't charge it in the first place
1693                          * as otherwise we uncharge the parent's filter.
1694                          */
1695                         if (!is_charged)
1696                                 RCU_INIT_POINTER(newsk->sk_filter, NULL);
1697                         sk_free_unlock_clone(newsk);
1698                         newsk = NULL;
1699                         goto out;
1700                 }
1701                 RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
1702 
1703                 newsk->sk_err      = 0;
1704                 newsk->sk_err_soft = 0;
1705                 newsk->sk_priority = 0;
1706                 newsk->sk_incoming_cpu = raw_smp_processor_id();
1707                 atomic64_set(&newsk->sk_cookie, 0);
1708 
1709                 mem_cgroup_sk_alloc(newsk);
1710                 /*
1711                  * Before updating sk_refcnt, we must commit prior changes to memory
1712                  * (Documentation/RCU/rculist_nulls.txt for details)
1713                  */
1714                 smp_wmb();
1715                 refcount_set(&newsk->sk_refcnt, 2);
1716 
1717                 /*
1718                  * Increment the counter in the same struct proto as the master
1719                  * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1720                  * is the same as sk->sk_prot->socks, as this field was copied
1721                  * with memcpy).
1722                  *
1723                  * This _changes_ the previous behaviour, where
1724                  * tcp_create_openreq_child was always incrementing the
1725                  * equivalent of tcp_prot->socks (inet_sock_nr), so this has
1726                  * to be taken into account in all callers. -acme
1727                  */
1728                 sk_refcnt_debug_inc(newsk);
1729                 sk_set_socket(newsk, NULL);
1730                 newsk->sk_wq = NULL;
1731 
1732                 if (newsk->sk_prot->sockets_allocated)
1733                         sk_sockets_allocated_inc(newsk);
1734 
1735                 if (sock_needs_netstamp(sk) &&
1736                     newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1737                         net_enable_timestamp();
1738         }
1739 out:
1740         return newsk;
1741 }
1742 EXPORT_SYMBOL_GPL(sk_clone_lock);
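A sketch of the calling convention described in the kernel-doc above (illustrative, not taken from this file): the clone is returned bh-locked, so the caller drops the lock once its own setup of newsk is done.

	newsk = sk_clone_lock(sk, GFP_ATOMIC);
	if (newsk) {
		/* ... protocol specific initialisation of newsk ... */
		bh_unlock_sock(newsk);
	}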
1743 
1744 void sk_free_unlock_clone(struct sock *sk)
1745 {
1746         /* It is still a raw copy of the parent, so invalidate the
1747          * destructor and do a plain sk_free() */
1748         sk->sk_destruct = NULL;
1749         bh_unlock_sock(sk);
1750         sk_free(sk);
1751 }
1752 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
1753 
1754 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1755 {
1756         u32 max_segs = 1;
1757 
1758         sk_dst_set(sk, dst);
1759         sk->sk_route_caps = dst->dev->features;
1760         if (sk->sk_route_caps & NETIF_F_GSO)
1761                 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1762         sk->sk_route_caps &= ~sk->sk_route_nocaps;
1763         if (sk_can_gso(sk)) {
1764                 if (dst->header_len) {
1765                         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1766                 } else {
1767                         sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1768                         sk->sk_gso_max_size = dst->dev->gso_max_size;
1769                         max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
1770                 }
1771         }
1772         sk->sk_gso_max_segs = max_segs;
1773 }
1774 EXPORT_SYMBOL_GPL(sk_setup_caps);
1775 
1776 /*
1777  *      Simple resource managers for sockets.
1778  */
1779 
1780 
1781 /*
1782  * Write buffer destructor automatically called from kfree_skb.
1783  */
1784 void sock_wfree(struct sk_buff *skb)
1785 {
1786         struct sock *sk = skb->sk;
1787         unsigned int len = skb->truesize;
1788 
1789         if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1790                 /*
1791                  * Keep a reference on sk_wmem_alloc; it will be released
1792                  * after the sk_write_space() call
1793                  */
1794                 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
1795                 sk->sk_write_space(sk);
1796                 len = 1;
1797         }
1798         /*
1799          * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1800          * could not do because of in-flight packets
1801          */
1802         if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
1803                 __sk_free(sk);
1804 }
1805 EXPORT_SYMBOL(sock_wfree);
1806 
1807 /* This variant of sock_wfree() is used by TCP,
1808  * since it sets SOCK_USE_WRITE_QUEUE.
1809  */
1810 void __sock_wfree(struct sk_buff *skb)
1811 {
1812         struct sock *sk = skb->sk;
1813 
1814         if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
1815                 __sk_free(sk);
1816 }
1817 
1818 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
1819 {
1820         skb_orphan(skb);
1821         skb->sk = sk;
1822 #ifdef CONFIG_INET
1823         if (unlikely(!sk_fullsock(sk))) {
1824                 skb->destructor = sock_edemux;
1825                 sock_hold(sk);
1826                 return;
1827         }
1828 #endif
1829         skb->destructor = sock_wfree;
1830         skb_set_hash_from_sk(skb, sk);
1831         /*
1832          * We used to take a refcount on sk, but the following operation
1833          * is enough to guarantee sk_free() won't free this sock until
1834          * all in-flight packets are completed.
1835          */
1836         refcount_add(skb->truesize, &sk->sk_wmem_alloc);
1837 }
1838 EXPORT_SYMBOL(skb_set_owner_w);
1839 
1840 /* This helper is used by netem, as it can hold packets in its
1841  * delay queue. We want to allow the owner socket to send more
1842  * packets, as if they were already TX completed by a typical driver.
1843  * But we also want to keep skb->sk set because some packet schedulers
1844  * rely on it (sch_fq for example).
1845  */
1846 void skb_orphan_partial(struct sk_buff *skb)
1847 {
1848         if (skb_is_tcp_pure_ack(skb))
1849                 return;
1850 
1851         if (skb->destructor == sock_wfree
1852 #ifdef CONFIG_INET
1853             || skb->destructor == tcp_wfree
1854 #endif
1855                 ) {
1856                 struct sock *sk = skb->sk;
1857 
1858                 if (refcount_inc_not_zero(&sk->sk_refcnt)) {
1859                         WARN_ON(refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc));
1860                         skb->destructor = sock_efree;
1861                 }
1862         } else {
1863                 skb_orphan(skb);
1864         }
1865 }
1866 EXPORT_SYMBOL(skb_orphan_partial);
1867 
1868 /*
1869  * Read buffer destructor automatically called from kfree_skb.
1870  */
1871 void sock_rfree(struct sk_buff *skb)
1872 {
1873         struct sock *sk = skb->sk;
1874         unsigned int len = skb->truesize;
1875 
1876         atomic_sub(len, &sk->sk_rmem_alloc);
1877         sk_mem_uncharge(sk, len);
1878 }
1879 EXPORT_SYMBOL(sock_rfree);
1880 
1881 /*
1882  * Buffer destructor for skbs that are not used directly in read or write
1883  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
1884  */
1885 void sock_efree(struct sk_buff *skb)
1886 {
1887         sock_put(skb->sk);
1888 }
1889 EXPORT_SYMBOL(sock_efree);
1890 
1891 kuid_t sock_i_uid(struct sock *sk)
1892 {
1893         kuid_t uid;
1894 
1895         read_lock_bh(&sk->sk_callback_lock);
1896         uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1897         read_unlock_bh(&sk->sk_callback_lock);
1898         return uid;
1899 }
1900 EXPORT_SYMBOL(sock_i_uid);
1901 
1902 unsigned long sock_i_ino(struct sock *sk)
1903 {
1904         unsigned long ino;
1905 
1906         read_lock_bh(&sk->sk_callback_lock);
1907         ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1908         read_unlock_bh(&sk->sk_callback_lock);
1909         return ino;
1910 }
1911 EXPORT_SYMBOL(sock_i_ino);
1912 
1913 /*
1914  * Allocate a skb from the socket's send buffer.
1915  */
1916 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1917                              gfp_t priority)
1918 {
1919         if (force || refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1920                 struct sk_buff *skb = alloc_skb(size, priority);
1921                 if (skb) {
1922                         skb_set_owner_w(skb, sk);
1923                         return skb;
1924                 }
1925         }
1926         return NULL;
1927 }
1928 EXPORT_SYMBOL(sock_wmalloc);
1929 
1930 /*
1931  * Allocate a memory block from the socket's option memory buffer.
1932  */
1933 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1934 {
1935         if ((unsigned int)size <= sysctl_optmem_max &&
1936             atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1937                 void *mem;
1938                 /* First do the add, to avoid the race if kmalloc
1939                  * might sleep.
1940                  */
1941                 atomic_add(size, &sk->sk_omem_alloc);
1942                 mem = kmalloc(size, priority);
1943                 if (mem)
1944                         return mem;
1945                 atomic_sub(size, &sk->sk_omem_alloc);
1946         }
1947         return NULL;
1948 }
1949 EXPORT_SYMBOL(sock_kmalloc);
1950 
1951 /* Free an option memory block. Note: we actually want the inline
1952  * here as this allows gcc to detect the nullify and fold away the
1953  * condition entirely.
1954  */
1955 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
1956                                   const bool nullify)
1957 {
1958         if (WARN_ON_ONCE(!mem))
1959                 return;
1960         if (nullify)
1961                 kzfree(mem);
1962         else
1963                 kfree(mem);
1964         atomic_sub(size, &sk->sk_omem_alloc);
1965 }
1966 
1967 void sock_kfree_s(struct sock *sk, void *mem, int size)
1968 {
1969         __sock_kfree_s(sk, mem, size, false);
1970 }
1971 EXPORT_SYMBOL(sock_kfree_s);
1972 
1973 void sock_kzfree_s(struct sock *sk, void *mem, int size)
1974 {
1975         __sock_kfree_s(sk, mem, size, true);
1976 }
1977 EXPORT_SYMBOL(sock_kzfree_s);
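Illustrative pairing (not from this file; optlen stands in for whatever size the caller charged): sock_kmalloc() callers must hand the same size back to sock_kfree_s() or sock_kzfree_s() so sk_omem_alloc is decremented by exactly what was added.

	void *opt = sock_kmalloc(sk, optlen, GFP_KERNEL);

	if (!opt)
		return -ENOBUFS;
	/* ... fill in and use opt ... */
	sock_kfree_s(sk, opt, optlen);	/* size must match the allocation */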
1978 
1979 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1980    I think these locks should be removed for datagram sockets.
1981  */
1982 static long sock_wait_for_wmem(struct sock *sk, long timeo)
1983 {
1984         DEFINE_WAIT(wait);
1985 
1986         sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1987         for (;;) {
1988                 if (!timeo)
1989                         break;
1990                 if (signal_pending(current))
1991                         break;
1992                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1993                 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1994                 if (refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1995                         break;
1996                 if (sk->sk_shutdown & SEND_SHUTDOWN)
1997                         break;
1998                 if (sk->sk_err)
1999                         break;
2000                 timeo = schedule_timeout(timeo);
2001         }
2002         finish_wait(sk_sleep(sk), &wait);
2003         return timeo;
2004 }
2005 
2006 
2007 /*
2008  *      Generic send/receive buffer handlers
2009  */
2010 
2011 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2012                                      unsigned long data_len, int noblock,
2013                                      int *errcode, int max_page_order)
2014 {
2015         struct sk_buff *skb;
2016         long timeo;
2017         int err;
2018 
2019         timeo = sock_sndtimeo(sk, noblock);
2020         for (;;) {
2021                 err = sock_error(sk);
2022                 if (err != 0)
2023                         goto failure;
2024 
2025                 err = -EPIPE;
2026                 if (sk->sk_shutdown & SEND_SHUTDOWN)
2027                         goto failure;
2028 
2029                 if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
2030                         break;
2031 
2032                 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2033                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2034                 err = -EAGAIN;
2035                 if (!timeo)
2036                         goto failure;
2037                 if (signal_pending(current))
2038                         goto interrupted;
2039                 timeo = sock_wait_for_wmem(sk, timeo);
2040         }
2041         skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2042                                    errcode, sk->sk_allocation);
2043         if (skb)
2044                 skb_set_owner_w(skb, sk);
2045         return skb;
2046 
2047 interrupted:
2048         err = sock_intr_errno(timeo);
2049 failure:
2050         *errcode = err;
2051         return NULL;
2052 }
2053 EXPORT_SYMBOL(sock_alloc_send_pskb);
2054 
2055 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
2056                                     int noblock, int *errcode)
2057 {
2058         return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
2059 }
2060 EXPORT_SYMBOL(sock_alloc_send_skb);
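Illustrative only (hlen is a placeholder for protocol header space): a datagram sendmsg() path typically asks for its buffer like this, letting the helper block or return -EAGAIN according to MSG_DONTWAIT.

	skb = sock_alloc_send_skb(sk, hlen + len,
				  msg->msg_flags & MSG_DONTWAIT, &err);
	if (!skb)
		goto out;
	skb_reserve(skb, hlen);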
2061 
2062 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2063                      struct sockcm_cookie *sockc)
2064 {
2065         u32 tsflags;
2066 
2067         switch (cmsg->cmsg_type) {
2068         case SO_MARK:
2069                 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2070                         return -EPERM;
2071                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2072                         return -EINVAL;
2073                 sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2074                 break;
2075         case SO_TIMESTAMPING:
2076                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2077                         return -EINVAL;
2078 
2079                 tsflags = *(u32 *)CMSG_DATA(cmsg);
2080                 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2081                         return -EINVAL;
2082 
2083                 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2084                 sockc->tsflags |= tsflags;
2085                 break;
2086         /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2087         case SCM_RIGHTS:
2088         case SCM_CREDENTIALS:
2089                 break;
2090         default:
2091                 return -EINVAL;
2092         }
2093         return 0;
2094 }
2095 EXPORT_SYMBOL(__sock_cmsg_send);
2096 
2097 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2098                    struct sockcm_cookie *sockc)
2099 {
2100         struct cmsghdr *cmsg;
2101         int ret;
2102 
2103         for_each_cmsghdr(cmsg, msg) {
2104                 if (!CMSG_OK(msg, cmsg))
2105                         return -EINVAL;
2106                 if (cmsg->cmsg_level != SOL_SOCKET)
2107                         continue;
2108                 ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2109                 if (ret)
2110                         return ret;
2111         }
2112         return 0;
2113 }
2114 EXPORT_SYMBOL(sock_cmsg_send);
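A sketch of how transmit paths commonly consume this helper (msg and err come from the surrounding sendmsg code, which is not shown): seed the cookie from the socket defaults, then let SOL_SOCKET control messages override it.

	struct sockcm_cookie sockc = { .tsflags = sk->sk_tsflags };

	if (msg->msg_controllen) {
		err = sock_cmsg_send(sk, msg, &sockc);
		if (unlikely(err))
			return err;
	}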
2115 
2116 static void sk_enter_memory_pressure(struct sock *sk)
2117 {
2118         if (!sk->sk_prot->enter_memory_pressure)
2119                 return;
2120 
2121         sk->sk_prot->enter_memory_pressure(sk);
2122 }
2123 
2124 static void sk_leave_memory_pressure(struct sock *sk)
2125 {
2126         if (sk->sk_prot->leave_memory_pressure) {
2127                 sk->sk_prot->leave_memory_pressure(sk);
2128         } else {
2129                 unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2130 
2131                 if (memory_pressure && *memory_pressure)
2132                         *memory_pressure = 0;
2133         }
2134 }
2135 
2136 /* On 32bit arches, an skb frag is limited to 2^15 */
2137 #define SKB_FRAG_PAGE_ORDER     get_order(32768)
2138 
2139 /**
2140  * skb_page_frag_refill - check that a page_frag contains enough room
2141  * @sz: minimum size of the fragment we want to get
2142  * @pfrag: pointer to page_frag
2143  * @gfp: priority for memory allocation
2144  *
2145  * Note: While this allocator tries to use high order pages, there is
2146  * no guarantee that allocations succeed. Therefore, @sz MUST be
2147  * less than or equal to PAGE_SIZE.
2148  */
2149 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2150 {
2151         if (pfrag->page) {
2152                 if (page_ref_count(pfrag->page) == 1) {
2153                         pfrag->offset = 0;
2154                         return true;
2155                 }
2156                 if (pfrag->offset + sz <= pfrag->size)
2157                         return true;
2158                 put_page(pfrag->page);
2159         }
2160 
2161         pfrag->offset = 0;
2162         if (SKB_FRAG_PAGE_ORDER) {
2163                 /* Avoid direct reclaim but allow kswapd to wake */
2164                 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2165                                           __GFP_COMP | __GFP_NOWARN |
2166                                           __GFP_NORETRY,
2167                                           SKB_FRAG_PAGE_ORDER);
2168                 if (likely(pfrag->page)) {
2169                         pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2170                         return true;
2171                 }
2172         }
2173         pfrag->page = alloc_page(gfp);
2174         if (likely(pfrag->page)) {
2175                 pfrag->size = PAGE_SIZE;
2176                 return true;
2177         }
2178         return false;
2179 }
2180 EXPORT_SYMBOL(skb_page_frag_refill);
2181 
2182 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2183 {
2184         if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2185                 return true;
2186 
2187         sk_enter_memory_pressure(sk);
2188         sk_stream_moderate_sndbuf(sk);
2189         return false;
2190 }
2191 EXPORT_SYMBOL(sk_page_frag_refill);
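Illustrative usage, roughly the pattern stream transmit paths follow (copy is a placeholder for the number of bytes the caller wants to append): refill the per-socket page frag, copy into it, and advance the offset by the amount consumed.

	struct page_frag *pfrag = sk_page_frag(sk);

	if (!sk_page_frag_refill(sk, pfrag))
		goto wait_for_memory;

	copy = min_t(int, copy, pfrag->size - pfrag->offset);
	/* copy 'copy' bytes to page_address(pfrag->page) + pfrag->offset */
	pfrag->offset += copy;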
2192 
2193 static void __lock_sock(struct sock *sk)
2194         __releases(&sk->sk_lock.slock)
2195         __acquires(&sk->sk_lock.slock)
2196 {
2197         DEFINE_WAIT(wait);
2198 
2199         for (;;) {
2200                 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2201                                         TASK_UNINTERRUPTIBLE);
2202                 spin_unlock_bh(&sk->sk_lock.slock);
2203                 schedule();
2204                 spin_lock_bh(&sk->sk_lock.slock);
2205                 if (!sock_owned_by_user(sk))
2206                         break;
2207         }
2208         finish_wait(&sk->sk_lock.wq, &wait);
2209 }
2210 
2211 static void __release_sock(struct sock *sk)
2212         __releases(&sk->sk_lock.slock)
2213         __acquires(&sk->sk_lock.slock)
2214 {
2215         struct sk_buff *skb, *next;
2216 
2217         while ((skb = sk->sk_backlog.head) != NULL) {
2218                 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2219 
2220                 spin_unlock_bh(&sk->sk_lock.slock);
2221 
2222                 do {
2223                         next = skb->next;
2224                         prefetch(next);
2225                         WARN_ON_ONCE(skb_dst_is_noref(skb));
2226                         skb->next = NULL;
2227                         sk_backlog_rcv(sk, skb);
2228 
2229                         cond_resched();
2230 
2231                         skb = next;
2232                 } while (skb != NULL);
2233 
2234                 spin_lock_bh(&sk->sk_lock.slock);
2235         }
2236 
2237         /*
2238          * Doing the zeroing here guarantees we cannot loop forever
2239          * while a wild producer attempts to flood us.
2240          */
2241         sk->sk_backlog.len = 0;
2242 }
2243 
2244 void __sk_flush_backlog(struct sock *sk)
2245 {
2246         spin_lock_bh(&sk->sk_lock.slock);
2247         __release_sock(sk);
2248         spin_unlock_bh(&sk->sk_lock.slock);
2249 }
2250 
2251 /**
2252  * sk_wait_data - wait for data to arrive at sk_receive_queue
2253  * @sk:    sock to wait on
2254  * @timeo: for how long
2255  * @skb:   last skb seen on sk_receive_queue
2256  *
2257  * Socket state, including sk->sk_err, is changed only under the socket lock,
2258  * hence we may omit checks after joining the wait queue.
2259  * We check the receive queue before schedule() only as an optimization;
2260  * it is very likely that release_sock() added new data.
2261  */
2262 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2263 {
2264         DEFINE_WAIT_FUNC(wait, woken_wake_function);
2265         int rc;
2266 
2267         add_wait_queue(sk_sleep(sk), &wait);
2268         sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2269         rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2270         sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2271         remove_wait_queue(sk_sleep(sk), &wait);
2272         return rc;
2273 }
2274 EXPORT_SYMBOL(sk_wait_data);
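For illustration (not from this file): a blocking receive path, already running under lock_sock(), typically loops on sk_wait_data() until something new shows up on sk_receive_queue or the timeout runs out.

	long timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
	struct sk_buff *skb;

	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
		if (!timeo)
			return -EAGAIN;
		if (signal_pending(current))
			return sock_intr_errno(timeo);
		sk_wait_data(sk, &timeo, NULL);
	}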
2275 
2276 /**
2277  *      __sk_mem_raise_allocated - increase memory_allocated
2278  *      @sk: socket
2279  *      @size: memory size to allocate
2280  *      @amt: pages to allocate
2281  *      @kind: allocation type
2282  *
2283  *      Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2284  */
2285 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2286 {
2287         struct proto *prot = sk->sk_prot;
2288         long allocated = sk_memory_allocated_add(sk, amt);
2289 
2290         if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2291             !mem_cgroup_charge_skmem(sk->sk_memcg, amt))
2292                 goto suppress_allocation;
2293 
2294         /* Under limit. */
2295         if (allocated <= sk_prot_mem_limits(sk, 0)) {
2296                 sk_leave_memory_pressure(sk);
2297                 return 1;
2298         }
2299 
2300         /* Under pressure. */
2301         if (allocated > sk_prot_mem_limits(sk, 1))
2302                 sk_enter_memory_pressure(sk);
2303 
2304         /* Over hard limit. */
2305         if (allocated > sk_prot_mem_limits(sk, 2))
2306                 goto suppress_allocation;
2307 
2308         /* guarantee minimum buffer size under pressure */
2309         if (kind == SK_MEM_RECV) {
2310                 if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
2311                         return 1;
2312 
2313         } else { /* SK_MEM_SEND */
2314                 if (sk->sk_type == SOCK_STREAM) {
2315                         if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
2316                                 return 1;
2317                 } else if (refcount_read(&sk->sk_wmem_alloc) <
2318                            prot->sysctl_wmem[0])
2319                                 return 1;
2320         }
2321 
2322         if (sk_has_memory_pressure(sk)) {
2323                 int alloc;
2324 
2325                 if (!sk_under_memory_pressure(sk))
2326                         return 1;
2327                 alloc = sk_sockets_allocated_read_positive(sk);
2328                 if (sk_prot_mem_limits(sk, 2) > alloc *
2329                     sk_mem_pages(sk->sk_wmem_queued +
2330                                  atomic_read(&sk->sk_rmem_alloc) +
2331                                  sk->sk_forward_alloc))
2332                         return 1;
2333         }
2334 
2335 suppress_allocation:
2336 
2337         if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2338                 sk_stream_moderate_sndbuf(sk);
2339 
2340                 /* Fail only if socket is _under_ its sndbuf.
2341                  * In this case we cannot block, so we have to fail.
2342                  */
2343                 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2344                         return 1;
2345         }
2346 
2347         trace_sock_exceed_buf_limit(sk, prot, allocated);
2348 
2349         sk_memory_allocated_sub(sk, amt);
2350 
2351         if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2352                 mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2353 
2354         return 0;
2355 }
2356 EXPORT_SYMBOL(__sk_mem_raise_allocated);
2357 
2358 /**
2359  *      __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2360  *      @sk: socket
2361  *      @size: memory size to allocate
2362  *      @kind: allocation type
2363  *
2364  *      If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2365  *      rmem allocation. This function assumes that protocols which have
2366  *      memory_pressure use sk_wmem_queued as write buffer accounting.
2367  */
2368 int __sk_mem_schedule(struct sock *sk, int size, int kind)
2369 {
2370         int ret, amt = sk_mem_pages(size);
2371 
2372         sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2373         ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2374         if (!ret)
2375                 sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2376         return ret;
2377 }
2378 EXPORT_SYMBOL(__sk_mem_schedule);
2379 
2380 /**
2381  *      __sk_mem_reduce_allocated - reclaim memory_allocated
2382  *      @sk: socket
2383  *      @amount: number of quanta
2384  *
2385  *      Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2386  */
2387 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2388 {
2389         sk_memory_allocated_sub(sk, amount);
2390 
2391         if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2392                 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2393 
2394         if (sk_under_memory_pressure(sk) &&
2395             (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2396                 sk_leave_memory_pressure(sk);
2397 }
2398 EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2399 
2400 /**
2401  *      __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2402  *      @sk: socket
2403  *      @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2404  */
2405 void __sk_mem_reclaim(struct sock *sk, int amount)
2406 {
2407         amount >>= SK_MEM_QUANTUM_SHIFT;
2408         sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2409         __sk_mem_reduce_allocated(sk, amount);
2410 }
2411 EXPORT_SYMBOL(__sk_mem_reclaim);
2412 
2413 int sk_set_peek_off(struct sock *sk, int val)
2414 {
2415         if (val < 0)
2416                 return -EINVAL;
2417 
2418         sk->sk_peek_off = val;
2419         return 0;
2420 }
2421 EXPORT_SYMBOL_GPL(sk_set_peek_off);
2422 
2423 /*
2424  * Set of default routines for initialising struct proto_ops when
2425  * the protocol does not support a particular function. In certain
2426  * cases where it makes no sense for a protocol to have a "do nothing"
2427  * function, some default processing is provided.
2428  */
2429 
2430 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2431 {
2432         return -EOPNOTSUPP;
2433 }
2434 EXPORT_SYMBOL(sock_no_bind);
2435 
2436 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2437                     int len, int flags)
2438 {
2439         return -EOPNOTSUPP;
2440 }
2441 EXPORT_SYMBOL(sock_no_connect);
2442 
2443 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2444 {
2445         return -EOPNOTSUPP;
2446 }
2447 EXPORT_SYMBOL(sock_no_socketpair);
2448 
2449 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
2450                    bool kern)
2451 {
2452         return -EOPNOTSUPP;
2453 }
2454 EXPORT_SYMBOL(sock_no_accept);
2455 
2456 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2457                     int *len, int peer)
2458 {
2459         return -EOPNOTSUPP;
2460 }
2461 EXPORT_SYMBOL(sock_no_getname);
2462 
2463 unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
2464 {
2465         return 0;
2466 }
2467 EXPORT_SYMBOL(sock_no_poll);
2468 
2469 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2470 {
2471         return -EOPNOTSUPP;
2472 }
2473 EXPORT_SYMBOL(sock_no_ioctl);
2474 
2475 int sock_no_listen(struct socket *sock, int backlog)
2476 {
2477         return -EOPNOTSUPP;
2478 }
2479 EXPORT_SYMBOL(sock_no_listen);
2480 
2481 int sock_no_shutdown(struct socket *sock, int how)
2482 {
2483         return -EOPNOTSUPP;
2484 }
2485 EXPORT_SYMBOL(sock_no_shutdown);
2486 
2487 int sock_no_setsockopt(struct socket *sock, int level, int optname,
2488                     char __user *optval, unsigned int optlen)
2489 {
2490         return -EOPNOTSUPP;
2491 }
2492 EXPORT_SYMBOL(sock_no_setsockopt);
2493 
2494 int sock_no_getsockopt(struct socket *sock, int level, int optname,
2495                     char __user *optval, int __user *optlen)
2496 {
2497         return -EOPNOTSUPP;
2498 }
2499 EXPORT_SYMBOL(sock_no_getsockopt);
2500 
2501 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
2502 {
2503         return -EOPNOTSUPP;
2504 }
2505 EXPORT_SYMBOL(sock_no_sendmsg);
2506 
2507 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
2508                     int flags)
2509 {
2510         return -EOPNOTSUPP;
2511 }
2512 EXPORT_SYMBOL(sock_no_recvmsg);
2513 
2514 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2515 {
2516         /* Mirror missing mmap method error code */
2517         return -ENODEV;
2518 }
2519 EXPORT_SYMBOL(sock_no_mmap);
2520 
2521 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2522 {
2523         ssize_t res;
2524         struct msghdr msg = {.msg_flags = flags};
2525         struct kvec iov;
2526         char *kaddr = kmap(page);
2527         iov.iov_base = kaddr + offset;
2528         iov.iov_len = size;
2529         res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2530         kunmap(page);
2531         return res;
2532 }
2533 EXPORT_SYMBOL(sock_no_sendpage);
2534 
2535 /*
2536  *      Default Socket Callbacks
2537  */
2538 
2539 static void sock_def_wakeup(struct sock *sk)
2540 {
2541         struct socket_wq *wq;
2542 
2543         rcu_read_lock();
2544         wq = rcu_dereference(sk->sk_wq);
2545         if (skwq_has_sleeper(wq))
2546                 wake_up_interruptible_all(&wq->wait);
2547         rcu_read_unlock();
2548 }
2549 
2550 static void sock_def_error_report(struct sock *sk)
2551 {
2552         struct socket_wq *wq;
2553 
2554         rcu_read_lock();
2555         wq = rcu_dereference(sk->sk_wq);
2556         if (skwq_has_sleeper(wq))
2557                 wake_up_interruptible_poll(&wq->wait, POLLERR);
2558         sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2559         rcu_read_unlock();
2560 }
2561 
2562 static void sock_def_readable(struct sock *sk)
2563 {
2564         struct socket_wq *wq;
2565 
2566         rcu_read_lock();
2567         wq = rcu_dereference(sk->sk_wq);
2568         if (skwq_has_sleeper(wq))
2569                 wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
2570                                                 POLLRDNORM | POLLRDBAND);
2571         sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2572         rcu_read_unlock();
2573 }
2574 
2575 static void sock_def_write_space(struct sock *sk)
2576 {
2577         struct socket_wq *wq;
2578 
2579         rcu_read_lock();
2580 
2581         /* Do not wake up a writer until he can make "significant"
2582          * progress.  --DaveM
2583          */
2584         if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2585                 wq = rcu_dereference(sk->sk_wq);
2586                 if (skwq_has_sleeper(wq))
2587                         wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
2588                                                 POLLWRNORM | POLLWRBAND);
2589 
2590                 /* Should agree with poll, otherwise some programs break */
2591                 if (sock_writeable(sk))
2592                         sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2593         }
2594 
2595         rcu_read_unlock();
2596 }
2597 
2598 static void sock_def_destruct(struct sock *sk)
2599 {
2600 }
2601 
2602 void sk_send_sigurg(struct sock *sk)
2603 {
2604         if (sk->sk_socket && sk->sk_socket->file)
2605                 if (send_sigurg(&sk->sk_socket->file->f_owner))
2606                         sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2607 }
2608 EXPORT_SYMBOL(sk_send_sigurg);
2609 
2610 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2611                     unsigned long expires)
2612 {
2613         if (!mod_timer(timer, expires))
2614                 sock_hold(sk);
2615 }
2616 EXPORT_SYMBOL(sk_reset_timer);
2617 
2618 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2619 {
2620         if (del_timer(timer))
2621                 __sock_put(sk);
2622 }
2623 EXPORT_SYMBOL(sk_stop_timer);
2624 
2625 void sock_init_data(struct socket *sock, struct sock *sk)
2626 {
2627         sk_init_common(sk);
2628         sk->sk_send_head        =       NULL;
2629 
2630         init_timer(&sk->sk_timer);
2631 
2632         sk->sk_allocation       =       GFP_KERNEL;
2633         sk->sk_rcvbuf           =       sysctl_rmem_default;
2634         sk->sk_sndbuf           =       sysctl_wmem_default;
2635         sk->sk_state            =       TCP_CLOSE;
2636         sk_set_socket(sk, sock);
2637 
2638         sock_set_flag(sk, SOCK_ZAPPED);
2639 
2640         if (sock) {
2641                 sk->sk_type     =       sock->type;
2642                 sk->sk_wq       =       sock->wq;
2643                 sock->sk        =       sk;
2644                 sk->sk_uid      =       SOCK_INODE(sock)->i_uid;
2645         } else {
2646                 sk->sk_wq       =       NULL;
2647                 sk->sk_uid      =       make_kuid(sock_net(sk)->user_ns, 0);
2648         }
2649 
2650         rwlock_init(&sk->sk_callback_lock);
2651         if (sk->sk_kern_sock)
2652                 lockdep_set_class_and_name(
2653                         &sk->sk_callback_lock,
2654                         af_kern_callback_keys + sk->sk_family,
2655                         af_family_kern_clock_key_strings[sk->sk_family]);
2656         else
2657                 lockdep_set_class_and_name(
2658                         &sk->sk_callback_lock,
2659                         af_callback_keys + sk->sk_family,
2660                         af_family_clock_key_strings[sk->sk_family]);
2661 
2662         sk->sk_state_change     =       sock_def_wakeup;
2663         sk->sk_data_ready       =       sock_def_readable;
2664         sk->sk_write_space      =       sock_def_write_space;
2665         sk->sk_error_report     =       sock_def_error_report;
2666         sk->sk_destruct         =       sock_def_destruct;
2667 
2668         sk->sk_frag.page        =       NULL;
2669         sk->sk_frag.offset      =       0;
2670         sk->sk_peek_off         =       -1;
2671 
2672         sk->sk_peer_pid         =       NULL;
2673         sk->sk_peer_cred        =       NULL;
2674         sk->sk_write_pending    =       0;
2675         sk->sk_rcvlowat         =       1;
2676         sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
2677         sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;
2678 
2679         sk->sk_stamp = SK_DEFAULT_STAMP;
2680 
2681 #ifdef CONFIG_NET_RX_BUSY_POLL
2682         sk->sk_napi_id          =       0;
2683         sk->sk_ll_usec          =       sysctl_net_busy_read;
2684 #endif
2685 
2686         sk->sk_max_pacing_rate = ~0U;
2687         sk->sk_pacing_rate = ~0U;
2688         sk->sk_incoming_cpu = -1;
2689         /*
2690          * Before updating sk_refcnt, we must commit prior changes to memory
2691          * (Documentation/RCU/rculist_nulls.txt for details)
2692          */
2693         smp_wmb();
2694         refcount_set(&sk->sk_refcnt, 1);
2695         atomic_set(&sk->sk_drops, 0);
2696 }
2697 EXPORT_SYMBOL(sock_init_data);
2698 
2699 void lock_sock_nested(struct sock *sk, int subclass)
2700 {
2701         might_sleep();
2702         spin_lock_bh(&sk->sk_lock.slock);
2703         if (sk->sk_lock.owned)
2704                 __lock_sock(sk);
2705         sk->sk_lock.owned = 1;
2706         spin_unlock(&sk->sk_lock.slock);
2707         /*
2708          * The sk_lock has mutex_lock() semantics here:
2709          */
2710         mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2711         local_bh_enable();
2712 }
2713 EXPORT_SYMBOL(lock_sock_nested);
2714 
2715 void release_sock(struct sock *sk)
2716 {
2717         spin_lock_bh(&sk->sk_lock.slock);
2718         if (sk->sk_backlog.tail)
2719                 __release_sock(sk);
2720 
2721         /* Warning : release_cb() might need to release sk ownership,
2722          * i.e. call sock_release_ownership(sk) before us.
2723          */
2724         if (sk->sk_prot->release_cb)
2725                 sk->sk_prot->release_cb(sk);
2726 
2727         sock_release_ownership(sk);
2728         if (waitqueue_active(&sk->sk_lock.wq))
2729                 wake_up(&sk->sk_lock.wq);
2730         spin_unlock_bh(&sk->sk_lock.slock);
2731 }
2732 EXPORT_SYMBOL(release_sock);
2733 
2734 /**
2735  * lock_sock_fast - fast version of lock_sock
2736  * @sk: socket
2737  *
2738  * This version should be used for very small sections, where the process won't block.
2739  * Returns false if the fast path is taken:
2740  *
2741  *   sk_lock.slock locked, owned = 0, BH disabled
2742  *
2743  * Returns true if the slow path is taken:
2744  *
2745  *   sk_lock.slock unlocked, owned = 1, BH enabled
2746  */
2747 bool lock_sock_fast(struct sock *sk)
2748 {
2749         might_sleep();
2750         spin_lock_bh(&sk->sk_lock.slock);
2751 
2752         if (!sk->sk_lock.owned)
2753                 /*
2754                  * Note : We must disable BH
2755                  */
2756                 return false;
2757 
2758         __lock_sock(sk);
2759         sk->sk_lock.owned = 1;
2760         spin_unlock(&sk->sk_lock.slock);
2761         /*
2762          * The sk_lock has mutex_lock() semantics here:
2763          */
2764         mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2765         local_bh_enable();
2766         return true;
2767 }
2768 EXPORT_SYMBOL(lock_sock_fast);
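Callers pair this with unlock_sock_fast(), passing the returned value back so the matching unlock path is taken; roughly (illustrative only):

	bool slow = lock_sock_fast(sk);

	/* short, non-sleeping critical section */
	unlock_sock_fast(sk, slow);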
2769 
2770 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2771 {
2772         struct timeval tv;
2773         if (!sock_flag(sk, SOCK_TIMESTAMP))
2774                 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2775         tv = ktime_to_timeval(sk->sk_stamp);
2776         if (tv.tv_sec == -1)
2777                 return -ENOENT;
2778         if (tv.tv_sec == 0) {
2779                 sk->sk_stamp = ktime_get_real();
2780                 tv = ktime_to_timeval(sk->sk_stamp);
2781         }
2782         return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2783 }
2784 EXPORT_SYMBOL(sock_get_timestamp);
2785 
2786 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2787 {
2788         struct timespec ts;
2789         if (!sock_flag(sk, SOCK_TIMESTAMP))
2790                 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2791         ts = ktime_to_timespec(sk->sk_stamp);
2792         if (ts.tv_sec == -1)
2793                 return -ENOENT;
2794         if (ts.tv_sec == 0) {
2795                 sk->sk_stamp = ktime_get_real();
2796                 ts = ktime_to_timespec(sk->sk_stamp);
2797         }
2798         return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2799 }
2800 EXPORT_SYMBOL(sock_get_timestampns);
2801 
2802 void sock_enable_timestamp(struct sock *sk, int flag)
2803 {
2804         if (!sock_flag(sk, flag)) {
2805                 unsigned long previous_flags = sk->sk_flags;
2806 
2807                 sock_set_flag(sk, flag);
2808                 /*
2809                  * we just set one of the two flags which require net
2810                  * time stamping, but time stamping might have been on
2811                  * already because of the other one
2812                  */
2813                 if (sock_needs_netstamp(sk) &&
2814                     !(previous_flags & SK_FLAGS_TIMESTAMP))
2815                         net_enable_timestamp();
2816         }
2817 }
2818 
2819 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
2820                        int level, int type)
2821 {
2822         struct sock_exterr_skb *serr;
2823         struct sk_buff *skb;
2824         int copied, err;
2825 
2826         err = -EAGAIN;
2827         skb = sock_dequeue_err_skb(sk);
2828         if (skb == NULL)
2829                 goto out;
2830 
2831         copied = skb->len;
2832         if (copied > len) {
2833                 msg->msg_flags |= MSG_TRUNC;
2834                 copied = len;
2835         }
2836         err = skb_copy_datagram_msg(skb, 0, msg, copied);
2837         if (err)
2838                 goto out_free_skb;
2839 
2840         sock_recv_timestamp(msg, sk, skb);
2841 
2842         serr = SKB_EXT_ERR(skb);
2843         put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
2844 
2845         msg->msg_flags |= MSG_ERRQUEUE;
2846         err = copied;
2847 
2848 out_free_skb:
2849         kfree_skb(skb);
2850 out:
2851         return err;
2852 }
2853 EXPORT_SYMBOL(sock_recv_errqueue);
2854 
2855 /*
2856  *      Get a socket option on a socket.
2857  *
2858  *      FIX: POSIX 1003.1g is very ambiguous here. It states that
2859  *      asynchronous errors should be reported by getsockopt. We assume
2860  *      this means if you specify SO_ERROR (otherwise what's the point of it).
2861  */
2862 int sock_common_getsockopt(struct socket *sock, int level, int optname,
2863                            char __user *optval, int __user *optlen)
2864 {
2865         struct sock *sk = sock->sk;
2866 
2867         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2868 }
2869 EXPORT_SYMBOL(sock_common_getsockopt);
2870 
2871 #ifdef CONFIG_COMPAT
2872 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2873                                   char __user *optval, int __user *optlen)
2874 {
2875         struct sock *sk = sock->sk;
2876 
2877         if (sk->sk_prot->compat_getsockopt != NULL)
2878                 return sk->sk_prot->compat_getsockopt(sk, level, optname,
2879                                                       optval, optlen);
2880         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2881 }
2882 EXPORT_SYMBOL(compat_sock_common_getsockopt);
2883 #endif
2884 
2885 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2886                         int flags)
2887 {
2888         struct sock *sk = sock->sk;
2889         int addr_len = 0;
2890         int err;
2891 
2892         err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
2893                                    flags & ~MSG_DONTWAIT, &addr_len);
2894         if (err >= 0)
2895                 msg->msg_namelen = addr_len;
2896         return err;
2897 }
2898 EXPORT_SYMBOL(sock_common_recvmsg);
2899 
2900 /*
2901  *      Set socket options on an inet socket.
2902  */
2903 int sock_common_setsockopt(struct socket *sock, int level, int optname,
2904                            char __user *optval, unsigned int optlen)
2905 {
2906         struct sock *sk = sock->sk;
2907 
2908         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2909 }
2910 EXPORT_SYMBOL(sock_common_setsockopt);
2911 
2912 #ifdef CONFIG_COMPAT
2913 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2914                                   char __user *optval, unsigned int optlen)
2915 {
2916         struct sock *sk = sock->sk;
2917 
2918         if (sk->sk_prot->compat_setsockopt != NULL)
2919                 return sk->sk_prot->compat_setsockopt(sk, level, optname,
2920                                                       optval, optlen);
2921         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2922 }
2923 EXPORT_SYMBOL(compat_sock_common_setsockopt);
2924 #endif
2925 
2926 void sk_common_release(struct sock *sk)
2927 {
2928         if (sk->sk_prot->destroy)
2929                 sk->sk_prot->destroy(sk);
2930 
2931         /*
2932          * Observation: when sk_common_release() is called, processes have
2933          * no access to the socket, but the network stack still does.
2934          * Step one, detach it from networking:
2935          *
2936          * A. Remove from hash tables.
2937          */
2938 
2939         sk->sk_prot->unhash(sk);
2940 
2941         /*
2942          * At this point the socket cannot receive new packets, but it is possible
2943          * that some packets are still in flight because some CPU ran the receiver
2944          * and did the hash table lookup before we unhashed the socket. They will
2945          * reach the receive queue and be purged by the socket destructor.
2946          *
2947          * We may also still have packets pending on the receive queue and, probably,
2948          * our own packets waiting in device queues. sock_destroy will drain the
2949          * receive queue, but transmitted packets will delay socket destruction
2950          * until the last reference is released.
2951          */
2952 
2953         sock_orphan(sk);
2954 
2955         xfrm_sk_free_policy(sk);
2956 
2957         sk_refcnt_debug_release(sk);
2958 
2959         sock_put(sk);
2960 }
2961 EXPORT_SYMBOL(sk_common_release);
2962 
2963 void sk_get_meminfo(const struct sock *sk, u32 *mem)
2964 {
2965         memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
2966 
2967         mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
2968         mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf;
2969         mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
2970         mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf;
2971         mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
2972         mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued;
2973         mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
2974         mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len;
2975         mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
2976 }
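Userspace can read the counters filled in here through getsockopt(SOL_SOCKET, SO_MEMINFO, ...), the option handled in the getsockopt path near the top of this section; a rough sketch (fd is a placeholder socket descriptor):

	__u32 meminfo[SK_MEMINFO_VARS];
	socklen_t len = sizeof(meminfo);

	if (getsockopt(fd, SOL_SOCKET, SO_MEMINFO, meminfo, &len) == 0)
		printf("rmem_alloc=%u rcvbuf=%u drops=%u\n",
		       meminfo[SK_MEMINFO_RMEM_ALLOC],
		       meminfo[SK_MEMINFO_RCVBUF],
		       meminfo[SK_MEMINFO_DROPS]);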
2977 
2978 #ifdef CONFIG_PROC_FS
2979 #define PROTO_INUSE_NR  64      /* should be enough for the first time */
2980 struct prot_inuse {
2981         int val[PROTO_INUSE_NR];
2982 };
2983 
2984 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2985 
2986 #ifdef CONFIG_NET_NS
2987 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2988 {
2989         __this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
2990 }
2991 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2992 
2993 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2994 {
2995         int cpu, idx = prot->inuse_idx;
2996         int res = 0;
2997 
2998         for_each_possible_cpu(cpu)
2999                 res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
3000 
3001         return res >= 0 ? res : 0;
3002 }
3003 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3004 
3005 static int __net_init sock_inuse_init_net(struct net *net)
3006 {
3007         net->core.inuse = alloc_percpu(struct prot_inuse);
3008         return net->core.inuse ? 0 : -ENOMEM;
3009 }
3010 
3011 static void __net_exit sock_inuse_exit_net(struct net *net)
3012 {
3013         free_percpu(net->core.inuse);
3014 }
3015 
3016 static struct pernet_operations net_inuse_ops = {
3017         .init = sock_inuse_init_net,
3018         .exit = sock_inuse_exit_net,
3019 };
3020 
3021 static __init int net_inuse_init(void)
3022 {
3023         if (register_pernet_subsys(&net_inuse_ops))
3024                 panic("Cannot initialize net inuse counters");
3025 
3026         return 0;
3027 }
3028 
3029 core_initcall(net_inuse_init);
3030 #else
3031 static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
3032 
3033 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
3034 {
3035         __this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
3036 }
3037 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
3038 
3039 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3040 {
3041         int cpu, idx = prot->inuse_idx;
3042         int res = 0;
3043 
3044         for_each_possible_cpu(cpu)
3045                 res += per_cpu(prot_inuse, cpu).val[idx];
3046 
3047         return res >= 0 ? res : 0;
3048 }
3049 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3050 #endif
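
/*
 * Editor's hedged example (not from this file): protocols normally call
 * sock_prot_inuse_add() with +1 when a socket is hashed and with -1 when it
 * is unhashed, so that sock_prot_inuse_get() reports the number of sockets
 * currently in use (this mirrors the inet hash code).  "example_hash" and
 * "example_unhash" are hypothetical ->hash/->unhash callbacks.
 */
static int example_hash(struct sock *sk)
{
        /* ... insert sk into the protocol's lookup tables ... */
        sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
        return 0;
}

static void example_unhash(struct sock *sk)
{
        /* ... remove sk from the lookup tables ... */
        sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
}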
3051 
3052 static void assign_proto_idx(struct proto *prot)
3053 {
3054         prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3055 
3056         if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3057                 pr_err("PROTO_INUSE_NR exhausted\n");
3058                 return;
3059         }
3060 
3061         set_bit(prot->inuse_idx, proto_inuse_idx);
3062 }
3063 
3064 static void release_proto_idx(struct proto *prot)
3065 {
3066         if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3067                 clear_bit(prot->inuse_idx, proto_inuse_idx);
3068 }
3069 #else
3070 static inline void assign_proto_idx(struct proto *prot)
3071 {
3072 }
3073 
3074 static inline void release_proto_idx(struct proto *prot)
3075 {
3076 }
3077 #endif
3078 
3079 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3080 {
3081         if (!rsk_prot)
3082                 return;
3083         kfree(rsk_prot->slab_name);
3084         rsk_prot->slab_name = NULL;
3085         kmem_cache_destroy(rsk_prot->slab);
3086         rsk_prot->slab = NULL;
3087 }
3088 
3089 static int req_prot_init(const struct proto *prot)
3090 {
3091         struct request_sock_ops *rsk_prot = prot->rsk_prot;
3092 
3093         if (!rsk_prot)
3094                 return 0;
3095 
3096         rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3097                                         prot->name);
3098         if (!rsk_prot->slab_name)
3099                 return -ENOMEM;
3100 
3101         rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3102                                            rsk_prot->obj_size, 0,
3103                                            prot->slab_flags, NULL);
3104 
3105         if (!rsk_prot->slab) {
3106                 pr_crit("%s: Can't create request sock SLAB cache!\n",
3107                         prot->name);
3108                 return -ENOMEM;
3109         }
3110         return 0;
3111 }
3112 
3113 int proto_register(struct proto *prot, int alloc_slab)
3114 {
3115         if (alloc_slab) {
3116                 prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
3117                                         SLAB_HWCACHE_ALIGN | prot->slab_flags,
3118                                         NULL);
3119 
3120                 if (prot->slab == NULL) {
3121                         pr_crit("%s: Can't create sock SLAB cache!\n",
3122                                 prot->name);
3123                         goto out;
3124                 }
3125 
3126                 if (req_prot_init(prot))
3127                         goto out_free_request_sock_slab;
3128 
3129                 if (prot->twsk_prot != NULL) {
3130                         prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
3131 
3132                         if (prot->twsk_prot->twsk_slab_name == NULL)
3133                                 goto out_free_request_sock_slab;
3134 
3135                         prot->twsk_prot->twsk_slab =
3136                                 kmem_cache_create(prot->twsk_prot->twsk_slab_name,
3137                                                   prot->twsk_prot->twsk_obj_size,
3138                                                   0,
3139                                                   prot->slab_flags,
3140                                                   NULL);
3141                         if (prot->twsk_prot->twsk_slab == NULL)
3142                                 goto out_free_timewait_sock_slab_name;
3143                 }
3144         }
3145 
3146         mutex_lock(&proto_list_mutex);
3147         list_add(&prot->node, &proto_list);
3148         assign_proto_idx(prot);
3149         mutex_unlock(&proto_list_mutex);
3150         return 0;
3151 
3152 out_free_timewait_sock_slab_name:
3153         kfree(prot->twsk_prot->twsk_slab_name);
3154 out_free_request_sock_slab:
3155         req_prot_cleanup(prot->rsk_prot);
3156 
3157         kmem_cache_destroy(prot->slab);
3158         prot->slab = NULL;
3159 out:
3160         return -ENOBUFS;
3161 }
3162 EXPORT_SYMBOL(proto_register);
3163 
3164 void proto_unregister(struct proto *prot)
3165 {
3166         mutex_lock(&proto_list_mutex);
3167         release_proto_idx(prot);
3168         list_del(&prot->node);
3169         mutex_unlock(&proto_list_mutex);
3170 
3171         kmem_cache_destroy(prot->slab);
3172         prot->slab = NULL;
3173 
3174         req_prot_cleanup(prot->rsk_prot);
3175 
3176         if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
3177                 kmem_cache_destroy(prot->twsk_prot->twsk_slab);
3178                 kfree(prot->twsk_prot->twsk_slab_name);
3179                 prot->twsk_prot->twsk_slab = NULL;
3180         }
3181 }
3182 EXPORT_SYMBOL(proto_unregister);
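
/*
 * Editor's hedged example (not part of sock.c): a protocol module typically
 * calls proto_register() from its init path, with alloc_slab=1 so the sock
 * slab cache above is created for it, and proto_unregister() on the exit or
 * error path.  "example_proto" and its fields are hypothetical; obj_size
 * would normally be the size of the protocol's private sock structure.
 */
static struct proto example_proto = {
        .name     = "EXAMPLE",
        .owner    = THIS_MODULE,
        .obj_size = sizeof(struct sock),
};

static int __init example_proto_init(void)
{
        return proto_register(&example_proto, 1);
}

static void __exit example_proto_exit(void)
{
        proto_unregister(&example_proto);
}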
3183 
3184 #ifdef CONFIG_PROC_FS
3185 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3186         __acquires(proto_list_mutex)
3187 {
3188         mutex_lock(&proto_list_mutex);
3189         return seq_list_start_head(&proto_list, *pos);
3190 }
3191 
3192 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3193 {
3194         return seq_list_next(v, &proto_list, pos);
3195 }
3196 
3197 static void proto_seq_stop(struct seq_file *seq, void *v)
3198         __releases(proto_list_mutex)
3199 {
3200         mutex_unlock(&proto_list_mutex);
3201 }
3202 
3203 static char proto_method_implemented(const void *method)
3204 {
3205         return method == NULL ? 'n' : 'y';
3206 }
3207 static long sock_prot_memory_allocated(struct proto *proto)
3208 {
3209         return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3210 }
3211 
3212 static char *sock_prot_memory_pressure(struct proto *proto)
3213 {
3214         return proto->memory_pressure != NULL ?
3215         proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3216 }
3217 
3218 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3219 {
3220 
3221         seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
3222                         "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3223                    proto->name,
3224                    proto->obj_size,
3225                    sock_prot_inuse_get(seq_file_net(seq), proto),
3226                    sock_prot_memory_allocated(proto),
3227                    sock_prot_memory_pressure(proto),
3228                    proto->max_header,
3229                    proto->slab == NULL ? "no" : "yes",
3230                    module_name(proto->owner),
3231                    proto_method_implemented(proto->close),
3232                    proto_method_implemented(proto->connect),
3233                    proto_method_implemented(proto->disconnect),
3234                    proto_method_implemented(proto->accept),
3235                    proto_method_implemented(proto->ioctl),
3236                    proto_method_implemented(proto->init),
3237                    proto_method_implemented(proto->destroy),
3238                    proto_method_implemented(proto->shutdown),
3239                    proto_method_implemented(proto->setsockopt),
3240                    proto_method_implemented(proto->getsockopt),
3241                    proto_method_implemented(proto->sendmsg),
3242                    proto_method_implemented(proto->recvmsg),
3243                    proto_method_implemented(proto->sendpage),
3244                    proto_method_implemented(proto->bind),
3245                    proto_method_implemented(proto->backlog_rcv),
3246                    proto_method_implemented(proto->hash),
3247                    proto_method_implemented(proto->unhash),
3248                    proto_method_implemented(proto->get_port),
3249                    proto_method_implemented(proto->enter_memory_pressure));
3250 }
3251 
3252 static int proto_seq_show(struct seq_file *seq, void *v)
3253 {
3254         if (v == &proto_list)
3255                 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3256                            "protocol",
3257                            "size",
3258                            "sockets",
3259                            "memory",
3260                            "press",
3261                            "maxhdr",
3262                            "slab",
3263                            "module",
3264                            "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3265         else
3266                 proto_seq_printf(seq, list_entry(v, struct proto, node));
3267         return 0;
3268 }
3269 
3270 static const struct seq_operations proto_seq_ops = {
3271         .start  = proto_seq_start,
3272         .next   = proto_seq_next,
3273         .stop   = proto_seq_stop,
3274         .show   = proto_seq_show,
3275 };
3276 
3277 static int proto_seq_open(struct inode *inode, struct file *file)
3278 {
3279         return seq_open_net(inode, file, &proto_seq_ops,
3280                             sizeof(struct seq_net_private));
3281 }
3282 
3283 static const struct file_operations proto_seq_fops = {
3284         .owner          = THIS_MODULE,
3285         .open           = proto_seq_open,
3286         .read           = seq_read,
3287         .llseek         = seq_lseek,
3288         .release        = seq_release_net,
3289 };
3290 
3291 static __net_init int proto_init_net(struct net *net)
3292 {
3293         if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))
3294                 return -ENOMEM;
3295 
3296         return 0;
3297 }
3298 
3299 static __net_exit void proto_exit_net(struct net *net)
3300 {
3301         remove_proc_entry("protocols", net->proc_net);
3302 }
3303 
3304 
3305 static __net_initdata struct pernet_operations proto_net_ops = {
3306         .init = proto_init_net,
3307         .exit = proto_exit_net,
3308 };
3309 
3310 static int __init proto_init(void)
3311 {
3312         return register_pernet_subsys(&proto_net_ops);
3313 }
3314 
3315 subsys_initcall(proto_init);
3316 
3317 #endif /* PROC_FS */
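
/*
 * Editor's hedged note: the seq_file code above backs the per-namespace
 * /proc/net/protocols file.  A trivial userspace reader is sketched below for
 * illustration only; it is not part of the kernel source, so it is fenced off
 * with #if 0.
 */
#if 0   /* userspace example, kept out of the kernel build */
#include <stdio.h>

int main(void)
{
        char line[512];
        FILE *f = fopen("/proc/net/protocols", "r");

        if (!f)
                return 1;
        while (fgets(line, sizeof(line), f))
                fputs(line, stdout);
        fclose(f);
        return 0;
}
#endif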
3318 
3319 #ifdef CONFIG_NET_RX_BUSY_POLL
3320 bool sk_busy_loop_end(void *p, unsigned long start_time)
3321 {
3322         struct sock *sk = p;
3323 
3324         return !skb_queue_empty(&sk->sk_receive_queue) ||
3325                sk_busy_loop_timeout(sk, start_time);
3326 }
3327 EXPORT_SYMBOL(sk_busy_loop_end);
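
/*
 * Editor's hedged sketch (not part of sock.c): sk_busy_loop_end() above is
 * intended as the loop_end callback of napi_busy_loop(), which keeps polling
 * the NAPI context until data lands on sk_receive_queue or the socket's
 * busy-poll timeout expires.  This mirrors sk_busy_loop() in
 * include/net/busy_poll.h; the three-argument napi_busy_loop() signature is
 * assumed for this kernel generation, and the function name is hypothetical.
 */
static void example_busy_poll(struct sock *sk, bool nonblock)
{
        unsigned int napi_id = READ_ONCE(sk->sk_napi_id);

        if (napi_id >= MIN_NAPI_ID)
                napi_busy_loop(napi_id, nonblock ? NULL : sk_busy_loop_end, sk);
}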
3328 #endif /* CONFIG_NET_RX_BUSY_POLL */
3329 
