TOMOYO Linux Cross Reference
Linux/net/core/sock.c


  1 /*
  2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
  3  *              operating system.  INET is implemented using the  BSD Socket
  4  *              interface as the means of communication with the user level.
  5  *
  6  *              Generic socket support routines. Memory allocators, socket lock/release
  7  *              handler for protocols to use and generic option handler.
  8  *
  9  *
 10  * Authors:     Ross Biro
 11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 12  *              Florian La Roche, <flla@stud.uni-sb.de>
 13  *              Alan Cox, <A.Cox@swansea.ac.uk>
 14  *
 15  * Fixes:
 16  *              Alan Cox        :       Numerous verify_area() problems
 17  *              Alan Cox        :       Connecting on a connecting socket
 18  *                                      now returns an error for tcp.
 19  *              Alan Cox        :       sock->protocol is set correctly.
 20  *                                      and is not sometimes left as 0.
 21  *              Alan Cox        :       connect handles icmp errors on a
 22  *                                      connect properly. Unfortunately there
 23  *                                      is a restart syscall nasty there. I
 24  *                                      can't match BSD without hacking the C
 25  *                                      library. Ideas urgently sought!
 26  *              Alan Cox        :       Disallow bind() to addresses that are
 27  *                                      not ours - especially broadcast ones!!
 28  *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
 29  *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
 30  *                                      instead they leave that for the DESTROY timer.
 31  *              Alan Cox        :       Clean up error flag in accept
 32  *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
 33  *                                      was buggy. Put a remove_sock() in the handler
 34  *                                      for memory when we hit 0. Also altered the timer
 35  *                                      code. The ACK stuff can wait and needs major
 36  *                                      TCP layer surgery.
 37  *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
 38  *                                      and fixed timer/inet_bh race.
 39  *              Alan Cox        :       Added zapped flag for TCP
 40  *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
 41  *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
 42  *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
 43  *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
 44  *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
 45  *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
 46  *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
 47  *      Pauline Middelink       :       identd support
 48  *              Alan Cox        :       Fixed connect() taking signals I think.
 49  *              Alan Cox        :       SO_LINGER supported
 50  *              Alan Cox        :       Error reporting fixes
 51  *              Anonymous       :       inet_create tidied up (sk->reuse setting)
 52  *              Alan Cox        :       inet sockets don't set sk->type!
 53  *              Alan Cox        :       Split socket option code
 54  *              Alan Cox        :       Callbacks
 55  *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
 56  *              Alex            :       Removed restriction on inet fioctl
 57  *              Alan Cox        :       Splitting INET from NET core
 58  *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
 59  *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
 60  *              Alan Cox        :       Split IP from generic code
 61  *              Alan Cox        :       New kfree_skbmem()
 62  *              Alan Cox        :       Make SO_DEBUG superuser only.
 63  *              Alan Cox        :       Allow anyone to clear SO_DEBUG
 64  *                                      (compatibility fix)
 65  *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
 66  *              Alan Cox        :       Allocator for a socket is settable.
 67  *              Alan Cox        :       SO_ERROR includes soft errors.
 68  *              Alan Cox        :       Allow NULL arguments on some SO_ opts
 69  *              Alan Cox        :       Generic socket allocation to make hooks
 70  *                                      easier (suggested by Craig Metz).
 71  *              Michael Pall    :       SO_ERROR returns positive errno again
 72  *              Steve Whitehouse:       Added default destructor to free
 73  *                                      protocol private data.
 74  *              Steve Whitehouse:       Added various other default routines
 75  *                                      common to several socket families.
 76  *              Chris Evans     :       Call suser() check last on F_SETOWN
 77  *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
 78  *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
 79  *              Andi Kleen      :       Fix write_space callback
 80  *              Chris Evans     :       Security fixes - signedness again
 81  *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
 82  *
 83  * To Fix:
 84  *
 85  *
 86  *              This program is free software; you can redistribute it and/or
 87  *              modify it under the terms of the GNU General Public License
 88  *              as published by the Free Software Foundation; either version
 89  *              2 of the License, or (at your option) any later version.
 90  */
 91 
 92 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 93 
 94 #include <linux/capability.h>
 95 #include <linux/errno.h>
 96 #include <linux/types.h>
 97 #include <linux/socket.h>
 98 #include <linux/in.h>
 99 #include <linux/kernel.h>
100 #include <linux/module.h>
101 #include <linux/proc_fs.h>
102 #include <linux/seq_file.h>
103 #include <linux/sched.h>
104 #include <linux/timer.h>
105 #include <linux/string.h>
106 #include <linux/sockios.h>
107 #include <linux/net.h>
108 #include <linux/mm.h>
109 #include <linux/slab.h>
110 #include <linux/interrupt.h>
111 #include <linux/poll.h>
112 #include <linux/tcp.h>
113 #include <linux/init.h>
114 #include <linux/highmem.h>
115 #include <linux/user_namespace.h>
116 #include <linux/static_key.h>
117 #include <linux/memcontrol.h>
118 #include <linux/prefetch.h>
119 
120 #include <asm/uaccess.h>
121 
122 #include <linux/netdevice.h>
123 #include <net/protocol.h>
124 #include <linux/skbuff.h>
125 #include <net/net_namespace.h>
126 #include <net/request_sock.h>
127 #include <net/sock.h>
128 #include <linux/net_tstamp.h>
129 #include <net/xfrm.h>
130 #include <linux/ipsec.h>
131 #include <net/cls_cgroup.h>
132 #include <net/netprio_cgroup.h>
133 
134 #include <linux/filter.h>
135 
136 #include <trace/events/sock.h>
137 
138 #ifdef CONFIG_INET
139 #include <net/tcp.h>
140 #endif
141 
142 static DEFINE_MUTEX(proto_list_mutex);
143 static LIST_HEAD(proto_list);
144 
145 #ifdef CONFIG_MEMCG_KMEM
146 int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
147 {
148         struct proto *proto;
149         int ret = 0;
150 
151         mutex_lock(&proto_list_mutex);
152         list_for_each_entry(proto, &proto_list, node) {
153                 if (proto->init_cgroup) {
154                         ret = proto->init_cgroup(memcg, ss);
155                         if (ret)
156                                 goto out;
157                 }
158         }
159 
160         mutex_unlock(&proto_list_mutex);
161         return ret;
162 out:
163         list_for_each_entry_continue_reverse(proto, &proto_list, node)
164                 if (proto->destroy_cgroup)
165                         proto->destroy_cgroup(memcg);
166         mutex_unlock(&proto_list_mutex);
167         return ret;
168 }
169 
170 void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg)
171 {
172         struct proto *proto;
173 
174         mutex_lock(&proto_list_mutex);
175         list_for_each_entry_reverse(proto, &proto_list, node)
176                 if (proto->destroy_cgroup)
177                         proto->destroy_cgroup(memcg);
178         mutex_unlock(&proto_list_mutex);
179 }
180 #endif
181 
182 /*
183  * Each address family might have different locking rules, so we have
184  * one slock key per address family:
185  */
186 static struct lock_class_key af_family_keys[AF_MAX];
187 static struct lock_class_key af_family_slock_keys[AF_MAX];
188 
189 #if defined(CONFIG_MEMCG_KMEM)
190 struct static_key memcg_socket_limit_enabled;
191 EXPORT_SYMBOL(memcg_socket_limit_enabled);
192 #endif
193 
194 /*
 195  * Make lock validator output more readable. (We pre-construct these
 196  * strings at build time, so that runtime initialization of socket
 197  * locks is fast.)
198  */
199 static const char *const af_family_key_strings[AF_MAX+1] = {
200   "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
201   "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
202   "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
203   "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
204   "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
205   "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
206   "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
207   "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
208   "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
209   "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
 210   "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-AF_IUCV"     ,
211   "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
212   "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
213   "sk_lock-AF_NFC"   , "sk_lock-AF_VSOCK"    , "sk_lock-AF_MAX"
214 };
215 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
216   "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
217   "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
218   "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
219   "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
220   "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
221   "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
222   "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
223   "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
224   "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
225   "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
226   "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
227   "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
228   "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
 229   "slock-AF_NFC"   , "slock-AF_VSOCK"    , "slock-AF_MAX"
230 };
231 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
232   "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
233   "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
234   "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
235   "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
236   "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
237   "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
238   "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
239   "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
240   "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
241   "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
242   "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
243   "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
244   "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
245   "clock-AF_NFC"   , "clock-AF_VSOCK"    , "clock-AF_MAX"
246 };
247 
248 /*
249  * sk_callback_lock locking rules are per-address-family,
250  * so split the lock classes by using a per-AF key:
251  */
252 static struct lock_class_key af_callback_keys[AF_MAX];
253 
254 /* Take into consideration the size of the struct sk_buff overhead in the
255  * determination of these values, since that is non-constant across
256  * platforms.  This makes socket queueing behavior and performance
257  * not depend upon such differences.
258  */
259 #define _SK_MEM_PACKETS         256
260 #define _SK_MEM_OVERHEAD        SKB_TRUESIZE(256)
261 #define SK_WMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
262 #define SK_RMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
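     /* Illustrative arithmetic only (exact numbers vary by architecture and
      * config): SKB_TRUESIZE(256) adds the aligned sizes of struct sk_buff
      * and struct skb_shared_info to the 256 bytes of payload. If that came
      * to roughly 768 bytes on a given build, the two caps above would work
      * out to 256 * 768 = 196608 bytes (192 KiB).
      */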
263 
264 /* Run time adjustable parameters. */
265 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
266 EXPORT_SYMBOL(sysctl_wmem_max);
267 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
268 EXPORT_SYMBOL(sysctl_rmem_max);
269 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
270 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
271 
272 /* Maximal space eaten by iovec or ancillary data plus some space */
273 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
274 EXPORT_SYMBOL(sysctl_optmem_max);
275 
276 struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
277 EXPORT_SYMBOL_GPL(memalloc_socks);
278 
279 /**
280  * sk_set_memalloc - sets %SOCK_MEMALLOC
281  * @sk: socket to set it on
282  *
283  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
284  * It's the responsibility of the admin to adjust min_free_kbytes
 285  * to meet the requirements.
286  */
287 void sk_set_memalloc(struct sock *sk)
288 {
289         sock_set_flag(sk, SOCK_MEMALLOC);
290         sk->sk_allocation |= __GFP_MEMALLOC;
291         static_key_slow_inc(&memalloc_socks);
292 }
293 EXPORT_SYMBOL_GPL(sk_set_memalloc);
294 
295 void sk_clear_memalloc(struct sock *sk)
296 {
297         sock_reset_flag(sk, SOCK_MEMALLOC);
298         sk->sk_allocation &= ~__GFP_MEMALLOC;
299         static_key_slow_dec(&memalloc_socks);
300 
301         /*
302          * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
303          * progress of swapping. However, if SOCK_MEMALLOC is cleared while
304          * it has rmem allocations there is a risk that the user of the
305          * socket cannot make forward progress due to exceeding the rmem
306          * limits. By rights, sk_clear_memalloc() should only be called
307          * on sockets being torn down but warn and reset the accounting if
308          * that assumption breaks.
309          */
310         if (WARN_ON(sk->sk_forward_alloc))
311                 sk_mem_reclaim(sk);
312 }
313 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
314 
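     /* Run the protocol's backlog receive handler for a SOCK_MEMALLOC socket
      * with PF_MEMALLOC temporarily set, so that allocations made while
      * processing the skb can dip into the memory reserves that such sockets
      * are entitled to use.
      */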
315 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
316 {
317         int ret;
318         unsigned long pflags = current->flags;
319 
320         /* these should have been dropped before queueing */
321         BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
322 
323         current->flags |= PF_MEMALLOC;
324         ret = sk->sk_backlog_rcv(sk, skb);
325         tsk_restore_flags(current, pflags, PF_MEMALLOC);
326 
327         return ret;
328 }
329 EXPORT_SYMBOL(__sk_backlog_rcv);
330 
331 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
332 {
333         struct timeval tv;
334 
335         if (optlen < sizeof(tv))
336                 return -EINVAL;
337         if (copy_from_user(&tv, optval, sizeof(tv)))
338                 return -EFAULT;
339         if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
340                 return -EDOM;
341 
342         if (tv.tv_sec < 0) {
343                 static int warned __read_mostly;
344 
345                 *timeo_p = 0;
346                 if (warned < 10 && net_ratelimit()) {
347                         warned++;
348                         pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
349                                 __func__, current->comm, task_pid_nr(current));
350                 }
351                 return 0;
352         }
353         *timeo_p = MAX_SCHEDULE_TIMEOUT;
354         if (tv.tv_sec == 0 && tv.tv_usec == 0)
355                 return 0;
356         if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
357                 *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
358         return 0;
359 }
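     /* Worked example (illustrative, assuming HZ == 1000): a request of
      * { .tv_sec = 1, .tv_usec = 500000 } becomes
      * 1 * 1000 + (500000 + 999) / 1000 = 1500 jiffies, while { 0, 0 }
      * leaves *timeo_p at MAX_SCHEDULE_TIMEOUT, i.e. "wait forever".
      */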
360 
361 static void sock_warn_obsolete_bsdism(const char *name)
362 {
363         static int warned;
364         static char warncomm[TASK_COMM_LEN];
365         if (strcmp(warncomm, current->comm) && warned < 5) {
366                 strcpy(warncomm,  current->comm);
367                 pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
368                         warncomm, name);
369                 warned++;
370         }
371 }
372 
373 #define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
374 
375 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
376 {
377         if (sk->sk_flags & flags) {
378                 sk->sk_flags &= ~flags;
379                 if (!(sk->sk_flags & SK_FLAGS_TIMESTAMP))
380                         net_disable_timestamp();
381         }
382 }
383 
384 
385 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
386 {
387         int err;
388         int skb_len;
389         unsigned long flags;
390         struct sk_buff_head *list = &sk->sk_receive_queue;
391 
392         if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
393                 atomic_inc(&sk->sk_drops);
394                 trace_sock_rcvqueue_full(sk, skb);
395                 return -ENOMEM;
396         }
397 
398         err = sk_filter(sk, skb);
399         if (err)
400                 return err;
401 
402         if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
403                 atomic_inc(&sk->sk_drops);
404                 return -ENOBUFS;
405         }
406 
407         skb->dev = NULL;
408         skb_set_owner_r(skb, sk);
409 
410         /* Cache the SKB length before we tack it onto the receive
411          * queue.  Once it is added it no longer belongs to us and
412          * may be freed by other threads of control pulling packets
413          * from the queue.
414          */
415         skb_len = skb->len;
416 
 417         /* We escape from the rcu protected region, so make sure we
 418          * don't leak a non-refcounted dst.
 419          */
420         skb_dst_force(skb);
421 
422         spin_lock_irqsave(&list->lock, flags);
423         skb->dropcount = atomic_read(&sk->sk_drops);
424         __skb_queue_tail(list, skb);
425         spin_unlock_irqrestore(&list->lock, flags);
426 
427         if (!sock_flag(sk, SOCK_DEAD))
428                 sk->sk_data_ready(sk, skb_len);
429         return 0;
430 }
431 EXPORT_SYMBOL(sock_queue_rcv_skb);
432 
433 int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
434 {
435         int rc = NET_RX_SUCCESS;
436 
437         if (sk_filter(sk, skb))
438                 goto discard_and_relse;
439 
440         skb->dev = NULL;
441 
442         if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) {
443                 atomic_inc(&sk->sk_drops);
444                 goto discard_and_relse;
445         }
446         if (nested)
447                 bh_lock_sock_nested(sk);
448         else
449                 bh_lock_sock(sk);
450         if (!sock_owned_by_user(sk)) {
451                 /*
452                  * trylock + unlock semantics:
453                  */
454                 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
455 
456                 rc = sk_backlog_rcv(sk, skb);
457 
458                 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
459         } else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
460                 bh_unlock_sock(sk);
461                 atomic_inc(&sk->sk_drops);
462                 goto discard_and_relse;
463         }
464 
465         bh_unlock_sock(sk);
466 out:
467         sock_put(sk);
468         return rc;
469 discard_and_relse:
470         kfree_skb(skb);
471         goto out;
472 }
473 EXPORT_SYMBOL(sk_receive_skb);
474 
475 void sk_reset_txq(struct sock *sk)
476 {
477         sk_tx_queue_clear(sk);
478 }
479 EXPORT_SYMBOL(sk_reset_txq);
480 
481 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
482 {
483         struct dst_entry *dst = __sk_dst_get(sk);
484 
485         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
486                 sk_tx_queue_clear(sk);
487                 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
488                 dst_release(dst);
489                 return NULL;
490         }
491 
492         return dst;
493 }
494 EXPORT_SYMBOL(__sk_dst_check);
495 
496 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
497 {
498         struct dst_entry *dst = sk_dst_get(sk);
499 
500         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
501                 sk_dst_reset(sk);
502                 dst_release(dst);
503                 return NULL;
504         }
505 
506         return dst;
507 }
508 EXPORT_SYMBOL(sk_dst_check);
509 
510 static int sock_setbindtodevice(struct sock *sk, char __user *optval,
511                                 int optlen)
512 {
513         int ret = -ENOPROTOOPT;
514 #ifdef CONFIG_NETDEVICES
515         struct net *net = sock_net(sk);
516         char devname[IFNAMSIZ];
517         int index;
518 
519         /* Sorry... */
520         ret = -EPERM;
521         if (!ns_capable(net->user_ns, CAP_NET_RAW))
522                 goto out;
523 
524         ret = -EINVAL;
525         if (optlen < 0)
526                 goto out;
527 
528         /* Bind this socket to a particular device like "eth0",
529          * as specified in the passed interface name. If the
530          * name is "" or the option length is zero the socket
531          * is not bound.
532          */
533         if (optlen > IFNAMSIZ - 1)
534                 optlen = IFNAMSIZ - 1;
535         memset(devname, 0, sizeof(devname));
536 
537         ret = -EFAULT;
538         if (copy_from_user(devname, optval, optlen))
539                 goto out;
540 
541         index = 0;
542         if (devname[0] != '\0') {
543                 struct net_device *dev;
544 
545                 rcu_read_lock();
546                 dev = dev_get_by_name_rcu(net, devname);
547                 if (dev)
548                         index = dev->ifindex;
549                 rcu_read_unlock();
550                 ret = -ENODEV;
551                 if (!dev)
552                         goto out;
553         }
554 
555         lock_sock(sk);
556         sk->sk_bound_dev_if = index;
557         sk_dst_reset(sk);
558         release_sock(sk);
559 
560         ret = 0;
561 
562 out:
563 #endif
564 
565         return ret;
566 }
567 
568 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
569                                 int __user *optlen, int len)
570 {
571         int ret = -ENOPROTOOPT;
572 #ifdef CONFIG_NETDEVICES
573         struct net *net = sock_net(sk);
574         struct net_device *dev;
575         char devname[IFNAMSIZ];
576         unsigned seq;
577 
578         if (sk->sk_bound_dev_if == 0) {
579                 len = 0;
580                 goto zero;
581         }
582 
583         ret = -EINVAL;
584         if (len < IFNAMSIZ)
585                 goto out;
586 
587 retry:
588         seq = read_seqcount_begin(&devnet_rename_seq);
589         rcu_read_lock();
590         dev = dev_get_by_index_rcu(net, sk->sk_bound_dev_if);
591         ret = -ENODEV;
592         if (!dev) {
593                 rcu_read_unlock();
594                 goto out;
595         }
596 
597         strcpy(devname, dev->name);
598         rcu_read_unlock();
599         if (read_seqcount_retry(&devnet_rename_seq, seq))
600                 goto retry;
601 
602         len = strlen(devname) + 1;
603 
604         ret = -EFAULT;
605         if (copy_to_user(optval, devname, len))
606                 goto out;
607 
608 zero:
609         ret = -EFAULT;
610         if (put_user(len, optlen))
611                 goto out;
612 
613         ret = 0;
614 
615 out:
616 #endif
617 
618         return ret;
619 }
620 
621 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
622 {
623         if (valbool)
624                 sock_set_flag(sk, bit);
625         else
626                 sock_reset_flag(sk, bit);
627 }
628 
629 /*
630  *      This is meant for all protocols to use and covers goings on
631  *      at the socket level. Everything here is generic.
632  */
633 
634 int sock_setsockopt(struct socket *sock, int level, int optname,
635                     char __user *optval, unsigned int optlen)
636 {
637         struct sock *sk = sock->sk;
638         int val;
639         int valbool;
640         struct linger ling;
641         int ret = 0;
642 
643         /*
644          *      Options without arguments
645          */
646 
647         if (optname == SO_BINDTODEVICE)
648                 return sock_setbindtodevice(sk, optval, optlen);
649 
650         if (optlen < sizeof(int))
651                 return -EINVAL;
652 
653         if (get_user(val, (int __user *)optval))
654                 return -EFAULT;
655 
656         valbool = val ? 1 : 0;
657 
658         lock_sock(sk);
659 
660         switch (optname) {
661         case SO_DEBUG:
662                 if (val && !capable(CAP_NET_ADMIN))
663                         ret = -EACCES;
664                 else
665                         sock_valbool_flag(sk, SOCK_DBG, valbool);
666                 break;
667         case SO_REUSEADDR:
668                 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
669                 break;
670         case SO_REUSEPORT:
671                 sk->sk_reuseport = valbool;
672                 break;
673         case SO_TYPE:
674         case SO_PROTOCOL:
675         case SO_DOMAIN:
676         case SO_ERROR:
677                 ret = -ENOPROTOOPT;
678                 break;
679         case SO_DONTROUTE:
680                 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
681                 break;
682         case SO_BROADCAST:
683                 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
684                 break;
685         case SO_SNDBUF:
 686                 /* Don't error on this. BSD doesn't, and if you think
 687                  * about it this is right. Otherwise apps have to
 688                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
 689                  * are treated in BSD as hints.
 690                  */
691                 val = min_t(u32, val, sysctl_wmem_max);
692 set_sndbuf:
693                 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
694                 sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF);
695                 /* Wake up sending tasks if we upped the value. */
696                 sk->sk_write_space(sk);
697                 break;
698 
699         case SO_SNDBUFFORCE:
700                 if (!capable(CAP_NET_ADMIN)) {
701                         ret = -EPERM;
702                         break;
703                 }
704                 goto set_sndbuf;
705 
706         case SO_RCVBUF:
 707                 /* Don't error on this. BSD doesn't, and if you think
 708                  * about it this is right. Otherwise apps have to
 709                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
 710                  * are treated in BSD as hints.
 711                  */
712                 val = min_t(u32, val, sysctl_rmem_max);
713 set_rcvbuf:
714                 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
715                 /*
716                  * We double it on the way in to account for
717                  * "struct sk_buff" etc. overhead.   Applications
718                  * assume that the SO_RCVBUF setting they make will
719                  * allow that much actual data to be received on that
720                  * socket.
721                  *
722                  * Applications are unaware that "struct sk_buff" and
723                  * other overheads allocate from the receive buffer
724                  * during socket buffer allocation.
725                  *
726                  * And after considering the possible alternatives,
727                  * returning the value we actually used in getsockopt
728                  * is the most desirable behavior.
729                  */
730                 sk->sk_rcvbuf = max_t(u32, val * 2, SOCK_MIN_RCVBUF);
731                 break;
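                     /* Illustrative effect of the doubling above: assuming
                      * 65536 is within sysctl_rmem_max, setsockopt(fd,
                      * SOL_SOCKET, SO_RCVBUF, &val, sizeof(val)) with
                      * val = 65536 leaves sk_rcvbuf at 131072, which is what
                      * a subsequent getsockopt(SO_RCVBUF) then reports.
                      */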
732 
733         case SO_RCVBUFFORCE:
734                 if (!capable(CAP_NET_ADMIN)) {
735                         ret = -EPERM;
736                         break;
737                 }
738                 goto set_rcvbuf;
739 
740         case SO_KEEPALIVE:
741 #ifdef CONFIG_INET
742                 if (sk->sk_protocol == IPPROTO_TCP &&
743                     sk->sk_type == SOCK_STREAM)
744                         tcp_set_keepalive(sk, valbool);
745 #endif
746                 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
747                 break;
748 
749         case SO_OOBINLINE:
750                 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
751                 break;
752 
753         case SO_NO_CHECK:
754                 sk->sk_no_check = valbool;
755                 break;
756 
757         case SO_PRIORITY:
758                 if ((val >= 0 && val <= 6) ||
759                     ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
760                         sk->sk_priority = val;
761                 else
762                         ret = -EPERM;
763                 break;
764 
765         case SO_LINGER:
766                 if (optlen < sizeof(ling)) {
767                         ret = -EINVAL;  /* 1003.1g */
768                         break;
769                 }
770                 if (copy_from_user(&ling, optval, sizeof(ling))) {
771                         ret = -EFAULT;
772                         break;
773                 }
774                 if (!ling.l_onoff)
775                         sock_reset_flag(sk, SOCK_LINGER);
776                 else {
777 #if (BITS_PER_LONG == 32)
778                         if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
779                                 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
780                         else
781 #endif
782                                 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
783                         sock_set_flag(sk, SOCK_LINGER);
784                 }
785                 break;
786 
787         case SO_BSDCOMPAT:
788                 sock_warn_obsolete_bsdism("setsockopt");
789                 break;
790 
791         case SO_PASSCRED:
792                 if (valbool)
793                         set_bit(SOCK_PASSCRED, &sock->flags);
794                 else
795                         clear_bit(SOCK_PASSCRED, &sock->flags);
796                 break;
797 
798         case SO_TIMESTAMP:
799         case SO_TIMESTAMPNS:
800                 if (valbool)  {
801                         if (optname == SO_TIMESTAMP)
802                                 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
803                         else
804                                 sock_set_flag(sk, SOCK_RCVTSTAMPNS);
805                         sock_set_flag(sk, SOCK_RCVTSTAMP);
806                         sock_enable_timestamp(sk, SOCK_TIMESTAMP);
807                 } else {
808                         sock_reset_flag(sk, SOCK_RCVTSTAMP);
809                         sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
810                 }
811                 break;
812 
813         case SO_TIMESTAMPING:
814                 if (val & ~SOF_TIMESTAMPING_MASK) {
815                         ret = -EINVAL;
816                         break;
817                 }
818                 sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
819                                   val & SOF_TIMESTAMPING_TX_HARDWARE);
820                 sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE,
821                                   val & SOF_TIMESTAMPING_TX_SOFTWARE);
822                 sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE,
823                                   val & SOF_TIMESTAMPING_RX_HARDWARE);
824                 if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
825                         sock_enable_timestamp(sk,
826                                               SOCK_TIMESTAMPING_RX_SOFTWARE);
827                 else
828                         sock_disable_timestamp(sk,
829                                                (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
830                 sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
831                                   val & SOF_TIMESTAMPING_SOFTWARE);
832                 sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
833                                   val & SOF_TIMESTAMPING_SYS_HARDWARE);
834                 sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE,
835                                   val & SOF_TIMESTAMPING_RAW_HARDWARE);
836                 break;
837 
838         case SO_RCVLOWAT:
839                 if (val < 0)
840                         val = INT_MAX;
841                 sk->sk_rcvlowat = val ? : 1;
842                 break;
843 
844         case SO_RCVTIMEO:
845                 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
846                 break;
847 
848         case SO_SNDTIMEO:
849                 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
850                 break;
851 
852         case SO_ATTACH_FILTER:
853                 ret = -EINVAL;
854                 if (optlen == sizeof(struct sock_fprog)) {
855                         struct sock_fprog fprog;
856 
857                         ret = -EFAULT;
858                         if (copy_from_user(&fprog, optval, sizeof(fprog)))
859                                 break;
860 
861                         ret = sk_attach_filter(&fprog, sk);
862                 }
863                 break;
864 
865         case SO_DETACH_FILTER:
866                 ret = sk_detach_filter(sk);
867                 break;
868 
869         case SO_LOCK_FILTER:
870                 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
871                         ret = -EPERM;
872                 else
873                         sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
874                 break;
875 
876         case SO_PASSSEC:
877                 if (valbool)
878                         set_bit(SOCK_PASSSEC, &sock->flags);
879                 else
880                         clear_bit(SOCK_PASSSEC, &sock->flags);
881                 break;
882         case SO_MARK:
883                 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
884                         ret = -EPERM;
885                 else
886                         sk->sk_mark = val;
887                 break;
888 
 889                 /* We implement SO_SNDLOWAT etc. so that they are
 890                    not settable (1003.1g 5.3) */
891         case SO_RXQ_OVFL:
892                 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
893                 break;
894 
895         case SO_WIFI_STATUS:
896                 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
897                 break;
898 
899         case SO_PEEK_OFF:
900                 if (sock->ops->set_peek_off)
901                         sock->ops->set_peek_off(sk, val);
902                 else
903                         ret = -EOPNOTSUPP;
904                 break;
905 
906         case SO_NOFCS:
907                 sock_valbool_flag(sk, SOCK_NOFCS, valbool);
908                 break;
909 
910         default:
911                 ret = -ENOPROTOOPT;
912                 break;
913         }
914         release_sock(sk);
915         return ret;
916 }
917 EXPORT_SYMBOL(sock_setsockopt);
918 
919 
920 void cred_to_ucred(struct pid *pid, const struct cred *cred,
921                    struct ucred *ucred)
922 {
923         ucred->pid = pid_vnr(pid);
924         ucred->uid = ucred->gid = -1;
925         if (cred) {
926                 struct user_namespace *current_ns = current_user_ns();
927 
928                 ucred->uid = from_kuid_munged(current_ns, cred->euid);
929                 ucred->gid = from_kgid_munged(current_ns, cred->egid);
930         }
931 }
932 EXPORT_SYMBOL_GPL(cred_to_ucred);
933 
934 int sock_getsockopt(struct socket *sock, int level, int optname,
935                     char __user *optval, int __user *optlen)
936 {
937         struct sock *sk = sock->sk;
938 
939         union {
940                 int val;
941                 struct linger ling;
942                 struct timeval tm;
943         } v;
944 
945         int lv = sizeof(int);
946         int len;
947 
948         if (get_user(len, optlen))
949                 return -EFAULT;
950         if (len < 0)
951                 return -EINVAL;
952 
953         memset(&v, 0, sizeof(v));
954 
955         switch (optname) {
956         case SO_DEBUG:
957                 v.val = sock_flag(sk, SOCK_DBG);
958                 break;
959 
960         case SO_DONTROUTE:
961                 v.val = sock_flag(sk, SOCK_LOCALROUTE);
962                 break;
963 
964         case SO_BROADCAST:
965                 v.val = sock_flag(sk, SOCK_BROADCAST);
966                 break;
967 
968         case SO_SNDBUF:
969                 v.val = sk->sk_sndbuf;
970                 break;
971 
972         case SO_RCVBUF:
973                 v.val = sk->sk_rcvbuf;
974                 break;
975 
976         case SO_REUSEADDR:
977                 v.val = sk->sk_reuse;
978                 break;
979 
980         case SO_REUSEPORT:
981                 v.val = sk->sk_reuseport;
982                 break;
983 
984         case SO_KEEPALIVE:
985                 v.val = sock_flag(sk, SOCK_KEEPOPEN);
986                 break;
987 
988         case SO_TYPE:
989                 v.val = sk->sk_type;
990                 break;
991 
992         case SO_PROTOCOL:
993                 v.val = sk->sk_protocol;
994                 break;
995 
996         case SO_DOMAIN:
997                 v.val = sk->sk_family;
998                 break;
999 
1000         case SO_ERROR:
1001                 v.val = -sock_error(sk);
1002                 if (v.val == 0)
1003                         v.val = xchg(&sk->sk_err_soft, 0);
1004                 break;
1005 
1006         case SO_OOBINLINE:
1007                 v.val = sock_flag(sk, SOCK_URGINLINE);
1008                 break;
1009 
1010         case SO_NO_CHECK:
1011                 v.val = sk->sk_no_check;
1012                 break;
1013 
1014         case SO_PRIORITY:
1015                 v.val = sk->sk_priority;
1016                 break;
1017 
1018         case SO_LINGER:
1019                 lv              = sizeof(v.ling);
1020                 v.ling.l_onoff  = sock_flag(sk, SOCK_LINGER);
1021                 v.ling.l_linger = sk->sk_lingertime / HZ;
1022                 break;
1023 
1024         case SO_BSDCOMPAT:
1025                 sock_warn_obsolete_bsdism("getsockopt");
1026                 break;
1027 
1028         case SO_TIMESTAMP:
1029                 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1030                                 !sock_flag(sk, SOCK_RCVTSTAMPNS);
1031                 break;
1032 
1033         case SO_TIMESTAMPNS:
1034                 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
1035                 break;
1036 
1037         case SO_TIMESTAMPING:
1038                 v.val = 0;
1039                 if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
1040                         v.val |= SOF_TIMESTAMPING_TX_HARDWARE;
1041                 if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
1042                         v.val |= SOF_TIMESTAMPING_TX_SOFTWARE;
1043                 if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE))
1044                         v.val |= SOF_TIMESTAMPING_RX_HARDWARE;
1045                 if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
1046                         v.val |= SOF_TIMESTAMPING_RX_SOFTWARE;
1047                 if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE))
1048                         v.val |= SOF_TIMESTAMPING_SOFTWARE;
1049                 if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))
1050                         v.val |= SOF_TIMESTAMPING_SYS_HARDWARE;
1051                 if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))
1052                         v.val |= SOF_TIMESTAMPING_RAW_HARDWARE;
1053                 break;
1054 
1055         case SO_RCVTIMEO:
1056                 lv = sizeof(struct timeval);
1057                 if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
1058                         v.tm.tv_sec = 0;
1059                         v.tm.tv_usec = 0;
1060                 } else {
1061                         v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1062                         v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
1063                 }
1064                 break;
1065 
1066         case SO_SNDTIMEO:
1067                 lv = sizeof(struct timeval);
1068                 if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1069                         v.tm.tv_sec = 0;
1070                         v.tm.tv_usec = 0;
1071                 } else {
1072                         v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1073                         v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
1074                 }
1075                 break;
1076 
1077         case SO_RCVLOWAT:
1078                 v.val = sk->sk_rcvlowat;
1079                 break;
1080 
1081         case SO_SNDLOWAT:
1082                 v.val = 1;
1083                 break;
1084 
1085         case SO_PASSCRED:
1086                 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1087                 break;
1088 
1089         case SO_PEERCRED:
1090         {
1091                 struct ucred peercred;
1092                 if (len > sizeof(peercred))
1093                         len = sizeof(peercred);
1094                 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1095                 if (copy_to_user(optval, &peercred, len))
1096                         return -EFAULT;
1097                 goto lenout;
1098         }
1099 
1100         case SO_PEERNAME:
1101         {
1102                 char address[128];
1103 
1104                 if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
1105                         return -ENOTCONN;
1106                 if (lv < len)
1107                         return -EINVAL;
1108                 if (copy_to_user(optval, address, len))
1109                         return -EFAULT;
1110                 goto lenout;
1111         }
1112 
1113         /* Dubious BSD thing... Probably nobody even uses it, but
1114          * the UNIX standard wants it for whatever reason... -DaveM
1115          */
1116         case SO_ACCEPTCONN:
1117                 v.val = sk->sk_state == TCP_LISTEN;
1118                 break;
1119 
1120         case SO_PASSSEC:
1121                 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1122                 break;
1123 
1124         case SO_PEERSEC:
1125                 return security_socket_getpeersec_stream(sock, optval, optlen, len);
1126 
1127         case SO_MARK:
1128                 v.val = sk->sk_mark;
1129                 break;
1130 
1131         case SO_RXQ_OVFL:
1132                 v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1133                 break;
1134 
1135         case SO_WIFI_STATUS:
1136                 v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1137                 break;
1138 
1139         case SO_PEEK_OFF:
1140                 if (!sock->ops->set_peek_off)
1141                         return -EOPNOTSUPP;
1142 
1143                 v.val = sk->sk_peek_off;
1144                 break;
1145         case SO_NOFCS:
1146                 v.val = sock_flag(sk, SOCK_NOFCS);
1147                 break;
1148 
1149         case SO_BINDTODEVICE:
1150                 return sock_getbindtodevice(sk, optval, optlen, len);
1151 
1152         case SO_GET_FILTER:
1153                 len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1154                 if (len < 0)
1155                         return len;
1156 
1157                 goto lenout;
1158 
1159         case SO_LOCK_FILTER:
1160                 v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1161                 break;
1162 
1163         default:
1164                 return -ENOPROTOOPT;
1165         }
1166 
1167         if (len > lv)
1168                 len = lv;
1169         if (copy_to_user(optval, &v, len))
1170                 return -EFAULT;
1171 lenout:
1172         if (put_user(len, optlen))
1173                 return -EFAULT;
1174         return 0;
1175 }
1176 
1177 /*
1178  * Initialize an sk_lock.
1179  *
1180  * (We also register the sk_lock with the lock validator.)
1181  */
1182 static inline void sock_lock_init(struct sock *sk)
1183 {
1184         sock_lock_init_class_and_name(sk,
1185                         af_family_slock_key_strings[sk->sk_family],
1186                         af_family_slock_keys + sk->sk_family,
1187                         af_family_key_strings[sk->sk_family],
1188                         af_family_keys + sk->sk_family);
1189 }
1190 
1191 /*
1192  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1193  * even temporarily, because of RCU lookups. sk_node should also be left as is.
1194  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end.
1195  */
1196 static void sock_copy(struct sock *nsk, const struct sock *osk)
1197 {
1198 #ifdef CONFIG_SECURITY_NETWORK
1199         void *sptr = nsk->sk_security;
1200 #endif
1201         memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1202 
1203         memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1204                osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1205 
1206 #ifdef CONFIG_SECURITY_NETWORK
1207         nsk->sk_security = sptr;
1208         security_sk_clone(osk, nsk);
1209 #endif
1210 }
1211 
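     /* Zero a sock object of 'size' bytes while preserving the two
      * list_nulls 'next' pointers (__sk_common.skc_node.next and
      * __sk_common.skc_portaddr_node.next), which lockless/RCU hash
      * lookups may still be traversing.
      */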
1212 void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
1213 {
1214         unsigned long nulls1, nulls2;
1215 
1216         nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
1217         nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
1218         if (nulls1 > nulls2)
1219                 swap(nulls1, nulls2);
1220 
1221         if (nulls1 != 0)
1222                 memset((char *)sk, 0, nulls1);
1223         memset((char *)sk + nulls1 + sizeof(void *), 0,
1224                nulls2 - nulls1 - sizeof(void *));
1225         memset((char *)sk + nulls2 + sizeof(void *), 0,
1226                size - nulls2 - sizeof(void *));
1227 }
1228 EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);
1229 
1230 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1231                 int family)
1232 {
1233         struct sock *sk;
1234         struct kmem_cache *slab;
1235 
1236         slab = prot->slab;
1237         if (slab != NULL) {
1238                 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1239                 if (!sk)
1240                         return sk;
1241                 if (priority & __GFP_ZERO) {
1242                         if (prot->clear_sk)
1243                                 prot->clear_sk(sk, prot->obj_size);
1244                         else
1245                                 sk_prot_clear_nulls(sk, prot->obj_size);
1246                 }
1247         } else
1248                 sk = kmalloc(prot->obj_size, priority);
1249 
1250         if (sk != NULL) {
1251                 kmemcheck_annotate_bitfield(sk, flags);
1252 
1253                 if (security_sk_alloc(sk, family, priority))
1254                         goto out_free;
1255 
1256                 if (!try_module_get(prot->owner))
1257                         goto out_free_sec;
1258                 sk_tx_queue_clear(sk);
1259         }
1260 
1261         return sk;
1262 
1263 out_free_sec:
1264         security_sk_free(sk);
1265 out_free:
1266         if (slab != NULL)
1267                 kmem_cache_free(slab, sk);
1268         else
1269                 kfree(sk);
1270         return NULL;
1271 }
1272 
1273 static void sk_prot_free(struct proto *prot, struct sock *sk)
1274 {
1275         struct kmem_cache *slab;
1276         struct module *owner;
1277 
1278         owner = prot->owner;
1279         slab = prot->slab;
1280 
1281         security_sk_free(sk);
1282         if (slab != NULL)
1283                 kmem_cache_free(slab, sk);
1284         else
1285                 kfree(sk);
1286         module_put(owner);
1287 }
1288 
1289 #ifdef CONFIG_CGROUPS
1290 #if IS_ENABLED(CONFIG_NET_CLS_CGROUP)
1291 void sock_update_classid(struct sock *sk, struct task_struct *task)
1292 {
1293         u32 classid;
1294 
1295         classid = task_cls_classid(task);
1296         if (classid != sk->sk_classid)
1297                 sk->sk_classid = classid;
1298 }
1299 EXPORT_SYMBOL(sock_update_classid);
1300 #endif
1301 
1302 #if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
1303 void sock_update_netprioidx(struct sock *sk, struct task_struct *task)
1304 {
1305         if (in_interrupt())
1306                 return;
1307 
1308         sk->sk_cgrp_prioidx = task_netprioidx(task);
1309 }
1310 EXPORT_SYMBOL_GPL(sock_update_netprioidx);
1311 #endif
1312 #endif
1313 
1314 /**
1315  *      sk_alloc - All socket objects are allocated here
1316  *      @net: the applicable net namespace
1317  *      @family: protocol family
1318  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1319  *      @prot: struct proto associated with this new sock instance
1320  */
1321 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1322                       struct proto *prot)
1323 {
1324         struct sock *sk;
1325 
1326         sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1327         if (sk) {
1328                 sk->sk_family = family;
1329                 /*
1330                  * See comment in struct sock definition to understand
1331                  * why we need sk_prot_creator -acme
1332                  */
1333                 sk->sk_prot = sk->sk_prot_creator = prot;
1334                 sock_lock_init(sk);
1335                 sock_net_set(sk, get_net(net));
1336                 atomic_set(&sk->sk_wmem_alloc, 1);
1337 
1338                 sock_update_classid(sk, current);
1339                 sock_update_netprioidx(sk, current);
1340         }
1341 
1342         return sk;
1343 }
1344 EXPORT_SYMBOL(sk_alloc);
1345 
1346 static void __sk_free(struct sock *sk)
1347 {
1348         struct sk_filter *filter;
1349 
1350         if (sk->sk_destruct)
1351                 sk->sk_destruct(sk);
1352 
1353         filter = rcu_dereference_check(sk->sk_filter,
1354                                        atomic_read(&sk->sk_wmem_alloc) == 0);
1355         if (filter) {
1356                 sk_filter_uncharge(sk, filter);
1357                 RCU_INIT_POINTER(sk->sk_filter, NULL);
1358         }
1359 
1360         sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1361 
1362         if (atomic_read(&sk->sk_omem_alloc))
1363                 pr_debug("%s: optmem leakage (%d bytes) detected\n",
1364                          __func__, atomic_read(&sk->sk_omem_alloc));
1365 
1366         if (sk->sk_peer_cred)
1367                 put_cred(sk->sk_peer_cred);
1368         put_pid(sk->sk_peer_pid);
1369         put_net(sock_net(sk));
1370         sk_prot_free(sk->sk_prot_creator, sk);
1371 }
1372 
1373 void sk_free(struct sock *sk)
1374 {
1375         /*
1376          * We subtract one from sk_wmem_alloc so we can tell whether
1377          * some packets are still in some tx queue.
1378          * If it is not zero, sock_wfree() will call __sk_free(sk) later.
1379          */
1380         if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1381                 __sk_free(sk);
1382 }
1383 EXPORT_SYMBOL(sk_free);
1384 
1385 /*
1386  * The last sock_put should drop the reference to sk->sk_net. It has
1387  * already been dropped in sk_change_net. Taking a reference to the
1388  * stopping namespace is not an option.
1389  * Take a reference to the socket to remove it from the hash while it is
1390  * still _alive_, and after that destroy it in the context of init_net.
1391  */
1392 void sk_release_kernel(struct sock *sk)
1393 {
1394         if (sk == NULL || sk->sk_socket == NULL)
1395                 return;
1396 
1397         sock_hold(sk);
1398         sock_release(sk->sk_socket);
1399         release_net(sock_net(sk));
1400         sock_net_set(sk, get_net(&init_net));
1401         sock_put(sk);
1402 }
1403 EXPORT_SYMBOL(sk_release_kernel);
1404 
1405 static void sk_update_clone(const struct sock *sk, struct sock *newsk)
1406 {
1407         if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
1408                 sock_update_memcg(newsk);
1409 }
1410 
1411 /**
1412  *      sk_clone_lock - clone a socket, and lock its clone
1413  *      @sk: the socket to clone
1414  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1415  *
1416  *      Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1417  */
1418 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1419 {
1420         struct sock *newsk;
1421 
1422         newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1423         if (newsk != NULL) {
1424                 struct sk_filter *filter;
1425 
1426                 sock_copy(newsk, sk);
1427 
1428                 /* SANITY */
1429                 get_net(sock_net(newsk));
1430                 sk_node_init(&newsk->sk_node);
1431                 sock_lock_init(newsk);
1432                 bh_lock_sock(newsk);
1433                 newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
1434                 newsk->sk_backlog.len = 0;
1435 
1436                 atomic_set(&newsk->sk_rmem_alloc, 0);
1437                 /*
1438                  * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1439                  */
1440                 atomic_set(&newsk->sk_wmem_alloc, 1);
1441                 atomic_set(&newsk->sk_omem_alloc, 0);
1442                 skb_queue_head_init(&newsk->sk_receive_queue);
1443                 skb_queue_head_init(&newsk->sk_write_queue);
1444 #ifdef CONFIG_NET_DMA
1445                 skb_queue_head_init(&newsk->sk_async_wait_queue);
1446 #endif
1447 
1448                 spin_lock_init(&newsk->sk_dst_lock);
1449                 rwlock_init(&newsk->sk_callback_lock);
1450                 lockdep_set_class_and_name(&newsk->sk_callback_lock,
1451                                 af_callback_keys + newsk->sk_family,
1452                                 af_family_clock_key_strings[newsk->sk_family]);
1453 
1454                 newsk->sk_dst_cache     = NULL;
1455                 newsk->sk_wmem_queued   = 0;
1456                 newsk->sk_forward_alloc = 0;
1457                 newsk->sk_send_head     = NULL;
1458                 newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1459 
1460                 sock_reset_flag(newsk, SOCK_DONE);
1461                 skb_queue_head_init(&newsk->sk_error_queue);
1462 
1463                 filter = rcu_dereference_protected(newsk->sk_filter, 1);
1464                 if (filter != NULL)
1465                         sk_filter_charge(newsk, filter);
1466 
1467                 if (unlikely(xfrm_sk_clone_policy(newsk))) {
1468                         /* It is still a raw copy of the parent, so
1469                          * invalidate the destructor and do a plain sk_free() */
1470                         newsk->sk_destruct = NULL;
1471                         bh_unlock_sock(newsk);
1472                         sk_free(newsk);
1473                         newsk = NULL;
1474                         goto out;
1475                 }
1476 
1477                 newsk->sk_err      = 0;
1478                 newsk->sk_priority = 0;
1479                 /*
1480                  * Before updating sk_refcnt, we must commit prior changes to memory
1481                  * (Documentation/RCU/rculist_nulls.txt for details)
1482                  */
1483                 smp_wmb();
1484                 atomic_set(&newsk->sk_refcnt, 2);
1485 
1486                 /*
1487                  * Increment the counter in the same struct proto as the master
1488                  * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1489                  * is the same as sk->sk_prot->socks, as this field was copied
1490                  * with memcpy).
1491                  *
1492                  * This _changes_ the previous behaviour, where
1493                  * tcp_create_openreq_child was always incrementing the
1494                  * equivalent of tcp_prot->socks (inet_sock_nr), so this has
1495                  * to be taken into account in all callers. -acme
1496                  */
1497                 sk_refcnt_debug_inc(newsk);
1498                 sk_set_socket(newsk, NULL);
1499                 newsk->sk_wq = NULL;
1500 
1501                 sk_update_clone(sk, newsk);
1502 
1503                 if (newsk->sk_prot->sockets_allocated)
1504                         sk_sockets_allocated_inc(newsk);
1505 
1506                 if (newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1507                         net_enable_timestamp();
1508         }
1509 out:
1510         return newsk;
1511 }
1512 EXPORT_SYMBOL_GPL(sk_clone_lock);
1513 
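     /* (editor's note) Install the new dst and derive sk_route_caps from the
      * output device's features; GSO-capable routes also get SG and HW_CSUM plus
      * the device's gso_max_size/gso_max_segs, unless the dst carries extra
      * header_len, in which case GSO is masked off.
      */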
1514 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1515 {
1516         __sk_dst_set(sk, dst);
1517         sk->sk_route_caps = dst->dev->features;
1518         if (sk->sk_route_caps & NETIF_F_GSO)
1519                 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1520         sk->sk_route_caps &= ~sk->sk_route_nocaps;
1521         if (sk_can_gso(sk)) {
1522                 if (dst->header_len) {
1523                         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1524                 } else {
1525                         sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1526                         sk->sk_gso_max_size = dst->dev->gso_max_size;
1527                         sk->sk_gso_max_segs = dst->dev->gso_max_segs;
1528                 }
1529         }
1530 }
1531 EXPORT_SYMBOL_GPL(sk_setup_caps);
1532 
1533 /*
1534  *      Simple resource managers for sockets.
1535  */
1536 
1537 
1538 /*
1539  * Write buffer destructor automatically called from kfree_skb.
1540  */
1541 void sock_wfree(struct sk_buff *skb)
1542 {
1543         struct sock *sk = skb->sk;
1544         unsigned int len = skb->truesize;
1545 
1546         if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1547                 /*
1548                  * Keep a reference on sk_wmem_alloc; it will be released
1549                  * after the sk_write_space() call.
1550                  */
1551                 atomic_sub(len - 1, &sk->sk_wmem_alloc);
1552                 sk->sk_write_space(sk);
1553                 len = 1;
1554         }
1555         /*
1556          * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1557          * could not do because of in-flight packets
1558          */
1559         if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1560                 __sk_free(sk);
1561 }
1562 EXPORT_SYMBOL(sock_wfree);
1563 
1564 /*
1565  * Read buffer destructor automatically called from kfree_skb.
1566  */
1567 void sock_rfree(struct sk_buff *skb)
1568 {
1569         struct sock *sk = skb->sk;
1570         unsigned int len = skb->truesize;
1571 
1572         atomic_sub(len, &sk->sk_rmem_alloc);
1573         sk_mem_uncharge(sk, len);
1574 }
1575 EXPORT_SYMBOL(sock_rfree);
1576 
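     /* (editor's note) skb destructor used by early demux: drop the socket
      * reference it took; timewait sockets use their own refcounting, hence
      * inet_twsk_put().
      */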
1577 void sock_edemux(struct sk_buff *skb)
1578 {
1579         struct sock *sk = skb->sk;
1580 
1581 #ifdef CONFIG_INET
1582         if (sk->sk_state == TCP_TIME_WAIT)
1583                 inet_twsk_put(inet_twsk(sk));
1584         else
1585 #endif
1586                 sock_put(sk);
1587 }
1588 EXPORT_SYMBOL(sock_edemux);
1589 
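     /* (editor's note) Report the uid of the inode backing sk_socket, or
      * GLOBAL_ROOT_UID for an orphaned socket, under sk_callback_lock.
      */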
1590 kuid_t sock_i_uid(struct sock *sk)
1591 {
1592         kuid_t uid;
1593 
1594         read_lock_bh(&sk->sk_callback_lock);
1595         uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1596         read_unlock_bh(&sk->sk_callback_lock);
1597         return uid;
1598 }
1599 EXPORT_SYMBOL(sock_i_uid);
1600 
1601 unsigned long sock_i_ino(struct sock *sk)
1602 {
1603         unsigned long ino;
1604 
1605         read_lock_bh(&sk->sk_callback_lock);
1606         ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1607         read_unlock_bh(&sk->sk_callback_lock);
1608         return ino;
1609 }
1610 EXPORT_SYMBOL(sock_i_ino);
1611 
1612 /*
1613  * Allocate a skb from the socket's send buffer.
1614  */
1615 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1616                              gfp_t priority)
1617 {
1618         if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1619                 struct sk_buff *skb = alloc_skb(size, priority);
1620                 if (skb) {
1621                         skb_set_owner_w(skb, sk);
1622                         return skb;
1623                 }
1624         }
1625         return NULL;
1626 }
1627 EXPORT_SYMBOL(sock_wmalloc);
1628 
1629 /*
1630  * Allocate a skb from the socket's receive buffer.
1631  */
1632 struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
1633                              gfp_t priority)
1634 {
1635         if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1636                 struct sk_buff *skb = alloc_skb(size, priority);
1637                 if (skb) {
1638                         skb_set_owner_r(skb, sk);
1639                         return skb;
1640                 }
1641         }
1642         return NULL;
1643 }
1644 
1645 /*
1646  * Allocate a memory block from the socket's option memory buffer.
1647  */
1648 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1649 {
1650         if ((unsigned int)size <= sysctl_optmem_max &&
1651             atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1652                 void *mem;
1653                 /* First do the add, to avoid the race if kmalloc
1654                  * might sleep.
1655                  */
1656                 atomic_add(size, &sk->sk_omem_alloc);
1657                 mem = kmalloc(size, priority);
1658                 if (mem)
1659                         return mem;
1660                 atomic_sub(size, &sk->sk_omem_alloc);
1661         }
1662         return NULL;
1663 }
1664 EXPORT_SYMBOL(sock_kmalloc);
1665 
1666 /*
1667  * Free an option memory block.
1668  */
1669 void sock_kfree_s(struct sock *sk, void *mem, int size)
1670 {
1671         kfree(mem);
1672         atomic_sub(size, &sk->sk_omem_alloc);
1673 }
1674 EXPORT_SYMBOL(sock_kfree_s);
1675 
1676 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1677    I think these locks should be removed for datagram sockets.
1678  */
1679 static long sock_wait_for_wmem(struct sock *sk, long timeo)
1680 {
1681         DEFINE_WAIT(wait);
1682 
1683         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1684         for (;;) {
1685                 if (!timeo)
1686                         break;
1687                 if (signal_pending(current))
1688                         break;
1689                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1690                 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1691                 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1692                         break;
1693                 if (sk->sk_shutdown & SEND_SHUTDOWN)
1694                         break;
1695                 if (sk->sk_err)
1696                         break;
1697                 timeo = schedule_timeout(timeo);
1698         }
1699         finish_wait(sk_sleep(sk), &wait);
1700         return timeo;
1701 }
1702 
1703 
1704 /*
1705  *      Generic send/receive buffer handlers
1706  */
1707 
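     /* (editor's note) Allocate an skb with header_len bytes of linear space and
      * data_len bytes spread over page fragments, charged to the socket's send
      * buffer; unless noblock is set, wait (up to the send timeout) for
      * sk_wmem_alloc to drop below sk_sndbuf.
      */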
1708 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1709                                      unsigned long data_len, int noblock,
1710                                      int *errcode)
1711 {
1712         struct sk_buff *skb;
1713         gfp_t gfp_mask;
1714         long timeo;
1715         int err;
1716         int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1717 
1718         err = -EMSGSIZE;
1719         if (npages > MAX_SKB_FRAGS)
1720                 goto failure;
1721 
1722         gfp_mask = sk->sk_allocation;
1723         if (gfp_mask & __GFP_WAIT)
1724                 gfp_mask |= __GFP_REPEAT;
1725 
1726         timeo = sock_sndtimeo(sk, noblock);
1727         while (1) {
1728                 err = sock_error(sk);
1729                 if (err != 0)
1730                         goto failure;
1731 
1732                 err = -EPIPE;
1733                 if (sk->sk_shutdown & SEND_SHUTDOWN)
1734                         goto failure;
1735 
1736                 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1737                         skb = alloc_skb(header_len, gfp_mask);
1738                         if (skb) {
1739                                 int i;
1740 
1741                                 /* No pages, we're done... */
1742                                 if (!data_len)
1743                                         break;
1744 
1745                                 skb->truesize += data_len;
1746                                 skb_shinfo(skb)->nr_frags = npages;
1747                                 for (i = 0; i < npages; i++) {
1748                                         struct page *page;
1749 
1750                                         page = alloc_pages(sk->sk_allocation, 0);
1751                                         if (!page) {
1752                                                 err = -ENOBUFS;
1753                                                 skb_shinfo(skb)->nr_frags = i;
1754                                                 kfree_skb(skb);
1755                                                 goto failure;
1756                                         }
1757 
1758                                         __skb_fill_page_desc(skb, i,
1759                                                         page, 0,
1760                                                         (data_len >= PAGE_SIZE ?
1761                                                          PAGE_SIZE :
1762                                                          data_len));
1763                                         data_len -= PAGE_SIZE;
1764                                 }
1765 
1766                                 /* Full success... */
1767                                 break;
1768                         }
1769                         err = -ENOBUFS;
1770                         goto failure;
1771                 }
1772                 set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1773                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1774                 err = -EAGAIN;
1775                 if (!timeo)
1776                         goto failure;
1777                 if (signal_pending(current))
1778                         goto interrupted;
1779                 timeo = sock_wait_for_wmem(sk, timeo);
1780         }
1781 
1782         skb_set_owner_w(skb, sk);
1783         return skb;
1784 
1785 interrupted:
1786         err = sock_intr_errno(timeo);
1787 failure:
1788         *errcode = err;
1789         return NULL;
1790 }
1791 EXPORT_SYMBOL(sock_alloc_send_pskb);
1792 
1793 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1794                                     int noblock, int *errcode)
1795 {
1796         return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
1797 }
1798 EXPORT_SYMBOL(sock_alloc_send_skb);
1799 
1800 /* On 32bit arches, an skb frag is limited to 2^15 */
1801 #define SKB_FRAG_PAGE_ORDER     get_order(32768)
1802 
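     /* (editor's note) Make sure pfrag points at a page with free space: reuse
      * the current page when we hold the only reference or room remains,
      * otherwise allocate a fresh (possibly high-order) page, entering memory
      * pressure on failure.
      */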
1803 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
1804 {
1805         int order;
1806 
1807         if (pfrag->page) {
1808                 if (atomic_read(&pfrag->page->_count) == 1) {
1809                         pfrag->offset = 0;
1810                         return true;
1811                 }
1812                 if (pfrag->offset < pfrag->size)
1813                         return true;
1814                 put_page(pfrag->page);
1815         }
1816 
1817         /* We restrict high order allocations to users that can afford to wait */
1818         order = (sk->sk_allocation & __GFP_WAIT) ? SKB_FRAG_PAGE_ORDER : 0;
1819 
1820         do {
1821                 gfp_t gfp = sk->sk_allocation;
1822 
1823                 if (order)
1824                         gfp |= __GFP_COMP | __GFP_NOWARN;
1825                 pfrag->page = alloc_pages(gfp, order);
1826                 if (likely(pfrag->page)) {
1827                         pfrag->offset = 0;
1828                         pfrag->size = PAGE_SIZE << order;
1829                         return true;
1830                 }
1831         } while (--order >= 0);
1832 
1833         sk_enter_memory_pressure(sk);
1834         sk_stream_moderate_sndbuf(sk);
1835         return false;
1836 }
1837 EXPORT_SYMBOL(sk_page_frag_refill);
1838 
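     /* (editor's note) Slow path of lock_sock(): sleep until the current owner
      * releases the socket, re-taking sk_lock.slock around each wait.
      */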
1839 static void __lock_sock(struct sock *sk)
1840         __releases(&sk->sk_lock.slock)
1841         __acquires(&sk->sk_lock.slock)
1842 {
1843         DEFINE_WAIT(wait);
1844 
1845         for (;;) {
1846                 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1847                                         TASK_UNINTERRUPTIBLE);
1848                 spin_unlock_bh(&sk->sk_lock.slock);
1849                 schedule();
1850                 spin_lock_bh(&sk->sk_lock.slock);
1851                 if (!sock_owned_by_user(sk))
1852                         break;
1853         }
1854         finish_wait(&sk->sk_lock.wq, &wait);
1855 }
1856 
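     /* (editor's note) Process every packet queued on the backlog while the
      * socket was owned by the user, dropping the spinlock around each
      * sk_backlog_rcv() call.
      */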
1857 static void __release_sock(struct sock *sk)
1858         __releases(&sk->sk_lock.slock)
1859         __acquires(&sk->sk_lock.slock)
1860 {
1861         struct sk_buff *skb = sk->sk_backlog.head;
1862 
1863         do {
1864                 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1865                 bh_unlock_sock(sk);
1866 
1867                 do {
1868                         struct sk_buff *next = skb->next;
1869 
1870                         prefetch(next);
1871                         WARN_ON_ONCE(skb_dst_is_noref(skb));
1872                         skb->next = NULL;
1873                         sk_backlog_rcv(sk, skb);
1874 
1875                         /*
1876                          * We are in process context here with softirqs
1877                          * disabled, use cond_resched_softirq() to preempt.
1878                          * This is safe to do because we've taken the backlog
1879                          * queue private:
1880                          */
1881                         cond_resched_softirq();
1882 
1883                         skb = next;
1884                 } while (skb != NULL);
1885 
1886                 bh_lock_sock(sk);
1887         } while ((skb = sk->sk_backlog.head) != NULL);
1888 
1889         /*
1890          * Doing the zeroing here guarantees we cannot loop forever
1891          * while a wild producer attempts to flood us.
1892          */
1893         sk->sk_backlog.len = 0;
1894 }
1895 
1896 /**
1897  * sk_wait_data - wait for data to arrive at sk_receive_queue
1898  * @sk:    sock to wait on
1899  * @timeo: for how long
1900  *
1901  * Now the socket state, including sk->sk_err, is changed only under the lock;
1902  * hence we may omit checks after joining the wait queue.
1903  * We check the receive queue before schedule() only as an optimization;
1904  * it is very likely that release_sock() added new data.
1905  */
1906 int sk_wait_data(struct sock *sk, long *timeo)
1907 {
1908         int rc;
1909         DEFINE_WAIT(wait);
1910 
1911         prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1912         set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1913         rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1914         clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1915         finish_wait(sk_sleep(sk), &wait);
1916         return rc;
1917 }
1918 EXPORT_SYMBOL(sk_wait_data);
1919 
1920 /**
1921  *      __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
1922  *      @sk: socket
1923  *      @size: memory size to allocate
1924  *      @kind: allocation type
1925  *
1926  *      If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
1927  *      rmem allocation. This function assumes that protocols which have
1928  *      memory_pressure use sk_wmem_queued as write buffer accounting.
1929  */
1930 int __sk_mem_schedule(struct sock *sk, int size, int kind)
1931 {
1932         struct proto *prot = sk->sk_prot;
1933         int amt = sk_mem_pages(size);
1934         long allocated;
1935         int parent_status = UNDER_LIMIT;
1936 
1937         sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
1938 
1939         allocated = sk_memory_allocated_add(sk, amt, &parent_status);
1940 
1941         /* Under limit. */
1942         if (parent_status == UNDER_LIMIT &&
1943                         allocated <= sk_prot_mem_limits(sk, 0)) {
1944                 sk_leave_memory_pressure(sk);
1945                 return 1;
1946         }
1947 
1948         /* Under pressure. (we or our parents) */
1949         if ((parent_status > SOFT_LIMIT) ||
1950                         allocated > sk_prot_mem_limits(sk, 1))
1951                 sk_enter_memory_pressure(sk);
1952 
1953         /* Over hard limit (we or our parents) */
1954         if ((parent_status == OVER_LIMIT) ||
1955                         (allocated > sk_prot_mem_limits(sk, 2)))
1956                 goto suppress_allocation;
1957 
1958         /* guarantee minimum buffer size under pressure */
1959         if (kind == SK_MEM_RECV) {
1960                 if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
1961                         return 1;
1962 
1963         } else { /* SK_MEM_SEND */
1964                 if (sk->sk_type == SOCK_STREAM) {
1965                         if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
1966                                 return 1;
1967                 } else if (atomic_read(&sk->sk_wmem_alloc) <
1968                            prot->sysctl_wmem[0])
1969                                 return 1;
1970         }
1971 
1972         if (sk_has_memory_pressure(sk)) {
1973                 int alloc;
1974 
1975                 if (!sk_under_memory_pressure(sk))
1976                         return 1;
1977                 alloc = sk_sockets_allocated_read_positive(sk);
1978                 if (sk_prot_mem_limits(sk, 2) > alloc *
1979                     sk_mem_pages(sk->sk_wmem_queued +
1980                                  atomic_read(&sk->sk_rmem_alloc) +
1981                                  sk->sk_forward_alloc))
1982                         return 1;
1983         }
1984 
1985 suppress_allocation:
1986 
1987         if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
1988                 sk_stream_moderate_sndbuf(sk);
1989 
1990                 /* Fail only if socket is _under_ its sndbuf.
1991                  * In this case we cannot block, so we have to fail.
1992                  */
1993                 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
1994                         return 1;
1995         }
1996 
1997         trace_sock_exceed_buf_limit(sk, prot, allocated);
1998 
1999         /* Alas. Undo changes. */
2000         sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
2001 
2002         sk_memory_allocated_sub(sk, amt);
2003 
2004         return 0;
2005 }
2006 EXPORT_SYMBOL(__sk_mem_schedule);
2007 
2008 /**
2009  *      __sk_mem_reclaim - reclaim memory_allocated
2010  *      @sk: socket
2011  */
2012 void __sk_mem_reclaim(struct sock *sk)
2013 {
2014         sk_memory_allocated_sub(sk,
2015                                 sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT);
2016         sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
2017 
2018         if (sk_under_memory_pressure(sk) &&
2019             (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2020                 sk_leave_memory_pressure(sk);
2021 }
2022 EXPORT_SYMBOL(__sk_mem_reclaim);
2023 
2024 
2025 /*
2026  * Set of default routines for initialising struct proto_ops when
2027  * the protocol does not support a particular function. In certain
2028  * cases where it makes no sense for a protocol to have a "do nothing"
2029  * function, some default processing is provided.
2030  */
2031 
2032 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2033 {
2034         return -EOPNOTSUPP;
2035 }
2036 EXPORT_SYMBOL(sock_no_bind);
2037 
2038 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2039                     int len, int flags)
2040 {
2041         return -EOPNOTSUPP;
2042 }
2043 EXPORT_SYMBOL(sock_no_connect);
2044 
2045 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2046 {
2047         return -EOPNOTSUPP;
2048 }
2049 EXPORT_SYMBOL(sock_no_socketpair);
2050 
2051 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
2052 {
2053         return -EOPNOTSUPP;
2054 }
2055 EXPORT_SYMBOL(sock_no_accept);
2056 
2057 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2058                     int *len, int peer)
2059 {
2060         return -EOPNOTSUPP;
2061 }
2062 EXPORT_SYMBOL(sock_no_getname);
2063 
2064 unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
2065 {
2066         return 0;
2067 }
2068 EXPORT_SYMBOL(sock_no_poll);
2069 
2070 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2071 {
2072         return -EOPNOTSUPP;
2073 }
2074 EXPORT_SYMBOL(sock_no_ioctl);
2075 
2076 int sock_no_listen(struct socket *sock, int backlog)
2077 {
2078         return -EOPNOTSUPP;
2079 }
2080 EXPORT_SYMBOL(sock_no_listen);
2081 
2082 int sock_no_shutdown(struct socket *sock, int how)
2083 {
2084         return -EOPNOTSUPP;
2085 }
2086 EXPORT_SYMBOL(sock_no_shutdown);
2087 
2088 int sock_no_setsockopt(struct socket *sock, int level, int optname,
2089                     char __user *optval, unsigned int optlen)
2090 {
2091         return -EOPNOTSUPP;
2092 }
2093 EXPORT_SYMBOL(sock_no_setsockopt);
2094 
2095 int sock_no_getsockopt(struct socket *sock, int level, int optname,
2096                     char __user *optval, int __user *optlen)
2097 {
2098         return -EOPNOTSUPP;
2099 }
2100 EXPORT_SYMBOL(sock_no_getsockopt);
2101 
2102 int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
2103                     size_t len)
2104 {
2105         return -EOPNOTSUPP;
2106 }
2107 EXPORT_SYMBOL(sock_no_sendmsg);
2108 
2109 int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
2110                     size_t len, int flags)
2111 {
2112         return -EOPNOTSUPP;
2113 }
2114 EXPORT_SYMBOL(sock_no_recvmsg);
2115 
2116 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2117 {
2118         /* Mirror missing mmap method error code */
2119         return -ENODEV;
2120 }
2121 EXPORT_SYMBOL(sock_no_mmap);
2122 
2123 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2124 {
2125         ssize_t res;
2126         struct msghdr msg = {.msg_flags = flags};
2127         struct kvec iov;
2128         char *kaddr = kmap(page);
2129         iov.iov_base = kaddr + offset;
2130         iov.iov_len = size;
2131         res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2132         kunmap(page);
2133         return res;
2134 }
2135 EXPORT_SYMBOL(sock_no_sendpage);
2136 
2137 /*
2138  *      Default Socket Callbacks
2139  */
2140 
2141 static void sock_def_wakeup(struct sock *sk)
2142 {
2143         struct socket_wq *wq;
2144 
2145         rcu_read_lock();
2146         wq = rcu_dereference(sk->sk_wq);
2147         if (wq_has_sleeper(wq))
2148                 wake_up_interruptible_all(&wq->wait);
2149         rcu_read_unlock();
2150 }
2151 
2152 static void sock_def_error_report(struct sock *sk)
2153 {
2154         struct socket_wq *wq;
2155 
2156         rcu_read_lock();
2157         wq = rcu_dereference(sk->sk_wq);
2158         if (wq_has_sleeper(wq))
2159                 wake_up_interruptible_poll(&wq->wait, POLLERR);
2160         sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2161         rcu_read_unlock();
2162 }
2163 
2164 static void sock_def_readable(struct sock *sk, int len)
2165 {
2166         struct socket_wq *wq;
2167 
2168         rcu_read_lock();
2169         wq = rcu_dereference(sk->sk_wq);
2170         if (wq_has_sleeper(wq))
2171                 wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
2172                                                 POLLRDNORM | POLLRDBAND);
2173         sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2174         rcu_read_unlock();
2175 }
2176 
2177 static void sock_def_write_space(struct sock *sk)
2178 {
2179         struct socket_wq *wq;
2180 
2181         rcu_read_lock();
2182 
2183         /* Do not wake up a writer until he can make "significant"
2184          * progress.  --DaveM
2185          */
2186         if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2187                 wq = rcu_dereference(sk->sk_wq);
2188                 if (wq_has_sleeper(wq))
2189                         wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
2190                                                 POLLWRNORM | POLLWRBAND);
2191 
2192                 /* Should agree with poll, otherwise some programs break */
2193                 if (sock_writeable(sk))
2194                         sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2195         }
2196 
2197         rcu_read_unlock();
2198 }
2199 
2200 static void sock_def_destruct(struct sock *sk)
2201 {
2202         kfree(sk->sk_protinfo);
2203 }
2204 
2205 void sk_send_sigurg(struct sock *sk)
2206 {
2207         if (sk->sk_socket && sk->sk_socket->file)
2208                 if (send_sigurg(&sk->sk_socket->file->f_owner))
2209                         sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2210 }
2211 EXPORT_SYMBOL(sk_send_sigurg);
2212 
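     /* (editor's note) Arm (or re-arm) a socket timer; a socket reference is
      * taken only when the timer was not already pending, and sk_stop_timer()
      * below drops it again when a pending timer is cancelled.
      */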
2213 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2214                     unsigned long expires)
2215 {
2216         if (!mod_timer(timer, expires))
2217                 sock_hold(sk);
2218 }
2219 EXPORT_SYMBOL(sk_reset_timer);
2220 
2221 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2222 {
2223         if (del_timer(timer))
2224                 __sock_put(sk);
2225 }
2226 EXPORT_SYMBOL(sk_stop_timer);
2227 
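     /* (editor's note) Initialise the generic fields of a freshly allocated
      * struct sock: queues, locks, default callbacks, buffer limits and
      * timeouts.
      */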
2228 void sock_init_data(struct socket *sock, struct sock *sk)
2229 {
2230         skb_queue_head_init(&sk->sk_receive_queue);
2231         skb_queue_head_init(&sk->sk_write_queue);
2232         skb_queue_head_init(&sk->sk_error_queue);
2233 #ifdef CONFIG_NET_DMA
2234         skb_queue_head_init(&sk->sk_async_wait_queue);
2235 #endif
2236 
2237         sk->sk_send_head        =       NULL;
2238 
2239         init_timer(&sk->sk_timer);
2240 
2241         sk->sk_allocation       =       GFP_KERNEL;
2242         sk->sk_rcvbuf           =       sysctl_rmem_default;
2243         sk->sk_sndbuf           =       sysctl_wmem_default;
2244         sk->sk_state            =       TCP_CLOSE;
2245         sk_set_socket(sk, sock);
2246 
2247         sock_set_flag(sk, SOCK_ZAPPED);
2248 
2249         if (sock) {
2250                 sk->sk_type     =       sock->type;
2251                 sk->sk_wq       =       sock->wq;
2252                 sock->sk        =       sk;
2253         } else
2254                 sk->sk_wq       =       NULL;
2255 
2256         spin_lock_init(&sk->sk_dst_lock);
2257         rwlock_init(&sk->sk_callback_lock);
2258         lockdep_set_class_and_name(&sk->sk_callback_lock,
2259                         af_callback_keys + sk->sk_family,
2260                         af_family_clock_key_strings[sk->sk_family]);
2261 
2262         sk->sk_state_change     =       sock_def_wakeup;
2263         sk->sk_data_ready       =       sock_def_readable;
2264         sk->sk_write_space      =       sock_def_write_space;
2265         sk->sk_error_report     =       sock_def_error_report;
2266         sk->sk_destruct         =       sock_def_destruct;
2267 
2268         sk->sk_frag.page        =       NULL;
2269         sk->sk_frag.offset      =       0;
2270         sk->sk_peek_off         =       -1;
2271 
2272         sk->sk_peer_pid         =       NULL;
2273         sk->sk_peer_cred        =       NULL;
2274         sk->sk_write_pending    =       0;
2275         sk->sk_rcvlowat         =       1;
2276         sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
2277         sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;
2278 
2279         sk->sk_stamp = ktime_set(-1L, 0);
2280 
2281         /*
2282          * Before updating sk_refcnt, we must commit prior changes to memory
2283          * (Documentation/RCU/rculist_nulls.txt for details)
2284          */
2285         smp_wmb();
2286         atomic_set(&sk->sk_refcnt, 1);
2287         atomic_set(&sk->sk_drops, 0);
2288 }
2289 EXPORT_SYMBOL(sock_init_data);
2290 
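     /* (editor's note) Process-context socket lock: take the spinlock (which
      * disables BHs), wait for any current owner via __lock_sock(), mark
      * ourselves as owner, then drop the spinlock and re-enable BHs.
      */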
2291 void lock_sock_nested(struct sock *sk, int subclass)
2292 {
2293         might_sleep();
2294         spin_lock_bh(&sk->sk_lock.slock);
2295         if (sk->sk_lock.owned)
2296                 __lock_sock(sk);
2297         sk->sk_lock.owned = 1;
2298         spin_unlock(&sk->sk_lock.slock);
2299         /*
2300          * The sk_lock has mutex_lock() semantics here:
2301          */
2302         mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2303         local_bh_enable();
2304 }
2305 EXPORT_SYMBOL(lock_sock_nested);
2306 
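     /* (editor's note) Counterpart of lock_sock(): flush the backlog queue, let
      * the protocol run deferred work via release_cb, then clear ownership and
      * wake up any waiters on sk_lock.wq.
      */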
2307 void release_sock(struct sock *sk)
2308 {
2309         /*
2310          * The sk_lock has mutex_unlock() semantics:
2311          */
2312         mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
2313 
2314         spin_lock_bh(&sk->sk_lock.slock);
2315         if (sk->sk_backlog.tail)
2316                 __release_sock(sk);
2317 
2318         if (sk->sk_prot->release_cb)
2319                 sk->sk_prot->release_cb(sk);
2320 
2321         sk->sk_lock.owned = 0;
2322         if (waitqueue_active(&sk->sk_lock.wq))
2323                 wake_up(&sk->sk_lock.wq);
2324         spin_unlock_bh(&sk->sk_lock.slock);
2325 }
2326 EXPORT_SYMBOL(release_sock);
2327 
2328 /**
2329  * lock_sock_fast - fast version of lock_sock
2330  * @sk: socket
2331  *
2332  * This version should be used for very small sections, where the process won't block.
2333  * return false if fast path is taken
2334  *   sk_lock.slock locked, owned = 0, BH disabled
2335  * return true if slow path is taken
2336  *   sk_lock.slock unlocked, owned = 1, BH enabled
2337  */
2338 bool lock_sock_fast(struct sock *sk)
2339 {
2340         might_sleep();
2341         spin_lock_bh(&sk->sk_lock.slock);
2342 
2343         if (!sk->sk_lock.owned)
2344                 /*
2345                  * Note : We must disable BH
2346                  */
2347                 return false;
2348 
2349         __lock_sock(sk);
2350         sk->sk_lock.owned = 1;
2351         spin_unlock(&sk->sk_lock.slock);
2352         /*
2353          * The sk_lock has mutex_lock() semantics here:
2354          */
2355         mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2356         local_bh_enable();
2357         return true;
2358 }
2359 EXPORT_SYMBOL(lock_sock_fast);
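     /* (editor's note) A minimal caller sketch, assuming the usual pairing with
      * the unlock_sock_fast() helper from include/net/sock.h:
      *
      *      bool slow = lock_sock_fast(sk);
      *      ... short critical section ...
      *      unlock_sock_fast(sk, slow);
      */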
2360 
2361 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2362 {
2363         struct timeval tv;
2364         if (!sock_flag(sk, SOCK_TIMESTAMP))
2365                 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2366         tv = ktime_to_timeval(sk->sk_stamp);
2367         if (tv.tv_sec == -1)
2368                 return -ENOENT;
2369         if (tv.tv_sec == 0) {
2370                 sk->sk_stamp = ktime_get_real();
2371                 tv = ktime_to_timeval(sk->sk_stamp);
2372         }
2373         return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2374 }
2375 EXPORT_SYMBOL(sock_get_timestamp);
2376 
2377 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2378 {
2379         struct timespec ts;
2380         if (!sock_flag(sk, SOCK_TIMESTAMP))
2381                 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2382         ts = ktime_to_timespec(sk->sk_stamp);
2383         if (ts.tv_sec == -1)
2384                 return -ENOENT;
2385         if (ts.tv_sec == 0) {
2386                 sk->sk_stamp = ktime_get_real();
2387                 ts = ktime_to_timespec(sk->sk_stamp);
2388         }
2389         return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2390 }
2391 EXPORT_SYMBOL(sock_get_timestampns);
2392 
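     /* (editor's note) Set one of the timestamping flags and switch on the
      * global net timestamp machinery the first time either flag is enabled.
      */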
2393 void sock_enable_timestamp(struct sock *sk, int flag)
2394 {
2395         if (!sock_flag(sk, flag)) {
2396                 unsigned long previous_flags = sk->sk_flags;
2397 
2398                 sock_set_flag(sk, flag);
2399                 /*
2400                  * we just set one of the two flags which require net
2401                  * time stamping, but time stamping might have been on
2402                  * already because of the other one
2403                  */
2404                 if (!(previous_flags & SK_FLAGS_TIMESTAMP))
2405                         net_enable_timestamp();
2406         }
2407 }
2408 
2409 /*
2410  *      Get a socket option on a socket.
2411  *
2412  *      FIX: POSIX 1003.1g is very ambiguous here. It states that
2413  *      asynchronous errors should be reported by getsockopt. We assume
2414  *      this means if you specify SO_ERROR (otherwise what's the point of it).
2415  */
2416 int sock_common_getsockopt(struct socket *sock, int level, int optname,
2417                            char __user *optval, int __user *optlen)
2418 {
2419         struct sock *sk = sock->sk;
2420 
2421         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2422 }
2423 EXPORT_SYMBOL(sock_common_getsockopt);
2424 
2425 #ifdef CONFIG_COMPAT
2426 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2427                                   char __user *optval, int __user *optlen)
2428 {
2429         struct sock *sk = sock->sk;
2430 
2431         if (sk->sk_prot->compat_getsockopt != NULL)
2432                 return sk->sk_prot->compat_getsockopt(sk, level, optname,
2433                                                       optval, optlen);
2434         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2435 }
2436 EXPORT_SYMBOL(compat_sock_common_getsockopt);
2437 #endif
2438 
2439 int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
2440                         struct msghdr *msg, size_t size, int flags)
2441 {
2442         struct sock *sk = sock->sk;
2443         int addr_len = 0;
2444         int err;
2445 
2446         err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
2447                                    flags & ~MSG_DONTWAIT, &addr_len);
2448         if (err >= 0)
2449                 msg->msg_namelen = addr_len;
2450         return err;
2451 }
2452 EXPORT_SYMBOL(sock_common_recvmsg);
2453 
2454 /*
2455  *      Set socket options on an inet socket.
2456  */
2457 int sock_common_setsockopt(struct socket *sock, int level, int optname,
2458                            char __user *optval, unsigned int optlen)
2459 {
2460         struct sock *sk = sock->sk;
2461 
2462         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2463 }
2464 EXPORT_SYMBOL(sock_common_setsockopt);
2465 
2466 #ifdef CONFIG_COMPAT
2467 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2468                                   char __user *optval, unsigned int optlen)
2469 {
2470         struct sock *sk = sock->sk;
2471 
2472         if (sk->sk_prot->compat_setsockopt != NULL)
2473                 return sk->sk_prot->compat_setsockopt(sk, level, optname,
2474                                                       optval, optlen);
2475         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2476 }
2477 EXPORT_SYMBOL(compat_sock_common_setsockopt);
2478 #endif
2479 
2480 void sk_common_release(struct sock *sk)
2481 {
2482         if (sk->sk_prot->destroy)
2483                 sk->sk_prot->destroy(sk);
2484 
2485         /*
2486          * Observation: when sock_common_release is called, processes have
2487          * no access to the socket, but the network stack still does.
2488          * Step one, detach it from networking:
2489          *
2490          * A. Remove from hash tables.
2491          */
2492 
2493         sk->sk_prot->unhash(sk);
2494 
2495         /*
2496          * At this point the socket cannot receive new packets, but it is possible
2497          * that some packets are in flight because some CPU runs the receiver and
2498          * did the hash table lookup before we unhashed the socket. They will reach
2499          * the receive queue and be purged by the socket destructor.
2500          *
2501          * Also we still have packets pending on the receive queue and, probably,
2502          * our own packets waiting in device queues. sock_destroy will drain the
2503          * receive queue, but transmitted packets will delay socket destruction
2504          * until the last reference is released.
2505          */
2506 
2507         sock_orphan(sk);
2508 
2509         xfrm_sk_free_policy(sk);
2510 
2511         sk_refcnt_debug_release(sk);
2512 
2513         if (sk->sk_frag.page) {
2514                 put_page(sk->sk_frag.page);
2515                 sk->sk_frag.page = NULL;
2516         }
2517 
2518         sock_put(sk);
2519 }
2520 EXPORT_SYMBOL(sk_common_release);
2521 
2522 #ifdef CONFIG_PROC_FS
2523 #define PROTO_INUSE_NR  64      /* should be enough for the first time */
2524 struct prot_inuse {
2525         int val[PROTO_INUSE_NR];
2526 };
2527 
2528 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2529 
2530 #ifdef CONFIG_NET_NS
2531 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2532 {
2533         __this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
2534 }
2535 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2536 
2537 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2538 {
2539         int cpu, idx = prot->inuse_idx;
2540         int res = 0;
2541 
2542         for_each_possible_cpu(cpu)
2543                 res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2544 
2545         return res >= 0 ? res : 0;
2546 }
2547 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2548 
2549 static int __net_init sock_inuse_init_net(struct net *net)
2550 {
2551         net->core.inuse = alloc_percpu(struct prot_inuse);
2552         return net->core.inuse ? 0 : -ENOMEM;
2553 }
2554 
2555 static void __net_exit sock_inuse_exit_net(struct net *net)
2556 {
2557         free_percpu(net->core.inuse);
2558 }
2559 
2560 static struct pernet_operations net_inuse_ops = {
2561         .init = sock_inuse_init_net,
2562         .exit = sock_inuse_exit_net,
2563 };
2564 
2565 static __init int net_inuse_init(void)
2566 {
2567         if (register_pernet_subsys(&net_inuse_ops))
2568                 panic("Cannot initialize net inuse counters");
2569 
2570         return 0;
2571 }
2572 
2573 core_initcall(net_inuse_init);
2574 #else
2575 static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2576 
2577 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2578 {
2579         __this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
2580 }
2581 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2582 
2583 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2584 {
2585         int cpu, idx = prot->inuse_idx;
2586         int res = 0;
2587 
2588         for_each_possible_cpu(cpu)
2589                 res += per_cpu(prot_inuse, cpu).val[idx];
2590 
2591         return res >= 0 ? res : 0;
2592 }
2593 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2594 #endif
2595 
2596 static void assign_proto_idx(struct proto *prot)
2597 {
2598         prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2599 
2600         if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2601                 pr_err("PROTO_INUSE_NR exhausted\n");
2602                 return;
2603         }
2604 
2605         set_bit(prot->inuse_idx, proto_inuse_idx);
2606 }
2607 
2608 static void release_proto_idx(struct proto *prot)
2609 {
2610         if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2611                 clear_bit(prot->inuse_idx, proto_inuse_idx);
2612 }
2613 #else
2614 static inline void assign_proto_idx(struct proto *prot)
2615 {
2616 }
2617 
2618 static inline void release_proto_idx(struct proto *prot)
2619 {
2620 }
2621 #endif
2622 
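     /* (editor's note) Register a protocol: optionally create its sock,
      * request_sock and timewait_sock slab caches, then add it to proto_list
      * and assign it an inuse-counter index for /proc/net/protocols.
      */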
2623 int proto_register(struct proto *prot, int alloc_slab)
2624 {
2625         if (alloc_slab) {
2626                 prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2627                                         SLAB_HWCACHE_ALIGN | prot->slab_flags,
2628                                         NULL);
2629 
2630                 if (prot->slab == NULL) {
2631                         pr_crit("%s: Can't create sock SLAB cache!\n",
2632                                 prot->name);
2633                         goto out;
2634                 }
2635 
2636                 if (prot->rsk_prot != NULL) {
2637                         prot->rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name);
2638                         if (prot->rsk_prot->slab_name == NULL)
2639                                 goto out_free_sock_slab;
2640 
2641                         prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
2642                                                                  prot->rsk_prot->obj_size, 0,
2643                                                                  SLAB_HWCACHE_ALIGN, NULL);
2644 
2645                         if (prot->rsk_prot->slab == NULL) {
2646                                 pr_crit("%s: Can't create request sock SLAB cache!\n",
2647                                         prot->name);
2648                                 goto out_free_request_sock_slab_name;
2649                         }
2650                 }
2651 
2652                 if (prot->twsk_prot != NULL) {
2653                         prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
2654 
2655                         if (prot->twsk_prot->twsk_slab_name == NULL)
2656                                 goto out_free_request_sock_slab;
2657 
2658                         prot->twsk_prot->twsk_slab =
2659                                 kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2660                                                   prot->twsk_prot->twsk_obj_size,
2661                                                   0,
2662                                                   SLAB_HWCACHE_ALIGN |
2663                                                         prot->slab_flags,
2664                                                   NULL);
2665                         if (prot->twsk_prot->twsk_slab == NULL)
2666                                 goto out_free_timewait_sock_slab_name;
2667                 }
2668         }
2669 
2670         mutex_lock(&proto_list_mutex);
2671         list_add(&prot->node, &proto_list);
2672         assign_proto_idx(prot);
2673         mutex_unlock(&proto_list_mutex);
2674         return 0;
2675 
2676 out_free_timewait_sock_slab_name:
2677         kfree(prot->twsk_prot->twsk_slab_name);
2678 out_free_request_sock_slab:
2679         if (prot->rsk_prot && prot->rsk_prot->slab) {
2680                 kmem_cache_destroy(prot->rsk_prot->slab);
2681                 prot->rsk_prot->slab = NULL;
2682         }
2683 out_free_request_sock_slab_name:
2684         if (prot->rsk_prot)
2685                 kfree(prot->rsk_prot->slab_name);
2686 out_free_sock_slab:
2687         kmem_cache_destroy(prot->slab);
2688         prot->slab = NULL;
2689 out:
2690         return -ENOBUFS;
2691 }
2692 EXPORT_SYMBOL(proto_register);
2693 
2694 void proto_unregister(struct proto *prot)
2695 {
2696         mutex_lock(&proto_list_mutex);
2697         release_proto_idx(prot);
2698         list_del(&prot->node);
2699         mutex_unlock(&proto_list_mutex);
2700 
2701         if (prot->slab != NULL) {
2702                 kmem_cache_destroy(prot->slab);
2703                 prot->slab = NULL;
2704         }
2705 
2706         if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
2707                 kmem_cache_destroy(prot->rsk_prot->slab);
2708                 kfree(prot->rsk_prot->slab_name);
2709                 prot->rsk_prot->slab = NULL;
2710         }
2711 
2712         if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2713                 kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2714                 kfree(prot->twsk_prot->twsk_slab_name);
2715                 prot->twsk_prot->twsk_slab = NULL;
2716         }
2717 }
2718 EXPORT_SYMBOL(proto_unregister);
2719 
2720 #ifdef CONFIG_PROC_FS
2721 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2722         __acquires(proto_list_mutex)
2723 {
2724         mutex_lock(&proto_list_mutex);
2725         return seq_list_start_head(&proto_list, *pos);
2726 }
2727 
2728 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2729 {
2730         return seq_list_next(v, &proto_list, pos);
2731 }
2732 
2733 static void proto_seq_stop(struct seq_file *seq, void *v)
2734         __releases(proto_list_mutex)
2735 {
2736         mutex_unlock(&proto_list_mutex);
2737 }
2738 
2739 static char proto_method_implemented(const void *method)
2740 {
2741         return method == NULL ? 'n' : 'y';
2742 }
2743 static long sock_prot_memory_allocated(struct proto *proto)
2744 {
2745         return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
2746 }
2747 
2748 static char *sock_prot_memory_pressure(struct proto *proto)
2749 {
2750         return proto->memory_pressure != NULL ?
2751         proto_memory_pressure(proto) ? "yes" : "no" : "NI";
2752 }
2753 
2754 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2755 {
2756 
2757         seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
2758                         "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2759                    proto->name,
2760                    proto->obj_size,
2761                    sock_prot_inuse_get(seq_file_net(seq), proto),
2762                    sock_prot_memory_allocated(proto),
2763                    sock_prot_memory_pressure(proto),
2764                    proto->max_header,
2765                    proto->slab == NULL ? "no" : "yes",
2766                    module_name(proto->owner),
2767                    proto_method_implemented(proto->close),
2768                    proto_method_implemented(proto->connect),
2769                    proto_method_implemented(proto->disconnect),
2770                    proto_method_implemented(proto->accept),
2771                    proto_method_implemented(proto->ioctl),
2772                    proto_method_implemented(proto->init),
2773                    proto_method_implemented(proto->destroy),
2774                    proto_method_implemented(proto->shutdown),
2775                    proto_method_implemented(proto->setsockopt),
2776                    proto_method_implemented(proto->getsockopt),
2777                    proto_method_implemented(proto->sendmsg),
2778                    proto_method_implemented(proto->recvmsg),
2779                    proto_method_implemented(proto->sendpage),
2780                    proto_method_implemented(proto->bind),
2781                    proto_method_implemented(proto->backlog_rcv),
2782                    proto_method_implemented(proto->hash),
2783                    proto_method_implemented(proto->unhash),
2784                    proto_method_implemented(proto->get_port),
2785                    proto_method_implemented(proto->enter_memory_pressure));
2786 }
2787 
2788 static int proto_seq_show(struct seq_file *seq, void *v)
2789 {
2790         if (v == &proto_list)
2791                 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2792                            "protocol",
2793                            "size",
2794                            "sockets",
2795                            "memory",
2796                            "press",
2797                            "maxhdr",
2798                            "slab",
2799                            "module",
2800                            "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2801         else
2802                 proto_seq_printf(seq, list_entry(v, struct proto, node));
2803         return 0;
2804 }
2805 
2806 static const struct seq_operations proto_seq_ops = {
2807         .start  = proto_seq_start,
2808         .next   = proto_seq_next,
2809         .stop   = proto_seq_stop,
2810         .show   = proto_seq_show,
2811 };
2812 
2813 static int proto_seq_open(struct inode *inode, struct file *file)
2814 {
2815         return seq_open_net(inode, file, &proto_seq_ops,
2816                             sizeof(struct seq_net_private));
2817 }
2818 
2819 static const struct file_operations proto_seq_fops = {
2820         .owner          = THIS_MODULE,
2821         .open           = proto_seq_open,
2822         .read           = seq_read,
2823         .llseek         = seq_lseek,
2824         .release        = seq_release_net,
2825 };
2826 
2827 static __net_init int proto_init_net(struct net *net)
2828 {
2829         if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))
2830                 return -ENOMEM;
2831 
2832         return 0;
2833 }
2834 
2835 static __net_exit void proto_exit_net(struct net *net)
2836 {
2837         remove_proc_entry("protocols", net->proc_net);
2838 }
2839 
2840 
2841 static __net_initdata struct pernet_operations proto_net_ops = {
2842         .init = proto_init_net,
2843         .exit = proto_exit_net,
2844 };
2845 
2846 static int __init proto_init(void)
2847 {
2848         return register_pernet_subsys(&proto_net_ops);
2849 }
2850 
2851 subsys_initcall(proto_init);
2852 
2853 #endif /* PROC_FS */
2854 
