~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

TOMOYO Linux Cross Reference
Linux/fs/io_uring.c

Version: ~ [ linux-5.14-rc3 ] ~ [ linux-5.13.5 ] ~ [ linux-5.12.19 ] ~ [ linux-5.11.22 ] ~ [ linux-5.10.53 ] ~ [ linux-5.9.16 ] ~ [ linux-5.8.18 ] ~ [ linux-5.7.19 ] ~ [ linux-5.6.19 ] ~ [ linux-5.5.19 ] ~ [ linux-5.4.135 ] ~ [ linux-5.3.18 ] ~ [ linux-5.2.21 ] ~ [ linux-5.1.21 ] ~ [ linux-5.0.21 ] ~ [ linux-4.20.17 ] ~ [ linux-4.19.198 ] ~ [ linux-4.18.20 ] ~ [ linux-4.17.19 ] ~ [ linux-4.16.18 ] ~ [ linux-4.15.18 ] ~ [ linux-4.14.240 ] ~ [ linux-4.13.16 ] ~ [ linux-4.12.14 ] ~ [ linux-4.11.12 ] ~ [ linux-4.10.17 ] ~ [ linux-4.9.276 ] ~ [ linux-4.8.17 ] ~ [ linux-4.7.10 ] ~ [ linux-4.6.7 ] ~ [ linux-4.5.7 ] ~ [ linux-4.4.276 ] ~ [ linux-4.3.6 ] ~ [ linux-4.2.8 ] ~ [ linux-4.1.52 ] ~ [ linux-4.0.9 ] ~ [ linux-3.18.140 ] ~ [ linux-3.16.85 ] ~ [ linux-3.14.79 ] ~ [ linux-3.12.74 ] ~ [ linux-3.10.108 ] ~ [ linux-2.6.32.71 ] ~ [ linux-2.6.0 ] ~ [ linux-2.4.37.11 ] ~ [ unix-v6-master ] ~ [ ccs-tools-1.8.5 ] ~ [ policy-sample ] ~
Architecture: ~ [ i386 ] ~ [ alpha ] ~ [ m68k ] ~ [ mips ] ~ [ ppc ] ~ [ sparc ] ~ [ sparc64 ] ~

  1 // SPDX-License-Identifier: GPL-2.0
  2 /*
  3  * Shared application/kernel submission and completion ring pairs, for
  4  * supporting fast/efficient IO.
  5  *
  6  * A note on the read/write ordering memory barriers that are matched between
  7  * the application and kernel side.
  8  *
  9  * After the application reads the CQ ring tail, it must use an
 10  * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
 11  * before writing the tail (using smp_load_acquire to read the tail will
 12  * do). It also needs a smp_mb() before updating CQ head (ordering the
 13  * entry load(s) with the head store), pairing with an implicit barrier
 14  * through a control-dependency in io_get_cqring (smp_store_release to
 15  * store head will do). Failure to do so could lead to reading invalid
 16  * CQ entries.
 17  *
 18  * Likewise, the application must use an appropriate smp_wmb() before
 19  * writing the SQ tail (ordering SQ entry stores with the tail store),
 20  * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
 21  * to store the tail will do). And it needs a barrier ordering the SQ
 22  * head load before writing new SQ entries (smp_load_acquire to read
 23  * head will do).
 24  *
 25  * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
 26  * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
 27  * updating the SQ tail; a full memory barrier smp_mb() is needed
 28  * between.
 29  *
 30  * Also see the examples in the liburing library:
 31  *
 32  *      git://git.kernel.dk/liburing
 33  *
 34  * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
 35  * from data shared between the kernel and application. This is done both
 36  * for ordering purposes, but also to ensure that once a value is loaded from
 37  * data that the application could potentially modify, it remains stable.
 38  *
 39  * Copyright (C) 2018-2019 Jens Axboe
 40  * Copyright (c) 2018-2019 Christoph Hellwig
 41  */
 42 #include <linux/kernel.h>
 43 #include <linux/init.h>
 44 #include <linux/errno.h>
 45 #include <linux/syscalls.h>
 46 #include <linux/compat.h>
 47 #include <net/compat.h>
 48 #include <linux/refcount.h>
 49 #include <linux/uio.h>
 50 #include <linux/bits.h>
 51 
 52 #include <linux/sched/signal.h>
 53 #include <linux/fs.h>
 54 #include <linux/file.h>
 55 #include <linux/fdtable.h>
 56 #include <linux/mm.h>
 57 #include <linux/mman.h>
 58 #include <linux/percpu.h>
 59 #include <linux/slab.h>
 60 #include <linux/kthread.h>
 61 #include <linux/blkdev.h>
 62 #include <linux/bvec.h>
 63 #include <linux/net.h>
 64 #include <net/sock.h>
 65 #include <net/af_unix.h>
 66 #include <net/scm.h>
 67 #include <linux/anon_inodes.h>
 68 #include <linux/sched/mm.h>
 69 #include <linux/uaccess.h>
 70 #include <linux/nospec.h>
 71 #include <linux/sizes.h>
 72 #include <linux/hugetlb.h>
 73 #include <linux/highmem.h>
 74 #include <linux/namei.h>
 75 #include <linux/fsnotify.h>
 76 #include <linux/fadvise.h>
 77 #include <linux/eventpoll.h>
 78 #include <linux/fs_struct.h>
 79 #include <linux/splice.h>
 80 #include <linux/task_work.h>
 81 #include <linux/pagemap.h>
 82 #include <linux/io_uring.h>
 83 
 84 #define CREATE_TRACE_POINTS
 85 #include <trace/events/io_uring.h>
 86 
 87 #include <uapi/linux/io_uring.h>
 88 
 89 #include "internal.h"
 90 #include "io-wq.h"
 91 
 92 #define IORING_MAX_ENTRIES      32768
 93 #define IORING_MAX_CQ_ENTRIES   (2 * IORING_MAX_ENTRIES)
 94 
 95 /*
 96  * Shift of 9 is 512 entries, or exactly one page on 64-bit archs
 97  */
 98 #define IORING_FILE_TABLE_SHIFT 9
 99 #define IORING_MAX_FILES_TABLE  (1U << IORING_FILE_TABLE_SHIFT)
100 #define IORING_FILE_TABLE_MASK  (IORING_MAX_FILES_TABLE - 1)
101 #define IORING_MAX_FIXED_FILES  (64 * IORING_MAX_FILES_TABLE)
102 
103 struct io_uring {
104         u32 head ____cacheline_aligned_in_smp;
105         u32 tail ____cacheline_aligned_in_smp;
106 };
107 
108 /*
109  * This data is shared with the application through the mmap at offsets
110  * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
111  *
112  * The offsets to the member fields are published through struct
113  * io_sqring_offsets when calling io_uring_setup.
114  */
115 struct io_rings {
116         /*
117          * Head and tail offsets into the ring; the offsets need to be
118          * masked to get valid indices.
119          *
120          * The kernel controls head of the sq ring and the tail of the cq ring,
121          * and the application controls tail of the sq ring and the head of the
122          * cq ring.
123          */
124         struct io_uring         sq, cq;
125         /*
126          * Bitmasks to apply to head and tail offsets (constant, equals
127          * ring_entries - 1)
128          */
129         u32                     sq_ring_mask, cq_ring_mask;
130         /* Ring sizes (constant, power of 2) */
131         u32                     sq_ring_entries, cq_ring_entries;
132         /*
133          * Number of invalid entries dropped by the kernel due to
134          * invalid index stored in array
135          *
136          * Written by the kernel, shouldn't be modified by the
137          * application (i.e. get number of "new events" by comparing to
138          * cached value).
139          *
140          * After a new SQ head value was read by the application this
141          * counter includes all submissions that were dropped reaching
142          * the new SQ head (and possibly more).
143          */
144         u32                     sq_dropped;
145         /*
146          * Runtime SQ flags
147          *
148          * Written by the kernel, shouldn't be modified by the
149          * application.
150          *
151          * The application needs a full memory barrier before checking
152          * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
153          */
154         u32                     sq_flags;
155         /*
156          * Runtime CQ flags
157          *
158          * Written by the application, shouldn't be modified by the
159          * kernel.
160          */
161         u32                     cq_flags;
162         /*
163          * Number of completion events lost because the queue was full;
164          * this should be avoided by the application by making sure
165          * there are not more requests pending than there is space in
166          * the completion queue.
167          *
168          * Written by the kernel, shouldn't be modified by the
169          * application (i.e. get number of "new events" by comparing to
170          * cached value).
171          *
172          * As completion events come in out of order this counter is not
173          * ordered with any other data.
174          */
175         u32                     cq_overflow;
176         /*
177          * Ring buffer of completion events.
178          *
179          * The kernel writes completion events fresh every time they are
180          * produced, so the application is allowed to modify pending
181          * entries.
182          */
183         struct io_uring_cqe     cqes[] ____cacheline_aligned_in_smp;
184 };
185 
186 struct io_mapped_ubuf {
187         u64             ubuf;
188         size_t          len;
189         struct          bio_vec *bvec;
190         unsigned int    nr_bvecs;
191 };
192 
193 struct fixed_file_table {
194         struct file             **files;
195 };
196 
197 struct fixed_file_ref_node {
198         struct percpu_ref               refs;
199         struct list_head                node;
200         struct list_head                file_list;
201         struct fixed_file_data          *file_data;
202         struct llist_node               llist;
203         bool                            done;
204 };
205 
206 struct fixed_file_data {
207         struct fixed_file_table         *table;
208         struct io_ring_ctx              *ctx;
209 
210         struct percpu_ref               *cur_refs;
211         struct percpu_ref               refs;
212         struct completion               done;
213         struct list_head                ref_list;
214         spinlock_t                      lock;
215 };
216 
217 struct io_buffer {
218         struct list_head list;
219         __u64 addr;
220         __s32 len;
221         __u16 bid;
222 };
223 
224 struct io_ring_ctx {
225         struct {
226                 struct percpu_ref       refs;
227         } ____cacheline_aligned_in_smp;
228 
229         struct {
230                 unsigned int            flags;
231                 unsigned int            compat: 1;
232                 unsigned int            limit_mem: 1;
233                 unsigned int            cq_overflow_flushed: 1;
234                 unsigned int            drain_next: 1;
235                 unsigned int            eventfd_async: 1;
236 
237                 /*
238                  * Ring buffer of indices into array of io_uring_sqe, which is
239                  * mmapped by the application using the IORING_OFF_SQES offset.
240                  *
241                  * This indirection could e.g. be used to assign fixed
242                  * io_uring_sqe entries to operations and only submit them to
243                  * the queue when needed.
244                  *
245                  * The kernel modifies neither the indices array nor the entries
246                  * array.
247                  */
248                 u32                     *sq_array;
249                 unsigned                cached_sq_head;
250                 unsigned                sq_entries;
251                 unsigned                sq_mask;
252                 unsigned                sq_thread_idle;
253                 unsigned                cached_sq_dropped;
254                 atomic_t                cached_cq_overflow;
255                 unsigned long           sq_check_overflow;
256 
257                 struct list_head        defer_list;
258                 struct list_head        timeout_list;
259                 struct list_head        cq_overflow_list;
260 
261                 wait_queue_head_t       inflight_wait;
262                 struct io_uring_sqe     *sq_sqes;
263         } ____cacheline_aligned_in_smp;
264 
265         struct io_rings *rings;
266 
267         /* IO offload */
268         struct io_wq            *io_wq;
269         struct task_struct      *sqo_thread;    /* if using sq thread polling */
270 
271         /*
272          * For SQPOLL usage - we hold a reference to the parent task, so we
273          * have access to the ->files
274          */
275         struct task_struct      *sqo_task;
276 
277         /* Only used for accounting purposes */
278         struct mm_struct        *mm_account;
279 
280         wait_queue_head_t       sqo_wait;
281 
282         /*
283          * If used, fixed file set. Writers must ensure that ->refs is dead,
284          * readers must ensure that ->refs is alive as long as the file* is
285          * used. Only updated through io_uring_register(2).
286          */
287         struct fixed_file_data  *file_data;
288         unsigned                nr_user_files;
289 
290         /* if used, fixed mapped user buffers */
291         unsigned                nr_user_bufs;
292         struct io_mapped_ubuf   *user_bufs;
293 
294         struct user_struct      *user;
295 
296         const struct cred       *creds;
297 
298         struct completion       ref_comp;
299         struct completion       sq_thread_comp;
300 
301         /* if all else fails... */
302         struct io_kiocb         *fallback_req;
303 
304 #if defined(CONFIG_UNIX)
305         struct socket           *ring_sock;
306 #endif
307 
308         struct idr              io_buffer_idr;
309 
310         struct idr              personality_idr;
311 
312         struct {
313                 unsigned                cached_cq_tail;
314                 unsigned                cq_entries;
315                 unsigned                cq_mask;
316                 atomic_t                cq_timeouts;
317                 unsigned long           cq_check_overflow;
318                 struct wait_queue_head  cq_wait;
319                 struct fasync_struct    *cq_fasync;
320                 struct eventfd_ctx      *cq_ev_fd;
321         } ____cacheline_aligned_in_smp;
322 
323         struct {
324                 struct mutex            uring_lock;
325                 wait_queue_head_t       wait;
326         } ____cacheline_aligned_in_smp;
327 
328         struct {
329                 spinlock_t              completion_lock;
330 
331                 /*
332                  * ->iopoll_list is protected by the ctx->uring_lock for
333                  * io_uring instances that don't use IORING_SETUP_SQPOLL.
334                  * For SQPOLL, only the single threaded io_sq_thread() will
335                  * manipulate the list, hence no extra locking is needed there.
336                  */
337                 struct list_head        iopoll_list;
338                 struct hlist_head       *cancel_hash;
339                 unsigned                cancel_hash_bits;
340                 bool                    poll_multi_file;
341 
342                 spinlock_t              inflight_lock;
343                 struct list_head        inflight_list;
344         } ____cacheline_aligned_in_smp;
345 
346         struct delayed_work             file_put_work;
347         struct llist_head               file_put_llist;
348 
349         struct work_struct              exit_work;
350 };
351 
352 /*
353  * First field must be the file pointer in all the
354  * iocb unions! See also 'struct kiocb' in <linux/fs.h>
355  */
356 struct io_poll_iocb {
357         struct file                     *file;
358         union {
359                 struct wait_queue_head  *head;
360                 u64                     addr;
361         };
362         __poll_t                        events;
363         bool                            done;
364         bool                            canceled;
365         struct wait_queue_entry         wait;
366 };
367 
368 struct io_close {
369         struct file                     *file;
370         struct file                     *put_file;
371         int                             fd;
372 };
373 
374 struct io_timeout_data {
375         struct io_kiocb                 *req;
376         struct hrtimer                  timer;
377         struct timespec64               ts;
378         enum hrtimer_mode               mode;
379 };
380 
381 struct io_accept {
382         struct file                     *file;
383         struct sockaddr __user          *addr;
384         int __user                      *addr_len;
385         int                             flags;
386         unsigned long                   nofile;
387 };
388 
389 struct io_sync {
390         struct file                     *file;
391         loff_t                          len;
392         loff_t                          off;
393         int                             flags;
394         int                             mode;
395 };
396 
397 struct io_cancel {
398         struct file                     *file;
399         u64                             addr;
400 };
401 
402 struct io_timeout {
403         struct file                     *file;
404         u64                             addr;
405         int                             flags;
406         u32                             off;
407         u32                             target_seq;
408         struct list_head                list;
409 };
410 
411 struct io_rw {
412         /* NOTE: kiocb has the file as the first member, so don't do it here */
413         struct kiocb                    kiocb;
414         u64                             addr;
415         u64                             len;
416 };
417 
418 struct io_connect {
419         struct file                     *file;
420         struct sockaddr __user          *addr;
421         int                             addr_len;
422 };
423 
424 struct io_sr_msg {
425         struct file                     *file;
426         union {
427                 struct user_msghdr __user *umsg;
428                 void __user             *buf;
429         };
430         int                             msg_flags;
431         int                             bgid;
432         size_t                          len;
433         struct io_buffer                *kbuf;
434 };
435 
436 struct io_open {
437         struct file                     *file;
438         int                             dfd;
439         bool                            ignore_nonblock;
440         struct filename                 *filename;
441         struct open_how                 how;
442         unsigned long                   nofile;
443 };
444 
445 struct io_files_update {
446         struct file                     *file;
447         u64                             arg;
448         u32                             nr_args;
449         u32                             offset;
450 };
451 
452 struct io_fadvise {
453         struct file                     *file;
454         u64                             offset;
455         u32                             len;
456         u32                             advice;
457 };
458 
459 struct io_madvise {
460         struct file                     *file;
461         u64                             addr;
462         u32                             len;
463         u32                             advice;
464 };
465 
466 struct io_epoll {
467         struct file                     *file;
468         int                             epfd;
469         int                             op;
470         int                             fd;
471         struct epoll_event              event;
472 };
473 
474 struct io_splice {
475         struct file                     *file_out;
476         struct file                     *file_in;
477         loff_t                          off_out;
478         loff_t                          off_in;
479         u64                             len;
480         unsigned int                    flags;
481 };
482 
483 struct io_provide_buf {
484         struct file                     *file;
485         __u64                           addr;
486         __s32                           len;
487         __u32                           bgid;
488         __u16                           nbufs;
489         __u16                           bid;
490 };
491 
492 struct io_statx {
493         struct file                     *file;
494         int                             dfd;
495         unsigned int                    mask;
496         unsigned int                    flags;
497         const char __user               *filename;
498         struct statx __user             *buffer;
499 };
500 
501 struct io_completion {
502         struct file                     *file;
503         struct list_head                list;
504         int                             cflags;
505 };
506 
507 struct io_async_connect {
508         struct sockaddr_storage         address;
509 };
510 
511 struct io_async_msghdr {
512         struct iovec                    fast_iov[UIO_FASTIOV];
513         struct iovec                    *iov;
514         struct sockaddr __user          *uaddr;
515         struct msghdr                   msg;
516         struct sockaddr_storage         addr;
517 };
518 
519 struct io_async_rw {
520         struct iovec                    fast_iov[UIO_FASTIOV];
521         const struct iovec              *free_iovec;
522         struct iov_iter                 iter;
523         size_t                          bytes_done;
524         struct wait_page_queue          wpq;
525 };
526 
527 struct io_async_ctx {
528         union {
529                 struct io_async_rw      rw;
530                 struct io_async_msghdr  msg;
531                 struct io_async_connect connect;
532                 struct io_timeout_data  timeout;
533         };
534 };
535 
536 enum {
537         REQ_F_FIXED_FILE_BIT    = IOSQE_FIXED_FILE_BIT,
538         REQ_F_IO_DRAIN_BIT      = IOSQE_IO_DRAIN_BIT,
539         REQ_F_LINK_BIT          = IOSQE_IO_LINK_BIT,
540         REQ_F_HARDLINK_BIT      = IOSQE_IO_HARDLINK_BIT,
541         REQ_F_FORCE_ASYNC_BIT   = IOSQE_ASYNC_BIT,
542         REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT,
543 
544         REQ_F_LINK_HEAD_BIT,
545         REQ_F_FAIL_LINK_BIT,
546         REQ_F_INFLIGHT_BIT,
547         REQ_F_CUR_POS_BIT,
548         REQ_F_NOWAIT_BIT,
549         REQ_F_LINK_TIMEOUT_BIT,
550         REQ_F_ISREG_BIT,
551         REQ_F_COMP_LOCKED_BIT,
552         REQ_F_NEED_CLEANUP_BIT,
553         REQ_F_POLLED_BIT,
554         REQ_F_BUFFER_SELECTED_BIT,
555         REQ_F_NO_FILE_TABLE_BIT,
556         REQ_F_WORK_INITIALIZED_BIT,
557 
558         /* not a real bit, just to check we're not overflowing the space */
559         __REQ_F_LAST_BIT,
560 };
561 
562 enum {
563         /* ctx owns file */
564         REQ_F_FIXED_FILE        = BIT(REQ_F_FIXED_FILE_BIT),
565         /* drain existing IO first */
566         REQ_F_IO_DRAIN          = BIT(REQ_F_IO_DRAIN_BIT),
567         /* linked sqes */
568         REQ_F_LINK              = BIT(REQ_F_LINK_BIT),
569         /* doesn't sever on completion < 0 */
570         REQ_F_HARDLINK          = BIT(REQ_F_HARDLINK_BIT),
571         /* IOSQE_ASYNC */
572         REQ_F_FORCE_ASYNC       = BIT(REQ_F_FORCE_ASYNC_BIT),
573         /* IOSQE_BUFFER_SELECT */
574         REQ_F_BUFFER_SELECT     = BIT(REQ_F_BUFFER_SELECT_BIT),
575 
576         /* head of a link */
577         REQ_F_LINK_HEAD         = BIT(REQ_F_LINK_HEAD_BIT),
578         /* fail rest of links */
579         REQ_F_FAIL_LINK         = BIT(REQ_F_FAIL_LINK_BIT),
580         /* on inflight list */
581         REQ_F_INFLIGHT          = BIT(REQ_F_INFLIGHT_BIT),
582         /* read/write uses file position */
583         REQ_F_CUR_POS           = BIT(REQ_F_CUR_POS_BIT),
584         /* must not punt to workers */
585         REQ_F_NOWAIT            = BIT(REQ_F_NOWAIT_BIT),
586         /* has linked timeout */
587         REQ_F_LINK_TIMEOUT      = BIT(REQ_F_LINK_TIMEOUT_BIT),
588         /* regular file */
589         REQ_F_ISREG             = BIT(REQ_F_ISREG_BIT),
590         /* completion under lock */
591         REQ_F_COMP_LOCKED       = BIT(REQ_F_COMP_LOCKED_BIT),
592         /* needs cleanup */
593         REQ_F_NEED_CLEANUP      = BIT(REQ_F_NEED_CLEANUP_BIT),
594         /* already went through poll handler */
595         REQ_F_POLLED            = BIT(REQ_F_POLLED_BIT),
596         /* buffer already selected */
597         REQ_F_BUFFER_SELECTED   = BIT(REQ_F_BUFFER_SELECTED_BIT),
598         /* doesn't need file table for this request */
599         REQ_F_NO_FILE_TABLE     = BIT(REQ_F_NO_FILE_TABLE_BIT),
600         /* io_wq_work is initialized */
601         REQ_F_WORK_INITIALIZED  = BIT(REQ_F_WORK_INITIALIZED_BIT),
602 };
603 
604 struct async_poll {
605         struct io_poll_iocb     poll;
606         struct io_poll_iocb     *double_poll;
607 };
608 
609 /*
610  * NOTE! Each of the iocb union members has the file pointer
611  * as the first entry in their struct definition. So you can
612  * access the file pointer through any of the sub-structs,
613  * or directly as just 'ki_filp' in this struct.
614  */
615 struct io_kiocb {
616         union {
617                 struct file             *file;
618                 struct io_rw            rw;
619                 struct io_poll_iocb     poll;
620                 struct io_accept        accept;
621                 struct io_sync          sync;
622                 struct io_cancel        cancel;
623                 struct io_timeout       timeout;
624                 struct io_connect       connect;
625                 struct io_sr_msg        sr_msg;
626                 struct io_open          open;
627                 struct io_close         close;
628                 struct io_files_update  files_update;
629                 struct io_fadvise       fadvise;
630                 struct io_madvise       madvise;
631                 struct io_epoll         epoll;
632                 struct io_splice        splice;
633                 struct io_provide_buf   pbuf;
634                 struct io_statx         statx;
635                 /* use only after cleaning per-op data, see io_clean_op() */
636                 struct io_completion    compl;
637         };
638 
639         struct io_async_ctx             *io;
640         u8                              opcode;
641         /* polled IO has completed */
642         u8                              iopoll_completed;
643 
644         u16                             buf_index;
645         u32                             result;
646 
647         struct io_ring_ctx              *ctx;
648         unsigned int                    flags;
649         refcount_t                      refs;
650         struct task_struct              *task;
651         u64                             user_data;
652 
653         struct list_head                link_list;
654 
655         /*
656          * 1. used with ctx->iopoll_list with reads/writes
657          * 2. to track reqs with ->files (see io_op_def::file_table)
658          */
659         struct list_head                inflight_entry;
660 
661         struct percpu_ref               *fixed_file_refs;
662         struct callback_head            task_work;
663         /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
664         struct hlist_node               hash_node;
665         struct async_poll               *apoll;
666         struct io_wq_work               work;
667 };
668 
669 struct io_defer_entry {
670         struct list_head        list;
671         struct io_kiocb         *req;
672         u32                     seq;
673 };
674 
675 #define IO_IOPOLL_BATCH                 8
676 
677 struct io_comp_state {
678         unsigned int            nr;
679         struct list_head        list;
680         struct io_ring_ctx      *ctx;
681 };
682 
683 struct io_submit_state {
684         struct blk_plug         plug;
685 
686         /*
687          * io_kiocb alloc cache
688          */
689         void                    *reqs[IO_IOPOLL_BATCH];
690         unsigned int            free_reqs;
691 
692         /*
693          * Batch completion logic
694          */
695         struct io_comp_state    comp;
696 
697         /*
698          * File reference cache
699          */
700         struct file             *file;
701         unsigned int            fd;
702         unsigned int            has_refs;
703         unsigned int            ios_left;
704 };
705 
706 struct io_op_def {
707         /* needs req->io allocated for deferral/async */
708         unsigned                async_ctx : 1;
709         /* needs current->mm setup, does mm access */
710         unsigned                needs_mm : 1;
711         /* needs req->file assigned */
712         unsigned                needs_file : 1;
713         /* don't fail if file grab fails */
714         unsigned                needs_file_no_error : 1;
715         /* hash wq insertion if file is a regular file */
716         unsigned                hash_reg_file : 1;
717         /* unbound wq insertion if file is a non-regular file */
718         unsigned                unbound_nonreg_file : 1;
719         /* opcode is not supported by this kernel */
720         unsigned                not_supported : 1;
721         /* needs file table */
722         unsigned                file_table : 1;
723         /* needs ->fs */
724         unsigned                needs_fs : 1;
725         /* set if opcode supports polled "wait" */
726         unsigned                pollin : 1;
727         unsigned                pollout : 1;
728         /* op supports buffer selection */
729         unsigned                buffer_select : 1;
730         unsigned                needs_fsize : 1;
731 };
732 
733 static const struct io_op_def io_op_defs[] = {
734         [IORING_OP_NOP] = {},
735         [IORING_OP_READV] = {
736                 .async_ctx              = 1,
737                 .needs_mm               = 1,
738                 .needs_file             = 1,
739                 .unbound_nonreg_file    = 1,
740                 .pollin                 = 1,
741                 .buffer_select          = 1,
742         },
743         [IORING_OP_WRITEV] = {
744                 .async_ctx              = 1,
745                 .needs_mm               = 1,
746                 .needs_file             = 1,
747                 .hash_reg_file          = 1,
748                 .unbound_nonreg_file    = 1,
749                 .pollout                = 1,
750                 .needs_fsize            = 1,
751         },
752         [IORING_OP_FSYNC] = {
753                 .needs_file             = 1,
754         },
755         [IORING_OP_READ_FIXED] = {
756                 .needs_file             = 1,
757                 .unbound_nonreg_file    = 1,
758                 .pollin                 = 1,
759         },
760         [IORING_OP_WRITE_FIXED] = {
761                 .needs_file             = 1,
762                 .hash_reg_file          = 1,
763                 .unbound_nonreg_file    = 1,
764                 .pollout                = 1,
765                 .needs_fsize            = 1,
766         },
767         [IORING_OP_POLL_ADD] = {
768                 .needs_file             = 1,
769                 .unbound_nonreg_file    = 1,
770         },
771         [IORING_OP_POLL_REMOVE] = {},
772         [IORING_OP_SYNC_FILE_RANGE] = {
773                 .needs_file             = 1,
774         },
775         [IORING_OP_SENDMSG] = {
776                 .async_ctx              = 1,
777                 .needs_mm               = 1,
778                 .needs_file             = 1,
779                 .unbound_nonreg_file    = 1,
780                 .needs_fs               = 1,
781                 .pollout                = 1,
782         },
783         [IORING_OP_RECVMSG] = {
784                 .async_ctx              = 1,
785                 .needs_mm               = 1,
786                 .needs_file             = 1,
787                 .unbound_nonreg_file    = 1,
788                 .needs_fs               = 1,
789                 .pollin                 = 1,
790                 .buffer_select          = 1,
791         },
792         [IORING_OP_TIMEOUT] = {
793                 .async_ctx              = 1,
794                 .needs_mm               = 1,
795         },
796         [IORING_OP_TIMEOUT_REMOVE] = {},
797         [IORING_OP_ACCEPT] = {
798                 .needs_mm               = 1,
799                 .needs_file             = 1,
800                 .unbound_nonreg_file    = 1,
801                 .file_table             = 1,
802                 .pollin                 = 1,
803         },
804         [IORING_OP_ASYNC_CANCEL] = {},
805         [IORING_OP_LINK_TIMEOUT] = {
806                 .async_ctx              = 1,
807                 .needs_mm               = 1,
808         },
809         [IORING_OP_CONNECT] = {
810                 .async_ctx              = 1,
811                 .needs_mm               = 1,
812                 .needs_file             = 1,
813                 .unbound_nonreg_file    = 1,
814                 .pollout                = 1,
815         },
816         [IORING_OP_FALLOCATE] = {
817                 .needs_file             = 1,
818                 .needs_fsize            = 1,
819         },
820         [IORING_OP_OPENAT] = {
821                 .file_table             = 1,
822                 .needs_fs               = 1,
823         },
824         [IORING_OP_CLOSE] = {
825                 .needs_file             = 1,
826                 .needs_file_no_error    = 1,
827                 .file_table             = 1,
828         },
829         [IORING_OP_FILES_UPDATE] = {
830                 .needs_mm               = 1,
831                 .file_table             = 1,
832         },
833         [IORING_OP_STATX] = {
834                 .needs_mm               = 1,
835                 .needs_fs               = 1,
836                 .file_table             = 1,
837         },
838         [IORING_OP_READ] = {
839                 .needs_mm               = 1,
840                 .needs_file             = 1,
841                 .unbound_nonreg_file    = 1,
842                 .pollin                 = 1,
843                 .buffer_select          = 1,
844         },
845         [IORING_OP_WRITE] = {
846                 .needs_mm               = 1,
847                 .needs_file             = 1,
848                 .unbound_nonreg_file    = 1,
849                 .pollout                = 1,
850                 .needs_fsize            = 1,
851         },
852         [IORING_OP_FADVISE] = {
853                 .needs_file             = 1,
854         },
855         [IORING_OP_MADVISE] = {
856                 .needs_mm               = 1,
857         },
858         [IORING_OP_SEND] = {
859                 .needs_mm               = 1,
860                 .needs_file             = 1,
861                 .unbound_nonreg_file    = 1,
862                 .pollout                = 1,
863         },
864         [IORING_OP_RECV] = {
865                 .needs_mm               = 1,
866                 .needs_file             = 1,
867                 .unbound_nonreg_file    = 1,
868                 .pollin                 = 1,
869                 .buffer_select          = 1,
870         },
871         [IORING_OP_OPENAT2] = {
872                 .file_table             = 1,
873                 .needs_fs               = 1,
874         },
875         [IORING_OP_EPOLL_CTL] = {
876                 .unbound_nonreg_file    = 1,
877                 .file_table             = 1,
878         },
879         [IORING_OP_SPLICE] = {
880                 .needs_file             = 1,
881                 .hash_reg_file          = 1,
882                 .unbound_nonreg_file    = 1,
883         },
884         [IORING_OP_PROVIDE_BUFFERS] = {},
885         [IORING_OP_REMOVE_BUFFERS] = {},
886         [IORING_OP_TEE] = {
887                 .needs_file             = 1,
888                 .hash_reg_file          = 1,
889                 .unbound_nonreg_file    = 1,
890         },
891 };
892 
893 enum io_mem_account {
894         ACCT_LOCKED,
895         ACCT_PINNED,
896 };
897 
898 static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
899                              struct io_comp_state *cs);
900 static void io_cqring_fill_event(struct io_kiocb *req, long res);
901 static void io_put_req(struct io_kiocb *req);
902 static void io_double_put_req(struct io_kiocb *req);
903 static void __io_double_put_req(struct io_kiocb *req);
904 static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req);
905 static void __io_queue_linked_timeout(struct io_kiocb *req);
906 static void io_queue_linked_timeout(struct io_kiocb *req);
907 static int __io_sqe_files_update(struct io_ring_ctx *ctx,
908                                  struct io_uring_files_update *ip,
909                                  unsigned nr_args);
910 static int io_prep_work_files(struct io_kiocb *req);
911 static void __io_clean_op(struct io_kiocb *req);
912 static int io_file_get(struct io_submit_state *state, struct io_kiocb *req,
913                        int fd, struct file **out_file, bool fixed);
914 static void __io_queue_sqe(struct io_kiocb *req,
915                            const struct io_uring_sqe *sqe,
916                            struct io_comp_state *cs);
917 static void io_file_put_work(struct work_struct *work);
918 
919 static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
920                                struct iovec **iovec, struct iov_iter *iter,
921                                bool needs_lock);
922 static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
923                              const struct iovec *fast_iov,
924                              struct iov_iter *iter, bool force);
925 
926 static struct kmem_cache *req_cachep;
927 
928 static const struct file_operations io_uring_fops;
929 
930 struct sock *io_uring_get_socket(struct file *file)
931 {
932 #if defined(CONFIG_UNIX)
933         if (file->f_op == &io_uring_fops) {
934                 struct io_ring_ctx *ctx = file->private_data;
935 
936                 return ctx->ring_sock->sk;
937         }
938 #endif
939         return NULL;
940 }
941 EXPORT_SYMBOL(io_uring_get_socket);
942 
943 static inline void io_clean_op(struct io_kiocb *req)
944 {
945         if (req->flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED |
946                           REQ_F_INFLIGHT))
947                 __io_clean_op(req);
948 }
949 
950 static void io_sq_thread_drop_mm(void)
951 {
952         struct mm_struct *mm = current->mm;
953 
954         if (mm) {
955                 kthread_unuse_mm(mm);
956                 mmput(mm);
957                 current->mm = NULL;
958         }
959 }
960 
961 static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx)
962 {
963         struct mm_struct *mm;
964 
965         if (current->mm)
966                 return 0;
967 
968         /* Should never happen */
969         if (unlikely(!(ctx->flags & IORING_SETUP_SQPOLL)))
970                 return -EFAULT;
971 
972         task_lock(ctx->sqo_task);
973         mm = ctx->sqo_task->mm;
974         if (unlikely(!mm || !mmget_not_zero(mm)))
975                 mm = NULL;
976         task_unlock(ctx->sqo_task);
977 
978         if (mm) {
979                 kthread_use_mm(mm);
980                 return 0;
981         }
982 
983         return -EFAULT;
984 }
985 
986 static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx,
987                                    struct io_kiocb *req)
988 {
989         if (!io_op_defs[req->opcode].needs_mm)
990                 return 0;
991         return __io_sq_thread_acquire_mm(ctx);
992 }
993 
994 static inline void req_set_fail_links(struct io_kiocb *req)
995 {
996         if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK)
997                 req->flags |= REQ_F_FAIL_LINK;
998 }
999 
1000 /*
1001  * Note: must call io_req_init_async() for the first time you
1002  * touch any members of io_wq_work.
1003  */
1004 static inline void io_req_init_async(struct io_kiocb *req)
1005 {
1006         if (req->flags & REQ_F_WORK_INITIALIZED)
1007                 return;
1008 
1009         memset(&req->work, 0, sizeof(req->work));
1010         req->flags |= REQ_F_WORK_INITIALIZED;
1011 }
1012 
1013 static inline bool io_async_submit(struct io_ring_ctx *ctx)
1014 {
1015         return ctx->flags & IORING_SETUP_SQPOLL;
1016 }
1017 
1018 static void io_ring_ctx_ref_free(struct percpu_ref *ref)
1019 {
1020         struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
1021 
1022         complete(&ctx->ref_comp);
1023 }
1024 
1025 static inline bool io_is_timeout_noseq(struct io_kiocb *req)
1026 {
1027         return !req->timeout.off;
1028 }
1029 
1030 static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
1031 {
1032         struct io_ring_ctx *ctx;
1033         int hash_bits;
1034 
1035         ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
1036         if (!ctx)
1037                 return NULL;
1038 
1039         ctx->fallback_req = kmem_cache_alloc(req_cachep, GFP_KERNEL);
1040         if (!ctx->fallback_req)
1041                 goto err;
1042 
1043         /*
1044          * Use 5 bits less than the max cq entries, that should give us around
1045          * 32 entries per hash list if totally full and uniformly spread.
1046          */
1047         hash_bits = ilog2(p->cq_entries);
1048         hash_bits -= 5;
1049         if (hash_bits <= 0)
1050                 hash_bits = 1;
1051         ctx->cancel_hash_bits = hash_bits;
1052         ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head),
1053                                         GFP_KERNEL);
1054         if (!ctx->cancel_hash)
1055                 goto err;
1056         __hash_init(ctx->cancel_hash, 1U << hash_bits);
1057 
1058         if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
1059                             PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
1060                 goto err;
1061 
1062         ctx->flags = p->flags;
1063         init_waitqueue_head(&ctx->sqo_wait);
1064         init_waitqueue_head(&ctx->cq_wait);
1065         INIT_LIST_HEAD(&ctx->cq_overflow_list);
1066         init_completion(&ctx->ref_comp);
1067         init_completion(&ctx->sq_thread_comp);
1068         idr_init(&ctx->io_buffer_idr);
1069         idr_init(&ctx->personality_idr);
1070         mutex_init(&ctx->uring_lock);
1071         init_waitqueue_head(&ctx->wait);
1072         spin_lock_init(&ctx->completion_lock);
1073         INIT_LIST_HEAD(&ctx->iopoll_list);
1074         INIT_LIST_HEAD(&ctx->defer_list);
1075         INIT_LIST_HEAD(&ctx->timeout_list);
1076         init_waitqueue_head(&ctx->inflight_wait);
1077         spin_lock_init(&ctx->inflight_lock);
1078         INIT_LIST_HEAD(&ctx->inflight_list);
1079         INIT_DELAYED_WORK(&ctx->file_put_work, io_file_put_work);
1080         init_llist_head(&ctx->file_put_llist);
1081         return ctx;
1082 err:
1083         if (ctx->fallback_req)
1084                 kmem_cache_free(req_cachep, ctx->fallback_req);
1085         kfree(ctx->cancel_hash);
1086         kfree(ctx);
1087         return NULL;
1088 }
1089 
1090 static bool req_need_defer(struct io_kiocb *req, u32 seq)
1091 {
1092         if (unlikely(req->flags & REQ_F_IO_DRAIN)) {
1093                 struct io_ring_ctx *ctx = req->ctx;
1094 
1095                 return seq != ctx->cached_cq_tail
1096                                 + atomic_read(&ctx->cached_cq_overflow);
1097         }
1098 
1099         return false;
1100 }
1101 
1102 static void __io_commit_cqring(struct io_ring_ctx *ctx)
1103 {
1104         struct io_rings *rings = ctx->rings;
1105 
1106         /* order cqe stores with ring update */
1107         smp_store_release(&rings->cq.tail, ctx->cached_cq_tail);
1108 
1109         if (wq_has_sleeper(&ctx->cq_wait)) {
1110                 wake_up_interruptible(&ctx->cq_wait);
1111                 kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
1112         }
1113 }
1114 
1115 /*
1116  * Returns true if we need to defer file table putting. This can only happen
1117  * from the error path with REQ_F_COMP_LOCKED set.
1118  */
1119 static bool io_req_clean_work(struct io_kiocb *req)
1120 {
1121         if (!(req->flags & REQ_F_WORK_INITIALIZED))
1122                 return false;
1123 
1124         req->flags &= ~REQ_F_WORK_INITIALIZED;
1125 
1126         if (req->work.mm) {
1127                 mmdrop(req->work.mm);
1128                 req->work.mm = NULL;
1129         }
1130         if (req->work.creds) {
1131                 put_cred(req->work.creds);
1132                 req->work.creds = NULL;
1133         }
1134         if (req->work.fs) {
1135                 struct fs_struct *fs = req->work.fs;
1136 
1137                 if (req->flags & REQ_F_COMP_LOCKED)
1138                         return true;
1139 
1140                 spin_lock(&req->work.fs->lock);
1141                 if (--fs->users)
1142                         fs = NULL;
1143                 spin_unlock(&req->work.fs->lock);
1144                 if (fs)
1145                         free_fs_struct(fs);
1146                 req->work.fs = NULL;
1147         }
1148 
1149         return false;
1150 }
1151 
1152 static void io_prep_async_work(struct io_kiocb *req)
1153 {
1154         const struct io_op_def *def = &io_op_defs[req->opcode];
1155 
1156         io_req_init_async(req);
1157 
1158         if (req->flags & REQ_F_FORCE_ASYNC)
1159                 req->work.flags |= IO_WQ_WORK_CONCURRENT;
1160 
1161         if (req->flags & REQ_F_ISREG) {
1162                 if (def->hash_reg_file || (req->ctx->flags & IORING_SETUP_IOPOLL))
1163                         io_wq_hash_work(&req->work, file_inode(req->file));
1164         } else {
1165                 if (def->unbound_nonreg_file)
1166                         req->work.flags |= IO_WQ_WORK_UNBOUND;
1167         }
1168         if (!req->work.mm && def->needs_mm) {
1169                 mmgrab(current->mm);
1170                 req->work.mm = current->mm;
1171         }
1172         if (!req->work.creds)
1173                 req->work.creds = get_current_cred();
1174         if (!req->work.fs && def->needs_fs) {
1175                 spin_lock(&current->fs->lock);
1176                 if (!current->fs->in_exec) {
1177                         req->work.fs = current->fs;
1178                         req->work.fs->users++;
1179                 } else {
1180                         req->work.flags |= IO_WQ_WORK_CANCEL;
1181                 }
1182                 spin_unlock(&current->fs->lock);
1183         }
1184         if (def->needs_fsize)
1185                 req->work.fsize = rlimit(RLIMIT_FSIZE);
1186         else
1187                 req->work.fsize = RLIM_INFINITY;
1188 }
1189 
1190 static void io_prep_async_link(struct io_kiocb *req)
1191 {
1192         struct io_kiocb *cur;
1193 
1194         io_prep_async_work(req);
1195         if (req->flags & REQ_F_LINK_HEAD)
1196                 list_for_each_entry(cur, &req->link_list, link_list)
1197                         io_prep_async_work(cur);
1198 }
1199 
1200 static struct io_kiocb *__io_queue_async_work(struct io_kiocb *req)
1201 {
1202         struct io_ring_ctx *ctx = req->ctx;
1203         struct io_kiocb *link = io_prep_linked_timeout(req);
1204 
1205         trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req,
1206                                         &req->work, req->flags);
1207         io_wq_enqueue(ctx->io_wq, &req->work);
1208         return link;
1209 }
1210 
1211 static void io_queue_async_work(struct io_kiocb *req)
1212 {
1213         struct io_kiocb *link;
1214 
1215         /* init ->work of the whole link before punting */
1216         io_prep_async_link(req);
1217         link = __io_queue_async_work(req);
1218 
1219         if (link)
1220                 io_queue_linked_timeout(link);
1221 }
1222 
1223 static void io_kill_timeout(struct io_kiocb *req)
1224 {
1225         int ret;
1226 
1227         ret = hrtimer_try_to_cancel(&req->io->timeout.timer);
1228         if (ret != -1) {
1229                 atomic_set(&req->ctx->cq_timeouts,
1230                         atomic_read(&req->ctx->cq_timeouts) + 1);
1231                 list_del_init(&req->timeout.list);
1232                 req->flags |= REQ_F_COMP_LOCKED;
1233                 io_cqring_fill_event(req, 0);
1234                 io_put_req(req);
1235         }
1236 }
1237 
1238 static bool io_task_match(struct io_kiocb *req, struct task_struct *tsk)
1239 {
1240         struct io_ring_ctx *ctx = req->ctx;
1241 
1242         if (!tsk || req->task == tsk)
1243                 return true;
1244         if ((ctx->flags & IORING_SETUP_SQPOLL) && req->task == ctx->sqo_thread)
1245                 return true;
1246         return false;
1247 }
1248 
1249 /*
1250  * Returns true if we found and killed one or more timeouts
1251  */
1252 static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk)
1253 {
1254         struct io_kiocb *req, *tmp;
1255         int canceled = 0;
1256 
1257         spin_lock_irq(&ctx->completion_lock);
1258         list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) {
1259                 if (io_task_match(req, tsk)) {
1260                         io_kill_timeout(req);
1261                         canceled++;
1262                 }
1263         }
1264         spin_unlock_irq(&ctx->completion_lock);
1265         return canceled != 0;
1266 }
1267 
1268 static void __io_queue_deferred(struct io_ring_ctx *ctx)
1269 {
1270         do {
1271                 struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
1272                                                 struct io_defer_entry, list);
1273                 struct io_kiocb *link;
1274 
1275                 if (req_need_defer(de->req, de->seq))
1276                         break;
1277                 list_del_init(&de->list);
1278                 /* punt-init is done before queueing for defer */
1279                 link = __io_queue_async_work(de->req);
1280                 if (link) {
1281                         __io_queue_linked_timeout(link);
1282                         /* drop submission reference */
1283                         link->flags |= REQ_F_COMP_LOCKED;
1284                         io_put_req(link);
1285                 }
1286                 kfree(de);
1287         } while (!list_empty(&ctx->defer_list));
1288 }
1289 
1290 static void io_flush_timeouts(struct io_ring_ctx *ctx)
1291 {
1292         while (!list_empty(&ctx->timeout_list)) {
1293                 struct io_kiocb *req = list_first_entry(&ctx->timeout_list,
1294                                                 struct io_kiocb, timeout.list);
1295 
1296                 if (io_is_timeout_noseq(req))
1297                         break;
1298                 if (req->timeout.target_seq != ctx->cached_cq_tail
1299                                         - atomic_read(&ctx->cq_timeouts))
1300                         break;
1301 
1302                 list_del_init(&req->timeout.list);
1303                 io_kill_timeout(req);
1304         }
1305 }
1306 
1307 static void io_commit_cqring(struct io_ring_ctx *ctx)
1308 {
1309         io_flush_timeouts(ctx);
1310         __io_commit_cqring(ctx);
1311 
1312         if (unlikely(!list_empty(&ctx->defer_list)))
1313                 __io_queue_deferred(ctx);
1314 }
1315 
1316 static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
1317 {
1318         struct io_rings *rings = ctx->rings;
1319         unsigned tail;
1320 
1321         tail = ctx->cached_cq_tail;
1322         /*
1323          * writes to the cq entry need to come after reading head; the
1324          * control dependency is enough as we're using WRITE_ONCE to
1325          * fill the cq entry
1326          */
1327         if (tail - READ_ONCE(rings->cq.head) == rings->cq_ring_entries)
1328                 return NULL;
1329 
1330         ctx->cached_cq_tail++;
1331         return &rings->cqes[tail & ctx->cq_mask];
1332 }
1333 
1334 static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx)
1335 {
1336         if (!ctx->cq_ev_fd)
1337                 return false;
1338         if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
1339                 return false;
1340         if (!ctx->eventfd_async)
1341                 return true;
1342         return io_wq_current_is_worker();
1343 }
1344 
1345 static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
1346 {
1347         if (waitqueue_active(&ctx->wait))
1348                 wake_up(&ctx->wait);
1349         if (waitqueue_active(&ctx->sqo_wait))
1350                 wake_up(&ctx->sqo_wait);
1351         if (io_should_trigger_evfd(ctx))
1352                 eventfd_signal(ctx->cq_ev_fd, 1);
1353 }
1354 
1355 static void io_cqring_mark_overflow(struct io_ring_ctx *ctx)
1356 {
1357         if (list_empty(&ctx->cq_overflow_list)) {
1358                 clear_bit(0, &ctx->sq_check_overflow);
1359                 clear_bit(0, &ctx->cq_check_overflow);
1360                 ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW;
1361         }
1362 }
1363 
1364 static inline bool io_match_files(struct io_kiocb *req,
1365                                        struct files_struct *files)
1366 {
1367         if (!files)
1368                 return true;
1369         if (req->flags & REQ_F_WORK_INITIALIZED)
1370                 return req->work.files == files;
1371         return false;
1372 }
1373 
1374 /* Returns true if there are no backlogged entries after the flush */
1375 static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
1376                                      struct task_struct *tsk,
1377                                      struct files_struct *files)
1378 {
1379         struct io_rings *rings = ctx->rings;
1380         struct io_kiocb *req, *tmp;
1381         struct io_uring_cqe *cqe;
1382         unsigned long flags;
1383         LIST_HEAD(list);
1384 
1385         if (!force) {
1386                 if (list_empty_careful(&ctx->cq_overflow_list))
1387                         return true;
1388                 if ((ctx->cached_cq_tail - READ_ONCE(rings->cq.head) ==
1389                     rings->cq_ring_entries))
1390                         return false;
1391         }
1392 
1393         spin_lock_irqsave(&ctx->completion_lock, flags);
1394 
1395         /* if force is set, the ring is going away. always drop after that */
1396         if (force)
1397                 ctx->cq_overflow_flushed = 1;
1398 
1399         cqe = NULL;
1400         list_for_each_entry_safe(req, tmp, &ctx->cq_overflow_list, compl.list) {
1401                 if (tsk && req->task != tsk)
1402                         continue;
1403                 if (!io_match_files(req, files))
1404                         continue;
1405 
1406                 cqe = io_get_cqring(ctx);
1407                 if (!cqe && !force)
1408                         break;
1409 
1410                 list_move(&req->compl.list, &list);
1411                 if (cqe) {
1412                         WRITE_ONCE(cqe->user_data, req->user_data);
1413                         WRITE_ONCE(cqe->res, req->result);
1414                         WRITE_ONCE(cqe->flags, req->compl.cflags);
1415                 } else {
1416                         WRITE_ONCE(ctx->rings->cq_overflow,
1417                                 atomic_inc_return(&ctx->cached_cq_overflow));
1418                 }
1419         }
1420 
1421         io_commit_cqring(ctx);
1422         io_cqring_mark_overflow(ctx);
1423 
1424         spin_unlock_irqrestore(&ctx->completion_lock, flags);
1425         io_cqring_ev_posted(ctx);
1426 
1427         while (!list_empty(&list)) {
1428                 req = list_first_entry(&list, struct io_kiocb, compl.list);
1429                 list_del(&req->compl.list);
1430                 io_put_req(req);
1431         }
1432 
1433         return cqe != NULL;
1434 }
1435 
1436 static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
1437 {
1438         struct io_ring_ctx *ctx = req->ctx;
1439         struct io_uring_cqe *cqe;
1440 
1441         trace_io_uring_complete(ctx, req->user_data, res);
1442 
1443         /*
1444          * If we can't get a cq entry, userspace overflowed the
1445          * submission (by quite a lot). Increment the overflow count in
1446          * the ring.
1447          */
1448         cqe = io_get_cqring(ctx);
1449         if (likely(cqe)) {
1450                 WRITE_ONCE(cqe->user_data, req->user_data);
1451                 WRITE_ONCE(cqe->res, res);
1452                 WRITE_ONCE(cqe->flags, cflags);
1453         } else if (ctx->cq_overflow_flushed || req->task->io_uring->in_idle) {
1454                 /*
1455                  * If we're in ring overflow flush mode, or in task cancel mode,
1456                  * then we cannot store the request for later flushing, we need
1457                  * to drop it on the floor.
1458                  */
1459                 WRITE_ONCE(ctx->rings->cq_overflow,
1460                                 atomic_inc_return(&ctx->cached_cq_overflow));
1461         } else {
1462                 if (list_empty(&ctx->cq_overflow_list)) {
1463                         set_bit(0, &ctx->sq_check_overflow);
1464                         set_bit(0, &ctx->cq_check_overflow);
1465                         ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW;
1466                 }
1467                 io_clean_op(req);
1468                 req->result = res;
1469                 req->compl.cflags = cflags;
1470                 refcount_inc(&req->refs);
1471                 list_add_tail(&req->compl.list, &ctx->cq_overflow_list);
1472         }
1473 }
1474 
1475 static void io_cqring_fill_event(struct io_kiocb *req, long res)
1476 {
1477         __io_cqring_fill_event(req, res, 0);
1478 }
1479 
1480 static void io_cqring_add_event(struct io_kiocb *req, long res, long cflags)
1481 {
1482         struct io_ring_ctx *ctx = req->ctx;
1483         unsigned long flags;
1484 
1485         spin_lock_irqsave(&ctx->completion_lock, flags);
1486         __io_cqring_fill_event(req, res, cflags);
1487         io_commit_cqring(ctx);
1488         spin_unlock_irqrestore(&ctx->completion_lock, flags);
1489 
1490         io_cqring_ev_posted(ctx);
1491 }
1492 
1493 static void io_submit_flush_completions(struct io_comp_state *cs)
1494 {
1495         struct io_ring_ctx *ctx = cs->ctx;
1496 
1497         spin_lock_irq(&ctx->completion_lock);
1498         while (!list_empty(&cs->list)) {
1499                 struct io_kiocb *req;
1500 
1501                 req = list_first_entry(&cs->list, struct io_kiocb, compl.list);
1502                 list_del(&req->compl.list);
1503                 __io_cqring_fill_event(req, req->result, req->compl.cflags);
1504                 if (!(req->flags & REQ_F_LINK_HEAD)) {
1505                         req->flags |= REQ_F_COMP_LOCKED;
1506                         io_put_req(req);
1507                 } else {
1508                         spin_unlock_irq(&ctx->completion_lock);
1509                         io_put_req(req);
1510                         spin_lock_irq(&ctx->completion_lock);
1511                 }
1512         }
1513         io_commit_cqring(ctx);
1514         spin_unlock_irq(&ctx->completion_lock);
1515 
1516         io_cqring_ev_posted(ctx);
1517         cs->nr = 0;
1518 }
1519 
1520 static void __io_req_complete(struct io_kiocb *req, long res, unsigned cflags,
1521                               struct io_comp_state *cs)
1522 {
1523         if (!cs) {
1524                 io_cqring_add_event(req, res, cflags);
1525                 io_put_req(req);
1526         } else {
1527                 io_clean_op(req);
1528                 req->result = res;
1529                 req->compl.cflags = cflags;
1530                 list_add_tail(&req->compl.list, &cs->list);
1531                 if (++cs->nr >= 32)
1532                         io_submit_flush_completions(cs);
1533         }
1534 }
1535 
1536 static void io_req_complete(struct io_kiocb *req, long res)
1537 {
1538         __io_req_complete(req, res, 0, NULL);
1539 }
1540 
1541 static inline bool io_is_fallback_req(struct io_kiocb *req)
1542 {
1543         return req == (struct io_kiocb *)
1544                         ((unsigned long) req->ctx->fallback_req & ~1UL);
1545 }
1546 
1547 static struct io_kiocb *io_get_fallback_req(struct io_ring_ctx *ctx)
1548 {
1549         struct io_kiocb *req;
1550 
1551         req = ctx->fallback_req;
1552         if (!test_and_set_bit_lock(0, (unsigned long *) &ctx->fallback_req))
1553                 return req;
1554 
1555         return NULL;
1556 }
1557 
1558 static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx,
1559                                      struct io_submit_state *state)
1560 {
1561         gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
1562         struct io_kiocb *req;
1563 
1564         if (!state->free_reqs) {
1565                 size_t sz;
1566                 int ret;
1567 
1568                 sz = min_t(size_t, state->ios_left, ARRAY_SIZE(state->reqs));
1569                 ret = kmem_cache_alloc_bulk(req_cachep, gfp, sz, state->reqs);
1570 
1571                 /*
1572                  * Bulk alloc is all-or-nothing. If we fail to get a batch,
1573                  * retry single alloc to be on the safe side.
1574                  */
1575                 if (unlikely(ret <= 0)) {
1576                         state->reqs[0] = kmem_cache_alloc(req_cachep, gfp);
1577                         if (!state->reqs[0])
1578                                 goto fallback;
1579                         ret = 1;
1580                 }
1581                 state->free_reqs = ret - 1;
1582                 req = state->reqs[ret - 1];
1583         } else {
1584                 state->free_reqs--;
1585                 req = state->reqs[state->free_reqs];
1586         }
1587 
1588         return req;
1589 fallback:
1590         return io_get_fallback_req(ctx);
1591 }
1592 
1593 static inline void io_put_file(struct io_kiocb *req, struct file *file,
1594                           bool fixed)
1595 {
1596         if (fixed)
1597                 percpu_ref_put(req->fixed_file_refs);
1598         else
1599                 fput(file);
1600 }
1601 
1602 static bool io_dismantle_req(struct io_kiocb *req)
1603 {
1604         io_clean_op(req);
1605 
1606         if (req->io)
1607                 kfree(req->io);
1608         if (req->file)
1609                 io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
1610 
1611         return io_req_clean_work(req);
1612 }
1613 
1614 static void __io_free_req_finish(struct io_kiocb *req)
1615 {
1616         struct io_uring_task *tctx = req->task->io_uring;
1617         struct io_ring_ctx *ctx = req->ctx;
1618 
1619         atomic_long_inc(&tctx->req_complete);
1620         if (tctx->in_idle)
1621                 wake_up(&tctx->wait);
1622         put_task_struct(req->task);
1623 
1624         if (likely(!io_is_fallback_req(req)))
1625                 kmem_cache_free(req_cachep, req);
1626         else
1627                 clear_bit_unlock(0, (unsigned long *) &ctx->fallback_req);
1628         percpu_ref_put(&ctx->refs);
1629 }
1630 
1631 static void io_req_task_file_table_put(struct callback_head *cb)
1632 {
1633         struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
1634         struct fs_struct *fs = req->work.fs;
1635 
1636         spin_lock(&req->work.fs->lock);
1637         if (--fs->users)
1638                 fs = NULL;
1639         spin_unlock(&req->work.fs->lock);
1640         if (fs)
1641                 free_fs_struct(fs);
1642         req->work.fs = NULL;
1643         __io_free_req_finish(req);
1644 }
1645 
1646 static void __io_free_req(struct io_kiocb *req)
1647 {
1648         if (!io_dismantle_req(req)) {
1649                 __io_free_req_finish(req);
1650         } else {
1651                 int ret;
1652 
1653                 init_task_work(&req->task_work, io_req_task_file_table_put);
1654                 ret = task_work_add(req->task, &req->task_work, TWA_RESUME);
1655                 if (unlikely(ret)) {
1656                         struct task_struct *tsk;
1657 
1658                         tsk = io_wq_get_task(req->ctx->io_wq);
1659                         task_work_add(tsk, &req->task_work, 0);
1660                 }
1661         }
1662 }
1663 
1664 static bool io_link_cancel_timeout(struct io_kiocb *req)
1665 {
1666         struct io_ring_ctx *ctx = req->ctx;
1667         int ret;
1668 
1669         ret = hrtimer_try_to_cancel(&req->io->timeout.timer);
1670         if (ret != -1) {
1671                 req->flags |= REQ_F_COMP_LOCKED;
1672                 io_cqring_fill_event(req, -ECANCELED);
1673                 io_commit_cqring(ctx);
1674                 req->flags &= ~REQ_F_LINK_HEAD;
1675                 io_put_req(req);
1676                 return true;
1677         }
1678 
1679         return false;
1680 }
1681 
1682 static bool __io_kill_linked_timeout(struct io_kiocb *req)
1683 {
1684         struct io_kiocb *link;
1685         bool wake_ev;
1686 
1687         if (list_empty(&req->link_list))
1688                 return false;
1689         link = list_first_entry(&req->link_list, struct io_kiocb, link_list);
1690         if (link->opcode != IORING_OP_LINK_TIMEOUT)
1691                 return false;
1692 
1693         list_del_init(&link->link_list);
1694         wake_ev = io_link_cancel_timeout(link);
1695         req->flags &= ~REQ_F_LINK_TIMEOUT;
1696         return wake_ev;
1697 }
1698 
1699 static void io_kill_linked_timeout(struct io_kiocb *req)
1700 {
1701         struct io_ring_ctx *ctx = req->ctx;
1702         bool wake_ev;
1703 
1704         if (!(req->flags & REQ_F_COMP_LOCKED)) {
1705                 unsigned long flags;
1706 
1707                 spin_lock_irqsave(&ctx->completion_lock, flags);
1708                 wake_ev = __io_kill_linked_timeout(req);
1709                 spin_unlock_irqrestore(&ctx->completion_lock, flags);
1710         } else {
1711                 wake_ev = __io_kill_linked_timeout(req);
1712         }
1713 
1714         if (wake_ev)
1715                 io_cqring_ev_posted(ctx);
1716 }
1717 
1718 static struct io_kiocb *io_req_link_next(struct io_kiocb *req)
1719 {
1720         struct io_kiocb *nxt;
1721 
1722         /*
1723          * The list should never be empty when we are called here. But could
1724          * potentially happen if the chain is messed up, check to be on the
1725          * safe side.
1726          */
1727         if (unlikely(list_empty(&req->link_list)))
1728                 return NULL;
1729 
1730         nxt = list_first_entry(&req->link_list, struct io_kiocb, link_list);
1731         list_del_init(&req->link_list);
1732         if (!list_empty(&nxt->link_list))
1733                 nxt->flags |= REQ_F_LINK_HEAD;
1734         return nxt;
1735 }
1736 
1737 /*
1738  * Called if REQ_F_LINK_HEAD is set, and we fail the head request
1739  */
1740 static void __io_fail_links(struct io_kiocb *req)
1741 {
1742         struct io_ring_ctx *ctx = req->ctx;
1743 
1744         while (!list_empty(&req->link_list)) {
1745                 struct io_kiocb *link = list_first_entry(&req->link_list,
1746                                                 struct io_kiocb, link_list);
1747 
1748                 list_del_init(&link->link_list);
1749                 trace_io_uring_fail_link(req, link);
1750 
1751                 io_cqring_fill_event(link, -ECANCELED);
1752                 link->flags |= REQ_F_COMP_LOCKED;
1753                 __io_double_put_req(link);
1754                 req->flags &= ~REQ_F_LINK_TIMEOUT;
1755         }
1756 
1757         io_commit_cqring(ctx);
1758         io_cqring_ev_posted(ctx);
1759 }
1760 
1761 static void io_fail_links(struct io_kiocb *req)
1762 {
1763         struct io_ring_ctx *ctx = req->ctx;
1764 
1765         if (!(req->flags & REQ_F_COMP_LOCKED)) {
1766                 unsigned long flags;
1767 
1768                 spin_lock_irqsave(&ctx->completion_lock, flags);
1769                 __io_fail_links(req);
1770                 spin_unlock_irqrestore(&ctx->completion_lock, flags);
1771         } else {
1772                 __io_fail_links(req);
1773         }
1774 
1775         io_cqring_ev_posted(ctx);
1776 }
1777 
1778 static struct io_kiocb *__io_req_find_next(struct io_kiocb *req)
1779 {
1780         req->flags &= ~REQ_F_LINK_HEAD;
1781         if (req->flags & REQ_F_LINK_TIMEOUT)
1782                 io_kill_linked_timeout(req);
1783 
1784         /*
1785          * If LINK is set, we have dependent requests in this chain. If we
1786          * didn't fail this request, queue the first one up, moving any other
1787          * dependencies to the next request. In case of failure, fail the rest
1788          * of the chain.
1789          */
1790         if (likely(!(req->flags & REQ_F_FAIL_LINK)))
1791                 return io_req_link_next(req);
1792         io_fail_links(req);
1793         return NULL;
1794 }
1795 
1796 static struct io_kiocb *io_req_find_next(struct io_kiocb *req)
1797 {
1798         if (likely(!(req->flags & REQ_F_LINK_HEAD)))
1799                 return NULL;
1800         return __io_req_find_next(req);
1801 }
1802 
1803 static int io_req_task_work_add(struct io_kiocb *req, struct callback_head *cb,
1804                                 bool twa_signal_ok)
1805 {
1806         struct task_struct *tsk = req->task;
1807         struct io_ring_ctx *ctx = req->ctx;
1808         int ret, notify;
1809 
1810         if (tsk->flags & PF_EXITING)
1811                 return -ESRCH;
1812 
1813         /*
1814          * SQPOLL kernel thread doesn't need notification, just a wakeup. For
1815          * all other cases, use TWA_SIGNAL unconditionally to ensure we're
1816          * processing task_work. There's no reliable way to tell if TWA_RESUME
1817          * will do the job.
1818          */
1819         notify = 0;
1820         if (!(ctx->flags & IORING_SETUP_SQPOLL) && twa_signal_ok)
1821                 notify = TWA_SIGNAL;
1822 
1823         ret = task_work_add(tsk, cb, notify);
1824         if (!ret)
1825                 wake_up_process(tsk);
1826 
1827         return ret;
1828 }
1829 
1830 static void __io_req_task_cancel(struct io_kiocb *req, int error)
1831 {
1832         struct io_ring_ctx *ctx = req->ctx;
1833 
1834         spin_lock_irq(&ctx->completion_lock);
1835         io_cqring_fill_event(req, error);
1836         io_commit_cqring(ctx);
1837         spin_unlock_irq(&ctx->completion_lock);
1838 
1839         io_cqring_ev_posted(ctx);
1840         req_set_fail_links(req);
1841         io_double_put_req(req);
1842 }
1843 
1844 static void io_req_task_cancel(struct callback_head *cb)
1845 {
1846         struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
1847         struct io_ring_ctx *ctx = req->ctx;
1848 
1849         __io_req_task_cancel(req, -ECANCELED);
1850         percpu_ref_put(&ctx->refs);
1851 }
1852 
1853 static void __io_req_task_submit(struct io_kiocb *req)
1854 {
1855         struct io_ring_ctx *ctx = req->ctx;
1856 
1857         if (!__io_sq_thread_acquire_mm(ctx)) {
1858                 mutex_lock(&ctx->uring_lock);
1859                 __io_queue_sqe(req, NULL, NULL);
1860                 mutex_unlock(&ctx->uring_lock);
1861         } else {
1862                 __io_req_task_cancel(req, -EFAULT);
1863         }
1864 }
1865 
1866 static void io_req_task_submit(struct callback_head *cb)
1867 {
1868         struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
1869         struct io_ring_ctx *ctx = req->ctx;
1870 
1871         __io_req_task_submit(req);
1872         percpu_ref_put(&ctx->refs);
1873 }
1874 
1875 static void io_req_task_queue(struct io_kiocb *req)
1876 {
1877         int ret;
1878 
1879         init_task_work(&req->task_work, io_req_task_submit);
1880         percpu_ref_get(&req->ctx->refs);
1881 
1882         ret = io_req_task_work_add(req, &req->task_work, true);
1883         if (unlikely(ret)) {
1884                 struct task_struct *tsk;
1885 
1886                 init_task_work(&req->task_work, io_req_task_cancel);
1887                 tsk = io_wq_get_task(req->ctx->io_wq);
1888                 task_work_add(tsk, &req->task_work, 0);
1889                 wake_up_process(tsk);
1890         }
1891 }
1892 
1893 static void io_queue_next(struct io_kiocb *req)
1894 {
1895         struct io_kiocb *nxt = io_req_find_next(req);
1896 
1897         if (nxt)
1898                 io_req_task_queue(nxt);
1899 }
1900 
1901 static void io_free_req(struct io_kiocb *req)
1902 {
1903         io_queue_next(req);
1904         __io_free_req(req);
1905 }
1906 
1907 struct req_batch {
1908         void *reqs[IO_IOPOLL_BATCH];
1909         int to_free;
1910 
1911         struct task_struct      *task;
1912         int                     task_refs;
1913 };
1914 
1915 static inline void io_init_req_batch(struct req_batch *rb)
1916 {
1917         rb->to_free = 0;
1918         rb->task_refs = 0;
1919         rb->task = NULL;
1920 }
1921 
1922 static void __io_req_free_batch_flush(struct io_ring_ctx *ctx,
1923                                       struct req_batch *rb)
1924 {
1925         kmem_cache_free_bulk(req_cachep, rb->to_free, rb->reqs);
1926         percpu_ref_put_many(&ctx->refs, rb->to_free);
1927         rb->to_free = 0;
1928 }
1929 
1930 static void io_req_free_batch_finish(struct io_ring_ctx *ctx,
1931                                      struct req_batch *rb)
1932 {
1933         if (rb->to_free)
1934                 __io_req_free_batch_flush(ctx, rb);
1935         if (rb->task) {
1936                 atomic_long_add(rb->task_refs, &rb->task->io_uring->req_complete);
1937                 put_task_struct_many(rb->task, rb->task_refs);
1938                 rb->task = NULL;
1939         }
1940 }
1941 
1942 static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req)
1943 {
1944         if (unlikely(io_is_fallback_req(req))) {
1945                 io_free_req(req);
1946                 return;
1947         }
1948         if (req->flags & REQ_F_LINK_HEAD)
1949                 io_queue_next(req);
1950 
1951         if (req->task != rb->task) {
1952                 if (rb->task) {
1953                         atomic_long_add(rb->task_refs, &rb->task->io_uring->req_complete);
1954                         put_task_struct_many(rb->task, rb->task_refs);
1955                 }
1956                 rb->task = req->task;
1957                 rb->task_refs = 0;
1958         }
1959         rb->task_refs++;
1960 
1961         WARN_ON_ONCE(io_dismantle_req(req));
1962         rb->reqs[rb->to_free++] = req;
1963         if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs)))
1964                 __io_req_free_batch_flush(req->ctx, rb);
1965 }
1966 
1967 /*
1968  * Drop reference to request, return next in chain (if there is one) if this
1969  * was the last reference to this request.
1970  */
1971 static struct io_kiocb *io_put_req_find_next(struct io_kiocb *req)
1972 {
1973         struct io_kiocb *nxt = NULL;
1974 
1975         if (refcount_dec_and_test(&req->refs)) {
1976                 nxt = io_req_find_next(req);
1977                 __io_free_req(req);
1978         }
1979         return nxt;
1980 }
1981 
1982 static void io_put_req(struct io_kiocb *req)
1983 {
1984         if (refcount_dec_and_test(&req->refs))
1985                 io_free_req(req);
1986 }
1987 
1988 static struct io_wq_work *io_steal_work(struct io_kiocb *req)
1989 {
1990         struct io_kiocb *nxt;
1991 
1992         /*
1993          * A ref is owned by io-wq in which context we're. So, if that's the
1994          * last one, it's safe to steal next work. False negatives are Ok,
1995          * it just will be re-punted async in io_put_work()
1996          */
1997         if (refcount_read(&req->refs) != 1)
1998                 return NULL;
1999 
2000         nxt = io_req_find_next(req);
2001         return nxt ? &nxt->work : NULL;
2002 }
2003 
2004 /*
2005  * Must only be used if we don't need to care about links, usually from
2006  * within the completion handling itself.
2007  */
2008 static void __io_double_put_req(struct io_kiocb *req)
2009 {
2010         /* drop both submit and complete references */
2011         if (refcount_sub_and_test(2, &req->refs))
2012                 __io_free_req(req);
2013 }
2014 
2015 static void io_double_put_req(struct io_kiocb *req)
2016 {
2017         /* drop both submit and complete references */
2018         if (refcount_sub_and_test(2, &req->refs))
2019                 io_free_req(req);
2020 }
2021 
2022 static unsigned io_cqring_events(struct io_ring_ctx *ctx, bool noflush)
2023 {
2024         struct io_rings *rings = ctx->rings;
2025 
2026         if (test_bit(0, &ctx->cq_check_overflow)) {
2027                 /*
2028                  * noflush == true is from the waitqueue handler, just ensure
2029                  * we wake up the task, and the next invocation will flush the
2030                  * entries. We cannot safely to it from here.
2031                  */
2032                 if (noflush && !list_empty(&ctx->cq_overflow_list))
2033                         return -1U;
2034 
2035                 io_cqring_overflow_flush(ctx, false, NULL, NULL);
2036         }
2037 
2038         /* See comment at the top of this file */
2039         smp_rmb();
2040         return ctx->cached_cq_tail - READ_ONCE(rings->cq.head);
2041 }
2042 
2043 static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
2044 {
2045         struct io_rings *rings = ctx->rings;
2046 
2047         /* make sure SQ entry isn't read before tail */
2048         return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
2049 }
2050 
2051 static unsigned int io_put_kbuf(struct io_kiocb *req, struct io_buffer *kbuf)
2052 {
2053         unsigned int cflags;
2054 
2055         cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT;
2056         cflags |= IORING_CQE_F_BUFFER;
2057         req->flags &= ~REQ_F_BUFFER_SELECTED;
2058         kfree(kbuf);
2059         return cflags;
2060 }
2061 
2062 static inline unsigned int io_put_rw_kbuf(struct io_kiocb *req)
2063 {
2064         struct io_buffer *kbuf;
2065 
2066         kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
2067         return io_put_kbuf(req, kbuf);
2068 }
2069 
2070 static inline bool io_run_task_work(void)
2071 {
2072         /*
2073          * Not safe to run on exiting task, and the task_work handling will
2074          * not add work to such a task.
2075          */
2076         if (unlikely(current->flags & PF_EXITING))
2077                 return false;
2078         if (current->task_works) {
2079                 __set_current_state(TASK_RUNNING);
2080                 task_work_run();
2081                 return true;
2082         }
2083 
2084         return false;
2085 }
2086 
2087 static void io_iopoll_queue(struct list_head *again)
2088 {
2089         struct io_kiocb *req;
2090 
2091         do {
2092                 req = list_first_entry(again, struct io_kiocb, inflight_entry);
2093                 list_del(&req->inflight_entry);
2094                 __io_complete_rw(req, -EAGAIN, 0, NULL);
2095         } while (!list_empty(again));
2096 }
2097 
2098 /*
2099  * Find and free completed poll iocbs
2100  */
2101 static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
2102                                struct list_head *done)
2103 {
2104         struct req_batch rb;
2105         struct io_kiocb *req;
2106         LIST_HEAD(again);
2107 
2108         /* order with ->result store in io_complete_rw_iopoll() */
2109         smp_rmb();
2110 
2111         io_init_req_batch(&rb);
2112         while (!list_empty(done)) {
2113                 int cflags = 0;
2114 
2115                 req = list_first_entry(done, struct io_kiocb, inflight_entry);
2116                 if (READ_ONCE(req->result) == -EAGAIN) {
2117                         req->result = 0;
2118                         req->iopoll_completed = 0;
2119                         list_move_tail(&req->inflight_entry, &again);
2120                         continue;
2121                 }
2122                 list_del(&req->inflight_entry);
2123 
2124                 if (req->flags & REQ_F_BUFFER_SELECTED)
2125                         cflags = io_put_rw_kbuf(req);
2126 
2127                 __io_cqring_fill_event(req, req->result, cflags);
2128                 (*nr_events)++;
2129 
2130                 if (refcount_dec_and_test(&req->refs))
2131                         io_req_free_batch(&rb, req);
2132         }
2133 
2134         io_commit_cqring(ctx);
2135         if (ctx->flags & IORING_SETUP_SQPOLL)
2136                 io_cqring_ev_posted(ctx);
2137         io_req_free_batch_finish(ctx, &rb);
2138 
2139         if (!list_empty(&again))
2140                 io_iopoll_queue(&again);
2141 }
2142 
2143 static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
2144                         long min)
2145 {
2146         struct io_kiocb *req, *tmp;
2147         LIST_HEAD(done);
2148         bool spin;
2149         int ret;
2150 
2151         /*
2152          * Only spin for completions if we don't have multiple devices hanging
2153          * off our complete list, and we're under the requested amount.
2154          */
2155         spin = !ctx->poll_multi_file && *nr_events < min;
2156 
2157         ret = 0;
2158         list_for_each_entry_safe(req, tmp, &ctx->iopoll_list, inflight_entry) {
2159                 struct kiocb *kiocb = &req->rw.kiocb;
2160 
2161                 /*
2162                  * Move completed and retryable entries to our local lists.
2163                  * If we find a request that requires polling, break out
2164                  * and complete those lists first, if we have entries there.
2165                  */
2166                 if (READ_ONCE(req->iopoll_completed)) {
2167                         list_move_tail(&req->inflight_entry, &done);
2168                         continue;
2169                 }
2170                 if (!list_empty(&done))
2171                         break;
2172 
2173                 ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
2174                 if (ret < 0)
2175                         break;
2176 
2177                 /* iopoll may have completed current req */
2178                 if (READ_ONCE(req->iopoll_completed))
2179                         list_move_tail(&req->inflight_entry, &done);
2180 
2181                 if (ret && spin)
2182                         spin = false;
2183                 ret = 0;
2184         }
2185 
2186         if (!list_empty(&done))
2187                 io_iopoll_complete(ctx, nr_events, &done);
2188 
2189         return ret;
2190 }
2191 
2192 /*
2193  * Poll for a minimum of 'min' events. Note that if min == 0 we consider that a
2194  * non-spinning poll check - we'll still enter the driver poll loop, but only
2195  * as a non-spinning completion check.
2196  */
2197 static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events,
2198                                 long min)
2199 {
2200         while (!list_empty(&ctx->iopoll_list) && !need_resched()) {
2201                 int ret;
2202 
2203                 ret = io_do_iopoll(ctx, nr_events, min);
2204                 if (ret < 0)
2205                         return ret;
2206                 if (*nr_events >= min)
2207                         return 0;
2208         }
2209 
2210         return 1;
2211 }
2212 
2213 /*
2214  * We can't just wait for polled events to come to us, we have to actively
2215  * find and complete them.
2216  */
2217 static void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
2218 {
2219         if (!(ctx->flags & IORING_SETUP_IOPOLL))
2220                 return;
2221 
2222         mutex_lock(&ctx->uring_lock);
2223         while (!list_empty(&ctx->iopoll_list)) {
2224                 unsigned int nr_events = 0;
2225 
2226                 io_do_iopoll(ctx, &nr_events, 0);
2227 
2228                 /* let it sleep and repeat later if can't complete a request */
2229                 if (nr_events == 0)
2230                         break;
2231                 /*
2232                  * Ensure we allow local-to-the-cpu processing to take place,
2233                  * in this case we need to ensure that we reap all events.
2234                  * Also let task_work, etc. to progress by releasing the mutex
2235                  */
2236                 if (need_resched()) {
2237                         mutex_unlock(&ctx->uring_lock);
2238                         cond_resched();
2239                         mutex_lock(&ctx->uring_lock);
2240                 }
2241         }
2242         mutex_unlock(&ctx->uring_lock);
2243 }
2244 
2245 static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
2246 {
2247         unsigned int nr_events = 0;
2248         int iters = 0, ret = 0;
2249 
2250         /*
2251          * We disallow the app entering submit/complete with polling, but we
2252          * still need to lock the ring to prevent racing with polled issue
2253          * that got punted to a workqueue.
2254          */
2255         mutex_lock(&ctx->uring_lock);
2256         do {
2257                 /*
2258                  * Don't enter poll loop if we already have events pending.
2259                  * If we do, we can potentially be spinning for commands that
2260                  * already triggered a CQE (eg in error).
2261                  */
2262                 if (io_cqring_events(ctx, false))
2263                         break;
2264 
2265                 /*
2266                  * If a submit got punted to a workqueue, we can have the
2267                  * application entering polling for a command before it gets
2268                  * issued. That app will hold the uring_lock for the duration
2269                  * of the poll right here, so we need to take a breather every
2270                  * now and then to ensure that the issue has a chance to add
2271                  * the poll to the issued list. Otherwise we can spin here
2272                  * forever, while the workqueue is stuck trying to acquire the
2273                  * very same mutex.
2274                  */
2275                 if (!(++iters & 7)) {
2276                         mutex_unlock(&ctx->uring_lock);
2277                         io_run_task_work();
2278                         mutex_lock(&ctx->uring_lock);
2279                 }
2280 
2281                 ret = io_iopoll_getevents(ctx, &nr_events, min);
2282                 if (ret <= 0)
2283                         break;
2284                 ret = 0;
2285         } while (min && !nr_events && !need_resched());
2286 
2287         mutex_unlock(&ctx->uring_lock);
2288         return ret;
2289 }
2290 
2291 static void kiocb_end_write(struct io_kiocb *req)
2292 {
2293         /*
2294          * Tell lockdep we inherited freeze protection from submission
2295          * thread.
2296          */
2297         if (req->flags & REQ_F_ISREG) {
2298                 struct inode *inode = file_inode(req->file);
2299 
2300                 __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
2301         }
2302         file_end_write(req->file);
2303 }
2304 
2305 static void io_complete_rw_common(struct kiocb *kiocb, long res,
2306                                   struct io_comp_state *cs)
2307 {
2308         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
2309         int cflags = 0;
2310 
2311         if (kiocb->ki_flags & IOCB_WRITE)
2312                 kiocb_end_write(req);
2313 
2314         if (res != req->result)
2315                 req_set_fail_links(req);
2316         if (req->flags & REQ_F_BUFFER_SELECTED)
2317                 cflags = io_put_rw_kbuf(req);
2318         __io_req_complete(req, res, cflags, cs);
2319 }
2320 
2321 #ifdef CONFIG_BLOCK
2322 static bool io_resubmit_prep(struct io_kiocb *req, int error)
2323 {
2324         struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
2325         ssize_t ret = -ECANCELED;
2326         struct iov_iter iter;
2327         int rw;
2328 
2329         if (error) {
2330                 ret = error;
2331                 goto end_req;
2332         }
2333 
2334         switch (req->opcode) {
2335         case IORING_OP_READV:
2336         case IORING_OP_READ_FIXED:
2337         case IORING_OP_READ:
2338                 rw = READ;
2339                 break;
2340         case IORING_OP_WRITEV:
2341         case IORING_OP_WRITE_FIXED:
2342         case IORING_OP_WRITE:
2343                 rw = WRITE;
2344                 break;
2345         default:
2346                 printk_once(KERN_WARNING "io_uring: bad opcode in resubmit %d\n",
2347                                 req->opcode);
2348                 goto end_req;
2349         }
2350 
2351         if (!req->io) {
2352                 ret = io_import_iovec(rw, req, &iovec, &iter, false);
2353                 if (ret < 0)
2354                         goto end_req;
2355                 ret = io_setup_async_rw(req, iovec, inline_vecs, &iter, false);
2356                 if (!ret)
2357                         return true;
2358                 kfree(iovec);
2359         } else {
2360                 return true;
2361         }
2362 end_req:
2363         req_set_fail_links(req);
2364         return false;
2365 }
2366 #endif
2367 
2368 static bool io_rw_reissue(struct io_kiocb *req, long res)
2369 {
2370 #ifdef CONFIG_BLOCK
2371         umode_t mode = file_inode(req->file)->i_mode;
2372         int ret;
2373 
2374         if (!S_ISBLK(mode) && !S_ISREG(mode))
2375                 return false;
2376         if ((res != -EAGAIN && res != -EOPNOTSUPP) || io_wq_current_is_worker())
2377                 return false;
2378 
2379         ret = io_sq_thread_acquire_mm(req->ctx, req);
2380 
2381         if (io_resubmit_prep(req, ret)) {
2382                 refcount_inc(&req->refs);
2383                 io_queue_async_work(req);
2384                 return true;
2385         }
2386 
2387 #endif
2388         return false;
2389 }
2390 
2391 static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
2392                              struct io_comp_state *cs)
2393 {
2394         if (!io_rw_reissue(req, res))
2395                 io_complete_rw_common(&req->rw.kiocb, res, cs);
2396 }
2397 
2398 static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
2399 {
2400         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
2401 
2402         __io_complete_rw(req, res, res2, NULL);
2403 }
2404 
2405 static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
2406 {
2407         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
2408 
2409         if (kiocb->ki_flags & IOCB_WRITE)
2410                 kiocb_end_write(req);
2411 
2412         if (res != -EAGAIN && res != req->result)
2413                 req_set_fail_links(req);
2414 
2415         WRITE_ONCE(req->result, res);
2416         /* order with io_poll_complete() checking ->result */
2417         smp_wmb();
2418         WRITE_ONCE(req->iopoll_completed, 1);
2419 }
2420 
2421 /*
2422  * After the iocb has been issued, it's safe to be found on the poll list.
2423  * Adding the kiocb to the list AFTER submission ensures that we don't
2424  * find it from a io_iopoll_getevents() thread before the issuer is done
2425  * accessing the kiocb cookie.
2426  */
2427 static void io_iopoll_req_issued(struct io_kiocb *req)
2428 {
2429         struct io_ring_ctx *ctx = req->ctx;
2430 
2431         /*
2432          * Track whether we have multiple files in our lists. This will impact
2433          * how we do polling eventually, not spinning if we're on potentially
2434          * different devices.
2435          */
2436         if (list_empty(&ctx->iopoll_list)) {
2437                 ctx->poll_multi_file = false;
2438         } else if (!ctx->poll_multi_file) {
2439                 struct io_kiocb *list_req;
2440 
2441                 list_req = list_first_entry(&ctx->iopoll_list, struct io_kiocb,
2442                                                 inflight_entry);
2443                 if (list_req->file != req->file)
2444                         ctx->poll_multi_file = true;
2445         }
2446 
2447         /*
2448          * For fast devices, IO may have already completed. If it has, add
2449          * it to the front so we find it first.
2450          */
2451         if (READ_ONCE(req->iopoll_completed))
2452                 list_add(&req->inflight_entry, &ctx->iopoll_list);
2453         else
2454                 list_add_tail(&req->inflight_entry, &ctx->iopoll_list);
2455 
2456         if ((ctx->flags & IORING_SETUP_SQPOLL) &&
2457             wq_has_sleeper(&ctx->sqo_wait))
2458                 wake_up(&ctx->sqo_wait);
2459 }
2460 
2461 static void __io_state_file_put(struct io_submit_state *state)
2462 {
2463         if (state->has_refs)
2464                 fput_many(state->file, state->has_refs);
2465         state->file = NULL;
2466 }
2467 
2468 static inline void io_state_file_put(struct io_submit_state *state)
2469 {
2470         if (state->file)
2471                 __io_state_file_put(state);
2472 }
2473 
2474 /*
2475  * Get as many references to a file as we have IOs left in this submission,
2476  * assuming most submissions are for one file, or at least that each file
2477  * has more than one submission.
2478  */
2479 static struct file *__io_file_get(struct io_submit_state *state, int fd)
2480 {
2481         if (!state)
2482                 return fget(fd);
2483 
2484         if (state->file) {
2485                 if (state->fd == fd) {
2486                         state->has_refs--;
2487                         state->ios_left--;
2488                         return state->file;
2489                 }
2490                 __io_state_file_put(state);
2491         }
2492         state->file = fget_many(fd, state->ios_left);
2493         if (!state->file)
2494                 return NULL;
2495 
2496         state->fd = fd;
2497         state->ios_left--;
2498         state->has_refs = state->ios_left;
2499         return state->file;
2500 }
2501 
2502 static bool io_bdev_nowait(struct block_device *bdev)
2503 {
2504 #ifdef CONFIG_BLOCK
2505         return !bdev || queue_is_mq(bdev_get_queue(bdev));
2506 #else
2507         return true;
2508 #endif
2509 }
2510 
2511 /*
2512  * If we tracked the file through the SCM inflight mechanism, we could support
2513  * any file. For now, just ensure that anything potentially problematic is done
2514  * inline.
2515  */
2516 static bool io_file_supports_async(struct file *file, int rw)
2517 {
2518         umode_t mode = file_inode(file)->i_mode;
2519 
2520         if (S_ISBLK(mode)) {
2521                 if (io_bdev_nowait(file->f_inode->i_bdev))
2522                         return true;
2523                 return false;
2524         }
2525         if (S_ISCHR(mode) || S_ISSOCK(mode))
2526                 return true;
2527         if (S_ISREG(mode)) {
2528                 if (io_bdev_nowait(file->f_inode->i_sb->s_bdev) &&
2529                     file->f_op != &io_uring_fops)
2530                         return true;
2531                 return false;
2532         }
2533 
2534         /* any ->read/write should understand O_NONBLOCK */
2535         if (file->f_flags & O_NONBLOCK)
2536                 return true;
2537 
2538         if (!(file->f_mode & FMODE_NOWAIT))
2539                 return false;
2540 
2541         if (rw == READ)
2542                 return file->f_op->read_iter != NULL;
2543 
2544         return file->f_op->write_iter != NULL;
2545 }
2546 
2547 static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
2548                       bool force_nonblock)
2549 {
2550         struct io_ring_ctx *ctx = req->ctx;
2551         struct kiocb *kiocb = &req->rw.kiocb;
2552         unsigned ioprio;
2553         int ret;
2554 
2555         if (S_ISREG(file_inode(req->file)->i_mode))
2556                 req->flags |= REQ_F_ISREG;
2557 
2558         kiocb->ki_pos = READ_ONCE(sqe->off);
2559         if (kiocb->ki_pos == -1 && !(req->file->f_mode & FMODE_STREAM)) {
2560                 req->flags |= REQ_F_CUR_POS;
2561                 kiocb->ki_pos = req->file->f_pos;
2562         }
2563         kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
2564         kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
2565         ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
2566         if (unlikely(ret))
2567                 return ret;
2568 
2569         ioprio = READ_ONCE(sqe->ioprio);
2570         if (ioprio) {
2571                 ret = ioprio_check_cap(ioprio);
2572                 if (ret)
2573                         return ret;
2574 
2575                 kiocb->ki_ioprio = ioprio;
2576         } else
2577                 kiocb->ki_ioprio = get_current_ioprio();
2578 
2579         /* don't allow async punt if RWF_NOWAIT was requested */
2580         if (kiocb->ki_flags & IOCB_NOWAIT)
2581                 req->flags |= REQ_F_NOWAIT;
2582 
2583         if (force_nonblock)
2584                 kiocb->ki_flags |= IOCB_NOWAIT;
2585 
2586         if (ctx->flags & IORING_SETUP_IOPOLL) {
2587                 if (!(kiocb->ki_flags & IOCB_DIRECT) ||
2588                     !kiocb->ki_filp->f_op->iopoll)
2589                         return -EOPNOTSUPP;
2590 
2591                 kiocb->ki_flags |= IOCB_HIPRI;
2592                 kiocb->ki_complete = io_complete_rw_iopoll;
2593                 req->iopoll_completed = 0;
2594         } else {
2595                 if (kiocb->ki_flags & IOCB_HIPRI)
2596                         return -EINVAL;
2597                 kiocb->ki_complete = io_complete_rw;
2598         }
2599 
2600         req->rw.addr = READ_ONCE(sqe->addr);
2601         req->rw.len = READ_ONCE(sqe->len);
2602         req->buf_index = READ_ONCE(sqe->buf_index);
2603         return 0;
2604 }
2605 
2606 static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
2607 {
2608         switch (ret) {
2609         case -EIOCBQUEUED:
2610                 break;
2611         case -ERESTARTSYS:
2612         case -ERESTARTNOINTR:
2613         case -ERESTARTNOHAND:
2614         case -ERESTART_RESTARTBLOCK:
2615                 /*
2616                  * We can't just restart the syscall, since previously
2617                  * submitted sqes may already be in progress. Just fail this
2618                  * IO with EINTR.
2619                  */
2620                 ret = -EINTR;
2621                 fallthrough;
2622         default:
2623                 kiocb->ki_complete(kiocb, ret, 0);
2624         }
2625 }
2626 
2627 static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
2628                        struct io_comp_state *cs)
2629 {
2630         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
2631 
2632         /* add previously done IO, if any */
2633         if (req->io && req->io->rw.bytes_done > 0) {
2634                 if (ret < 0)
2635                         ret = req->io->rw.bytes_done;
2636                 else
2637                         ret += req->io->rw.bytes_done;
2638         }
2639 
2640         if (req->flags & REQ_F_CUR_POS)
2641                 req->file->f_pos = kiocb->ki_pos;
2642         if (ret >= 0 && kiocb->ki_complete == io_complete_rw)
2643                 __io_complete_rw(req, ret, 0, cs);
2644         else
2645                 io_rw_done(kiocb, ret);
2646 }
2647 
2648 static ssize_t io_import_fixed(struct io_kiocb *req, int rw,
2649                                struct iov_iter *iter)
2650 {
2651         struct io_ring_ctx *ctx = req->ctx;
2652         size_t len = req->rw.len;
2653         struct io_mapped_ubuf *imu;
2654         u16 index, buf_index;
2655         size_t offset;
2656         u64 buf_addr;
2657 
2658         /* attempt to use fixed buffers without having provided iovecs */
2659         if (unlikely(!ctx->user_bufs))
2660                 return -EFAULT;
2661 
2662         buf_index = req->buf_index;
2663         if (unlikely(buf_index >= ctx->nr_user_bufs))
2664                 return -EFAULT;
2665 
2666         index = array_index_nospec(buf_index, ctx->nr_user_bufs);
2667         imu = &ctx->user_bufs[index];
2668         buf_addr = req->rw.addr;
2669 
2670         /* overflow */
2671         if (buf_addr + len < buf_addr)
2672                 return -EFAULT;
2673         /* not inside the mapped region */
2674         if (buf_addr < imu->ubuf || buf_addr + len > imu->ubuf + imu->len)
2675                 return -EFAULT;
2676 
2677         /*
2678          * May not be a start of buffer, set size appropriately
2679          * and advance us to the beginning.
2680          */
2681         offset = buf_addr - imu->ubuf;
2682         iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
2683 
2684         if (offset) {
2685                 /*
2686                  * Don't use iov_iter_advance() here, as it's really slow for
2687                  * using the latter parts of a big fixed buffer - it iterates
2688                  * over each segment manually. We can cheat a bit here, because
2689                  * we know that:
2690                  *
2691                  * 1) it's a BVEC iter, we set it up
2692                  * 2) all bvecs are PAGE_SIZE in size, except potentially the
2693                  *    first and last bvec
2694                  *
2695                  * So just find our index, and adjust the iterator afterwards.
2696                  * If the offset is within the first bvec (or the whole first
2697                  * bvec, just use iov_iter_advance(). This makes it easier
2698                  * since we can just skip the first segment, which may not
2699                  * be PAGE_SIZE aligned.
2700                  */
2701                 const struct bio_vec *bvec = imu->bvec;
2702 
2703                 if (offset <= bvec->bv_len) {
2704                         iov_iter_advance(iter, offset);
2705                 } else {
2706                         unsigned long seg_skip;
2707 
2708                         /* skip first vec */
2709                         offset -= bvec->bv_len;
2710                         seg_skip = 1 + (offset >> PAGE_SHIFT);
2711 
2712                         iter->bvec = bvec + seg_skip;
2713                         iter->nr_segs -= seg_skip;
2714                         iter->count -= bvec->bv_len + offset;
2715                         iter->iov_offset = offset & ~PAGE_MASK;
2716                 }
2717         }
2718 
2719         return len;
2720 }
2721 
2722 static void io_ring_submit_unlock(struct io_ring_ctx *ctx, bool needs_lock)
2723 {
2724         if (needs_lock)
2725                 mutex_unlock(&ctx->uring_lock);
2726 }
2727 
2728 static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock)
2729 {
2730         /*
2731          * "Normal" inline submissions always hold the uring_lock, since we
2732          * grab it from the system call. Same is true for the SQPOLL offload.
2733          * The only exception is when we've detached the request and issue it
2734          * from an async worker thread, grab the lock for that case.
2735          */
2736         if (needs_lock)
2737                 mutex_lock(&ctx->uring_lock);
2738 }
2739 
2740 static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len,
2741                                           int bgid, struct io_buffer *kbuf,
2742                                           bool needs_lock)
2743 {
2744         struct io_buffer *head;
2745 
2746         if (req->flags & REQ_F_BUFFER_SELECTED)
2747                 return kbuf;
2748 
2749         io_ring_submit_lock(req->ctx, needs_lock);
2750 
2751         lockdep_assert_held(&req->ctx->uring_lock);
2752 
2753         head = idr_find(&req->ctx->io_buffer_idr, bgid);
2754         if (head) {
2755                 if (!list_empty(&head->list)) {
2756                         kbuf = list_last_entry(&head->list, struct io_buffer,
2757                                                         list);
2758                         list_del(&kbuf->list);
2759                 } else {
2760                         kbuf = head;
2761                         idr_remove(&req->ctx->io_buffer_idr, bgid);
2762                 }
2763                 if (*len > kbuf->len)
2764                         *len = kbuf->len;
2765         } else {
2766                 kbuf = ERR_PTR(-ENOBUFS);
2767         }
2768 
2769         io_ring_submit_unlock(req->ctx, needs_lock);
2770 
2771         return kbuf;
2772 }
2773 
2774 static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len,
2775                                         bool needs_lock)
2776 {
2777         struct io_buffer *kbuf;
2778         u16 bgid;
2779 
2780         kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
2781         bgid = req->buf_index;
2782         kbuf = io_buffer_select(req, len, bgid, kbuf, needs_lock);
2783         if (IS_ERR(kbuf))
2784                 return kbuf;
2785         req->rw.addr = (u64) (unsigned long) kbuf;
2786         req->flags |= REQ_F_BUFFER_SELECTED;
2787         return u64_to_user_ptr(kbuf->addr);
2788 }
2789 
2790 #ifdef CONFIG_COMPAT
2791 static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
2792                                 bool needs_lock)
2793 {
2794         struct compat_iovec __user *uiov;
2795         compat_ssize_t clen;
2796         void __user *buf;
2797         ssize_t len;
2798 
2799         uiov = u64_to_user_ptr(req->rw.addr);
2800         if (!access_ok(uiov, sizeof(*uiov)))
2801                 return -EFAULT;
2802         if (__get_user(clen, &uiov->iov_len))
2803                 return -EFAULT;
2804         if (clen < 0)
2805                 return -EINVAL;
2806 
2807         len = clen;
2808         buf = io_rw_buffer_select(req, &len, needs_lock);
2809         if (IS_ERR(buf))
2810                 return PTR_ERR(buf);
2811         iov[0].iov_base = buf;
2812         iov[0].iov_len = (compat_size_t) len;
2813         return 0;
2814 }
2815 #endif
2816 
2817 static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
2818                                       bool needs_lock)
2819 {
2820         struct iovec __user *uiov = u64_to_user_ptr(req->rw.addr);
2821         void __user *buf;
2822         ssize_t len;
2823 
2824         if (copy_from_user(iov, uiov, sizeof(*uiov)))
2825                 return -EFAULT;
2826 
2827         len = iov[0].iov_len;
2828         if (len < 0)
2829                 return -EINVAL;
2830         buf = io_rw_buffer_select(req, &len, needs_lock);
2831         if (IS_ERR(buf))
2832                 return PTR_ERR(buf);
2833         iov[0].iov_base = buf;
2834         iov[0].iov_len = len;
2835         return 0;
2836 }
2837 
2838 static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
2839                                     bool needs_lock)
2840 {
2841         if (req->flags & REQ_F_BUFFER_SELECTED) {
2842                 struct io_buffer *kbuf;
2843 
2844                 kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
2845                 iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
2846                 iov[0].iov_len = kbuf->len;
2847                 return 0;
2848         }
2849         if (!req->rw.len)
2850                 return 0;
2851         else if (req->rw.len > 1)
2852                 return -EINVAL;
2853 
2854 #ifdef CONFIG_COMPAT
2855         if (req->ctx->compat)
2856                 return io_compat_import(req, iov, needs_lock);
2857 #endif
2858 
2859         return __io_iov_buffer_select(req, iov, needs_lock);
2860 }
2861 
2862 static ssize_t __io_import_iovec(int rw, struct io_kiocb *req,
2863                                  struct iovec **iovec, struct iov_iter *iter,
2864                                  bool needs_lock)
2865 {
2866         void __user *buf = u64_to_user_ptr(req->rw.addr);
2867         size_t sqe_len = req->rw.len;
2868         ssize_t ret;
2869         u8 opcode;
2870 
2871         opcode = req->opcode;
2872         if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
2873                 *iovec = NULL;
2874                 return io_import_fixed(req, rw, iter);
2875         }
2876 
2877         /* buffer index only valid with fixed read/write, or buffer select  */
2878         if (req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT))
2879                 return -EINVAL;
2880 
2881         if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) {
2882                 if (req->flags & REQ_F_BUFFER_SELECT) {
2883                         buf = io_rw_buffer_select(req, &sqe_len, needs_lock);
2884                         if (IS_ERR(buf))
2885                                 return PTR_ERR(buf);
2886                         req->rw.len = sqe_len;
2887                 }
2888 
2889                 ret = import_single_range(rw, buf, sqe_len, *iovec, iter);
2890                 *iovec = NULL;
2891                 return ret < 0 ? ret : sqe_len;
2892         }
2893 
2894         if (req->flags & REQ_F_BUFFER_SELECT) {
2895                 ret = io_iov_buffer_select(req, *iovec, needs_lock);
2896                 if (!ret) {
2897                         ret = (*iovec)->iov_len;
2898                         iov_iter_init(iter, rw, *iovec, 1, ret);
2899                 }
2900                 *iovec = NULL;
2901                 return ret;
2902         }
2903 
2904 #ifdef CONFIG_COMPAT
2905         if (req->ctx->compat)
2906                 return compat_import_iovec(rw, buf, sqe_len, UIO_FASTIOV,
2907                                                 iovec, iter);
2908 #endif
2909 
2910         return import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter);
2911 }
2912 
2913 static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
2914                                struct iovec **iovec, struct iov_iter *iter,
2915                                bool needs_lock)
2916 {
2917         if (!req->io)
2918                 return __io_import_iovec(rw, req, iovec, iter, needs_lock);
2919         *iovec = NULL;
2920         return iov_iter_count(&req->io->rw.iter);
2921 }
2922 
2923 static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb)
2924 {
2925         return kiocb->ki_filp->f_mode & FMODE_STREAM ? NULL : &kiocb->ki_pos;
2926 }
2927 
2928 /*
2929  * For files that don't have ->read_iter() and ->write_iter(), handle them
2930  * by looping over ->read() or ->write() manually.
2931  */
2932 static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb,
2933                            struct iov_iter *iter)
2934 {
2935         ssize_t ret = 0;
2936 
2937         /*
2938          * Don't support polled IO through this interface, and we can't
2939          * support non-blocking either. For the latter, this just causes
2940          * the kiocb to be handled from an async context.
2941          */
2942         if (kiocb->ki_flags & IOCB_HIPRI)
2943                 return -EOPNOTSUPP;
2944         if (kiocb->ki_flags & IOCB_NOWAIT)
2945                 return -EAGAIN;
2946 
2947         while (iov_iter_count(iter)) {
2948                 struct iovec iovec;
2949                 ssize_t nr;
2950 
2951                 if (!iov_iter_is_bvec(iter)) {
2952                         iovec = iov_iter_iovec(iter);
2953                 } else {
2954                         /* fixed buffers import bvec */
2955                         iovec.iov_base = kmap(iter->bvec->bv_page)
2956                                                 + iter->iov_offset;
2957                         iovec.iov_len = min(iter->count,
2958                                         iter->bvec->bv_len - iter->iov_offset);
2959                 }
2960 
2961                 if (rw == READ) {
2962                         nr = file->f_op->read(file, iovec.iov_base,
2963                                               iovec.iov_len, io_kiocb_ppos(kiocb));
2964                 } else {
2965                         nr = file->f_op->write(file, iovec.iov_base,
2966                                                iovec.iov_len, io_kiocb_ppos(kiocb));
2967                 }
2968 
2969                 if (iov_iter_is_bvec(iter))
2970                         kunmap(iter->bvec->bv_page);
2971 
2972                 if (nr < 0) {
2973                         if (!ret)
2974                                 ret = nr;
2975                         break;
2976                 }
2977                 ret += nr;
2978                 if (nr != iovec.iov_len)
2979                         break;
2980                 iov_iter_advance(iter, nr);
2981         }
2982 
2983         return ret;
2984 }
2985 
2986 static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
2987                           const struct iovec *fast_iov, struct iov_iter *iter)
2988 {
2989         struct io_async_rw *rw = &req->io->rw;
2990 
2991         memcpy(&rw->iter, iter, sizeof(*iter));
2992         rw->free_iovec = NULL;
2993         rw->bytes_done = 0;
2994         /* can only be fixed buffers, no need to do anything */
2995         if (iov_iter_is_bvec(iter))
2996                 return;
2997         if (!iovec) {
2998                 unsigned iov_off = 0;
2999 
3000                 rw->iter.iov = rw->fast_iov;
3001                 if (iter->iov != fast_iov) {
3002                         iov_off = iter->iov - fast_iov;
3003                         rw->iter.iov += iov_off;
3004                 }
3005                 if (rw->fast_iov != fast_iov)
3006                         memcpy(rw->fast_iov + iov_off, fast_iov + iov_off,
3007                                sizeof(struct iovec) * iter->nr_segs);
3008         } else {
3009                 rw->free_iovec = iovec;
3010                 req->flags |= REQ_F_NEED_CLEANUP;
3011         }
3012 }
3013 
3014 static inline int __io_alloc_async_ctx(struct io_kiocb *req)
3015 {
3016         req->io = kmalloc(sizeof(*req->io), GFP_KERNEL);
3017         return req->io == NULL;
3018 }
3019 
3020 static int io_alloc_async_ctx(struct io_kiocb *req)
3021 {
3022         if (!io_op_defs[req->opcode].async_ctx)
3023                 return 0;
3024 
3025         return  __io_alloc_async_ctx(req);
3026 }
3027 
3028 static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
3029                              const struct iovec *fast_iov,
3030                              struct iov_iter *iter, bool force)
3031 {
3032         if (!force && !io_op_defs[req->opcode].async_ctx)
3033                 return 0;
3034         if (!req->io) {
3035                 if (__io_alloc_async_ctx(req))
3036                         return -ENOMEM;
3037 
3038                 io_req_map_rw(req, iovec, fast_iov, iter);
3039         }
3040         return 0;
3041 }
3042 
3043 static inline int io_rw_prep_async(struct io_kiocb *req, int rw,
3044                                    bool force_nonblock)
3045 {
3046         struct io_async_rw *iorw = &req->io->rw;
3047         struct iovec *iov;
3048         ssize_t ret;
3049 
3050         iorw->iter.iov = iov = iorw->fast_iov;
3051         ret = __io_import_iovec(rw, req, &iov, &iorw->iter, !force_nonblock);
3052         if (unlikely(ret < 0))
3053                 return ret;
3054 
3055         iorw->iter.iov = iov;
3056         io_req_map_rw(req, iorw->iter.iov, iorw->fast_iov, &iorw->iter);
3057         return 0;
3058 }
3059 
3060 static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
3061                         bool force_nonblock)
3062 {
3063         ssize_t ret;
3064 
3065         ret = io_prep_rw(req, sqe, force_nonblock);
3066         if (ret)
3067                 return ret;
3068 
3069         if (unlikely(!(req->file->f_mode & FMODE_READ)))
3070                 return -EBADF;
3071 
3072         /* either don't need iovec imported or already have it */
3073         if (!req->io || req->flags & REQ_F_NEED_CLEANUP)
3074                 return 0;
3075         return io_rw_prep_async(req, READ, force_nonblock);
3076 }
3077 
3078 /*
3079  * This is our waitqueue callback handler, registered through lock_page_async()
3080  * when we initially tried to do the IO with the iocb armed our waitqueue.
3081  * This gets called when the page is unlocked, and we generally expect that to
3082  * happen when the page IO is completed and the page is now uptodate. This will
3083  * queue a task_work based retry of the operation, attempting to copy the data
3084  * again. If the latter fails because the page was NOT uptodate, then we will
3085  * do a thread based blocking retry of the operation. That's the unexpected
3086  * slow path.
3087  */
3088 static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
3089                              int sync, void *arg)
3090 {
3091         struct wait_page_queue *wpq;
3092         struct io_kiocb *req = wait->private;
3093         struct wait_page_key *key = arg;
3094         int ret;
3095 
3096         wpq = container_of(wait, struct wait_page_queue, wait);
3097 
3098         if (!wake_page_match(wpq, key))
3099                 return 0;
3100 
3101         req->rw.kiocb.ki_flags &= ~IOCB_WAITQ;
3102         list_del_init(&wait->entry);
3103 
3104         init_task_work(&req->task_work, io_req_task_submit);
3105         percpu_ref_get(&req->ctx->refs);
3106 
3107         /* submit ref gets dropped, acquire a new one */
3108         refcount_inc(&req->refs);
3109         ret = io_req_task_work_add(req, &req->task_work, true);
3110         if (unlikely(ret)) {
3111                 struct task_struct *tsk;
3112 
3113                 /* queue just for cancelation */
3114                 init_task_work(&req->task_work, io_req_task_cancel);
3115                 tsk = io_wq_get_task(req->ctx->io_wq);
3116                 task_work_add(tsk, &req->task_work, 0);
3117                 wake_up_process(tsk);
3118         }
3119         return 1;
3120 }
3121 
3122 /*
3123  * This controls whether a given IO request should be armed for async page
3124  * based retry. If we return false here, the request is handed to the async
3125  * worker threads for retry. If we're doing buffered reads on a regular file,
3126  * we prepare a private wait_page_queue entry and retry the operation. This
3127  * will either succeed because the page is now uptodate and unlocked, or it
3128  * will register a callback when the page is unlocked at IO completion. Through
3129  * that callback, io_uring uses task_work to setup a retry of the operation.
3130  * That retry will attempt the buffered read again. The retry will generally
3131  * succeed, or in rare cases where it fails, we then fall back to using the
3132  * async worker threads for a blocking retry.
3133  */
3134 static bool io_rw_should_retry(struct io_kiocb *req)
3135 {
3136         struct wait_page_queue *wait = &req->io->rw.wpq;
3137         struct kiocb *kiocb = &req->rw.kiocb;
3138 
3139         /* never retry for NOWAIT, we just complete with -EAGAIN */
3140         if (req->flags & REQ_F_NOWAIT)
3141                 return false;
3142 
3143         /* Only for buffered IO */
3144         if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_HIPRI))
3145                 return false;
3146 
3147         /*
3148          * just use poll if we can, and don't attempt if the fs doesn't
3149          * support callback based unlocks
3150          */
3151         if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC))
3152                 return false;
3153 
3154         wait->wait.func = io_async_buf_func;
3155         wait->wait.private = req;
3156         wait->wait.flags = 0;
3157         INIT_LIST_HEAD(&wait->wait.entry);
3158         kiocb->ki_flags |= IOCB_WAITQ;
3159         kiocb->ki_flags &= ~IOCB_NOWAIT;
3160         kiocb->ki_waitq = wait;
3161         return true;
3162 }
3163 
3164 static int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter)
3165 {
3166         if (req->file->f_op->read_iter)
3167                 return call_read_iter(req->file, &req->rw.kiocb, iter);
3168         else if (req->file->f_op->read)
3169                 return loop_rw_iter(READ, req->file, &req->rw.kiocb, iter);
3170         else
3171                 return -EINVAL;
3172 }
3173 
3174 static int io_read(struct io_kiocb *req, bool force_nonblock,
3175                    struct io_comp_state *cs)
3176 {
3177         struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
3178         struct kiocb *kiocb = &req->rw.kiocb;
3179         struct iov_iter __iter, *iter = &__iter;
3180         ssize_t io_size, ret, ret2;
3181         size_t iov_count;
3182         bool no_async;
3183 
3184         if (req->io)
3185                 iter = &req->io->rw.iter;
3186 
3187         ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock);
3188         if (ret < 0)
3189                 return ret;
3190         iov_count = iov_iter_count(iter);
3191         io_size = ret;
3192         req->result = io_size;
3193         ret = 0;
3194 
3195         /* Ensure we clear previously set non-block flag */
3196         if (!force_nonblock)
3197                 kiocb->ki_flags &= ~IOCB_NOWAIT;
3198 
3199         /* If the file doesn't support async, just async punt */
3200         no_async = force_nonblock && !io_file_supports_async(req->file, READ);
3201         if (no_async)
3202                 goto copy_iov;
3203 
3204         ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), iov_count);
3205         if (unlikely(ret))
3206                 goto out_free;
3207 
3208         ret = io_iter_do_read(req, iter);
3209 
3210         if (!ret) {
3211                 goto done;
3212         } else if (ret == -EIOCBQUEUED) {
3213                 ret = 0;
3214                 goto out_free;
3215         } else if (ret == -EAGAIN) {
3216                 /* IOPOLL retry should happen for io-wq threads */
3217                 if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
3218                         goto done;
3219                 /* no retry on NONBLOCK marked file */
3220                 if (req->file->f_flags & O_NONBLOCK)
3221                         goto done;
3222                 /* some cases will consume bytes even on error returns */
3223                 iov_iter_revert(iter, iov_count - iov_iter_count(iter));
3224                 ret = 0;
3225                 goto copy_iov;
3226         } else if (ret < 0) {
3227                 /* make sure -ERESTARTSYS -> -EINTR is done */
3228                 goto done;
3229         }
3230 
3231         /* read it all, or we did blocking attempt. no retry. */
3232         if (!iov_iter_count(iter) || !force_nonblock ||
3233             (req->file->f_flags & O_NONBLOCK))
3234                 goto done;
3235 
3236         io_size -= ret;
3237 copy_iov:
3238         ret2 = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
3239         if (ret2) {
3240                 ret = ret2;
3241                 goto out_free;
3242         }
3243         if (no_async)
3244                 return -EAGAIN;
3245         /* it's copied and will be cleaned with ->io */
3246         iovec = NULL;
3247         /* now use our persistent iterator, if we aren't already */
3248         iter = &req->io->rw.iter;
3249 retry:
3250         req->io->rw.bytes_done += ret;
3251         /* if we can retry, do so with the callbacks armed */
3252         if (!io_rw_should_retry(req)) {
3253                 kiocb->ki_flags &= ~IOCB_WAITQ;
3254                 return -EAGAIN;
3255         }
3256 
3257         /*
3258          * Now retry read with the IOCB_WAITQ parts set in the iocb. If we
3259          * get -EIOCBQUEUED, then we'll get a notification when the desired
3260          * page gets unlocked. We can also get a partial read here, and if we
3261          * do, then just retry at the new offset.
3262          */
3263         ret = io_iter_do_read(req, iter);
3264         if (ret == -EIOCBQUEUED) {
3265                 ret = 0;
3266                 goto out_free;
3267         } else if (ret > 0 && ret < io_size) {
3268                 /* we got some bytes, but not all. retry. */
3269                 goto retry;
3270         }
3271 done:
3272         kiocb_done(kiocb, ret, cs);
3273         ret = 0;
3274 out_free:
3275         /* it's reportedly faster than delegating the null check to kfree() */
3276         if (iovec)
3277                 kfree(iovec);
3278         return ret;
3279 }
3280 
3281 static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
3282                          bool force_nonblock)
3283 {
3284         ssize_t ret;
3285 
3286         ret = io_prep_rw(req, sqe, force_nonblock);
3287         if (ret)
3288                 return ret;
3289 
3290         if (unlikely(!(req->file->f_mode & FMODE_WRITE)))
3291                 return -EBADF;
3292 
3293         /* either don't need iovec imported or already have it */
3294         if (!req->io || req->flags & REQ_F_NEED_CLEANUP)
3295                 return 0;
3296         return io_rw_prep_async(req, WRITE, force_nonblock);
3297 }
3298 
3299 static int io_write(struct io_kiocb *req, bool force_nonblock,
3300                     struct io_comp_state *cs)
3301 {
3302         struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
3303         struct kiocb *kiocb = &req->rw.kiocb;
3304         struct iov_iter __iter, *iter = &__iter;
3305         size_t iov_count;
3306         ssize_t ret, ret2, io_size;
3307 
3308         if (req->io)
3309                 iter = &req->io->rw.iter;
3310 
3311         ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock);
3312         if (ret < 0)
3313                 return ret;
3314         iov_count = iov_iter_count(iter);
3315         io_size = ret;
3316         req->result = io_size;
3317 
3318         /* Ensure we clear previously set non-block flag */
3319         if (!force_nonblock)
3320                 req->rw.kiocb.ki_flags &= ~IOCB_NOWAIT;
3321 
3322         /* If the file doesn't support async, just async punt */
3323         if (force_nonblock && !io_file_supports_async(req->file, WRITE))
3324                 goto copy_iov;
3325 
3326         /* file path doesn't support NOWAIT for non-direct_IO */
3327         if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) &&
3328             (req->flags & REQ_F_ISREG))
3329                 goto copy_iov;
3330 
3331         ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), iov_count);
3332         if (unlikely(ret))
3333                 goto out_free;
3334 
3335         /*
3336          * Open-code file_start_write here to grab freeze protection,
3337          * which will be released by another thread in
3338          * io_complete_rw().  Fool lockdep by telling it the lock got
3339          * released so that it doesn't complain about the held lock when
3340          * we return to userspace.
3341          */
3342         if (req->flags & REQ_F_ISREG) {
3343                 __sb_start_write(file_inode(req->file)->i_sb,
3344                                         SB_FREEZE_WRITE, true);
3345                 __sb_writers_release(file_inode(req->file)->i_sb,
3346                                         SB_FREEZE_WRITE);
3347         }
3348         kiocb->ki_flags |= IOCB_WRITE;
3349 
3350         if (req->file->f_op->write_iter)
3351                 ret2 = call_write_iter(req->file, kiocb, iter);
3352         else if (req->file->f_op->write)
3353                 ret2 = loop_rw_iter(WRITE, req->file, kiocb, iter);
3354         else
3355                 ret2 = -EINVAL;
3356 
3357         /*
3358          * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just
3359          * retry them without IOCB_NOWAIT.
3360          */
3361         if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
3362                 ret2 = -EAGAIN;
3363         /* no retry on NONBLOCK marked file */
3364         if (ret2 == -EAGAIN && (req->file->f_flags & O_NONBLOCK))
3365                 goto done;
3366         if (!force_nonblock || ret2 != -EAGAIN) {
3367                 /* IOPOLL retry should happen for io-wq threads */
3368                 if ((req->ctx->flags & IORING_SETUP_IOPOLL) && ret2 == -EAGAIN)
3369                         goto copy_iov;
3370 done:
3371                 kiocb_done(kiocb, ret2, cs);
3372         } else {
3373 copy_iov:
3374                 /* some cases will consume bytes even on error returns */
3375                 iov_iter_revert(iter, iov_count - iov_iter_count(iter));
3376                 ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false);
3377                 if (!ret)
3378                         return -EAGAIN;
3379         }
3380 out_free:
3381         /* it's reportedly faster than delegating the null check to kfree() */
3382         if (iovec)
3383                 kfree(iovec);
3384         return ret;
3385 }
3386 
3387 static int __io_splice_prep(struct io_kiocb *req,
3388                             const struct io_uring_sqe *sqe)
3389 {
3390         struct io_splice* sp = &req->splice;
3391         unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL;
3392         int ret;
3393 
3394         if (req->flags & REQ_F_NEED_CLEANUP)
3395                 return 0;
3396         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3397                 return -EINVAL;
3398 
3399         sp->file_in = NULL;
3400         sp->len = READ_ONCE(sqe->len);
3401         sp->flags = READ_ONCE(sqe->splice_flags);
3402 
3403         if (unlikely(sp->flags & ~valid_flags))
3404                 return -EINVAL;
3405 
3406         ret = io_file_get(NULL, req, READ_ONCE(sqe->splice_fd_in), &sp->file_in,
3407                           (sp->flags & SPLICE_F_FD_IN_FIXED));
3408         if (ret)
3409                 return ret;
3410         req->flags |= REQ_F_NEED_CLEANUP;
3411 
3412         if (!S_ISREG(file_inode(sp->file_in)->i_mode)) {
3413                 /*
3414                  * Splice operation will be punted aync, and here need to
3415                  * modify io_wq_work.flags, so initialize io_wq_work firstly.
3416                  */
3417                 io_req_init_async(req);
3418                 req->work.flags |= IO_WQ_WORK_UNBOUND;
3419         }
3420 
3421         return 0;
3422 }
3423 
3424 static int io_tee_prep(struct io_kiocb *req,
3425                        const struct io_uring_sqe *sqe)
3426 {
3427         if (READ_ONCE(sqe->splice_off_in) || READ_ONCE(sqe->off))
3428                 return -EINVAL;
3429         return __io_splice_prep(req, sqe);
3430 }
3431 
3432 static int io_tee(struct io_kiocb *req, bool force_nonblock)
3433 {
3434         struct io_splice *sp = &req->splice;
3435         struct file *in = sp->file_in;
3436         struct file *out = sp->file_out;
3437         unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
3438         long ret = 0;
3439 
3440         if (force_nonblock)
3441                 return -EAGAIN;
3442         if (sp->len)
3443                 ret = do_tee(in, out, sp->len, flags);
3444 
3445         io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED));
3446         req->flags &= ~REQ_F_NEED_CLEANUP;
3447 
3448         if (ret != sp->len)
3449                 req_set_fail_links(req);
3450         io_req_complete(req, ret);
3451         return 0;
3452 }
3453 
3454 static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3455 {
3456         struct io_splice* sp = &req->splice;
3457 
3458         sp->off_in = READ_ONCE(sqe->splice_off_in);
3459         sp->off_out = READ_ONCE(sqe->off);
3460         return __io_splice_prep(req, sqe);
3461 }
3462 
3463 static int io_splice(struct io_kiocb *req, bool force_nonblock)
3464 {
3465         struct io_splice *sp = &req->splice;
3466         struct file *in = sp->file_in;
3467         struct file *out = sp->file_out;
3468         unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
3469         loff_t *poff_in, *poff_out;
3470         long ret = 0;
3471 
3472         if (force_nonblock)
3473                 return -EAGAIN;
3474 
3475         poff_in = (sp->off_in == -1) ? NULL : &sp->off_in;
3476         poff_out = (sp->off_out == -1) ? NULL : &sp->off_out;
3477 
3478         if (sp->len)
3479                 ret = do_splice(in, poff_in, out, poff_out, sp->len, flags);
3480 
3481         io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED));
3482         req->flags &= ~REQ_F_NEED_CLEANUP;
3483 
3484         if (ret != sp->len)
3485                 req_set_fail_links(req);
3486         io_req_complete(req, ret);
3487         return 0;
3488 }
3489 
3490 /*
3491  * IORING_OP_NOP just posts a completion event, nothing else.
3492  */
3493 static int io_nop(struct io_kiocb *req, struct io_comp_state *cs)
3494 {
3495         struct io_ring_ctx *ctx = req->ctx;
3496 
3497         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
3498                 return -EINVAL;
3499 
3500         __io_req_complete(req, 0, 0, cs);
3501         return 0;
3502 }
3503 
3504 static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3505 {
3506         struct io_ring_ctx *ctx = req->ctx;
3507 
3508         if (!req->file)
3509                 return -EBADF;
3510 
3511         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
3512                 return -EINVAL;
3513         if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
3514                 return -EINVAL;
3515 
3516         req->sync.flags = READ_ONCE(sqe->fsync_flags);
3517         if (unlikely(req->sync.flags & ~IORING_FSYNC_DATASYNC))
3518                 return -EINVAL;
3519 
3520         req->sync.off = READ_ONCE(sqe->off);
3521         req->sync.len = READ_ONCE(sqe->len);
3522         return 0;
3523 }
3524 
3525 static int io_fsync(struct io_kiocb *req, bool force_nonblock)
3526 {
3527         loff_t end = req->sync.off + req->sync.len;
3528         int ret;
3529 
3530         /* fsync always requires a blocking context */
3531         if (force_nonblock)
3532                 return -EAGAIN;
3533 
3534         ret = vfs_fsync_range(req->file, req->sync.off,
3535                                 end > 0 ? end : LLONG_MAX,
3536                                 req->sync.flags & IORING_FSYNC_DATASYNC);
3537         if (ret < 0)
3538                 req_set_fail_links(req);
3539         io_req_complete(req, ret);
3540         return 0;
3541 }
3542 
3543 static int io_fallocate_prep(struct io_kiocb *req,
3544                              const struct io_uring_sqe *sqe)
3545 {
3546         if (sqe->ioprio || sqe->buf_index || sqe->rw_flags)
3547                 return -EINVAL;
3548         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3549                 return -EINVAL;
3550 
3551         req->sync.off = READ_ONCE(sqe->off);
3552         req->sync.len = READ_ONCE(sqe->addr);
3553         req->sync.mode = READ_ONCE(sqe->len);
3554         return 0;
3555 }
3556 
3557 static int io_fallocate(struct io_kiocb *req, bool force_nonblock)
3558 {
3559         int ret;
3560 
3561         /* fallocate always requiring blocking context */
3562         if (force_nonblock)
3563                 return -EAGAIN;
3564         ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
3565                                 req->sync.len);
3566         if (ret < 0)
3567                 req_set_fail_links(req);
3568         io_req_complete(req, ret);
3569         return 0;
3570 }
3571 
3572 static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3573 {
3574         const char __user *fname;
3575         int ret;
3576 
3577         if (unlikely(sqe->ioprio || sqe->buf_index))
3578                 return -EINVAL;
3579         if (unlikely(req->flags & REQ_F_FIXED_FILE))
3580                 return -EBADF;
3581 
3582         /* open.how should be already initialised */
3583         if (!(req->open.how.flags & O_PATH) && force_o_largefile())
3584                 req->open.how.flags |= O_LARGEFILE;
3585 
3586         req->open.dfd = READ_ONCE(sqe->fd);
3587         fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
3588         req->open.filename = getname(fname);
3589         if (IS_ERR(req->open.filename)) {
3590                 ret = PTR_ERR(req->open.filename);
3591                 req->open.filename = NULL;
3592                 return ret;
3593         }
3594         req->open.nofile = rlimit(RLIMIT_NOFILE);
3595         req->open.ignore_nonblock = false;
3596         req->flags |= REQ_F_NEED_CLEANUP;
3597         return 0;
3598 }
3599 
3600 static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3601 {
3602         u64 flags, mode;
3603 
3604         if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
3605                 return -EINVAL;
3606         if (req->flags & REQ_F_NEED_CLEANUP)
3607                 return 0;
3608         mode = READ_ONCE(sqe->len);
3609         flags = READ_ONCE(sqe->open_flags);
3610         req->open.how = build_open_how(flags, mode);
3611         return __io_openat_prep(req, sqe);
3612 }
3613 
3614 static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3615 {
3616         struct open_how __user *how;
3617         size_t len;
3618         int ret;
3619 
3620         if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
3621                 return -EINVAL;
3622         if (req->flags & REQ_F_NEED_CLEANUP)
3623                 return 0;
3624         how = u64_to_user_ptr(READ_ONCE(sqe->addr2));
3625         len = READ_ONCE(sqe->len);
3626         if (len < OPEN_HOW_SIZE_VER0)
3627                 return -EINVAL;
3628 
3629         ret = copy_struct_from_user(&req->open.how, sizeof(req->open.how), how,
3630                                         len);
3631         if (ret)
3632                 return ret;
3633 
3634         return __io_openat_prep(req, sqe);
3635 }
3636 
3637 static int io_openat2(struct io_kiocb *req, bool force_nonblock)
3638 {
3639         struct open_flags op;
3640         struct file *file;
3641         int ret;
3642 
3643         if (force_nonblock && !req->open.ignore_nonblock)
3644                 return -EAGAIN;
3645 
3646         ret = build_open_flags(&req->open.how, &op);
3647         if (ret)
3648                 goto err;
3649 
3650         ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile);
3651         if (ret < 0)
3652                 goto err;
3653 
3654         file = do_filp_open(req->open.dfd, req->open.filename, &op);
3655         if (IS_ERR(file)) {
3656                 put_unused_fd(ret);
3657                 ret = PTR_ERR(file);
3658                 /*
3659                  * A work-around to ensure that /proc/self works that way
3660                  * that it should - if we get -EOPNOTSUPP back, then assume
3661                  * that proc_self_get_link() failed us because we're in async
3662                  * context. We should be safe to retry this from the task
3663                  * itself with force_nonblock == false set, as it should not
3664                  * block on lookup. Would be nice to know this upfront and
3665                  * avoid the async dance, but doesn't seem feasible.
3666                  */
3667                 if (ret == -EOPNOTSUPP && io_wq_current_is_worker()) {
3668                         req->open.ignore_nonblock = true;
3669                         refcount_inc(&req->refs);
3670                         io_req_task_queue(req);
3671                         return 0;
3672                 }
3673         } else {
3674                 fsnotify_open(file);
3675                 fd_install(ret, file);
3676         }
3677 err:
3678         putname(req->open.filename);
3679         req->flags &= ~REQ_F_NEED_CLEANUP;
3680         if (ret < 0)
3681                 req_set_fail_links(req);
3682         io_req_complete(req, ret);
3683         return 0;
3684 }
3685 
3686 static int io_openat(struct io_kiocb *req, bool force_nonblock)
3687 {
3688         return io_openat2(req, force_nonblock);
3689 }
3690 
3691 static int io_remove_buffers_prep(struct io_kiocb *req,
3692                                   const struct io_uring_sqe *sqe)
3693 {
3694         struct io_provide_buf *p = &req->pbuf;
3695         u64 tmp;
3696 
3697         if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off)
3698                 return -EINVAL;
3699 
3700         tmp = READ_ONCE(sqe->fd);
3701         if (!tmp || tmp > USHRT_MAX)
3702                 return -EINVAL;
3703 
3704         memset(p, 0, sizeof(*p));
3705         p->nbufs = tmp;
3706         p->bgid = READ_ONCE(sqe->buf_group);
3707         return 0;
3708 }
3709 
3710 static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf,
3711                                int bgid, unsigned nbufs)
3712 {
3713         unsigned i = 0;
3714 
3715         /* shouldn't happen */
3716         if (!nbufs)
3717                 return 0;
3718 
3719         /* the head kbuf is the list itself */
3720         while (!list_empty(&buf->list)) {
3721                 struct io_buffer *nxt;
3722 
3723                 nxt = list_first_entry(&buf->list, struct io_buffer, list);
3724                 list_del(&nxt->list);
3725                 kfree(nxt);
3726                 if (++i == nbufs)
3727                         return i;
3728         }
3729         i++;
3730         kfree(buf);
3731         idr_remove(&ctx->io_buffer_idr, bgid);
3732 
3733         return i;
3734 }
3735 
3736 static int io_remove_buffers(struct io_kiocb *req, bool force_nonblock,
3737                              struct io_comp_state *cs)
3738 {
3739         struct io_provide_buf *p = &req->pbuf;
3740         struct io_ring_ctx *ctx = req->ctx;
3741         struct io_buffer *head;
3742         int ret = 0;
3743 
3744         io_ring_submit_lock(ctx, !force_nonblock);
3745 
3746         lockdep_assert_held(&ctx->uring_lock);
3747 
3748         ret = -ENOENT;
3749         head = idr_find(&ctx->io_buffer_idr, p->bgid);
3750         if (head)
3751                 ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs);
3752 
3753         io_ring_submit_lock(ctx, !force_nonblock);
3754         if (ret < 0)
3755                 req_set_fail_links(req);
3756         __io_req_complete(req, ret, 0, cs);
3757         return 0;
3758 }
3759 
3760 static int io_provide_buffers_prep(struct io_kiocb *req,
3761                                    const struct io_uring_sqe *sqe)
3762 {
3763         struct io_provide_buf *p = &req->pbuf;
3764         u64 tmp;
3765 
3766         if (sqe->ioprio || sqe->rw_flags)
3767                 return -EINVAL;
3768 
3769         tmp = READ_ONCE(sqe->fd);
3770         if (!tmp || tmp > USHRT_MAX)
3771                 return -E2BIG;
3772         p->nbufs = tmp;
3773         p->addr = READ_ONCE(sqe->addr);
3774         p->len = READ_ONCE(sqe->len);
3775 
3776         if (!access_ok(u64_to_user_ptr(p->addr), (p->len * p->nbufs)))
3777                 return -EFAULT;
3778 
3779         p->bgid = READ_ONCE(sqe->buf_group);
3780         tmp = READ_ONCE(sqe->off);
3781         if (tmp > USHRT_MAX)
3782                 return -E2BIG;
3783         p->bid = tmp;
3784         return 0;
3785 }
3786 
3787 static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head)
3788 {
3789         struct io_buffer *buf;
3790         u64 addr = pbuf->addr;
3791         int i, bid = pbuf->bid;
3792 
3793         for (i = 0; i < pbuf->nbufs; i++) {
3794                 buf = kmalloc(sizeof(*buf), GFP_KERNEL);
3795                 if (!buf)
3796                         break;
3797 
3798                 buf->addr = addr;
3799                 buf->len = pbuf->len;
3800                 buf->bid = bid;
3801                 addr += pbuf->len;
3802                 bid++;
3803                 if (!*head) {
3804                         INIT_LIST_HEAD(&buf->list);
3805                         *head = buf;
3806                 } else {
3807                         list_add_tail(&buf->list, &(*head)->list);
3808                 }
3809         }
3810 
3811         return i ? i : -ENOMEM;
3812 }
3813 
3814 static int io_provide_buffers(struct io_kiocb *req, bool force_nonblock,
3815                               struct io_comp_state *cs)
3816 {
3817         struct io_provide_buf *p = &req->pbuf;
3818         struct io_ring_ctx *ctx = req->ctx;
3819         struct io_buffer *head, *list;
3820         int ret = 0;
3821 
3822         io_ring_submit_lock(ctx, !force_nonblock);
3823 
3824         lockdep_assert_held(&ctx->uring_lock);
3825 
3826         list = head = idr_find(&ctx->io_buffer_idr, p->bgid);
3827 
3828         ret = io_add_buffers(p, &head);
3829         if (ret < 0)
3830                 goto out;
3831 
3832         if (!list) {
3833                 ret = idr_alloc(&ctx->io_buffer_idr, head, p->bgid, p->bgid + 1,
3834                                         GFP_KERNEL);
3835                 if (ret < 0) {
3836                         __io_remove_buffers(ctx, head, p->bgid, -1U);
3837                         goto out;
3838                 }
3839         }
3840 out:
3841         io_ring_submit_unlock(ctx, !force_nonblock);
3842         if (ret < 0)
3843                 req_set_fail_links(req);
3844         __io_req_complete(req, ret, 0, cs);
3845         return 0;
3846 }
3847 
3848 static int io_epoll_ctl_prep(struct io_kiocb *req,
3849                              const struct io_uring_sqe *sqe)
3850 {
3851 #if defined(CONFIG_EPOLL)
3852         if (sqe->ioprio || sqe->buf_index)
3853                 return -EINVAL;
3854         if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL)))
3855                 return -EINVAL;
3856 
3857         req->epoll.epfd = READ_ONCE(sqe->fd);
3858         req->epoll.op = READ_ONCE(sqe->len);
3859         req->epoll.fd = READ_ONCE(sqe->off);
3860 
3861         if (ep_op_has_event(req->epoll.op)) {
3862                 struct epoll_event __user *ev;
3863 
3864                 ev = u64_to_user_ptr(READ_ONCE(sqe->addr));
3865                 if (copy_from_user(&req->epoll.event, ev, sizeof(*ev)))
3866                         return -EFAULT;
3867         }
3868 
3869         return 0;
3870 #else
3871         return -EOPNOTSUPP;
3872 #endif
3873 }
3874 
3875 static int io_epoll_ctl(struct io_kiocb *req, bool force_nonblock,
3876                         struct io_comp_state *cs)
3877 {
3878 #if defined(CONFIG_EPOLL)
3879         struct io_epoll *ie = &req->epoll;
3880         int ret;
3881 
3882         ret = do_epoll_ctl(ie->epfd, ie->op, ie->fd, &ie->event, force_nonblock);
3883         if (force_nonblock && ret == -EAGAIN)
3884                 return -EAGAIN;
3885 
3886         if (ret < 0)
3887                 req_set_fail_links(req);
3888         __io_req_complete(req, ret, 0, cs);
3889         return 0;
3890 #else
3891         return -EOPNOTSUPP;
3892 #endif
3893 }
3894 
3895 static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3896 {
3897 #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
3898         if (sqe->ioprio || sqe->buf_index || sqe->off)
3899                 return -EINVAL;
3900         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3901                 return -EINVAL;
3902 
3903         req->madvise.addr = READ_ONCE(sqe->addr);
3904         req->madvise.len = READ_ONCE(sqe->len);
3905         req->madvise.advice = READ_ONCE(sqe->fadvise_advice);
3906         return 0;
3907 #else
3908         return -EOPNOTSUPP;
3909 #endif
3910 }
3911 
3912 static int io_madvise(struct io_kiocb *req, bool force_nonblock)
3913 {
3914 #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
3915         struct io_madvise *ma = &req->madvise;
3916         int ret;
3917 
3918         if (force_nonblock)
3919                 return -EAGAIN;
3920 
3921         ret = do_madvise(ma->addr, ma->len, ma->advice);
3922         if (ret < 0)
3923                 req_set_fail_links(req);
3924         io_req_complete(req, ret);
3925         return 0;
3926 #else
3927         return -EOPNOTSUPP;
3928 #endif
3929 }
3930 
3931 static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3932 {
3933         if (sqe->ioprio || sqe->buf_index || sqe->addr)
3934                 return -EINVAL;
3935         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3936                 return -EINVAL;
3937 
3938         req->fadvise.offset = READ_ONCE(sqe->off);
3939         req->fadvise.len = READ_ONCE(sqe->len);
3940         req->fadvise.advice = READ_ONCE(sqe->fadvise_advice);
3941         return 0;
3942 }
3943 
3944 static int io_fadvise(struct io_kiocb *req, bool force_nonblock)
3945 {
3946         struct io_fadvise *fa = &req->fadvise;
3947         int ret;
3948 
3949         if (force_nonblock) {
3950                 switch (fa->advice) {
3951                 case POSIX_FADV_NORMAL:
3952                 case POSIX_FADV_RANDOM:
3953                 case POSIX_FADV_SEQUENTIAL:
3954                         break;
3955                 default:
3956                         return -EAGAIN;
3957                 }
3958         }
3959 
3960         ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice);
3961         if (ret < 0)
3962                 req_set_fail_links(req);
3963         io_req_complete(req, ret);
3964         return 0;
3965 }
3966 
3967 static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3968 {
3969         if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL)))
3970                 return -EINVAL;
3971         if (sqe->ioprio || sqe->buf_index)
3972                 return -EINVAL;
3973         if (req->flags & REQ_F_FIXED_FILE)
3974                 return -EBADF;
3975 
3976         req->statx.dfd = READ_ONCE(sqe->fd);
3977         req->statx.mask = READ_ONCE(sqe->len);
3978         req->statx.filename = u64_to_user_ptr(READ_ONCE(sqe->addr));
3979         req->statx.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2));
3980         req->statx.flags = READ_ONCE(sqe->statx_flags);
3981 
3982         return 0;
3983 }
3984 
3985 static int io_statx(struct io_kiocb *req, bool force_nonblock)
3986 {
3987         struct io_statx *ctx = &req->statx;
3988         int ret;
3989 
3990         if (force_nonblock) {
3991                 /* only need file table for an actual valid fd */
3992                 if (ctx->dfd == -1 || ctx->dfd == AT_FDCWD)
3993                         req->flags |= REQ_F_NO_FILE_TABLE;
3994                 return -EAGAIN;
3995         }
3996 
3997         ret = do_statx(ctx->dfd, ctx->filename, ctx->flags, ctx->mask,
3998                        ctx->buffer);
3999 
4000         if (ret < 0)
4001                 req_set_fail_links(req);
4002         io_req_complete(req, ret);
4003         return 0;
4004 }
4005 
4006 static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4007 {
4008         /*
4009          * If we queue this for async, it must not be cancellable. That would
4010          * leave the 'file' in an undeterminate state, and here need to modify
4011          * io_wq_work.flags, so initialize io_wq_work firstly.
4012          */
4013         io_req_init_async(req);
4014         req->work.flags |= IO_WQ_WORK_NO_CANCEL;
4015 
4016         if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
4017                 return -EINVAL;
4018         if (sqe->ioprio || sqe->off || sqe->addr || sqe->len ||
4019             sqe->rw_flags || sqe->buf_index)
4020                 return -EINVAL;
4021         if (req->flags & REQ_F_FIXED_FILE)
4022                 return -EBADF;
4023 
4024         req->close.fd = READ_ONCE(sqe->fd);
4025         if ((req->file && req->file->f_op == &io_uring_fops))
4026                 return -EBADF;
4027 
4028         req->close.put_file = NULL;
4029         return 0;
4030 }
4031 
4032 static int io_close(struct io_kiocb *req, bool force_nonblock,
4033                     struct io_comp_state *cs)
4034 {
4035         struct io_close *close = &req->close;
4036         int ret;
4037 
4038         /* might be already done during nonblock submission */
4039         if (!close->put_file) {
4040                 ret = __close_fd_get_file(close->fd, &close->put_file);
4041                 if (ret < 0)
4042                         return (ret == -ENOENT) ? -EBADF : ret;
4043         }
4044 
4045         /* if the file has a flush method, be safe and punt to async */
4046         if (close->put_file->f_op->flush && force_nonblock) {
4047                 /* was never set, but play safe */
4048                 req->flags &= ~REQ_F_NOWAIT;
4049                 /* avoid grabbing files - we don't need the files */
4050                 req->flags |= REQ_F_NO_FILE_TABLE;
4051                 return -EAGAIN;
4052         }
4053 
4054         /* No ->flush() or already async, safely close from here */
4055         ret = filp_close(close->put_file, req->work.files);
4056         if (ret < 0)
4057                 req_set_fail_links(req);
4058         fput(close->put_file);
4059         close->put_file = NULL;
4060         __io_req_complete(req, ret, 0, cs);
4061         return 0;
4062 }
4063 
4064 static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4065 {
4066         struct io_ring_ctx *ctx = req->ctx;
4067 
4068         if (!req->file)
4069                 return -EBADF;
4070 
4071         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
4072                 return -EINVAL;
4073         if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
4074                 return -EINVAL;
4075 
4076         req->sync.off = READ_ONCE(sqe->off);
4077         req->sync.len = READ_ONCE(sqe->len);
4078         req->sync.flags = READ_ONCE(sqe->sync_range_flags);
4079         return 0;
4080 }
4081 
4082 static int io_sync_file_range(struct io_kiocb *req, bool force_nonblock)
4083 {
4084         int ret;
4085 
4086         /* sync_file_range always requires a blocking context */
4087         if (force_nonblock)
4088                 return -EAGAIN;
4089 
4090         ret = sync_file_range(req->file, req->sync.off, req->sync.len,
4091                                 req->sync.flags);
4092         if (ret < 0)
4093                 req_set_fail_links(req);
4094         io_req_complete(req, ret);
4095         return 0;
4096 }
4097 
4098 #if defined(CONFIG_NET)
4099 static int io_setup_async_msg(struct io_kiocb *req,
4100                               struct io_async_msghdr *kmsg)
4101 {
4102         if (req->io)
4103                 return -EAGAIN;
4104         if (io_alloc_async_ctx(req)) {
4105                 if (kmsg->iov != kmsg->fast_iov)
4106                         kfree(kmsg->iov);
4107                 return -ENOMEM;
4108         }
4109         req->flags |= REQ_F_NEED_CLEANUP;
4110         memcpy(&req->io->msg, kmsg, sizeof(*kmsg));
4111         return -EAGAIN;
4112 }
4113 
4114 static int io_sendmsg_copy_hdr(struct io_kiocb *req,
4115                                struct io_async_msghdr *iomsg)
4116 {
4117         iomsg->iov = iomsg->fast_iov;
4118         iomsg->msg.msg_name = &iomsg->addr;
4119         return sendmsg_copy_msghdr(&iomsg->msg, req->sr_msg.umsg,
4120                                    req->sr_msg.msg_flags, &iomsg->iov);
4121 }
4122 
4123 static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4124 {
4125         struct io_sr_msg *sr = &req->sr_msg;
4126         struct io_async_ctx *io = req->io;
4127         int ret;
4128 
4129         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4130                 return -EINVAL;
4131 
4132         sr->msg_flags = READ_ONCE(sqe->msg_flags);
4133         sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
4134         sr->len = READ_ONCE(sqe->len);
4135 
4136 #ifdef CONFIG_COMPAT
4137         if (req->ctx->compat)
4138                 sr->msg_flags |= MSG_CMSG_COMPAT;
4139 #endif
4140 
4141         if (!io || req->opcode == IORING_OP_SEND)
4142                 return 0;
4143         /* iovec is already imported */
4144         if (req->flags & REQ_F_NEED_CLEANUP)
4145                 return 0;
4146 
4147         ret = io_sendmsg_copy_hdr(req, &io->msg);
4148         if (!ret)
4149                 req->flags |= REQ_F_NEED_CLEANUP;
4150         return ret;
4151 }
4152 
4153 static int io_sendmsg(struct io_kiocb *req, bool force_nonblock,
4154                       struct io_comp_state *cs)
4155 {
4156         struct io_async_msghdr iomsg, *kmsg;
4157         struct socket *sock;
4158         unsigned flags;
4159         int ret;
4160 
4161         sock = sock_from_file(req->file, &ret);
4162         if (unlikely(!sock))
4163                 return ret;
4164 
4165         if (req->io) {
4166                 kmsg = &req->io->msg;
4167                 kmsg->msg.msg_name = &req->io->msg.addr;
4168                 /* if iov is set, it's allocated already */
4169                 if (!kmsg->iov)
4170                         kmsg->iov = kmsg->fast_iov;
4171                 kmsg->msg.msg_iter.iov = kmsg->iov;
4172         } else {
4173                 ret = io_sendmsg_copy_hdr(req, &iomsg);
4174                 if (ret)
4175                         return ret;
4176                 kmsg = &iomsg;
4177         }
4178 
4179         flags = req->sr_msg.msg_flags;
4180         if (flags & MSG_DONTWAIT)
4181                 req->flags |= REQ_F_NOWAIT;
4182         else if (force_nonblock)
4183                 flags |= MSG_DONTWAIT;
4184 
4185         ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
4186         if (force_nonblock && ret == -EAGAIN)
4187                 return io_setup_async_msg(req, kmsg);
4188         if (ret == -ERESTARTSYS)
4189                 ret = -EINTR;
4190 
4191         if (kmsg->iov != kmsg->fast_iov)
4192                 kfree(kmsg->iov);
4193         req->flags &= ~REQ_F_NEED_CLEANUP;
4194         if (ret < 0)
4195                 req_set_fail_links(req);
4196         __io_req_complete(req, ret, 0, cs);
4197         return 0;
4198 }
4199 
4200 static int io_send(struct io_kiocb *req, bool force_nonblock,
4201                    struct io_comp_state *cs)
4202 {
4203         struct io_sr_msg *sr = &req->sr_msg;
4204         struct msghdr msg;
4205         struct iovec iov;
4206         struct socket *sock;
4207         unsigned flags;
4208         int ret;
4209 
4210         sock = sock_from_file(req->file, &ret);
4211         if (unlikely(!sock))
4212                 return ret;
4213 
4214         ret = import_single_range(WRITE, sr->buf, sr->len, &iov, &msg.msg_iter);
4215         if (unlikely(ret))
4216                 return ret;;
4217 
4218         msg.msg_name = NULL;
4219         msg.msg_control = NULL;
4220         msg.msg_controllen = 0;
4221         msg.msg_namelen = 0;
4222 
4223         flags = req->sr_msg.msg_flags;
4224         if (flags & MSG_DONTWAIT)
4225                 req->flags |= REQ_F_NOWAIT;
4226         else if (force_nonblock)
4227                 flags |= MSG_DONTWAIT;
4228 
4229         msg.msg_flags = flags;
4230         ret = sock_sendmsg(sock, &msg);
4231         if (force_nonblock && ret == -EAGAIN)
4232                 return -EAGAIN;
4233         if (ret == -ERESTARTSYS)
4234                 ret = -EINTR;
4235 
4236         if (ret < 0)
4237                 req_set_fail_links(req);
4238         __io_req_complete(req, ret, 0, cs);
4239         return 0;
4240 }
4241 
4242 static int __io_recvmsg_copy_hdr(struct io_kiocb *req,
4243                                  struct io_async_msghdr *iomsg)
4244 {
4245         struct io_sr_msg *sr = &req->sr_msg;
4246         struct iovec __user *uiov;
4247         size_t iov_len;
4248         int ret;
4249 
4250         ret = __copy_msghdr_from_user(&iomsg->msg, sr->umsg,
4251                                         &iomsg->uaddr, &uiov, &iov_len);
4252         if (ret)
4253                 return ret;
4254 
4255         if (req->flags & REQ_F_BUFFER_SELECT) {
4256                 if (iov_len > 1)
4257                         return -EINVAL;
4258                 if (copy_from_user(iomsg->iov, uiov, sizeof(*uiov)))
4259                         return -EFAULT;
4260                 sr->len = iomsg->iov[0].iov_len;
4261                 iov_iter_init(&iomsg->msg.msg_iter, READ, iomsg->iov, 1,
4262                                 sr->len);
4263                 iomsg->iov = NULL;
4264         } else {
4265                 ret = import_iovec(READ, uiov, iov_len, UIO_FASTIOV,
4266                                         &iomsg->iov, &iomsg->msg.msg_iter);
4267                 if (ret > 0)
4268                         ret = 0;
4269         }
4270 
4271         return ret;
4272 }
4273 
4274 #ifdef CONFIG_COMPAT
4275 static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
4276                                         struct io_async_msghdr *iomsg)
4277 {
4278         struct compat_msghdr __user *msg_compat;
4279         struct io_sr_msg *sr = &req->sr_msg;
4280         struct compat_iovec __user *uiov;
4281         compat_uptr_t ptr;
4282         compat_size_t len;
4283         int ret;
4284 
4285         msg_compat = (struct compat_msghdr __user *) sr->umsg;
4286         ret = __get_compat_msghdr(&iomsg->msg, msg_compat, &iomsg->uaddr,
4287                                         &ptr, &len);
4288         if (ret)
4289                 return ret;
4290 
4291         uiov = compat_ptr(ptr);
4292         if (req->flags & REQ_F_BUFFER_SELECT) {
4293                 compat_ssize_t clen;
4294 
4295                 if (len > 1)
4296                         return -EINVAL;
4297                 if (!access_ok(uiov, sizeof(*uiov)))
4298                         return -EFAULT;
4299                 if (__get_user(clen, &uiov->iov_len))
4300                         return -EFAULT;
4301                 if (clen < 0)
4302                         return -EINVAL;
4303                 sr->len = clen;
4304                 iomsg->iov[0].iov_len = clen;
4305                 iomsg->iov = NULL;
4306         } else {
4307                 ret = compat_import_iovec(READ, uiov, len, UIO_FASTIOV,
4308                                                 &iomsg->iov,
4309                                                 &iomsg->msg.msg_iter);
4310                 if (ret < 0)
4311                         return ret;
4312         }
4313 
4314         return 0;
4315 }
4316 #endif
4317 
4318 static int io_recvmsg_copy_hdr(struct io_kiocb *req,
4319                                struct io_async_msghdr *iomsg)
4320 {
4321         iomsg->msg.msg_name = &iomsg->addr;
4322         iomsg->iov = iomsg->fast_iov;
4323 
4324 #ifdef CONFIG_COMPAT
4325         if (req->ctx->compat)
4326                 return __io_compat_recvmsg_copy_hdr(req, iomsg);
4327 #endif
4328 
4329         return __io_recvmsg_copy_hdr(req, iomsg);
4330 }
4331 
4332 static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req,
4333                                                bool needs_lock)
4334 {
4335         struct io_sr_msg *sr = &req->sr_msg;
4336         struct io_buffer *kbuf;
4337 
4338         kbuf = io_buffer_select(req, &sr->len, sr->bgid, sr->kbuf, needs_lock);
4339         if (IS_ERR(kbuf))
4340                 return kbuf;
4341 
4342         sr->kbuf = kbuf;
4343         req->flags |= REQ_F_BUFFER_SELECTED;
4344         return kbuf;
4345 }
4346 
4347 static inline unsigned int io_put_recv_kbuf(struct io_kiocb *req)
4348 {
4349         return io_put_kbuf(req, req->sr_msg.kbuf);
4350 }
4351 
4352 static int io_recvmsg_prep(struct io_kiocb *req,
4353                            const struct io_uring_sqe *sqe)
4354 {
4355         struct io_sr_msg *sr = &req->sr_msg;
4356         struct io_async_ctx *io = req->io;
4357         int ret;
4358 
4359         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4360                 return -EINVAL;
4361 
4362         sr->msg_flags = READ_ONCE(sqe->msg_flags);
4363         sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
4364         sr->len = READ_ONCE(sqe->len);
4365         sr->bgid = READ_ONCE(sqe->buf_group);
4366 
4367 #ifdef CONFIG_COMPAT
4368         if (req->ctx->compat)
4369                 sr->msg_flags |= MSG_CMSG_COMPAT;
4370 #endif
4371 
4372         if (!io || req->opcode == IORING_OP_RECV)
4373                 return 0;
4374         /* iovec is already imported */
4375         if (req->flags & REQ_F_NEED_CLEANUP)
4376                 return 0;
4377 
4378         ret = io_recvmsg_copy_hdr(req, &io->msg);
4379         if (!ret)
4380                 req->flags |= REQ_F_NEED_CLEANUP;
4381         return ret;
4382 }
4383 
4384 static int io_recvmsg(struct io_kiocb *req, bool force_nonblock,
4385                       struct io_comp_state *cs)
4386 {
4387         struct io_async_msghdr iomsg, *kmsg;
4388         struct socket *sock;
4389         struct io_buffer *kbuf;
4390         unsigned flags;
4391         int ret, cflags = 0;
4392 
4393         sock = sock_from_file(req->file, &ret);
4394         if (unlikely(!sock))
4395                 return ret;
4396 
4397         if (req->io) {
4398                 kmsg = &req->io->msg;
4399                 kmsg->msg.msg_name = &req->io->msg.addr;
4400                 /* if iov is set, it's allocated already */
4401                 if (!kmsg->iov)
4402                         kmsg->iov = kmsg->fast_iov;
4403                 kmsg->msg.msg_iter.iov = kmsg->iov;
4404         } else {
4405                 ret = io_recvmsg_copy_hdr(req, &iomsg);
4406                 if (ret)
4407                         return ret;
4408                 kmsg = &iomsg;
4409         }
4410 
4411         if (req->flags & REQ_F_BUFFER_SELECT) {
4412                 kbuf = io_recv_buffer_select(req, !force_nonblock);
4413                 if (IS_ERR(kbuf))
4414                         return PTR_ERR(kbuf);
4415                 kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
4416                 iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->iov,
4417                                 1, req->sr_msg.len);
4418         }
4419 
4420         flags = req->sr_msg.msg_flags;
4421         if (flags & MSG_DONTWAIT)
4422                 req->flags |= REQ_F_NOWAIT;
4423         else if (force_nonblock)
4424                 flags |= MSG_DONTWAIT;
4425 
4426         ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg,
4427                                         kmsg->uaddr, flags);
4428         if (force_nonblock && ret == -EAGAIN)
4429                 return io_setup_async_msg(req, kmsg);
4430         if (ret == -ERESTARTSYS)
4431                 ret = -EINTR;
4432 
4433         if (req->flags & REQ_F_BUFFER_SELECTED)
4434                 cflags = io_put_recv_kbuf(req);
4435         if (kmsg->iov != kmsg->fast_iov)
4436                 kfree(kmsg->iov);
4437         req->flags &= ~REQ_F_NEED_CLEANUP;
4438         if (ret < 0)
4439                 req_set_fail_links(req);
4440         __io_req_complete(req, ret, cflags, cs);
4441         return 0;
4442 }
4443 
4444 static int io_recv(struct io_kiocb *req, bool force_nonblock,
4445                    struct io_comp_state *cs)
4446 {
4447         struct io_buffer *kbuf;
4448         struct io_sr_msg *sr = &req->sr_msg;
4449         struct msghdr msg;
4450         void __user *buf = sr->buf;
4451         struct socket *sock;
4452         struct iovec iov;
4453         unsigned flags;
4454         int ret, cflags = 0;
4455 
4456         sock = sock_from_file(req->file, &ret);
4457         if (unlikely(!sock))
4458                 return ret;
4459 
4460         if (req->flags & REQ_F_BUFFER_SELECT) {
4461                 kbuf = io_recv_buffer_select(req, !force_nonblock);
4462                 if (IS_ERR(kbuf))
4463                         return PTR_ERR(kbuf);
4464                 buf = u64_to_user_ptr(kbuf->addr);
4465         }
4466 
4467         ret = import_single_range(READ, buf, sr->len, &iov, &msg.msg_iter);
4468         if (unlikely(ret))
4469                 goto out_free;
4470 
4471         msg.msg_name = NULL;
4472         msg.msg_control = NULL;
4473         msg.msg_controllen = 0;
4474         msg.msg_namelen = 0;
4475         msg.msg_iocb = NULL;
4476         msg.msg_flags = 0;
4477 
4478         flags = req->sr_msg.msg_flags;
4479         if (flags & MSG_DONTWAIT)
4480                 req->flags |= REQ_F_NOWAIT;
4481         else if (force_nonblock)
4482                 flags |= MSG_DONTWAIT;
4483 
4484         ret = sock_recvmsg(sock, &msg, flags);
4485         if (force_nonblock && ret == -EAGAIN)
4486                 return -EAGAIN;
4487         if (ret == -ERESTARTSYS)
4488                 ret = -EINTR;
4489 out_free:
4490         if (req->flags & REQ_F_BUFFER_SELECTED)
4491                 cflags = io_put_recv_kbuf(req);
4492         if (ret < 0)
4493                 req_set_fail_links(req);
4494         __io_req_complete(req, ret, cflags, cs);
4495         return 0;
4496 }
4497 
4498 static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4499 {
4500         struct io_accept *accept = &req->accept;
4501 
4502         if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
4503                 return -EINVAL;
4504         if (sqe->ioprio || sqe->len || sqe->buf_index)
4505                 return -EINVAL;
4506 
4507         accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
4508         accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
4509         accept->flags = READ_ONCE(sqe->accept_flags);
4510         accept->nofile = rlimit(RLIMIT_NOFILE);
4511         return 0;
4512 }
4513 
4514 static int io_accept(struct io_kiocb *req, bool force_nonblock,
4515                      struct io_comp_state *cs)
4516 {
4517         struct io_accept *accept = &req->accept;
4518         unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0;
4519         int ret;
4520 
4521         if (req->file->f_flags & O_NONBLOCK)
4522                 req->flags |= REQ_F_NOWAIT;
4523 
4524         ret = __sys_accept4_file(req->file, file_flags, accept->addr,
4525                                         accept->addr_len, accept->flags,
4526                                         accept->nofile);
4527         if (ret == -EAGAIN && force_nonblock)
4528                 return -EAGAIN;
4529         if (ret < 0) {
4530                 if (ret == -ERESTARTSYS)
4531                         ret = -EINTR;
4532                 req_set_fail_links(req);
4533         }
4534         __io_req_complete(req, ret, 0, cs);
4535         return 0;
4536 }
4537 
4538 static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4539 {
4540         struct io_connect *conn = &req->connect;
4541         struct io_async_ctx *io = req->io;
4542 
4543         if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
4544                 return -EINVAL;
4545         if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags)
4546                 return -EINVAL;
4547 
4548         conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
4549         conn->addr_len =  READ_ONCE(sqe->addr2);
4550 
4551         if (!io)
4552                 return 0;
4553 
4554         return move_addr_to_kernel(conn->addr, conn->addr_len,
4555                                         &io->connect.address);
4556 }
4557 
4558 static int io_connect(struct io_kiocb *req, bool force_nonblock,
4559                       struct io_comp_state *cs)
4560 {
4561         struct io_async_ctx __io, *io;
4562         unsigned file_flags;
4563         int ret;
4564 
4565         if (req->io) {
4566                 io = req->io;
4567         } else {
4568                 ret = move_addr_to_kernel(req->connect.addr,
4569                                                 req->connect.addr_len,
4570                                                 &__io.connect.address);
4571                 if (ret)
4572                         goto out;
4573                 io = &__io;
4574         }
4575 
4576         file_flags = force_nonblock ? O_NONBLOCK : 0;
4577 
4578         ret = __sys_connect_file(req->file, &io->connect.address,
4579                                         req->connect.addr_len, file_flags);
4580         if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) {
4581                 if (req->io)
4582                         return -EAGAIN;
4583                 if (io_alloc_async_ctx(req)) {
4584                         ret = -ENOMEM;
4585                         goto out;
4586                 }
4587                 memcpy(&req->io->connect, &__io.connect, sizeof(__io.connect));
4588                 return -EAGAIN;
4589         }
4590         if (ret == -ERESTARTSYS)
4591                 ret = -EINTR;
4592 out:
4593         if (ret < 0)
4594                 req_set_fail_links(req);
4595         __io_req_complete(req, ret, 0, cs);
4596         return 0;
4597 }
4598 #else /* !CONFIG_NET */
4599 static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4600 {
4601         return -EOPNOTSUPP;
4602 }
4603 
4604 static int io_sendmsg(struct io_kiocb *req, bool force_nonblock,
4605                       struct io_comp_state *cs)
4606 {
4607         return -EOPNOTSUPP;
4608 }
4609 
4610 static int io_send(struct io_kiocb *req, bool force_nonblock,
4611                    struct io_comp_state *cs)
4612 {
4613         return -EOPNOTSUPP;
4614 }
4615 
4616 static int io_recvmsg_prep(struct io_kiocb *req,
4617                            const struct io_uring_sqe *sqe)
4618 {
4619         return -EOPNOTSUPP;
4620 }
4621 
4622 static int io_recvmsg(struct io_kiocb *req, bool force_nonblock,
4623                       struct io_comp_state *cs)
4624 {
4625         return -EOPNOTSUPP;
4626 }
4627 
4628 static int io_recv(struct io_kiocb *req, bool force_nonblock,
4629                    struct io_comp_state *cs)
4630 {
4631         return -EOPNOTSUPP;
4632 }
4633 
4634 static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4635 {
4636         return -EOPNOTSUPP;
4637 }
4638 
4639 static int io_accept(struct io_kiocb *req, bool force_nonblock,
4640                      struct io_comp_state *cs)
4641 {
4642         return -EOPNOTSUPP;
4643 }
4644 
4645 static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4646 {
4647         return -EOPNOTSUPP;
4648 }
4649 
4650 static int io_connect(struct io_kiocb *req, bool force_nonblock,
4651                       struct io_comp_state *cs)
4652 {
4653         return -EOPNOTSUPP;
4654 }
4655 #endif /* CONFIG_NET */
4656 
4657 struct io_poll_table {
4658         struct poll_table_struct pt;
4659         struct io_kiocb *req;
4660         int error;
4661 };
4662 
4663 static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
4664                            __poll_t mask, task_work_func_t func)
4665 {
4666         bool twa_signal_ok;
4667         int ret;
4668 
4669         /* for instances that support it check for an event match first: */
4670         if (mask && !(mask & poll->events))
4671                 return 0;
4672 
4673         trace_io_uring_task_add(req->ctx, req->opcode, req->user_data, mask);
4674 
4675         list_del_init(&poll->wait.entry);
4676 
4677         req->result = mask;
4678         init_task_work(&req->task_work, func);
4679         percpu_ref_get(&req->ctx->refs);
4680 
4681         /*
4682          * If we using the signalfd wait_queue_head for this wakeup, then
4683          * it's not safe to use TWA_SIGNAL as we could be recursing on the
4684          * tsk->sighand->siglock on doing the wakeup. Should not be needed
4685          * either, as the normal wakeup will suffice.
4686          */
4687         twa_signal_ok = (poll->head != &req->task->sighand->signalfd_wqh);
4688 
4689         /*
4690          * If this fails, then the task is exiting. When a task exits, the
4691          * work gets canceled, so just cancel this request as well instead
4692          * of executing it. We can't safely execute it anyway, as we may not
4693          * have the needed state needed for it anyway.
4694          */
4695         ret = io_req_task_work_add(req, &req->task_work, twa_signal_ok);
4696         if (unlikely(ret)) {
4697                 struct task_struct *tsk;
4698 
4699                 WRITE_ONCE(poll->canceled, true);
4700                 tsk = io_wq_get_task(req->ctx->io_wq);
4701                 task_work_add(tsk, &req->task_work, 0);
4702                 wake_up_process(tsk);
4703         }
4704         return 1;
4705 }
4706 
4707 static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll)
4708         __acquires(&req->ctx->completion_lock)
4709 {
4710         struct io_ring_ctx *ctx = req->ctx;
4711 
4712         if (!req->result && !READ_ONCE(poll->canceled)) {
4713                 struct poll_table_struct pt = { ._key = poll->events };
4714 
4715                 req->result = vfs_poll(req->file, &pt) & poll->events;
4716         }
4717 
4718         spin_lock_irq(&ctx->completion_lock);
4719         if (!req->result && !READ_ONCE(poll->canceled)) {
4720                 add_wait_queue(poll->head, &poll->wait);
4721                 return true;
4722         }
4723 
4724         return false;
4725 }
4726 
4727 static struct io_poll_iocb *io_poll_get_double(struct io_kiocb *req)
4728 {
4729         /* pure poll stashes this in ->io, poll driven retry elsewhere */
4730         if (req->opcode == IORING_OP_POLL_ADD)
4731                 return (struct io_poll_iocb *) req->io;
4732         return req->apoll->double_poll;
4733 }
4734 
4735 static struct io_poll_iocb *io_poll_get_single(struct io_kiocb *req)
4736 {
4737         if (req->opcode == IORING_OP_POLL_ADD)
4738                 return &req->poll;
4739         return &req->apoll->poll;
4740 }
4741 
4742 static void io_poll_remove_double(struct io_kiocb *req)
4743 {
4744         struct io_poll_iocb *poll = io_poll_get_double(req);
4745 
4746         lockdep_assert_held(&req->ctx->completion_lock);
4747 
4748         if (poll && poll->head) {
4749                 struct wait_queue_head *head = poll->head;
4750 
4751                 spin_lock(&head->lock);
4752                 list_del_init(&poll->wait.entry);
4753                 if (poll->wait.private)
4754                         refcount_dec(&req->refs);
4755                 poll->head = NULL;
4756                 spin_unlock(&head->lock);
4757         }
4758 }
4759 
4760 static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error)
4761 {
4762         struct io_ring_ctx *ctx = req->ctx;
4763 
4764         io_poll_remove_double(req);
4765         req->poll.done = true;
4766         io_cqring_fill_event(req, error ? error : mangle_poll(mask));
4767         io_commit_cqring(ctx);
4768 }
4769 
4770 static void io_poll_task_handler(struct io_kiocb *req, struct io_kiocb **nxt)
4771 {
4772         struct io_ring_ctx *ctx = req->ctx;
4773 
4774         if (io_poll_rewait(req, &req->poll)) {
4775                 spin_unlock_irq(&ctx->completion_lock);
4776                 return;
4777         }
4778 
4779         hash_del(&req->hash_node);
4780         io_poll_complete(req, req->result, 0);
4781         req->flags |= REQ_F_COMP_LOCKED;
4782         *nxt = io_put_req_find_next(req);
4783         spin_unlock_irq(&ctx->completion_lock);
4784 
4785         io_cqring_ev_posted(ctx);
4786 }
4787 
4788 static void io_poll_task_func(struct callback_head *cb)
4789 {
4790         struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
4791         struct io_ring_ctx *ctx = req->ctx;
4792         struct io_kiocb *nxt = NULL;
4793 
4794         io_poll_task_handler(req, &nxt);
4795         if (nxt)
4796                 __io_req_task_submit(nxt);
4797         percpu_ref_put(&ctx->refs);
4798 }
4799 
4800 static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode,
4801                                int sync, void *key)
4802 {
4803         struct io_kiocb *req = wait->private;
4804         struct io_poll_iocb *poll = io_poll_get_single(req);
4805         __poll_t mask = key_to_poll(key);
4806 
4807         /* for instances that support it check for an event match first: */
4808         if (mask && !(mask & poll->events))
4809                 return 0;
4810 
4811         list_del_init(&wait->entry);
4812 
4813         if (poll && poll->head) {
4814                 bool done;
4815 
4816                 spin_lock(&poll->head->lock);
4817                 done = list_empty(&poll->wait.entry);
4818                 if (!done)
4819                         list_del_init(&poll->wait.entry);
4820                 /* make sure double remove sees this as being gone */
4821                 wait->private = NULL;
4822                 spin_unlock(&poll->head->lock);
4823                 if (!done) {
4824                         /* use wait func handler, so it matches the rq type */
4825                         poll->wait.func(&poll->wait, mode, sync, key);
4826                 }
4827         }
4828         refcount_dec(&req->refs);
4829         return 1;
4830 }
4831 
4832 static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events,
4833                               wait_queue_func_t wake_func)
4834 {
4835         poll->head = NULL;
4836         poll->done = false;
4837         poll->canceled = false;
4838         poll->events = events;
4839         INIT_LIST_HEAD(&poll->wait.entry);
4840         init_waitqueue_func_entry(&poll->wait, wake_func);
4841 }
4842 
4843 static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
4844                             struct wait_queue_head *head,
4845                             struct io_poll_iocb **poll_ptr)
4846 {
4847         struct io_kiocb *req = pt->req;
4848 
4849         /*
4850          * If poll->head is already set, it's because the file being polled
4851          * uses multiple waitqueues for poll handling (eg one for read, one
4852          * for write). Setup a separate io_poll_iocb if this happens.
4853          */
4854         if (unlikely(poll->head)) {
4855                 /* already have a 2nd entry, fail a third attempt */
4856                 if (*poll_ptr) {
4857                         pt->error = -EINVAL;
4858                         return;
4859                 }
4860                 poll = kmalloc(sizeof(*poll), GFP_ATOMIC);
4861                 if (!poll) {
4862                         pt->error = -ENOMEM;
4863                         return;
4864                 }
4865                 io_init_poll_iocb(poll, req->poll.events, io_poll_double_wake);
4866                 refcount_inc(&req->refs);
4867                 poll->wait.private = req;
4868                 *poll_ptr = poll;
4869         }
4870 
4871         pt->error = 0;
4872         poll->head = head;
4873 
4874         if (poll->events & EPOLLEXCLUSIVE)
4875                 add_wait_queue_exclusive(head, &poll->wait);
4876         else
4877                 add_wait_queue(head, &poll->wait);
4878 }
4879 
4880 static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
4881                                struct poll_table_struct *p)
4882 {
4883         struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
4884         struct async_poll *apoll = pt->req->apoll;
4885 
4886         __io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll);
4887 }
4888 
4889 static void io_async_task_func(struct callback_head *cb)
4890 {
4891         struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
4892         struct async_poll *apoll = req->apoll;
4893         struct io_ring_ctx *ctx = req->ctx;
4894 
4895         trace_io_uring_task_run(req->ctx, req->opcode, req->user_data);
4896 
4897         if (io_poll_rewait(req, &apoll->poll)) {
4898                 spin_unlock_irq(&ctx->completion_lock);
4899                 percpu_ref_put(&ctx->refs);
4900                 return;
4901         }
4902 
4903         /* If req is still hashed, it cannot have been canceled. Don't check. */
4904         if (hash_hashed(&req->hash_node))
4905                 hash_del(&req->hash_node);
4906 
4907         io_poll_remove_double(req);
4908         spin_unlock_irq(&ctx->completion_lock);
4909 
4910         if (!READ_ONCE(apoll->poll.canceled))
4911                 __io_req_task_submit(req);
4912         else
4913                 __io_req_task_cancel(req, -ECANCELED);
4914 
4915         percpu_ref_put(&ctx->refs);
4916         kfree(apoll->double_poll);
4917         kfree(apoll);
4918 }
4919 
4920 static int io_async_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
4921                         void *key)
4922 {
4923         struct io_kiocb *req = wait->private;
4924         struct io_poll_iocb *poll = &req->apoll->poll;
4925 
4926         trace_io_uring_poll_wake(req->ctx, req->opcode, req->user_data,
4927                                         key_to_poll(key));
4928 
4929         return __io_async_wake(req, poll, key_to_poll(key), io_async_task_func);
4930 }
4931 
4932 static void io_poll_req_insert(struct io_kiocb *req)
4933 {
4934         struct io_ring_ctx *ctx = req->ctx;
4935         struct hlist_head *list;
4936 
4937         list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)];
4938         hlist_add_head(&req->hash_node, list);
4939 }
4940 
4941 static __poll_t __io_arm_poll_handler(struct io_kiocb *req,
4942                                       struct io_poll_iocb *poll,
4943                                       struct io_poll_table *ipt, __poll_t mask,
4944                                       wait_queue_func_t wake_func)
4945         __acquires(&ctx->completion_lock)
4946 {
4947         struct io_ring_ctx *ctx = req->ctx;
4948         bool cancel = false;
4949 
4950         io_init_poll_iocb(poll, mask, wake_func);
4951         poll->file = req->file;
4952         poll->wait.private = req;
4953 
4954         ipt->pt._key = mask;
4955         ipt->req = req;
4956         ipt->error = -EINVAL;
4957 
4958         mask = vfs_poll(req->file, &ipt->pt) & poll->events;
4959 
4960         spin_lock_irq(&ctx->completion_lock);
4961         if (likely(poll->head)) {
4962                 spin_lock(&poll->head->lock);
4963                 if (unlikely(list_empty(&poll->wait.entry))) {
4964                         if (ipt->error)
4965                                 cancel = true;
4966                         ipt->error = 0;
4967                         mask = 0;
4968                 }
4969                 if (mask || ipt->error)
4970                         list_del_init(&poll->wait.entry);
4971                 else if (cancel)
4972                         WRITE_ONCE(poll->canceled, true);
4973                 else if (!poll->done) /* actually waiting for an event */
4974                         io_poll_req_insert(req);
4975                 spin_unlock(&poll->head->lock);
4976         }
4977 
4978         return mask;
4979 }
4980 
4981 static bool io_arm_poll_handler(struct io_kiocb *req)
4982 {
4983         const struct io_op_def *def = &io_op_defs[req->opcode];
4984         struct io_ring_ctx *ctx = req->ctx;
4985         struct async_poll *apoll;
4986         struct io_poll_table ipt;
4987         __poll_t mask, ret;
4988         int rw;
4989 
4990         if (!req->file || !file_can_poll(req->file))
4991                 return false;
4992         if (req->flags & REQ_F_POLLED)
4993                 return false;
4994         if (def->pollin)
4995                 rw = READ;
4996         else if (def->pollout)
4997                 rw = WRITE;
4998         else
4999                 return false;
5000         /* if we can't nonblock try, then no point in arming a poll handler */
5001         if (!io_file_supports_async(req->file, rw))
5002                 return false;
5003 
5004         apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
5005         if (unlikely(!apoll))
5006                 return false;
5007         apoll->double_poll = NULL;
5008 
5009         req->flags |= REQ_F_POLLED;
5010         req->apoll = apoll;
5011         INIT_HLIST_NODE(&req->hash_node);
5012 
5013         mask = 0;
5014         if (def->pollin)
5015                 mask |= POLLIN | POLLRDNORM;
5016         if (def->pollout)
5017                 mask |= POLLOUT | POLLWRNORM;
5018         mask |= POLLERR | POLLPRI;
5019 
5020         ipt.pt._qproc = io_async_queue_proc;
5021 
5022         ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask,
5023                                         io_async_wake);
5024         if (ret || ipt.error) {
5025                 io_poll_remove_double(req);
5026                 spin_unlock_irq(&ctx->completion_lock);
5027                 kfree(apoll->double_poll);
5028                 kfree(apoll);
5029                 return false;
5030         }
5031         spin_unlock_irq(&ctx->completion_lock);
5032         trace_io_uring_poll_arm(ctx, req->opcode, req->user_data, mask,
5033                                         apoll->poll.events);
5034         return true;
5035 }
5036 
5037 static bool __io_poll_remove_one(struct io_kiocb *req,
5038                                  struct io_poll_iocb *poll)
5039 {
5040         bool do_complete = false;
5041 
5042         spin_lock(&poll->head->lock);
5043         WRITE_ONCE(poll->canceled, true);
5044         if (!list_empty(&poll->wait.entry)) {
5045                 list_del_init(&poll->wait.entry);
5046                 do_complete = true;
5047         }
5048         spin_unlock(&poll->head->lock);
5049         hash_del(&req->hash_node);
5050         return do_complete;
5051 }
5052 
5053 static bool io_poll_remove_one(struct io_kiocb *req)
5054 {
5055         bool do_complete;
5056 
5057         io_poll_remove_double(req);
5058 
5059         if (req->opcode == IORING_OP_POLL_ADD) {
5060                 do_complete = __io_poll_remove_one(req, &req->poll);
5061         } else {
5062                 struct async_poll *apoll = req->apoll;
5063 
5064                 /* non-poll requests have submit ref still */
5065                 do_complete = __io_poll_remove_one(req, &apoll->poll);
5066                 if (do_complete) {
5067                         io_put_req(req);
5068                         kfree(apoll->double_poll);
5069                         kfree(apoll);
5070                 }
5071         }
5072 
5073         if (do_complete) {
5074                 io_cqring_fill_event(req, -ECANCELED);
5075                 io_commit_cqring(req->ctx);
5076                 req->flags |= REQ_F_COMP_LOCKED;
5077                 req_set_fail_links(req);
5078                 io_put_req(req);
5079         }
5080 
5081         return do_complete;
5082 }
5083 
5084 /*
5085  * Returns true if we found and killed one or more poll requests
5086  */
5087 static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk)
5088 {
5089         struct hlist_node *tmp;
5090         struct io_kiocb *req;
5091         int posted = 0, i;
5092 
5093         spin_lock_irq(&ctx->completion_lock);
5094         for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
5095                 struct hlist_head *list;
5096 
5097                 list = &ctx->cancel_hash[i];
5098                 hlist_for_each_entry_safe(req, tmp, list, hash_node) {
5099                         if (io_task_match(req, tsk))
5100                                 posted += io_poll_remove_one(req);
5101                 }
5102         }
5103         spin_unlock_irq(&ctx->completion_lock);
5104 
5105         if (posted)
5106                 io_cqring_ev_posted(ctx);
5107 
5108         return posted != 0;
5109 }
5110 
5111 static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr)
5112 {
5113         struct hlist_head *list;
5114         struct io_kiocb *req;
5115 
5116         list = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)];
5117         hlist_for_each_entry(req, list, hash_node) {
5118                 if (sqe_addr != req->user_data)
5119                         continue;
5120                 if (io_poll_remove_one(req))
5121                         return 0;
5122                 return -EALREADY;
5123         }
5124 
5125         return -ENOENT;
5126 }
5127 
5128 static int io_poll_remove_prep(struct io_kiocb *req,
5129                                const struct io_uring_sqe *sqe)
5130 {
5131         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5132                 return -EINVAL;
5133         if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
5134             sqe->poll_events)
5135                 return -EINVAL;
5136 
5137         req->poll.addr = READ_ONCE(sqe->addr);
5138         return 0;
5139 }
5140 
5141 /*
5142  * Find a running poll command that matches one specified in sqe->addr,
5143  * and remove it if found.
5144  */
5145 static int io_poll_remove(struct io_kiocb *req)
5146 {
5147         struct io_ring_ctx *ctx = req->ctx;
5148         u64 addr;
5149         int ret;
5150 
5151         addr = req->poll.addr;
5152         spin_lock_irq(&ctx->completion_lock);
5153         ret = io_poll_cancel(ctx, addr);
5154         spin_unlock_irq(&ctx->completion_lock);
5155 
5156         if (ret < 0)
5157                 req_set_fail_links(req);
5158         io_req_complete(req, ret);
5159         return 0;
5160 }
5161 
5162 static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
5163                         void *key)
5164 {
5165         struct io_kiocb *req = wait->private;
5166         struct io_poll_iocb *poll = &req->poll;
5167 
5168         return __io_async_wake(req, poll, key_to_poll(key), io_poll_task_func);
5169 }
5170 
5171 static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
5172                                struct poll_table_struct *p)
5173 {
5174         struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
5175 
5176         __io_queue_proc(&pt->req->poll, pt, head, (struct io_poll_iocb **) &pt->req->io);
5177 }
5178 
5179 static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5180 {
5181         struct io_poll_iocb *poll = &req->poll;
5182         u32 events;
5183 
5184         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5185                 return -EINVAL;
5186         if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index)
5187                 return -EINVAL;
5188         if (!poll->file)
5189                 return -EBADF;
5190 
5191         events = READ_ONCE(sqe->poll32_events);
5192 #ifdef __BIG_ENDIAN
5193         events = swahw32(events);
5194 #endif
5195         poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP |
5196                        (events & EPOLLEXCLUSIVE);
5197         return 0;
5198 }
5199 
5200 static int io_poll_add(struct io_kiocb *req)
5201 {
5202         struct io_poll_iocb *poll = &req->poll;
5203         struct io_ring_ctx *ctx = req->ctx;
5204         struct io_poll_table ipt;
5205         __poll_t mask;
5206 
5207         INIT_HLIST_NODE(&req->hash_node);
5208         ipt.pt._qproc = io_poll_queue_proc;
5209 
5210         mask = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events,
5211                                         io_poll_wake);
5212 
5213         if (mask) { /* no async, we'd stolen it */
5214                 ipt.error = 0;
5215                 io_poll_complete(req, mask, 0);
5216         }
5217         spin_unlock_irq(&ctx->completion_lock);
5218 
5219         if (mask) {
5220                 io_cqring_ev_posted(ctx);
5221                 io_put_req(req);
5222         }
5223         return ipt.error;
5224 }
5225 
5226 static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
5227 {
5228         struct io_timeout_data *data = container_of(timer,
5229                                                 struct io_timeout_data, timer);
5230         struct io_kiocb *req = data->req;
5231         struct io_ring_ctx *ctx = req->ctx;
5232         unsigned long flags;
5233 
5234         spin_lock_irqsave(&ctx->completion_lock, flags);
5235         atomic_set(&req->ctx->cq_timeouts,
5236                 atomic_read(&req->ctx->cq_timeouts) + 1);
5237 
5238         /*
5239          * We could be racing with timeout deletion. If the list is empty,
5240          * then timeout lookup already found it and will be handling it.
5241          */
5242         if (!list_empty(&req->timeout.list))
5243                 list_del_init(&req->timeout.list);
5244 
5245         io_cqring_fill_event(req, -ETIME);
5246         io_commit_cqring(ctx);
5247         spin_unlock_irqrestore(&ctx->completion_lock, flags);
5248 
5249         io_cqring_ev_posted(ctx);
5250         req_set_fail_links(req);
5251         io_put_req(req);
5252         return HRTIMER_NORESTART;
5253 }
5254 
5255 static int __io_timeout_cancel(struct io_kiocb *req)
5256 {
5257         int ret;
5258 
5259         list_del_init(&req->timeout.list);
5260 
5261         ret = hrtimer_try_to_cancel(&req->io->timeout.timer);
5262         if (ret == -1)
5263                 return -EALREADY;
5264 
5265         req_set_fail_links(req);
5266         req->flags |= REQ_F_COMP_LOCKED;
5267         io_cqring_fill_event(req, -ECANCELED);
5268         io_put_req(req);
5269         return 0;
5270 }
5271 
5272 static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
5273 {
5274         struct io_kiocb *req;
5275         int ret = -ENOENT;
5276 
5277         list_for_each_entry(req, &ctx->timeout_list, timeout.list) {
5278                 if (user_data == req->user_data) {
5279                         ret = 0;
5280                         break;
5281                 }
5282         }
5283 
5284         if (ret == -ENOENT)
5285                 return ret;
5286 
5287         return __io_timeout_cancel(req);
5288 }
5289 
5290 static int io_timeout_remove_prep(struct io_kiocb *req,
5291                                   const struct io_uring_sqe *sqe)
5292 {
5293         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5294                 return -EINVAL;
5295         if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
5296                 return -EINVAL;
5297         if (sqe->ioprio || sqe->buf_index || sqe->len)
5298                 return -EINVAL;
5299 
5300         req->timeout.addr = READ_ONCE(sqe->addr);
5301         req->timeout.flags = READ_ONCE(sqe->timeout_flags);
5302         if (req->timeout.flags)
5303                 return -EINVAL;
5304 
5305         return 0;
5306 }
5307 
5308 /*
5309  * Remove or update an existing timeout command
5310  */
5311 static int io_timeout_remove(struct io_kiocb *req)
5312 {
5313         struct io_ring_ctx *ctx = req->ctx;
5314         int ret;
5315 
5316         spin_lock_irq(&ctx->completion_lock);
5317         ret = io_timeout_cancel(ctx, req->timeout.addr);
5318 
5319         io_cqring_fill_event(req, ret);
5320         io_commit_cqring(ctx);
5321         spin_unlock_irq(&ctx->completion_lock);
5322         io_cqring_ev_posted(ctx);
5323         if (ret < 0)
5324                 req_set_fail_links(req);
5325         io_put_req(req);
5326         return 0;
5327 }
5328 
5329 static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
5330                            bool is_timeout_link)
5331 {
5332         struct io_timeout_data *data;
5333         unsigned flags;
5334         u32 off = READ_ONCE(sqe->off);
5335 
5336         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5337                 return -EINVAL;
5338         if (sqe->ioprio || sqe->buf_index || sqe->len != 1)
5339                 return -EINVAL;
5340         if (off && is_timeout_link)
5341                 return -EINVAL;
5342         flags = READ_ONCE(sqe->timeout_flags);
5343         if (flags & ~IORING_TIMEOUT_ABS)
5344                 return -EINVAL;
5345 
5346         req->timeout.off = off;
5347 
5348         if (!req->io && io_alloc_async_ctx(req))
5349                 return -ENOMEM;
5350 
5351         data = &req->io->timeout;
5352         data->req = req;
5353 
5354         if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr)))
5355                 return -EFAULT;
5356 
5357         if (flags & IORING_TIMEOUT_ABS)
5358                 data->mode = HRTIMER_MODE_ABS;
5359         else
5360                 data->mode = HRTIMER_MODE_REL;
5361 
5362         hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode);
5363         return 0;
5364 }
5365 
5366 static int io_timeout(struct io_kiocb *req)
5367 {
5368         struct io_ring_ctx *ctx = req->ctx;
5369         struct io_timeout_data *data = &req->io->timeout;
5370         struct list_head *entry;
5371         u32 tail, off = req->timeout.off;
5372 
5373         spin_lock_irq(&ctx->completion_lock);
5374 
5375         /*
5376          * sqe->off holds how many events that need to occur for this
5377          * timeout event to be satisfied. If it isn't set, then this is
5378          * a pure timeout request, sequence isn't used.
5379          */
5380         if (io_is_timeout_noseq(req)) {
5381                 entry = ctx->timeout_list.prev;
5382                 goto add;
5383         }
5384 
5385         tail = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
5386         req->timeout.target_seq = tail + off;
5387 
5388         /*
5389          * Insertion sort, ensuring the first entry in the list is always
5390          * the one we need first.
5391          */
5392         list_for_each_prev(entry, &ctx->timeout_list) {
5393                 struct io_kiocb *nxt = list_entry(entry, struct io_kiocb,
5394                                                   timeout.list);
5395 
5396                 if (io_is_timeout_noseq(nxt))
5397                         continue;
5398                 /* nxt.seq is behind @tail, otherwise would've been completed */
5399                 if (off >= nxt->timeout.target_seq - tail)
5400                         break;
5401         }
5402 add:
5403         list_add(&req->timeout.list, entry);
5404         data->timer.function = io_timeout_fn;
5405         hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode);
5406         spin_unlock_irq(&ctx->completion_lock);
5407         return 0;
5408 }
5409 
5410 static bool io_cancel_cb(struct io_wq_work *work, void *data)
5411 {
5412         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
5413 
5414         return req->user_data == (unsigned long) data;
5415 }
5416 
5417 static int io_async_cancel_one(struct io_ring_ctx *ctx, void *sqe_addr)
5418 {
5419         enum io_wq_cancel cancel_ret;
5420         int ret = 0;
5421 
5422         cancel_ret = io_wq_cancel_cb(ctx->io_wq, io_cancel_cb, sqe_addr, false);
5423         switch (cancel_ret) {
5424         case IO_WQ_CANCEL_OK:
5425                 ret = 0;
5426                 break;
5427         case IO_WQ_CANCEL_RUNNING:
5428                 ret = -EALREADY;
5429                 break;
5430         case IO_WQ_CANCEL_NOTFOUND:
5431                 ret = -ENOENT;
5432                 break;
5433         }
5434 
5435         return ret;
5436 }
5437 
5438 static void io_async_find_and_cancel(struct io_ring_ctx *ctx,
5439                                      struct io_kiocb *req, __u64 sqe_addr,
5440                                      int success_ret)
5441 {
5442         unsigned long flags;
5443         int ret;
5444 
5445         ret = io_async_cancel_one(ctx, (void *) (unsigned long) sqe_addr);
5446         if (ret != -ENOENT) {
5447                 spin_lock_irqsave(&ctx->completion_lock, flags);
5448                 goto done;
5449         }
5450 
5451         spin_lock_irqsave(&ctx->completion_lock, flags);
5452         ret = io_timeout_cancel(ctx, sqe_addr);
5453         if (ret != -ENOENT)
5454                 goto done;
5455         ret = io_poll_cancel(ctx, sqe_addr);
5456 done:
5457         if (!ret)
5458                 ret = success_ret;
5459         io_cqring_fill_event(req, ret);
5460         io_commit_cqring(ctx);
5461         spin_unlock_irqrestore(&ctx->completion_lock, flags);
5462         io_cqring_ev_posted(ctx);
5463 
5464         if (ret < 0)
5465                 req_set_fail_links(req);
5466         io_put_req(req);
5467 }
5468 
5469 static int io_async_cancel_prep(struct io_kiocb *req,
5470                                 const struct io_uring_sqe *sqe)
5471 {
5472         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5473                 return -EINVAL;
5474         if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
5475                 return -EINVAL;
5476         if (sqe->ioprio || sqe->off || sqe->len || sqe->cancel_flags)
5477                 return -EINVAL;
5478 
5479         req->cancel.addr = READ_ONCE(sqe->addr);
5480         return 0;
5481 }
5482 
5483 static int io_async_cancel(struct io_kiocb *req)
5484 {
5485         struct io_ring_ctx *ctx = req->ctx;
5486 
5487         io_async_find_and_cancel(ctx, req, req->cancel.addr, 0);
5488         return 0;
5489 }
5490 
5491 static int io_files_update_prep(struct io_kiocb *req,
5492                                 const struct io_uring_sqe *sqe)
5493 {
5494         if (unlikely(req->ctx->flags & IORING_SETUP_SQPOLL))
5495                 return -EINVAL;
5496         if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
5497                 return -EINVAL;
5498         if (sqe->ioprio || sqe->rw_flags)
5499                 return -EINVAL;
5500 
5501         req->files_update.offset = READ_ONCE(sqe->off);
5502         req->files_update.nr_args = READ_ONCE(sqe->len);
5503         if (!req->files_update.nr_args)
5504                 return -EINVAL;
5505         req->files_update.arg = READ_ONCE(sqe->addr);
5506         return 0;
5507 }
5508 
5509 static int io_files_update(struct io_kiocb *req, bool force_nonblock,
5510                            struct io_comp_state *cs)
5511 {
5512         struct io_ring_ctx *ctx = req->ctx;
5513         struct io_uring_files_update up;
5514         int ret;
5515 
5516         if (force_nonblock)
5517                 return -EAGAIN;
5518 
5519         up.offset = req->files_update.offset;
5520         up.fds = req->files_update.arg;
5521 
5522         mutex_lock(&ctx->uring_lock);
5523         ret = __io_sqe_files_update(ctx, &up, req->files_update.nr_args);
5524         mutex_unlock(&ctx->uring_lock);
5525 
5526         if (ret < 0)
5527                 req_set_fail_links(req);
5528         __io_req_complete(req, ret, 0, cs);
5529         return 0;
5530 }
5531 
5532 static int io_req_defer_prep(struct io_kiocb *req,
5533                              const struct io_uring_sqe *sqe)
5534 {
5535         ssize_t ret = 0;
5536 
5537         if (!sqe)
5538                 return 0;
5539 
5540         if (io_alloc_async_ctx(req))
5541                 return -EAGAIN;
5542         ret = io_prep_work_files(req);
5543         if (unlikely(ret))
5544                 return ret;
5545 
5546         io_prep_async_work(req);
5547 
5548         switch (req->opcode) {
5549         case IORING_OP_NOP:
5550                 break;
5551         case IORING_OP_READV:
5552         case IORING_OP_READ_FIXED:
5553         case IORING_OP_READ:
5554                 ret = io_read_prep(req, sqe, true);
5555                 break;
5556         case IORING_OP_WRITEV:
5557         case IORING_OP_WRITE_FIXED:
5558         case IORING_OP_WRITE:
5559                 ret = io_write_prep(req, sqe, true);
5560                 break;
5561         case IORING_OP_POLL_ADD:
5562                 ret = io_poll_add_prep(req, sqe);
5563                 break;
5564         case IORING_OP_POLL_REMOVE:
5565                 ret = io_poll_remove_prep(req, sqe);
5566                 break;
5567         case IORING_OP_FSYNC:
5568                 ret = io_prep_fsync(req, sqe);
5569                 break;
5570         case IORING_OP_SYNC_FILE_RANGE:
5571                 ret = io_prep_sfr(req, sqe);
5572                 break;
5573         case IORING_OP_SENDMSG:
5574         case IORING_OP_SEND:
5575                 ret = io_sendmsg_prep(req, sqe);
5576                 break;
5577         case IORING_OP_RECVMSG:
5578         case IORING_OP_RECV:
5579                 ret = io_recvmsg_prep(req, sqe);
5580                 break;
5581         case IORING_OP_CONNECT:
5582                 ret = io_connect_prep(req, sqe);
5583                 break;
5584         case IORING_OP_TIMEOUT:
5585                 ret = io_timeout_prep(req, sqe, false);
5586                 break;
5587         case IORING_OP_TIMEOUT_REMOVE:
5588                 ret = io_timeout_remove_prep(req, sqe);
5589                 break;
5590         case IORING_OP_ASYNC_CANCEL:
5591                 ret = io_async_cancel_prep(req, sqe);
5592                 break;
5593         case IORING_OP_LINK_TIMEOUT:
5594                 ret = io_timeout_prep(req, sqe, true);
5595                 break;
5596         case IORING_OP_ACCEPT:
5597                 ret = io_accept_prep(req, sqe);
5598                 break;
5599         case IORING_OP_FALLOCATE:
5600                 ret = io_fallocate_prep(req, sqe);
5601                 break;
5602         case IORING_OP_OPENAT:
5603                 ret = io_openat_prep(req, sqe);
5604                 break;
5605         case IORING_OP_CLOSE:
5606                 ret = io_close_prep(req, sqe);
5607                 break;
5608         case IORING_OP_FILES_UPDATE:
5609                 ret = io_files_update_prep(req, sqe);
5610                 break;
5611         case IORING_OP_STATX:
5612                 ret = io_statx_prep(req, sqe);
5613                 break;
5614         case IORING_OP_FADVISE:
5615                 ret = io_fadvise_prep(req, sqe);
5616                 break;
5617         case IORING_OP_MADVISE:
5618                 ret = io_madvise_prep(req, sqe);
5619                 break;
5620         case IORING_OP_OPENAT2:
5621