1 /* 2 * Fast Userspace Mutexes (which I call "Futexes!"). 3 * (C) Rusty Russell, IBM 2002 4 * 5 * Generalized futexes, futex requeueing, misc fixes by Ingo Molnar 6 * (C) Copyright 2003 Red Hat Inc, All Rights Reserved 7 * 8 * Removed page pinning, fix privately mapped COW pages and other cleanups 9 * (C) Copyright 2003, 2004 Jamie Lokier 10 * 11 * Robust futex support started by Ingo Molnar 12 * (C) Copyright 2006 Red Hat Inc, All Rights Reserved 13 * Thanks to Thomas Gleixner for suggestions, analysis and fixes. 14 * 15 * PI-futex support started by Ingo Molnar and Thomas Gleixner 16 * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> 17 * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> 18 * 19 * PRIVATE futexes by Eric Dumazet 20 * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com> 21 * 22 * Requeue-PI support by Darren Hart <dvhltc@us.ibm.com> 23 * Copyright (C) IBM Corporation, 2009 24 * Thanks to Thomas Gleixner for conceptual design and careful reviews. 25 * 26 * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly 27 * enough at me, Linus for the original (flawed) idea, Matthew 28 * Kirkwood for proof-of-concept implementation. 29 * 30 * "The futexes are also cursed." 31 * "But they come in a choice of three flavours!" 32 * 33 * This program is free software; you can redistribute it and/or modify 34 * it under the terms of the GNU General Public License as published by 35 * the Free Software Foundation; either version 2 of the License, or 36 * (at your option) any later version. 37 * 38 * This program is distributed in the hope that it will be useful, 39 * but WITHOUT ANY WARRANTY; without even the implied warranty of 40 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 41 * GNU General Public License for more details. 42 * 43 * You should have received a copy of the GNU General Public License 44 * along with this program; if not, write to the Free Software 45 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 46 */ 47 #include <linux/slab.h> 48 #include <linux/poll.h> 49 #include <linux/fs.h> 50 #include <linux/file.h> 51 #include <linux/jhash.h> 52 #include <linux/init.h> 53 #include <linux/futex.h> 54 #include <linux/mount.h> 55 #include <linux/pagemap.h> 56 #include <linux/syscalls.h> 57 #include <linux/signal.h> 58 #include <linux/export.h> 59 #include <linux/magic.h> 60 #include <linux/pid.h> 61 #include <linux/nsproxy.h> 62 #include <linux/ptrace.h> 63 #include <linux/sched/rt.h> 64 #include <linux/sched/wake_q.h> 65 #include <linux/sched/mm.h> 66 #include <linux/hugetlb.h> 67 #include <linux/freezer.h> 68 #include <linux/bootmem.h> 69 #include <linux/fault-inject.h> 70 71 #include <asm/futex.h> 72 73 #include "locking/rtmutex_common.h" 74 75 /* 76 * READ this before attempting to hack on futexes! 77 * 78 * Basic futex operation and ordering guarantees 79 * ============================================= 80 * 81 * The waiter reads the futex value in user space and calls 82 * futex_wait(). This function computes the hash bucket and acquires 83 * the hash bucket lock. After that it reads the futex user space value 84 * again and verifies that the data has not changed. If it has not changed 85 * it enqueues itself into the hash bucket, releases the hash bucket lock 86 * and schedules. 87 * 88 * The waker side modifies the user space value of the futex and calls 89 * futex_wake(). This function computes the hash bucket and acquires the 90 * hash bucket lock. 
Then it looks for waiters on that futex in the hash 91 * bucket and wakes them. 92 * 93 * In futex wake up scenarios where no tasks are blocked on a futex, taking 94 * the hb spinlock can be avoided and simply return. In order for this 95 * optimization to work, ordering guarantees must exist so that the waiter 96 * being added to the list is acknowledged when the list is concurrently being 97 * checked by the waker, avoiding scenarios like the following: 98 * 99 * CPU 0 CPU 1 100 * val = *futex; 101 * sys_futex(WAIT, futex, val); 102 * futex_wait(futex, val); 103 * uval = *futex; 104 * *futex = newval; 105 * sys_futex(WAKE, futex); 106 * futex_wake(futex); 107 * if (queue_empty()) 108 * return; 109 * if (uval == val) 110 * lock(hash_bucket(futex)); 111 * queue(); 112 * unlock(hash_bucket(futex)); 113 * schedule(); 114 * 115 * This would cause the waiter on CPU 0 to wait forever because it 116 * missed the transition of the user space value from val to newval 117 * and the waker did not find the waiter in the hash bucket queue. 118 * 119 * The correct serialization ensures that a waiter either observes 120 * the changed user space value before blocking or is woken by a 121 * concurrent waker: 122 * 123 * CPU 0 CPU 1 124 * val = *futex; 125 * sys_futex(WAIT, futex, val); 126 * futex_wait(futex, val); 127 * 128 * waiters++; (a) 129 * smp_mb(); (A) <-- paired with -. 130 * | 131 * lock(hash_bucket(futex)); | 132 * | 133 * uval = *futex; | 134 * | *futex = newval; 135 * | sys_futex(WAKE, futex); 136 * | futex_wake(futex); 137 * | 138 * `--------> smp_mb(); (B) 139 * if (uval == val) 140 * queue(); 141 * unlock(hash_bucket(futex)); 142 * schedule(); if (waiters) 143 * lock(hash_bucket(futex)); 144 * else wake_waiters(futex); 145 * waiters--; (b) unlock(hash_bucket(futex)); 146 * 147 * Where (A) orders the waiters increment and the futex value read through 148 * atomic operations (see hb_waiters_inc) and where (B) orders the write 149 * to futex and the waiters read -- this is done by the barriers for both 150 * shared and private futexes in get_futex_key_refs(). 151 * 152 * This yields the following case (where X:=waiters, Y:=futex): 153 * 154 * X = Y = 0 155 * 156 * w[X]=1 w[Y]=1 157 * MB MB 158 * r[Y]=y r[X]=x 159 * 160 * Which guarantees that x==0 && y==0 is impossible; which translates back into 161 * the guarantee that we cannot both miss the futex variable change and the 162 * enqueue. 163 * 164 * Note that a new waiter is accounted for in (a) even when it is possible that 165 * the wait call can return error, in which case we backtrack from it in (b). 166 * Refer to the comment in queue_lock(). 167 * 168 * Similarly, in order to account for waiters being requeued on another 169 * address we always increment the waiters for the destination bucket before 170 * acquiring the lock. It then decrements them again after releasing it - 171 * the code that actually moves the futex(es) between hash buckets (requeue_futex) 172 * will do the additional required waiter count housekeeping. This is done for 173 * double_lock_hb() and double_unlock_hb(), respectively. 174 */ 175 176 #ifndef CONFIG_HAVE_FUTEX_CMPXCHG 177 int __read_mostly futex_cmpxchg_enabled; 178 #endif 179 180 /* 181 * Futex flags used to encode options to functions and preserve them across 182 * restarts. 183 */ 184 #ifdef CONFIG_MMU 185 # define FLAGS_SHARED 0x01 186 #else 187 /* 188 * NOMMU does not have per process address space. Let the compiler optimize 189 * code away. 
190 */ 191 # define FLAGS_SHARED 0x00 192 #endif 193 #define FLAGS_CLOCKRT 0x02 194 #define FLAGS_HAS_TIMEOUT 0x04 195 196 /* 197 * Priority Inheritance state: 198 */ 199 struct futex_pi_state { 200 /* 201 * list of 'owned' pi_state instances - these have to be 202 * cleaned up in do_exit() if the task exits prematurely: 203 */ 204 struct list_head list; 205 206 /* 207 * The PI object: 208 */ 209 struct rt_mutex pi_mutex; 210 211 struct task_struct *owner; 212 atomic_t refcount; 213 214 union futex_key key; 215 } __randomize_layout; 216 217 /** 218 * struct futex_q - The hashed futex queue entry, one per waiting task 219 * @list: priority-sorted list of tasks waiting on this futex 220 * @task: the task waiting on the futex 221 * @lock_ptr: the hash bucket lock 222 * @key: the key the futex is hashed on 223 * @pi_state: optional priority inheritance state 224 * @rt_waiter: rt_waiter storage for use with requeue_pi 225 * @requeue_pi_key: the requeue_pi target futex key 226 * @bitset: bitset for the optional bitmasked wakeup 227 * 228 * We use this hashed waitqueue, instead of a normal wait_queue_entry_t, so 229 * we can wake only the relevant ones (hashed queues may be shared). 230 * 231 * A futex_q has a woken state, just like tasks have TASK_RUNNING. 232 * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0. 233 * The order of wakeup is always to make the first condition true, then 234 * the second. 235 * 236 * PI futexes are typically woken before they are removed from the hash list via 237 * the rt_mutex code. See unqueue_me_pi(). 238 */ 239 struct futex_q { 240 struct plist_node list; 241 242 struct task_struct *task; 243 spinlock_t *lock_ptr; 244 union futex_key key; 245 struct futex_pi_state *pi_state; 246 struct rt_mutex_waiter *rt_waiter; 247 union futex_key *requeue_pi_key; 248 u32 bitset; 249 } __randomize_layout; 250 251 static const struct futex_q futex_q_init = { 252 /* list gets initialized in queue_me()*/ 253 .key = FUTEX_KEY_INIT, 254 .bitset = FUTEX_BITSET_MATCH_ANY 255 }; 256 257 /* 258 * Hash buckets are shared by all the futex_keys that hash to the same 259 * location. Each key may have multiple futex_q structures, one for each task 260 * waiting on a futex. 261 */ 262 struct futex_hash_bucket { 263 atomic_t waiters; 264 spinlock_t lock; 265 struct plist_head chain; 266 } ____cacheline_aligned_in_smp; 267 268 /* 269 * The base of the bucket array and its size are always used together 270 * (after initialization only in hash_futex()), so ensure that they 271 * reside in the same cacheline. 272 */ 273 static struct { 274 struct futex_hash_bucket *queues; 275 unsigned long hashsize; 276 } __futex_data __read_mostly __aligned(2*sizeof(long)); 277 #define futex_queues (__futex_data.queues) 278 #define futex_hashsize (__futex_data.hashsize) 279 280 281 /* 282 * Fault injections for futexes. 
283 */ 284 #ifdef CONFIG_FAIL_FUTEX 285 286 static struct { 287 struct fault_attr attr; 288 289 bool ignore_private; 290 } fail_futex = { 291 .attr = FAULT_ATTR_INITIALIZER, 292 .ignore_private = false, 293 }; 294 295 static int __init setup_fail_futex(char *str) 296 { 297 return setup_fault_attr(&fail_futex.attr, str); 298 } 299 __setup("fail_futex=", setup_fail_futex); 300 301 static bool should_fail_futex(bool fshared) 302 { 303 if (fail_futex.ignore_private && !fshared) 304 return false; 305 306 return should_fail(&fail_futex.attr, 1); 307 } 308 309 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS 310 311 static int __init fail_futex_debugfs(void) 312 { 313 umode_t mode = S_IFREG | S_IRUSR | S_IWUSR; 314 struct dentry *dir; 315 316 dir = fault_create_debugfs_attr("fail_futex", NULL, 317 &fail_futex.attr); 318 if (IS_ERR(dir)) 319 return PTR_ERR(dir); 320 321 if (!debugfs_create_bool("ignore-private", mode, dir, 322 &fail_futex.ignore_private)) { 323 debugfs_remove_recursive(dir); 324 return -ENOMEM; 325 } 326 327 return 0; 328 } 329 330 late_initcall(fail_futex_debugfs); 331 332 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ 333 334 #else 335 static inline bool should_fail_futex(bool fshared) 336 { 337 return false; 338 } 339 #endif /* CONFIG_FAIL_FUTEX */ 340 341 static inline void futex_get_mm(union futex_key *key) 342 { 343 mmgrab(key->private.mm); 344 /* 345 * Ensure futex_get_mm() implies a full barrier such that 346 * get_futex_key() implies a full barrier. This is relied upon 347 * as smp_mb(); (B), see the ordering comment above. 348 */ 349 smp_mb__after_atomic(); 350 } 351 352 /* 353 * Reflects a new waiter being added to the waitqueue. 354 */ 355 static inline void hb_waiters_inc(struct futex_hash_bucket *hb) 356 { 357 #ifdef CONFIG_SMP 358 atomic_inc(&hb->waiters); 359 /* 360 * Full barrier (A), see the ordering comment above. 361 */ 362 smp_mb__after_atomic(); 363 #endif 364 } 365 366 /* 367 * Reflects a waiter being removed from the waitqueue by wakeup 368 * paths. 369 */ 370 static inline void hb_waiters_dec(struct futex_hash_bucket *hb) 371 { 372 #ifdef CONFIG_SMP 373 atomic_dec(&hb->waiters); 374 #endif 375 } 376 377 static inline int hb_waiters_pending(struct futex_hash_bucket *hb) 378 { 379 #ifdef CONFIG_SMP 380 return atomic_read(&hb->waiters); 381 #else 382 return 1; 383 #endif 384 } 385 386 /** 387 * hash_futex - Return the hash bucket in the global hash 388 * @key: Pointer to the futex key for which the hash is calculated 389 * 390 * We hash on the keys returned from get_futex_key (see below) and return the 391 * corresponding hash bucket in the global hash. 392 */ 393 static struct futex_hash_bucket *hash_futex(union futex_key *key) 394 { 395 u32 hash = jhash2((u32*)&key->both.word, 396 (sizeof(key->both.word)+sizeof(key->both.ptr))/4, 397 key->both.offset); 398 return &futex_queues[hash & (futex_hashsize - 1)]; 399 } 400 401 402 /** 403 * match_futex - Check whether two futex keys are equal 404 * @key1: Pointer to key1 405 * @key2: Pointer to key2 406 * 407 * Return 1 if two futex_keys are equal, 0 otherwise. 408 */ 409 static inline int match_futex(union futex_key *key1, union futex_key *key2) 410 { 411 return (key1 && key2 412 && key1->both.word == key2->both.word 413 && key1->both.ptr == key2->both.ptr 414 && key1->both.offset == key2->both.offset); 415 } 416 417 /* 418 * Take a reference to the resource addressed by a key. 419 * Can be called while holding spinlocks. 
420 * 421 */ 422 static void get_futex_key_refs(union futex_key *key) 423 { 424 if (!key->both.ptr) 425 return; 426 427 /* 428 * On MMU less systems futexes are always "private" as there is no per 429 * process address space. We need the smp wmb nevertheless - yes, 430 * arch/blackfin has MMU less SMP ... 431 */ 432 if (!IS_ENABLED(CONFIG_MMU)) { 433 smp_mb(); /* explicit smp_mb(); (B) */ 434 return; 435 } 436 437 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { 438 case FUT_OFF_INODE: 439 ihold(key->shared.inode); /* implies smp_mb(); (B) */ 440 break; 441 case FUT_OFF_MMSHARED: 442 futex_get_mm(key); /* implies smp_mb(); (B) */ 443 break; 444 default: 445 /* 446 * Private futexes do not hold reference on an inode or 447 * mm, therefore the only purpose of calling get_futex_key_refs 448 * is because we need the barrier for the lockless waiter check. 449 */ 450 smp_mb(); /* explicit smp_mb(); (B) */ 451 } 452 } 453 454 /* 455 * Drop a reference to the resource addressed by a key. 456 * The hash bucket spinlock must not be held. This is 457 * a no-op for private futexes, see comment in the get 458 * counterpart. 459 */ 460 static void drop_futex_key_refs(union futex_key *key) 461 { 462 if (!key->both.ptr) { 463 /* If we're here then we tried to put a key we failed to get */ 464 WARN_ON_ONCE(1); 465 return; 466 } 467 468 if (!IS_ENABLED(CONFIG_MMU)) 469 return; 470 471 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { 472 case FUT_OFF_INODE: 473 iput(key->shared.inode); 474 break; 475 case FUT_OFF_MMSHARED: 476 mmdrop(key->private.mm); 477 break; 478 } 479 } 480 481 /** 482 * get_futex_key() - Get parameters which are the keys for a futex 483 * @uaddr: virtual address of the futex 484 * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED 485 * @key: address where result is stored. 486 * @rw: mapping needs to be read/write (values: VERIFY_READ, 487 * VERIFY_WRITE) 488 * 489 * Return: a negative error code or 0 490 * 491 * The key words are stored in @key on success. 492 * 493 * For shared mappings, it's (page->index, file_inode(vma->vm_file), 494 * offset_within_page). For private mappings, it's (uaddr, current->mm). 495 * We can usually work out the index without swapping in the page. 496 * 497 * lock_page() might sleep, the caller should not hold a spinlock. 498 */ 499 static int 500 get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) 501 { 502 unsigned long address = (unsigned long)uaddr; 503 struct mm_struct *mm = current->mm; 504 struct page *page, *tail; 505 struct address_space *mapping; 506 int err, ro = 0; 507 508 /* 509 * The futex address must be "naturally" aligned. 510 */ 511 key->both.offset = address % PAGE_SIZE; 512 if (unlikely((address % sizeof(u32)) != 0)) 513 return -EINVAL; 514 address -= key->both.offset; 515 516 if (unlikely(!access_ok(rw, uaddr, sizeof(u32)))) 517 return -EFAULT; 518 519 if (unlikely(should_fail_futex(fshared))) 520 return -EFAULT; 521 522 /* 523 * PROCESS_PRIVATE futexes are fast. 524 * As the mm cannot disappear under us and the 'key' only needs 525 * virtual address, we dont even have to find the underlying vma. 
526 * Note : We do have to check 'uaddr' is a valid user address, 527 * but access_ok() should be faster than find_vma() 528 */ 529 if (!fshared) { 530 key->private.mm = mm; 531 key->private.address = address; 532 get_futex_key_refs(key); /* implies smp_mb(); (B) */ 533 return 0; 534 } 535 536 again: 537 /* Ignore any VERIFY_READ mapping (futex common case) */ 538 if (unlikely(should_fail_futex(fshared))) 539 return -EFAULT; 540 541 err = get_user_pages_fast(address, 1, 1, &page); 542 /* 543 * If write access is not required (eg. FUTEX_WAIT), try 544 * and get read-only access. 545 */ 546 if (err == -EFAULT && rw == VERIFY_READ) { 547 err = get_user_pages_fast(address, 1, 0, &page); 548 ro = 1; 549 } 550 if (err < 0) 551 return err; 552 else 553 err = 0; 554 555 /* 556 * The treatment of mapping from this point on is critical. The page 557 * lock protects many things but in this context the page lock 558 * stabilizes mapping, prevents inode freeing in the shared 559 * file-backed region case and guards against movement to swap cache. 560 * 561 * Strictly speaking the page lock is not needed in all cases being 562 * considered here and page lock forces unnecessarily serialization 563 * From this point on, mapping will be re-verified if necessary and 564 * page lock will be acquired only if it is unavoidable 565 * 566 * Mapping checks require the head page for any compound page so the 567 * head page and mapping is looked up now. For anonymous pages, it 568 * does not matter if the page splits in the future as the key is 569 * based on the address. For filesystem-backed pages, the tail is 570 * required as the index of the page determines the key. For 571 * base pages, there is no tail page and tail == page. 572 */ 573 tail = page; 574 page = compound_head(page); 575 mapping = READ_ONCE(page->mapping); 576 577 /* 578 * If page->mapping is NULL, then it cannot be a PageAnon 579 * page; but it might be the ZERO_PAGE or in the gate area or 580 * in a special mapping (all cases which we are happy to fail); 581 * or it may have been a good file page when get_user_pages_fast 582 * found it, but truncated or holepunched or subjected to 583 * invalidate_complete_page2 before we got the page lock (also 584 * cases which we are happy to fail). And we hold a reference, 585 * so refcount care in invalidate_complete_page's remove_mapping 586 * prevents drop_caches from setting mapping to NULL beneath us. 587 * 588 * The case we do have to guard against is when memory pressure made 589 * shmem_writepage move it from filecache to swapcache beneath us: 590 * an unlikely race, but we do need to retry for page->mapping. 591 */ 592 if (unlikely(!mapping)) { 593 int shmem_swizzled; 594 595 /* 596 * Page lock is required to identify which special case above 597 * applies. If this is really a shmem page then the page lock 598 * will prevent unexpected transitions. 599 */ 600 lock_page(page); 601 shmem_swizzled = PageSwapCache(page) || page->mapping; 602 unlock_page(page); 603 put_page(page); 604 605 if (shmem_swizzled) 606 goto again; 607 608 return -EFAULT; 609 } 610 611 /* 612 * Private mappings are handled in a simple way. 613 * 614 * If the futex key is stored on an anonymous page, then the associated 615 * object is the mm which is implicitly pinned by the calling process. 616 * 617 * NOTE: When userspace waits on a MAP_SHARED mapping, even if 618 * it's a read-only handle, it's expected that futexes attach to 619 * the object not the particular process. 
620 */ 621 if (PageAnon(page)) { 622 /* 623 * A RO anonymous page will never change and thus doesn't make 624 * sense for futex operations. 625 */ 626 if (unlikely(should_fail_futex(fshared)) || ro) { 627 err = -EFAULT; 628 goto out; 629 } 630 631 key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */ 632 key->private.mm = mm; 633 key->private.address = address; 634 635 get_futex_key_refs(key); /* implies smp_mb(); (B) */ 636 637 } else { 638 struct inode *inode; 639 640 /* 641 * The associated futex object in this case is the inode and 642 * the page->mapping must be traversed. Ordinarily this should 643 * be stabilised under page lock but it's not strictly 644 * necessary in this case as we just want to pin the inode, not 645 * update the radix tree or anything like that. 646 * 647 * The RCU read lock is taken as the inode is finally freed 648 * under RCU. If the mapping still matches expectations then the 649 * mapping->host can be safely accessed as being a valid inode. 650 */ 651 rcu_read_lock(); 652 653 if (READ_ONCE(page->mapping) != mapping) { 654 rcu_read_unlock(); 655 put_page(page); 656 657 goto again; 658 } 659 660 inode = READ_ONCE(mapping->host); 661 if (!inode) { 662 rcu_read_unlock(); 663 put_page(page); 664 665 goto again; 666 } 667 668 /* 669 * Take a reference unless it is about to be freed. Previously 670 * this reference was taken by ihold under the page lock 671 * pinning the inode in place so i_lock was unnecessary. The 672 * only way for this check to fail is if the inode was 673 * truncated in parallel which is almost certainly an 674 * application bug. In such a case, just retry. 675 * 676 * We are not calling into get_futex_key_refs() in file-backed 677 * cases, therefore a successful atomic_inc return below will 678 * guarantee that get_futex_key() will still imply smp_mb(); (B). 679 */ 680 if (!atomic_inc_not_zero(&inode->i_count)) { 681 rcu_read_unlock(); 682 put_page(page); 683 684 goto again; 685 } 686 687 /* Should be impossible but lets be paranoid for now */ 688 if (WARN_ON_ONCE(inode->i_mapping != mapping)) { 689 err = -EFAULT; 690 rcu_read_unlock(); 691 iput(inode); 692 693 goto out; 694 } 695 696 key->both.offset |= FUT_OFF_INODE; /* inode-based key */ 697 key->shared.inode = inode; 698 key->shared.pgoff = basepage_index(tail); 699 rcu_read_unlock(); 700 } 701 702 out: 703 put_page(page); 704 return err; 705 } 706 707 static inline void put_futex_key(union futex_key *key) 708 { 709 drop_futex_key_refs(key); 710 } 711 712 /** 713 * fault_in_user_writeable() - Fault in user address and verify RW access 714 * @uaddr: pointer to faulting user space address 715 * 716 * Slow path to fixup the fault we just took in the atomic write 717 * access to @uaddr. 718 * 719 * We have no generic implementation of a non-destructive write to the 720 * user address. We know that we faulted in the atomic pagefault 721 * disabled section so we can as well avoid the #PF overhead by 722 * calling get_user_pages() right away. 723 */ 724 static int fault_in_user_writeable(u32 __user *uaddr) 725 { 726 struct mm_struct *mm = current->mm; 727 int ret; 728 729 down_read(&mm->mmap_sem); 730 ret = fixup_user_fault(current, mm, (unsigned long)uaddr, 731 FAULT_FLAG_WRITE, NULL); 732 up_read(&mm->mmap_sem); 733 734 return ret < 0 ? 
ret : 0; 735 } 736 737 /** 738 * futex_top_waiter() - Return the highest priority waiter on a futex 739 * @hb: the hash bucket the futex_q's reside in 740 * @key: the futex key (to distinguish it from other futex futex_q's) 741 * 742 * Must be called with the hb lock held. 743 */ 744 static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, 745 union futex_key *key) 746 { 747 struct futex_q *this; 748 749 plist_for_each_entry(this, &hb->chain, list) { 750 if (match_futex(&this->key, key)) 751 return this; 752 } 753 return NULL; 754 } 755 756 static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr, 757 u32 uval, u32 newval) 758 { 759 int ret; 760 761 pagefault_disable(); 762 ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval); 763 pagefault_enable(); 764 765 return ret; 766 } 767 768 static int get_futex_value_locked(u32 *dest, u32 __user *from) 769 { 770 int ret; 771 772 pagefault_disable(); 773 ret = __get_user(*dest, from); 774 pagefault_enable(); 775 776 return ret ? -EFAULT : 0; 777 } 778 779 780 /* 781 * PI code: 782 */ 783 static int refill_pi_state_cache(void) 784 { 785 struct futex_pi_state *pi_state; 786 787 if (likely(current->pi_state_cache)) 788 return 0; 789 790 pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL); 791 792 if (!pi_state) 793 return -ENOMEM; 794 795 INIT_LIST_HEAD(&pi_state->list); 796 /* pi_mutex gets initialized later */ 797 pi_state->owner = NULL; 798 atomic_set(&pi_state->refcount, 1); 799 pi_state->key = FUTEX_KEY_INIT; 800 801 current->pi_state_cache = pi_state; 802 803 return 0; 804 } 805 806 static struct futex_pi_state *alloc_pi_state(void) 807 { 808 struct futex_pi_state *pi_state = current->pi_state_cache; 809 810 WARN_ON(!pi_state); 811 current->pi_state_cache = NULL; 812 813 return pi_state; 814 } 815 816 static void get_pi_state(struct futex_pi_state *pi_state) 817 { 818 WARN_ON_ONCE(!atomic_inc_not_zero(&pi_state->refcount)); 819 } 820 821 /* 822 * Drops a reference to the pi_state object and frees or caches it 823 * when the last reference is gone. 824 */ 825 static void put_pi_state(struct futex_pi_state *pi_state) 826 { 827 if (!pi_state) 828 return; 829 830 if (!atomic_dec_and_test(&pi_state->refcount)) 831 return; 832 833 /* 834 * If pi_state->owner is NULL, the owner is most probably dying 835 * and has cleaned up the pi_state already 836 */ 837 if (pi_state->owner) { 838 struct task_struct *owner; 839 840 raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); 841 owner = pi_state->owner; 842 if (owner) { 843 raw_spin_lock(&owner->pi_lock); 844 list_del_init(&pi_state->list); 845 raw_spin_unlock(&owner->pi_lock); 846 } 847 rt_mutex_proxy_unlock(&pi_state->pi_mutex, owner); 848 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); 849 } 850 851 if (current->pi_state_cache) { 852 kfree(pi_state); 853 } else { 854 /* 855 * pi_state->list is already empty. 856 * clear pi_state->owner. 857 * refcount is at 0 - put it back to 1. 858 */ 859 pi_state->owner = NULL; 860 atomic_set(&pi_state->refcount, 1); 861 current->pi_state_cache = pi_state; 862 } 863 } 864 865 #ifdef CONFIG_FUTEX_PI 866 867 /* 868 * This task is holding PI mutexes at exit time => bad. 869 * Kernel cleans up PI-state, but userspace is likely hosed. 870 * (Robust-futex cleanup is separate and might save the day for userspace.) 
871 */ 872 void exit_pi_state_list(struct task_struct *curr) 873 { 874 struct list_head *next, *head = &curr->pi_state_list; 875 struct futex_pi_state *pi_state; 876 struct futex_hash_bucket *hb; 877 union futex_key key = FUTEX_KEY_INIT; 878 879 if (!futex_cmpxchg_enabled) 880 return; 881 /* 882 * We are a ZOMBIE and nobody can enqueue itself on 883 * pi_state_list anymore, but we have to be careful 884 * versus waiters unqueueing themselves: 885 */ 886 raw_spin_lock_irq(&curr->pi_lock); 887 while (!list_empty(head)) { 888 next = head->next; 889 pi_state = list_entry(next, struct futex_pi_state, list); 890 key = pi_state->key; 891 hb = hash_futex(&key); 892 893 /* 894 * We can race against put_pi_state() removing itself from the 895 * list (a waiter going away). put_pi_state() will first 896 * decrement the reference count and then modify the list, so 897 * its possible to see the list entry but fail this reference 898 * acquire. 899 * 900 * In that case; drop the locks to let put_pi_state() make 901 * progress and retry the loop. 902 */ 903 if (!atomic_inc_not_zero(&pi_state->refcount)) { 904 raw_spin_unlock_irq(&curr->pi_lock); 905 cpu_relax(); 906 raw_spin_lock_irq(&curr->pi_lock); 907 continue; 908 } 909 raw_spin_unlock_irq(&curr->pi_lock); 910 911 spin_lock(&hb->lock); 912 raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); 913 raw_spin_lock(&curr->pi_lock); 914 /* 915 * We dropped the pi-lock, so re-check whether this 916 * task still owns the PI-state: 917 */ 918 if (head->next != next) { 919 /* retain curr->pi_lock for the loop invariant */ 920 raw_spin_unlock(&pi_state->pi_mutex.wait_lock); 921 spin_unlock(&hb->lock); 922 put_pi_state(pi_state); 923 continue; 924 } 925 926 WARN_ON(pi_state->owner != curr); 927 WARN_ON(list_empty(&pi_state->list)); 928 list_del_init(&pi_state->list); 929 pi_state->owner = NULL; 930 931 raw_spin_unlock(&curr->pi_lock); 932 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); 933 spin_unlock(&hb->lock); 934 935 rt_mutex_futex_unlock(&pi_state->pi_mutex); 936 put_pi_state(pi_state); 937 938 raw_spin_lock_irq(&curr->pi_lock); 939 } 940 raw_spin_unlock_irq(&curr->pi_lock); 941 } 942 943 #endif 944 945 /* 946 * We need to check the following states: 947 * 948 * Waiter | pi_state | pi->owner | uTID | uODIED | ? 949 * 950 * [1] NULL | --- | --- | 0 | 0/1 | Valid 951 * [2] NULL | --- | --- | >0 | 0/1 | Valid 952 * 953 * [3] Found | NULL | -- | Any | 0/1 | Invalid 954 * 955 * [4] Found | Found | NULL | 0 | 1 | Valid 956 * [5] Found | Found | NULL | >0 | 1 | Invalid 957 * 958 * [6] Found | Found | task | 0 | 1 | Valid 959 * 960 * [7] Found | Found | NULL | Any | 0 | Invalid 961 * 962 * [8] Found | Found | task | ==taskTID | 0/1 | Valid 963 * [9] Found | Found | task | 0 | 0 | Invalid 964 * [10] Found | Found | task | !=taskTID | 0/1 | Invalid 965 * 966 * [1] Indicates that the kernel can acquire the futex atomically. We 967 * came came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit. 968 * 969 * [2] Valid, if TID does not belong to a kernel thread. If no matching 970 * thread is found then it indicates that the owner TID has died. 971 * 972 * [3] Invalid. The waiter is queued on a non PI futex 973 * 974 * [4] Valid state after exit_robust_list(), which sets the user space 975 * value to FUTEX_WAITERS | FUTEX_OWNER_DIED. 
976 * 977 * [5] The user space value got manipulated between exit_robust_list() 978 * and exit_pi_state_list() 979 * 980 * [6] Valid state after exit_pi_state_list() which sets the new owner in 981 * the pi_state but cannot access the user space value. 982 * 983 * [7] pi_state->owner can only be NULL when the OWNER_DIED bit is set. 984 * 985 * [8] Owner and user space value match 986 * 987 * [9] There is no transient state which sets the user space TID to 0 988 * except exit_robust_list(), but this is indicated by the 989 * FUTEX_OWNER_DIED bit. See [4] 990 * 991 * [10] There is no transient state which leaves owner and user space 992 * TID out of sync. 993 * 994 * 995 * Serialization and lifetime rules: 996 * 997 * hb->lock: 998 * 999 * hb -> futex_q, relation 1000 * futex_q -> pi_state, relation 1001 * 1002 * (cannot be raw because hb can contain arbitrary amount 1003 * of futex_q's) 1004 * 1005 * pi_mutex->wait_lock: 1006 * 1007 * {uval, pi_state} 1008 * 1009 * (and pi_mutex 'obviously') 1010 * 1011 * p->pi_lock: 1012 * 1013 * p->pi_state_list -> pi_state->list, relation 1014 * 1015 * pi_state->refcount: 1016 * 1017 * pi_state lifetime 1018 * 1019 * 1020 * Lock order: 1021 * 1022 * hb->lock 1023 * pi_mutex->wait_lock 1024 * p->pi_lock 1025 * 1026 */ 1027 1028 /* 1029 * Validate that the existing waiter has a pi_state and sanity check 1030 * the pi_state against the user space value. If correct, attach to 1031 * it. 1032 */ 1033 static int attach_to_pi_state(u32 __user *uaddr, u32 uval, 1034 struct futex_pi_state *pi_state, 1035 struct futex_pi_state **ps) 1036 { 1037 pid_t pid = uval & FUTEX_TID_MASK; 1038 u32 uval2; 1039 int ret; 1040 1041 /* 1042 * Userspace might have messed up non-PI and PI futexes [3] 1043 */ 1044 if (unlikely(!pi_state)) 1045 return -EINVAL; 1046 1047 /* 1048 * We get here with hb->lock held, and having found a 1049 * futex_top_waiter(). This means that futex_lock_pi() of said futex_q 1050 * has dropped the hb->lock in between queue_me() and unqueue_me_pi(), 1051 * which in turn means that futex_lock_pi() still has a reference on 1052 * our pi_state. 1053 * 1054 * The waiter holding a reference on @pi_state also protects against 1055 * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi() 1056 * and futex_wait_requeue_pi() as it cannot go to 0 and consequently 1057 * free pi_state before we can take a reference ourselves. 1058 */ 1059 WARN_ON(!atomic_read(&pi_state->refcount)); 1060 1061 /* 1062 * Now that we have a pi_state, we can acquire wait_lock 1063 * and do the state validation. 1064 */ 1065 raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); 1066 1067 /* 1068 * Since {uval, pi_state} is serialized by wait_lock, and our current 1069 * uval was read without holding it, it can have changed. Verify it 1070 * still is what we expect it to be, otherwise retry the entire 1071 * operation. 1072 */ 1073 if (get_futex_value_locked(&uval2, uaddr)) 1074 goto out_efault; 1075 1076 if (uval != uval2) 1077 goto out_eagain; 1078 1079 /* 1080 * Handle the owner died case: 1081 */ 1082 if (uval & FUTEX_OWNER_DIED) { 1083 /* 1084 * exit_pi_state_list sets owner to NULL and wakes the 1085 * topmost waiter. The task which acquires the 1086 * pi_state->rt_mutex will fixup owner. 1087 */ 1088 if (!pi_state->owner) { 1089 /* 1090 * No pi state owner, but the user space TID 1091 * is not 0. Inconsistent state. [5] 1092 */ 1093 if (pid) 1094 goto out_einval; 1095 /* 1096 * Take a ref on the state and return success. 
[4] 1097 */ 1098 goto out_attach; 1099 } 1100 1101 /* 1102 * If TID is 0, then either the dying owner has not 1103 * yet executed exit_pi_state_list() or some waiter 1104 * acquired the rtmutex in the pi state, but did not 1105 * yet fixup the TID in user space. 1106 * 1107 * Take a ref on the state and return success. [6] 1108 */ 1109 if (!pid) 1110 goto out_attach; 1111 } else { 1112 /* 1113 * If the owner died bit is not set, then the pi_state 1114 * must have an owner. [7] 1115 */ 1116 if (!pi_state->owner) 1117 goto out_einval; 1118 } 1119 1120 /* 1121 * Bail out if user space manipulated the futex value. If pi 1122 * state exists then the owner TID must be the same as the 1123 * user space TID. [9/10] 1124 */ 1125 if (pid != task_pid_vnr(pi_state->owner)) 1126 goto out_einval; 1127 1128 out_attach: 1129 get_pi_state(pi_state); 1130 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); 1131 *ps = pi_state; 1132 return 0; 1133 1134 out_einval: 1135 ret = -EINVAL; 1136 goto out_error; 1137 1138 out_eagain: 1139 ret = -EAGAIN; 1140 goto out_error; 1141 1142 out_efault: 1143 ret = -EFAULT; 1144 goto out_error; 1145 1146 out_error: 1147 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); 1148 return ret; 1149 } 1150 1151 /* 1152 * Lookup the task for the TID provided from user space and attach to 1153 * it after doing proper sanity checks. 1154 */ 1155 static int attach_to_pi_owner(u32 uval, union futex_key *key, 1156 struct futex_pi_state **ps) 1157 { 1158 pid_t pid = uval & FUTEX_TID_MASK; 1159 struct futex_pi_state *pi_state; 1160 struct task_struct *p; 1161 1162 /* 1163 * We are the first waiter - try to look up the real owner and attach 1164 * the new pi_state to it, but bail out when TID = 0 [1] 1165 */ 1166 if (!pid) 1167 return -ESRCH; 1168 p = find_get_task_by_vpid(pid); 1169 if (!p) 1170 return -ESRCH; 1171 1172 if (unlikely(p->flags & PF_KTHREAD)) { 1173 put_task_struct(p); 1174 return -EPERM; 1175 } 1176 1177 /* 1178 * We need to look at the task state flags to figure out, 1179 * whether the task is exiting. To protect against the do_exit 1180 * change of the task flags, we do this protected by 1181 * p->pi_lock: 1182 */ 1183 raw_spin_lock_irq(&p->pi_lock); 1184 if (unlikely(p->flags & PF_EXITING)) { 1185 /* 1186 * The task is on the way out. When PF_EXITPIDONE is 1187 * set, we know that the task has finished the 1188 * cleanup: 1189 */ 1190 int ret = (p->flags & PF_EXITPIDONE) ? -ESRCH : -EAGAIN; 1191 1192 raw_spin_unlock_irq(&p->pi_lock); 1193 put_task_struct(p); 1194 return ret; 1195 } 1196 1197 /* 1198 * No existing pi state. First waiter. [2] 1199 * 1200 * This creates pi_state, we have hb->lock held, this means nothing can 1201 * observe this state, wait_lock is irrelevant. 1202 */ 1203 pi_state = alloc_pi_state(); 1204 1205 /* 1206 * Initialize the pi_mutex in locked state and make @p 1207 * the owner of it: 1208 */ 1209 rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p); 1210 1211 /* Store the key for possible exit cleanups: */ 1212 pi_state->key = *key; 1213 1214 WARN_ON(!list_empty(&pi_state->list)); 1215 list_add(&pi_state->list, &p->pi_state_list); 1216 /* 1217 * Assignment without holding pi_state->pi_mutex.wait_lock is safe 1218 * because there is no concurrency as the object is not published yet. 
1219 */ 1220 pi_state->owner = p; 1221 raw_spin_unlock_irq(&p->pi_lock); 1222 1223 put_task_struct(p); 1224 1225 *ps = pi_state; 1226 1227 return 0; 1228 } 1229 1230 static int lookup_pi_state(u32 __user *uaddr, u32 uval, 1231 struct futex_hash_bucket *hb, 1232 union futex_key *key, struct futex_pi_state **ps) 1233 { 1234 struct futex_q *top_waiter = futex_top_waiter(hb, key); 1235 1236 /* 1237 * If there is a waiter on that futex, validate it and 1238 * attach to the pi_state when the validation succeeds. 1239 */ 1240 if (top_waiter) 1241 return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps); 1242 1243 /* 1244 * We are the first waiter - try to look up the owner based on 1245 * @uval and attach to it. 1246 */ 1247 return attach_to_pi_owner(uval, key, ps); 1248 } 1249 1250 static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval) 1251 { 1252 u32 uninitialized_var(curval); 1253 1254 if (unlikely(should_fail_futex(true))) 1255 return -EFAULT; 1256 1257 if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))) 1258 return -EFAULT; 1259 1260 /* If user space value changed, let the caller retry */ 1261 return curval != uval ? -EAGAIN : 0; 1262 } 1263 1264 /** 1265 * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex 1266 * @uaddr: the pi futex user address 1267 * @hb: the pi futex hash bucket 1268 * @key: the futex key associated with uaddr and hb 1269 * @ps: the pi_state pointer where we store the result of the 1270 * lookup 1271 * @task: the task to perform the atomic lock work for. This will 1272 * be "current" except in the case of requeue pi. 1273 * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) 1274 * 1275 * Return: 1276 * - 0 - ready to wait; 1277 * - 1 - acquired the lock; 1278 * - <0 - error 1279 * 1280 * The hb->lock and futex_key refs shall be held by the caller. 1281 */ 1282 static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, 1283 union futex_key *key, 1284 struct futex_pi_state **ps, 1285 struct task_struct *task, int set_waiters) 1286 { 1287 u32 uval, newval, vpid = task_pid_vnr(task); 1288 struct futex_q *top_waiter; 1289 int ret; 1290 1291 /* 1292 * Read the user space value first so we can validate a few 1293 * things before proceeding further. 1294 */ 1295 if (get_futex_value_locked(&uval, uaddr)) 1296 return -EFAULT; 1297 1298 if (unlikely(should_fail_futex(true))) 1299 return -EFAULT; 1300 1301 /* 1302 * Detect deadlocks. 1303 */ 1304 if ((unlikely((uval & FUTEX_TID_MASK) == vpid))) 1305 return -EDEADLK; 1306 1307 if ((unlikely(should_fail_futex(true)))) 1308 return -EDEADLK; 1309 1310 /* 1311 * Lookup existing state first. If it exists, try to attach to 1312 * its pi_state. 1313 */ 1314 top_waiter = futex_top_waiter(hb, key); 1315 if (top_waiter) 1316 return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps); 1317 1318 /* 1319 * No waiter and user TID is 0. We are here because the 1320 * waiters or the owner died bit is set or called from 1321 * requeue_cmp_pi or for whatever reason something took the 1322 * syscall. 1323 */ 1324 if (!(uval & FUTEX_TID_MASK)) { 1325 /* 1326 * We take over the futex. No other waiters and the user space 1327 * TID is 0. We preserve the owner died bit. 
1328 */ 1329 newval = uval & FUTEX_OWNER_DIED; 1330 newval |= vpid; 1331 1332 /* The futex requeue_pi code can enforce the waiters bit */ 1333 if (set_waiters) 1334 newval |= FUTEX_WAITERS; 1335 1336 ret = lock_pi_update_atomic(uaddr, uval, newval); 1337 /* If the take over worked, return 1 */ 1338 return ret < 0 ? ret : 1; 1339 } 1340 1341 /* 1342 * First waiter. Set the waiters bit before attaching ourself to 1343 * the owner. If owner tries to unlock, it will be forced into 1344 * the kernel and blocked on hb->lock. 1345 */ 1346 newval = uval | FUTEX_WAITERS; 1347 ret = lock_pi_update_atomic(uaddr, uval, newval); 1348 if (ret) 1349 return ret; 1350 /* 1351 * If the update of the user space value succeeded, we try to 1352 * attach to the owner. If that fails, no harm done, we only 1353 * set the FUTEX_WAITERS bit in the user space variable. 1354 */ 1355 return attach_to_pi_owner(uval, key, ps); 1356 } 1357 1358 /** 1359 * __unqueue_futex() - Remove the futex_q from its futex_hash_bucket 1360 * @q: The futex_q to unqueue 1361 * 1362 * The q->lock_ptr must not be NULL and must be held by the caller. 1363 */ 1364 static void __unqueue_futex(struct futex_q *q) 1365 { 1366 struct futex_hash_bucket *hb; 1367 1368 if (WARN_ON_SMP(!q->lock_ptr || !spin_is_locked(q->lock_ptr)) 1369 || WARN_ON(plist_node_empty(&q->list))) 1370 return; 1371 1372 hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock); 1373 plist_del(&q->list, &hb->chain); 1374 hb_waiters_dec(hb); 1375 } 1376 1377 /* 1378 * The hash bucket lock must be held when this is called. 1379 * Afterwards, the futex_q must not be accessed. Callers 1380 * must ensure to later call wake_up_q() for the actual 1381 * wakeups to occur. 1382 */ 1383 static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q) 1384 { 1385 struct task_struct *p = q->task; 1386 1387 if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n")) 1388 return; 1389 1390 /* 1391 * Queue the task for later wakeup for after we've released 1392 * the hb->lock. wake_q_add() grabs reference to p. 1393 */ 1394 wake_q_add(wake_q, p); 1395 __unqueue_futex(q); 1396 /* 1397 * The waiting task can free the futex_q as soon as q->lock_ptr = NULL 1398 * is written, without taking any locks. This is possible in the event 1399 * of a spurious wakeup, for example. A memory barrier is required here 1400 * to prevent the following store to lock_ptr from getting ahead of the 1401 * plist_del in __unqueue_futex(). 1402 */ 1403 smp_store_release(&q->lock_ptr, NULL); 1404 } 1405 1406 /* 1407 * Caller must hold a reference on @pi_state. 1408 */ 1409 static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state) 1410 { 1411 u32 uninitialized_var(curval), newval; 1412 struct task_struct *new_owner; 1413 bool postunlock = false; 1414 DEFINE_WAKE_Q(wake_q); 1415 int ret = 0; 1416 1417 new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); 1418 if (WARN_ON_ONCE(!new_owner)) { 1419 /* 1420 * As per the comment in futex_unlock_pi() this should not happen. 1421 * 1422 * When this happens, give up our locks and try again, giving 1423 * the futex_lock_pi() instance time to complete, either by 1424 * waiting on the rtmutex or removing itself from the futex 1425 * queue. 1426 */ 1427 ret = -EAGAIN; 1428 goto out_unlock; 1429 } 1430 1431 /* 1432 * We pass it to the next owner. The WAITERS bit is always kept 1433 * enabled while there is PI state around. We cleanup the owner 1434 * died bit, because we are the owner. 
1435 */ 1436 newval = FUTEX_WAITERS | task_pid_vnr(new_owner); 1437 1438 if (unlikely(should_fail_futex(true))) 1439 ret = -EFAULT; 1440 1441 if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) { 1442 ret = -EFAULT; 1443 1444 } else if (curval != uval) { 1445 /* 1446 * If a unconditional UNLOCK_PI operation (user space did not 1447 * try the TID->0 transition) raced with a waiter setting the 1448 * FUTEX_WAITERS flag between get_user() and locking the hash 1449 * bucket lock, retry the operation. 1450 */ 1451 if ((FUTEX_TID_MASK & curval) == uval) 1452 ret = -EAGAIN; 1453 else 1454 ret = -EINVAL; 1455 } 1456 1457 if (ret) 1458 goto out_unlock; 1459 1460 /* 1461 * This is a point of no return; once we modify the uval there is no 1462 * going back and subsequent operations must not fail. 1463 */ 1464 1465 raw_spin_lock(&pi_state->owner->pi_lock); 1466 WARN_ON(list_empty(&pi_state->list)); 1467 list_del_init(&pi_state->list); 1468 raw_spin_unlock(&pi_state->owner->pi_lock); 1469 1470 raw_spin_lock(&new_owner->pi_lock); 1471 WARN_ON(!list_empty(&pi_state->list)); 1472 list_add(&pi_state->list, &new_owner->pi_state_list); 1473 pi_state->owner = new_owner; 1474 raw_spin_unlock(&new_owner->pi_lock); 1475 1476 postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q); 1477 1478 out_unlock: 1479 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); 1480 1481 if (postunlock) 1482 rt_mutex_postunlock(&wake_q); 1483 1484 return ret; 1485 } 1486 1487 /* 1488 * Express the locking dependencies for lockdep: 1489 */ 1490 static inline void 1491 double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) 1492 { 1493 if (hb1 <= hb2) { 1494 spin_lock(&hb1->lock); 1495 if (hb1 < hb2) 1496 spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING); 1497 } else { /* hb1 > hb2 */ 1498 spin_lock(&hb2->lock); 1499 spin_lock_nested(&hb1->lock, SINGLE_DEPTH_NESTING); 1500 } 1501 } 1502 1503 static inline void 1504 double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) 1505 { 1506 spin_unlock(&hb1->lock); 1507 if (hb1 != hb2) 1508 spin_unlock(&hb2->lock); 1509 } 1510 1511 /* 1512 * Wake up waiters matching bitset queued on this futex (uaddr). 
1513 */ 1514 static int 1515 futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) 1516 { 1517 struct futex_hash_bucket *hb; 1518 struct futex_q *this, *next; 1519 union futex_key key = FUTEX_KEY_INIT; 1520 int ret; 1521 DEFINE_WAKE_Q(wake_q); 1522 1523 if (!bitset) 1524 return -EINVAL; 1525 1526 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_READ); 1527 if (unlikely(ret != 0)) 1528 goto out; 1529 1530 hb = hash_futex(&key); 1531 1532 /* Make sure we really have tasks to wakeup */ 1533 if (!hb_waiters_pending(hb)) 1534 goto out_put_key; 1535 1536 spin_lock(&hb->lock); 1537 1538 plist_for_each_entry_safe(this, next, &hb->chain, list) { 1539 if (match_futex (&this->key, &key)) { 1540 if (this->pi_state || this->rt_waiter) { 1541 ret = -EINVAL; 1542 break; 1543 } 1544 1545 /* Check if one of the bits is set in both bitsets */ 1546 if (!(this->bitset & bitset)) 1547 continue; 1548 1549 mark_wake_futex(&wake_q, this); 1550 if (++ret >= nr_wake) 1551 break; 1552 } 1553 } 1554 1555 spin_unlock(&hb->lock); 1556 wake_up_q(&wake_q); 1557 out_put_key: 1558 put_futex_key(&key); 1559 out: 1560 return ret; 1561 } 1562 1563 static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr) 1564 { 1565 unsigned int op = (encoded_op & 0x70000000) >> 28; 1566 unsigned int cmp = (encoded_op & 0x0f000000) >> 24; 1567 int oparg = sign_extend32((encoded_op & 0x00fff000) >> 12, 11); 1568 int cmparg = sign_extend32(encoded_op & 0x00000fff, 11); 1569 int oldval, ret; 1570 1571 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) { 1572 if (oparg < 0 || oparg > 31) { 1573 char comm[sizeof(current->comm)]; 1574 /* 1575 * kill this print and return -EINVAL when userspace 1576 * is sane again 1577 */ 1578 pr_info_ratelimited("futex_wake_op: %s tries to shift op by %d; fix this program\n", 1579 get_task_comm(comm, current), oparg); 1580 oparg &= 31; 1581 } 1582 oparg = 1 << oparg; 1583 } 1584 1585 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) 1586 return -EFAULT; 1587 1588 ret = arch_futex_atomic_op_inuser(op, oparg, &oldval, uaddr); 1589 if (ret) 1590 return ret; 1591 1592 switch (cmp) { 1593 case FUTEX_OP_CMP_EQ: 1594 return oldval == cmparg; 1595 case FUTEX_OP_CMP_NE: 1596 return oldval != cmparg; 1597 case FUTEX_OP_CMP_LT: 1598 return oldval < cmparg; 1599 case FUTEX_OP_CMP_GE: 1600 return oldval >= cmparg; 1601 case FUTEX_OP_CMP_LE: 1602 return oldval <= cmparg; 1603 case FUTEX_OP_CMP_GT: 1604 return oldval > cmparg; 1605 default: 1606 return -ENOSYS; 1607 } 1608 } 1609 1610 /* 1611 * Wake up all waiters hashed on the physical page that is mapped 1612 * to this virtual address: 1613 */ 1614 static int 1615 futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, 1616 int nr_wake, int nr_wake2, int op) 1617 { 1618 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; 1619 struct futex_hash_bucket *hb1, *hb2; 1620 struct futex_q *this, *next; 1621 int ret, op_ret; 1622 DEFINE_WAKE_Q(wake_q); 1623 1624 retry: 1625 ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ); 1626 if (unlikely(ret != 0)) 1627 goto out; 1628 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE); 1629 if (unlikely(ret != 0)) 1630 goto out_put_key1; 1631 1632 hb1 = hash_futex(&key1); 1633 hb2 = hash_futex(&key2); 1634 1635 retry_private: 1636 double_lock_hb(hb1, hb2); 1637 op_ret = futex_atomic_op_inuser(op, uaddr2); 1638 if (unlikely(op_ret < 0)) { 1639 1640 double_unlock_hb(hb1, hb2); 1641 1642 #ifndef CONFIG_MMU 1643 /* 1644 * we don't get 
EFAULT from MMU faults if we don't have an MMU, 1645 * but we might get them from range checking 1646 */ 1647 ret = op_ret; 1648 goto out_put_keys; 1649 #endif 1650 1651 if (unlikely(op_ret != -EFAULT)) { 1652 ret = op_ret; 1653 goto out_put_keys; 1654 } 1655 1656 ret = fault_in_user_writeable(uaddr2); 1657 if (ret) 1658 goto out_put_keys; 1659 1660 if (!(flags & FLAGS_SHARED)) 1661 goto retry_private; 1662 1663 put_futex_key(&key2); 1664 put_futex_key(&key1); 1665 goto retry; 1666 } 1667 1668 plist_for_each_entry_safe(this, next, &hb1->chain, list) { 1669 if (match_futex (&this->key, &key1)) { 1670 if (this->pi_state || this->rt_waiter) { 1671 ret = -EINVAL; 1672 goto out_unlock; 1673 } 1674 mark_wake_futex(&wake_q, this); 1675 if (++ret >= nr_wake) 1676 break; 1677 } 1678 } 1679 1680 if (op_ret > 0) { 1681 op_ret = 0; 1682 plist_for_each_entry_safe(this, next, &hb2->chain, list) { 1683 if (match_futex (&this->key, &key2)) { 1684 if (this->pi_state || this->rt_waiter) { 1685 ret = -EINVAL; 1686 goto out_unlock; 1687 } 1688 mark_wake_futex(&wake_q, this); 1689 if (++op_ret >= nr_wake2) 1690 break; 1691 } 1692 } 1693 ret += op_ret; 1694 } 1695 1696 out_unlock: 1697 double_unlock_hb(hb1, hb2); 1698 wake_up_q(&wake_q); 1699 out_put_keys: 1700 put_futex_key(&key2); 1701 out_put_key1: 1702 put_futex_key(&key1); 1703 out: 1704 return ret; 1705 } 1706 1707 /** 1708 * requeue_futex() - Requeue a futex_q from one hb to another 1709 * @q: the futex_q to requeue 1710 * @hb1: the source hash_bucket 1711 * @hb2: the target hash_bucket 1712 * @key2: the new key for the requeued futex_q 1713 */ 1714 static inline 1715 void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, 1716 struct futex_hash_bucket *hb2, union futex_key *key2) 1717 { 1718 1719 /* 1720 * If key1 and key2 hash to the same bucket, no need to 1721 * requeue. 1722 */ 1723 if (likely(&hb1->chain != &hb2->chain)) { 1724 plist_del(&q->list, &hb1->chain); 1725 hb_waiters_dec(hb1); 1726 hb_waiters_inc(hb2); 1727 plist_add(&q->list, &hb2->chain); 1728 q->lock_ptr = &hb2->lock; 1729 } 1730 get_futex_key_refs(key2); 1731 q->key = *key2; 1732 } 1733 1734 /** 1735 * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue 1736 * @q: the futex_q 1737 * @key: the key of the requeue target futex 1738 * @hb: the hash_bucket of the requeue target futex 1739 * 1740 * During futex_requeue, with requeue_pi=1, it is possible to acquire the 1741 * target futex if it is uncontended or via a lock steal. Set the futex_q key 1742 * to the requeue target futex so the waiter can detect the wakeup on the right 1743 * futex, but remove it from the hb and NULL the rt_waiter so it can detect 1744 * atomic lock acquisition. Set the q->lock_ptr to the requeue target hb->lock 1745 * to protect access to the pi_state to fixup the owner later. Must be called 1746 * with both q->lock_ptr and hb->lock held. 
1747 */ 1748 static inline 1749 void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, 1750 struct futex_hash_bucket *hb) 1751 { 1752 get_futex_key_refs(key); 1753 q->key = *key; 1754 1755 __unqueue_futex(q); 1756 1757 WARN_ON(!q->rt_waiter); 1758 q->rt_waiter = NULL; 1759 1760 q->lock_ptr = &hb->lock; 1761 1762 wake_up_state(q->task, TASK_NORMAL); 1763 } 1764 1765 /** 1766 * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter 1767 * @pifutex: the user address of the to futex 1768 * @hb1: the from futex hash bucket, must be locked by the caller 1769 * @hb2: the to futex hash bucket, must be locked by the caller 1770 * @key1: the from futex key 1771 * @key2: the to futex key 1772 * @ps: address to store the pi_state pointer 1773 * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) 1774 * 1775 * Try and get the lock on behalf of the top waiter if we can do it atomically. 1776 * Wake the top waiter if we succeed. If the caller specified set_waiters, 1777 * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit. 1778 * hb1 and hb2 must be held by the caller. 1779 * 1780 * Return: 1781 * - 0 - failed to acquire the lock atomically; 1782 * - >0 - acquired the lock, return value is vpid of the top_waiter 1783 * - <0 - error 1784 */ 1785 static int futex_proxy_trylock_atomic(u32 __user *pifutex, 1786 struct futex_hash_bucket *hb1, 1787 struct futex_hash_bucket *hb2, 1788 union futex_key *key1, union futex_key *key2, 1789 struct futex_pi_state **ps, int set_waiters) 1790 { 1791 struct futex_q *top_waiter = NULL; 1792 u32 curval; 1793 int ret, vpid; 1794 1795 if (get_futex_value_locked(&curval, pifutex)) 1796 return -EFAULT; 1797 1798 if (unlikely(should_fail_futex(true))) 1799 return -EFAULT; 1800 1801 /* 1802 * Find the top_waiter and determine if there are additional waiters. 1803 * If the caller intends to requeue more than 1 waiter to pifutex, 1804 * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now, 1805 * as we have means to handle the possible fault. If not, don't set 1806 * the bit unecessarily as it will force the subsequent unlock to enter 1807 * the kernel. 1808 */ 1809 top_waiter = futex_top_waiter(hb1, key1); 1810 1811 /* There are no waiters, nothing for us to do. */ 1812 if (!top_waiter) 1813 return 0; 1814 1815 /* Ensure we requeue to the expected futex. */ 1816 if (!match_futex(top_waiter->requeue_pi_key, key2)) 1817 return -EINVAL; 1818 1819 /* 1820 * Try to take the lock for top_waiter. Set the FUTEX_WAITERS bit in 1821 * the contended case or if set_waiters is 1. The pi_state is returned 1822 * in ps in contended cases. 1823 */ 1824 vpid = task_pid_vnr(top_waiter->task); 1825 ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task, 1826 set_waiters); 1827 if (ret == 1) { 1828 requeue_pi_wake_futex(top_waiter, key2, hb2); 1829 return vpid; 1830 } 1831 return ret; 1832 } 1833 1834 /** 1835 * futex_requeue() - Requeue waiters from uaddr1 to uaddr2 1836 * @uaddr1: source futex user address 1837 * @flags: futex flags (FLAGS_SHARED, etc.) 1838 * @uaddr2: target futex user address 1839 * @nr_wake: number of waiters to wake (must be 1 for requeue_pi) 1840 * @nr_requeue: number of waiters to requeue (0-INT_MAX) 1841 * @cmpval: @uaddr1 expected value (or %NULL) 1842 * @requeue_pi: if we are attempting to requeue from a non-pi futex to a 1843 * pi futex (pi to pi requeue is not supported) 1844 * 1845 * Requeue waiters on uaddr1 to uaddr2. 
In the requeue_pi case, try to acquire 1846 * uaddr2 atomically on behalf of the top waiter. 1847 * 1848 * Return: 1849 * - >=0 - on success, the number of tasks requeued or woken; 1850 * - <0 - on error 1851 */ 1852 static int futex_requeue(u32 __user *uaddr1, unsigned int flags, 1853 u32 __user *uaddr2, int nr_wake, int nr_requeue, 1854 u32 *cmpval, int requeue_pi) 1855 { 1856 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; 1857 int drop_count = 0, task_count = 0, ret; 1858 struct futex_pi_state *pi_state = NULL; 1859 struct futex_hash_bucket *hb1, *hb2; 1860 struct futex_q *this, *next; 1861 DEFINE_WAKE_Q(wake_q); 1862 1863 if (nr_wake < 0 || nr_requeue < 0) 1864 return -EINVAL; 1865 1866 /* 1867 * When PI not supported: return -ENOSYS if requeue_pi is true, 1868 * consequently the compiler knows requeue_pi is always false past 1869 * this point which will optimize away all the conditional code 1870 * further down. 1871 */ 1872 if (!IS_ENABLED(CONFIG_FUTEX_PI) && requeue_pi) 1873 return -ENOSYS; 1874 1875 if (requeue_pi) { 1876 /* 1877 * Requeue PI only works on two distinct uaddrs. This 1878 * check is only valid for private futexes. See below. 1879 */ 1880 if (uaddr1 == uaddr2) 1881 return -EINVAL; 1882 1883 /* 1884 * requeue_pi requires a pi_state, try to allocate it now 1885 * without any locks in case it fails. 1886 */ 1887 if (refill_pi_state_cache()) 1888 return -ENOMEM; 1889 /* 1890 * requeue_pi must wake as many tasks as it can, up to nr_wake 1891 * + nr_requeue, since it acquires the rt_mutex prior to 1892 * returning to userspace, so as to not leave the rt_mutex with 1893 * waiters and no owner. However, second and third wake-ups 1894 * cannot be predicted as they involve race conditions with the 1895 * first wake and a fault while looking up the pi_state. Both 1896 * pthread_cond_signal() and pthread_cond_broadcast() should 1897 * use nr_wake=1. 1898 */ 1899 if (nr_wake != 1) 1900 return -EINVAL; 1901 } 1902 1903 retry: 1904 ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ); 1905 if (unlikely(ret != 0)) 1906 goto out; 1907 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, 1908 requeue_pi ? VERIFY_WRITE : VERIFY_READ); 1909 if (unlikely(ret != 0)) 1910 goto out_put_key1; 1911 1912 /* 1913 * The check above which compares uaddrs is not sufficient for 1914 * shared futexes. We need to compare the keys: 1915 */ 1916 if (requeue_pi && match_futex(&key1, &key2)) { 1917 ret = -EINVAL; 1918 goto out_put_keys; 1919 } 1920 1921 hb1 = hash_futex(&key1); 1922 hb2 = hash_futex(&key2); 1923 1924 retry_private: 1925 hb_waiters_inc(hb2); 1926 double_lock_hb(hb1, hb2); 1927 1928 if (likely(cmpval != NULL)) { 1929 u32 curval; 1930 1931 ret = get_futex_value_locked(&curval, uaddr1); 1932 1933 if (unlikely(ret)) { 1934 double_unlock_hb(hb1, hb2); 1935 hb_waiters_dec(hb2); 1936 1937 ret = get_user(curval, uaddr1); 1938 if (ret) 1939 goto out_put_keys; 1940 1941 if (!(flags & FLAGS_SHARED)) 1942 goto retry_private; 1943 1944 put_futex_key(&key2); 1945 put_futex_key(&key1); 1946 goto retry; 1947 } 1948 if (curval != *cmpval) { 1949 ret = -EAGAIN; 1950 goto out_unlock; 1951 } 1952 } 1953 1954 if (requeue_pi && (task_count - nr_wake < nr_requeue)) { 1955 /* 1956 * Attempt to acquire uaddr2 and wake the top waiter. If we 1957 * intend to requeue waiters, force setting the FUTEX_WAITERS 1958 * bit. We force this here where we are able to easily handle 1959 * faults rather in the requeue loop below. 
1960 */ 1961 ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1, 1962 &key2, &pi_state, nr_requeue); 1963 1964 /* 1965 * At this point the top_waiter has either taken uaddr2 or is 1966 * waiting on it. If the former, then the pi_state will not 1967 * exist yet, look it up one more time to ensure we have a 1968 * reference to it. If the lock was taken, ret contains the 1969 * vpid of the top waiter task. 1970 * If the lock was not taken, we have pi_state and an initial 1971 * refcount on it. In case of an error we have nothing. 1972 */ 1973 if (ret > 0) { 1974 WARN_ON(pi_state); 1975 drop_count++; 1976 task_count++; 1977 /* 1978 * If we acquired the lock, then the user space value 1979 * of uaddr2 should be vpid. It cannot be changed by 1980 * the top waiter as it is blocked on hb2 lock if it 1981 * tries to do so. If something fiddled with it behind 1982 * our back the pi state lookup might unearth it. So 1983 * we rather use the known value than rereading and 1984 * handing potential crap to lookup_pi_state. 1985 * 1986 * If that call succeeds then we have pi_state and an 1987 * initial refcount on it. 1988 */ 1989 ret = lookup_pi_state(uaddr2, ret, hb2, &key2, &pi_state); 1990 } 1991 1992 switch (ret) { 1993 case 0: 1994 /* We hold a reference on the pi state. */ 1995 break; 1996 1997 /* If the above failed, then pi_state is NULL */ 1998 case -EFAULT: 1999 double_unlock_hb(hb1, hb2); 2000 hb_waiters_dec(hb2); 2001 put_futex_key(&key2); 2002 put_futex_key(&key1); 2003 ret = fault_in_user_writeable(uaddr2); 2004 if (!ret) 2005 goto retry; 2006 goto out; 2007 case -EAGAIN: 2008 /* 2009 * Two reasons for this: 2010 * - Owner is exiting and we just wait for the 2011 * exit to complete. 2012 * - The user space value changed. 2013 */ 2014 double_unlock_hb(hb1, hb2); 2015 hb_waiters_dec(hb2); 2016 put_futex_key(&key2); 2017 put_futex_key(&key1); 2018 cond_resched(); 2019 goto retry; 2020 default: 2021 goto out_unlock; 2022 } 2023 } 2024 2025 plist_for_each_entry_safe(this, next, &hb1->chain, list) { 2026 if (task_count - nr_wake >= nr_requeue) 2027 break; 2028 2029 if (!match_futex(&this->key, &key1)) 2030 continue; 2031 2032 /* 2033 * FUTEX_WAIT_REQEUE_PI and FUTEX_CMP_REQUEUE_PI should always 2034 * be paired with each other and no other futex ops. 2035 * 2036 * We should never be requeueing a futex_q with a pi_state, 2037 * which is awaiting a futex_unlock_pi(). 2038 */ 2039 if ((requeue_pi && !this->rt_waiter) || 2040 (!requeue_pi && this->rt_waiter) || 2041 this->pi_state) { 2042 ret = -EINVAL; 2043 break; 2044 } 2045 2046 /* 2047 * Wake nr_wake waiters. For requeue_pi, if we acquired the 2048 * lock, we already woke the top_waiter. If not, it will be 2049 * woken by futex_unlock_pi(). 2050 */ 2051 if (++task_count <= nr_wake && !requeue_pi) { 2052 mark_wake_futex(&wake_q, this); 2053 continue; 2054 } 2055 2056 /* Ensure we requeue to the expected futex for requeue_pi. */ 2057 if (requeue_pi && !match_futex(this->requeue_pi_key, &key2)) { 2058 ret = -EINVAL; 2059 break; 2060 } 2061 2062 /* 2063 * Requeue nr_requeue waiters and possibly one more in the case 2064 * of requeue_pi if we couldn't acquire the lock atomically. 2065 */ 2066 if (requeue_pi) { 2067 /* 2068 * Prepare the waiter to take the rt_mutex. Take a 2069 * refcount on the pi_state and store the pointer in 2070 * the futex_q object of the waiter. 
2071 */ 2072 get_pi_state(pi_state); 2073 this->pi_state = pi_state; 2074 ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex, 2075 this->rt_waiter, 2076 this->task); 2077 if (ret == 1) { 2078 /* 2079 * We got the lock. We do neither drop the 2080 * refcount on pi_state nor clear 2081 * this->pi_state because the waiter needs the 2082 * pi_state for cleaning up the user space 2083 * value. It will drop the refcount after 2084 * doing so. 2085 */ 2086 requeue_pi_wake_futex(this, &key2, hb2); 2087 drop_count++; 2088 continue; 2089 } else if (ret) { 2090 /* 2091 * rt_mutex_start_proxy_lock() detected a 2092 * potential deadlock when we tried to queue 2093 * that waiter. Drop the pi_state reference 2094 * which we took above and remove the pointer 2095 * to the state from the waiters futex_q 2096 * object. 2097 */ 2098 this->pi_state = NULL; 2099 put_pi_state(pi_state); 2100 /* 2101 * We stop queueing more waiters and let user 2102 * space deal with the mess. 2103 */ 2104 break; 2105 } 2106 } 2107 requeue_futex(this, hb1, hb2, &key2); 2108 drop_count++; 2109 } 2110 2111 /* 2112 * We took an extra initial reference to the pi_state either 2113 * in futex_proxy_trylock_atomic() or in lookup_pi_state(). We 2114 * need to drop it here again. 2115 */ 2116 put_pi_state(pi_state); 2117 2118 out_unlock: 2119 double_unlock_hb(hb1, hb2); 2120 wake_up_q(&wake_q); 2121 hb_waiters_dec(hb2); 2122 2123 /* 2124 * drop_futex_key_refs() must be called outside the spinlocks. During 2125 * the requeue we moved futex_q's from the hash bucket at key1 to the 2126 * one at key2 and updated their key pointer. We no longer need to 2127 * hold the references to key1. 2128 */ 2129 while (--drop_count >= 0) 2130 drop_futex_key_refs(&key1); 2131 2132 out_put_keys: 2133 put_futex_key(&key2); 2134 out_put_key1: 2135 put_futex_key(&key1); 2136 out: 2137 return ret ? ret : task_count; 2138 } 2139 2140 /* The key must be already stored in q->key. */ 2141 static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) 2142 __acquires(&hb->lock) 2143 { 2144 struct futex_hash_bucket *hb; 2145 2146 hb = hash_futex(&q->key); 2147 2148 /* 2149 * Increment the counter before taking the lock so that 2150 * a potential waker won't miss a to-be-slept task that is 2151 * waiting for the spinlock. This is safe as all queue_lock() 2152 * users end up calling queue_me(). Similarly, for housekeeping, 2153 * decrement the counter at queue_unlock() when some error has 2154 * occurred and we don't end up adding the task to the list. 2155 */ 2156 hb_waiters_inc(hb); 2157 2158 q->lock_ptr = &hb->lock; 2159 2160 spin_lock(&hb->lock); /* implies smp_mb(); (A) */ 2161 return hb; 2162 } 2163 2164 static inline void 2165 queue_unlock(struct futex_hash_bucket *hb) 2166 __releases(&hb->lock) 2167 { 2168 spin_unlock(&hb->lock); 2169 hb_waiters_dec(hb); 2170 } 2171 2172 static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb) 2173 { 2174 int prio; 2175 2176 /* 2177 * The priority used to register this element is 2178 * - either the real thread-priority for the real-time threads 2179 * (i.e. threads with a priority lower than MAX_RT_PRIO) 2180 * - or MAX_RT_PRIO for non-RT threads. 2181 * Thus, all RT-threads are woken first in priority order, and 2182 * the others are woken last, in FIFO order. 
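 *
 * Worked example (assuming the usual MAX_RT_PRIO of 100): a SCHED_FIFO
 * waiter with rtprio 10 has normal_prio 89 and is queued at plist
 * priority 89, while every SCHED_OTHER waiter is queued at plist
 * priority 100. The plist sorts lower values first, so the RT waiter
 * is woken before any of the FIFO-ordered non-RT waiters.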
2183 */ 2184 prio = min(current->normal_prio, MAX_RT_PRIO); 2185 2186 plist_node_init(&q->list, prio); 2187 plist_add(&q->list, &hb->chain); 2188 q->task = current; 2189 } 2190 2191 /** 2192 * queue_me() - Enqueue the futex_q on the futex_hash_bucket 2193 * @q: The futex_q to enqueue 2194 * @hb: The destination hash bucket 2195 * 2196 * The hb->lock must be held by the caller, and is released here. A call to 2197 * queue_me() is typically paired with exactly one call to unqueue_me(). The 2198 * exceptions involve the PI related operations, which may use unqueue_me_pi() 2199 * or nothing if the unqueue is done as part of the wake process and the unqueue 2200 * state is implicit in the state of the woken task (see futex_wait_requeue_pi() for 2201 * an example). 2202 */ 2203 static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) 2204 __releases(&hb->lock) 2205 { 2206 __queue_me(q, hb); 2207 spin_unlock(&hb->lock); 2208 } 2209 2210 /** 2211 * unqueue_me() - Remove the futex_q from its futex_hash_bucket 2212 * @q: The futex_q to unqueue 2213 * 2214 * The q->lock_ptr must not be held by the caller. A call to unqueue_me() must 2215 * be paired with exactly one earlier call to queue_me(). 2216 * 2217 * Return: 2218 * - 1 - if the futex_q was still queued (and we unqueued it); 2219 * - 0 - if the futex_q was already removed by the waking thread 2220 */ 2221 static int unqueue_me(struct futex_q *q) 2222 { 2223 spinlock_t *lock_ptr; 2224 int ret = 0; 2225 2226 /* In the common case we don't take the spinlock, which is nice. */ 2227 retry: 2228 /* 2229 * q->lock_ptr can change between this read and the following spin_lock. 2230 * Use READ_ONCE to forbid the compiler from reloading q->lock_ptr and 2231 * optimizing lock_ptr out of the logic below. 2232 */ 2233 lock_ptr = READ_ONCE(q->lock_ptr); 2234 if (lock_ptr != NULL) { 2235 spin_lock(lock_ptr); 2236 /* 2237 * q->lock_ptr can change between reading it and 2238 * spin_lock(), causing us to take the wrong lock. This 2239 * corrects the race condition. 2240 * 2241 * Reasoning goes like this: if we have the wrong lock, 2242 * q->lock_ptr must have changed (maybe several times) 2243 * between reading it and the spin_lock(). It can 2244 * change again after the spin_lock() but only if it was 2245 * already changed before the spin_lock(). It cannot, 2246 * however, change back to the original value. Therefore 2247 * we can detect whether we acquired the correct lock. 2248 */ 2249 if (unlikely(lock_ptr != q->lock_ptr)) { 2250 spin_unlock(lock_ptr); 2251 goto retry; 2252 } 2253 __unqueue_futex(q); 2254 2255 BUG_ON(q->pi_state); 2256 2257 spin_unlock(lock_ptr); 2258 ret = 1; 2259 } 2260 2261 drop_futex_key_refs(&q->key); 2262 return ret; 2263 } 2264 2265 /* 2266 * PI futexes cannot be requeued and must remove themselves from the 2267 * hash bucket. The hash bucket lock (i.e. lock_ptr) is held on entry 2268 * and dropped here.
2269 */ 2270 static void unqueue_me_pi(struct futex_q *q) 2271 __releases(q->lock_ptr) 2272 { 2273 __unqueue_futex(q); 2274 2275 BUG_ON(!q->pi_state); 2276 put_pi_state(q->pi_state); 2277 q->pi_state = NULL; 2278 2279 spin_unlock(q->lock_ptr); 2280 } 2281 2282 static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, 2283 struct task_struct *argowner) 2284 { 2285 struct futex_pi_state *pi_state = q->pi_state; 2286 u32 uval, uninitialized_var(curval), newval; 2287 struct task_struct *oldowner, *newowner; 2288 u32 newtid; 2289 int ret; 2290 2291 lockdep_assert_held(q->lock_ptr); 2292 2293 raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); 2294 2295 oldowner = pi_state->owner; 2296 2297 /* 2298 * We are here because either: 2299 * 2300 * - we stole the lock and pi_state->owner needs updating to reflect 2301 * that (@argowner == current), 2302 * 2303 * or: 2304 * 2305 * - someone stole our lock and we need to fix things to point to the 2306 * new owner (@argowner == NULL). 2307 * 2308 * Either way, we have to replace the TID in the user space variable. 2309 * This must be atomic as we have to preserve the owner died bit here. 2310 * 2311 * Note: We write the user space value _before_ changing the pi_state 2312 * because we can fault here. Imagine swapped out pages or a fork 2313 * that marked all the anonymous memory readonly for cow. 2314 * 2315 * Modifying pi_state _before_ the user space value would leave the 2316 * pi_state in an inconsistent state when we fault here, because we 2317 * need to drop the locks to handle the fault. This might be observed 2318 * in the PID check in lookup_pi_state. 2319 */ 2320 retry: 2321 if (!argowner) { 2322 if (oldowner != current) { 2323 /* 2324 * We raced against a concurrent self; things are 2325 * already fixed up. Nothing to do. 2326 */ 2327 ret = 0; 2328 goto out_unlock; 2329 } 2330 2331 if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) { 2332 /* We got the lock after all, nothing to fix. */ 2333 ret = 0; 2334 goto out_unlock; 2335 } 2336 2337 /* 2338 * Since we just failed the trylock; there must be an owner. 2339 */ 2340 newowner = rt_mutex_owner(&pi_state->pi_mutex); 2341 BUG_ON(!newowner); 2342 } else { 2343 WARN_ON_ONCE(argowner != current); 2344 if (oldowner == current) { 2345 /* 2346 * We raced against a concurrent self; things are 2347 * already fixed up. Nothing to do. 2348 */ 2349 ret = 0; 2350 goto out_unlock; 2351 } 2352 newowner = argowner; 2353 } 2354 2355 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; 2356 /* Owner died? */ 2357 if (!pi_state->owner) 2358 newtid |= FUTEX_OWNER_DIED; 2359 2360 if (get_futex_value_locked(&uval, uaddr)) 2361 goto handle_fault; 2362 2363 for (;;) { 2364 newval = (uval & FUTEX_OWNER_DIED) | newtid; 2365 2366 if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) 2367 goto handle_fault; 2368 if (curval == uval) 2369 break; 2370 uval = curval; 2371 } 2372 2373 /* 2374 * We fixed up user space. Now we need to fix the pi_state 2375 * itself. 
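 *
 * (A concrete illustration of the loop above, with example values not
 * taken from this code: if the stale value was 0x80000457, i.e.
 * FUTEX_WAITERS | TID 1111, and the new owner's TID is 1234, the
 * cmpxchg installs 0x800004d2, that is FUTEX_WAITERS | TID 1234,
 * carrying over only a possibly set FUTEX_OWNER_DIED bit from the old
 * value.)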
2376 */ 2377 if (pi_state->owner != NULL) { 2378 raw_spin_lock(&pi_state->owner->pi_lock); 2379 WARN_ON(list_empty(&pi_state->list)); 2380 list_del_init(&pi_state->list); 2381 raw_spin_unlock(&pi_state->owner->pi_lock); 2382 } 2383 2384 pi_state->owner = newowner; 2385 2386 raw_spin_lock(&newowner->pi_lock); 2387 WARN_ON(!list_empty(&pi_state->list)); 2388 list_add(&pi_state->list, &newowner->pi_state_list); 2389 raw_spin_unlock(&newowner->pi_lock); 2390 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); 2391 2392 return 0; 2393 2394 /* 2395 * To handle the page fault we need to drop the locks here. That gives 2396 * the other task (either the highest priority waiter itself or the 2397 * task which stole the rtmutex) the chance to try the fixup of the 2398 * pi_state. So once we are back from handling the fault we need to 2399 * check the pi_state after reacquiring the locks and before trying to 2400 * do another fixup. When the fixup has been done already we simply 2401 * return. 2402 * 2403 * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely 2404 * drop hb->lock since the caller owns the hb -> futex_q relation. 2405 * Dropping the pi_mutex->wait_lock requires the state revalidate. 2406 */ 2407 handle_fault: 2408 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); 2409 spin_unlock(q->lock_ptr); 2410 2411 ret = fault_in_user_writeable(uaddr); 2412 2413 spin_lock(q->lock_ptr); 2414 raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); 2415 2416 /* 2417 * Check if someone else fixed it for us: 2418 */ 2419 if (pi_state->owner != oldowner) { 2420 ret = 0; 2421 goto out_unlock; 2422 } 2423 2424 if (ret) 2425 goto out_unlock; 2426 2427 goto retry; 2428 2429 out_unlock: 2430 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); 2431 return ret; 2432 } 2433 2434 static long futex_wait_restart(struct restart_block *restart); 2435 2436 /** 2437 * fixup_owner() - Post lock pi_state and corner case management 2438 * @uaddr: user address of the futex 2439 * @q: futex_q (contains pi_state and access to the rt_mutex) 2440 * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0) 2441 * 2442 * After attempting to lock an rt_mutex, this function is called to cleanup 2443 * the pi_state owner as well as handle race conditions that may allow us to 2444 * acquire the lock. Must be called with the hb lock held. 2445 * 2446 * Return: 2447 * - 1 - success, lock taken; 2448 * - 0 - success, lock not taken; 2449 * - <0 - on error (-EFAULT) 2450 */ 2451 static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) 2452 { 2453 int ret = 0; 2454 2455 if (locked) { 2456 /* 2457 * Got the lock. We might not be the anticipated owner if we 2458 * did a lock-steal - fix up the PI-state in that case: 2459 * 2460 * Speculative pi_state->owner read (we don't hold wait_lock); 2461 * since we own the lock pi_state->owner == current is the 2462 * stable state, anything else needs more attention. 2463 */ 2464 if (q->pi_state->owner != current) 2465 ret = fixup_pi_state_owner(uaddr, q, current); 2466 goto out; 2467 } 2468 2469 /* 2470 * If we didn't get the lock; check if anybody stole it from us. In 2471 * that case, we need to fix up the uval to point to them instead of 2472 * us, otherwise bad things happen. [10] 2473 * 2474 * Another speculative read; pi_state->owner == current is unstable 2475 * but needs our attention. 2476 */ 2477 if (q->pi_state->owner == current) { 2478 ret = fixup_pi_state_owner(uaddr, q, NULL); 2479 goto out; 2480 } 2481 2482 /* 2483 * Paranoia check. 
If we did not take the lock, then we should not be 2484 * the owner of the rt_mutex. 2485 */ 2486 if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) { 2487 printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p " 2488 "pi-state %p\n", ret, 2489 q->pi_state->pi_mutex.owner, 2490 q->pi_state->owner); 2491 } 2492 2493 out: 2494 return ret ? ret : locked; 2495 } 2496 2497 /** 2498 * futex_wait_queue_me() - queue_me() and wait for wakeup, timeout, or signal 2499 * @hb: the futex hash bucket, must be locked by the caller 2500 * @q: the futex_q to queue up on 2501 * @timeout: the prepared hrtimer_sleeper, or null for no timeout 2502 */ 2503 static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, 2504 struct hrtimer_sleeper *timeout) 2505 { 2506 /* 2507 * The task state is guaranteed to be set before another task can 2508 * wake it. set_current_state() is implemented using smp_store_mb() and 2509 * queue_me() calls spin_unlock() upon completion, both serializing 2510 * access to the hash list and forcing another memory barrier. 2511 */ 2512 set_current_state(TASK_INTERRUPTIBLE); 2513 queue_me(q, hb); 2514 2515 /* Arm the timer */ 2516 if (timeout) 2517 hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS); 2518 2519 /* 2520 * If we have been removed from the hash list, then another task 2521 * has tried to wake us, and we can skip the call to schedule(). 2522 */ 2523 if (likely(!plist_node_empty(&q->list))) { 2524 /* 2525 * If the timer has already expired, current will already be 2526 * flagged for rescheduling. Only call schedule if there 2527 * is no timeout, or if it has yet to expire. 2528 */ 2529 if (!timeout || timeout->task) 2530 freezable_schedule(); 2531 } 2532 __set_current_state(TASK_RUNNING); 2533 } 2534 2535 /** 2536 * futex_wait_setup() - Prepare to wait on a futex 2537 * @uaddr: the futex userspace address 2538 * @val: the expected value 2539 * @flags: futex flags (FLAGS_SHARED, etc.) 2540 * @q: the associated futex_q 2541 * @hb: storage for hash_bucket pointer to be returned to caller 2542 * 2543 * Setup the futex_q and locate the hash_bucket. Get the futex value and 2544 * compare it with the expected value. Handle atomic faults internally. 2545 * Return with the hb lock held and a q.key reference on success, and unlocked 2546 * with no q.key reference on failure. 2547 * 2548 * Return: 2549 * - 0 - uaddr contains val and hb has been locked; 2550 * - <0 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked 2551 */ 2552 static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, 2553 struct futex_q *q, struct futex_hash_bucket **hb) 2554 { 2555 u32 uval; 2556 int ret; 2557 2558 /* 2559 * Access the page AFTER the hash-bucket is locked. 2560 * Order is important: 2561 * 2562 * Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val); 2563 * Userspace waker: if (cond(var)) { var = new; futex_wake(&var); } 2564 * 2565 * The basic logical guarantee of a futex is that it blocks ONLY 2566 * if cond(var) is known to be true at the time of blocking, for 2567 * any cond. If we locked the hash-bucket after testing *uaddr, that 2568 * would open a race condition where we could block indefinitely with 2569 * cond(var) false, which would violate the guarantee. 2570 * 2571 * On the other hand, we insert q and release the hash-bucket only 2572 * after testing *uaddr. This guarantees that futex_wait() will NOT 2573 * absorb a wakeup if *uaddr does not match the desired values 2574 * while the syscall executes.
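 *
 * Seen from user space, this is also why FUTEX_WAIT callers wrap the
 * syscall in a re-check loop. A minimal raw-syscall sketch (the names,
 * the cond() predicate and the omitted headers and error handling are
 * illustrative assumptions, not part of this file):
 *
 *	for (;;) {
 *		unsigned int val = var;	(plain load of the futex word)
 *		if (!cond(val))
 *			break;		(no reason left to block)
 *		syscall(SYS_futex, &var, FUTEX_WAIT, val, NULL, NULL, 0);
 *		(an EAGAIN/EWOULDBLOCK return only means the value changed
 *		 under us; loop around and re-evaluate cond)
 *	}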
2575 */ 2576 retry: 2577 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, VERIFY_READ); 2578 if (unlikely(ret != 0)) 2579 return ret; 2580 2581 retry_private: 2582 *hb = queue_lock(q); 2583 2584 ret = get_futex_value_locked(&uval, uaddr); 2585 2586 if (ret) { 2587 queue_unlock(*hb); 2588 2589 ret = get_user(uval, uaddr); 2590 if (ret) 2591 goto out; 2592 2593 if (!(flags & FLAGS_SHARED)) 2594 goto retry_private; 2595 2596 put_futex_key(&q->key); 2597 goto retry; 2598 } 2599 2600 if (uval != val) { 2601 queue_unlock(*hb); 2602 ret = -EWOULDBLOCK; 2603 } 2604 2605 out: 2606 if (ret) 2607 put_futex_key(&q->key); 2608 return ret; 2609 } 2610 2611 static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, 2612 ktime_t *abs_time, u32 bitset) 2613 { 2614 struct hrtimer_sleeper timeout, *to = NULL; 2615 struct restart_block *restart; 2616 struct futex_hash_bucket *hb; 2617 struct futex_q q = futex_q_init; 2618 int ret; 2619 2620 if (!bitset) 2621 return -EINVAL; 2622 q.bitset = bitset; 2623 2624 if (abs_time) { 2625 to = &timeout; 2626 2627 hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ? 2628 CLOCK_REALTIME : CLOCK_MONOTONIC, 2629 HRTIMER_MODE_ABS); 2630 hrtimer_init_sleeper(to, current); 2631 hrtimer_set_expires_range_ns(&to->timer, *abs_time, 2632 current->timer_slack_ns); 2633 } 2634 2635 retry: 2636 /* 2637 * Prepare to wait on uaddr. On success, holds hb lock and increments 2638 * q.key refs. 2639 */ 2640 ret = futex_wait_setup(uaddr, val, flags, &q, &hb); 2641 if (ret) 2642 goto out; 2643 2644 /* queue_me and wait for wakeup, timeout, or a signal. */ 2645 futex_wait_queue_me(hb, &q, to); 2646 2647 /* If we were woken (and unqueued), we succeeded, whatever. */ 2648 ret = 0; 2649 /* unqueue_me() drops q.key ref */ 2650 if (!unqueue_me(&q)) 2651 goto out; 2652 ret = -ETIMEDOUT; 2653 if (to && !to->task) 2654 goto out; 2655 2656 /* 2657 * We expect signal_pending(current), but we might be the 2658 * victim of a spurious wakeup as well. 2659 */ 2660 if (!signal_pending(current)) 2661 goto retry; 2662 2663 ret = -ERESTARTSYS; 2664 if (!abs_time) 2665 goto out; 2666 2667 restart = &current->restart_block; 2668 restart->fn = futex_wait_restart; 2669 restart->futex.uaddr = uaddr; 2670 restart->futex.val = val; 2671 restart->futex.time = *abs_time; 2672 restart->futex.bitset = bitset; 2673 restart->futex.flags = flags | FLAGS_HAS_TIMEOUT; 2674 2675 ret = -ERESTART_RESTARTBLOCK; 2676 2677 out: 2678 if (to) { 2679 hrtimer_cancel(&to->timer); 2680 destroy_hrtimer_on_stack(&to->timer); 2681 } 2682 return ret; 2683 } 2684 2685 2686 static long futex_wait_restart(struct restart_block *restart) 2687 { 2688 u32 __user *uaddr = restart->futex.uaddr; 2689 ktime_t t, *tp = NULL; 2690 2691 if (restart->futex.flags & FLAGS_HAS_TIMEOUT) { 2692 t = restart->futex.time; 2693 tp = &t; 2694 } 2695 restart->fn = do_no_restart_syscall; 2696 2697 return (long)futex_wait(uaddr, restart->futex.flags, 2698 restart->futex.val, tp, restart->futex.bitset); 2699 } 2700 2701 2702 /* 2703 * Userspace tried a 0 -> TID atomic transition of the futex value 2704 * and failed. The kernel side here does the whole locking operation: 2705 * if there are waiters then it will block as a consequence of relying 2706 * on rt-mutexes, it does PI, etc. (Due to races the kernel might see 2707 * a 0 value of the futex too.) 2708 * 2709 * Also serves as the FUTEX_TRYLOCK_PI implementation, with trylock semantics.
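 *
 * For reference, the user space fast path that falls back to this
 * function looks roughly like the sketch below (the cmpxchg wrapper,
 * the cached tid and the missing error handling are assumptions for
 * illustration, not part of this file):
 *
 *	lock:
 *		if (cmpxchg(&futex_word, 0, tid) != 0)
 *			syscall(SYS_futex, &futex_word, FUTEX_LOCK_PI,
 *				0, NULL, NULL, 0);
 *	unlock:
 *		if (cmpxchg(&futex_word, tid, 0) != tid)
 *			syscall(SYS_futex, &futex_word, FUTEX_UNLOCK_PI,
 *				0, NULL, NULL, 0);
 *
 * i.e. the kernel is entered only when the uncontended 0 <-> TID
 * transitions fail, typically because FUTEX_WAITERS is set.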
2710 */ 2711 static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, 2712 ktime_t *time, int trylock) 2713 { 2714 struct hrtimer_sleeper timeout, *to = NULL; 2715 struct futex_pi_state *pi_state = NULL; 2716 struct rt_mutex_waiter rt_waiter; 2717 struct futex_hash_bucket *hb; 2718 struct futex_q q = futex_q_init; 2719 int res, ret; 2720 2721 if (!IS_ENABLED(CONFIG_FUTEX_PI)) 2722 return -ENOSYS; 2723 2724 if (refill_pi_state_cache()) 2725 return -ENOMEM; 2726 2727 if (time) { 2728 to = &timeout; 2729 hrtimer_init_on_stack(&to->timer, CLOCK_REALTIME, 2730 HRTIMER_MODE_ABS); 2731 hrtimer_init_sleeper(to, current); 2732 hrtimer_set_expires(&to->timer, *time); 2733 } 2734 2735 retry: 2736 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, VERIFY_WRITE); 2737 if (unlikely(ret != 0)) 2738 goto out; 2739 2740 retry_private: 2741 hb = queue_lock(&q); 2742 2743 ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0); 2744 if (unlikely(ret)) { 2745 /* 2746 * Atomic work succeeded and we got the lock, 2747 * or failed. Either way, we do _not_ block. 2748 */ 2749 switch (ret) { 2750 case 1: 2751 /* We got the lock. */ 2752 ret = 0; 2753 goto out_unlock_put_key; 2754 case -EFAULT: 2755 goto uaddr_faulted; 2756 case -EAGAIN: 2757 /* 2758 * Two reasons for this: 2759 * - Task is exiting and we just wait for the 2760 * exit to complete. 2761 * - The user space value changed. 2762 */ 2763 queue_unlock(hb); 2764 put_futex_key(&q.key); 2765 cond_resched(); 2766 goto retry; 2767 default: 2768 goto out_unlock_put_key; 2769 } 2770 } 2771 2772 WARN_ON(!q.pi_state); 2773 2774 /* 2775 * Only actually queue now that the atomic ops are done: 2776 */ 2777 __queue_me(&q, hb); 2778 2779 if (trylock) { 2780 ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex); 2781 /* Fixup the trylock return value: */ 2782 ret = ret ? 0 : -EWOULDBLOCK; 2783 goto no_block; 2784 } 2785 2786 rt_mutex_init_waiter(&rt_waiter); 2787 2788 /* 2789 * On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not 2790 * hold it while doing rt_mutex_start_proxy_lock(), because then it will 2791 * include hb->lock in the blocking chain, even though we will not in 2792 * fact hold it while blocking. This will lead it to report -EDEADLK 2793 * and BUG when futex_unlock_pi() interleaves with this. 2794 * 2795 * Therefore acquire wait_lock while holding hb->lock, but drop the 2796 * latter before calling rt_mutex_start_proxy_lock(). This still fully 2797 * serializes against futex_unlock_pi() as that does the exact same 2798 * lock handoff sequence. 2799 */ 2800 raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock); 2801 spin_unlock(q.lock_ptr); 2802 ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current); 2803 raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock); 2804 2805 if (ret) { 2806 if (ret == 1) 2807 ret = 0; 2808 2809 spin_lock(q.lock_ptr); 2810 goto no_block; 2811 } 2812 2813 2814 if (unlikely(to)) 2815 hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS); 2816 2817 ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter); 2818 2819 spin_lock(q.lock_ptr); 2820 /* 2821 * If we failed to acquire the lock (signal/timeout), we must 2822 * first acquire the hb->lock before removing the lock from the 2823 * rt_mutex waitqueue, such that we can keep the hb and rt_mutex 2824 * wait lists consistent. 2825 * 2826 * In particular, it is important that futex_unlock_pi() cannot 2827 * observe this inconsistency.
2828 */ 2829 if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter)) 2830 ret = 0; 2831 2832 no_block: 2833 /* 2834 * Fixup the pi_state owner and possibly acquire the lock if we 2835 * haven't already. 2836 */ 2837 res = fixup_owner(uaddr, &q, !ret); 2838 /* 2839 * If fixup_owner() returned an error, propagate that. If it acquired 2840 * the lock, clear our -ETIMEDOUT or -EINTR. 2841 */ 2842 if (res) 2843 ret = (res < 0) ? res : 0; 2844 2845 /* 2846 * If fixup_owner() faulted and was unable to handle the fault, unlock 2847 * it and return the fault to userspace. 2848 */ 2849 if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current)) { 2850 pi_state = q.pi_state; 2851 get_pi_state(pi_state); 2852 } 2853 2854 /* Unqueue and drop the lock */ 2855 unqueue_me_pi(&q); 2856 2857 if (pi_state) { 2858 rt_mutex_futex_unlock(&pi_state->pi_mutex); 2859 put_pi_state(pi_state); 2860 } 2861 2862 goto out_put_key; 2863 2864 out_unlock_put_key: 2865 queue_unlock(hb); 2866 2867 out_put_key: 2868 put_futex_key(&q.key); 2869 out: 2870 if (to) { 2871 hrtimer_cancel(&to->timer); 2872 destroy_hrtimer_on_stack(&to->timer); 2873 } 2874 return ret != -EINTR ? ret : -ERESTARTNOINTR; 2875 2876 uaddr_faulted: 2877 queue_unlock(hb); 2878 2879 ret = fault_in_user_writeable(uaddr); 2880 if (ret) 2881 goto out_put_key; 2882 2883 if (!(flags & FLAGS_SHARED)) 2884 goto retry_private; 2885 2886 put_futex_key(&q.key); 2887 goto retry; 2888 } 2889 2890 /* 2891 * Userspace attempted a TID -> 0 atomic transition, and failed. 2892 * This is the in-kernel slowpath: we look up the PI state (if any), 2893 * and do the rt-mutex unlock. 2894 */ 2895 static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) 2896 { 2897 u32 uninitialized_var(curval), uval, vpid = task_pid_vnr(current); 2898 union futex_key key = FUTEX_KEY_INIT; 2899 struct futex_hash_bucket *hb; 2900 struct futex_q *top_waiter; 2901 int ret; 2902 2903 if (!IS_ENABLED(CONFIG_FUTEX_PI)) 2904 return -ENOSYS; 2905 2906 retry: 2907 if (get_user(uval, uaddr)) 2908 return -EFAULT; 2909 /* 2910 * We release only a lock we actually own: 2911 */ 2912 if ((uval & FUTEX_TID_MASK) != vpid) 2913 return -EPERM; 2914 2915 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_WRITE); 2916 if (ret) 2917 return ret; 2918 2919 hb = hash_futex(&key); 2920 spin_lock(&hb->lock); 2921 2922 /* 2923 * Check waiters first. We do not trust user space values at 2924 * all and we at least want to know if user space fiddled 2925 * with the futex value instead of blindly unlocking. 2926 */ 2927 top_waiter = futex_top_waiter(hb, &key); 2928 if (top_waiter) { 2929 struct futex_pi_state *pi_state = top_waiter->pi_state; 2930 2931 ret = -EINVAL; 2932 if (!pi_state) 2933 goto out_unlock; 2934 2935 /* 2936 * If current does not own the pi_state then the futex is 2937 * inconsistent and user space fiddled with the futex value. 2938 */ 2939 if (pi_state->owner != current) 2940 goto out_unlock; 2941 2942 get_pi_state(pi_state); 2943 /* 2944 * By taking wait_lock while still holding hb->lock, we ensure 2945 * there is no point where we hold neither; and therefore 2946 * wake_futex_pi() must observe a state consistent with what we 2947 * observed. 2948 */ 2949 raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); 2950 spin_unlock(&hb->lock); 2951 2952 /* drops pi_state->pi_mutex.wait_lock */ 2953 ret = wake_futex_pi(uaddr, uval, pi_state); 2954 2955 put_pi_state(pi_state); 2956 2957 /* 2958 * Success, we're done! No tricky corner cases.
2959 */ 2960 if (!ret) 2961 goto out_putkey; 2962 /* 2963 * The atomic access to the futex value generated a 2964 * pagefault, so retry the user-access and the wakeup: 2965 */ 2966 if (ret == -EFAULT) 2967 goto pi_faulted; 2968 /* 2969 * An unconditional UNLOCK_PI op raced against a waiter 2970 * setting the FUTEX_WAITERS bit. Try again. 2971 */ 2972 if (ret == -EAGAIN) { 2973 put_futex_key(&key); 2974 goto retry; 2975 } 2976 /* 2977 * wake_futex_pi has detected invalid state. Tell user 2978 * space. 2979 */ 2980 goto out_putkey; 2981 } 2982 2983 /* 2984 * We have no kernel internal state, i.e. no waiters in the 2985 * kernel. Waiters which are about to queue themselves are stuck 2986 * on hb->lock. So we can safely ignore them. We preserve neither 2987 * the WAITERS bit nor the OWNER_DIED one. We are the 2988 * owner. 2989 */ 2990 if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0)) { 2991 spin_unlock(&hb->lock); 2992 goto pi_faulted; 2993 } 2994 2995 /* 2996 * If uval has changed, let user space handle it. 2997 */ 2998 ret = (curval == uval) ? 0 : -EAGAIN; 2999 3000 out_unlock: 3001 spin_unlock(&hb->lock); 3002 out_putkey: 3003 put_futex_key(&key); 3004 return ret; 3005 3006 pi_faulted: 3007 put_futex_key(&key); 3008 3009 ret = fault_in_user_writeable(uaddr); 3010 if (!ret) 3011 goto retry; 3012 3013 return ret; 3014 } 3015 3016 /** 3017 * handle_early_requeue_pi_wakeup() - Detect early wakeup on the initial futex 3018 * @hb: the hash_bucket the futex_q was originally enqueued on 3019 * @q: the futex_q woken while waiting to be requeued 3020 * @key2: the futex_key of the requeue target futex 3021 * @timeout: the timeout associated with the wait (NULL if none) 3022 * 3023 * Detect if the task was woken on the initial futex as opposed to the requeue 3024 * target futex. If so, determine if it was a timeout or a signal that caused 3025 * the wakeup and return the appropriate error code to the caller. Must be 3026 * called with the hb lock held. 3027 * 3028 * Return: 3029 * - 0 = no early wakeup detected; 3030 * - <0 = -ETIMEDOUT, -ERESTARTNOINTR or -EWOULDBLOCK (spurious wakeup) 3031 */ 3032 static inline 3033 int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, 3034 struct futex_q *q, union futex_key *key2, 3035 struct hrtimer_sleeper *timeout) 3036 { 3037 int ret = 0; 3038 3039 /* 3040 * With the hb lock held, we avoid races while we process the wakeup. 3041 * We only need to hold hb (and not hb2) to ensure atomicity as the 3042 * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb. 3043 * It can't be requeued from uaddr2 to something else since we don't 3044 * support a PI aware source futex for requeue. 3045 */ 3046 if (!match_futex(&q->key, key2)) { 3047 WARN_ON(q->lock_ptr && (&hb->lock != q->lock_ptr)); 3048 /* 3049 * We were woken prior to requeue by a timeout or a signal. 3050 * Unqueue the futex_q and determine which it was. 3051 */ 3052 plist_del(&q->list, &hb->chain); 3053 hb_waiters_dec(hb); 3054 3055 /* Handle spurious wakeups gracefully */ 3056 ret = -EWOULDBLOCK; 3057 if (timeout && !timeout->task) 3058 ret = -ETIMEDOUT; 3059 else if (signal_pending(current)) 3060 ret = -ERESTARTNOINTR; 3061 } 3062 return ret; 3063 } 3064 3065 /** 3066 * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2 3067 * @uaddr: the futex we initially wait on (non-pi) 3068 * @flags: futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be 3069 * the same type, no requeueing from private to shared, etc.
3070 * @val: the expected value of uaddr 3071 * @abs_time: absolute timeout 3072 * @bitset: 32 bit wakeup bitset set by userspace, defaults to all 3073 * @uaddr2: the pi futex we will take prior to returning to user-space 3074 * 3075 * The caller will wait on uaddr and will be requeued by futex_requeue() to 3076 * uaddr2 which must be PI aware and unique from uaddr. Normal wakeup will wake 3077 * on uaddr2 and complete the acquisition of the rt_mutex prior to returning to 3078 * userspace. This ensures the rt_mutex maintains an owner when it has waiters; 3079 * without one, the pi logic would not know which task to boost/deboost, if 3080 * there was a need to. 3081 * 3082 * We call schedule in futex_wait_queue_me() when we enqueue and return there 3083 * via the following-- 3084 * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue() 3085 * 2) wakeup on uaddr2 after a requeue 3086 * 3) signal 3087 * 4) timeout 3088 * 3089 * If 3, cleanup and return -ERESTARTNOINTR. 3090 * 3091 * If 2, we may then block on trying to take the rt_mutex and return via: 3092 * 5) successful lock 3093 * 6) signal 3094 * 7) timeout 3095 * 8) other lock acquisition failure 3096 * 3097 * If 6, return -EWOULDBLOCK (restarting the syscall would do the same). 3098 * 3099 * If 4 or 7, we cleanup and return with -ETIMEDOUT. 3100 * 3101 * Return: 3102 * - 0 - On success; 3103 * - <0 - On error 3104 */ 3105 static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, 3106 u32 val, ktime_t *abs_time, u32 bitset, 3107 u32 __user *uaddr2) 3108 { 3109 struct hrtimer_sleeper timeout, *to = NULL; 3110 struct futex_pi_state *pi_state = NULL; 3111 struct rt_mutex_waiter rt_waiter; 3112 struct futex_hash_bucket *hb; 3113 union futex_key key2 = FUTEX_KEY_INIT; 3114 struct futex_q q = futex_q_init; 3115 int res, ret; 3116 3117 if (!IS_ENABLED(CONFIG_FUTEX_PI)) 3118 return -ENOSYS; 3119 3120 if (uaddr == uaddr2) 3121 return -EINVAL; 3122 3123 if (!bitset) 3124 return -EINVAL; 3125 3126 if (abs_time) { 3127 to = &timeout; 3128 hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ? 3129 CLOCK_REALTIME : CLOCK_MONOTONIC, 3130 HRTIMER_MODE_ABS); 3131 hrtimer_init_sleeper(to, current); 3132 hrtimer_set_expires_range_ns(&to->timer, *abs_time, 3133 current->timer_slack_ns); 3134 } 3135 3136 /* 3137 * The waiter is allocated on our stack, manipulated by the requeue 3138 * code while we sleep on uaddr. 3139 */ 3140 rt_mutex_init_waiter(&rt_waiter); 3141 3142 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE); 3143 if (unlikely(ret != 0)) 3144 goto out; 3145 3146 q.bitset = bitset; 3147 q.rt_waiter = &rt_waiter; 3148 q.requeue_pi_key = &key2; 3149 3150 /* 3151 * Prepare to wait on uaddr. On success, increments q.key (key1) ref 3152 * count. 3153 */ 3154 ret = futex_wait_setup(uaddr, val, flags, &q, &hb); 3155 if (ret) 3156 goto out_key2; 3157 3158 /* 3159 * The check above which compares uaddrs is not sufficient for 3160 * shared futexes. We need to compare the keys: 3161 */ 3162 if (match_futex(&q.key, &key2)) { 3163 queue_unlock(hb); 3164 ret = -EINVAL; 3165 goto out_put_keys; 3166 } 3167 3168 /* Queue the futex_q, drop the hb lock, wait for wakeup. 
*/ 3169 futex_wait_queue_me(hb, &q, to); 3170 3171 spin_lock(&hb->lock); 3172 ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to); 3173 spin_unlock(&hb->lock); 3174 if (ret) 3175 goto out_put_keys; 3176 3177 /* 3178 * In order for us to be here, we know our q.key == key2, and since 3179 * we took the hb->lock above, we also know that futex_requeue() has 3180 * completed and we no longer have to concern ourselves with a wakeup 3181 * race with the atomic proxy lock acquisition by the requeue code. The 3182 * futex_requeue dropped our key1 reference and incremented our key2 3183 * reference count. 3184 */ 3185 3186 /* Check if the requeue code acquired the second futex for us. */ 3187 if (!q.rt_waiter) { 3188 /* 3189 * Got the lock. We might not be the anticipated owner if we 3190 * did a lock-steal - fix up the PI-state in that case. 3191 */ 3192 if (q.pi_state && (q.pi_state->owner != current)) { 3193 spin_lock(q.lock_ptr); 3194 ret = fixup_pi_state_owner(uaddr2, &q, current); 3195 if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) { 3196 pi_state = q.pi_state; 3197 get_pi_state(pi_state); 3198 } 3199 /* 3200 * Drop the reference to the pi state which 3201 * the requeue_pi() code acquired for us. 3202 */ 3203 put_pi_state(q.pi_state); 3204 spin_unlock(q.lock_ptr); 3205 } 3206 } else { 3207 struct rt_mutex *pi_mutex; 3208 3209 /* 3210 * We have been woken up by futex_unlock_pi(), a timeout, or a 3211 * signal. futex_unlock_pi() will not destroy the lock_ptr nor 3212 * the pi_state. 3213 */ 3214 WARN_ON(!q.pi_state); 3215 pi_mutex = &q.pi_state->pi_mutex; 3216 ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter); 3217 3218 spin_lock(q.lock_ptr); 3219 if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter)) 3220 ret = 0; 3221 3222 debug_rt_mutex_free_waiter(&rt_waiter); 3223 /* 3224 * Fixup the pi_state owner and possibly acquire the lock if we 3225 * haven't already. 3226 */ 3227 res = fixup_owner(uaddr2, &q, !ret); 3228 /* 3229 * If fixup_owner() returned an error, propagate that. If it 3230 * acquired the lock, clear -ETIMEDOUT or -EINTR. 3231 */ 3232 if (res) 3233 ret = (res < 0) ? res : 0; 3234 3235 /* 3236 * If fixup_pi_state_owner() faulted and was unable to handle 3237 * the fault, unlock the rt_mutex and return the fault to 3238 * userspace. 3239 */ 3240 if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) { 3241 pi_state = q.pi_state; 3242 get_pi_state(pi_state); 3243 } 3244 3245 /* Unqueue and drop the lock. */ 3246 unqueue_me_pi(&q); 3247 } 3248 3249 if (pi_state) { 3250 rt_mutex_futex_unlock(&pi_state->pi_mutex); 3251 put_pi_state(pi_state); 3252 } 3253 3254 if (ret == -EINTR) { 3255 /* 3256 * We've already been requeued, but cannot restart by calling 3257 * futex_lock_pi() directly. We could restart this syscall, but 3258 * it would detect that the user space "val" changed and return 3259 * -EWOULDBLOCK. Save the overhead of the restart and return 3260 * -EWOULDBLOCK directly. 3261 */ 3262 ret = -EWOULDBLOCK; 3263 } 3264 3265 out_put_keys: 3266 put_futex_key(&q.key); 3267 out_key2: 3268 put_futex_key(&key2); 3269 3270 out: 3271 if (to) { 3272 hrtimer_cancel(&to->timer); 3273 destroy_hrtimer_on_stack(&to->timer); 3274 } 3275 return ret; 3276 } 3277 3278 /* 3279 * Support for robust futexes: the kernel cleans up held futexes at 3280 * thread exit time. 3281 * 3282 * Implementation: user-space maintains a per-thread list of locks it 3283 * is holding.
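 * The user space side of this contract, sketched here with assumed
 * names (the real layout lives in the C library, not in this file),
 * boils down to registering a robust_list_head once per thread and
 * linking each held lock's robust_list node into it:
 *
 *	struct my_mutex {
 *		struct robust_list	list;		(first member)
 *		unsigned int		futex_word;
 *	};
 *
 *	static __thread struct robust_list_head head;
 *
 *	head.list.next	     = &head.list;	(empty list points at itself)
 *	head.futex_offset    = offsetof(struct my_mutex, futex_word);
 *	head.list_op_pending = NULL;
 *	syscall(SYS_set_robust_list, &head, sizeof(head));
 *
 * Each lock acquisition then links the mutex's node into head.list and
 * each release unlinks it.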
Upon do_exit(), the kernel carefully walks this list, 3284 * and marks all locks that are owned by this thread with the 3285 * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is 3286 * always manipulated with the lock held, so the list is private and 3287 * per-thread. Userspace also maintains a per-thread 'list_op_pending' 3288 * field, to allow the kernel to clean up if the thread dies after 3289 * acquiring the lock, but just before it could have added itself to 3290 * the list. There can only be one such pending lock. 3291 */ 3292 3293 /** 3294 * sys_set_robust_list() - Set the robust-futex list head of a task 3295 * @head: pointer to the list-head 3296 * @len: length of the list-head, as userspace expects 3297 */ 3298 SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head, 3299 size_t, len) 3300 { 3301 if (!futex_cmpxchg_enabled) 3302 return -ENOSYS; 3303 /* 3304 * The kernel knows only one size for now: 3305 */ 3306 if (unlikely(len != sizeof(*head))) 3307 return -EINVAL; 3308 3309 current->robust_list = head; 3310 3311 return 0; 3312 } 3313 3314 /** 3315 * sys_get_robust_list() - Get the robust-futex list head of a task 3316 * @pid: pid of the process [zero for current task] 3317 * @head_ptr: pointer to a list-head pointer, the kernel fills it in 3318 * @len_ptr: pointer to a length field, the kernel fills in the header size 3319 */ 3320 SYSCALL_DEFINE3(get_robust_list, int, pid, 3321 struct robust_list_head __user * __user *, head_ptr, 3322 size_t __user *, len_ptr) 3323 { 3324 struct robust_list_head __user *head; 3325 unsigned long ret; 3326 struct task_struct *p; 3327 3328 if (!futex_cmpxchg_enabled) 3329 return -ENOSYS; 3330 3331 rcu_read_lock(); 3332 3333 ret = -ESRCH; 3334 if (!pid) 3335 p = current; 3336 else { 3337 p = find_task_by_vpid(pid); 3338 if (!p) 3339 goto err_unlock; 3340 } 3341 3342 ret = -EPERM; 3343 if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS)) 3344 goto err_unlock; 3345 3346 head = p->robust_list; 3347 rcu_read_unlock(); 3348 3349 if (put_user(sizeof(*head), len_ptr)) 3350 return -EFAULT; 3351 return put_user(head, head_ptr); 3352 3353 err_unlock: 3354 rcu_read_unlock(); 3355 3356 return ret; 3357 } 3358 3359 /* 3360 * Process a futex-list entry, check whether it's owned by the 3361 * dying task, and do notification if so: 3362 */ 3363 int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi) 3364 { 3365 u32 uval, uninitialized_var(nval), mval; 3366 3367 retry: 3368 if (get_user(uval, uaddr)) 3369 return -1; 3370 3371 if ((uval & FUTEX_TID_MASK) == task_pid_vnr(curr)) { 3372 /* 3373 * Ok, this dying thread is truly holding a futex 3374 * of interest. Set the OWNER_DIED bit atomically 3375 * via cmpxchg, and if the value had FUTEX_WAITERS 3376 * set, wake up a waiter (if any). (We have to do a 3377 * futex_wake() even if OWNER_DIED is already set - 3378 * to handle the rare but possible case of recursive 3379 * thread-death.) The rest of the cleanup is done in 3380 * userspace. 3381 */ 3382 mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; 3383 /* 3384 * We are not holding a lock here, but we want to have 3385 * the pagefault_disable/enable() protection because 3386 * we want to handle the fault gracefully. If the 3387 * access fails we try to fault in the futex with R/W 3388 * verification via get_user_pages. get_user() above 3389 * does not guarantee R/W access. If that fails we 3390 * give up and leave the futex locked. 
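 *
 * Concretely (example values only): if the dead owner's TID was 1111
 * and a waiter had already set FUTEX_WAITERS, uval is 0x80000457 and
 * mval becomes 0xc0000000, i.e. FUTEX_WAITERS | FUTEX_OWNER_DIED with
 * the TID cleared, which is how the woken waiter learns that it
 * inherited the lock from a dead owner.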
3391 */ 3392 if (cmpxchg_futex_value_locked(&nval, uaddr, uval, mval)) { 3393 if (fault_in_user_writeable(uaddr)) 3394 return -1; 3395 goto retry; 3396 } 3397 if (nval != uval) 3398 goto retry; 3399 3400 /* 3401 * Wake robust non-PI futexes here. The wakeup of 3402 * PI futexes happens in exit_pi_state(): 3403 */ 3404 if (!pi && (uval & FUTEX_WAITERS)) 3405 futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY); 3406 } 3407 return 0; 3408 } 3409 3410 /* 3411 * Fetch a robust-list pointer. Bit 0 signals PI futexes: 3412 */ 3413 static inline int fetch_robust_entry(struct robust_list __user **entry, 3414 struct robust_list __user * __user *head, 3415 unsigned int *pi) 3416 { 3417 unsigned long uentry; 3418 3419 if (get_user(uentry, (unsigned long __user *)head)) 3420 return -EFAULT; 3421 3422 *entry = (void __user *)(uentry & ~1UL); 3423 *pi = uentry & 1; 3424 3425 return 0; 3426 } 3427 3428 /* 3429 * Walk curr->robust_list (very carefully, it's a userspace list!) 3430 * and mark any locks found there dead, and notify any waiters. 3431 * 3432 * We silently return on any sign of list-walking problem. 3433 */ 3434 void exit_robust_list(struct task_struct *curr) 3435 { 3436 struct robust_list_head __user *head = curr->robust_list; 3437 struct robust_list __user *entry, *next_entry, *pending; 3438 unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; 3439 unsigned int uninitialized_var(next_pi); 3440 unsigned long futex_offset; 3441 int rc; 3442 3443 if (!futex_cmpxchg_enabled) 3444 return; 3445 3446 /* 3447 * Fetch the list head (which was registered earlier, via 3448 * sys_set_robust_list()): 3449 */ 3450 if (fetch_robust_entry(&entry, &head->list.next, &pi)) 3451 return; 3452 /* 3453 * Fetch the relative futex offset: 3454 */ 3455 if (get_user(futex_offset, &head->futex_offset)) 3456 return; 3457 /* 3458 * Fetch any possibly pending lock-add first, and handle it 3459 * if it exists: 3460 */ 3461 if (fetch_robust_entry(&pending, &head->list_op_pending, &pip)) 3462 return; 3463 3464 next_entry = NULL; /* avoid warning with gcc */ 3465 while (entry != &head->list) { 3466 /* 3467 * Fetch the next entry in the list before calling 3468 * handle_futex_death: 3469 */ 3470 rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi); 3471 /* 3472 * A pending lock might already be on the list, so 3473 * don't process it twice: 3474 */ 3475 if (entry != pending) 3476 if (handle_futex_death((void __user *)entry + futex_offset, 3477 curr, pi)) 3478 return; 3479 if (rc) 3480 return; 3481 entry = next_entry; 3482 pi = next_pi; 3483 /* 3484 * Avoid excessively long or circular lists: 3485 */ 3486 if (!--limit) 3487 break; 3488 3489 cond_resched(); 3490 } 3491 3492 if (pending) 3493 handle_futex_death((void __user *)pending + futex_offset, 3494 curr, pip); 3495 } 3496 3497 long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, 3498 u32 __user *uaddr2, u32 val2, u32 val3) 3499 { 3500 int cmd = op & FUTEX_CMD_MASK; 3501 unsigned int flags = 0; 3502 3503 if (!(op & FUTEX_PRIVATE_FLAG)) 3504 flags |= FLAGS_SHARED; 3505 3506 if (op & FUTEX_CLOCK_REALTIME) { 3507 flags |= FLAGS_CLOCKRT; 3508 if (cmd != FUTEX_WAIT && cmd != FUTEX_WAIT_BITSET && \ 3509 cmd != FUTEX_WAIT_REQUEUE_PI) 3510 return -ENOSYS; 3511 } 3512 3513 switch (cmd) { 3514 case FUTEX_LOCK_PI: 3515 case FUTEX_UNLOCK_PI: 3516 case FUTEX_TRYLOCK_PI: 3517 case FUTEX_WAIT_REQUEUE_PI: 3518 case FUTEX_CMP_REQUEUE_PI: 3519 if (!futex_cmpxchg_enabled) 3520 return -ENOSYS; 3521 } 3522 3523 switch (cmd) { 3524 case FUTEX_WAIT: 3525 val3 = 
FUTEX_BITSET_MATCH_ANY; 3526 case FUTEX_WAIT_BITSET: 3527 return futex_wait(uaddr, flags, val, timeout, val3); 3528 case FUTEX_WAKE: 3529 val3 = FUTEX_BITSET_MATCH_ANY; 3530 case FUTEX_WAKE_BITSET: 3531 return futex_wake(uaddr, flags, val, val3); 3532 case FUTEX_REQUEUE: 3533 return futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0); 3534 case FUTEX_CMP_REQUEUE: 3535 return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0); 3536 case FUTEX_WAKE_OP: 3537 return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3); 3538 case FUTEX_LOCK_PI: 3539 return futex_lock_pi(uaddr, flags, timeout, 0); 3540 case FUTEX_UNLOCK_PI: 3541 return futex_unlock_pi(uaddr, flags); 3542 case FUTEX_TRYLOCK_PI: 3543 return futex_lock_pi(uaddr, flags, NULL, 1); 3544 case FUTEX_WAIT_REQUEUE_PI: 3545 val3 = FUTEX_BITSET_MATCH_ANY; 3546 return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3, 3547 uaddr2); 3548 case FUTEX_CMP_REQUEUE_PI: 3549 return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1); 3550 } 3551 return -ENOSYS; 3552 } 3553 3554 3555 SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, 3556 struct timespec __user *, utime, u32 __user *, uaddr2, 3557 u32, val3) 3558 { 3559 struct timespec ts; 3560 ktime_t t, *tp = NULL; 3561 u32 val2 = 0; 3562 int cmd = op & FUTEX_CMD_MASK; 3563 3564 if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || 3565 cmd == FUTEX_WAIT_BITSET || 3566 cmd == FUTEX_WAIT_REQUEUE_PI)) { 3567 if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG)))) 3568 return -EFAULT; 3569 if (copy_from_user(&ts, utime, sizeof(ts)) != 0) 3570 return -EFAULT; 3571 if (!timespec_valid(&ts)) 3572 return -EINVAL; 3573 3574 t = timespec_to_ktime(ts); 3575 if (cmd == FUTEX_WAIT) 3576 t = ktime_add_safe(ktime_get(), t); 3577 tp = &t; 3578 } 3579 /* 3580 * requeue parameter in 'utime' if cmd == FUTEX_*_REQUEUE_*. 3581 * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP. 3582 */ 3583 if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE || 3584 cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP) 3585 val2 = (u32) (unsigned long) utime; 3586 3587 return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); 3588 } 3589 3590 static void __init futex_detect_cmpxchg(void) 3591 { 3592 #ifndef CONFIG_HAVE_FUTEX_CMPXCHG 3593 u32 curval; 3594 3595 /* 3596 * This will fail and we want it. Some arch implementations do 3597 * runtime detection of the futex_atomic_cmpxchg_inatomic() 3598 * functionality. We want to know that before we call in any 3599 * of the complex code paths. Also we want to prevent 3600 * registration of robust lists in that case. NULL is 3601 * guaranteed to fault and we get -EFAULT on functional 3602 * implementation, the non-functional ones will return 3603 * -ENOSYS. 3604 */ 3605 if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT) 3606 futex_cmpxchg_enabled = 1; 3607 #endif 3608 } 3609 3610 static int __init futex_init(void) 3611 { 3612 unsigned int futex_shift; 3613 unsigned long i; 3614 3615 #if CONFIG_BASE_SMALL 3616 futex_hashsize = 16; 3617 #else 3618 futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus()); 3619 #endif 3620 3621 futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues), 3622 futex_hashsize, 0, 3623 futex_hashsize < 256 ? 
HASH_SMALL : 0, 3624 &futex_shift, NULL, 3625 futex_hashsize, futex_hashsize); 3626 futex_hashsize = 1UL << futex_shift; 3627 3628 futex_detect_cmpxchg(); 3629 3630 for (i = 0; i < futex_hashsize; i++) { 3631 atomic_set(&futex_queues[i].waiters, 0); 3632 plist_head_init(&futex_queues[i].chain); 3633 spin_lock_init(&futex_queues[i].lock); 3634 } 3635 3636 return 0; 3637 } 3638 core_initcall(futex_init); 3639