1 /* 2 * Fast Userspace Mutexes (which I call "Futexes!"). 3 * (C) Rusty Russell, IBM 2002 4 * 5 * Generalized futexes, futex requeueing, misc fixes by Ingo Molnar 6 * (C) Copyright 2003 Red Hat Inc, All Rights Reserved 7 * 8 * Removed page pinning, fix privately mapped COW pages and other cleanups 9 * (C) Copyright 2003, 2004 Jamie Lokier 10 * 11 * Robust futex support started by Ingo Molnar 12 * (C) Copyright 2006 Red Hat Inc, All Rights Reserved 13 * Thanks to Thomas Gleixner for suggestions, analysis and fixes. 14 * 15 * PI-futex support started by Ingo Molnar and Thomas Gleixner 16 * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> 17 * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> 18 * 19 * PRIVATE futexes by Eric Dumazet 20 * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com> 21 * 22 * Requeue-PI support by Darren Hart <dvhltc@us.ibm.com> 23 * Copyright (C) IBM Corporation, 2009 24 * Thanks to Thomas Gleixner for conceptual design and careful reviews. 25 * 26 * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly 27 * enough at me, Linus for the original (flawed) idea, Matthew 28 * Kirkwood for proof-of-concept implementation. 29 * 30 * "The futexes are also cursed." 31 * "But they come in a choice of three flavours!" 32 * 33 * This program is free software; you can redistribute it and/or modify 34 * it under the terms of the GNU General Public License as published by 35 * the Free Software Foundation; either version 2 of the License, or 36 * (at your option) any later version. 37 * 38 * This program is distributed in the hope that it will be useful, 39 * but WITHOUT ANY WARRANTY; without even the implied warranty of 40 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 41 * GNU General Public License for more details. 42 * 43 * You should have received a copy of the GNU General Public License 44 * along with this program; if not, write to the Free Software 45 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 46 */ 47 #include <linux/slab.h> 48 #include <linux/poll.h> 49 #include <linux/fs.h> 50 #include <linux/file.h> 51 #include <linux/jhash.h> 52 #include <linux/init.h> 53 #include <linux/futex.h> 54 #include <linux/mount.h> 55 #include <linux/pagemap.h> 56 #include <linux/syscalls.h> 57 #include <linux/signal.h> 58 #include <linux/export.h> 59 #include <linux/magic.h> 60 #include <linux/pid.h> 61 #include <linux/nsproxy.h> 62 #include <linux/ptrace.h> 63 #include <linux/sched/rt.h> 64 #include <linux/sched/wake_q.h> 65 #include <linux/sched/mm.h> 66 #include <linux/hugetlb.h> 67 #include <linux/freezer.h> 68 #include <linux/bootmem.h> 69 #include <linux/fault-inject.h> 70 71 #include <asm/futex.h> 72 73 #include "locking/rtmutex_common.h" 74 75 /* 76 * READ this before attempting to hack on futexes! 77 * 78 * Basic futex operation and ordering guarantees 79 * ============================================= 80 * 81 * The waiter reads the futex value in user space and calls 82 * futex_wait(). This function computes the hash bucket and acquires 83 * the hash bucket lock. After that it reads the futex user space value 84 * again and verifies that the data has not changed. If it has not changed 85 * it enqueues itself into the hash bucket, releases the hash bucket lock 86 * and schedules. 87 * 88 * The waker side modifies the user space value of the futex and calls 89 * futex_wake(). This function computes the hash bucket and acquires the 90 * hash bucket lock. 
Then it looks for waiters on that futex in the hash 91 * bucket and wakes them. 92 * 93 * In futex wake up scenarios where no tasks are blocked on a futex, taking 94 * the hb spinlock can be avoided and simply return. In order for this 95 * optimization to work, ordering guarantees must exist so that the waiter 96 * being added to the list is acknowledged when the list is concurrently being 97 * checked by the waker, avoiding scenarios like the following: 98 * 99 * CPU 0 CPU 1 100 * val = *futex; 101 * sys_futex(WAIT, futex, val); 102 * futex_wait(futex, val); 103 * uval = *futex; 104 * *futex = newval; 105 * sys_futex(WAKE, futex); 106 * futex_wake(futex); 107 * if (queue_empty()) 108 * return; 109 * if (uval == val) 110 * lock(hash_bucket(futex)); 111 * queue(); 112 * unlock(hash_bucket(futex)); 113 * schedule(); 114 * 115 * This would cause the waiter on CPU 0 to wait forever because it 116 * missed the transition of the user space value from val to newval 117 * and the waker did not find the waiter in the hash bucket queue. 118 * 119 * The correct serialization ensures that a waiter either observes 120 * the changed user space value before blocking or is woken by a 121 * concurrent waker: 122 * 123 * CPU 0 CPU 1 124 * val = *futex; 125 * sys_futex(WAIT, futex, val); 126 * futex_wait(futex, val); 127 * 128 * waiters++; (a) 129 * smp_mb(); (A) <-- paired with -. 130 * | 131 * lock(hash_bucket(futex)); | 132 * | 133 * uval = *futex; | 134 * | *futex = newval; 135 * | sys_futex(WAKE, futex); 136 * | futex_wake(futex); 137 * | 138 * `--------> smp_mb(); (B) 139 * if (uval == val) 140 * queue(); 141 * unlock(hash_bucket(futex)); 142 * schedule(); if (waiters) 143 * lock(hash_bucket(futex)); 144 * else wake_waiters(futex); 145 * waiters--; (b) unlock(hash_bucket(futex)); 146 * 147 * Where (A) orders the waiters increment and the futex value read through 148 * atomic operations (see hb_waiters_inc) and where (B) orders the write 149 * to futex and the waiters read -- this is done by the barriers for both 150 * shared and private futexes in get_futex_key_refs(). 151 * 152 * This yields the following case (where X:=waiters, Y:=futex): 153 * 154 * X = Y = 0 155 * 156 * w[X]=1 w[Y]=1 157 * MB MB 158 * r[Y]=y r[X]=x 159 * 160 * Which guarantees that x==0 && y==0 is impossible; which translates back into 161 * the guarantee that we cannot both miss the futex variable change and the 162 * enqueue. 163 * 164 * Note that a new waiter is accounted for in (a) even when it is possible that 165 * the wait call can return error, in which case we backtrack from it in (b). 166 * Refer to the comment in queue_lock(). 167 * 168 * Similarly, in order to account for waiters being requeued on another 169 * address we always increment the waiters for the destination bucket before 170 * acquiring the lock. It then decrements them again after releasing it - 171 * the code that actually moves the futex(es) between hash buckets (requeue_futex) 172 * will do the additional required waiter count housekeeping. This is done for 173 * double_lock_hb() and double_unlock_hb(), respectively. 174 */ 175 176 #ifndef CONFIG_HAVE_FUTEX_CMPXCHG 177 int __read_mostly futex_cmpxchg_enabled; 178 #endif 179 180 /* 181 * Futex flags used to encode options to functions and preserve them across 182 * restarts. 183 */ 184 #ifdef CONFIG_MMU 185 # define FLAGS_SHARED 0x01 186 #else 187 /* 188 * NOMMU does not have per process address space. Let the compiler optimize 189 * code away. 
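 * For example (illustrative only), with FLAGS_SHARED == 0x00 a test such as
 *
 *      if (flags & FLAGS_SHARED)
 *              ... shared futex handling ...
 *
 * is known to be false at compile time, so the shared-key paths can be
 * discarded entirely.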
190 */ 191 # define FLAGS_SHARED 0x00 192 #endif 193 #define FLAGS_CLOCKRT 0x02 194 #define FLAGS_HAS_TIMEOUT 0x04 195 196 /* 197 * Priority Inheritance state: 198 */ 199 struct futex_pi_state { 200 /* 201 * list of 'owned' pi_state instances - these have to be 202 * cleaned up in do_exit() if the task exits prematurely: 203 */ 204 struct list_head list; 205 206 /* 207 * The PI object: 208 */ 209 struct rt_mutex pi_mutex; 210 211 struct task_struct *owner; 212 atomic_t refcount; 213 214 union futex_key key; 215 }; 216 217 /** 218 * struct futex_q - The hashed futex queue entry, one per waiting task 219 * @list: priority-sorted list of tasks waiting on this futex 220 * @task: the task waiting on the futex 221 * @lock_ptr: the hash bucket lock 222 * @key: the key the futex is hashed on 223 * @pi_state: optional priority inheritance state 224 * @rt_waiter: rt_waiter storage for use with requeue_pi 225 * @requeue_pi_key: the requeue_pi target futex key 226 * @bitset: bitset for the optional bitmasked wakeup 227 * 228 * We use this hashed waitqueue, instead of a normal wait_queue_t, so 229 * we can wake only the relevant ones (hashed queues may be shared). 230 * 231 * A futex_q has a woken state, just like tasks have TASK_RUNNING. 232 * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0. 233 * The order of wakeup is always to make the first condition true, then 234 * the second. 235 * 236 * PI futexes are typically woken before they are removed from the hash list via 237 * the rt_mutex code. See unqueue_me_pi(). 238 */ 239 struct futex_q { 240 struct plist_node list; 241 242 struct task_struct *task; 243 spinlock_t *lock_ptr; 244 union futex_key key; 245 struct futex_pi_state *pi_state; 246 struct rt_mutex_waiter *rt_waiter; 247 union futex_key *requeue_pi_key; 248 u32 bitset; 249 }; 250 251 static const struct futex_q futex_q_init = { 252 /* list gets initialized in queue_me()*/ 253 .key = FUTEX_KEY_INIT, 254 .bitset = FUTEX_BITSET_MATCH_ANY 255 }; 256 257 /* 258 * Hash buckets are shared by all the futex_keys that hash to the same 259 * location. Each key may have multiple futex_q structures, one for each task 260 * waiting on a futex. 261 */ 262 struct futex_hash_bucket { 263 atomic_t waiters; 264 spinlock_t lock; 265 struct plist_head chain; 266 } ____cacheline_aligned_in_smp; 267 268 /* 269 * The base of the bucket array and its size are always used together 270 * (after initialization only in hash_futex()), so ensure that they 271 * reside in the same cacheline. 272 */ 273 static struct { 274 struct futex_hash_bucket *queues; 275 unsigned long hashsize; 276 } __futex_data __read_mostly __aligned(2*sizeof(long)); 277 #define futex_queues (__futex_data.queues) 278 #define futex_hashsize (__futex_data.hashsize) 279 280 281 /* 282 * Fault injections for futexes. 
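 * A usage sketch (hedged: the knobs follow the common fault-injection
 * convention, see Documentation/fault-injection/, and assume debugfs is
 * mounted at /sys/kernel/debug):
 *
 *      fail_futex=<interval>,<probability>,<space>,<times>   (boot parameter)
 *
 *      echo 10 > /sys/kernel/debug/fail_futex/probability
 *      echo -1 > /sys/kernel/debug/fail_futex/times
 *      echo 1  > /sys/kernel/debug/fail_futex/ignore-private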
283 */ 284 #ifdef CONFIG_FAIL_FUTEX 285 286 static struct { 287 struct fault_attr attr; 288 289 bool ignore_private; 290 } fail_futex = { 291 .attr = FAULT_ATTR_INITIALIZER, 292 .ignore_private = false, 293 }; 294 295 static int __init setup_fail_futex(char *str) 296 { 297 return setup_fault_attr(&fail_futex.attr, str); 298 } 299 __setup("fail_futex=", setup_fail_futex); 300 301 static bool should_fail_futex(bool fshared) 302 { 303 if (fail_futex.ignore_private && !fshared) 304 return false; 305 306 return should_fail(&fail_futex.attr, 1); 307 } 308 309 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS 310 311 static int __init fail_futex_debugfs(void) 312 { 313 umode_t mode = S_IFREG | S_IRUSR | S_IWUSR; 314 struct dentry *dir; 315 316 dir = fault_create_debugfs_attr("fail_futex", NULL, 317 &fail_futex.attr); 318 if (IS_ERR(dir)) 319 return PTR_ERR(dir); 320 321 if (!debugfs_create_bool("ignore-private", mode, dir, 322 &fail_futex.ignore_private)) { 323 debugfs_remove_recursive(dir); 324 return -ENOMEM; 325 } 326 327 return 0; 328 } 329 330 late_initcall(fail_futex_debugfs); 331 332 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ 333 334 #else 335 static inline bool should_fail_futex(bool fshared) 336 { 337 return false; 338 } 339 #endif /* CONFIG_FAIL_FUTEX */ 340 341 static inline void futex_get_mm(union futex_key *key) 342 { 343 mmgrab(key->private.mm); 344 /* 345 * Ensure futex_get_mm() implies a full barrier such that 346 * get_futex_key() implies a full barrier. This is relied upon 347 * as smp_mb(); (B), see the ordering comment above. 348 */ 349 smp_mb__after_atomic(); 350 } 351 352 /* 353 * Reflects a new waiter being added to the waitqueue. 354 */ 355 static inline void hb_waiters_inc(struct futex_hash_bucket *hb) 356 { 357 #ifdef CONFIG_SMP 358 atomic_inc(&hb->waiters); 359 /* 360 * Full barrier (A), see the ordering comment above. 361 */ 362 smp_mb__after_atomic(); 363 #endif 364 } 365 366 /* 367 * Reflects a waiter being removed from the waitqueue by wakeup 368 * paths. 369 */ 370 static inline void hb_waiters_dec(struct futex_hash_bucket *hb) 371 { 372 #ifdef CONFIG_SMP 373 atomic_dec(&hb->waiters); 374 #endif 375 } 376 377 static inline int hb_waiters_pending(struct futex_hash_bucket *hb) 378 { 379 #ifdef CONFIG_SMP 380 return atomic_read(&hb->waiters); 381 #else 382 return 1; 383 #endif 384 } 385 386 /** 387 * hash_futex - Return the hash bucket in the global hash 388 * @key: Pointer to the futex key for which the hash is calculated 389 * 390 * We hash on the keys returned from get_futex_key (see below) and return the 391 * corresponding hash bucket in the global hash. 392 */ 393 static struct futex_hash_bucket *hash_futex(union futex_key *key) 394 { 395 u32 hash = jhash2((u32*)&key->both.word, 396 (sizeof(key->both.word)+sizeof(key->both.ptr))/4, 397 key->both.offset); 398 return &futex_queues[hash & (futex_hashsize - 1)]; 399 } 400 401 402 /** 403 * match_futex - Check whether two futex keys are equal 404 * @key1: Pointer to key1 405 * @key2: Pointer to key2 406 * 407 * Return 1 if two futex_keys are equal, 0 otherwise. 408 */ 409 static inline int match_futex(union futex_key *key1, union futex_key *key2) 410 { 411 return (key1 && key2 412 && key1->both.word == key2->both.word 413 && key1->both.ptr == key2->both.ptr 414 && key1->both.offset == key2->both.offset); 415 } 416 417 /* 418 * Take a reference to the resource addressed by a key. 419 * Can be called while holding spinlocks. 
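 * As a quick map of what "reference" means per key type (see the switch in
 * the function body below):
 *
 *      FUT_OFF_INODE:    ihold(key->shared.inode)
 *      FUT_OFF_MMSHARED: futex_get_mm(key), i.e. mmgrab(key->private.mm)
 *      private futex:    no object reference at all, only the smp_mb()
 *                        needed for the lockless waiter check (B)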
420 * 421 */ 422 static void get_futex_key_refs(union futex_key *key) 423 { 424 if (!key->both.ptr) 425 return; 426 427 /* 428 * On MMU less systems futexes are always "private" as there is no per 429 * process address space. We need the smp wmb nevertheless - yes, 430 * arch/blackfin has MMU less SMP ... 431 */ 432 if (!IS_ENABLED(CONFIG_MMU)) { 433 smp_mb(); /* explicit smp_mb(); (B) */ 434 return; 435 } 436 437 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { 438 case FUT_OFF_INODE: 439 ihold(key->shared.inode); /* implies smp_mb(); (B) */ 440 break; 441 case FUT_OFF_MMSHARED: 442 futex_get_mm(key); /* implies smp_mb(); (B) */ 443 break; 444 default: 445 /* 446 * Private futexes do not hold reference on an inode or 447 * mm, therefore the only purpose of calling get_futex_key_refs 448 * is because we need the barrier for the lockless waiter check. 449 */ 450 smp_mb(); /* explicit smp_mb(); (B) */ 451 } 452 } 453 454 /* 455 * Drop a reference to the resource addressed by a key. 456 * The hash bucket spinlock must not be held. This is 457 * a no-op for private futexes, see comment in the get 458 * counterpart. 459 */ 460 static void drop_futex_key_refs(union futex_key *key) 461 { 462 if (!key->both.ptr) { 463 /* If we're here then we tried to put a key we failed to get */ 464 WARN_ON_ONCE(1); 465 return; 466 } 467 468 if (!IS_ENABLED(CONFIG_MMU)) 469 return; 470 471 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { 472 case FUT_OFF_INODE: 473 iput(key->shared.inode); 474 break; 475 case FUT_OFF_MMSHARED: 476 mmdrop(key->private.mm); 477 break; 478 } 479 } 480 481 /** 482 * get_futex_key() - Get parameters which are the keys for a futex 483 * @uaddr: virtual address of the futex 484 * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED 485 * @key: address where result is stored. 486 * @rw: mapping needs to be read/write (values: VERIFY_READ, 487 * VERIFY_WRITE) 488 * 489 * Return: a negative error code or 0 490 * 491 * The key words are stored in *key on success. 492 * 493 * For shared mappings, it's (page->index, file_inode(vma->vm_file), 494 * offset_within_page). For private mappings, it's (uaddr, current->mm). 495 * We can usually work out the index without swapping in the page. 496 * 497 * lock_page() might sleep, the caller should not hold a spinlock. 498 */ 499 static int 500 get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) 501 { 502 unsigned long address = (unsigned long)uaddr; 503 struct mm_struct *mm = current->mm; 504 struct page *page, *tail; 505 struct address_space *mapping; 506 int err, ro = 0; 507 508 /* 509 * The futex address must be "naturally" aligned. 510 */ 511 key->both.offset = address % PAGE_SIZE; 512 if (unlikely((address % sizeof(u32)) != 0)) 513 return -EINVAL; 514 address -= key->both.offset; 515 516 if (unlikely(!access_ok(rw, uaddr, sizeof(u32)))) 517 return -EFAULT; 518 519 if (unlikely(should_fail_futex(fshared))) 520 return -EFAULT; 521 522 /* 523 * PROCESS_PRIVATE futexes are fast. 524 * As the mm cannot disappear under us and the 'key' only needs 525 * virtual address, we dont even have to find the underlying vma. 
         * Note: We do have to check that 'uaddr' is a valid user address,
         * but access_ok() should be faster than find_vma().
         */
        if (!fshared) {
                key->private.mm = mm;
                key->private.address = address;
                get_futex_key_refs(key);  /* implies smp_mb(); (B) */
                return 0;
        }

again:
        /* Ignore any VERIFY_READ mapping (futex common case) */
        if (unlikely(should_fail_futex(fshared)))
                return -EFAULT;

        err = get_user_pages_fast(address, 1, 1, &page);
        /*
         * If write access is not required (eg. FUTEX_WAIT), try
         * and get read-only access.
         */
        if (err == -EFAULT && rw == VERIFY_READ) {
                err = get_user_pages_fast(address, 1, 0, &page);
                ro = 1;
        }
        if (err < 0)
                return err;
        else
                err = 0;

        /*
         * The treatment of mapping from this point on is critical. The page
         * lock protects many things but in this context the page lock
         * stabilizes mapping, prevents inode freeing in the shared
         * file-backed region case and guards against movement to swap cache.
         *
         * Strictly speaking the page lock is not needed in all cases being
         * considered here and the page lock forces unnecessary serialization.
         * From this point on, mapping will be re-verified if necessary and
         * the page lock will be acquired only if it is unavoidable.
         *
         * Mapping checks require the head page for any compound page so the
         * head page and mapping are looked up now. For anonymous pages, it
         * does not matter if the page splits in the future as the key is
         * based on the address. For filesystem-backed pages, the tail is
         * required as the index of the page determines the key. For
         * base pages, there is no tail page and tail == page.
         */
        tail = page;
        page = compound_head(page);
        mapping = READ_ONCE(page->mapping);

        /*
         * If page->mapping is NULL, then it cannot be a PageAnon
         * page; but it might be the ZERO_PAGE or in the gate area or
         * in a special mapping (all cases which we are happy to fail);
         * or it may have been a good file page when get_user_pages_fast
         * found it, but truncated or holepunched or subjected to
         * invalidate_complete_page2 before we got the page lock (also
         * cases which we are happy to fail). And we hold a reference,
         * so refcount care in invalidate_complete_page's remove_mapping
         * prevents drop_caches from setting mapping to NULL beneath us.
         *
         * The case we do have to guard against is when memory pressure made
         * shmem_writepage move it from filecache to swapcache beneath us:
         * an unlikely race, but we do need to retry for page->mapping.
         */
        if (unlikely(!mapping)) {
                int shmem_swizzled;

                /*
                 * Page lock is required to identify which special case above
                 * applies. If this is really a shmem page then the page lock
                 * will prevent unexpected transitions.
                 */
                lock_page(page);
                shmem_swizzled = PageSwapCache(page) || page->mapping;
                unlock_page(page);
                put_page(page);

                if (shmem_swizzled)
                        goto again;

                return -EFAULT;
        }

        /*
         * Private mappings are handled in a simple way.
         *
         * If the futex key is stored on an anonymous page, then the associated
         * object is the mm which is implicitly pinned by the calling process.
         *
         * NOTE: When userspace waits on a MAP_SHARED mapping, even if
         * it's a read-only handle, it's expected that futexes attach to
         * the object not the particular process.
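         * As an illustration (not verbatim code): two tasks that mmap() the
         * same file and operate on the same offset end up with the same
         * { inode, pgoff, offset } key and therefore find each other in the
         * hash, while a PROCESS_PRIVATE futex at an identical user address
         * in two different processes yields distinct { mm, address } keys.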
620 */ 621 if (PageAnon(page)) { 622 /* 623 * A RO anonymous page will never change and thus doesn't make 624 * sense for futex operations. 625 */ 626 if (unlikely(should_fail_futex(fshared)) || ro) { 627 err = -EFAULT; 628 goto out; 629 } 630 631 key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */ 632 key->private.mm = mm; 633 key->private.address = address; 634 635 get_futex_key_refs(key); /* implies smp_mb(); (B) */ 636 637 } else { 638 struct inode *inode; 639 640 /* 641 * The associated futex object in this case is the inode and 642 * the page->mapping must be traversed. Ordinarily this should 643 * be stabilised under page lock but it's not strictly 644 * necessary in this case as we just want to pin the inode, not 645 * update the radix tree or anything like that. 646 * 647 * The RCU read lock is taken as the inode is finally freed 648 * under RCU. If the mapping still matches expectations then the 649 * mapping->host can be safely accessed as being a valid inode. 650 */ 651 rcu_read_lock(); 652 653 if (READ_ONCE(page->mapping) != mapping) { 654 rcu_read_unlock(); 655 put_page(page); 656 657 goto again; 658 } 659 660 inode = READ_ONCE(mapping->host); 661 if (!inode) { 662 rcu_read_unlock(); 663 put_page(page); 664 665 goto again; 666 } 667 668 /* 669 * Take a reference unless it is about to be freed. Previously 670 * this reference was taken by ihold under the page lock 671 * pinning the inode in place so i_lock was unnecessary. The 672 * only way for this check to fail is if the inode was 673 * truncated in parallel which is almost certainly an 674 * application bug. In such a case, just retry. 675 * 676 * We are not calling into get_futex_key_refs() in file-backed 677 * cases, therefore a successful atomic_inc return below will 678 * guarantee that get_futex_key() will still imply smp_mb(); (B). 679 */ 680 if (!atomic_inc_not_zero(&inode->i_count)) { 681 rcu_read_unlock(); 682 put_page(page); 683 684 goto again; 685 } 686 687 /* Should be impossible but lets be paranoid for now */ 688 if (WARN_ON_ONCE(inode->i_mapping != mapping)) { 689 err = -EFAULT; 690 rcu_read_unlock(); 691 iput(inode); 692 693 goto out; 694 } 695 696 key->both.offset |= FUT_OFF_INODE; /* inode-based key */ 697 key->shared.inode = inode; 698 key->shared.pgoff = basepage_index(tail); 699 rcu_read_unlock(); 700 } 701 702 out: 703 put_page(page); 704 return err; 705 } 706 707 static inline void put_futex_key(union futex_key *key) 708 { 709 drop_futex_key_refs(key); 710 } 711 712 /** 713 * fault_in_user_writeable() - Fault in user address and verify RW access 714 * @uaddr: pointer to faulting user space address 715 * 716 * Slow path to fixup the fault we just took in the atomic write 717 * access to @uaddr. 718 * 719 * We have no generic implementation of a non-destructive write to the 720 * user address. We know that we faulted in the atomic pagefault 721 * disabled section so we can as well avoid the #PF overhead by 722 * calling get_user_pages() right away. 723 */ 724 static int fault_in_user_writeable(u32 __user *uaddr) 725 { 726 struct mm_struct *mm = current->mm; 727 int ret; 728 729 down_read(&mm->mmap_sem); 730 ret = fixup_user_fault(current, mm, (unsigned long)uaddr, 731 FAULT_FLAG_WRITE, NULL); 732 up_read(&mm->mmap_sem); 733 734 return ret < 0 ? 
ret : 0; 735 } 736 737 /** 738 * futex_top_waiter() - Return the highest priority waiter on a futex 739 * @hb: the hash bucket the futex_q's reside in 740 * @key: the futex key (to distinguish it from other futex futex_q's) 741 * 742 * Must be called with the hb lock held. 743 */ 744 static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, 745 union futex_key *key) 746 { 747 struct futex_q *this; 748 749 plist_for_each_entry(this, &hb->chain, list) { 750 if (match_futex(&this->key, key)) 751 return this; 752 } 753 return NULL; 754 } 755 756 static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr, 757 u32 uval, u32 newval) 758 { 759 int ret; 760 761 pagefault_disable(); 762 ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval); 763 pagefault_enable(); 764 765 return ret; 766 } 767 768 static int get_futex_value_locked(u32 *dest, u32 __user *from) 769 { 770 int ret; 771 772 pagefault_disable(); 773 ret = __get_user(*dest, from); 774 pagefault_enable(); 775 776 return ret ? -EFAULT : 0; 777 } 778 779 780 /* 781 * PI code: 782 */ 783 static int refill_pi_state_cache(void) 784 { 785 struct futex_pi_state *pi_state; 786 787 if (likely(current->pi_state_cache)) 788 return 0; 789 790 pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL); 791 792 if (!pi_state) 793 return -ENOMEM; 794 795 INIT_LIST_HEAD(&pi_state->list); 796 /* pi_mutex gets initialized later */ 797 pi_state->owner = NULL; 798 atomic_set(&pi_state->refcount, 1); 799 pi_state->key = FUTEX_KEY_INIT; 800 801 current->pi_state_cache = pi_state; 802 803 return 0; 804 } 805 806 static struct futex_pi_state *alloc_pi_state(void) 807 { 808 struct futex_pi_state *pi_state = current->pi_state_cache; 809 810 WARN_ON(!pi_state); 811 current->pi_state_cache = NULL; 812 813 return pi_state; 814 } 815 816 static void get_pi_state(struct futex_pi_state *pi_state) 817 { 818 WARN_ON_ONCE(!atomic_inc_not_zero(&pi_state->refcount)); 819 } 820 821 /* 822 * Drops a reference to the pi_state object and frees or caches it 823 * when the last reference is gone. 824 * 825 * Must be called with the hb lock held. 826 */ 827 static void put_pi_state(struct futex_pi_state *pi_state) 828 { 829 if (!pi_state) 830 return; 831 832 if (!atomic_dec_and_test(&pi_state->refcount)) 833 return; 834 835 /* 836 * If pi_state->owner is NULL, the owner is most probably dying 837 * and has cleaned up the pi_state already 838 */ 839 if (pi_state->owner) { 840 raw_spin_lock_irq(&pi_state->owner->pi_lock); 841 list_del_init(&pi_state->list); 842 raw_spin_unlock_irq(&pi_state->owner->pi_lock); 843 844 rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner); 845 } 846 847 if (current->pi_state_cache) 848 kfree(pi_state); 849 else { 850 /* 851 * pi_state->list is already empty. 852 * clear pi_state->owner. 853 * refcount is at 0 - put it back to 1. 854 */ 855 pi_state->owner = NULL; 856 atomic_set(&pi_state->refcount, 1); 857 current->pi_state_cache = pi_state; 858 } 859 } 860 861 /* 862 * Look up the task based on what TID userspace gave us. 863 * We dont trust it. 864 */ 865 static struct task_struct *futex_find_get_task(pid_t pid) 866 { 867 struct task_struct *p; 868 869 rcu_read_lock(); 870 p = find_task_by_vpid(pid); 871 if (p) 872 get_task_struct(p); 873 874 rcu_read_unlock(); 875 876 return p; 877 } 878 879 /* 880 * This task is holding PI mutexes at exit time => bad. 881 * Kernel cleans up PI-state, but userspace is likely hosed. 882 * (Robust-futex cleanup is separate and might save the day for userspace.) 
 */
void exit_pi_state_list(struct task_struct *curr)
{
        struct list_head *next, *head = &curr->pi_state_list;
        struct futex_pi_state *pi_state;
        struct futex_hash_bucket *hb;
        union futex_key key = FUTEX_KEY_INIT;

        if (!futex_cmpxchg_enabled)
                return;
        /*
         * We are a ZOMBIE and nobody can enqueue itself on
         * pi_state_list anymore, but we have to be careful
         * versus waiters unqueueing themselves:
         */
        raw_spin_lock_irq(&curr->pi_lock);
        while (!list_empty(head)) {

                next = head->next;
                pi_state = list_entry(next, struct futex_pi_state, list);
                key = pi_state->key;
                hb = hash_futex(&key);
                raw_spin_unlock_irq(&curr->pi_lock);

                spin_lock(&hb->lock);

                raw_spin_lock_irq(&curr->pi_lock);
                /*
                 * We dropped the pi-lock, so re-check whether this
                 * task still owns the PI-state:
                 */
                if (head->next != next) {
                        spin_unlock(&hb->lock);
                        continue;
                }

                WARN_ON(pi_state->owner != curr);
                WARN_ON(list_empty(&pi_state->list));
                list_del_init(&pi_state->list);
                pi_state->owner = NULL;
                raw_spin_unlock_irq(&curr->pi_lock);

                get_pi_state(pi_state);
                spin_unlock(&hb->lock);

                rt_mutex_futex_unlock(&pi_state->pi_mutex);
                put_pi_state(pi_state);

                raw_spin_lock_irq(&curr->pi_lock);
        }
        raw_spin_unlock_irq(&curr->pi_lock);
}

/*
 * We need to check the following states:
 *
 *      Waiter | pi_state | pi->owner | uTID      | uODIED | ?
 *
 * [1]  NULL   | ---      | ---       | 0         | 0/1    | Valid
 * [2]  NULL   | ---      | ---       | >0        | 0/1    | Valid
 *
 * [3]  Found  | NULL     | --        | Any       | 0/1    | Invalid
 *
 * [4]  Found  | Found    | NULL      | 0         | 1      | Valid
 * [5]  Found  | Found    | NULL      | >0        | 1      | Invalid
 *
 * [6]  Found  | Found    | task      | 0         | 1      | Valid
 *
 * [7]  Found  | Found    | NULL      | Any       | 0      | Invalid
 *
 * [8]  Found  | Found    | task      | ==taskTID | 0/1    | Valid
 * [9]  Found  | Found    | task      | 0         | 0      | Invalid
 * [10] Found  | Found    | task      | !=taskTID | 0/1    | Invalid
 *
 * [1]  Indicates that the kernel can acquire the futex atomically. We
 *      came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit.
 *
 * [2]  Valid, if TID does not belong to a kernel thread. If no matching
 *      thread is found then it indicates that the owner TID has died.
 *
 * [3]  Invalid. The waiter is queued on a non-PI futex.
 *
 * [4]  Valid state after exit_robust_list(), which sets the user space
 *      value to FUTEX_WAITERS | FUTEX_OWNER_DIED.
 *
 * [5]  The user space value got manipulated between exit_robust_list()
 *      and exit_pi_state_list().
 *
 * [6]  Valid state after exit_pi_state_list() which sets the new owner in
 *      the pi_state but cannot access the user space value.
 *
 * [7]  pi_state->owner can only be NULL when the OWNER_DIED bit is set.
 *
 * [8]  Owner and user space value match.
 *
 * [9]  There is no transient state which sets the user space TID to 0
 *      except exit_robust_list(), but this is indicated by the
 *      FUTEX_OWNER_DIED bit. See [4].
 *
 * [10] There is no transient state which leaves owner and user space
 *      TID out of sync.
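 * As a concrete reading of the table (illustrative): a waiter is found,
 * a pi_state exists, pi_state->owner points to a live task, the user space
 * TID matches that task and OWNER_DIED is clear - that is row [8] and the
 * state is consistent. The same situation with a mismatching TID is row
 * [10], and attaching must be refused.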
984 * 985 * 986 * Serialization and lifetime rules: 987 * 988 * hb->lock: 989 * 990 * hb -> futex_q, relation 991 * futex_q -> pi_state, relation 992 * 993 * (cannot be raw because hb can contain arbitrary amount 994 * of futex_q's) 995 * 996 * pi_mutex->wait_lock: 997 * 998 * {uval, pi_state} 999 * 1000 * (and pi_mutex 'obviously') 1001 * 1002 * p->pi_lock: 1003 * 1004 * p->pi_state_list -> pi_state->list, relation 1005 * 1006 * pi_state->refcount: 1007 * 1008 * pi_state lifetime 1009 * 1010 * 1011 * Lock order: 1012 * 1013 * hb->lock 1014 * pi_mutex->wait_lock 1015 * p->pi_lock 1016 * 1017 */ 1018 1019 /* 1020 * Validate that the existing waiter has a pi_state and sanity check 1021 * the pi_state against the user space value. If correct, attach to 1022 * it. 1023 */ 1024 static int attach_to_pi_state(u32 __user *uaddr, u32 uval, 1025 struct futex_pi_state *pi_state, 1026 struct futex_pi_state **ps) 1027 { 1028 pid_t pid = uval & FUTEX_TID_MASK; 1029 u32 uval2; 1030 int ret; 1031 1032 /* 1033 * Userspace might have messed up non-PI and PI futexes [3] 1034 */ 1035 if (unlikely(!pi_state)) 1036 return -EINVAL; 1037 1038 /* 1039 * We get here with hb->lock held, and having found a 1040 * futex_top_waiter(). This means that futex_lock_pi() of said futex_q 1041 * has dropped the hb->lock in between queue_me() and unqueue_me_pi(), 1042 * which in turn means that futex_lock_pi() still has a reference on 1043 * our pi_state. 1044 * 1045 * The waiter holding a reference on @pi_state also protects against 1046 * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi() 1047 * and futex_wait_requeue_pi() as it cannot go to 0 and consequently 1048 * free pi_state before we can take a reference ourselves. 1049 */ 1050 WARN_ON(!atomic_read(&pi_state->refcount)); 1051 1052 /* 1053 * Now that we have a pi_state, we can acquire wait_lock 1054 * and do the state validation. 1055 */ 1056 raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); 1057 1058 /* 1059 * Since {uval, pi_state} is serialized by wait_lock, and our current 1060 * uval was read without holding it, it can have changed. Verify it 1061 * still is what we expect it to be, otherwise retry the entire 1062 * operation. 1063 */ 1064 if (get_futex_value_locked(&uval2, uaddr)) 1065 goto out_efault; 1066 1067 if (uval != uval2) 1068 goto out_eagain; 1069 1070 /* 1071 * Handle the owner died case: 1072 */ 1073 if (uval & FUTEX_OWNER_DIED) { 1074 /* 1075 * exit_pi_state_list sets owner to NULL and wakes the 1076 * topmost waiter. The task which acquires the 1077 * pi_state->rt_mutex will fixup owner. 1078 */ 1079 if (!pi_state->owner) { 1080 /* 1081 * No pi state owner, but the user space TID 1082 * is not 0. Inconsistent state. [5] 1083 */ 1084 if (pid) 1085 goto out_einval; 1086 /* 1087 * Take a ref on the state and return success. [4] 1088 */ 1089 goto out_attach; 1090 } 1091 1092 /* 1093 * If TID is 0, then either the dying owner has not 1094 * yet executed exit_pi_state_list() or some waiter 1095 * acquired the rtmutex in the pi state, but did not 1096 * yet fixup the TID in user space. 1097 * 1098 * Take a ref on the state and return success. [6] 1099 */ 1100 if (!pid) 1101 goto out_attach; 1102 } else { 1103 /* 1104 * If the owner died bit is not set, then the pi_state 1105 * must have an owner. [7] 1106 */ 1107 if (!pi_state->owner) 1108 goto out_einval; 1109 } 1110 1111 /* 1112 * Bail out if user space manipulated the futex value. If pi 1113 * state exists then the owner TID must be the same as the 1114 * user space TID. 
[9/10] 1115 */ 1116 if (pid != task_pid_vnr(pi_state->owner)) 1117 goto out_einval; 1118 1119 out_attach: 1120 get_pi_state(pi_state); 1121 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); 1122 *ps = pi_state; 1123 return 0; 1124 1125 out_einval: 1126 ret = -EINVAL; 1127 goto out_error; 1128 1129 out_eagain: 1130 ret = -EAGAIN; 1131 goto out_error; 1132 1133 out_efault: 1134 ret = -EFAULT; 1135 goto out_error; 1136 1137 out_error: 1138 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); 1139 return ret; 1140 } 1141 1142 /* 1143 * Lookup the task for the TID provided from user space and attach to 1144 * it after doing proper sanity checks. 1145 */ 1146 static int attach_to_pi_owner(u32 uval, union futex_key *key, 1147 struct futex_pi_state **ps) 1148 { 1149 pid_t pid = uval & FUTEX_TID_MASK; 1150 struct futex_pi_state *pi_state; 1151 struct task_struct *p; 1152 1153 /* 1154 * We are the first waiter - try to look up the real owner and attach 1155 * the new pi_state to it, but bail out when TID = 0 [1] 1156 */ 1157 if (!pid) 1158 return -ESRCH; 1159 p = futex_find_get_task(pid); 1160 if (!p) 1161 return -ESRCH; 1162 1163 if (unlikely(p->flags & PF_KTHREAD)) { 1164 put_task_struct(p); 1165 return -EPERM; 1166 } 1167 1168 /* 1169 * We need to look at the task state flags to figure out, 1170 * whether the task is exiting. To protect against the do_exit 1171 * change of the task flags, we do this protected by 1172 * p->pi_lock: 1173 */ 1174 raw_spin_lock_irq(&p->pi_lock); 1175 if (unlikely(p->flags & PF_EXITING)) { 1176 /* 1177 * The task is on the way out. When PF_EXITPIDONE is 1178 * set, we know that the task has finished the 1179 * cleanup: 1180 */ 1181 int ret = (p->flags & PF_EXITPIDONE) ? -ESRCH : -EAGAIN; 1182 1183 raw_spin_unlock_irq(&p->pi_lock); 1184 put_task_struct(p); 1185 return ret; 1186 } 1187 1188 /* 1189 * No existing pi state. First waiter. [2] 1190 * 1191 * This creates pi_state, we have hb->lock held, this means nothing can 1192 * observe this state, wait_lock is irrelevant. 1193 */ 1194 pi_state = alloc_pi_state(); 1195 1196 /* 1197 * Initialize the pi_mutex in locked state and make @p 1198 * the owner of it: 1199 */ 1200 rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p); 1201 1202 /* Store the key for possible exit cleanups: */ 1203 pi_state->key = *key; 1204 1205 WARN_ON(!list_empty(&pi_state->list)); 1206 list_add(&pi_state->list, &p->pi_state_list); 1207 pi_state->owner = p; 1208 raw_spin_unlock_irq(&p->pi_lock); 1209 1210 put_task_struct(p); 1211 1212 *ps = pi_state; 1213 1214 return 0; 1215 } 1216 1217 static int lookup_pi_state(u32 __user *uaddr, u32 uval, 1218 struct futex_hash_bucket *hb, 1219 union futex_key *key, struct futex_pi_state **ps) 1220 { 1221 struct futex_q *top_waiter = futex_top_waiter(hb, key); 1222 1223 /* 1224 * If there is a waiter on that futex, validate it and 1225 * attach to the pi_state when the validation succeeds. 1226 */ 1227 if (top_waiter) 1228 return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps); 1229 1230 /* 1231 * We are the first waiter - try to look up the owner based on 1232 * @uval and attach to it. 
1233 */ 1234 return attach_to_pi_owner(uval, key, ps); 1235 } 1236 1237 static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval) 1238 { 1239 u32 uninitialized_var(curval); 1240 1241 if (unlikely(should_fail_futex(true))) 1242 return -EFAULT; 1243 1244 if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))) 1245 return -EFAULT; 1246 1247 /* If user space value changed, let the caller retry */ 1248 return curval != uval ? -EAGAIN : 0; 1249 } 1250 1251 /** 1252 * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex 1253 * @uaddr: the pi futex user address 1254 * @hb: the pi futex hash bucket 1255 * @key: the futex key associated with uaddr and hb 1256 * @ps: the pi_state pointer where we store the result of the 1257 * lookup 1258 * @task: the task to perform the atomic lock work for. This will 1259 * be "current" except in the case of requeue pi. 1260 * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) 1261 * 1262 * Return: 1263 * 0 - ready to wait; 1264 * 1 - acquired the lock; 1265 * <0 - error 1266 * 1267 * The hb->lock and futex_key refs shall be held by the caller. 1268 */ 1269 static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, 1270 union futex_key *key, 1271 struct futex_pi_state **ps, 1272 struct task_struct *task, int set_waiters) 1273 { 1274 u32 uval, newval, vpid = task_pid_vnr(task); 1275 struct futex_q *top_waiter; 1276 int ret; 1277 1278 /* 1279 * Read the user space value first so we can validate a few 1280 * things before proceeding further. 1281 */ 1282 if (get_futex_value_locked(&uval, uaddr)) 1283 return -EFAULT; 1284 1285 if (unlikely(should_fail_futex(true))) 1286 return -EFAULT; 1287 1288 /* 1289 * Detect deadlocks. 1290 */ 1291 if ((unlikely((uval & FUTEX_TID_MASK) == vpid))) 1292 return -EDEADLK; 1293 1294 if ((unlikely(should_fail_futex(true)))) 1295 return -EDEADLK; 1296 1297 /* 1298 * Lookup existing state first. If it exists, try to attach to 1299 * its pi_state. 1300 */ 1301 top_waiter = futex_top_waiter(hb, key); 1302 if (top_waiter) 1303 return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps); 1304 1305 /* 1306 * No waiter and user TID is 0. We are here because the 1307 * waiters or the owner died bit is set or called from 1308 * requeue_cmp_pi or for whatever reason something took the 1309 * syscall. 1310 */ 1311 if (!(uval & FUTEX_TID_MASK)) { 1312 /* 1313 * We take over the futex. No other waiters and the user space 1314 * TID is 0. We preserve the owner died bit. 1315 */ 1316 newval = uval & FUTEX_OWNER_DIED; 1317 newval |= vpid; 1318 1319 /* The futex requeue_pi code can enforce the waiters bit */ 1320 if (set_waiters) 1321 newval |= FUTEX_WAITERS; 1322 1323 ret = lock_pi_update_atomic(uaddr, uval, newval); 1324 /* If the take over worked, return 1 */ 1325 return ret < 0 ? ret : 1; 1326 } 1327 1328 /* 1329 * First waiter. Set the waiters bit before attaching ourself to 1330 * the owner. If owner tries to unlock, it will be forced into 1331 * the kernel and blocked on hb->lock. 1332 */ 1333 newval = uval | FUTEX_WAITERS; 1334 ret = lock_pi_update_atomic(uaddr, uval, newval); 1335 if (ret) 1336 return ret; 1337 /* 1338 * If the update of the user space value succeeded, we try to 1339 * attach to the owner. If that fails, no harm done, we only 1340 * set the FUTEX_WAITERS bit in the user space variable. 
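         * A small worked example of the two transitions above, with an
         * illustrative vpid of 42:
         *
         *      uval == 0                -> newval == 42 (takeover, return 1)
         *      uval == FUTEX_OWNER_DIED -> newval == FUTEX_OWNER_DIED | 42
         *      uval == 17 (owner's TID) -> newval == FUTEX_WAITERS | 17,
         *                                  then attach_to_pi_owner(17)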
         */
        return attach_to_pi_owner(uval, key, ps);
}

/**
 * __unqueue_futex() - Remove the futex_q from its futex_hash_bucket
 * @q:  The futex_q to unqueue
 *
 * The q->lock_ptr must not be NULL and must be held by the caller.
 */
static void __unqueue_futex(struct futex_q *q)
{
        struct futex_hash_bucket *hb;

        if (WARN_ON_SMP(!q->lock_ptr || !spin_is_locked(q->lock_ptr))
            || WARN_ON(plist_node_empty(&q->list)))
                return;

        hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
        plist_del(&q->list, &hb->chain);
        hb_waiters_dec(hb);
}

/*
 * The hash bucket lock must be held when this is called.
 * Afterwards, the futex_q must not be accessed. Callers
 * must ensure to later call wake_up_q() for the actual
 * wakeups to occur.
 */
static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q)
{
        struct task_struct *p = q->task;

        if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n"))
                return;

        /*
         * Queue the task for later wakeup, after we've released
         * the hb->lock. wake_q_add() grabs a reference to p.
         */
        wake_q_add(wake_q, p);
        __unqueue_futex(q);
        /*
         * The waiting task can free the futex_q as soon as q->lock_ptr = NULL
         * is written, without taking any locks. This is possible in the event
         * of a spurious wakeup, for example. A memory barrier is required here
         * to prevent the following store to lock_ptr from getting ahead of the
         * plist_del in __unqueue_futex().
         */
        smp_store_release(&q->lock_ptr, NULL);
}

/*
 * Caller must hold a reference on @pi_state.
 */
static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state)
{
        u32 uninitialized_var(curval), newval;
        struct task_struct *new_owner;
        bool postunlock = false;
        DEFINE_WAKE_Q(wake_q);
        int ret = 0;

        new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
        if (WARN_ON_ONCE(!new_owner)) {
                /*
                 * As per the comment in futex_unlock_pi() this should not happen.
                 *
                 * When this happens, give up our locks and try again, giving
                 * the futex_lock_pi() instance time to complete, either by
                 * waiting on the rtmutex or removing itself from the futex
                 * queue.
                 */
                ret = -EAGAIN;
                goto out_unlock;
        }

        /*
         * We pass it to the next owner. The WAITERS bit is always kept
         * enabled while there is PI state around. We clean up the owner
         * died bit, because we are the owner.
         */
        newval = FUTEX_WAITERS | task_pid_vnr(new_owner);

        if (unlikely(should_fail_futex(true)))
                ret = -EFAULT;

        if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) {
                ret = -EFAULT;

        } else if (curval != uval) {
                /*
                 * If an unconditional UNLOCK_PI operation (user space did not
                 * try the TID->0 transition) raced with a waiter setting the
                 * FUTEX_WAITERS flag between get_user() and locking the hash
                 * bucket lock, retry the operation.
                 */
                if ((FUTEX_TID_MASK & curval) == uval)
                        ret = -EAGAIN;
                else
                        ret = -EINVAL;
        }

        if (ret)
                goto out_unlock;

        /*
         * This is a point of no return; once we modify the uval there is no
         * going back and subsequent operations must not fail.
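         * To illustrate the handover with made-up TIDs: if uval is
         * (FUTEX_WAITERS | 17) for the current owner and the top waiter has
         * TID 42, the cmpxchg above has already moved the user space value
         * to (FUTEX_WAITERS | 42); all that is left is transferring the
         * pi_state and rt_mutex ownership below, which must succeed.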
1450 */ 1451 1452 raw_spin_lock(&pi_state->owner->pi_lock); 1453 WARN_ON(list_empty(&pi_state->list)); 1454 list_del_init(&pi_state->list); 1455 raw_spin_unlock(&pi_state->owner->pi_lock); 1456 1457 raw_spin_lock(&new_owner->pi_lock); 1458 WARN_ON(!list_empty(&pi_state->list)); 1459 list_add(&pi_state->list, &new_owner->pi_state_list); 1460 pi_state->owner = new_owner; 1461 raw_spin_unlock(&new_owner->pi_lock); 1462 1463 postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q); 1464 1465 out_unlock: 1466 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); 1467 1468 if (postunlock) 1469 rt_mutex_postunlock(&wake_q); 1470 1471 return ret; 1472 } 1473 1474 /* 1475 * Express the locking dependencies for lockdep: 1476 */ 1477 static inline void 1478 double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) 1479 { 1480 if (hb1 <= hb2) { 1481 spin_lock(&hb1->lock); 1482 if (hb1 < hb2) 1483 spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING); 1484 } else { /* hb1 > hb2 */ 1485 spin_lock(&hb2->lock); 1486 spin_lock_nested(&hb1->lock, SINGLE_DEPTH_NESTING); 1487 } 1488 } 1489 1490 static inline void 1491 double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) 1492 { 1493 spin_unlock(&hb1->lock); 1494 if (hb1 != hb2) 1495 spin_unlock(&hb2->lock); 1496 } 1497 1498 /* 1499 * Wake up waiters matching bitset queued on this futex (uaddr). 1500 */ 1501 static int 1502 futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) 1503 { 1504 struct futex_hash_bucket *hb; 1505 struct futex_q *this, *next; 1506 union futex_key key = FUTEX_KEY_INIT; 1507 int ret; 1508 DEFINE_WAKE_Q(wake_q); 1509 1510 if (!bitset) 1511 return -EINVAL; 1512 1513 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_READ); 1514 if (unlikely(ret != 0)) 1515 goto out; 1516 1517 hb = hash_futex(&key); 1518 1519 /* Make sure we really have tasks to wakeup */ 1520 if (!hb_waiters_pending(hb)) 1521 goto out_put_key; 1522 1523 spin_lock(&hb->lock); 1524 1525 plist_for_each_entry_safe(this, next, &hb->chain, list) { 1526 if (match_futex (&this->key, &key)) { 1527 if (this->pi_state || this->rt_waiter) { 1528 ret = -EINVAL; 1529 break; 1530 } 1531 1532 /* Check if one of the bits is set in both bitsets */ 1533 if (!(this->bitset & bitset)) 1534 continue; 1535 1536 mark_wake_futex(&wake_q, this); 1537 if (++ret >= nr_wake) 1538 break; 1539 } 1540 } 1541 1542 spin_unlock(&hb->lock); 1543 wake_up_q(&wake_q); 1544 out_put_key: 1545 put_futex_key(&key); 1546 out: 1547 return ret; 1548 } 1549 1550 /* 1551 * Wake up all waiters hashed on the physical page that is mapped 1552 * to this virtual address: 1553 */ 1554 static int 1555 futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, 1556 int nr_wake, int nr_wake2, int op) 1557 { 1558 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; 1559 struct futex_hash_bucket *hb1, *hb2; 1560 struct futex_q *this, *next; 1561 int ret, op_ret; 1562 DEFINE_WAKE_Q(wake_q); 1563 1564 retry: 1565 ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ); 1566 if (unlikely(ret != 0)) 1567 goto out; 1568 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE); 1569 if (unlikely(ret != 0)) 1570 goto out_put_key1; 1571 1572 hb1 = hash_futex(&key1); 1573 hb2 = hash_futex(&key2); 1574 1575 retry_private: 1576 double_lock_hb(hb1, hb2); 1577 op_ret = futex_atomic_op_inuser(op, uaddr2); 1578 if (unlikely(op_ret < 0)) { 1579 1580 double_unlock_hb(hb1, hb2); 1581 1582 #ifndef CONFIG_MMU 1583 /* 
1584 * we don't get EFAULT from MMU faults if we don't have an MMU, 1585 * but we might get them from range checking 1586 */ 1587 ret = op_ret; 1588 goto out_put_keys; 1589 #endif 1590 1591 if (unlikely(op_ret != -EFAULT)) { 1592 ret = op_ret; 1593 goto out_put_keys; 1594 } 1595 1596 ret = fault_in_user_writeable(uaddr2); 1597 if (ret) 1598 goto out_put_keys; 1599 1600 if (!(flags & FLAGS_SHARED)) 1601 goto retry_private; 1602 1603 put_futex_key(&key2); 1604 put_futex_key(&key1); 1605 goto retry; 1606 } 1607 1608 plist_for_each_entry_safe(this, next, &hb1->chain, list) { 1609 if (match_futex (&this->key, &key1)) { 1610 if (this->pi_state || this->rt_waiter) { 1611 ret = -EINVAL; 1612 goto out_unlock; 1613 } 1614 mark_wake_futex(&wake_q, this); 1615 if (++ret >= nr_wake) 1616 break; 1617 } 1618 } 1619 1620 if (op_ret > 0) { 1621 op_ret = 0; 1622 plist_for_each_entry_safe(this, next, &hb2->chain, list) { 1623 if (match_futex (&this->key, &key2)) { 1624 if (this->pi_state || this->rt_waiter) { 1625 ret = -EINVAL; 1626 goto out_unlock; 1627 } 1628 mark_wake_futex(&wake_q, this); 1629 if (++op_ret >= nr_wake2) 1630 break; 1631 } 1632 } 1633 ret += op_ret; 1634 } 1635 1636 out_unlock: 1637 double_unlock_hb(hb1, hb2); 1638 wake_up_q(&wake_q); 1639 out_put_keys: 1640 put_futex_key(&key2); 1641 out_put_key1: 1642 put_futex_key(&key1); 1643 out: 1644 return ret; 1645 } 1646 1647 /** 1648 * requeue_futex() - Requeue a futex_q from one hb to another 1649 * @q: the futex_q to requeue 1650 * @hb1: the source hash_bucket 1651 * @hb2: the target hash_bucket 1652 * @key2: the new key for the requeued futex_q 1653 */ 1654 static inline 1655 void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, 1656 struct futex_hash_bucket *hb2, union futex_key *key2) 1657 { 1658 1659 /* 1660 * If key1 and key2 hash to the same bucket, no need to 1661 * requeue. 1662 */ 1663 if (likely(&hb1->chain != &hb2->chain)) { 1664 plist_del(&q->list, &hb1->chain); 1665 hb_waiters_dec(hb1); 1666 hb_waiters_inc(hb2); 1667 plist_add(&q->list, &hb2->chain); 1668 q->lock_ptr = &hb2->lock; 1669 } 1670 get_futex_key_refs(key2); 1671 q->key = *key2; 1672 } 1673 1674 /** 1675 * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue 1676 * @q: the futex_q 1677 * @key: the key of the requeue target futex 1678 * @hb: the hash_bucket of the requeue target futex 1679 * 1680 * During futex_requeue, with requeue_pi=1, it is possible to acquire the 1681 * target futex if it is uncontended or via a lock steal. Set the futex_q key 1682 * to the requeue target futex so the waiter can detect the wakeup on the right 1683 * futex, but remove it from the hb and NULL the rt_waiter so it can detect 1684 * atomic lock acquisition. Set the q->lock_ptr to the requeue target hb->lock 1685 * to protect access to the pi_state to fixup the owner later. Must be called 1686 * with both q->lock_ptr and hb->lock held. 
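 * A sketch of what the woken waiter side (futex_wait_requeue_pi(), not part
 * of this excerpt) can rely on afterwards - illustrative, not verbatim code:
 *
 *      q->rt_waiter == NULL  -> the lock was acquired atomically for it
 *      q->key == *key        -> the wakeup happened on the requeue target
 *      q->lock_ptr           -> points at the target hb->lock for the later
 *                               pi_state owner fixup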
 */
static inline
void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
                           struct futex_hash_bucket *hb)
{
        get_futex_key_refs(key);
        q->key = *key;

        __unqueue_futex(q);

        WARN_ON(!q->rt_waiter);
        q->rt_waiter = NULL;

        q->lock_ptr = &hb->lock;

        wake_up_state(q->task, TASK_NORMAL);
}

/**
 * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter
 * @pifutex:            the user address of the to futex
 * @hb1:                the from futex hash bucket, must be locked by the caller
 * @hb2:                the to futex hash bucket, must be locked by the caller
 * @key1:               the from futex key
 * @key2:               the to futex key
 * @ps:                 address to store the pi_state pointer
 * @set_waiters:        force setting the FUTEX_WAITERS bit (1) or not (0)
 *
 * Try to get the lock on behalf of the top waiter if we can do it atomically.
 * Wake the top waiter if we succeed. If the caller specified set_waiters,
 * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.
 * hb1 and hb2 must be held by the caller.
 *
 * Return:
 *  0 - failed to acquire the lock atomically;
 * >0 - acquired the lock, return value is vpid of the top_waiter
 * <0 - error
 */
static int futex_proxy_trylock_atomic(u32 __user *pifutex,
                                      struct futex_hash_bucket *hb1,
                                      struct futex_hash_bucket *hb2,
                                      union futex_key *key1, union futex_key *key2,
                                      struct futex_pi_state **ps, int set_waiters)
{
        struct futex_q *top_waiter = NULL;
        u32 curval;
        int ret, vpid;

        if (get_futex_value_locked(&curval, pifutex))
                return -EFAULT;

        if (unlikely(should_fail_futex(true)))
                return -EFAULT;

        /*
         * Find the top_waiter and determine if there are additional waiters.
         * If the caller intends to requeue more than 1 waiter to pifutex,
         * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now,
         * as we have means to handle the possible fault. If not, don't set
         * the bit unnecessarily as it will force the subsequent unlock to
         * enter the kernel.
         */
        top_waiter = futex_top_waiter(hb1, key1);

        /* There are no waiters, nothing for us to do. */
        if (!top_waiter)
                return 0;

        /* Ensure we requeue to the expected futex. */
        if (!match_futex(top_waiter->requeue_pi_key, key2))
                return -EINVAL;

        /*
         * Try to take the lock for top_waiter. Set the FUTEX_WAITERS bit in
         * the contended case or if set_waiters is 1. The pi_state is returned
         * in ps in contended cases.
         */
        vpid = task_pid_vnr(top_waiter->task);
        ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
                                   set_waiters);
        if (ret == 1) {
                requeue_pi_wake_futex(top_waiter, key2, hb2);
                return vpid;
        }
        return ret;
}

/**
 * futex_requeue() - Requeue waiters from uaddr1 to uaddr2
 * @uaddr1:     source futex user address
 * @flags:      futex flags (FLAGS_SHARED, etc.)
 * @uaddr2:     target futex user address
 * @nr_wake:    number of waiters to wake (must be 1 for requeue_pi)
 * @nr_requeue: number of waiters to requeue (0-INT_MAX)
 * @cmpval:     @uaddr1 expected value (or %NULL)
 * @requeue_pi: if we are attempting to requeue from a non-pi futex to a
 *              pi futex (pi to pi requeue is not supported)
 *
 * Requeue waiters on uaddr1 to uaddr2.
In the requeue_pi case, try to acquire 1786 * uaddr2 atomically on behalf of the top waiter. 1787 * 1788 * Return: 1789 * >=0 - on success, the number of tasks requeued or woken; 1790 * <0 - on error 1791 */ 1792 static int futex_requeue(u32 __user *uaddr1, unsigned int flags, 1793 u32 __user *uaddr2, int nr_wake, int nr_requeue, 1794 u32 *cmpval, int requeue_pi) 1795 { 1796 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; 1797 int drop_count = 0, task_count = 0, ret; 1798 struct futex_pi_state *pi_state = NULL; 1799 struct futex_hash_bucket *hb1, *hb2; 1800 struct futex_q *this, *next; 1801 DEFINE_WAKE_Q(wake_q); 1802 1803 if (requeue_pi) { 1804 /* 1805 * Requeue PI only works on two distinct uaddrs. This 1806 * check is only valid for private futexes. See below. 1807 */ 1808 if (uaddr1 == uaddr2) 1809 return -EINVAL; 1810 1811 /* 1812 * requeue_pi requires a pi_state, try to allocate it now 1813 * without any locks in case it fails. 1814 */ 1815 if (refill_pi_state_cache()) 1816 return -ENOMEM; 1817 /* 1818 * requeue_pi must wake as many tasks as it can, up to nr_wake 1819 * + nr_requeue, since it acquires the rt_mutex prior to 1820 * returning to userspace, so as to not leave the rt_mutex with 1821 * waiters and no owner. However, second and third wake-ups 1822 * cannot be predicted as they involve race conditions with the 1823 * first wake and a fault while looking up the pi_state. Both 1824 * pthread_cond_signal() and pthread_cond_broadcast() should 1825 * use nr_wake=1. 1826 */ 1827 if (nr_wake != 1) 1828 return -EINVAL; 1829 } 1830 1831 retry: 1832 ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ); 1833 if (unlikely(ret != 0)) 1834 goto out; 1835 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, 1836 requeue_pi ? VERIFY_WRITE : VERIFY_READ); 1837 if (unlikely(ret != 0)) 1838 goto out_put_key1; 1839 1840 /* 1841 * The check above which compares uaddrs is not sufficient for 1842 * shared futexes. We need to compare the keys: 1843 */ 1844 if (requeue_pi && match_futex(&key1, &key2)) { 1845 ret = -EINVAL; 1846 goto out_put_keys; 1847 } 1848 1849 hb1 = hash_futex(&key1); 1850 hb2 = hash_futex(&key2); 1851 1852 retry_private: 1853 hb_waiters_inc(hb2); 1854 double_lock_hb(hb1, hb2); 1855 1856 if (likely(cmpval != NULL)) { 1857 u32 curval; 1858 1859 ret = get_futex_value_locked(&curval, uaddr1); 1860 1861 if (unlikely(ret)) { 1862 double_unlock_hb(hb1, hb2); 1863 hb_waiters_dec(hb2); 1864 1865 ret = get_user(curval, uaddr1); 1866 if (ret) 1867 goto out_put_keys; 1868 1869 if (!(flags & FLAGS_SHARED)) 1870 goto retry_private; 1871 1872 put_futex_key(&key2); 1873 put_futex_key(&key1); 1874 goto retry; 1875 } 1876 if (curval != *cmpval) { 1877 ret = -EAGAIN; 1878 goto out_unlock; 1879 } 1880 } 1881 1882 if (requeue_pi && (task_count - nr_wake < nr_requeue)) { 1883 /* 1884 * Attempt to acquire uaddr2 and wake the top waiter. If we 1885 * intend to requeue waiters, force setting the FUTEX_WAITERS 1886 * bit. We force this here where we are able to easily handle 1887 * faults rather in the requeue loop below. 1888 */ 1889 ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1, 1890 &key2, &pi_state, nr_requeue); 1891 1892 /* 1893 * At this point the top_waiter has either taken uaddr2 or is 1894 * waiting on it. If the former, then the pi_state will not 1895 * exist yet, look it up one more time to ensure we have a 1896 * reference to it. If the lock was taken, ret contains the 1897 * vpid of the top waiter task. 
1898 * If the lock was not taken, we have pi_state and an initial 1899 * refcount on it. In case of an error we have nothing. 1900 */ 1901 if (ret > 0) { 1902 WARN_ON(pi_state); 1903 drop_count++; 1904 task_count++; 1905 /* 1906 * If we acquired the lock, then the user space value 1907 * of uaddr2 should be vpid. It cannot be changed by 1908 * the top waiter as it is blocked on hb2 lock if it 1909 * tries to do so. If something fiddled with it behind 1910 * our back the pi state lookup might unearth it. So 1911 * we rather use the known value than rereading and 1912 * handing potential crap to lookup_pi_state. 1913 * 1914 * If that call succeeds then we have pi_state and an 1915 * initial refcount on it. 1916 */ 1917 ret = lookup_pi_state(uaddr2, ret, hb2, &key2, &pi_state); 1918 } 1919 1920 switch (ret) { 1921 case 0: 1922 /* We hold a reference on the pi state. */ 1923 break; 1924 1925 /* If the above failed, then pi_state is NULL */ 1926 case -EFAULT: 1927 double_unlock_hb(hb1, hb2); 1928 hb_waiters_dec(hb2); 1929 put_futex_key(&key2); 1930 put_futex_key(&key1); 1931 ret = fault_in_user_writeable(uaddr2); 1932 if (!ret) 1933 goto retry; 1934 goto out; 1935 case -EAGAIN: 1936 /* 1937 * Two reasons for this: 1938 * - Owner is exiting and we just wait for the 1939 * exit to complete. 1940 * - The user space value changed. 1941 */ 1942 double_unlock_hb(hb1, hb2); 1943 hb_waiters_dec(hb2); 1944 put_futex_key(&key2); 1945 put_futex_key(&key1); 1946 cond_resched(); 1947 goto retry; 1948 default: 1949 goto out_unlock; 1950 } 1951 } 1952 1953 plist_for_each_entry_safe(this, next, &hb1->chain, list) { 1954 if (task_count - nr_wake >= nr_requeue) 1955 break; 1956 1957 if (!match_futex(&this->key, &key1)) 1958 continue; 1959 1960 /* 1961 * FUTEX_WAIT_REQEUE_PI and FUTEX_CMP_REQUEUE_PI should always 1962 * be paired with each other and no other futex ops. 1963 * 1964 * We should never be requeueing a futex_q with a pi_state, 1965 * which is awaiting a futex_unlock_pi(). 1966 */ 1967 if ((requeue_pi && !this->rt_waiter) || 1968 (!requeue_pi && this->rt_waiter) || 1969 this->pi_state) { 1970 ret = -EINVAL; 1971 break; 1972 } 1973 1974 /* 1975 * Wake nr_wake waiters. For requeue_pi, if we acquired the 1976 * lock, we already woke the top_waiter. If not, it will be 1977 * woken by futex_unlock_pi(). 1978 */ 1979 if (++task_count <= nr_wake && !requeue_pi) { 1980 mark_wake_futex(&wake_q, this); 1981 continue; 1982 } 1983 1984 /* Ensure we requeue to the expected futex for requeue_pi. */ 1985 if (requeue_pi && !match_futex(this->requeue_pi_key, &key2)) { 1986 ret = -EINVAL; 1987 break; 1988 } 1989 1990 /* 1991 * Requeue nr_requeue waiters and possibly one more in the case 1992 * of requeue_pi if we couldn't acquire the lock atomically. 1993 */ 1994 if (requeue_pi) { 1995 /* 1996 * Prepare the waiter to take the rt_mutex. Take a 1997 * refcount on the pi_state and store the pointer in 1998 * the futex_q object of the waiter. 1999 */ 2000 get_pi_state(pi_state); 2001 this->pi_state = pi_state; 2002 ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex, 2003 this->rt_waiter, 2004 this->task); 2005 if (ret == 1) { 2006 /* 2007 * We got the lock. We do neither drop the 2008 * refcount on pi_state nor clear 2009 * this->pi_state because the waiter needs the 2010 * pi_state for cleaning up the user space 2011 * value. It will drop the refcount after 2012 * doing so. 
2013 */ 2014 requeue_pi_wake_futex(this, &key2, hb2); 2015 drop_count++; 2016 continue; 2017 } else if (ret) { 2018 /* 2019 * rt_mutex_start_proxy_lock() detected a 2020 * potential deadlock when we tried to queue 2021 * that waiter. Drop the pi_state reference 2022 * which we took above and remove the pointer 2023 * to the state from the waiters futex_q 2024 * object. 2025 */ 2026 this->pi_state = NULL; 2027 put_pi_state(pi_state); 2028 /* 2029 * We stop queueing more waiters and let user 2030 * space deal with the mess. 2031 */ 2032 break; 2033 } 2034 } 2035 requeue_futex(this, hb1, hb2, &key2); 2036 drop_count++; 2037 } 2038 2039 /* 2040 * We took an extra initial reference to the pi_state either 2041 * in futex_proxy_trylock_atomic() or in lookup_pi_state(). We 2042 * need to drop it here again. 2043 */ 2044 put_pi_state(pi_state); 2045 2046 out_unlock: 2047 double_unlock_hb(hb1, hb2); 2048 wake_up_q(&wake_q); 2049 hb_waiters_dec(hb2); 2050 2051 /* 2052 * drop_futex_key_refs() must be called outside the spinlocks. During 2053 * the requeue we moved futex_q's from the hash bucket at key1 to the 2054 * one at key2 and updated their key pointer. We no longer need to 2055 * hold the references to key1. 2056 */ 2057 while (--drop_count >= 0) 2058 drop_futex_key_refs(&key1); 2059 2060 out_put_keys: 2061 put_futex_key(&key2); 2062 out_put_key1: 2063 put_futex_key(&key1); 2064 out: 2065 return ret ? ret : task_count; 2066 } 2067 2068 /* The key must be already stored in q->key. */ 2069 static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) 2070 __acquires(&hb->lock) 2071 { 2072 struct futex_hash_bucket *hb; 2073 2074 hb = hash_futex(&q->key); 2075 2076 /* 2077 * Increment the counter before taking the lock so that 2078 * a potential waker won't miss a to-be-slept task that is 2079 * waiting for the spinlock. This is safe as all queue_lock() 2080 * users end up calling queue_me(). Similarly, for housekeeping, 2081 * decrement the counter at queue_unlock() when some error has 2082 * occurred and we don't end up adding the task to the list. 2083 */ 2084 hb_waiters_inc(hb); 2085 2086 q->lock_ptr = &hb->lock; 2087 2088 spin_lock(&hb->lock); /* implies smp_mb(); (A) */ 2089 return hb; 2090 } 2091 2092 static inline void 2093 queue_unlock(struct futex_hash_bucket *hb) 2094 __releases(&hb->lock) 2095 { 2096 spin_unlock(&hb->lock); 2097 hb_waiters_dec(hb); 2098 } 2099 2100 static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb) 2101 { 2102 int prio; 2103 2104 /* 2105 * The priority used to register this element is 2106 * - either the real thread-priority for the real-time threads 2107 * (i.e. threads with a priority lower than MAX_RT_PRIO) 2108 * - or MAX_RT_PRIO for non-RT threads. 2109 * Thus, all RT-threads are woken first in priority order, and 2110 * the others are woken last, in FIFO order. 2111 */ 2112 prio = min(current->normal_prio, MAX_RT_PRIO); 2113 2114 plist_node_init(&q->list, prio); 2115 plist_add(&q->list, &hb->chain); 2116 q->task = current; 2117 } 2118 2119 /** 2120 * queue_me() - Enqueue the futex_q on the futex_hash_bucket 2121 * @q: The futex_q to enqueue 2122 * @hb: The destination hash bucket 2123 * 2124 * The hb->lock must be held by the caller, and is released here. A call to 2125 * queue_me() is typically paired with exactly one call to unqueue_me(). 
The 2126 * exceptions involve the PI related operations, which may use unqueue_me_pi() 2127 * or nothing if the unqueue is done as part of the wake process and the unqueue 2128 * state is implicit in the state of the woken task (see futex_wait_requeue_pi() for 2129 * an example). 2130 */ 2131 static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) 2132 __releases(&hb->lock) 2133 { 2134 __queue_me(q, hb); 2135 spin_unlock(&hb->lock); 2136 } 2137 2138 /** 2139 * unqueue_me() - Remove the futex_q from its futex_hash_bucket 2140 * @q: The futex_q to unqueue 2141 * 2142 * The q->lock_ptr must not be held by the caller. A call to unqueue_me() must 2143 * be paired with exactly one earlier call to queue_me(). 2144 * 2145 * Return: 2146 * 1 - if the futex_q was still queued (and we unqueued it); 2147 * 0 - if the futex_q was already removed by the waking thread 2148 */ 2149 static int unqueue_me(struct futex_q *q) 2150 { 2151 spinlock_t *lock_ptr; 2152 int ret = 0; 2153 2154 /* In the common case we don't take the spinlock, which is nice. */ 2155 retry: 2156 /* 2157 * q->lock_ptr can change between this read and the following spin_lock. 2158 * Use READ_ONCE to forbid the compiler from reloading q->lock_ptr and 2159 * optimizing lock_ptr out of the logic below. 2160 */ 2161 lock_ptr = READ_ONCE(q->lock_ptr); 2162 if (lock_ptr != NULL) { 2163 spin_lock(lock_ptr); 2164 /* 2165 * q->lock_ptr can change between reading it and 2166 * spin_lock(), causing us to take the wrong lock. This 2167 * corrects the race condition. 2168 * 2169 * Reasoning goes like this: if we have the wrong lock, 2170 * q->lock_ptr must have changed (maybe several times) 2171 * between reading it and the spin_lock(). It can 2172 * change again after the spin_lock() but only if it was 2173 * already changed before the spin_lock(). It cannot, 2174 * however, change back to the original value. Therefore 2175 * we can detect whether we acquired the correct lock. 2176 */ 2177 if (unlikely(lock_ptr != q->lock_ptr)) { 2178 spin_unlock(lock_ptr); 2179 goto retry; 2180 } 2181 __unqueue_futex(q); 2182 2183 BUG_ON(q->pi_state); 2184 2185 spin_unlock(lock_ptr); 2186 ret = 1; 2187 } 2188 2189 drop_futex_key_refs(&q->key); 2190 return ret; 2191 } 2192 2193 /* 2194 * PI futexes can not be requeued and must remove themselves from the 2195 * hash bucket. The hash bucket lock (i.e. lock_ptr) is held on entry 2196 * and dropped here. 2197 */ 2198 static void unqueue_me_pi(struct futex_q *q) 2199 __releases(q->lock_ptr) 2200 { 2201 __unqueue_futex(q); 2202 2203 BUG_ON(!q->pi_state); 2204 put_pi_state(q->pi_state); 2205 q->pi_state = NULL; 2206 2207 spin_unlock(q->lock_ptr); 2208 } 2209 2210 /* 2211 * Fixup the pi_state owner with the new owner. 2212 * 2213 * Must be called with hash bucket lock held and mm->sem held for non 2214 * private futexes. 2215 */ 2216 static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, 2217 struct task_struct *newowner) 2218 { 2219 u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; 2220 struct futex_pi_state *pi_state = q->pi_state; 2221 u32 uval, uninitialized_var(curval), newval; 2222 struct task_struct *oldowner; 2223 int ret; 2224 2225 raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); 2226 2227 oldowner = pi_state->owner; 2228 /* Owner died?
*/ 2229 if (!pi_state->owner) 2230 newtid |= FUTEX_OWNER_DIED; 2231 2232 /* 2233 * We are here either because we stole the rtmutex from the 2234 * previous highest priority waiter or we are the highest priority 2235 * waiter but have failed to get the rtmutex the first time. 2236 * 2237 * We have to replace the newowner TID in the user space variable. 2238 * This must be atomic as we have to preserve the owner died bit here. 2239 * 2240 * Note: We write the user space value _before_ changing the pi_state 2241 * because we can fault here. Imagine swapped out pages or a fork 2242 * that marked all the anonymous memory readonly for cow. 2243 * 2244 * Modifying pi_state _before_ the user space value would leave the 2245 * pi_state in an inconsistent state when we fault here, because we 2246 * need to drop the locks to handle the fault. This might be observed 2247 * in the PID check in lookup_pi_state. 2248 */ 2249 retry: 2250 if (get_futex_value_locked(&uval, uaddr)) 2251 goto handle_fault; 2252 2253 for (;;) { 2254 newval = (uval & FUTEX_OWNER_DIED) | newtid; 2255 2256 if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) 2257 goto handle_fault; 2258 if (curval == uval) 2259 break; 2260 uval = curval; 2261 } 2262 2263 /* 2264 * We fixed up user space. Now we need to fix the pi_state 2265 * itself. 2266 */ 2267 if (pi_state->owner != NULL) { 2268 raw_spin_lock(&pi_state->owner->pi_lock); 2269 WARN_ON(list_empty(&pi_state->list)); 2270 list_del_init(&pi_state->list); 2271 raw_spin_unlock(&pi_state->owner->pi_lock); 2272 } 2273 2274 pi_state->owner = newowner; 2275 2276 raw_spin_lock(&newowner->pi_lock); 2277 WARN_ON(!list_empty(&pi_state->list)); 2278 list_add(&pi_state->list, &newowner->pi_state_list); 2279 raw_spin_unlock(&newowner->pi_lock); 2280 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); 2281 2282 return 0; 2283 2284 /* 2285 * To handle the page fault we need to drop the locks here. That gives 2286 * the other task (either the highest priority waiter itself or the 2287 * task which stole the rtmutex) the chance to try the fixup of the 2288 * pi_state. So once we are back from handling the fault we need to 2289 * check the pi_state after reacquiring the locks and before trying to 2290 * do another fixup. When the fixup has been done already we simply 2291 * return. 2292 * 2293 * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely 2294 * drop hb->lock since the caller owns the hb -> futex_q relation. 2295 * Dropping the pi_mutex->wait_lock requires the state revalidate. 
2296 */ 2297 handle_fault: 2298 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); 2299 spin_unlock(q->lock_ptr); 2300 2301 ret = fault_in_user_writeable(uaddr); 2302 2303 spin_lock(q->lock_ptr); 2304 raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); 2305 2306 /* 2307 * Check if someone else fixed it for us: 2308 */ 2309 if (pi_state->owner != oldowner) { 2310 ret = 0; 2311 goto out_unlock; 2312 } 2313 2314 if (ret) 2315 goto out_unlock; 2316 2317 goto retry; 2318 2319 out_unlock: 2320 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); 2321 return ret; 2322 } 2323 2324 static long futex_wait_restart(struct restart_block *restart); 2325 2326 /** 2327 * fixup_owner() - Post lock pi_state and corner case management 2328 * @uaddr: user address of the futex 2329 * @q: futex_q (contains pi_state and access to the rt_mutex) 2330 * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0) 2331 * 2332 * After attempting to lock an rt_mutex, this function is called to cleanup 2333 * the pi_state owner as well as handle race conditions that may allow us to 2334 * acquire the lock. Must be called with the hb lock held. 2335 * 2336 * Return: 2337 * 1 - success, lock taken; 2338 * 0 - success, lock not taken; 2339 * <0 - on error (-EFAULT) 2340 */ 2341 static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) 2342 { 2343 int ret = 0; 2344 2345 if (locked) { 2346 /* 2347 * Got the lock. We might not be the anticipated owner if we 2348 * did a lock-steal - fix up the PI-state in that case: 2349 * 2350 * We can safely read pi_state->owner without holding wait_lock 2351 * because we now own the rt_mutex, only the owner will attempt 2352 * to change it. 2353 */ 2354 if (q->pi_state->owner != current) 2355 ret = fixup_pi_state_owner(uaddr, q, current); 2356 goto out; 2357 } 2358 2359 /* 2360 * Paranoia check. If we did not take the lock, then we should not be 2361 * the owner of the rt_mutex. 2362 */ 2363 if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) { 2364 printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p " 2365 "pi-state %p\n", ret, 2366 q->pi_state->pi_mutex.owner, 2367 q->pi_state->owner); 2368 } 2369 2370 out: 2371 return ret ? ret : locked; 2372 } 2373 2374 /** 2375 * futex_wait_queue_me() - queue_me() and wait for wakeup, timeout, or signal 2376 * @hb: the futex hash bucket, must be locked by the caller 2377 * @q: the futex_q to queue up on 2378 * @timeout: the prepared hrtimer_sleeper, or null for no timeout 2379 */ 2380 static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, 2381 struct hrtimer_sleeper *timeout) 2382 { 2383 /* 2384 * The task state is guaranteed to be set before another task can 2385 * wake it. set_current_state() is implemented using smp_store_mb() and 2386 * queue_me() calls spin_unlock() upon completion, both serializing 2387 * access to the hash list and forcing another memory barrier. 2388 */ 2389 set_current_state(TASK_INTERRUPTIBLE); 2390 queue_me(q, hb); 2391 2392 /* Arm the timer */ 2393 if (timeout) 2394 hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS); 2395 2396 /* 2397 * If we have been removed from the hash list, then another task 2398 * has tried to wake us, and we can skip the call to schedule(). 2399 */ 2400 if (likely(!plist_node_empty(&q->list))) { 2401 /* 2402 * If the timer has already expired, current will already be 2403 * flagged for rescheduling. Only call schedule if there 2404 * is no timeout, or if it has yet to expire. 
2405 */ 2406 if (!timeout || timeout->task) 2407 freezable_schedule(); 2408 } 2409 __set_current_state(TASK_RUNNING); 2410 } 2411 2412 /** 2413 * futex_wait_setup() - Prepare to wait on a futex 2414 * @uaddr: the futex userspace address 2415 * @val: the expected value 2416 * @flags: futex flags (FLAGS_SHARED, etc.) 2417 * @q: the associated futex_q 2418 * @hb: storage for hash_bucket pointer to be returned to caller 2419 * 2420 * Setup the futex_q and locate the hash_bucket. Get the futex value and 2421 * compare it with the expected value. Handle atomic faults internally. 2422 * Return with the hb lock held and a q.key reference on success, and unlocked 2423 * with no q.key reference on failure. 2424 * 2425 * Return: 2426 * 0 - uaddr contains val and hb has been locked; 2427 * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked 2428 */ 2429 static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, 2430 struct futex_q *q, struct futex_hash_bucket **hb) 2431 { 2432 u32 uval; 2433 int ret; 2434 2435 /* 2436 * Access the page AFTER the hash-bucket is locked. 2437 * Order is important: 2438 * 2439 * Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val); 2440 * Userspace waker: if (cond(var)) { var = new; futex_wake(&var); } 2441 * 2442 * The basic logical guarantee of a futex is that it blocks ONLY 2443 * if cond(var) is known to be true at the time of blocking, for 2444 * any cond. If we locked the hash-bucket after testing *uaddr, that 2445 * would open a race condition where we could block indefinitely with 2446 * cond(var) false, which would violate the guarantee. 2447 * 2448 * On the other hand, we insert q and release the hash-bucket only 2449 * after testing *uaddr. This guarantees that futex_wait() will NOT 2450 * absorb a wakeup if *uaddr does not match the desired values 2451 * while the syscall executes. 2452 */ 2453 retry: 2454 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, VERIFY_READ); 2455 if (unlikely(ret != 0)) 2456 return ret; 2457 2458 retry_private: 2459 *hb = queue_lock(q); 2460 2461 ret = get_futex_value_locked(&uval, uaddr); 2462 2463 if (ret) { 2464 queue_unlock(*hb); 2465 2466 ret = get_user(uval, uaddr); 2467 if (ret) 2468 goto out; 2469 2470 if (!(flags & FLAGS_SHARED)) 2471 goto retry_private; 2472 2473 put_futex_key(&q->key); 2474 goto retry; 2475 } 2476 2477 if (uval != val) { 2478 queue_unlock(*hb); 2479 ret = -EWOULDBLOCK; 2480 } 2481 2482 out: 2483 if (ret) 2484 put_futex_key(&q->key); 2485 return ret; 2486 } 2487 2488 static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, 2489 ktime_t *abs_time, u32 bitset) 2490 { 2491 struct hrtimer_sleeper timeout, *to = NULL; 2492 struct restart_block *restart; 2493 struct futex_hash_bucket *hb; 2494 struct futex_q q = futex_q_init; 2495 int ret; 2496 2497 if (!bitset) 2498 return -EINVAL; 2499 q.bitset = bitset; 2500 2501 if (abs_time) { 2502 to = &timeout; 2503 2504 hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ? 2505 CLOCK_REALTIME : CLOCK_MONOTONIC, 2506 HRTIMER_MODE_ABS); 2507 hrtimer_init_sleeper(to, current); 2508 hrtimer_set_expires_range_ns(&to->timer, *abs_time, 2509 current->timer_slack_ns); 2510 } 2511 2512 retry: 2513 /* 2514 * Prepare to wait on uaddr. On success, holds hb lock and increments 2515 * q.key refs. 2516 */ 2517 ret = futex_wait_setup(uaddr, val, flags, &q, &hb); 2518 if (ret) 2519 goto out; 2520 2521 /* queue_me and wait for wakeup, timeout, or a signal. 
*/ 2522 futex_wait_queue_me(hb, &q, to); 2523 2524 /* If we were woken (and unqueued), we succeeded, whatever. */ 2525 ret = 0; 2526 /* unqueue_me() drops q.key ref */ 2527 if (!unqueue_me(&q)) 2528 goto out; 2529 ret = -ETIMEDOUT; 2530 if (to && !to->task) 2531 goto out; 2532 2533 /* 2534 * We expect signal_pending(current), but we might be the 2535 * victim of a spurious wakeup as well. 2536 */ 2537 if (!signal_pending(current)) 2538 goto retry; 2539 2540 ret = -ERESTARTSYS; 2541 if (!abs_time) 2542 goto out; 2543 2544 restart = &current->restart_block; 2545 restart->fn = futex_wait_restart; 2546 restart->futex.uaddr = uaddr; 2547 restart->futex.val = val; 2548 restart->futex.time = *abs_time; 2549 restart->futex.bitset = bitset; 2550 restart->futex.flags = flags | FLAGS_HAS_TIMEOUT; 2551 2552 ret = -ERESTART_RESTARTBLOCK; 2553 2554 out: 2555 if (to) { 2556 hrtimer_cancel(&to->timer); 2557 destroy_hrtimer_on_stack(&to->timer); 2558 } 2559 return ret; 2560 } 2561 2562 2563 static long futex_wait_restart(struct restart_block *restart) 2564 { 2565 u32 __user *uaddr = restart->futex.uaddr; 2566 ktime_t t, *tp = NULL; 2567 2568 if (restart->futex.flags & FLAGS_HAS_TIMEOUT) { 2569 t = restart->futex.time; 2570 tp = &t; 2571 } 2572 restart->fn = do_no_restart_syscall; 2573 2574 return (long)futex_wait(uaddr, restart->futex.flags, 2575 restart->futex.val, tp, restart->futex.bitset); 2576 } 2577 2578 2579 /* 2580 * Userspace tried a 0 -> TID atomic transition of the futex value 2581 * and failed. The kernel side here does the whole locking operation: 2582 * if there are waiters then it will block as a consequence of relying 2583 * on rt-mutexes, it does PI, etc. (Due to races the kernel might see 2584 * a 0 value of the futex too.). 2585 * 2586 * Also serves as futex trylock_pi()'ing, and due semantics. 2587 */ 2588 static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, 2589 ktime_t *time, int trylock) 2590 { 2591 struct hrtimer_sleeper timeout, *to = NULL; 2592 struct futex_pi_state *pi_state = NULL; 2593 struct rt_mutex_waiter rt_waiter; 2594 struct futex_hash_bucket *hb; 2595 struct futex_q q = futex_q_init; 2596 int res, ret; 2597 2598 if (refill_pi_state_cache()) 2599 return -ENOMEM; 2600 2601 if (time) { 2602 to = &timeout; 2603 hrtimer_init_on_stack(&to->timer, CLOCK_REALTIME, 2604 HRTIMER_MODE_ABS); 2605 hrtimer_init_sleeper(to, current); 2606 hrtimer_set_expires(&to->timer, *time); 2607 } 2608 2609 retry: 2610 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, VERIFY_WRITE); 2611 if (unlikely(ret != 0)) 2612 goto out; 2613 2614 retry_private: 2615 hb = queue_lock(&q); 2616 2617 ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0); 2618 if (unlikely(ret)) { 2619 /* 2620 * Atomic work succeeded and we got the lock, 2621 * or failed. Either way, we do _not_ block. 2622 */ 2623 switch (ret) { 2624 case 1: 2625 /* We got the lock. */ 2626 ret = 0; 2627 goto out_unlock_put_key; 2628 case -EFAULT: 2629 goto uaddr_faulted; 2630 case -EAGAIN: 2631 /* 2632 * Two reasons for this: 2633 * - Task is exiting and we just wait for the 2634 * exit to complete. 2635 * - The user space value changed.
2636 */ 2637 queue_unlock(hb); 2638 put_futex_key(&q.key); 2639 cond_resched(); 2640 goto retry; 2641 default: 2642 goto out_unlock_put_key; 2643 } 2644 } 2645 2646 WARN_ON(!q.pi_state); 2647 2648 /* 2649 * Only actually queue now that the atomic ops are done: 2650 */ 2651 __queue_me(&q, hb); 2652 2653 if (trylock) { 2654 ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex); 2655 /* Fixup the trylock return value: */ 2656 ret = ret ? 0 : -EWOULDBLOCK; 2657 goto no_block; 2658 } 2659 2660 rt_mutex_init_waiter(&rt_waiter); 2661 2662 /* 2663 * On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not 2664 * hold it while doing rt_mutex_start_proxy(), because then it will 2665 * include hb->lock in the blocking chain, even though we'll not in 2666 * fact hold it while blocking. This will lead it to report -EDEADLK 2667 * and BUG when futex_unlock_pi() interleaves with this. 2668 * 2669 * Therefore acquire wait_lock while holding hb->lock, but drop the 2670 * latter before calling rt_mutex_start_proxy_lock(). This still fully 2671 * serializes against futex_unlock_pi() as that does the exact same 2672 * lock handoff sequence. 2673 */ 2674 raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock); 2675 spin_unlock(q.lock_ptr); 2676 ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current); 2677 raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock); 2678 2679 if (ret) { 2680 if (ret == 1) 2681 ret = 0; 2682 2683 spin_lock(q.lock_ptr); 2684 goto no_block; 2685 } 2686 2687 2688 if (unlikely(to)) 2689 hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS); 2690 2691 ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter); 2692 2693 spin_lock(q.lock_ptr); 2694 /* 2695 * If we failed to acquire the lock (signal/timeout), we must 2696 * first acquire the hb->lock before removing the lock from the 2697 * rt_mutex waitqueue, such that we can keep the hb and rt_mutex 2698 * wait lists consistent. 2699 * 2700 * In particular; it is important that futex_unlock_pi() can not 2701 * observe this inconsistency. 2702 */ 2703 if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter)) 2704 ret = 0; 2705 2706 no_block: 2707 /* 2708 * Fixup the pi_state owner and possibly acquire the lock if we 2709 * haven't already. 2710 */ 2711 res = fixup_owner(uaddr, &q, !ret); 2712 /* 2713 * If fixup_owner() returned an error, propagate that. If it acquired 2714 * the lock, clear our -ETIMEDOUT or -EINTR. 2715 */ 2716 if (res) 2717 ret = (res < 0) ? res : 0; 2718 2719 /* 2720 * If fixup_owner() faulted and was unable to handle the fault, unlock 2721 * it and return the fault to userspace. 2722 */ 2723 if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current)) { 2724 pi_state = q.pi_state; 2725 get_pi_state(pi_state); 2726 } 2727 2728 /* Unqueue and drop the lock */ 2729 unqueue_me_pi(&q); 2730 2731 if (pi_state) { 2732 rt_mutex_futex_unlock(&pi_state->pi_mutex); 2733 put_pi_state(pi_state); 2734 } 2735 2736 goto out_put_key; 2737 2738 out_unlock_put_key: 2739 queue_unlock(hb); 2740 2741 out_put_key: 2742 put_futex_key(&q.key); 2743 out: 2744 if (to) { 2745 hrtimer_cancel(&to->timer); 2746 destroy_hrtimer_on_stack(&to->timer); 2747 } 2748 return ret != -EINTR ?
ret : -ERESTARTNOINTR; 2749 2750 uaddr_faulted: 2751 queue_unlock(hb); 2752 2753 ret = fault_in_user_writeable(uaddr); 2754 if (ret) 2755 goto out_put_key; 2756 2757 if (!(flags & FLAGS_SHARED)) 2758 goto retry_private; 2759 2760 put_futex_key(&q.key); 2761 goto retry; 2762 } 2763 2764 /* 2765 * Userspace attempted a TID -> 0 atomic transition, and failed. 2766 * This is the in-kernel slowpath: we look up the PI state (if any), 2767 * and do the rt-mutex unlock. 2768 */ 2769 static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) 2770 { 2771 u32 uninitialized_var(curval), uval, vpid = task_pid_vnr(current); 2772 union futex_key key = FUTEX_KEY_INIT; 2773 struct futex_hash_bucket *hb; 2774 struct futex_q *top_waiter; 2775 int ret; 2776 2777 retry: 2778 if (get_user(uval, uaddr)) 2779 return -EFAULT; 2780 /* 2781 * We release only a lock we actually own: 2782 */ 2783 if ((uval & FUTEX_TID_MASK) != vpid) 2784 return -EPERM; 2785 2786 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_WRITE); 2787 if (ret) 2788 return ret; 2789 2790 hb = hash_futex(&key); 2791 spin_lock(&hb->lock); 2792 2793 /* 2794 * Check waiters first. We do not trust user space values at 2795 * all and we at least want to know if user space fiddled 2796 * with the futex value instead of blindly unlocking. 2797 */ 2798 top_waiter = futex_top_waiter(hb, &key); 2799 if (top_waiter) { 2800 struct futex_pi_state *pi_state = top_waiter->pi_state; 2801 2802 ret = -EINVAL; 2803 if (!pi_state) 2804 goto out_unlock; 2805 2806 /* 2807 * If current does not own the pi_state then the futex is 2808 * inconsistent and user space fiddled with the futex value. 2809 */ 2810 if (pi_state->owner != current) 2811 goto out_unlock; 2812 2813 get_pi_state(pi_state); 2814 /* 2815 * By taking wait_lock while still holding hb->lock, we ensure 2816 * there is no point where we hold neither; and therefore 2817 * wake_futex_pi() must observe a state consistent with what we 2818 * observed. 2819 */ 2820 raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); 2821 spin_unlock(&hb->lock); 2822 2823 ret = wake_futex_pi(uaddr, uval, pi_state); 2824 2825 put_pi_state(pi_state); 2826 2827 /* 2828 * Success, we're done! No tricky corner cases. 2829 */ 2830 if (!ret) 2831 goto out_putkey; 2832 /* 2833 * The atomic access to the futex value generated a 2834 * pagefault, so retry the user-access and the wakeup: 2835 */ 2836 if (ret == -EFAULT) 2837 goto pi_faulted; 2838 /* 2839 * A unconditional UNLOCK_PI op raced against a waiter 2840 * setting the FUTEX_WAITERS bit. Try again. 2841 */ 2842 if (ret == -EAGAIN) { 2843 put_futex_key(&key); 2844 goto retry; 2845 } 2846 /* 2847 * wake_futex_pi has detected invalid state. Tell user 2848 * space. 2849 */ 2850 goto out_putkey; 2851 } 2852 2853 /* 2854 * We have no kernel internal state, i.e. no waiters in the 2855 * kernel. Waiters which are about to queue themselves are stuck 2856 * on hb->lock. So we can safely ignore them. We do neither 2857 * preserve the WAITERS bit not the OWNER_DIED one. We are the 2858 * owner. 2859 */ 2860 if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0)) { 2861 spin_unlock(&hb->lock); 2862 goto pi_faulted; 2863 } 2864 2865 /* 2866 * If uval has changed, let user space handle it. 2867 */ 2868 ret = (curval == uval) ? 
0 : -EAGAIN; 2869 2870 out_unlock: 2871 spin_unlock(&hb->lock); 2872 out_putkey: 2873 put_futex_key(&key); 2874 return ret; 2875 2876 pi_faulted: 2877 put_futex_key(&key); 2878 2879 ret = fault_in_user_writeable(uaddr); 2880 if (!ret) 2881 goto retry; 2882 2883 return ret; 2884 } 2885 2886 /** 2887 * handle_early_requeue_pi_wakeup() - Detect early wakeup on the initial futex 2888 * @hb: the hash_bucket futex_q was original enqueued on 2889 * @q: the futex_q woken while waiting to be requeued 2890 * @key2: the futex_key of the requeue target futex 2891 * @timeout: the timeout associated with the wait (NULL if none) 2892 * 2893 * Detect if the task was woken on the initial futex as opposed to the requeue 2894 * target futex. If so, determine if it was a timeout or a signal that caused 2895 * the wakeup and return the appropriate error code to the caller. Must be 2896 * called with the hb lock held. 2897 * 2898 * Return: 2899 * 0 = no early wakeup detected; 2900 * <0 = -ETIMEDOUT or -ERESTARTNOINTR 2901 */ 2902 static inline 2903 int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, 2904 struct futex_q *q, union futex_key *key2, 2905 struct hrtimer_sleeper *timeout) 2906 { 2907 int ret = 0; 2908 2909 /* 2910 * With the hb lock held, we avoid races while we process the wakeup. 2911 * We only need to hold hb (and not hb2) to ensure atomicity as the 2912 * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb. 2913 * It can't be requeued from uaddr2 to something else since we don't 2914 * support a PI aware source futex for requeue. 2915 */ 2916 if (!match_futex(&q->key, key2)) { 2917 WARN_ON(q->lock_ptr && (&hb->lock != q->lock_ptr)); 2918 /* 2919 * We were woken prior to requeue by a timeout or a signal. 2920 * Unqueue the futex_q and determine which it was. 2921 */ 2922 plist_del(&q->list, &hb->chain); 2923 hb_waiters_dec(hb); 2924 2925 /* Handle spurious wakeups gracefully */ 2926 ret = -EWOULDBLOCK; 2927 if (timeout && !timeout->task) 2928 ret = -ETIMEDOUT; 2929 else if (signal_pending(current)) 2930 ret = -ERESTARTNOINTR; 2931 } 2932 return ret; 2933 } 2934 2935 /** 2936 * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2 2937 * @uaddr: the futex we initially wait on (non-pi) 2938 * @flags: futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be 2939 * the same type, no requeueing from private to shared, etc. 2940 * @val: the expected value of uaddr 2941 * @abs_time: absolute timeout 2942 * @bitset: 32 bit wakeup bitset set by userspace, defaults to all 2943 * @uaddr2: the pi futex we will take prior to returning to user-space 2944 * 2945 * The caller will wait on uaddr and will be requeued by futex_requeue() to 2946 * uaddr2 which must be PI aware and unique from uaddr. Normal wakeup will wake 2947 * on uaddr2 and complete the acquisition of the rt_mutex prior to returning to 2948 * userspace. This ensures the rt_mutex maintains an owner when it has waiters; 2949 * without one, the pi logic would not know which task to boost/deboost, if 2950 * there was a need to. 2951 * 2952 * We call schedule in futex_wait_queue_me() when we enqueue and return there 2953 * via the following-- 2954 * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue() 2955 * 2) wakeup on uaddr2 after a requeue 2956 * 3) signal 2957 * 4) timeout 2958 * 2959 * If 3, cleanup and return -ERESTARTNOINTR. 
2960 * 2961 * If 2, we may then block on trying to take the rt_mutex and return via: 2962 * 5) successful lock 2963 * 6) signal 2964 * 7) timeout 2965 * 8) other lock acquisition failure 2966 * 2967 * If 6, return -EWOULDBLOCK (restarting the syscall would do the same). 2968 * 2969 * If 4 or 7, we cleanup and return with -ETIMEDOUT. 2970 * 2971 * Return: 2972 * 0 - On success; 2973 * <0 - On error 2974 */ 2975 static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, 2976 u32 val, ktime_t *abs_time, u32 bitset, 2977 u32 __user *uaddr2) 2978 { 2979 struct hrtimer_sleeper timeout, *to = NULL; 2980 struct futex_pi_state *pi_state = NULL; 2981 struct rt_mutex_waiter rt_waiter; 2982 struct futex_hash_bucket *hb; 2983 union futex_key key2 = FUTEX_KEY_INIT; 2984 struct futex_q q = futex_q_init; 2985 int res, ret; 2986 2987 if (uaddr == uaddr2) 2988 return -EINVAL; 2989 2990 if (!bitset) 2991 return -EINVAL; 2992 2993 if (abs_time) { 2994 to = &timeout; 2995 hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ? 2996 CLOCK_REALTIME : CLOCK_MONOTONIC, 2997 HRTIMER_MODE_ABS); 2998 hrtimer_init_sleeper(to, current); 2999 hrtimer_set_expires_range_ns(&to->timer, *abs_time, 3000 current->timer_slack_ns); 3001 } 3002 3003 /* 3004 * The waiter is allocated on our stack, manipulated by the requeue 3005 * code while we sleep on uaddr. 3006 */ 3007 rt_mutex_init_waiter(&rt_waiter); 3008 3009 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE); 3010 if (unlikely(ret != 0)) 3011 goto out; 3012 3013 q.bitset = bitset; 3014 q.rt_waiter = &rt_waiter; 3015 q.requeue_pi_key = &key2; 3016 3017 /* 3018 * Prepare to wait on uaddr. On success, increments q.key (key1) ref 3019 * count. 3020 */ 3021 ret = futex_wait_setup(uaddr, val, flags, &q, &hb); 3022 if (ret) 3023 goto out_key2; 3024 3025 /* 3026 * The check above which compares uaddrs is not sufficient for 3027 * shared futexes. We need to compare the keys: 3028 */ 3029 if (match_futex(&q.key, &key2)) { 3030 queue_unlock(hb); 3031 ret = -EINVAL; 3032 goto out_put_keys; 3033 } 3034 3035 /* Queue the futex_q, drop the hb lock, wait for wakeup. */ 3036 futex_wait_queue_me(hb, &q, to); 3037 3038 spin_lock(&hb->lock); 3039 ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to); 3040 spin_unlock(&hb->lock); 3041 if (ret) 3042 goto out_put_keys; 3043 3044 /* 3045 * In order for us to be here, we know our q.key == key2, and since 3046 * we took the hb->lock above, we also know that futex_requeue() has 3047 * completed and we no longer have to concern ourselves with a wakeup 3048 * race with the atomic proxy lock acquisition by the requeue code. The 3049 * futex_requeue dropped our key1 reference and incremented our key2 3050 * reference count. 3051 */ 3052 3053 /* Check if the requeue code acquired the second futex for us. */ 3054 if (!q.rt_waiter) { 3055 /* 3056 * Got the lock. We might not be the anticipated owner if we 3057 * did a lock-steal - fix up the PI-state in that case. 3058 */ 3059 if (q.pi_state && (q.pi_state->owner != current)) { 3060 spin_lock(q.lock_ptr); 3061 ret = fixup_pi_state_owner(uaddr2, &q, current); 3062 if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) { 3063 pi_state = q.pi_state; 3064 get_pi_state(pi_state); 3065 } 3066 /* 3067 * Drop the reference to the pi state which 3068 * the requeue_pi() code acquired for us. 
3069 */ 3070 put_pi_state(q.pi_state); 3071 spin_unlock(q.lock_ptr); 3072 } 3073 } else { 3074 struct rt_mutex *pi_mutex; 3075 3076 /* 3077 * We have been woken up by futex_unlock_pi(), a timeout, or a 3078 * signal. futex_unlock_pi() will not destroy the lock_ptr nor 3079 * the pi_state. 3080 */ 3081 WARN_ON(!q.pi_state); 3082 pi_mutex = &q.pi_state->pi_mutex; 3083 ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter); 3084 3085 spin_lock(q.lock_ptr); 3086 if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter)) 3087 ret = 0; 3088 3089 debug_rt_mutex_free_waiter(&rt_waiter); 3090 /* 3091 * Fixup the pi_state owner and possibly acquire the lock if we 3092 * haven't already. 3093 */ 3094 res = fixup_owner(uaddr2, &q, !ret); 3095 /* 3096 * If fixup_owner() returned an error, proprogate that. If it 3097 * acquired the lock, clear -ETIMEDOUT or -EINTR. 3098 */ 3099 if (res) 3100 ret = (res < 0) ? res : 0; 3101 3102 /* 3103 * If fixup_pi_state_owner() faulted and was unable to handle 3104 * the fault, unlock the rt_mutex and return the fault to 3105 * userspace. 3106 */ 3107 if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) { 3108 pi_state = q.pi_state; 3109 get_pi_state(pi_state); 3110 } 3111 3112 /* Unqueue and drop the lock. */ 3113 unqueue_me_pi(&q); 3114 } 3115 3116 if (pi_state) { 3117 rt_mutex_futex_unlock(&pi_state->pi_mutex); 3118 put_pi_state(pi_state); 3119 } 3120 3121 if (ret == -EINTR) { 3122 /* 3123 * We've already been requeued, but cannot restart by calling 3124 * futex_lock_pi() directly. We could restart this syscall, but 3125 * it would detect that the user space "val" changed and return 3126 * -EWOULDBLOCK. Save the overhead of the restart and return 3127 * -EWOULDBLOCK directly. 3128 */ 3129 ret = -EWOULDBLOCK; 3130 } 3131 3132 out_put_keys: 3133 put_futex_key(&q.key); 3134 out_key2: 3135 put_futex_key(&key2); 3136 3137 out: 3138 if (to) { 3139 hrtimer_cancel(&to->timer); 3140 destroy_hrtimer_on_stack(&to->timer); 3141 } 3142 return ret; 3143 } 3144 3145 /* 3146 * Support for robust futexes: the kernel cleans up held futexes at 3147 * thread exit time. 3148 * 3149 * Implementation: user-space maintains a per-thread list of locks it 3150 * is holding. Upon do_exit(), the kernel carefully walks this list, 3151 * and marks all locks that are owned by this thread with the 3152 * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is 3153 * always manipulated with the lock held, so the list is private and 3154 * per-thread. Userspace also maintains a per-thread 'list_op_pending' 3155 * field, to allow the kernel to clean up if the thread dies after 3156 * acquiring the lock, but just before it could have added itself to 3157 * the list. There can only be one such pending lock. 
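 *
 * Purely as an editor's illustration (not part of this file): a minimal
 * userspace sketch of the registration side, using the uapi definitions
 * from <linux/futex.h>. The names my_robust_mutex and register_robust_list
 * are hypothetical; glibc does the equivalent internally for robust
 * pthread mutexes.
 *
 *	#include <linux/futex.h>
 *	#include <stddef.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	struct my_robust_mutex {
 *		struct robust_list list;	// linked into the thread's list while held
 *		unsigned int futex;		// owner TID, or 0 when free
 *	};
 *
 *	static __thread struct robust_list_head head;
 *
 *	static void register_robust_list(void)
 *	{
 *		head.list.next = &head.list;	// empty list points back at its head
 *		head.futex_offset = offsetof(struct my_robust_mutex, futex) -
 *				    offsetof(struct my_robust_mutex, list);
 *		head.list_op_pending = NULL;
 *		syscall(SYS_set_robust_list, &head, sizeof(head));
 *	}
 *
 * Before acquiring a lock the thread stores &lock->list in
 * head.list_op_pending, links the lock into head.list once it owns it and
 * clears list_op_pending again, so that exit_robust_list() below can find
 * every lock held (or about to be taken) at the time of an unexpected exit.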
3158 */ 3159 3160 /** 3161 * sys_set_robust_list() - Set the robust-futex list head of a task 3162 * @head: pointer to the list-head 3163 * @len: length of the list-head, as userspace expects 3164 */ 3165 SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head, 3166 size_t, len) 3167 { 3168 if (!futex_cmpxchg_enabled) 3169 return -ENOSYS; 3170 /* 3171 * The kernel knows only one size for now: 3172 */ 3173 if (unlikely(len != sizeof(*head))) 3174 return -EINVAL; 3175 3176 current->robust_list = head; 3177 3178 return 0; 3179 } 3180 3181 /** 3182 * sys_get_robust_list() - Get the robust-futex list head of a task 3183 * @pid: pid of the process [zero for current task] 3184 * @head_ptr: pointer to a list-head pointer, the kernel fills it in 3185 * @len_ptr: pointer to a length field, the kernel fills in the header size 3186 */ 3187 SYSCALL_DEFINE3(get_robust_list, int, pid, 3188 struct robust_list_head __user * __user *, head_ptr, 3189 size_t __user *, len_ptr) 3190 { 3191 struct robust_list_head __user *head; 3192 unsigned long ret; 3193 struct task_struct *p; 3194 3195 if (!futex_cmpxchg_enabled) 3196 return -ENOSYS; 3197 3198 rcu_read_lock(); 3199 3200 ret = -ESRCH; 3201 if (!pid) 3202 p = current; 3203 else { 3204 p = find_task_by_vpid(pid); 3205 if (!p) 3206 goto err_unlock; 3207 } 3208 3209 ret = -EPERM; 3210 if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS)) 3211 goto err_unlock; 3212 3213 head = p->robust_list; 3214 rcu_read_unlock(); 3215 3216 if (put_user(sizeof(*head), len_ptr)) 3217 return -EFAULT; 3218 return put_user(head, head_ptr); 3219 3220 err_unlock: 3221 rcu_read_unlock(); 3222 3223 return ret; 3224 } 3225 3226 /* 3227 * Process a futex-list entry, check whether it's owned by the 3228 * dying task, and do notification if so: 3229 */ 3230 int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi) 3231 { 3232 u32 uval, uninitialized_var(nval), mval; 3233 3234 retry: 3235 if (get_user(uval, uaddr)) 3236 return -1; 3237 3238 if ((uval & FUTEX_TID_MASK) == task_pid_vnr(curr)) { 3239 /* 3240 * Ok, this dying thread is truly holding a futex 3241 * of interest. Set the OWNER_DIED bit atomically 3242 * via cmpxchg, and if the value had FUTEX_WAITERS 3243 * set, wake up a waiter (if any). (We have to do a 3244 * futex_wake() even if OWNER_DIED is already set - 3245 * to handle the rare but possible case of recursive 3246 * thread-death.) The rest of the cleanup is done in 3247 * userspace. 3248 */ 3249 mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; 3250 /* 3251 * We are not holding a lock here, but we want to have 3252 * the pagefault_disable/enable() protection because 3253 * we want to handle the fault gracefully. If the 3254 * access fails we try to fault in the futex with R/W 3255 * verification via get_user_pages. get_user() above 3256 * does not guarantee R/W access. If that fails we 3257 * give up and leave the futex locked. 3258 */ 3259 if (cmpxchg_futex_value_locked(&nval, uaddr, uval, mval)) { 3260 if (fault_in_user_writeable(uaddr)) 3261 return -1; 3262 goto retry; 3263 } 3264 if (nval != uval) 3265 goto retry; 3266 3267 /* 3268 * Wake robust non-PI futexes here. The wakeup of 3269 * PI futexes happens in exit_pi_state(): 3270 */ 3271 if (!pi && (uval & FUTEX_WAITERS)) 3272 futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY); 3273 } 3274 return 0; 3275 } 3276 3277 /* 3278 * Fetch a robust-list pointer. 
Bit 0 signals PI futexes: 3279 */ 3280 static inline int fetch_robust_entry(struct robust_list __user **entry, 3281 struct robust_list __user * __user *head, 3282 unsigned int *pi) 3283 { 3284 unsigned long uentry; 3285 3286 if (get_user(uentry, (unsigned long __user *)head)) 3287 return -EFAULT; 3288 3289 *entry = (void __user *)(uentry & ~1UL); 3290 *pi = uentry & 1; 3291 3292 return 0; 3293 } 3294 3295 /* 3296 * Walk curr->robust_list (very carefully, it's a userspace list!) 3297 * and mark any locks found there dead, and notify any waiters. 3298 * 3299 * We silently return on any sign of list-walking problem. 3300 */ 3301 void exit_robust_list(struct task_struct *curr) 3302 { 3303 struct robust_list_head __user *head = curr->robust_list; 3304 struct robust_list __user *entry, *next_entry, *pending; 3305 unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; 3306 unsigned int uninitialized_var(next_pi); 3307 unsigned long futex_offset; 3308 int rc; 3309 3310 if (!futex_cmpxchg_enabled) 3311 return; 3312 3313 /* 3314 * Fetch the list head (which was registered earlier, via 3315 * sys_set_robust_list()): 3316 */ 3317 if (fetch_robust_entry(&entry, &head->list.next, &pi)) 3318 return; 3319 /* 3320 * Fetch the relative futex offset: 3321 */ 3322 if (get_user(futex_offset, &head->futex_offset)) 3323 return; 3324 /* 3325 * Fetch any possibly pending lock-add first, and handle it 3326 * if it exists: 3327 */ 3328 if (fetch_robust_entry(&pending, &head->list_op_pending, &pip)) 3329 return; 3330 3331 next_entry = NULL; /* avoid warning with gcc */ 3332 while (entry != &head->list) { 3333 /* 3334 * Fetch the next entry in the list before calling 3335 * handle_futex_death: 3336 */ 3337 rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi); 3338 /* 3339 * A pending lock might already be on the list, so 3340 * don't process it twice: 3341 */ 3342 if (entry != pending) 3343 if (handle_futex_death((void __user *)entry + futex_offset, 3344 curr, pi)) 3345 return; 3346 if (rc) 3347 return; 3348 entry = next_entry; 3349 pi = next_pi; 3350 /* 3351 * Avoid excessively long or circular lists: 3352 */ 3353 if (!--limit) 3354 break; 3355 3356 cond_resched(); 3357 } 3358 3359 if (pending) 3360 handle_futex_death((void __user *)pending + futex_offset, 3361 curr, pip); 3362 } 3363 3364 long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, 3365 u32 __user *uaddr2, u32 val2, u32 val3) 3366 { 3367 int cmd = op & FUTEX_CMD_MASK; 3368 unsigned int flags = 0; 3369 3370 if (!(op & FUTEX_PRIVATE_FLAG)) 3371 flags |= FLAGS_SHARED; 3372 3373 if (op & FUTEX_CLOCK_REALTIME) { 3374 flags |= FLAGS_CLOCKRT; 3375 if (cmd != FUTEX_WAIT && cmd != FUTEX_WAIT_BITSET && \ 3376 cmd != FUTEX_WAIT_REQUEUE_PI) 3377 return -ENOSYS; 3378 } 3379 3380 switch (cmd) { 3381 case FUTEX_LOCK_PI: 3382 case FUTEX_UNLOCK_PI: 3383 case FUTEX_TRYLOCK_PI: 3384 case FUTEX_WAIT_REQUEUE_PI: 3385 case FUTEX_CMP_REQUEUE_PI: 3386 if (!futex_cmpxchg_enabled) 3387 return -ENOSYS; 3388 } 3389 3390 switch (cmd) { 3391 case FUTEX_WAIT: 3392 val3 = FUTEX_BITSET_MATCH_ANY; 3393 case FUTEX_WAIT_BITSET: 3394 return futex_wait(uaddr, flags, val, timeout, val3); 3395 case FUTEX_WAKE: 3396 val3 = FUTEX_BITSET_MATCH_ANY; 3397 case FUTEX_WAKE_BITSET: 3398 return futex_wake(uaddr, flags, val, val3); 3399 case FUTEX_REQUEUE: 3400 return futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0); 3401 case FUTEX_CMP_REQUEUE: 3402 return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0); 3403 case FUTEX_WAKE_OP: 3404 return futex_wake_op(uaddr, flags, 
uaddr2, val, val2, val3); 3405 case FUTEX_LOCK_PI: 3406 return futex_lock_pi(uaddr, flags, timeout, 0); 3407 case FUTEX_UNLOCK_PI: 3408 return futex_unlock_pi(uaddr, flags); 3409 case FUTEX_TRYLOCK_PI: 3410 return futex_lock_pi(uaddr, flags, NULL, 1); 3411 case FUTEX_WAIT_REQUEUE_PI: 3412 val3 = FUTEX_BITSET_MATCH_ANY; 3413 return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3, 3414 uaddr2); 3415 case FUTEX_CMP_REQUEUE_PI: 3416 return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1); 3417 } 3418 return -ENOSYS; 3419 } 3420 3421 3422 SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, 3423 struct timespec __user *, utime, u32 __user *, uaddr2, 3424 u32, val3) 3425 { 3426 struct timespec ts; 3427 ktime_t t, *tp = NULL; 3428 u32 val2 = 0; 3429 int cmd = op & FUTEX_CMD_MASK; 3430 3431 if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || 3432 cmd == FUTEX_WAIT_BITSET || 3433 cmd == FUTEX_WAIT_REQUEUE_PI)) { 3434 if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG)))) 3435 return -EFAULT; 3436 if (copy_from_user(&ts, utime, sizeof(ts)) != 0) 3437 return -EFAULT; 3438 if (!timespec_valid(&ts)) 3439 return -EINVAL; 3440 3441 t = timespec_to_ktime(ts); 3442 if (cmd == FUTEX_WAIT) 3443 t = ktime_add_safe(ktime_get(), t); 3444 tp = &t; 3445 } 3446 /* 3447 * requeue parameter in 'utime' if cmd == FUTEX_*_REQUEUE_*. 3448 * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP. 3449 */ 3450 if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE || 3451 cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP) 3452 val2 = (u32) (unsigned long) utime; 3453 3454 return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); 3455 } 3456 3457 static void __init futex_detect_cmpxchg(void) 3458 { 3459 #ifndef CONFIG_HAVE_FUTEX_CMPXCHG 3460 u32 curval; 3461 3462 /* 3463 * This will fail and we want it. Some arch implementations do 3464 * runtime detection of the futex_atomic_cmpxchg_inatomic() 3465 * functionality. We want to know that before we call in any 3466 * of the complex code paths. Also we want to prevent 3467 * registration of robust lists in that case. NULL is 3468 * guaranteed to fault and we get -EFAULT on functional 3469 * implementation, the non-functional ones will return 3470 * -ENOSYS. 3471 */ 3472 if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT) 3473 futex_cmpxchg_enabled = 1; 3474 #endif 3475 } 3476 3477 static int __init futex_init(void) 3478 { 3479 unsigned int futex_shift; 3480 unsigned long i; 3481 3482 #if CONFIG_BASE_SMALL 3483 futex_hashsize = 16; 3484 #else 3485 futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus()); 3486 #endif 3487 3488 futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues), 3489 futex_hashsize, 0, 3490 futex_hashsize < 256 ? HASH_SMALL : 0, 3491 &futex_shift, NULL, 3492 futex_hashsize, futex_hashsize); 3493 futex_hashsize = 1UL << futex_shift; 3494 3495 futex_detect_cmpxchg(); 3496 3497 for (i = 0; i < futex_hashsize; i++) { 3498 atomic_set(&futex_queues[i].waiters, 0); 3499 plist_head_init(&futex_queues[i].chain); 3500 spin_lock_init(&futex_queues[i].lock); 3501 } 3502 3503 return 0; 3504 } 3505 core_initcall(futex_init); 3506
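/*
 * Editor's illustration (not part of the kernel sources): a minimal userspace
 * sketch of the bare FUTEX_WAIT/FUTEX_WAKE protocol that the futex_wait() and
 * futex_wake() paths in this file implement, driven through the raw
 * syscall(2) interface. The names futex_word, lock() and unlock() are
 * hypothetical; real applications would normally use pthreads instead.
 *
 *	#include <linux/futex.h>
 *	#include <stdatomic.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	static atomic_uint futex_word;		// 0 == unlocked, 1 == locked
 *
 *	static void lock(void)
 *	{
 *		unsigned int expected = 0;
 *
 *		while (!atomic_compare_exchange_strong(&futex_word, &expected, 1)) {
 *			// Sleep only while the word still reads 1; the kernel
 *			// re-checks the value under the hash bucket lock, see
 *			// futex_wait_setup() above.
 *			syscall(SYS_futex, &futex_word, FUTEX_WAIT_PRIVATE, 1,
 *				NULL, NULL, 0);
 *			expected = 0;
 *		}
 *	}
 *
 *	static void unlock(void)
 *	{
 *		atomic_store(&futex_word, 0);
 *		syscall(SYS_futex, &futex_word, FUTEX_WAKE_PRIVATE, 1,
 *			NULL, NULL, 0);
 *	}
 *
 * This naive variant issues the FUTEX_WAKE syscall on every unlock even when
 * nobody is blocked; the hb_waiters accounting documented at queue_lock()
 * above is what allows the kernel side of such a wake to detect that there
 * are no waiters and return cheaply.
 */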