kernel/futex/core.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  *  Fast Userspace Mutexes (which I call "Futexes!").
   4  *  (C) Rusty Russell, IBM 2002
   5  *
   6  *  Generalized futexes, futex requeueing, misc fixes by Ingo Molnar
   7  *  (C) Copyright 2003 Red Hat Inc, All Rights Reserved
   8  *
   9  *  Removed page pinning, fix privately mapped COW pages and other cleanups
  10  *  (C) Copyright 2003, 2004 Jamie Lokier
  11  *
  12  *  Robust futex support started by Ingo Molnar
  13  *  (C) Copyright 2006 Red Hat Inc, All Rights Reserved
  14  *  Thanks to Thomas Gleixner for suggestions, analysis and fixes.
  15  *
  16  *  PI-futex support started by Ingo Molnar and Thomas Gleixner
  17  *  Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
  18  *  Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
  19  *
  20  *  PRIVATE futexes by Eric Dumazet
  21  *  Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
  22  *
  23  *  Requeue-PI support by Darren Hart <dvhltc@us.ibm.com>
  24  *  Copyright (C) IBM Corporation, 2009
  25  *  Thanks to Thomas Gleixner for conceptual design and careful reviews.
  26  *
  27  *  Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
  28  *  enough at me, Linus for the original (flawed) idea, Matthew
  29  *  Kirkwood for proof-of-concept implementation.
  30  *
  31  *  "The futexes are also cursed."
  32  *  "But they come in a choice of three flavours!"
  33  */
  34 #include <linux/compat.h>
  35 #include <linux/jhash.h>
  36 #include <linux/pagemap.h>
  37 #include <linux/memblock.h>
  38 #include <linux/fault-inject.h>
  39 #include <linux/slab.h>
  40
  41 #include "futex.h"
  42 #include "../locking/rtmutex_common.h"
  43
  44 #ifndef CONFIG_HAVE_FUTEX_CMPXCHG
  45 int  __read_mostly futex_cmpxchg_enabled;
  46 #endif
  47
  48
  49 /*
  50  * The base of the bucket array and its size are always used together
  51  * (after initialization only in futex_hash()), so ensure that they
  52  * reside in the same cacheline.
  53  */
  54 static struct {
  55         struct futex_hash_bucket *queues;
  56         unsigned long            hashsize;
  57 } __futex_data __read_mostly __aligned(2*sizeof(long));
  58 #define futex_queues   (__futex_data.queues)
  59 #define futex_hashsize (__futex_data.hashsize)
  60
  61
  62 /*
  63  * Fault injections for futexes.
  64  */
  65 #ifdef CONFIG_FAIL_FUTEX
  66
  67 static struct {
  68         struct fault_attr attr;
  69
  70         bool ignore_private;
  71 } fail_futex = {
  72         .attr = FAULT_ATTR_INITIALIZER,
  73         .ignore_private = false,
  74 };
  75
  76 static int __init setup_fail_futex(char *str)
  77 {
  78         return setup_fault_attr(&fail_futex.attr, str);
  79 }
  80 __setup("fail_futex=", setup_fail_futex);
  81
  82 bool should_fail_futex(bool fshared)
  83 {
  84         if (fail_futex.ignore_private && !fshared)
  85                 return false;
  86
  87         return should_fail(&fail_futex.attr, 1);
  88 }
  89
  90 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
  91
  92 static int __init fail_futex_debugfs(void)
  93 {
  94         umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
  95         struct dentry *dir;
  96
  97         dir = fault_create_debugfs_attr("fail_futex", NULL,
  98                                         &fail_futex.attr);
  99         if (IS_ERR(dir))
 100                 return PTR_ERR(dir);
 101
 102         debugfs_create_bool("ignore-private", mode, dir,
 103                             &fail_futex.ignore_private);
 104         return 0;
 105 }
 106
 107 late_initcall(fail_futex_debugfs);
 108
 109 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
 110
 111 #endif /* CONFIG_FAIL_FUTEX */
 112
 113 /**
 114  * futex_hash - Return the hash bucket in the global hash
 115  * @key:        Pointer to the futex key for which the hash is calculated
 116  *
 117  * We hash on the keys returned from get_futex_key (see below) and return the
 118  * corresponding hash bucket in the global hash.
 119  */
 120 struct futex_hash_bucket *futex_hash(union futex_key *key)
 121 {
 122         u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4,
 123                           key->both.offset);
 124
 125         return &futex_queues[hash & (futex_hashsize - 1)];
 126 }
 127
 128
 129 /**
 130  * futex_setup_timer - set up the sleeping hrtimer.
 131  * @time:       ptr to the given timeout value
 132  * @timeout:    the hrtimer_sleeper structure to be set up
 133  * @flags:      futex flags
 134  * @range_ns:   optional range in ns
 135  *
 136  * Return: Initialized hrtimer_sleeper structure or NULL if no timeout
 137  *         value given
 138  */
 139 struct hrtimer_sleeper *
 140 futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
 141                   int flags, u64 range_ns)
 142 {
 143         if (!time)
 144                 return NULL;
 145
 146         hrtimer_init_sleeper_on_stack(timeout, (flags & FLAGS_CLOCKRT) ?
 147                                       CLOCK_REALTIME : CLOCK_MONOTONIC,
 148                                       HRTIMER_MODE_ABS);
 149         /*
 150          * If range_ns is 0, calling hrtimer_set_expires_range_ns() is
 151          * effectively the same as calling hrtimer_set_expires().
 152          */
 153         hrtimer_set_expires_range_ns(&timeout->timer, *time, range_ns);
 154
 155         return timeout;
 156 }
 157
 158 /*
 159  * Generate a machine wide unique identifier for this inode.
 160  *
 161  * This relies on u64 not wrapping in the life-time of the machine; which with
 162  * 1ns resolution means almost 585 years.
 163  *
 164  * This further relies on the fact that a well formed program will not unmap
 165  * the file while it has a (shared) futex waiting on it. This mapping will have
 166  * a file reference which pins the mount and inode.
 167  *
 168  * If for some reason an inode gets evicted and read back in again, it will get
 169  * a new sequence number and will _NOT_ match, even though it is the exact same
 170  * file.
 171  *
 172  * It is important that futex_match() will never have a false-positive, esp.
 173  * for PI futexes that can mess up the state. The above argues that false-negatives
 174  * are only possible for malformed programs.
 175  */
 176 static u64 get_inode_sequence_number(struct inode *inode)
 177 {
 178         static atomic64_t i_seq;
 179         u64 old;
 180
 181         /* Does the inode already have a sequence number? */
 182         old = atomic64_read(&inode->i_sequence);
 183         if (likely(old))
 184                 return old;
 185
 186         for (;;) {
 187                 u64 new = atomic64_add_return(1, &i_seq);
 188                 if (WARN_ON_ONCE(!new))
 189                         continue;
 190
 191                 old = atomic64_cmpxchg_relaxed(&inode->i_sequence, 0, new);
 192                 if (old)
 193                         return old;
 194                 return new;
 195         }
 196 }
 197
 198 /**
 199  * get_futex_key() - Get parameters which are the keys for a futex
 200  * @uaddr:      virtual address of the futex
 201  * @fshared:    false for a PROCESS_PRIVATE futex, true for PROCESS_SHARED
 202  * @key:        address where result is stored.
 203  * @rw:         mapping needs to be read/write (values: FUTEX_READ,
 204  *              FUTEX_WRITE)
 205  *
 206  * Return: a negative error code or 0
 207  *
 208  * The key words are stored in @key on success.
 209  *
 210  * For shared mappings (when @fshared), the key is:
 211  *
 212  *   ( inode->i_sequence, page->index, offset_within_page )
 213  *
 214  * [ also see get_inode_sequence_number() ]
 215  *
 216  * For private mappings (or when !@fshared), the key is:
 217  *
 218  *   ( current->mm, address, 0 )
 219  *
 220  * This allows (cross process, where applicable) identification of the futex
 221  * without keeping the page pinned for the duration of the FUTEX_WAIT.
 222  *
 223  * lock_page() might sleep, the caller should not hold a spinlock.
 224  */
 225 int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key,
 226                   enum futex_access rw)
 227 {
 228         unsigned long address = (unsigned long)uaddr;
 229         struct mm_struct *mm = current->mm;
 230         struct page *page, *tail;
 231         struct address_space *mapping;
 232         int err, ro = 0;
 233
 234         /*
 235          * The futex address must be "naturally" aligned.
 236          */
 237         key->both.offset = address % PAGE_SIZE;
 238         if (unlikely((address % sizeof(u32)) != 0))
 239                 return -EINVAL;
 240         address -= key->both.offset;
 241
 242         if (unlikely(!access_ok(uaddr, sizeof(u32))))
 243                 return -EFAULT;
 244
 245         if (unlikely(should_fail_futex(fshared)))
 246                 return -EFAULT;
 247
 248         /*
 249          * PROCESS_PRIVATE futexes are fast.
 250          * As the mm cannot disappear under us and the 'key' only needs
 251          * virtual address, we dont even have to find the underlying vma.
 252          * Note : We do have to check 'uaddr' is a valid user address,
 253          *        but access_ok() should be faster than find_vma()
 254          */
 255         if (!fshared) {
 256                 key->private.mm = mm;
 257                 key->private.address = address;
 258                 return 0;
 259         }
 260
 261 again:
 262         /* Ignore any VERIFY_READ mapping (futex common case) */
 263         if (unlikely(should_fail_futex(true)))
 264                 return -EFAULT;
 265
 266         err = get_user_pages_fast(address, 1, FOLL_WRITE, &page);
 267         /*
 268          * If write access is not required (eg. FUTEX_WAIT), try
 269          * and get read-only access.
 270          */
 271         if (err == -EFAULT && rw == FUTEX_READ) {
 272                 err = get_user_pages_fast(address, 1, 0, &page);
 273                 ro = 1;
 274         }
 275         if (err < 0)
 276                 return err;
 277         else
 278                 err = 0;
 279
 280         /*
 281          * The treatment of mapping from this point on is critical. The page
 282          * lock protects many things but in this context the page lock
 283          * stabilizes mapping, prevents inode freeing in the shared
 284          * file-backed region case and guards against movement to swap cache.
 285          *
 286          * Strictly speaking the page lock is not needed in all cases being
 287          * considered here and page lock forces unnecessarily serialization
 288          * From this point on, mapping will be re-verified if necessary and
 289          * page lock will be acquired only if it is unavoidable
 290          *
 291          * Mapping checks require the head page for any compound page so the
 292          * head page and mapping is looked up now. For anonymous pages, it
 293          * does not matter if the page splits in the future as the key is
 294          * based on the address. For filesystem-backed pages, the tail is
 295          * required as the index of the page determines the key. For
 296          * base pages, there is no tail page and tail == page.
 297          */
 298         tail = page;
 299         page = compound_head(page);
 300         mapping = READ_ONCE(page->mapping);
 301
 302         /*
 303          * If page->mapping is NULL, then it cannot be a PageAnon
 304          * page; but it might be the ZERO_PAGE or in the gate area or
 305          * in a special mapping (all cases which we are happy to fail);
 306          * or it may have been a good file page when get_user_pages_fast
 307          * found it, but truncated or holepunched or subjected to
 308          * invalidate_complete_page2 before we got the page lock (also
 309          * cases which we are happy to fail).  And we hold a reference,
 310          * so refcount care in invalidate_complete_page's remove_mapping
 311          * prevents drop_caches from setting mapping to NULL beneath us.
 312          *
 313          * The case we do have to guard against is when memory pressure made
 314          * shmem_writepage move it from filecache to swapcache beneath us:
 315          * an unlikely race, but we do need to retry for page->mapping.
 316          */
 317         if (unlikely(!mapping)) {
 318                 int shmem_swizzled;
 319
 320                 /*
 321                  * Page lock is required to identify which special case above
 322                  * applies. If this is really a shmem page then the page lock
 323                  * will prevent unexpected transitions.
 324                  */
 325                 lock_page(page);
 326                 shmem_swizzled = PageSwapCache(page) || page->mapping;
 327                 unlock_page(page);
 328                 put_page(page);
 329
 330                 if (shmem_swizzled)
 331                         goto again;
 332
 333                 return -EFAULT;
 334         }
 335
 336         /*
 337          * Private mappings are handled in a simple way.
 338          *
 339          * If the futex key is stored on an anonymous page, then the associated
 340          * object is the mm which is implicitly pinned by the calling process.
 341          *
 342          * NOTE: When userspace waits on a MAP_SHARED mapping, even if
 343          * it's a read-only handle, it's expected that futexes attach to
 344          * the object not the particular process.
 345          */
 346         if (PageAnon(page)) {
 347                 /*
 348                  * A RO anonymous page will never change and thus doesn't make
 349                  * sense for futex operations.
 350                  */
 351                 if (unlikely(should_fail_futex(true)) || ro) {
 352                         err = -EFAULT;
 353                         goto out;
 354                 }
 355
 356                 key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
 357                 key->private.mm = mm;
 358                 key->private.address = address;
 359
 360         } else {
 361                 struct inode *inode;
 362
 363                 /*
 364                  * The associated futex object in this case is the inode and
 365                  * the page->mapping must be traversed. Ordinarily this should
 366                  * be stabilised under page lock but it's not strictly
 367                  * necessary in this case as we just want to pin the inode, not
 368                  * update the radix tree or anything like that.
 369                  *
 370                  * The RCU read lock is taken as the inode is finally freed
 371                  * under RCU. If the mapping still matches expectations then the
 372                  * mapping->host can be safely accessed as being a valid inode.
 373                  */
 374                 rcu_read_lock();
 375
 376                 if (READ_ONCE(page->mapping) != mapping) {
 377                         rcu_read_unlock();
 378                         put_page(page);
 379
 380                         goto again;
 381                 }
 382
 383                 inode = READ_ONCE(mapping->host);
 384                 if (!inode) {
 385                         rcu_read_unlock();
 386                         put_page(page);
 387
 388                         goto again;
 389                 }
 390
 391                 key->both.offset |= FUT_OFF_INODE; /* inode-based key */
 392                 key->shared.i_seq = get_inode_sequence_number(inode);
 393                 key->shared.pgoff = page_to_pgoff(tail);
 394                 rcu_read_unlock();
 395         }
 396
 397 out:
 398         put_page(page);
 399         return err;
 400 }
 401
 402 /**
 403  * fault_in_user_writeable() - Fault in user address and verify RW access
 404  * @uaddr:      pointer to faulting user space address
 405  *
 406  * Slow path to fixup the fault we just took in the atomic write
 407  * access to @uaddr.
 408  *
 409  * We have no generic implementation of a non-destructive write to the
 410  * user address. We know that we faulted in the atomic pagefault
 411  * disabled section so we can as well avoid the #PF overhead by
 412  * calling get_user_pages() right away.
 413  */
 414 int fault_in_user_writeable(u32 __user *uaddr)
 415 {
 416         struct mm_struct *mm = current->mm;
 417         int ret;
 418
 419         mmap_read_lock(mm);
 420         ret = fixup_user_fault(mm, (unsigned long)uaddr,
 421                                FAULT_FLAG_WRITE, NULL);
 422         mmap_read_unlock(mm);
 423
 424         return ret < 0 ? ret : 0;
 425 }
 426
 427 /**
 428  * futex_top_waiter() - Return the highest priority waiter on a futex
 429  * @hb:         the hash bucket the futex_q's reside in
 430  * @key:        the futex key (to distinguish it from other futex futex_q's)
 431  *
 432  * Must be called with the hb lock held.
 433  */
 434 struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, union futex_key *key)
 435 {
 436         struct futex_q *this;
 437
 438         plist_for_each_entry(this, &hb->chain, list) {
 439                 if (futex_match(&this->key, key))
 440                         return this;
 441         }
 442         return NULL;
 443 }
 444
 445 int futex_cmpxchg_value_locked(u32 *curval, u32 __user *uaddr, u32 uval, u32 newval)
 446 {
 447         int ret;
 448
 449         pagefault_disable();
 450         ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval);
 451         pagefault_enable();
 452
 453         return ret;
 454 }
 455
 456 int futex_get_value_locked(u32 *dest, u32 __user *from)
 457 {
 458         int ret;
 459
 460         pagefault_disable();
 461         ret = __get_user(*dest, from);
 462         pagefault_enable();
 463
 464         return ret ? -EFAULT : 0;
 465 }
 466
 467 /**
 468  * wait_for_owner_exiting - Block until the owner has exited
 469  * @ret: owner's current futex lock status
 470  * @exiting:    Pointer to the exiting task
 471  *
 472  * Caller must hold a refcount on @exiting.
 473  */
 474 void wait_for_owner_exiting(int ret, struct task_struct *exiting)
 475 {
 476         if (ret != -EBUSY) {
 477                 WARN_ON_ONCE(exiting);
 478                 return;
 479         }
 480
 481         if (WARN_ON_ONCE(ret == -EBUSY && !exiting))
 482                 return;
 483
 484         mutex_lock(&exiting->futex_exit_mutex);
 485         /*
 486          * No point in doing state checking here. If the waiter got here
 487          * while the task was in exec()->exec_futex_release() then it can
 488          * have any FUTEX_STATE_* value when the waiter has acquired the
 489          * mutex. OK, if running, EXITING or DEAD if it reached exit()
 490          * already. Highly unlikely and not a problem. Just one more round
 491          * through the futex maze.
 492          */
 493         mutex_unlock(&exiting->futex_exit_mutex);
 494
 495         put_task_struct(exiting);
 496 }
 497
 498 /**
 499  * __futex_unqueue() - Remove the futex_q from its futex_hash_bucket
 500  * @q:  The futex_q to unqueue
 501  *
 502  * The q->lock_ptr must not be NULL and must be held by the caller.
 503  */
 504 void __futex_unqueue(struct futex_q *q)
 505 {
 506         struct futex_hash_bucket *hb;
 507
 508         if (WARN_ON_SMP(!q->lock_ptr) || WARN_ON(plist_node_empty(&q->list)))
 509                 return;
 510         lockdep_assert_held(q->lock_ptr);
 511
 512         hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
 513         plist_del(&q->list, &hb->chain);
 514         futex_hb_waiters_dec(hb);
 515 }
 516
 517 /* The key must be already stored in q->key. */
 518 struct futex_hash_bucket *futex_q_lock(struct futex_q *q)
 519         __acquires(&hb->lock)
 520 {
 521         struct futex_hash_bucket *hb;
 522
 523         hb = futex_hash(&q->key);
 524
 525         /*
 526          * Increment the counter before taking the lock so that
 527          * a potential waker won't miss a to-be-slept task that is
 528          * waiting for the spinlock. This is safe as all futex_q_lock()
 529          * users end up calling futex_queue(). Similarly, for housekeeping,
 530          * decrement the counter at futex_q_unlock() when some error has
 531          * occurred and we don't end up adding the task to the list.
 532          */
 533         futex_hb_waiters_inc(hb); /* implies smp_mb(); (A) */
 534
 535         q->lock_ptr = &hb->lock;
 536
 537         spin_lock(&hb->lock);
 538         return hb;
 539 }
 540
 541 void futex_q_unlock(struct futex_hash_bucket *hb)
 542         __releases(&hb->lock)
 543 {
 544         spin_unlock(&hb->lock);
 545         futex_hb_waiters_dec(hb);
 546 }
 547
 548 void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb)
 549 {
 550         int prio;
 551
 552         /*
 553          * The priority used to register this element is
 554          * - either the real thread-priority for the real-time threads
 555          * (i.e. threads with a priority lower than MAX_RT_PRIO)
 556          * - or MAX_RT_PRIO for non-RT threads.
 557          * Thus, all RT-threads are woken first in priority order, and
 558          * the others are woken last, in FIFO order.
 559          */
 560         prio = min(current->normal_prio, MAX_RT_PRIO);
 561
 562         plist_node_init(&q->list, prio);
 563         plist_add(&q->list, &hb->chain);
 564         q->task = current;
 565 }
 566
 567 /**
 568  * futex_unqueue() - Remove the futex_q from its futex_hash_bucket
 569  * @q:  The futex_q to unqueue
 570  *
 571  * The q->lock_ptr must not be held by the caller. A call to futex_unqueue() must
 572  * be paired with exactly one earlier call to futex_queue().
 573  *
 574  * Return:
 575  *  - 1 - if the futex_q was still queued (and we removed unqueued it);
 576  *  - 0 - if the futex_q was already removed by the waking thread
 577  */
 578 int futex_unqueue(struct futex_q *q)
 579 {
 580         spinlock_t *lock_ptr;
 581         int ret = 0;
 582
 583         /* In the common case we don't take the spinlock, which is nice. */
 584 retry:
 585         /*
 586          * q->lock_ptr can change between this read and the following spin_lock.
 587          * Use READ_ONCE to forbid the compiler from reloading q->lock_ptr and
 588          * optimizing lock_ptr out of the logic below.
 589          */
 590         lock_ptr = READ_ONCE(q->lock_ptr);
 591         if (lock_ptr != NULL) {
 592                 spin_lock(lock_ptr);
 593                 /*
 594                  * q->lock_ptr can change between reading it and
 595                  * spin_lock(), causing us to take the wrong lock.  This
 596                  * corrects the race condition.
 597                  *
 598                  * Reasoning goes like this: if we have the wrong lock,
 599                  * q->lock_ptr must have changed (maybe several times)
 600                  * between reading it and the spin_lock().  It can
 601                  * change again after the spin_lock() but only if it was
 602                  * already changed before the spin_lock().  It cannot,
 603                  * however, change back to the original value.  Therefore
 604                  * we can detect whether we acquired the correct lock.
 605                  */
 606                 if (unlikely(lock_ptr != q->lock_ptr)) {
 607                         spin_unlock(lock_ptr);
 608                         goto retry;
 609                 }
 610                 __futex_unqueue(q);
 611
 612                 BUG_ON(q->pi_state);
 613
 614                 spin_unlock(lock_ptr);
 615                 ret = 1;
 616         }
 617
 618         return ret;
 619 }
 620
 621 /*
 622  * PI futexes can not be requeued and must remove themselves from the
 623  * hash bucket. The hash bucket lock (i.e. lock_ptr) is held.
 624  */
 625 void futex_unqueue_pi(struct futex_q *q)
 626 {
 627         __futex_unqueue(q);
 628
 629         BUG_ON(!q->pi_state);
 630         put_pi_state(q->pi_state);
 631         q->pi_state = NULL;
 632 }
 633
 634 /* Constants for the pending_op argument of handle_futex_death */
 635 #define HANDLE_DEATH_PENDING    true
 636 #define HANDLE_DEATH_LIST       false
 637
 638 /*
 639  * Process a futex-list entry, check whether it's owned by the
 640  * dying task, and do notification if so:
 641  */
 642 static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr,
 643                               bool pi, bool pending_op)
 644 {
 645         u32 uval, nval, mval;
 646         int err;
 647
 648         /* Futex address must be 32bit aligned */
 649         if ((((unsigned long)uaddr) % sizeof(*uaddr)) != 0)
 650                 return -1;
 651
 652 retry:
 653         if (get_user(uval, uaddr))
 654                 return -1;
 655
 656         /*
 657          * Special case for regular (non PI) futexes. The unlock path in
 658          * user space has two race scenarios:
 659          *
 660          * 1. The unlock path releases the user space futex value and
 661          *    before it can execute the futex() syscall to wake up
 662          *    waiters it is killed.
 663          *
 664          * 2. A woken up waiter is killed before it can acquire the
 665          *    futex in user space.
 666          *
 667          * In both cases the TID validation below prevents a wakeup of
 668          * potential waiters which can cause these waiters to block
 669          * forever.
 670          *
 671          * In both cases the following conditions are met:
 672          *
 673          *      1) task->robust_list->list_op_pending != NULL
 674          *         @pending_op == true
 675          *      2) User space futex value == 0
 676          *      3) Regular futex: @pi == false
 677          *
 678          * If these conditions are met, it is safe to attempt waking up a
 679          * potential waiter without touching the user space futex value and
 680          * trying to set the OWNER_DIED bit. The user space futex value is
 681          * uncontended and the rest of the user space mutex state is
 682          * consistent, so a woken waiter will just take over the
 683          * uncontended futex. Setting the OWNER_DIED bit would create
 684          * inconsistent state and malfunction of the user space owner died
 685          * handling.
 686          */
 687         if (pending_op && !pi && !uval) {
 688                 futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
 689                 return 0;
 690         }
 691
 692         if ((uval & FUTEX_TID_MASK) != task_pid_vnr(curr))
 693                 return 0;
 694
 695         /*
 696          * Ok, this dying thread is truly holding a futex
 697          * of interest. Set the OWNER_DIED bit atomically
 698          * via cmpxchg, and if the value had FUTEX_WAITERS
 699          * set, wake up a waiter (if any). (We have to do a
 700          * futex_wake() even if OWNER_DIED is already set -
 701          * to handle the rare but possible case of recursive
 702          * thread-death.) The rest of the cleanup is done in
 703          * userspace.
 704          */
 705         mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
 706
 707         /*
 708          * We are not holding a lock here, but we want to have
 709          * the pagefault_disable/enable() protection because
 710          * we want to handle the fault gracefully. If the
 711          * access fails we try to fault in the futex with R/W
 712          * verification via get_user_pages. get_user() above
 713          * does not guarantee R/W access. If that fails we
 714          * give up and leave the futex locked.
 715          */
 716         if ((err = futex_cmpxchg_value_locked(&nval, uaddr, uval, mval))) {
 717                 switch (err) {
 718                 case -EFAULT:
 719                         if (fault_in_user_writeable(uaddr))
 720                                 return -1;
 721                         goto retry;
 722
 723                 case -EAGAIN:
 724                         cond_resched();
 725                         goto retry;
 726
 727                 default:
 728                         WARN_ON_ONCE(1);
 729                         return err;
 730                 }
 731         }
 732
 733         if (nval != uval)
 734                 goto retry;
 735
 736         /*
 737          * Wake robust non-PI futexes here. The wakeup of
 738          * PI futexes happens in exit_pi_state():
 739          */
 740         if (!pi && (uval & FUTEX_WAITERS))
 741                 futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
 742
 743         return 0;
 744 }
 745
 746 /*
 747  * Fetch a robust-list pointer. Bit 0 signals PI futexes:
 748  */
 749 static inline int fetch_robust_entry(struct robust_list __user **entry,
 750                                      struct robust_list __user * __user *head,
 751                                      unsigned int *pi)
 752 {
 753         unsigned long uentry;
 754
 755         if (get_user(uentry, (unsigned long __user *)head))
 756                 return -EFAULT;
 757
 758         *entry = (void __user *)(uentry & ~1UL);
 759         *pi = uentry & 1;
 760
 761         return 0;
 762 }
 763
 764 /*
 765  * Walk curr->robust_list (very carefully, it's a userspace list!)
 766  * and mark any locks found there dead, and notify any waiters.
 767  *
 768  * We silently return on any sign of list-walking problem.
 769  */
 770 static void exit_robust_list(struct task_struct *curr)
 771 {
 772         struct robust_list_head __user *head = curr->robust_list;
 773         struct robust_list __user *entry, *next_entry, *pending;
 774         unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
 775         unsigned int next_pi;
 776         unsigned long futex_offset;
 777         int rc;
 778
 779         if (!futex_cmpxchg_enabled)
 780                 return;
 781
 782         /*
 783          * Fetch the list head (which was registered earlier, via
 784          * sys_set_robust_list()):
 785          */
 786         if (fetch_robust_entry(&entry, &head->list.next, &pi))
 787                 return;
 788         /*
 789          * Fetch the relative futex offset:
 790          */
 791         if (get_user(futex_offset, &head->futex_offset))
 792                 return;
 793         /*
 794          * Fetch any possibly pending lock-add first, and handle it
 795          * if it exists:
 796          */
 797         if (fetch_robust_entry(&pending, &head->list_op_pending, &pip))
 798                 return;
 799
 800         next_entry = NULL;      /* avoid warning with gcc */
 801         while (entry != &head->list) {
 802                 /*
 803                  * Fetch the next entry in the list before calling
 804                  * handle_futex_death:
 805                  */
 806                 rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi);
 807                 /*
 808                  * A pending lock might already be on the list, so
 809                  * don't process it twice:
 810                  */
 811                 if (entry != pending) {
 812                         if (handle_futex_death((void __user *)entry + futex_offset,
 813                                                 curr, pi, HANDLE_DEATH_LIST))
 814                                 return;
 815                 }
 816                 if (rc)
 817                         return;
 818                 entry = next_entry;
 819                 pi = next_pi;
 820                 /*
 821                  * Avoid excessively long or circular lists:
 822                  */
 823                 if (!--limit)
 824                         break;
 825
 826                 cond_resched();
 827         }
 828
 829         if (pending) {
 830                 handle_futex_death((void __user *)pending + futex_offset,
 831                                    curr, pip, HANDLE_DEATH_PENDING);
 832         }
 833 }
 834
 835 #ifdef CONFIG_COMPAT
 836 static void __user *futex_uaddr(struct robust_list __user *entry,
 837                                 compat_long_t futex_offset)
 838 {
 839         compat_uptr_t base = ptr_to_compat(entry);
 840         void __user *uaddr = compat_ptr(base + futex_offset);
 841
 842         return uaddr;
 843 }
 844
 845 /*
 846  * Fetch a robust-list pointer. Bit 0 signals PI futexes:
 847  */
 848 static inline int
 849 compat_fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
 850                    compat_uptr_t __user *head, unsigned int *pi)
 851 {
 852         if (get_user(*uentry, head))
 853                 return -EFAULT;
 854
 855         *entry = compat_ptr((*uentry) & ~1);
 856         *pi = (unsigned int)(*uentry) & 1;
 857
 858         return 0;
 859 }
 860
 861 /*
 862  * Walk curr->robust_list (very carefully, it's a userspace list!)
 863  * and mark any locks found there dead, and notify any waiters.
 864  *
 865  * We silently return on any sign of list-walking problem.
 866  */
 867 static void compat_exit_robust_list(struct task_struct *curr)
 868 {
 869         struct compat_robust_list_head __user *head = curr->compat_robust_list;
 870         struct robust_list __user *entry, *next_entry, *pending;
 871         unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
 872         unsigned int next_pi;
 873         compat_uptr_t uentry, next_uentry, upending;
 874         compat_long_t futex_offset;
 875         int rc;
 876
 877         if (!futex_cmpxchg_enabled)
 878                 return;
 879
 880         /*
 881          * Fetch the list head (which was registered earlier, via
 882          * sys_set_robust_list()):
 883          */
 884         if (compat_fetch_robust_entry(&uentry, &entry, &head->list.next, &pi))
 885                 return;
 886         /*
 887          * Fetch the relative futex offset:
 888          */
 889         if (get_user(futex_offset, &head->futex_offset))
 890                 return;
 891         /*
 892          * Fetch any possibly pending lock-add first, and handle it
 893          * if it exists:
 894          */
 895         if (compat_fetch_robust_entry(&upending, &pending,
 896                                &head->list_op_pending, &pip))
 897                 return;
 898
 899         next_entry = NULL;      /* avoid warning with gcc */
 900         while (entry != (struct robust_list __user *) &head->list) {
 901                 /*
 902                  * Fetch the next entry in the list before calling
 903                  * handle_futex_death:
 904                  */
 905                 rc = compat_fetch_robust_entry(&next_uentry, &next_entry,
 906                         (compat_uptr_t __user *)&entry->next, &next_pi);
 907                 /*
 908                  * A pending lock might already be on the list, so
 909                  * dont process it twice:
 910                  */
 911                 if (entry != pending) {
 912                         void __user *uaddr = futex_uaddr(entry, futex_offset);
 913
 914                         if (handle_futex_death(uaddr, curr, pi,
 915                                                HANDLE_DEATH_LIST))
 916                                 return;
 917                 }
 918                 if (rc)
 919                         return;
 920                 uentry = next_uentry;
 921                 entry = next_entry;
 922                 pi = next_pi;
 923                 /*
 924                  * Avoid excessively long or circular lists:
 925                  */
 926                 if (!--limit)
 927                         break;
 928
 929                 cond_resched();
 930         }
 931         if (pending) {
 932                 void __user *uaddr = futex_uaddr(pending, futex_offset);
 933
 934                 handle_futex_death(uaddr, curr, pip, HANDLE_DEATH_PENDING);
 935         }
 936 }
 937 #endif
 938
 939 #ifdef CONFIG_FUTEX_PI
 940
 941 /*
 942  * This task is holding PI mutexes at exit time => bad.
 943  * Kernel cleans up PI-state, but userspace is likely hosed.
 944  * (Robust-futex cleanup is separate and might save the day for userspace.)
 945  */
 946 static void exit_pi_state_list(struct task_struct *curr)
 947 {
 948         struct list_head *next, *head = &curr->pi_state_list;
 949         struct futex_pi_state *pi_state;
 950         struct futex_hash_bucket *hb;
 951         union futex_key key = FUTEX_KEY_INIT;
 952
 953         if (!futex_cmpxchg_enabled)
 954                 return;
 955         /*
 956          * We are a ZOMBIE and nobody can enqueue itself on
 957          * pi_state_list anymore, but we have to be careful
 958          * versus waiters unqueueing themselves:
 959          */
 960         raw_spin_lock_irq(&curr->pi_lock);
 961         while (!list_empty(head)) {
 962                 next = head->next;
 963                 pi_state = list_entry(next, struct futex_pi_state, list);
 964                 key = pi_state->key;
 965                 hb = futex_hash(&key);
 966
 967                 /*
 968                  * We can race against put_pi_state() removing itself from the
 969                  * list (a waiter going away). put_pi_state() will first
 970                  * decrement the reference count and then modify the list, so
 971                  * its possible to see the list entry but fail this reference
 972                  * acquire.
 973                  *
 974                  * In that case; drop the locks to let put_pi_state() make
 975                  * progress and retry the loop.
 976                  */
 977                 if (!refcount_inc_not_zero(&pi_state->refcount)) {
 978                         raw_spin_unlock_irq(&curr->pi_lock);
 979                         cpu_relax();
 980                         raw_spin_lock_irq(&curr->pi_lock);
 981                         continue;
 982                 }
 983                 raw_spin_unlock_irq(&curr->pi_lock);
 984
 985                 spin_lock(&hb->lock);
 986                 raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
 987                 raw_spin_lock(&curr->pi_lock);
 988                 /*
 989                  * We dropped the pi-lock, so re-check whether this
 990                  * task still owns the PI-state:
 991                  */
 992                 if (head->next != next) {
 993                         /* retain curr->pi_lock for the loop invariant */
 994                         raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
 995                         spin_unlock(&hb->lock);
 996                         put_pi_state(pi_state);
 997                         continue;
 998                 }
 999
1000                 WARN_ON(pi_state->owner != curr);
1001                 WARN_ON(list_empty(&pi_state->list));
1002                 list_del_init(&pi_state->list);
1003                 pi_state->owner = NULL;
1004
1005                 raw_spin_unlock(&curr->pi_lock);
1006                 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
1007                 spin_unlock(&hb->lock);
1008
1009                 rt_mutex_futex_unlock(&pi_state->pi_mutex);
1010                 put_pi_state(pi_state);
1011
1012                 raw_spin_lock_irq(&curr->pi_lock);
1013         }
1014         raw_spin_unlock_irq(&curr->pi_lock);
1015 }
1016 #else
/* Stub used when CONFIG_FUTEX_PI is not set: does nothing. */
static inline void exit_pi_state_list(struct task_struct *curr)
{
}
1018 #endif
1019
1020 static void futex_cleanup(struct task_struct *tsk)
1021 {
1022         if (unlikely(tsk->robust_list)) {
1023                 exit_robust_list(tsk);
1024                 tsk->robust_list = NULL;
1025         }
1026
1027 #ifdef CONFIG_COMPAT
1028         if (unlikely(tsk->compat_robust_list)) {
1029                 compat_exit_robust_list(tsk);
1030                 tsk->compat_robust_list = NULL;
1031         }
1032 #endif
1033
1034         if (unlikely(!list_empty(&tsk->pi_state_list)))
1035                 exit_pi_state_list(tsk);
1036 }
1037
1038 /**
1039  * futex_exit_recursive - Set the tasks futex state to FUTEX_STATE_DEAD
1040  * @tsk:        task to set the state on
1041  *
1042  * Set the futex exit state of the task lockless. The futex waiter code
1043  * observes that state when a task is exiting and loops until the task has
1044  * actually finished the futex cleanup. The worst case for this is that the
1045  * waiter runs through the wait loop until the state becomes visible.
1046  *
1047  * This is called from the recursive fault handling path in do_exit().
1048  *
1049  * This is best effort. Either the futex exit code has run already or
1050  * not. If the OWNER_DIED bit has been set on the futex then the waiter can
1051  * take it over. If not, the problem is pushed back to user space. If the
1052  * futex exit code did not run yet, then an already queued waiter might
1053  * block forever, but there is nothing which can be done about that.
1054  */
1055 void futex_exit_recursive(struct task_struct *tsk)
1056 {
1057         /* If the state is FUTEX_STATE_EXITING then futex_exit_mutex is held */
1058         if (tsk->futex_state == FUTEX_STATE_EXITING)
1059                 mutex_unlock(&tsk->futex_exit_mutex);
1060         tsk->futex_state = FUTEX_STATE_DEAD;
1061 }
1062
1063 static void futex_cleanup_begin(struct task_struct *tsk)
1064 {
1065         /*
1066          * Prevent various race issues against a concurrent incoming waiter
1067          * including live locks by forcing the waiter to block on
1068          * tsk->futex_exit_mutex when it observes FUTEX_STATE_EXITING in
1069          * attach_to_pi_owner().
1070          */
1071         mutex_lock(&tsk->futex_exit_mutex);
1072
1073         /*
1074          * Switch the state to FUTEX_STATE_EXITING under tsk->pi_lock.
1075          *
1076          * This ensures that all subsequent checks of tsk->futex_state in
1077          * attach_to_pi_owner() must observe FUTEX_STATE_EXITING with
1078          * tsk->pi_lock held.
1079          *
1080          * It guarantees also that a pi_state which was queued right before
1081          * the state change under tsk->pi_lock by a concurrent waiter must
1082          * be observed in exit_pi_state_list().
1083          */
1084         raw_spin_lock_irq(&tsk->pi_lock);
1085         tsk->futex_state = FUTEX_STATE_EXITING;
1086         raw_spin_unlock_irq(&tsk->pi_lock);
1087 }
1088
1089 static void futex_cleanup_end(struct task_struct *tsk, int state)
1090 {
1091         /*
1092          * Lockless store. The only side effect is that an observer might
1093          * take another loop until it becomes visible.
1094          */
1095         tsk->futex_state = state;
1096         /*
1097          * Drop the exit protection. This unblocks waiters which observed
1098          * FUTEX_STATE_EXITING to reevaluate the state.
1099          */
1100         mutex_unlock(&tsk->futex_exit_mutex);
1101 }
1102
1103 void futex_exec_release(struct task_struct *tsk)
1104 {
1105         /*
1106          * The state handling is done for consistency, but in the case of
1107          * exec() there is no way to prevent further damage as the PID stays
1108          * the same. But for the unlikely and arguably buggy case that a
1109          * futex is held on exec(), this provides at least as much state
1110          * consistency protection which is possible.
1111          */
1112         futex_cleanup_begin(tsk);
1113         futex_cleanup(tsk);
1114         /*
1115          * Reset the state to FUTEX_STATE_OK. The task is alive and about
1116          * exec a new binary.
1117          */
1118         futex_cleanup_end(tsk, FUTEX_STATE_OK);
1119 }
1120
1121 void futex_exit_release(struct task_struct *tsk)
1122 {
1123         futex_cleanup_begin(tsk);
1124         futex_cleanup(tsk);
1125         futex_cleanup_end(tsk, FUTEX_STATE_DEAD);
1126 }
1127
1128 static void __init futex_detect_cmpxchg(void)
1129 {
1130 #ifndef CONFIG_HAVE_FUTEX_CMPXCHG
1131         u32 curval;
1132
1133         /*
1134          * This will fail and we want it. Some arch implementations do
1135          * runtime detection of the futex_atomic_cmpxchg_inatomic()
1136          * functionality. We want to know that before we call in any
1137          * of the complex code paths. Also we want to prevent
1138          * registration of robust lists in that case. NULL is
1139          * guaranteed to fault and we get -EFAULT on functional
1140          * implementation, the non-functional ones will return
1141          * -ENOSYS.
1142          */
1143         if (futex_cmpxchg_value_locked(&curval, NULL, 0, 0) == -EFAULT)
1144                 futex_cmpxchg_enabled = 1;
1145 #endif
1146 }
1147
1148 static int __init futex_init(void)
1149 {
1150         unsigned int futex_shift;
1151         unsigned long i;
1152
1153 #if CONFIG_BASE_SMALL
1154         futex_hashsize = 16;
1155 #else
1156         futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus());
1157 #endif
1158
1159         futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues),
1160                                                futex_hashsize, 0,
1161                                                futex_hashsize < 256 ? HASH_SMALL : 0,
1162                                                &futex_shift, NULL,
1163                                                futex_hashsize, futex_hashsize);
1164         futex_hashsize = 1UL << futex_shift;
1165
1166         futex_detect_cmpxchg();
1167
1168         for (i = 0; i < futex_hashsize; i++) {
1169                 atomic_set(&futex_queues[i].waiters, 0);
1170                 plist_head_init(&futex_queues[i].chain);
1171                 spin_lock_init(&futex_queues[i].lock);
1172         }
1173
1174         return 0;
1175 }
1176 core_initcall(futex_init);