X-Git-Url: http://git.monstr.eu/?a=blobdiff_plain;f=kernel%2Ffutex.c;h=e7b4c6121da4e95af5df3dcaa9ef27cf5c934f28;hb=871dda463c6f2c2a4a660937e2f57616146f42de;hp=2ecb07575055aa692abf382f40662f858d1dd449;hpb=7b6ae471e5415bc2bf4384a83ccb4c21de7824c0;p=linux-2.6-microblaze.git diff --git a/kernel/futex.c b/kernel/futex.c index 2ecb07575055..e7b4c6121da4 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -179,7 +179,7 @@ struct futex_pi_state { /* * The PI object: */ - struct rt_mutex pi_mutex; + struct rt_mutex_base pi_mutex; struct task_struct *owner; refcount_t refcount; @@ -197,6 +197,8 @@ struct futex_pi_state { * @rt_waiter: rt_waiter storage for use with requeue_pi * @requeue_pi_key: the requeue_pi target futex key * @bitset: bitset for the optional bitmasked wakeup + * @requeue_state: State field for futex_requeue_pi() + * @requeue_wait: RCU wait for futex_requeue_pi() (RT only) * * We use this hashed waitqueue, instead of a normal wait_queue_entry_t, so * we can wake only the relevant ones (hashed queues may be shared). @@ -219,12 +221,68 @@ struct futex_q { struct rt_mutex_waiter *rt_waiter; union futex_key *requeue_pi_key; u32 bitset; + atomic_t requeue_state; +#ifdef CONFIG_PREEMPT_RT + struct rcuwait requeue_wait; +#endif } __randomize_layout; +/* + * On PREEMPT_RT, the hash bucket lock is a 'sleeping' spinlock with an + * underlying rtmutex. The task which is about to be requeued could have + * just woken up (timeout, signal). After the wake up the task has to + * acquire hash bucket lock, which is held by the requeue code. As a task + * can only be blocked on _ONE_ rtmutex at a time, the proxy lock blocking + * and the hash bucket lock blocking would collide and corrupt state. + * + * On !PREEMPT_RT this is not a problem and everything could be serialized + * on hash bucket lock, but aside of having the benefit of common code, + * this allows to avoid doing the requeue when the task is already on the + * way out and taking the hash bucket lock of the original uaddr1 when the + * requeue has been completed. + * + * The following state transitions are valid: + * + * On the waiter side: + * Q_REQUEUE_PI_NONE -> Q_REQUEUE_PI_IGNORE + * Q_REQUEUE_PI_IN_PROGRESS -> Q_REQUEUE_PI_WAIT + * + * On the requeue side: + * Q_REQUEUE_PI_NONE -> Q_REQUEUE_PI_INPROGRESS + * Q_REQUEUE_PI_IN_PROGRESS -> Q_REQUEUE_PI_DONE/LOCKED + * Q_REQUEUE_PI_IN_PROGRESS -> Q_REQUEUE_PI_NONE (requeue failed) + * Q_REQUEUE_PI_WAIT -> Q_REQUEUE_PI_DONE/LOCKED + * Q_REQUEUE_PI_WAIT -> Q_REQUEUE_PI_IGNORE (requeue failed) + * + * The requeue side ignores a waiter with state Q_REQUEUE_PI_IGNORE as this + * signals that the waiter is already on the way out. It also means that + * the waiter is still on the 'wait' futex, i.e. uaddr1. + * + * The waiter side signals early wakeup to the requeue side either through + * setting state to Q_REQUEUE_PI_IGNORE or to Q_REQUEUE_PI_WAIT depending + * on the current state. In case of Q_REQUEUE_PI_IGNORE it can immediately + * proceed to take the hash bucket lock of uaddr1. If it set state to WAIT, + * which means the wakeup is interleaving with a requeue in progress it has + * to wait for the requeue side to change the state. Either to DONE/LOCKED + * or to IGNORE. DONE/LOCKED means the waiter q is now on the uaddr2 futex + * and either blocked (DONE) or has acquired it (LOCKED). IGNORE is set by + * the requeue side when the requeue attempt failed via deadlock detection + * and therefore the waiter q is still on the uaddr1 futex. + */ +enum { + Q_REQUEUE_PI_NONE = 0, + Q_REQUEUE_PI_IGNORE, + Q_REQUEUE_PI_IN_PROGRESS, + Q_REQUEUE_PI_WAIT, + Q_REQUEUE_PI_DONE, + Q_REQUEUE_PI_LOCKED, +}; + static const struct futex_q futex_q_init = { /* list gets initialized in queue_me()*/ - .key = FUTEX_KEY_INIT, - .bitset = FUTEX_BITSET_MATCH_ANY + .key = FUTEX_KEY_INIT, + .bitset = FUTEX_BITSET_MATCH_ANY, + .requeue_state = ATOMIC_INIT(Q_REQUEUE_PI_NONE), }; /* @@ -1299,27 +1357,6 @@ static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key, return 0; } -static int lookup_pi_state(u32 __user *uaddr, u32 uval, - struct futex_hash_bucket *hb, - union futex_key *key, struct futex_pi_state **ps, - struct task_struct **exiting) -{ - struct futex_q *top_waiter = futex_top_waiter(hb, key); - - /* - * If there is a waiter on that futex, validate it and - * attach to the pi_state when the validation succeeds. - */ - if (top_waiter) - return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps); - - /* - * We are the first waiter - try to look up the owner based on - * @uval and attach to it. - */ - return attach_to_pi_owner(uaddr, uval, key, ps, exiting); -} - static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval) { int err; @@ -1354,7 +1391,7 @@ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval) * - 1 - acquired the lock; * - <0 - error * - * The hb->lock and futex_key refs shall be held by the caller. + * The hb->lock must be held by the caller. * * @exiting is only set when the return value is -EBUSY. If so, this holds * a refcount on the exiting task on return and the caller needs to drop it @@ -1493,11 +1530,11 @@ static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q) */ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state) { - u32 curval, newval; struct rt_mutex_waiter *top_waiter; struct task_struct *new_owner; bool postunlock = false; - DEFINE_WAKE_Q(wake_q); + DEFINE_RT_WAKE_Q(wqh); + u32 curval, newval; int ret = 0; top_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex); @@ -1549,14 +1586,14 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_ * not fail. */ pi_state_update_owner(pi_state, new_owner); - postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q); + postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wqh); } out_unlock: raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); if (postunlock) - rt_mutex_postunlock(&wake_q); + rt_mutex_postunlock(&wqh); return ret; } @@ -1793,6 +1830,108 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, q->key = *key2; } +static inline bool futex_requeue_pi_prepare(struct futex_q *q, + struct futex_pi_state *pi_state) +{ + int old, new; + + /* + * Set state to Q_REQUEUE_PI_IN_PROGRESS unless an early wakeup has + * already set Q_REQUEUE_PI_IGNORE to signal that requeue should + * ignore the waiter. + */ + old = atomic_read_acquire(&q->requeue_state); + do { + if (old == Q_REQUEUE_PI_IGNORE) + return false; + + /* + * futex_proxy_trylock_atomic() might have set it to + * IN_PROGRESS and a interleaved early wake to WAIT. + * + * It was considered to have an extra state for that + * trylock, but that would just add more conditionals + * all over the place for a dubious value. + */ + if (old != Q_REQUEUE_PI_NONE) + break; + + new = Q_REQUEUE_PI_IN_PROGRESS; + } while (!atomic_try_cmpxchg(&q->requeue_state, &old, new)); + + q->pi_state = pi_state; + return true; +} + +static inline void futex_requeue_pi_complete(struct futex_q *q, int locked) +{ + int old, new; + + old = atomic_read_acquire(&q->requeue_state); + do { + if (old == Q_REQUEUE_PI_IGNORE) + return; + + if (locked >= 0) { + /* Requeue succeeded. Set DONE or LOCKED */ + WARN_ON_ONCE(old != Q_REQUEUE_PI_IN_PROGRESS && + old != Q_REQUEUE_PI_WAIT); + new = Q_REQUEUE_PI_DONE + locked; + } else if (old == Q_REQUEUE_PI_IN_PROGRESS) { + /* Deadlock, no early wakeup interleave */ + new = Q_REQUEUE_PI_NONE; + } else { + /* Deadlock, early wakeup interleave. */ + WARN_ON_ONCE(old != Q_REQUEUE_PI_WAIT); + new = Q_REQUEUE_PI_IGNORE; + } + } while (!atomic_try_cmpxchg(&q->requeue_state, &old, new)); + +#ifdef CONFIG_PREEMPT_RT + /* If the waiter interleaved with the requeue let it know */ + if (unlikely(old == Q_REQUEUE_PI_WAIT)) + rcuwait_wake_up(&q->requeue_wait); +#endif +} + +static inline int futex_requeue_pi_wakeup_sync(struct futex_q *q) +{ + int old, new; + + old = atomic_read_acquire(&q->requeue_state); + do { + /* Is requeue done already? */ + if (old >= Q_REQUEUE_PI_DONE) + return old; + + /* + * If not done, then tell the requeue code to either ignore + * the waiter or to wake it up once the requeue is done. + */ + new = Q_REQUEUE_PI_WAIT; + if (old == Q_REQUEUE_PI_NONE) + new = Q_REQUEUE_PI_IGNORE; + } while (!atomic_try_cmpxchg(&q->requeue_state, &old, new)); + + /* If the requeue was in progress, wait for it to complete */ + if (old == Q_REQUEUE_PI_IN_PROGRESS) { +#ifdef CONFIG_PREEMPT_RT + rcuwait_wait_event(&q->requeue_wait, + atomic_read(&q->requeue_state) != Q_REQUEUE_PI_WAIT, + TASK_UNINTERRUPTIBLE); +#else + (void)atomic_cond_read_relaxed(&q->requeue_state, VAL != Q_REQUEUE_PI_WAIT); +#endif + } + + /* + * Requeue is now either prohibited or complete. Reread state + * because during the wait above it might have changed. Nothing + * will modify q->requeue_state after this point. + */ + return atomic_read(&q->requeue_state); +} + /** * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue * @q: the futex_q @@ -1820,6 +1959,8 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, q->lock_ptr = &hb->lock; + /* Signal locked state to the waiter */ + futex_requeue_pi_complete(q, 1); wake_up_state(q->task, TASK_NORMAL); } @@ -1879,10 +2020,21 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1, if (!top_waiter) return 0; + /* + * Ensure that this is a waiter sitting in futex_wait_requeue_pi() + * and waiting on the 'waitqueue' futex which is always !PI. + */ + if (!top_waiter->rt_waiter || top_waiter->pi_state) + ret = -EINVAL; + /* Ensure we requeue to the expected futex. */ if (!match_futex(top_waiter->requeue_pi_key, key2)) return -EINVAL; + /* Ensure that this does not race against an early wakeup */ + if (!futex_requeue_pi_prepare(top_waiter, NULL)) + return -EAGAIN; + /* * Try to take the lock for top_waiter. Set the FUTEX_WAITERS bit in * the contended case or if set_waiters is 1. The pi_state is returned @@ -1892,8 +2044,22 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1, ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task, exiting, set_waiters); if (ret == 1) { + /* Dequeue, wake up and update top_waiter::requeue_state */ requeue_pi_wake_futex(top_waiter, key2, hb2); return vpid; + } else if (ret < 0) { + /* Rewind top_waiter::requeue_state */ + futex_requeue_pi_complete(top_waiter, ret); + } else { + /* + * futex_lock_pi_atomic() did not acquire the user space + * futex, but managed to establish the proxy lock and pi + * state. top_waiter::requeue_state cannot be fixed up here + * because the waiter is not enqueued on the rtmutex + * yet. This is handled at the callsite depending on the + * result of rt_mutex_start_proxy_lock() which is + * guaranteed to be reached with this function returning 0. + */ } return ret; } @@ -1947,24 +2113,36 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, if (uaddr1 == uaddr2) return -EINVAL; + /* + * futex_requeue() allows the caller to define the number + * of waiters to wake up via the @nr_wake argument. With + * REQUEUE_PI, waking up more than one waiter is creating + * more problems than it solves. Waking up a waiter makes + * only sense if the PI futex @uaddr2 is uncontended as + * this allows the requeue code to acquire the futex + * @uaddr2 before waking the waiter. The waiter can then + * return to user space without further action. A secondary + * wakeup would just make the futex_wait_requeue_pi() + * handling more complex, because that code would have to + * look up pi_state and do more or less all the handling + * which the requeue code has to do for the to be requeued + * waiters. So restrict the number of waiters to wake to + * one, and only wake it up when the PI futex is + * uncontended. Otherwise requeue it and let the unlock of + * the PI futex handle the wakeup. + * + * All REQUEUE_PI users, e.g. pthread_cond_signal() and + * pthread_cond_broadcast() must use nr_wake=1. + */ + if (nr_wake != 1) + return -EINVAL; + /* * requeue_pi requires a pi_state, try to allocate it now * without any locks in case it fails. */ if (refill_pi_state_cache()) return -ENOMEM; - /* - * requeue_pi must wake as many tasks as it can, up to nr_wake - * + nr_requeue, since it acquires the rt_mutex prior to - * returning to userspace, so as to not leave the rt_mutex with - * waiters and no owner. However, second and third wake-ups - * cannot be predicted as they involve race conditions with the - * first wake and a fault while looking up the pi_state. Both - * pthread_cond_signal() and pthread_cond_broadcast() should - * use nr_wake=1. - */ - if (nr_wake != 1) - return -EINVAL; } retry: @@ -2014,7 +2192,7 @@ retry_private: } } - if (requeue_pi && (task_count - nr_wake < nr_requeue)) { + if (requeue_pi) { struct task_struct *exiting = NULL; /* @@ -2022,6 +2200,8 @@ retry_private: * intend to requeue waiters, force setting the FUTEX_WAITERS * bit. We force this here where we are able to easily handle * faults rather in the requeue loop below. + * + * Updates topwaiter::requeue_state if a top waiter exists. */ ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1, &key2, &pi_state, @@ -2031,28 +2211,52 @@ retry_private: * At this point the top_waiter has either taken uaddr2 or is * waiting on it. If the former, then the pi_state will not * exist yet, look it up one more time to ensure we have a - * reference to it. If the lock was taken, ret contains the - * vpid of the top waiter task. + * reference to it. If the lock was taken, @ret contains the + * VPID of the top waiter task. * If the lock was not taken, we have pi_state and an initial * refcount on it. In case of an error we have nothing. + * + * The top waiter's requeue_state is up to date: + * + * - If the lock was acquired atomically (ret > 0), then + * the state is Q_REQUEUE_PI_LOCKED. + * + * - If the trylock failed with an error (ret < 0) then + * the state is either Q_REQUEUE_PI_NONE, i.e. "nothing + * happened", or Q_REQUEUE_PI_IGNORE when there was an + * interleaved early wakeup. + * + * - If the trylock did not succeed (ret == 0) then the + * state is either Q_REQUEUE_PI_IN_PROGRESS or + * Q_REQUEUE_PI_WAIT if an early wakeup interleaved. + * This will be cleaned up in the loop below, which + * cannot fail because futex_proxy_trylock_atomic() did + * the same sanity checks for requeue_pi as the loop + * below does. */ if (ret > 0) { WARN_ON(pi_state); task_count++; /* - * If we acquired the lock, then the user space value - * of uaddr2 should be vpid. It cannot be changed by - * the top waiter as it is blocked on hb2 lock if it - * tries to do so. If something fiddled with it behind - * our back the pi state lookup might unearth it. So - * we rather use the known value than rereading and - * handing potential crap to lookup_pi_state. + * If futex_proxy_trylock_atomic() acquired the + * user space futex, then the user space value + * @uaddr2 has been set to the @hb1's top waiter + * task VPID. This task is guaranteed to be alive + * and cannot be exiting because it is either + * sleeping or blocked on @hb2 lock. + * + * The @uaddr2 futex cannot have waiters either as + * otherwise futex_proxy_trylock_atomic() would not + * have succeeded. * - * If that call succeeds then we have pi_state and an - * initial refcount on it. + * In order to requeue waiters to @hb2, pi state is + * required. Hand in the VPID value (@ret) and + * allocate PI state with an initial refcount on + * it. */ - ret = lookup_pi_state(uaddr2, ret, hb2, &key2, - &pi_state, &exiting); + ret = attach_to_pi_owner(uaddr2, ret, &key2, &pi_state, + &exiting); + WARN_ON(ret); } switch (ret) { @@ -2060,7 +2264,10 @@ retry_private: /* We hold a reference on the pi state. */ break; - /* If the above failed, then pi_state is NULL */ + /* + * If the above failed, then pi_state is NULL and + * waiter::requeue_state is correct. + */ case -EFAULT: double_unlock_hb(hb1, hb2); hb_waiters_dec(hb2); @@ -2112,18 +2319,17 @@ retry_private: break; } - /* - * Wake nr_wake waiters. For requeue_pi, if we acquired the - * lock, we already woke the top_waiter. If not, it will be - * woken by futex_unlock_pi(). - */ - if (++task_count <= nr_wake && !requeue_pi) { - mark_wake_futex(&wake_q, this); + /* Plain futexes just wake or requeue and are done */ + if (!requeue_pi) { + if (++task_count <= nr_wake) + mark_wake_futex(&wake_q, this); + else + requeue_futex(this, hb1, hb2, &key2); continue; } /* Ensure we requeue to the expected futex for requeue_pi. */ - if (requeue_pi && !match_futex(this->requeue_pi_key, &key2)) { + if (!match_futex(this->requeue_pi_key, &key2)) { ret = -EINVAL; break; } @@ -2131,54 +2337,67 @@ retry_private: /* * Requeue nr_requeue waiters and possibly one more in the case * of requeue_pi if we couldn't acquire the lock atomically. + * + * Prepare the waiter to take the rt_mutex. Take a refcount + * on the pi_state and store the pointer in the futex_q + * object of the waiter. */ - if (requeue_pi) { + get_pi_state(pi_state); + + /* Don't requeue when the waiter is already on the way out. */ + if (!futex_requeue_pi_prepare(this, pi_state)) { /* - * Prepare the waiter to take the rt_mutex. Take a - * refcount on the pi_state and store the pointer in - * the futex_q object of the waiter. + * Early woken waiter signaled that it is on the + * way out. Drop the pi_state reference and try the + * next waiter. @this->pi_state is still NULL. */ - get_pi_state(pi_state); - this->pi_state = pi_state; - ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex, - this->rt_waiter, - this->task); - if (ret == 1) { - /* - * We got the lock. We do neither drop the - * refcount on pi_state nor clear - * this->pi_state because the waiter needs the - * pi_state for cleaning up the user space - * value. It will drop the refcount after - * doing so. - */ - requeue_pi_wake_futex(this, &key2, hb2); - continue; - } else if (ret) { - /* - * rt_mutex_start_proxy_lock() detected a - * potential deadlock when we tried to queue - * that waiter. Drop the pi_state reference - * which we took above and remove the pointer - * to the state from the waiters futex_q - * object. - */ - this->pi_state = NULL; - put_pi_state(pi_state); - /* - * We stop queueing more waiters and let user - * space deal with the mess. - */ - break; - } + put_pi_state(pi_state); + continue; + } + + ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex, + this->rt_waiter, + this->task); + + if (ret == 1) { + /* + * We got the lock. We do neither drop the refcount + * on pi_state nor clear this->pi_state because the + * waiter needs the pi_state for cleaning up the + * user space value. It will drop the refcount + * after doing so. this::requeue_state is updated + * in the wakeup as well. + */ + requeue_pi_wake_futex(this, &key2, hb2); + task_count++; + } else if (!ret) { + /* Waiter is queued, move it to hb2 */ + requeue_futex(this, hb1, hb2, &key2); + futex_requeue_pi_complete(this, 0); + task_count++; + } else { + /* + * rt_mutex_start_proxy_lock() detected a potential + * deadlock when we tried to queue that waiter. + * Drop the pi_state reference which we took above + * and remove the pointer to the state from the + * waiters futex_q object. + */ + this->pi_state = NULL; + put_pi_state(pi_state); + futex_requeue_pi_complete(this, ret); + /* + * We stop queueing more waiters and let user space + * deal with the mess. + */ + break; } - requeue_futex(this, hb1, hb2, &key2); } /* - * We took an extra initial reference to the pi_state either - * in futex_proxy_trylock_atomic() or in lookup_pi_state(). We - * need to drop it here again. + * We took an extra initial reference to the pi_state either in + * futex_proxy_trylock_atomic() or in attach_to_pi_owner(). We need + * to drop it here again. */ put_pi_state(pi_state); @@ -2357,7 +2576,7 @@ static int __fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, * Modifying pi_state _before_ the user space value would leave the * pi_state in an inconsistent state when we fault here, because we * need to drop the locks to handle the fault. This might be observed - * in the PID check in lookup_pi_state. + * in the PID checks when attaching to PI state . */ retry: if (!argowner) { @@ -2614,8 +2833,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, * * Setup the futex_q and locate the hash_bucket. Get the futex value and * compare it with the expected value. Handle atomic faults internally. - * Return with the hb lock held and a q.key reference on success, and unlocked - * with no q.key reference on failure. + * Return with the hb lock held on success, and unlocked on failure. * * Return: * - 0 - uaddr contains val and hb has been locked; @@ -2693,8 +2911,8 @@ static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, current->timer_slack_ns); retry: /* - * Prepare to wait on uaddr. On success, holds hb lock and increments - * q.key refs. + * Prepare to wait on uaddr. On success, it holds hb->lock and q + * is initialized. */ ret = futex_wait_setup(uaddr, val, flags, &q, &hb); if (ret) @@ -2705,7 +2923,6 @@ retry: /* If we were woken (and unqueued), we succeeded, whatever. */ ret = 0; - /* unqueue_me() drops q.key ref */ if (!unqueue_me(&q)) goto out; ret = -ETIMEDOUT; @@ -3072,27 +3289,22 @@ pi_faulted: } /** - * handle_early_requeue_pi_wakeup() - Detect early wakeup on the initial futex + * handle_early_requeue_pi_wakeup() - Handle early wakeup on the initial futex * @hb: the hash_bucket futex_q was original enqueued on * @q: the futex_q woken while waiting to be requeued - * @key2: the futex_key of the requeue target futex * @timeout: the timeout associated with the wait (NULL if none) * - * Detect if the task was woken on the initial futex as opposed to the requeue - * target futex. If so, determine if it was a timeout or a signal that caused - * the wakeup and return the appropriate error code to the caller. Must be - * called with the hb lock held. + * Determine the cause for the early wakeup. * * Return: - * - 0 = no early wakeup detected; - * - <0 = -ETIMEDOUT or -ERESTARTNOINTR + * -EWOULDBLOCK or -ETIMEDOUT or -ERESTARTNOINTR */ static inline int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, - struct futex_q *q, union futex_key *key2, + struct futex_q *q, struct hrtimer_sleeper *timeout) { - int ret = 0; + int ret; /* * With the hb lock held, we avoid races while we process the wakeup. @@ -3101,22 +3313,21 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, * It can't be requeued from uaddr2 to something else since we don't * support a PI aware source futex for requeue. */ - if (!match_futex(&q->key, key2)) { - WARN_ON(q->lock_ptr && (&hb->lock != q->lock_ptr)); - /* - * We were woken prior to requeue by a timeout or a signal. - * Unqueue the futex_q and determine which it was. - */ - plist_del(&q->list, &hb->chain); - hb_waiters_dec(hb); + WARN_ON_ONCE(&hb->lock != q->lock_ptr); - /* Handle spurious wakeups gracefully */ - ret = -EWOULDBLOCK; - if (timeout && !timeout->task) - ret = -ETIMEDOUT; - else if (signal_pending(current)) - ret = -ERESTARTNOINTR; - } + /* + * We were woken prior to requeue by a timeout or a signal. + * Unqueue the futex_q and determine which it was. + */ + plist_del(&q->list, &hb->chain); + hb_waiters_dec(hb); + + /* Handle spurious wakeups gracefully */ + ret = -EWOULDBLOCK; + if (timeout && !timeout->task) + ret = -ETIMEDOUT; + else if (signal_pending(current)) + ret = -ERESTARTNOINTR; return ret; } @@ -3169,6 +3380,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, struct futex_hash_bucket *hb; union futex_key key2 = FUTEX_KEY_INIT; struct futex_q q = futex_q_init; + struct rt_mutex_base *pi_mutex; int res, ret; if (!IS_ENABLED(CONFIG_FUTEX_PI)) @@ -3198,8 +3410,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, q.requeue_pi_key = &key2; /* - * Prepare to wait on uaddr. On success, increments q.key (key1) ref - * count. + * Prepare to wait on uaddr. On success, it holds hb->lock and q + * is initialized. */ ret = futex_wait_setup(uaddr, val, flags, &q, &hb); if (ret) @@ -3218,32 +3430,22 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, /* Queue the futex_q, drop the hb lock, wait for wakeup. */ futex_wait_queue_me(hb, &q, to); - spin_lock(&hb->lock); - ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to); - spin_unlock(&hb->lock); - if (ret) - goto out; - - /* - * In order for us to be here, we know our q.key == key2, and since - * we took the hb->lock above, we also know that futex_requeue() has - * completed and we no longer have to concern ourselves with a wakeup - * race with the atomic proxy lock acquisition by the requeue code. The - * futex_requeue dropped our key1 reference and incremented our key2 - * reference count. - */ + switch (futex_requeue_pi_wakeup_sync(&q)) { + case Q_REQUEUE_PI_IGNORE: + /* The waiter is still on uaddr1 */ + spin_lock(&hb->lock); + ret = handle_early_requeue_pi_wakeup(hb, &q, to); + spin_unlock(&hb->lock); + break; - /* - * Check if the requeue code acquired the second futex for us and do - * any pertinent fixup. - */ - if (!q.rt_waiter) { + case Q_REQUEUE_PI_LOCKED: + /* The requeue acquired the lock */ if (q.pi_state && (q.pi_state->owner != current)) { spin_lock(q.lock_ptr); ret = fixup_owner(uaddr2, &q, true); /* - * Drop the reference to the pi state which - * the requeue_pi() code acquired for us. + * Drop the reference to the pi state which the + * requeue_pi() code acquired for us. */ put_pi_state(q.pi_state); spin_unlock(q.lock_ptr); @@ -3253,18 +3455,14 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, */ ret = ret < 0 ? ret : 0; } - } else { - struct rt_mutex *pi_mutex; + break; - /* - * We have been woken up by futex_unlock_pi(), a timeout, or a - * signal. futex_unlock_pi() will not destroy the lock_ptr nor - * the pi_state. - */ - WARN_ON(!q.pi_state); + case Q_REQUEUE_PI_DONE: + /* Requeue completed. Current is 'pi_blocked_on' the rtmutex */ pi_mutex = &q.pi_state->pi_mutex; ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter); + /* Current is not longer pi_blocked_on */ spin_lock(q.lock_ptr); if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter)) ret = 0; @@ -3284,17 +3482,21 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, unqueue_me_pi(&q); spin_unlock(q.lock_ptr); - } - if (ret == -EINTR) { - /* - * We've already been requeued, but cannot restart by calling - * futex_lock_pi() directly. We could restart this syscall, but - * it would detect that the user space "val" changed and return - * -EWOULDBLOCK. Save the overhead of the restart and return - * -EWOULDBLOCK directly. - */ - ret = -EWOULDBLOCK; + if (ret == -EINTR) { + /* + * We've already been requeued, but cannot restart + * by calling futex_lock_pi() directly. We could + * restart this syscall, but it would detect that + * the user space "val" changed and return + * -EWOULDBLOCK. Save the overhead of the restart + * and return -EWOULDBLOCK directly. + */ + ret = -EWOULDBLOCK; + } + break; + default: + BUG(); } out: