afs: Fix lock-wait/callback-break double locking
[linux-2.6-microblaze.git] / fs / afs / flock.c
index e432bd2..c91cd20 100644 (file)
 
 #define AFS_LOCK_GRANTED       0
 #define AFS_LOCK_PENDING       1
+#define AFS_LOCK_YOUR_TRY      2
 
 struct workqueue_struct *afs_lock_manager;
 
+static void afs_next_locker(struct afs_vnode *vnode, int error);
 static void afs_fl_copy_lock(struct file_lock *new, struct file_lock *fl);
 static void afs_fl_release_private(struct file_lock *fl);
 
@@ -24,6 +26,14 @@ static const struct file_lock_operations afs_lock_ops = {
        .fl_release_private     = afs_fl_release_private,
 };
 
+static inline void afs_set_lock_state(struct afs_vnode *vnode, enum afs_lock_state state)
+{
+       _debug("STATE %u -> %u", vnode->lock_state, state);
+       vnode->lock_state = state;
+}
+
+static atomic_t afs_file_lock_debug_id;
+
 /*
  * if the callback is broken on this vnode, then the lock may now be available
  */
@@ -31,7 +41,11 @@ void afs_lock_may_be_available(struct afs_vnode *vnode)
 {
        _enter("{%llx:%llu}", vnode->fid.vid, vnode->fid.vnode);
 
-       queue_delayed_work(afs_lock_manager, &vnode->lock_work, 0);
+       spin_lock(&vnode->lock);
+       if (vnode->lock_state == AFS_VNODE_LOCK_WAITING_FOR_CB)
+               afs_next_locker(vnode, 0);
+       trace_afs_flock_ev(vnode, NULL, afs_flock_callback_break, 0);
+       spin_unlock(&vnode->lock);
 }
 
 /*
@@ -40,8 +54,35 @@ void afs_lock_may_be_available(struct afs_vnode *vnode)
  */
 static void afs_schedule_lock_extension(struct afs_vnode *vnode)
 {
-       queue_delayed_work(afs_lock_manager, &vnode->lock_work,
-                          AFS_LOCKWAIT * HZ / 2);
+       ktime_t expires_at, now, duration;
+       u64 duration_j;
+
+       expires_at = ktime_add_ms(vnode->locked_at, AFS_LOCKWAIT * 1000 / 2);
+       now = ktime_get_real();
+       duration = ktime_sub(expires_at, now);
+       if (duration <= 0)
+               duration_j = 0;
+       else
+               duration_j = nsecs_to_jiffies(ktime_to_ns(duration));
+       /* A target time that has already passed gives a zero delay. */
+       queue_delayed_work(afs_lock_manager, &vnode->lock_work, duration_j);
+}
+
+/*
+ * If the lock operation succeeded (call->error == 0), note the reply time in
+ * vnode->locked_at and schedule the lock extension work, under vnode->lock.
+ */
+void afs_lock_op_done(struct afs_call *call)
+{
+       struct afs_vnode *vnode = call->reply[0];
+
+       if (call->error == 0) {
+               spin_lock(&vnode->lock);
+               trace_afs_flock_ev(vnode, NULL, afs_flock_timestamp, 0);
+               vnode->locked_at = call->reply_time;
+               afs_schedule_lock_extension(vnode);
+               spin_unlock(&vnode->lock);
+       }
 }
 
 /*
@@ -49,22 +90,90 @@ static void afs_schedule_lock_extension(struct afs_vnode *vnode)
  * first lock in the queue is itself a readlock)
  * - the caller must hold the vnode lock
  */
-static void afs_grant_locks(struct afs_vnode *vnode, struct file_lock *fl)
+static void afs_grant_locks(struct afs_vnode *vnode)
 {
        struct file_lock *p, *_p;
+       bool exclusive = (vnode->lock_type == AFS_LOCK_WRITE);
 
-       list_move_tail(&fl->fl_u.afs.link, &vnode->granted_locks);
-       if (fl->fl_type == F_RDLCK) {
-               list_for_each_entry_safe(p, _p, &vnode->pending_locks,
-                                        fl_u.afs.link) {
-                       if (p->fl_type == F_RDLCK) {
-                               p->fl_u.afs.state = AFS_LOCK_GRANTED;
-                               list_move_tail(&p->fl_u.afs.link,
-                                              &vnode->granted_locks);
-                               wake_up(&p->fl_wait);
-                       }
+       list_for_each_entry_safe(p, _p, &vnode->pending_locks, fl_u.afs.link) {
+               /* Write requests are only granted under AFS_LOCK_WRITE. */
+               if (!exclusive && p->fl_type == F_WRLCK)
+                       continue;
+               list_move_tail(&p->fl_u.afs.link, &vnode->granted_locks);
+               p->fl_u.afs.state = AFS_LOCK_GRANTED;
+               trace_afs_flock_op(vnode, p, afs_flock_op_grant);
+               wake_up(&p->fl_wait);
+       }
+}
+
+/*
+ * If an error is specified, reject every pending lock that matches the key
+ * and type of the lock we failed to get; rejected locks are dequeued and so
+ * are never chosen as the next locker.  If any lockers remain, wake one.
+ */
+static void afs_next_locker(struct afs_vnode *vnode, int error)
+{
+       struct file_lock *p, *_p, *next = NULL;
+       struct key *key = vnode->lock_key;
+       unsigned int fl_type = F_RDLCK;
+
+       _enter("");
+
+       if (vnode->lock_type == AFS_LOCK_WRITE)
+               fl_type = F_WRLCK;
+
+       list_for_each_entry_safe(p, _p, &vnode->pending_locks, fl_u.afs.link) {
+               if (error &&
+                   p->fl_type == fl_type &&
+                   afs_file_key(p->fl_file) == key) {
+                       list_del_init(&p->fl_u.afs.link);
+                       p->fl_u.afs.state = error;
+                       wake_up(&p->fl_wait);
+                       continue;
                }
+               /* Select the next locker to hand off to. */
+               if (next &&
+                   (next->fl_type == F_WRLCK || p->fl_type == F_RDLCK))
+                       continue;
+               next = p;
+       }
+
+       vnode->lock_key = NULL;
+       key_put(key);
+
+       if (next) {
+               afs_set_lock_state(vnode, AFS_VNODE_LOCK_SETTING);
+               next->fl_u.afs.state = AFS_LOCK_YOUR_TRY;
+               trace_afs_flock_op(vnode, next, afs_flock_op_wake);
+               wake_up(&next->fl_wait);
+       } else {
+               afs_set_lock_state(vnode, AFS_VNODE_LOCK_NONE);
+               trace_afs_flock_ev(vnode, NULL, afs_flock_no_lockers, 0);
        }
+
+       _leave("");
+}
+
+/*
+ * Mark the vnode's lock state as deleted and fail every waiter on the pending
+ * lock queue with -ENOENT, then put vnode->lock_key and clear the pointer.
+ */
+static void afs_kill_lockers_enoent(struct afs_vnode *vnode)
+{
+       struct file_lock *p;
+
+       afs_set_lock_state(vnode, AFS_VNODE_LOCK_DELETED);
+
+       while (!list_empty(&vnode->pending_locks)) {
+               p = list_entry(vnode->pending_locks.next,
+                              struct file_lock, fl_u.afs.link);
+               list_del_init(&p->fl_u.afs.link);
+               p->fl_u.afs.state = -ENOENT;
+               wake_up(&p->fl_wait);
+       }
+
+       key_put(vnode->lock_key);
+       vnode->lock_key = NULL;
 }
 
 /*
@@ -84,7 +193,7 @@ static int afs_set_lock(struct afs_vnode *vnode, struct key *key,
               key_serial(key), type);
 
        ret = -ERESTARTSYS;
-       if (afs_begin_vnode_operation(&fc, vnode, key)) {
+       if (afs_begin_vnode_operation(&fc, vnode, key, true)) {
                while (afs_select_fileserver(&fc)) {
                        fc.cb_break = afs_calc_vnode_cb_break(vnode);
                        afs_fs_set_lock(&fc, type);
@@ -115,7 +224,7 @@ static int afs_extend_lock(struct afs_vnode *vnode, struct key *key)
               key_serial(key));
 
        ret = -ERESTARTSYS;
-       if (afs_begin_vnode_operation(&fc, vnode, key)) {
+       if (afs_begin_vnode_operation(&fc, vnode, key, false)) {
                while (afs_select_current_fileserver(&fc)) {
                        fc.cb_break = afs_calc_vnode_cb_break(vnode);
                        afs_fs_extend_lock(&fc);
@@ -146,7 +255,7 @@ static int afs_release_lock(struct afs_vnode *vnode, struct key *key)
               key_serial(key));
 
        ret = -ERESTARTSYS;
-       if (afs_begin_vnode_operation(&fc, vnode, key)) {
+       if (afs_begin_vnode_operation(&fc, vnode, key, false)) {
                while (afs_select_current_fileserver(&fc)) {
                        fc.cb_break = afs_calc_vnode_cb_break(vnode);
                        afs_fs_release_lock(&fc);
@@ -170,8 +279,6 @@ void afs_lock_work(struct work_struct *work)
 {
        struct afs_vnode *vnode =
                container_of(work, struct afs_vnode, lock_work.work);
-       struct file_lock *fl, *next;
-       afs_lock_type_t type;
        struct key *key;
        int ret;
 
@@ -183,35 +290,28 @@ again:
        _debug("wstate %u for %p", vnode->lock_state, vnode);
        switch (vnode->lock_state) {
        case AFS_VNODE_LOCK_NEED_UNLOCK:
-               _debug("unlock");
-               vnode->lock_state = AFS_VNODE_LOCK_UNLOCKING;
+               afs_set_lock_state(vnode, AFS_VNODE_LOCK_UNLOCKING);
+               trace_afs_flock_ev(vnode, NULL, afs_flock_work_unlocking, 0);
                spin_unlock(&vnode->lock);
 
                /* attempt to release the server lock; if it fails, we just
                 * wait 5 minutes and it'll expire anyway */
                ret = afs_release_lock(vnode, vnode->lock_key);
-               if (ret < 0)
+               if (ret < 0 && vnode->lock_state != AFS_VNODE_LOCK_DELETED) {
+                       trace_afs_flock_ev(vnode, NULL, afs_flock_release_fail,
+                                          ret);
                        printk(KERN_WARNING "AFS:"
                               " Failed to release lock on {%llx:%llx} error %d\n",
                               vnode->fid.vid, vnode->fid.vnode, ret);
-
-               spin_lock(&vnode->lock);
-               key_put(vnode->lock_key);
-               vnode->lock_key = NULL;
-               vnode->lock_state = AFS_VNODE_LOCK_NONE;
-
-               if (list_empty(&vnode->pending_locks)) {
-                       spin_unlock(&vnode->lock);
-                       return;
                }
 
-               /* The new front of the queue now owns the state variables. */
-               next = list_entry(vnode->pending_locks.next,
-                                 struct file_lock, fl_u.afs.link);
-               vnode->lock_key = key_get(afs_file_key(next->fl_file));
-               vnode->lock_type = (next->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE;
-               vnode->lock_state = AFS_VNODE_LOCK_WAITING_FOR_CB;
-               goto again;
+               spin_lock(&vnode->lock);
+               if (ret == -ENOENT)
+                       afs_kill_lockers_enoent(vnode);
+               else
+                       afs_next_locker(vnode, 0);
+               spin_unlock(&vnode->lock);
+               return;
 
        /* If we've already got a lock, then it must be time to extend that
         * lock as AFS locks time out after 5 minutes.
@@ -222,87 +322,57 @@ again:
                ASSERT(!list_empty(&vnode->granted_locks));
 
                key = key_get(vnode->lock_key);
-               vnode->lock_state = AFS_VNODE_LOCK_EXTENDING;
+               afs_set_lock_state(vnode, AFS_VNODE_LOCK_EXTENDING);
+               trace_afs_flock_ev(vnode, NULL, afs_flock_work_extending, 0);
                spin_unlock(&vnode->lock);
 
                ret = afs_extend_lock(vnode, key); /* RPC */
                key_put(key);
 
-               if (ret < 0)
+               if (ret < 0) {
+                       trace_afs_flock_ev(vnode, NULL, afs_flock_extend_fail,
+                                          ret);
                        pr_warning("AFS: Failed to extend lock on {%llx:%llx} error %d\n",
                                   vnode->fid.vid, vnode->fid.vnode, ret);
+               }
 
                spin_lock(&vnode->lock);
 
+               if (ret == -ENOENT) {
+                       afs_kill_lockers_enoent(vnode);
+                       spin_unlock(&vnode->lock);
+                       return;
+               }
+
                if (vnode->lock_state != AFS_VNODE_LOCK_EXTENDING)
                        goto again;
-               vnode->lock_state = AFS_VNODE_LOCK_GRANTED;
+               afs_set_lock_state(vnode, AFS_VNODE_LOCK_GRANTED);
 
-               if (ret == 0)
-                       afs_schedule_lock_extension(vnode);
-               else
+               if (ret != 0)
                        queue_delayed_work(afs_lock_manager, &vnode->lock_work,
                                           HZ * 10);
                spin_unlock(&vnode->lock);
                _leave(" [ext]");
                return;
 
-               /* If we don't have a granted lock, then we must've been called
-                * back by the server, and so if might be possible to get a
-                * lock we're currently waiting for.
-                */
+       /* If we're waiting for a callback to indicate lock release, we can't
+        * actually rely on this, so need to recheck at regular intervals.  The
+        * problem is that the server might not notify us if the lock just
+        * expires (say because a client died) rather than being explicitly
+        * released.
+        */
        case AFS_VNODE_LOCK_WAITING_FOR_CB:
-               _debug("get");
-
-               key = key_get(vnode->lock_key);
-               type = vnode->lock_type;
-               vnode->lock_state = AFS_VNODE_LOCK_SETTING;
+               _debug("retry");
+               afs_next_locker(vnode, 0);
                spin_unlock(&vnode->lock);
+               return;
 
-               ret = afs_set_lock(vnode, key, type); /* RPC */
-               key_put(key);
-
-               spin_lock(&vnode->lock);
-               switch (ret) {
-               case -EWOULDBLOCK:
-                       _debug("blocked");
-                       break;
-               case 0:
-                       _debug("acquired");
-                       vnode->lock_state = AFS_VNODE_LOCK_GRANTED;
-                       /* Fall through */
-               default:
-                       /* Pass the lock or the error onto the first locker in
-                        * the list - if they're looking for this type of lock.
-                        * If they're not, we assume that whoever asked for it
-                        * took a signal.
-                        */
-                       if (list_empty(&vnode->pending_locks)) {
-                               _debug("withdrawn");
-                               vnode->lock_state = AFS_VNODE_LOCK_NEED_UNLOCK;
-                               goto again;
-                       }
-
-                       fl = list_entry(vnode->pending_locks.next,
-                                       struct file_lock, fl_u.afs.link);
-                       type = (fl->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE;
-                       if (vnode->lock_type != type) {
-                               _debug("changed");
-                               vnode->lock_state = AFS_VNODE_LOCK_NEED_UNLOCK;
-                               goto again;
-                       }
-
-                       fl->fl_u.afs.state = ret;
-                       if (ret == 0)
-                               afs_grant_locks(vnode, fl);
-                       else
-                               list_del_init(&fl->fl_u.afs.link);
-                       wake_up(&fl->fl_wait);
-                       spin_unlock(&vnode->lock);
-                       _leave(" [granted]");
-                       return;
-               }
+       case AFS_VNODE_LOCK_DELETED:
+               afs_kill_lockers_enoent(vnode);
+               spin_unlock(&vnode->lock);
+               return;
 
+               /* Not a fall through: the case above returns. */
        default:
                /* Looks like a lock request was withdrawn. */
                spin_unlock(&vnode->lock);
@@ -319,14 +389,16 @@ again:
  */
 static void afs_defer_unlock(struct afs_vnode *vnode)
 {
-       _enter("");
+       _enter("%u", vnode->lock_state);
 
-       if (vnode->lock_state == AFS_VNODE_LOCK_GRANTED ||
-           vnode->lock_state == AFS_VNODE_LOCK_EXTENDING) {
+       if (list_empty(&vnode->granted_locks) &&
+           (vnode->lock_state == AFS_VNODE_LOCK_GRANTED ||
+            vnode->lock_state == AFS_VNODE_LOCK_EXTENDING)) {
                cancel_delayed_work(&vnode->lock_work);
 
-               vnode->lock_state = AFS_VNODE_LOCK_NEED_UNLOCK;
-               afs_lock_may_be_available(vnode);
+               afs_set_lock_state(vnode, AFS_VNODE_LOCK_NEED_UNLOCK);
+               trace_afs_flock_ev(vnode, NULL, afs_flock_defer_unlock, 0);
+               queue_delayed_work(afs_lock_manager, &vnode->lock_work, 0);
        }
 }
 
@@ -335,7 +407,7 @@ static void afs_defer_unlock(struct afs_vnode *vnode)
  * whether we think that we have a locking permit.
  */
 static int afs_do_setlk_check(struct afs_vnode *vnode, struct key *key,
-                             afs_lock_type_t type, bool can_sleep)
+                             enum afs_flock_mode mode, afs_lock_type_t type)
 {
        afs_access_t access;
        int ret;
@@ -363,62 +435,14 @@ static int afs_do_setlk_check(struct afs_vnode *vnode, struct key *key,
        if (type == AFS_LOCK_READ) {
                if (!(access & (AFS_ACE_INSERT | AFS_ACE_WRITE | AFS_ACE_LOCK)))
                        return -EACCES;
-               if (vnode->status.lock_count == -1 && !can_sleep)
-                       return -EAGAIN; /* Write locked */
        } else {
                if (!(access & (AFS_ACE_INSERT | AFS_ACE_WRITE)))
                        return -EACCES;
-               if (vnode->status.lock_count != 0 && !can_sleep)
-                       return -EAGAIN; /* Locked */
        }
 
        return 0;
 }
 
-/*
- * Remove the front runner from the pending queue.
- * - The caller must hold vnode->lock.
- */
-static void afs_dequeue_lock(struct afs_vnode *vnode, struct file_lock *fl)
-{
-       struct file_lock *next;
-
-       _enter("");
-
-       /* ->lock_type, ->lock_key and ->lock_state only belong to this
-        * file_lock if we're at the front of the pending queue or if we have
-        * the lock granted or if the lock_state is NEED_UNLOCK or UNLOCKING.
-        */
-       if (vnode->granted_locks.next == &fl->fl_u.afs.link &&
-           vnode->granted_locks.prev == &fl->fl_u.afs.link) {
-               list_del_init(&fl->fl_u.afs.link);
-               afs_defer_unlock(vnode);
-               return;
-       }
-
-       if (!list_empty(&vnode->granted_locks) ||
-           vnode->pending_locks.next != &fl->fl_u.afs.link) {
-               list_del_init(&fl->fl_u.afs.link);
-               return;
-       }
-
-       list_del_init(&fl->fl_u.afs.link);
-       key_put(vnode->lock_key);
-       vnode->lock_key = NULL;
-       vnode->lock_state = AFS_VNODE_LOCK_NONE;
-
-       if (list_empty(&vnode->pending_locks))
-               return;
-
-       /* The new front of the queue now owns the state variables. */
-       next = list_entry(vnode->pending_locks.next,
-                         struct file_lock, fl_u.afs.link);
-       vnode->lock_key = key_get(afs_file_key(next->fl_file));
-       vnode->lock_type = (next->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE;
-       vnode->lock_state = AFS_VNODE_LOCK_WAITING_FOR_CB;
-       afs_lock_may_be_available(vnode);
-}
-
 /*
  * request a lock on a file on the server
  */
@@ -426,97 +450,162 @@ static int afs_do_setlk(struct file *file, struct file_lock *fl)
 {
        struct inode *inode = locks_inode(file);
        struct afs_vnode *vnode = AFS_FS_I(inode);
+       enum afs_flock_mode mode = AFS_FS_S(inode->i_sb)->flock_mode;
        afs_lock_type_t type;
        struct key *key = afs_file_key(file);
+       bool partial, no_server_lock = false;
        int ret;
 
-       _enter("{%llx:%llu},%u", vnode->fid.vid, vnode->fid.vnode, fl->fl_type);
+       if (mode == afs_flock_mode_unset)
+               mode = afs_flock_mode_openafs;
 
-       /* only whole-file locks are supported */
-       if (fl->fl_start != 0 || fl->fl_end != OFFSET_MAX)
-               return -EINVAL;
+       _enter("{%llx:%llu},%llu-%llu,%u,%u",
+              vnode->fid.vid, vnode->fid.vnode,
+              fl->fl_start, fl->fl_end, fl->fl_type, mode);
 
        fl->fl_ops = &afs_lock_ops;
        INIT_LIST_HEAD(&fl->fl_u.afs.link);
        fl->fl_u.afs.state = AFS_LOCK_PENDING;
 
+       partial = (fl->fl_start != 0 || fl->fl_end != OFFSET_MAX);
        type = (fl->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE;
+       if (mode == afs_flock_mode_write && partial)
+               type = AFS_LOCK_WRITE;
 
-       ret = afs_do_setlk_check(vnode, key, type, fl->fl_flags & FL_SLEEP);
+       ret = afs_do_setlk_check(vnode, key, mode, type);
        if (ret < 0)
                return ret;
 
-       spin_lock(&vnode->lock);
+       trace_afs_flock_op(vnode, fl, afs_flock_op_set_lock);
 
-       /* If we've already got a readlock on the server then we instantly
-        * grant another readlock, irrespective of whether there are any
-        * pending writelocks.
+       /* AFS3 protocol only supports full-file locks and doesn't provide any
+        * method of upgrade/downgrade, so we need to emulate for partial-file
+        * locks.
+        *
+        * The OpenAFS client only gets a server lock for a full-file lock and
+        * keeps partial-file locks local.  Allow this behaviour to be emulated
+        * (as the default).
         */
-       if (type == AFS_LOCK_READ &&
-           vnode->lock_state == AFS_VNODE_LOCK_GRANTED &&
-           vnode->lock_type == AFS_LOCK_READ) {
-               _debug("instant readlock");
-               ASSERT(!list_empty(&vnode->granted_locks));
-               goto share_existing_lock;
+       if (mode == afs_flock_mode_local ||
+           (partial && mode == afs_flock_mode_openafs)) {
+               no_server_lock = true;
+               goto skip_server_lock;
        }
 
+       spin_lock(&vnode->lock);
        list_add_tail(&fl->fl_u.afs.link, &vnode->pending_locks);
 
+       ret = -ENOENT;
+       if (vnode->lock_state == AFS_VNODE_LOCK_DELETED)
+               goto error_unlock;
+
+       /* If we've already got a lock on the server then try to move to having
+        * the VFS grant the requested lock.  Note that this means that other
+        * clients may get starved out.
+        */
+       _debug("try %u", vnode->lock_state);
+       if (vnode->lock_state == AFS_VNODE_LOCK_GRANTED) {
+               if (type == AFS_LOCK_READ) {
+                       _debug("instant readlock");
+                       list_move_tail(&fl->fl_u.afs.link, &vnode->granted_locks);
+                       fl->fl_u.afs.state = AFS_LOCK_GRANTED;
+                       goto vnode_is_locked_u;
+               }
+
+               if (vnode->lock_type == AFS_LOCK_WRITE) {
+                       _debug("instant writelock");
+                       list_move_tail(&fl->fl_u.afs.link, &vnode->granted_locks);
+                       fl->fl_u.afs.state = AFS_LOCK_GRANTED;
+                       goto vnode_is_locked_u;
+               }
+       }
+
+       if (vnode->lock_state == AFS_VNODE_LOCK_NONE &&
+           !(fl->fl_flags & FL_SLEEP)) {
+               ret = -EAGAIN;
+               if (type == AFS_LOCK_READ) {
+                       if (vnode->status.lock_count == -1)
+                               goto lock_is_contended; /* Write locked */
+               } else {
+                       if (vnode->status.lock_count != 0)
+                               goto lock_is_contended; /* Locked */
+               }
+       }
+
        if (vnode->lock_state != AFS_VNODE_LOCK_NONE)
                goto need_to_wait;
 
+try_to_lock:
        /* We don't have a lock on this vnode and we aren't currently waiting
         * for one either, so ask the server for a lock.
         *
         * Note that we need to be careful if we get interrupted by a signal
         * after dispatching the request as we may still get the lock, even
         * though we don't wait for the reply (it's not too bad a problem - the
-        * lock will expire in 10 mins anyway).
+        * lock will expire in 5 mins anyway).
         */
-       _debug("not locked");
+       trace_afs_flock_ev(vnode, fl, afs_flock_try_to_lock, 0);
        vnode->lock_key = key_get(key);
        vnode->lock_type = type;
-       vnode->lock_state = AFS_VNODE_LOCK_SETTING;
+       afs_set_lock_state(vnode, AFS_VNODE_LOCK_SETTING);
        spin_unlock(&vnode->lock);
 
        ret = afs_set_lock(vnode, key, type); /* RPC */
 
        spin_lock(&vnode->lock);
        switch (ret) {
+       case -EKEYREJECTED:
+       case -EKEYEXPIRED:
+       case -EKEYREVOKED:
+       case -EPERM:
+       case -EACCES:
+               fl->fl_u.afs.state = ret;
+               trace_afs_flock_ev(vnode, fl, afs_flock_fail_perm, ret);
+               list_del_init(&fl->fl_u.afs.link);
+               afs_next_locker(vnode, ret);
+               goto error_unlock;
+
+       case -ENOENT:
+               fl->fl_u.afs.state = ret;
+               trace_afs_flock_ev(vnode, fl, afs_flock_fail_other, ret);
+               list_del_init(&fl->fl_u.afs.link);
+               afs_kill_lockers_enoent(vnode);
+               goto error_unlock;
+
        default:
-               goto abort_attempt;
+               fl->fl_u.afs.state = ret;
+               trace_afs_flock_ev(vnode, fl, afs_flock_fail_other, ret);
+               list_del_init(&fl->fl_u.afs.link);
+               afs_next_locker(vnode, 0);
+               goto error_unlock;
 
        case -EWOULDBLOCK:
                /* The server doesn't have a lock-waiting queue, so the client
                 * will have to retry.  The server will break the outstanding
                 * callbacks on a file when a lock is released.
                 */
-               _debug("would block");
                ASSERT(list_empty(&vnode->granted_locks));
                ASSERTCMP(vnode->pending_locks.next, ==, &fl->fl_u.afs.link);
-               vnode->lock_state = AFS_VNODE_LOCK_WAITING_FOR_CB;
-               goto need_to_wait;
+               goto lock_is_contended;
 
        case 0:
-               _debug("acquired");
-               break;
+               afs_set_lock_state(vnode, AFS_VNODE_LOCK_GRANTED);
+               trace_afs_flock_ev(vnode, fl, afs_flock_acquired, type);
+               afs_grant_locks(vnode);
+               goto vnode_is_locked_u;
        }
 
-       /* we've acquired a server lock, but it needs to be renewed after 5
-        * mins */
-       vnode->lock_state = AFS_VNODE_LOCK_GRANTED;
-       afs_schedule_lock_extension(vnode);
-
-share_existing_lock:
-       /* the lock has been granted as far as we're concerned... */
-       fl->fl_u.afs.state = AFS_LOCK_GRANTED;
-       list_move_tail(&fl->fl_u.afs.link, &vnode->granted_locks);
-
-given_lock:
-       /* ... but we do still need to get the VFS's blessing */
+vnode_is_locked_u:
        spin_unlock(&vnode->lock);
-
-       ret = posix_lock_file(file, fl, NULL);
+vnode_is_locked:
+       /* the lock has been granted by the server... */
+       ASSERTCMP(fl->fl_u.afs.state, ==, AFS_LOCK_GRANTED);
+
+skip_server_lock:
+       /* ... but the VFS still needs to distribute access on this client. */
+       trace_afs_flock_ev(vnode, fl, afs_flock_vfs_locking, 0);
+       ret = locks_lock_file_wait(file, fl);
+       trace_afs_flock_ev(vnode, fl, afs_flock_vfs_lock, ret);
        if (ret < 0)
                goto vfs_rejected_lock;
 
@@ -528,38 +617,62 @@ given_lock:
        _leave(" = 0");
        return 0;
 
+lock_is_contended:
+       if (!(fl->fl_flags & FL_SLEEP)) {
+               list_del_init(&fl->fl_u.afs.link);
+               afs_next_locker(vnode, 0);
+               ret = -EAGAIN;
+               goto error_unlock;
+       }
+
+       afs_set_lock_state(vnode, AFS_VNODE_LOCK_WAITING_FOR_CB);
+       trace_afs_flock_ev(vnode, fl, afs_flock_would_block, ret);
+       queue_delayed_work(afs_lock_manager, &vnode->lock_work, HZ * 5);
+
 need_to_wait:
        /* We're going to have to wait.  Either this client doesn't have a lock
         * on the server yet and we need to wait for a callback to occur, or
-        * the client does have a lock on the server, but it belongs to some
-        * other process(es) and is incompatible with the lock we want.
+        * the client does have a lock on the server, but it's shared and we
+        * need an exclusive lock.
         */
-       ret = -EAGAIN;
-       if (fl->fl_flags & FL_SLEEP) {
-               spin_unlock(&vnode->lock);
+       spin_unlock(&vnode->lock);
 
-               _debug("sleep");
-               ret = wait_event_interruptible(fl->fl_wait,
-                                              fl->fl_u.afs.state != AFS_LOCK_PENDING);
+       trace_afs_flock_ev(vnode, fl, afs_flock_waiting, 0);
+       ret = wait_event_interruptible(fl->fl_wait,
+                                      fl->fl_u.afs.state != AFS_LOCK_PENDING);
+       trace_afs_flock_ev(vnode, fl, afs_flock_waited, ret);
 
+       if (fl->fl_u.afs.state >= 0 && fl->fl_u.afs.state != AFS_LOCK_GRANTED) {
                spin_lock(&vnode->lock);
-       }
 
-       if (fl->fl_u.afs.state == AFS_LOCK_GRANTED)
-               goto given_lock;
-       if (fl->fl_u.afs.state < 0)
-               ret = fl->fl_u.afs.state;
+               switch (fl->fl_u.afs.state) {
+               case AFS_LOCK_YOUR_TRY:
+                       fl->fl_u.afs.state = AFS_LOCK_PENDING;
+                       goto try_to_lock;
+               case AFS_LOCK_PENDING:
+                       if (ret > 0) {
+                               /* We need to retry the lock.  We may not be
+                                * notified by the server if it just expired
+                                * rather than being released.
+                                */
+                               ASSERTCMP(vnode->lock_state, ==, AFS_VNODE_LOCK_WAITING_FOR_CB);
+                               afs_set_lock_state(vnode, AFS_VNODE_LOCK_SETTING);
+                               fl->fl_u.afs.state = AFS_LOCK_PENDING;
+                               goto try_to_lock;
+                       }
+                       goto error_unlock;
+               case AFS_LOCK_GRANTED:
+               default:
+                       break;
+               }
 
-abort_attempt:
-       /* we aren't going to get the lock, either because we're unwilling to
-        * wait, or because some signal happened */
-       _debug("abort");
-       afs_dequeue_lock(vnode, fl);
+               spin_unlock(&vnode->lock);
+       }
 
-error_unlock:
-       spin_unlock(&vnode->lock);
-       _leave(" = %d", ret);
-       return ret;
+       if (fl->fl_u.afs.state == AFS_LOCK_GRANTED)
+               goto vnode_is_locked;
+       ret = fl->fl_u.afs.state;
+       goto error;
 
 vfs_rejected_lock:
        /* The VFS rejected the lock we just obtained, so we have to discard
@@ -567,11 +680,17 @@ vfs_rejected_lock:
         * deal with.
         */
        _debug("vfs refused %d", ret);
+       if (no_server_lock)
+               goto error;
        spin_lock(&vnode->lock);
        list_del_init(&fl->fl_u.afs.link);
-       if (list_empty(&vnode->granted_locks))
-               afs_defer_unlock(vnode);
-       goto error_unlock;
+       afs_defer_unlock(vnode);
+
+error_unlock:
+       spin_unlock(&vnode->lock);
+error:
+       _leave(" = %d", ret);
+       return ret;
 }
 
 /*
@@ -584,14 +703,12 @@ static int afs_do_unlk(struct file *file, struct file_lock *fl)
 
        _enter("{%llx:%llu},%u", vnode->fid.vid, vnode->fid.vnode, fl->fl_type);
 
+       trace_afs_flock_op(vnode, fl, afs_flock_op_unlock);
+
        /* Flush all pending writes before doing anything with locks. */
        vfs_fsync(file, 0);
 
-       /* only whole-file unlocks are supported */
-       if (fl->fl_start != 0 || fl->fl_end != OFFSET_MAX)
-               return -EINVAL;
-
-       ret = posix_lock_file(file, fl, NULL);
+       ret = locks_lock_file_wait(file, fl);
        _leave(" = %d [%u]", ret, vnode->lock_state);
        return ret;
 }
@@ -607,6 +724,9 @@ static int afs_do_getlk(struct file *file, struct file_lock *fl)
 
        _enter("");
 
+       if (vnode->lock_state == AFS_VNODE_LOCK_DELETED)
+               return -ENOENT;
+
        fl->fl_type = F_UNLCK;
 
        /* check local lock records first */
@@ -618,12 +738,15 @@ static int afs_do_getlk(struct file *file, struct file_lock *fl)
                        goto error;
 
                lock_count = READ_ONCE(vnode->status.lock_count);
-               if (lock_count > 0)
-                       fl->fl_type = F_RDLCK;
-               else
-                       fl->fl_type = F_WRLCK;
-               fl->fl_start = 0;
-               fl->fl_end = OFFSET_MAX;
+               if (lock_count != 0) {
+                       if (lock_count > 0)
+                               fl->fl_type = F_RDLCK;
+                       else
+                               fl->fl_type = F_WRLCK;
+                       fl->fl_start = 0;
+                       fl->fl_end = OFFSET_MAX;
+                       fl->fl_pid = 0;
+               }
        }
 
        ret = 0;
@@ -638,6 +761,8 @@ error:
 int afs_lock(struct file *file, int cmd, struct file_lock *fl)
 {
        struct afs_vnode *vnode = AFS_FS_I(locks_inode(file));
+       enum afs_flock_operation op;
+       int ret;
 
        _enter("{%llx:%llu},%d,{t=%x,fl=%x,r=%Ld:%Ld}",
               vnode->fid.vid, vnode->fid.vnode, cmd,
@@ -650,9 +775,23 @@ int afs_lock(struct file *file, int cmd, struct file_lock *fl)
 
        if (IS_GETLK(cmd))
                return afs_do_getlk(file, fl);
+
+       fl->fl_u.afs.debug_id = atomic_inc_return(&afs_file_lock_debug_id);
+       trace_afs_flock_op(vnode, fl, afs_flock_op_lock);
+
        if (fl->fl_type == F_UNLCK)
-               return afs_do_unlk(file, fl);
-       return afs_do_setlk(file, fl);
+               ret = afs_do_unlk(file, fl);
+       else
+               ret = afs_do_setlk(file, fl);
+
+       switch (ret) {
+       case 0:         op = afs_flock_op_return_ok; break;
+       case -EAGAIN:   op = afs_flock_op_return_eagain; break;
+       case -EDEADLK:  op = afs_flock_op_return_edeadlk; break;
+       default:        op = afs_flock_op_return_error; break;
+       }
+       trace_afs_flock_op(vnode, fl, op);
+       return ret;
 }
 
 /*
@@ -661,6 +800,8 @@ int afs_lock(struct file *file, int cmd, struct file_lock *fl)
 int afs_flock(struct file *file, int cmd, struct file_lock *fl)
 {
        struct afs_vnode *vnode = AFS_FS_I(locks_inode(file));
+       enum afs_flock_operation op;
+       int ret;
 
        _enter("{%llx:%llu},%d,{t=%x,fl=%x}",
               vnode->fid.vid, vnode->fid.vnode, cmd,
@@ -676,10 +817,23 @@ int afs_flock(struct file *file, int cmd, struct file_lock *fl)
        if (!(fl->fl_flags & FL_FLOCK))
                return -ENOLCK;
 
+       fl->fl_u.afs.debug_id = atomic_inc_return(&afs_file_lock_debug_id);
+       trace_afs_flock_op(vnode, fl, afs_flock_op_flock);
+
        /* we're simulating flock() locks using posix locks on the server */
        if (fl->fl_type == F_UNLCK)
-               return afs_do_unlk(file, fl);
-       return afs_do_setlk(file, fl);
+               ret = afs_do_unlk(file, fl);
+       else
+               ret = afs_do_setlk(file, fl);
+
+       switch (ret) {
+       case 0:         op = afs_flock_op_return_ok; break;
+       case -EAGAIN:   op = afs_flock_op_return_eagain; break;
+       case -EDEADLK:  op = afs_flock_op_return_edeadlk; break;
+       default:        op = afs_flock_op_return_error; break;
+       }
+       trace_afs_flock_op(vnode, fl, op);
+       return ret;
 }
 
 /*
@@ -694,7 +848,10 @@ static void afs_fl_copy_lock(struct file_lock *new, struct file_lock *fl)
 
        _enter("");
 
+       new->fl_u.afs.debug_id = atomic_inc_return(&afs_file_lock_debug_id);
+
        spin_lock(&vnode->lock);
+       trace_afs_flock_op(vnode, new, afs_flock_op_copy_lock);
        list_add(&new->fl_u.afs.link, &fl->fl_u.afs.link);
        spin_unlock(&vnode->lock);
 }
@@ -710,7 +867,12 @@ static void afs_fl_release_private(struct file_lock *fl)
        _enter("");
 
        spin_lock(&vnode->lock);
-       afs_dequeue_lock(vnode, fl);
+
+       trace_afs_flock_op(vnode, fl, afs_flock_op_release_lock);
+       list_del_init(&fl->fl_u.afs.link);
+       if (list_empty(&vnode->granted_locks))
+               afs_defer_unlock(vnode);
+
        _debug("state %u for %p", vnode->lock_state, vnode);
        spin_unlock(&vnode->lock);
 }