Merge branch 'misc.namei' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs
[linux-2.6-microblaze.git] / fs / ceph / caps.c
index 2a29009..6c0e52f 100644 (file)
@@ -703,29 +703,12 @@ void ceph_add_cap(struct inode *inode,
                 */
                struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc,
                                                               realmino);
-               if (realm) {
-                       struct ceph_snap_realm *oldrealm = ci->i_snap_realm;
-                       if (oldrealm) {
-                               spin_lock(&oldrealm->inodes_with_caps_lock);
-                               list_del_init(&ci->i_snap_realm_item);
-                               spin_unlock(&oldrealm->inodes_with_caps_lock);
-                       }
-
-                       spin_lock(&realm->inodes_with_caps_lock);
-                       list_add(&ci->i_snap_realm_item,
-                                &realm->inodes_with_caps);
-                       ci->i_snap_realm = realm;
-                       if (realm->ino == ci->i_vino.ino)
-                               realm->inode = inode;
-                       spin_unlock(&realm->inodes_with_caps_lock);
-
-                       if (oldrealm)
-                               ceph_put_snap_realm(mdsc, oldrealm);
-               } else {
-                       pr_err("ceph_add_cap: couldn't find snap realm %llx\n",
-                              realmino);
-                       WARN_ON(!realm);
-               }
+               if (realm)
+                       ceph_change_snap_realm(inode, realm);
+               else
+                       WARN(1, "%s: couldn't find snap realm 0x%llx (ino 0x%llx oldrealm 0x%llx)\n",
+                            __func__, realmino, ci->i_vino.ino,
+                            ci->i_snap_realm ? ci->i_snap_realm->ino : 0);
        }
 
        __check_cap_issue(ci, cap, issued);
@@ -1112,20 +1095,6 @@ int ceph_is_any_caps(struct inode *inode)
        return ret;
 }
 
-static void drop_inode_snap_realm(struct ceph_inode_info *ci)
-{
-       struct ceph_snap_realm *realm = ci->i_snap_realm;
-       spin_lock(&realm->inodes_with_caps_lock);
-       list_del_init(&ci->i_snap_realm_item);
-       ci->i_snap_realm_counter++;
-       ci->i_snap_realm = NULL;
-       if (realm->ino == ci->i_vino.ino)
-               realm->inode = NULL;
-       spin_unlock(&realm->inodes_with_caps_lock);
-       ceph_put_snap_realm(ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc,
-                           realm);
-}
-
 /*
  * Remove a cap.  Take steps to deal with a racing iterate_session_caps.
  *
@@ -1145,17 +1114,16 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
                return;
        }
 
+       lockdep_assert_held(&ci->i_ceph_lock);
+
        dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);
 
        mdsc = ceph_inode_to_client(&ci->vfs_inode)->mdsc;
 
        /* remove from inode's cap rbtree, and clear auth cap */
        rb_erase(&cap->ci_node, &ci->i_caps);
-       if (ci->i_auth_cap == cap) {
-               WARN_ON_ONCE(!list_empty(&ci->i_dirty_item) &&
-                            !mdsc->fsc->blocklisted);
+       if (ci->i_auth_cap == cap)
                ci->i_auth_cap = NULL;
-       }
 
        /* remove from session list */
        spin_lock(&session->s_cap_lock);
@@ -1201,12 +1169,34 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
                 * keep i_snap_realm.
                 */
                if (ci->i_wr_ref == 0 && ci->i_snap_realm)
-                       drop_inode_snap_realm(ci);
+                       ceph_change_snap_realm(&ci->vfs_inode, NULL);
 
                __cap_delay_cancel(mdsc, ci);
        }
 }
 
+void ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
+{
+       struct ceph_inode_info *ci = cap->ci;
+       struct ceph_fs_client *fsc;
+
+       /* 'ci' being NULL means the remove have already occurred */
+       if (!ci) {
+               dout("%s: cap inode is NULL\n", __func__);
+               return;
+       }
+
+       lockdep_assert_held(&ci->i_ceph_lock);
+
+       fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
+       WARN_ON_ONCE(ci->i_auth_cap == cap &&
+                    !list_empty(&ci->i_dirty_item) &&
+                    !fsc->blocklisted &&
+                    READ_ONCE(fsc->mount_state) != CEPH_MOUNT_SHUTDOWN);
+
+       __ceph_remove_cap(cap, queue_release);
+}
+
 struct cap_msg_args {
        struct ceph_mds_session *session;
        u64                     ino, cid, follows;
@@ -1335,7 +1325,7 @@ void __ceph_remove_caps(struct ceph_inode_info *ci)
        while (p) {
                struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node);
                p = rb_next(p);
-               __ceph_remove_cap(cap, true);
+               ceph_remove_cap(cap, true);
        }
        spin_unlock(&ci->i_ceph_lock);
 }
@@ -1743,7 +1733,14 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask,
 
 struct ceph_cap_flush *ceph_alloc_cap_flush(void)
 {
-       return kmem_cache_alloc(ceph_cap_flush_cachep, GFP_KERNEL);
+       struct ceph_cap_flush *cf;
+
+       cf = kmem_cache_alloc(ceph_cap_flush_cachep, GFP_KERNEL);
+       if (!cf)
+               return NULL;
+
+       cf->is_capsnap = false;
+       return cf;
 }
 
 void ceph_free_cap_flush(struct ceph_cap_flush *cf)
@@ -1778,7 +1775,7 @@ static bool __detach_cap_flush_from_mdsc(struct ceph_mds_client *mdsc,
                prev->wake = true;
                wake = false;
        }
-       list_del(&cf->g_list);
+       list_del_init(&cf->g_list);
        return wake;
 }
 
@@ -1793,7 +1790,7 @@ static bool __detach_cap_flush_from_ci(struct ceph_inode_info *ci,
                prev->wake = true;
                wake = false;
        }
-       list_del(&cf->i_list);
+       list_del_init(&cf->i_list);
        return wake;
 }
 
@@ -1852,6 +1849,8 @@ static u64 __mark_caps_flushing(struct inode *inode,
  * try to invalidate mapping pages without blocking.
  */
 static int try_nonblocking_invalidate(struct inode *inode)
+       __releases(ci->i_ceph_lock)
+       __acquires(ci->i_ceph_lock)
 {
        struct ceph_inode_info *ci = ceph_inode(inode);
        u32 invalidating_gen = ci->i_rdcache_gen;
@@ -2215,6 +2214,7 @@ static int caps_are_flushed(struct inode *inode, u64 flush_tid)
  */
 static int unsafe_request_wait(struct inode *inode)
 {
+       struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
        struct ceph_inode_info *ci = ceph_inode(inode);
        struct ceph_mds_request *req1 = NULL, *req2 = NULL;
        int ret, err = 0;
@@ -2234,6 +2234,81 @@ static int unsafe_request_wait(struct inode *inode)
        }
        spin_unlock(&ci->i_unsafe_lock);
 
+       /*
+        * Trigger to flush the journal logs in all the relevant MDSes
+        * manually, or in the worst case we must wait at most 5 seconds
+        * to wait the journal logs to be flushed by the MDSes periodically.
+        */
+       if (req1 || req2) {
+               struct ceph_mds_session **sessions = NULL;
+               struct ceph_mds_session *s;
+               struct ceph_mds_request *req;
+               unsigned int max;
+               int i;
+
+               /*
+                * The mdsc->max_sessions is unlikely to be changed
+                * mostly, here we will retry it by reallocating the
+                * sessions arrary memory to get rid of the mdsc->mutex
+                * lock.
+                */
+retry:
+               max = mdsc->max_sessions;
+               sessions = krealloc(sessions, max * sizeof(s), __GFP_ZERO);
+               if (!sessions)
+                       return -ENOMEM;
+
+               spin_lock(&ci->i_unsafe_lock);
+               if (req1) {
+                       list_for_each_entry(req, &ci->i_unsafe_dirops,
+                                           r_unsafe_dir_item) {
+                               s = req->r_session;
+                               if (unlikely(s->s_mds > max)) {
+                                       spin_unlock(&ci->i_unsafe_lock);
+                                       goto retry;
+                               }
+                               if (!sessions[s->s_mds]) {
+                                       s = ceph_get_mds_session(s);
+                                       sessions[s->s_mds] = s;
+                               }
+                       }
+               }
+               if (req2) {
+                       list_for_each_entry(req, &ci->i_unsafe_iops,
+                                           r_unsafe_target_item) {
+                               s = req->r_session;
+                               if (unlikely(s->s_mds > max)) {
+                                       spin_unlock(&ci->i_unsafe_lock);
+                                       goto retry;
+                               }
+                               if (!sessions[s->s_mds]) {
+                                       s = ceph_get_mds_session(s);
+                                       sessions[s->s_mds] = s;
+                               }
+                       }
+               }
+               spin_unlock(&ci->i_unsafe_lock);
+
+               /* the auth MDS */
+               spin_lock(&ci->i_ceph_lock);
+               if (ci->i_auth_cap) {
+                     s = ci->i_auth_cap->session;
+                     if (!sessions[s->s_mds])
+                             sessions[s->s_mds] = ceph_get_mds_session(s);
+               }
+               spin_unlock(&ci->i_ceph_lock);
+
+               /* send flush mdlog request to MDSes */
+               for (i = 0; i < max; i++) {
+                       s = sessions[i];
+                       if (s) {
+                               send_flush_mdlog(s);
+                               ceph_put_mds_session(s);
+                       }
+               }
+               kfree(sessions);
+       }
+
        dout("unsafe_request_wait %p wait on tid %llu %llu\n",
             inode, req1 ? req1->r_tid : 0ULL, req2 ? req2->r_tid : 0ULL);
        if (req1) {
@@ -2352,7 +2427,7 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
        ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
 
        list_for_each_entry_reverse(cf, &ci->i_cap_flush_list, i_list) {
-               if (!cf->caps) {
+               if (cf->is_capsnap) {
                        last_snap_flush = cf->tid;
                        break;
                }
@@ -2371,7 +2446,7 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
 
                first_tid = cf->tid + 1;
 
-               if (cf->caps) {
+               if (!cf->is_capsnap) {
                        struct cap_msg_args arg;
 
                        dout("kick_flushing_caps %p cap %p tid %llu %s\n",
@@ -3004,7 +3079,7 @@ static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had,
                        }
                        /* see comment in __ceph_remove_cap() */
                        if (!__ceph_is_any_real_caps(ci) && ci->i_snap_realm)
-                               drop_inode_snap_realm(ci);
+                               ceph_change_snap_realm(inode, NULL);
                }
        }
        if (check_flushsnaps && __ceph_have_pending_cap_snap(ci)) {
@@ -3110,7 +3185,16 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
                                break;
                        }
                }
-               BUG_ON(!found);
+
+               if (!found) {
+                       /*
+                        * The capsnap should already be removed when removing
+                        * auth cap in the case of a forced unmount.
+                        */
+                       WARN_ON_ONCE(ci->i_auth_cap);
+                       goto unlock;
+               }
+
                capsnap->dirty_pages -= nr;
                if (capsnap->dirty_pages == 0) {
                        complete_capsnap = true;
@@ -3132,6 +3216,7 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
                     complete_capsnap ? " (complete capsnap)" : "");
        }
 
+unlock:
        spin_unlock(&ci->i_ceph_lock);
 
        if (last) {
@@ -3516,7 +3601,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
                        cleaned = cf->caps;
 
                /* Is this a capsnap? */
-               if (cf->caps == 0)
+               if (cf->is_capsnap)
                        continue;
 
                if (cf->tid <= flush_tid) {
@@ -3589,8 +3674,9 @@ out:
        while (!list_empty(&to_remove)) {
                cf = list_first_entry(&to_remove,
                                      struct ceph_cap_flush, i_list);
-               list_del(&cf->i_list);
-               ceph_free_cap_flush(cf);
+               list_del_init(&cf->i_list);
+               if (!cf->is_capsnap)
+                       ceph_free_cap_flush(cf);
        }
 
        if (wake_ci)
@@ -3601,6 +3687,43 @@ out:
                iput(inode);
 }
 
+void __ceph_remove_capsnap(struct inode *inode, struct ceph_cap_snap *capsnap,
+                          bool *wake_ci, bool *wake_mdsc)
+{
+       struct ceph_inode_info *ci = ceph_inode(inode);
+       struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+       bool ret;
+
+       lockdep_assert_held(&ci->i_ceph_lock);
+
+       dout("removing capsnap %p, inode %p ci %p\n", capsnap, inode, ci);
+
+       list_del_init(&capsnap->ci_item);
+       ret = __detach_cap_flush_from_ci(ci, &capsnap->cap_flush);
+       if (wake_ci)
+               *wake_ci = ret;
+
+       spin_lock(&mdsc->cap_dirty_lock);
+       if (list_empty(&ci->i_cap_flush_list))
+               list_del_init(&ci->i_flushing_item);
+
+       ret = __detach_cap_flush_from_mdsc(mdsc, &capsnap->cap_flush);
+       if (wake_mdsc)
+               *wake_mdsc = ret;
+       spin_unlock(&mdsc->cap_dirty_lock);
+}
+
+void ceph_remove_capsnap(struct inode *inode, struct ceph_cap_snap *capsnap,
+                        bool *wake_ci, bool *wake_mdsc)
+{
+       struct ceph_inode_info *ci = ceph_inode(inode);
+
+       lockdep_assert_held(&ci->i_ceph_lock);
+
+       WARN_ON_ONCE(capsnap->dirty_pages || capsnap->writing);
+       __ceph_remove_capsnap(inode, capsnap, wake_ci, wake_mdsc);
+}
+
 /*
  * Handle FLUSHSNAP_ACK.  MDS has flushed snap data to disk and we can
  * throw away our cap_snap.
@@ -3638,23 +3761,10 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
                             capsnap, capsnap->follows);
                }
        }
-       if (flushed) {
-               WARN_ON(capsnap->dirty_pages || capsnap->writing);
-               dout(" removing %p cap_snap %p follows %lld\n",
-                    inode, capsnap, follows);
-               list_del(&capsnap->ci_item);
-               wake_ci |= __detach_cap_flush_from_ci(ci, &capsnap->cap_flush);
-
-               spin_lock(&mdsc->cap_dirty_lock);
-
-               if (list_empty(&ci->i_cap_flush_list))
-                       list_del_init(&ci->i_flushing_item);
-
-               wake_mdsc |= __detach_cap_flush_from_mdsc(mdsc,
-                                                         &capsnap->cap_flush);
-               spin_unlock(&mdsc->cap_dirty_lock);
-       }
+       if (flushed)
+               ceph_remove_capsnap(inode, capsnap, &wake_ci, &wake_mdsc);
        spin_unlock(&ci->i_ceph_lock);
+
        if (flushed) {
                ceph_put_snap_context(capsnap->context);
                ceph_put_cap_snap(capsnap);
@@ -3738,7 +3848,7 @@ retry:
                goto out_unlock;
 
        if (target < 0) {
-               __ceph_remove_cap(cap, false);
+               ceph_remove_cap(cap, false);
                goto out_unlock;
        }
 
@@ -3773,7 +3883,7 @@ retry:
                                change_auth_cap_ses(ci, tcap->session);
                        }
                }
-               __ceph_remove_cap(cap, false);
+               ceph_remove_cap(cap, false);
                goto out_unlock;
        } else if (tsession) {
                /* add placeholder for the export tagert */
@@ -3790,7 +3900,7 @@ retry:
                        spin_unlock(&mdsc->cap_dirty_lock);
                }
 
-               __ceph_remove_cap(cap, false);
+               ceph_remove_cap(cap, false);
                goto out_unlock;
        }
 
@@ -3901,7 +4011,7 @@ retry:
                                        ocap->mseq, mds, le32_to_cpu(ph->seq),
                                        le32_to_cpu(ph->mseq));
                }
-               __ceph_remove_cap(ocap, (ph->flags & CEPH_CAP_FLAG_RELEASE));
+               ceph_remove_cap(ocap, (ph->flags & CEPH_CAP_FLAG_RELEASE));
        }
 
        *old_issued = issued;
@@ -4129,8 +4239,9 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 done:
        mutex_unlock(&session->s_mutex);
 done_unlocked:
-       ceph_put_string(extra_info.pool_ns);
        iput(inode);
+out:
+       ceph_put_string(extra_info.pool_ns);
        return;
 
 flush_cap_releases:
@@ -4145,7 +4256,7 @@ flush_cap_releases:
 bad:
        pr_err("ceph_handle_caps: corrupt message\n");
        ceph_msg_dump(msg);
-       return;
+       goto out;
 }
 
 /*
@@ -4220,33 +4331,9 @@ static void flush_dirty_session_caps(struct ceph_mds_session *s)
        dout("flush_dirty_caps done\n");
 }
 
-static void iterate_sessions(struct ceph_mds_client *mdsc,
-                            void (*cb)(struct ceph_mds_session *))
-{
-       int mds;
-
-       mutex_lock(&mdsc->mutex);
-       for (mds = 0; mds < mdsc->max_sessions; ++mds) {
-               struct ceph_mds_session *s;
-
-               if (!mdsc->sessions[mds])
-                       continue;
-
-               s = ceph_get_mds_session(mdsc->sessions[mds]);
-               if (!s)
-                       continue;
-
-               mutex_unlock(&mdsc->mutex);
-               cb(s);
-               ceph_put_mds_session(s);
-               mutex_lock(&mdsc->mutex);
-       }
-       mutex_unlock(&mdsc->mutex);
-}
-
 void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
 {
-       iterate_sessions(mdsc, flush_dirty_session_caps);
+       ceph_mdsc_iterate_sessions(mdsc, flush_dirty_session_caps, true);
 }
 
 void __ceph_touch_fmode(struct ceph_inode_info *ci,