ceph: avoid dereferencing invalid pointer during cached readdir
authorYan, Zheng <zyan@redhat.com>
Mon, 27 Nov 2017 03:23:48 +0000 (11:23 +0800)
committerIlya Dryomov <idryomov@gmail.com>
Mon, 29 Jan 2018 17:36:07 +0000 (18:36 +0100)
Readdir cache keeps array of dentry pointers in page cache. If any
dentry in readdir cache gets pruned, ceph_d_prune() disables readdir
cache for later readdir syscall. The problem is that ceph_d_prune()
ignores unhashed dentry. Ideally MDS should have already revoked
CEPH_CAP_FILE_SHARED (which also disables readdir cache) when dentry
gets unhashed. But if it is somehow MDS does not properly revoke
CEPH_CAP_FILE_SHARED and the unhashed dentry gets pruned later,
ceph_d_prune() will not disable readdir cache, later readdir may
reference invalid dentry pointer.

The fix is make ceph_d_prune() do extra check for unhashed dentry.
Disable readdir cache if the unhashed dentry is still referenced
by readdir cache.

Another fix in this patch is handle d_splice_alias(). If a dentry
gets spliced into new parent dentry, treat it as if it was pruned
(call ceph_d_prune() for it).

Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
Acked-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
fs/ceph/dir.c
fs/ceph/inode.c

index d671d58..0c43468 100644 (file)
@@ -231,11 +231,17 @@ static int __dcache_readdir(struct file *file,  struct dir_context *ctx,
                        goto out;
                }
 
-               di = ceph_dentry(dentry);
                spin_lock(&dentry->d_lock);
-               if (di->lease_shared_gen == shared_gen &&
-                   d_really_is_positive(dentry) &&
-                   fpos_cmp(ctx->pos, di->offset) <= 0) {
+               di = ceph_dentry(dentry);
+               if (d_unhashed(dentry) ||
+                   d_really_is_negative(dentry) ||
+                   di->lease_shared_gen != shared_gen) {
+                       spin_unlock(&dentry->d_lock);
+                       dput(dentry);
+                       err = -EAGAIN;
+                       goto out;
+               }
+               if (fpos_cmp(ctx->pos, di->offset) <= 0) {
                        emit_dentry = true;
                }
                spin_unlock(&dentry->d_lock);
@@ -1324,24 +1330,37 @@ static void ceph_d_release(struct dentry *dentry)
  */
 static void ceph_d_prune(struct dentry *dentry)
 {
-       dout("ceph_d_prune %p\n", dentry);
+       struct ceph_inode_info *dir_ci;
+       struct ceph_dentry_info *di;
+
+       dout("ceph_d_prune %pd %p\n", dentry, dentry);
 
        /* do we have a valid parent? */
        if (IS_ROOT(dentry))
                return;
 
-       /* if we are not hashed, we don't affect dir's completeness */
-       if (d_unhashed(dentry))
+       /* we hold d_lock, so d_parent is stable */
+       dir_ci = ceph_inode(d_inode(dentry->d_parent));
+       if (dir_ci->i_vino.snap == CEPH_SNAPDIR)
                return;
 
-       if (ceph_snap(d_inode(dentry->d_parent)) == CEPH_SNAPDIR)
+       /* who calls d_delete() should also disable dcache readdir */
+       if (d_really_is_negative(dentry))
                return;
 
-       /*
-        * we hold d_lock, so d_parent is stable, and d_fsdata is never
-        * cleared until d_release
-        */
-       ceph_dir_clear_complete(d_inode(dentry->d_parent));
+       /* d_fsdata does not get cleared until d_release */
+       if (!d_unhashed(dentry)) {
+               __ceph_dir_clear_complete(dir_ci);
+               return;
+       }
+
+       /* Disable dcache readdir just in case that someone called d_drop()
+        * or d_invalidate(), but MDS didn't revoke CEPH_CAP_FILE_SHARED
+        * properly (dcache readdir is still enabled) */
+       di = ceph_dentry(dentry);
+       if (di->offset > 0 &&
+           di->lease_shared_gen == atomic_read(&dir_ci->i_shared_gen))
+               __ceph_dir_clear_ordered(dir_ci);
 }
 
 /*
index 6d02528..c6ec5aa 100644 (file)
@@ -1080,6 +1080,27 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in)
 
        BUG_ON(d_inode(dn));
 
+       if (S_ISDIR(in->i_mode)) {
+               /* If inode is directory, d_splice_alias() below will remove
+                * 'realdn' from its origin parent. We need to ensure that
+                * origin parent's readdir cache will not reference 'realdn'
+                */
+               realdn = d_find_any_alias(in);
+               if (realdn) {
+                       struct ceph_dentry_info *di = ceph_dentry(realdn);
+                       spin_lock(&realdn->d_lock);
+
+                       realdn->d_op->d_prune(realdn);
+
+                       di->time = jiffies;
+                       di->lease_shared_gen = 0;
+                       di->offset = 0;
+
+                       spin_unlock(&realdn->d_lock);
+                       dput(realdn);
+               }
+       }
+
        /* dn must be unhashed */
        if (!d_unhashed(dn))
                d_drop(dn);
@@ -1295,8 +1316,8 @@ retry_lookup:
                if (!rinfo->head->is_target) {
                        dout("fill_trace null dentry\n");
                        if (d_really_is_positive(dn)) {
-                               ceph_dir_clear_ordered(dir);
                                dout("d_delete %p\n", dn);
+                               ceph_dir_clear_ordered(dir);
                                d_delete(dn);
                        } else if (have_lease) {
                                if (d_unhashed(dn))
@@ -1323,7 +1344,6 @@ retry_lookup:
                        dout(" %p links to %p %llx.%llx, not %llx.%llx\n",
                             dn, d_inode(dn), ceph_vinop(d_inode(dn)),
                             ceph_vinop(in));
-                       ceph_dir_clear_ordered(dir);
                        d_invalidate(dn);
                        have_lease = false;
                }
@@ -1573,9 +1593,19 @@ retry_lookup:
                } else if (d_really_is_positive(dn) &&
                           (ceph_ino(d_inode(dn)) != tvino.ino ||
                            ceph_snap(d_inode(dn)) != tvino.snap)) {
+                       struct ceph_dentry_info *di = ceph_dentry(dn);
                        dout(" dn %p points to wrong inode %p\n",
                             dn, d_inode(dn));
-                       __ceph_dir_clear_ordered(ci);
+
+                       spin_lock(&dn->d_lock);
+                       if (di->offset > 0 &&
+                           di->lease_shared_gen ==
+                           atomic_read(&ci->i_shared_gen)) {
+                               __ceph_dir_clear_ordered(ci);
+                               di->offset = 0;
+                       }
+                       spin_unlock(&dn->d_lock);
+
                        d_delete(dn);
                        dput(dn);
                        goto retry_lookup;
@@ -1600,9 +1630,7 @@ retry_lookup:
                                 &req->r_caps_reservation);
                if (ret < 0) {
                        pr_err("fill_inode badness on %p\n", in);
-                       if (d_really_is_positive(dn))
-                               __ceph_dir_clear_ordered(ci);
-                       else
+                       if (d_really_is_negative(dn))
                                iput(in);
                        d_drop(dn);
                        err = ret;