mm/hugetlbfs: unmap pages if page fault raced with hole punch
[linux-2.6-microblaze.git] / fs / namei.c
index 0c3974c..bceefd5 100644 (file)
@@ -505,13 +505,13 @@ struct nameidata {
        int             total_link_count;
        struct saved {
                struct path link;
-               void *cookie;
+               struct delayed_call done;
                const char *name;
-               struct inode *inode;
                unsigned seq;
        } *stack, internal[EMBEDDED_LEVELS];
        struct filename *name;
        struct nameidata *saved;
+       struct inode    *link_inode;
        unsigned        root_seq;
        int             dfd;
 };
@@ -534,10 +534,8 @@ static void restore_nameidata(void)
        current->nameidata = old;
        if (old)
                old->total_link_count = now->total_link_count;
-       if (now->stack != now->internal) {
+       if (now->stack != now->internal)
                kfree(now->stack);
-               now->stack = now->internal;
-       }
 }
 
 static int __nd_alloc_stack(struct nameidata *nd)
@@ -592,11 +590,8 @@ static void drop_links(struct nameidata *nd)
        int i = nd->depth;
        while (i--) {
                struct saved *last = nd->stack + i;
-               struct inode *inode = last->inode;
-               if (last->cookie && inode->i_op->put_link) {
-                       inode->i_op->put_link(inode, last->cookie);
-                       last->cookie = NULL;
-               }
+               do_delayed_call(&last->done);
+               clear_delayed_call(&last->done);
        }
 }
 
@@ -657,7 +652,7 @@ static bool legitimize_links(struct nameidata *nd)
  * Path walking has 2 modes, rcu-walk and ref-walk (see
  * Documentation/filesystems/path-lookup.txt).  In situations when we can't
  * continue in RCU mode, we attempt to drop out of rcu-walk mode and grab
- * normal reference counts on dentries and vfsmounts to transition to rcu-walk
+ * normal reference counts on dentries and vfsmounts to transition to ref-walk
  * mode.  Refcounts are grabbed at the last known good point before rcu-walk
  * got stuck, so ref-walk may continue from there. If this is not successful
  * (eg. a seqcount has changed), then failure is returned and it's up to caller
@@ -806,20 +801,20 @@ static int complete_walk(struct nameidata *nd)
 }
 
 static void set_root(struct nameidata *nd)
-{
-       get_fs_root(current->fs, &nd->root);
-}
-
-static void set_root_rcu(struct nameidata *nd)
 {
        struct fs_struct *fs = current->fs;
-       unsigned seq;
 
-       do {
-               seq = read_seqcount_begin(&fs->seq);
-               nd->root = fs->root;
-               nd->root_seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
-       } while (read_seqcount_retry(&fs->seq, seq));
+       if (nd->flags & LOOKUP_RCU) {
+               unsigned seq;
+
+               do {
+                       seq = read_seqcount_begin(&fs->seq);
+                       nd->root = fs->root;
+                       nd->root_seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
+               } while (read_seqcount_retry(&fs->seq, seq));
+       } else {
+               get_fs_root(fs, &nd->root);
+       }
 }
 
 static void path_put_conditional(struct path *path, struct nameidata *nd)
@@ -841,8 +836,28 @@ static inline void path_to_nameidata(const struct path *path,
        nd->path.dentry = path->dentry;
 }
 
+static int nd_jump_root(struct nameidata *nd)	/* move nd->path to nd->root; 0 or -ECHILD */
+{
+       if (nd->flags & LOOKUP_RCU) {	/* rcu-walk: no references are taken here */
+               struct dentry *d;
+               nd->path = nd->root;
+               d = nd->path.dentry;
+               nd->inode = d->d_inode;
+               nd->seq = nd->root_seq;
+               if (unlikely(read_seqcount_retry(&d->d_seq, nd->seq)))	/* d_seq moved since root_seq */
+                       return -ECHILD;
+       } else {
+               path_put(&nd->path);	/* drop the reference on the old position */
+               nd->path = nd->root;
+               path_get(&nd->path);	/* nd->path now holds its own reference on the root */
+               nd->inode = nd->path.dentry->d_inode;
+       }
+       nd->flags |= LOOKUP_JUMPED;	/* set on both successful paths */
+       return 0;
+}
+
 /*
- * Helper to directly jump to a known parsed path from ->follow_link,
+ * Helper to directly jump to a known parsed path from ->get_link,
  * caller must have taken a reference to path beforehand.
  */
 void nd_jump_link(struct path *path)
@@ -858,9 +873,7 @@ void nd_jump_link(struct path *path)
 static inline void put_link(struct nameidata *nd)
 {
        struct saved *last = nd->stack + --nd->depth;
-       struct inode *inode = last->inode;
-       if (last->cookie && inode->i_op->put_link)
-               inode->i_op->put_link(inode, last->cookie);
+       do_delayed_call(&last->done);
        if (!(nd->flags & LOOKUP_RCU))
                path_put(&last->link);
 }
@@ -892,7 +905,7 @@ static inline int may_follow_link(struct nameidata *nd)
                return 0;
 
        /* Allowed if owner and follower match. */
-       inode = nd->stack[0].inode;
+       inode = nd->link_inode;
        if (uid_eq(current_cred()->fsuid, inode->i_uid))
                return 0;
 
@@ -983,7 +996,7 @@ const char *get_link(struct nameidata *nd)
 {
        struct saved *last = nd->stack + nd->depth - 1;
        struct dentry *dentry = last->link.dentry;
-       struct inode *inode = last->inode;
+       struct inode *inode = nd->link_inode;
        int error;
        const char *res;
 
@@ -1004,36 +1017,27 @@ const char *get_link(struct nameidata *nd)
        nd->last_type = LAST_BIND;
        res = inode->i_link;
        if (!res) {
+               const char * (*get)(struct dentry *, struct inode *,
+                               struct delayed_call *);
+               get = inode->i_op->get_link;
                if (nd->flags & LOOKUP_RCU) {
-                       if (unlikely(unlazy_walk(nd, NULL, 0)))
-                               return ERR_PTR(-ECHILD);
+                       res = get(NULL, inode, &last->done);
+                       if (res == ERR_PTR(-ECHILD)) {
+                               if (unlikely(unlazy_walk(nd, NULL, 0)))
+                                       return ERR_PTR(-ECHILD);
+                               res = get(dentry, inode, &last->done);
+                       }
+               } else {
+                       res = get(dentry, inode, &last->done);
                }
-               res = inode->i_op->follow_link(dentry, &last->cookie);
-               if (IS_ERR_OR_NULL(res)) {
-                       last->cookie = NULL;
+               if (IS_ERR_OR_NULL(res))
                        return res;
-               }
        }
        if (*res == '/') {
-               if (nd->flags & LOOKUP_RCU) {
-                       struct dentry *d;
-                       if (!nd->root.mnt)
-                               set_root_rcu(nd);
-                       nd->path = nd->root;
-                       d = nd->path.dentry;
-                       nd->inode = d->d_inode;
-                       nd->seq = nd->root_seq;
-                       if (unlikely(read_seqcount_retry(&d->d_seq, nd->seq)))
-                               return ERR_PTR(-ECHILD);
-               } else {
-                       if (!nd->root.mnt)
-                               set_root(nd);
-                       path_put(&nd->path);
-                       nd->path = nd->root;
-                       path_get(&nd->root);
-                       nd->inode = nd->path.dentry->d_inode;
-               }
-               nd->flags |= LOOKUP_JUMPED;
+               if (!nd->root.mnt)
+                       set_root(nd);
+               if (unlikely(nd_jump_root(nd)))
+                       return ERR_PTR(-ECHILD);
                while (unlikely(*++res == '/'))
                        ;
        }
@@ -1294,8 +1298,6 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
 static int follow_dotdot_rcu(struct nameidata *nd)
 {
        struct inode *inode = nd->inode;
-       if (!nd->root.mnt)
-               set_root_rcu(nd);
 
        while (1) {
                if (path_equal(&nd->path, &nd->root))
@@ -1415,9 +1417,6 @@ static void follow_mount(struct path *path)
 
 static int follow_dotdot(struct nameidata *nd)
 {
-       if (!nd->root.mnt)
-               set_root(nd);
-
        while(1) {
                struct dentry *old = nd->path.dentry;
 
@@ -1655,6 +1654,8 @@ static inline int may_lookup(struct nameidata *nd)
 static inline int handle_dots(struct nameidata *nd, int type)
 {
        if (type == LAST_DOTDOT) {
+               if (!nd->root.mnt)
+                       set_root(nd);
                if (nd->flags & LOOKUP_RCU) {
                        return follow_dotdot_rcu(nd);
                } else
@@ -1691,8 +1692,8 @@ static int pick_link(struct nameidata *nd, struct path *link,
 
        last = nd->stack + nd->depth++;
        last->link = *link;
-       last->cookie = NULL;
-       last->inode = inode;
+       clear_delayed_call(&last->done);
+       nd->link_inode = inode;
        last->seq = seq;
        return 1;
 }
@@ -2020,18 +2021,19 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
        }
 
        nd->root.mnt = NULL;
+       nd->path.mnt = NULL;
+       nd->path.dentry = NULL;
 
        nd->m_seq = read_seqbegin(&mount_lock);
        if (*s == '/') {
-               if (flags & LOOKUP_RCU) {
+               if (flags & LOOKUP_RCU)
                        rcu_read_lock();
-                       set_root_rcu(nd);
-                       nd->seq = nd->root_seq;
-               } else {
-                       set_root(nd);
-                       path_get(&nd->root);
-               }
-               nd->path = nd->root;
+               set_root(nd);
+               if (likely(!nd_jump_root(nd)))
+                       return s;
+               nd->root.mnt = NULL;
+               rcu_read_unlock();
+               return ERR_PTR(-ECHILD);
        } else if (nd->dfd == AT_FDCWD) {
                if (flags & LOOKUP_RCU) {
                        struct fs_struct *fs = current->fs;
@@ -2042,11 +2044,14 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
                        do {
                                seq = read_seqcount_begin(&fs->seq);
                                nd->path = fs->pwd;
+                               nd->inode = nd->path.dentry->d_inode;
                                nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
                        } while (read_seqcount_retry(&fs->seq, seq));
                } else {
                        get_fs_pwd(current->fs, &nd->path);
+                       nd->inode = nd->path.dentry->d_inode;
                }
+               return s;
        } else {
                /* Caller must check execute permissions on the starting path component */
                struct fd f = fdget_raw(nd->dfd);
@@ -2076,16 +2081,6 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
                fdput(f);
                return s;
        }
-
-       nd->inode = nd->path.dentry->d_inode;
-       if (!(flags & LOOKUP_RCU))
-               return s;
-       if (likely(!read_seqcount_retry(&nd->path.dentry->d_seq, nd->seq)))
-               return s;
-       if (!(nd->flags & LOOKUP_ROOT))
-               nd->root.mnt = NULL;
-       rcu_read_unlock();
-       return ERR_PTR(-ECHILD);
 }
 
 static const char *trailing_symlink(struct nameidata *nd)
@@ -2278,6 +2273,8 @@ EXPORT_SYMBOL(vfs_path_lookup);
  *
  * Note that this routine is purely a helper for filesystem usage and should
  * not be called by generic code.
+ *
+ * The caller must hold base->i_mutex.
  */
 struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
 {
@@ -2321,6 +2318,75 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
 }
 EXPORT_SYMBOL(lookup_one_len);
 
+/**
+ * lookup_one_len_unlocked - filesystem helper to look up a single pathname component
+ * @name:      pathname component to look up
+ * @base:      base directory to look up from
+ * @len:       number of bytes of @name that make up the component
+ *
+ * Note that this routine is purely a helper for filesystem usage and should
+ * not be called by generic code.
+ *
+ * Unlike lookup_one_len, call it without the parent i_mutex held; it takes
+ * the i_mutex itself if necessary.  Returns a dentry or an ERR_PTR().
+ */
+struct dentry *lookup_one_len_unlocked(const char *name,
+                                      struct dentry *base, int len)
+{
+       struct qstr this;
+       unsigned int c;
+       int err;
+       struct dentry *ret;
+
+       this.name = name;
+       this.len = len;
+       this.hash = full_name_hash(name, len);
+       if (!len)
+               return ERR_PTR(-EACCES);
+
+       if (unlikely(name[0] == '.')) {	/* refuse "." and ".." */
+               if (len < 2 || (len == 2 && name[1] == '.'))
+                       return ERR_PTR(-EACCES);
+       }
+
+       while (len--) {	/* a component may not contain '/' or NUL */
+               c = *(const unsigned char *)name++;
+               if (c == '/' || c == '\0')
+                       return ERR_PTR(-EACCES);
+       }
+       /*
+        * See if the low-level filesystem might want
+        * to use its own hash..
+        */
+       if (base->d_flags & DCACHE_OP_HASH) {
+               err = base->d_op->d_hash(base, &this);
+               if (err < 0)
+                       return ERR_PTR(err);
+       }
+
+       err = inode_permission(base->d_inode, MAY_EXEC);
+       if (err)
+               return ERR_PTR(err);
+
+       /*
+        * __d_lookup() is used to try to get a quick answer and avoid the
+        * mutex.  A false-negative does no harm.
+        */
+       ret = __d_lookup(base, &this);
+       if (ret && unlikely(ret->d_flags & DCACHE_OP_REVALIDATE)) {
+               dput(ret);
+               ret = NULL;
+       }
+       if (ret)
+               return ret;
+
+       mutex_lock(&base->d_inode->i_mutex);
+       ret = __lookup_hash(&this, base, 0);
+       mutex_unlock(&base->d_inode->i_mutex);
+       return ret;
+}
+EXPORT_SYMBOL(lookup_one_len_unlocked);
+
 int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
                 struct path *path, int *empty)
 {
@@ -2669,10 +2735,6 @@ static int may_open(struct path *path, int acc_mode, int flag)
        struct inode *inode = dentry->d_inode;
        int error;
 
-       /* O_PATH? */
-       if (!acc_mode)
-               return 0;
-
        if (!inode)
                return -ENOENT;
 
@@ -2694,7 +2756,7 @@ static int may_open(struct path *path, int acc_mode, int flag)
                break;
        }
 
-       error = inode_permission(inode, acc_mode);
+       error = inode_permission(inode, MAY_OPEN | acc_mode);
        if (error)
                return error;
 
@@ -2886,7 +2948,7 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry,
        if (*opened & FILE_CREATED) {
                WARN_ON(!(open_flag & O_CREAT));
                fsnotify_create(dir, dentry);
-               acc_mode = MAY_OPEN;
+               acc_mode = 0;
        }
        error = may_open(&file->f_path, acc_mode, open_flag);
        if (error)
@@ -3099,7 +3161,7 @@ retry_lookup:
                /* Don't check for write permission, don't truncate */
                open_flag &= ~O_TRUNC;
                will_truncate = false;
-               acc_mode = MAY_OPEN;
+               acc_mode = 0;
                path_to_nameidata(&path, nd);
                goto finish_open_created;
        }
@@ -3183,10 +3245,11 @@ finish_open:
                got_write = true;
        }
 finish_open_created:
-       error = may_open(&nd->path, acc_mode, open_flag);
-       if (error)
-               goto out;
-
+       if (likely(!(open_flag & O_PATH))) {
+               error = may_open(&nd->path, acc_mode, open_flag);
+               if (error)
+                       goto out;
+       }
        BUG_ON(*opened & FILE_OPENED); /* once it's opened, it's opened */
        error = vfs_open(&nd->path, file, current_cred());
        if (!error) {
@@ -3273,7 +3336,7 @@ static int do_tmpfile(struct nameidata *nd, unsigned flags,
                goto out2;
        audit_inode(nd->name, child, 0);
        /* Don't check for other permissions, the inode was just created */
-       error = may_open(&path, MAY_OPEN, op->open_flag);
+       error = may_open(&path, 0, op->open_flag);
        if (error)
                goto out2;
        file->f_path.mnt = path.mnt;
@@ -4495,72 +4558,73 @@ EXPORT_SYMBOL(readlink_copy);
 
 /*
  * A helper for ->readlink().  This should be used *ONLY* for symlinks that
- * have ->follow_link() touching nd only in nd_set_link().  Using (or not
- * using) it for any given inode is up to filesystem.
+ * have a ->get_link() that never calls nd_jump_link().  Using (or not using)
+ * it for any given inode is up to the filesystem.
  */
 int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen)
 {
-       void *cookie;
+       DEFINE_DELAYED_CALL(done);
        struct inode *inode = d_inode(dentry);
        const char *link = inode->i_link;
        int res;
 
        if (!link) {
-               link = inode->i_op->follow_link(dentry, &cookie);
+               link = inode->i_op->get_link(dentry, inode, &done);
                if (IS_ERR(link))
                        return PTR_ERR(link);
        }
        res = readlink_copy(buffer, buflen, link);
-       if (inode->i_op->put_link)
-               inode->i_op->put_link(inode, cookie);
+       do_delayed_call(&done);
        return res;
 }
 EXPORT_SYMBOL(generic_readlink);
 
 /* get the link contents into pagecache */
-static char *page_getlink(struct dentry * dentry, struct page **ppage)
+const char *page_get_link(struct dentry *dentry, struct inode *inode,
+                         struct delayed_call *callback)
 {
        char *kaddr;
        struct page *page;
-       struct address_space *mapping = dentry->d_inode->i_mapping;
-       page = read_mapping_page(mapping, 0, NULL);
-       if (IS_ERR(page))
-               return (char*)page;
-       *ppage = page;
-       kaddr = kmap(page);
-       nd_terminate_link(kaddr, dentry->d_inode->i_size, PAGE_SIZE - 1);
+       struct address_space *mapping = inode->i_mapping;
+
+       if (!dentry) {
+               page = find_get_page(mapping, 0);
+               if (!page)
+                       return ERR_PTR(-ECHILD);
+               if (!PageUptodate(page)) {
+                       put_page(page);
+                       return ERR_PTR(-ECHILD);
+               }
+       } else {
+               page = read_mapping_page(mapping, 0, NULL);
+               if (IS_ERR(page))
+                       return (char*)page;
+       }
+       set_delayed_call(callback, page_put_link, page);
+       BUG_ON(mapping_gfp_mask(mapping) & __GFP_HIGHMEM);
+       kaddr = page_address(page);
+       nd_terminate_link(kaddr, inode->i_size, PAGE_SIZE - 1);
        return kaddr;
 }
 
-int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
-{
-       struct page *page = NULL;
-       int res = readlink_copy(buffer, buflen, page_getlink(dentry, &page));
-       if (page) {
-               kunmap(page);
-               page_cache_release(page);
-       }
-       return res;
-}
-EXPORT_SYMBOL(page_readlink);
+EXPORT_SYMBOL(page_get_link);
 
-const char *page_follow_link_light(struct dentry *dentry, void **cookie)
+void page_put_link(void *arg)
 {
-       struct page *page = NULL;
-       char *res = page_getlink(dentry, &page);
-       if (!IS_ERR(res))
-               *cookie = page;
-       return res;
+       put_page(arg);
 }
-EXPORT_SYMBOL(page_follow_link_light);
+EXPORT_SYMBOL(page_put_link);
 
-void page_put_link(struct inode *unused, void *cookie)
+int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
 {
-       struct page *page = cookie;
-       kunmap(page);
-       page_cache_release(page);
+       DEFINE_DELAYED_CALL(done);
+       int res = readlink_copy(buffer, buflen,
+                               page_get_link(dentry, d_inode(dentry),
+                                             &done));
+       do_delayed_call(&done);
+       return res;
 }
-EXPORT_SYMBOL(page_put_link);
+EXPORT_SYMBOL(page_readlink);
 
 /*
  * The nofs argument instructs pagecache_write_begin to pass AOP_FLAG_NOFS
@@ -4571,7 +4635,6 @@ int __page_symlink(struct inode *inode, const char *symname, int len, int nofs)
        struct page *page;
        void *fsdata;
        int err;
-       char *kaddr;
        unsigned int flags = AOP_FLAG_UNINTERRUPTIBLE;
        if (nofs)
                flags |= AOP_FLAG_NOFS;
@@ -4582,9 +4645,7 @@ retry:
        if (err)
                goto fail;
 
-       kaddr = kmap_atomic(page);
-       memcpy(kaddr, symname, len-1);
-       kunmap_atomic(kaddr);
+       memcpy(page_address(page), symname, len-1);
 
        err = pagecache_write_end(NULL, mapping, 0, len-1, len-1,
                                                        page, fsdata);
@@ -4609,7 +4670,6 @@ EXPORT_SYMBOL(page_symlink);
 
 const struct inode_operations page_symlink_inode_operations = {
        .readlink       = generic_readlink,
-       .follow_link    = page_follow_link_light,
-       .put_link       = page_put_link,
+       .get_link       = page_get_link,
 };
 EXPORT_SYMBOL(page_symlink_inode_operations);