Merge branch 'x86-cpu-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git...
[linux-2.6-microblaze.git] / fs / namei.c
index 30145f8..c386a32 100644 (file)
@@ -35,6 +35,8 @@
 #include <linux/fs_struct.h>
 #include <linux/posix_acl.h>
 #include <linux/hash.h>
+#include <linux/bitops.h>
+#include <linux/init_task.h>
 #include <asm/uaccess.h>
 
 #include "internal.h"
@@ -265,7 +267,7 @@ static int check_acl(struct inode *inode, int mask)
                if (!acl)
                        return -EAGAIN;
                /* no ->get_acl() calls in RCU mode... */
-               if (acl == ACL_NOT_CACHED)
+               if (is_uncached_acl(acl))
                        return -ECHILD;
                return posix_acl_permission(inode, acl, mask & ~MAY_NOT_BLOCK);
        }
@@ -409,6 +411,14 @@ int __inode_permission(struct inode *inode, int mask)
                 */
                if (IS_IMMUTABLE(inode))
                        return -EACCES;
+
+               /*
+                * Updating mtime will likely cause i_uid and i_gid to be
+                * written back improperly if their true value is unknown
+                * to the vfs.
+                */
+               if (HAS_UNMAPPED_ID(inode))
+                       return -EACCES;
        }
 
        retval = do_inode_permission(inode, mask);
@@ -900,6 +910,7 @@ static inline int may_follow_link(struct nameidata *nd)
 {
        const struct inode *inode;
        const struct inode *parent;
+       kuid_t puid;
 
        if (!sysctl_protected_symlinks)
                return 0;
@@ -915,7 +926,8 @@ static inline int may_follow_link(struct nameidata *nd)
                return 0;
 
        /* Allowed if parent directory and link owner match. */
-       if (uid_eq(parent->i_uid, inode->i_uid))
+       puid = parent->i_uid;
+       if (uid_valid(puid) && uid_eq(puid, inode->i_uid))
                return 0;
 
        if (nd->flags & LOOKUP_RCU)
@@ -1088,6 +1100,7 @@ static int follow_automount(struct path *path, struct nameidata *nd,
                            bool *need_mntput)
 {
        struct vfsmount *mnt;
+       const struct cred *old_cred;
        int err;
 
        if (!path->dentry->d_op || !path->dentry->d_op->d_automount)
@@ -1109,11 +1122,16 @@ static int follow_automount(struct path *path, struct nameidata *nd,
            path->dentry->d_inode)
                return -EISDIR;
 
+       if (path->dentry->d_sb->s_user_ns != &init_user_ns)
+               return -EACCES;
+
        nd->total_link_count++;
        if (nd->total_link_count >= 40)
                return -ELOOP;
 
+       old_cred = override_creds(&init_cred);
        mnt = path->dentry->d_op->d_automount(path);
+       revert_creds(old_cred);
        if (IS_ERR(mnt)) {
                /*
                 * The filesystem is allowed to return -EISDIR here to indicate
@@ -1415,21 +1433,28 @@ static void follow_mount(struct path *path)
        }
 }
 
+static int path_parent_directory(struct path *path)
+{
+       struct dentry *old = path->dentry;
+       /* rare case of legitimate dget_parent()... */
+       path->dentry = dget_parent(path->dentry);
+       dput(old);
+       if (unlikely(!path_connected(path)))
+               return -ENOENT;
+       return 0;
+}
+
 static int follow_dotdot(struct nameidata *nd)
 {
        while(1) {
-               struct dentry *old = nd->path.dentry;
-
                if (nd->path.dentry == nd->root.dentry &&
                    nd->path.mnt == nd->root.mnt) {
                        break;
                }
                if (nd->path.dentry != nd->path.mnt->mnt_root) {
-                       /* rare case of legitimate dget_parent()... */
-                       nd->path.dentry = dget_parent(nd->path.dentry);
-                       dput(old);
-                       if (unlikely(!path_connected(&nd->path)))
-                               return -ENOENT;
+                       int ret = path_parent_directory(&nd->path);
+                       if (ret)
+                               return ret;
                        break;
                }
                if (!follow_up(&nd->path))
@@ -1441,9 +1466,8 @@ static int follow_dotdot(struct nameidata *nd)
 }
 
 /*
- * This looks up the name in dcache, possibly revalidates the old dentry and
- * allocates a new one if not found or not valid.  In the need_lookup argument
- * returns whether i_op->lookup is necessary.
+ * This looks up the name in dcache and possibly revalidates the found dentry.
+ * NULL is returned if the dentry does not exist in the cache.
  */
 static struct dentry *lookup_dcache(const struct qstr *name,
                                    struct dentry *dir,
@@ -1603,32 +1627,42 @@ static struct dentry *lookup_slow(const struct qstr *name,
                                  struct dentry *dir,
                                  unsigned int flags)
 {
-       struct dentry *dentry;
-       inode_lock(dir->d_inode);
-       dentry = d_lookup(dir, name);
-       if (unlikely(dentry)) {
+       struct dentry *dentry = ERR_PTR(-ENOENT), *old;
+       struct inode *inode = dir->d_inode;
+       DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
+
+       inode_lock_shared(inode);
+       /* Don't go there if it's already dead */
+       if (unlikely(IS_DEADDIR(inode)))
+               goto out;
+again:
+       dentry = d_alloc_parallel(dir, name, &wq);
+       if (IS_ERR(dentry))
+               goto out;
+       if (unlikely(!d_in_lookup(dentry))) {
                if ((dentry->d_flags & DCACHE_OP_REVALIDATE) &&
                    !(flags & LOOKUP_NO_REVAL)) {
                        int error = d_revalidate(dentry, flags);
                        if (unlikely(error <= 0)) {
-                               if (!error)
+                               if (!error) {
                                        d_invalidate(dentry);
+                                       dput(dentry);
+                                       goto again;
+                               }
                                dput(dentry);
                                dentry = ERR_PTR(error);
                        }
                }
-               if (dentry) {
-                       inode_unlock(dir->d_inode);
-                       return dentry;
+       } else {
+               old = inode->i_op->lookup(inode, dentry, flags);
+               d_lookup_done(dentry);
+               if (unlikely(old)) {
+                       dput(dentry);
+                       dentry = old;
                }
        }
-       dentry = d_alloc(dir, name);
-       if (unlikely(!dentry)) {
-               inode_unlock(dir->d_inode);
-               return ERR_PTR(-ENOMEM);
-       }
-       dentry = lookup_real(dir->d_inode, dentry, flags);
-       inode_unlock(dir->d_inode);
+out:
+       inode_unlock_shared(inode);
        return dentry;
 }
 
@@ -1787,88 +1821,200 @@ static int walk_component(struct nameidata *nd, int flags)
 
 #include <asm/word-at-a-time.h>
 
-#ifdef CONFIG_64BIT
+#ifdef HASH_MIX
 
-static inline unsigned int fold_hash(unsigned long hash)
+/* Architecture provides HASH_MIX and fold_hash() in <asm/hash.h> */
+
+#elif defined(CONFIG_64BIT)
+/*
+ * Register pressure in the mixing function is an issue, particularly
+ * on 32-bit x86, but almost any function requires one state value and
+ * one temporary.  Instead, use a function designed for two state values
+ * and no temporaries.
+ *
+ * This function cannot create a collision in only two iterations, so
+ * we have two iterations to achieve avalanche.  In those two iterations,
+ * we have six layers of mixing, which is enough to spread one bit's
+ * influence out to 2^6 = 64 state bits.
+ *
+ * Rotate constants are scored by considering either 64 one-bit input
+ * deltas or 64*63/2 = 2016 two-bit input deltas, and finding the
+ * probability of that delta causing a change to each of the 128 output
+ * bits, using a sample of random initial states.
+ *
+ * The Shannon entropy of the computed probabilities is then summed
+ * to produce a score.  Ideally, any input change has a 50% chance of
+ * toggling any given output bit.
+ *
+ * Mixing scores (in bits) for (12,45):
+ * Input delta: 1-bit      2-bit
+ * 1 round:     713.3    42542.6
+ * 2 rounds:   2753.7   140389.8
+ * 3 rounds:   5954.1   233458.2
+ * 4 rounds:   7862.6   256672.2
+ * Perfect:    8192     258048
+ *            (64*128) (64*63/2 * 128)
+ */
+#define HASH_MIX(x, y, a)      \
+       (       x ^= (a),       \
+       y ^= x, x = rol64(x,12),\
+       x += y, y = rol64(y,45),\
+       y *= 9                  )
+
+/*
+ * Fold two longs into one 32-bit hash value.  This must be fast, but
+ * latency isn't quite as critical, as there is a fair bit of additional
+ * work done before the hash value is used.
+ */
+static inline unsigned int fold_hash(unsigned long x, unsigned long y)
 {
-       return hash_64(hash, 32);
+       y ^= x * GOLDEN_RATIO_64;
+       y *= GOLDEN_RATIO_64;
+       return y >> 32;
 }
 
 #else  /* 32-bit case */
 
-#define fold_hash(x) (x)
+/*
+ * Mixing scores (in bits) for (7,20):
+ * Input delta: 1-bit      2-bit
+ * 1 round:     330.3     9201.6
+ * 2 rounds:   1246.4    25475.4
+ * 3 rounds:   1907.1    31295.1
+ * 4 rounds:   2042.3    31718.6
+ * Perfect:    2048      31744
+ *            (32*64)   (32*31/2 * 64)
+ */
+#define HASH_MIX(x, y, a)      \
+       (       x ^= (a),       \
+       y ^= x, x = rol32(x, 7),\
+       x += y, y = rol32(y,20),\
+       y *= 9                  )
+
+static inline unsigned int fold_hash(unsigned long x, unsigned long y)
+{
+       /* Use arch-optimized multiply if one exists */
+       return __hash_32(y ^ __hash_32(x));
+}
 
 #endif
 
-unsigned int full_name_hash(const unsigned char *name, unsigned int len)
+/*
+ * Return the hash of a string of known length.  This is carefully
+ * designed to match hash_name(), which is the more critical function.
+ * In particular, we must end by hashing a final word containing 0..7
+ * payload bytes, to match the way that hash_name() iterates until it
+ * finds the delimiter after the name.
+ */
+unsigned int full_name_hash(const void *salt, const char *name, unsigned int len)
 {
-       unsigned long a, mask;
-       unsigned long hash = 0;
+       unsigned long a, x = 0, y = (unsigned long)salt;
 
        for (;;) {
+               if (!len)
+                       goto done;
                a = load_unaligned_zeropad(name);
                if (len < sizeof(unsigned long))
                        break;
-               hash += a;
-               hash *= 9;
+               HASH_MIX(x, y, a);
                name += sizeof(unsigned long);
                len -= sizeof(unsigned long);
-               if (!len)
-                       goto done;
        }
-       mask = bytemask_from_count(len);
-       hash += mask & a;
+       x ^= a & bytemask_from_count(len);
 done:
-       return fold_hash(hash);
+       return fold_hash(x, y);
 }
 EXPORT_SYMBOL(full_name_hash);
 
+/* Return the "hash_len" (hash and length) of a null-terminated string */
+u64 hashlen_string(const void *salt, const char *name)
+{
+       unsigned long a = 0, x = 0, y = (unsigned long)salt;
+       unsigned long adata, mask, len;
+       const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
+
+       len = 0;
+       goto inside;
+
+       do {
+               HASH_MIX(x, y, a);
+               len += sizeof(unsigned long);
+inside:
+               a = load_unaligned_zeropad(name+len);
+       } while (!has_zero(a, &adata, &constants));
+
+       adata = prep_zero_mask(a, adata, &constants);
+       mask = create_zero_mask(adata);
+       x ^= a & zero_bytemask(mask);
+
+       return hashlen_create(fold_hash(x, y), len + find_zero(mask));
+}
+EXPORT_SYMBOL(hashlen_string);
+
 /*
  * Calculate the length and hash of the path component, and
  * return the "hash_len" as the result.
  */
-static inline u64 hash_name(const char *name)
+static inline u64 hash_name(const void *salt, const char *name)
 {
-       unsigned long a, b, adata, bdata, mask, hash, len;
+       unsigned long a = 0, b, x = 0, y = (unsigned long)salt;
+       unsigned long adata, bdata, mask, len;
        const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
 
-       hash = a = 0;
-       len = -sizeof(unsigned long);
+       len = 0;
+       goto inside;
+
        do {
-               hash = (hash + a) * 9;
+               HASH_MIX(x, y, a);
                len += sizeof(unsigned long);
+inside:
                a = load_unaligned_zeropad(name+len);
                b = a ^ REPEAT_BYTE('/');
        } while (!(has_zero(a, &adata, &constants) | has_zero(b, &bdata, &constants)));
 
        adata = prep_zero_mask(a, adata, &constants);
        bdata = prep_zero_mask(b, bdata, &constants);
-
        mask = create_zero_mask(adata | bdata);
+       x ^= a & zero_bytemask(mask);
 
-       hash += a & zero_bytemask(mask);
-       len += find_zero(mask);
-       return hashlen_create(fold_hash(hash), len);
+       return hashlen_create(fold_hash(x, y), len + find_zero(mask));
 }
 
-#else
+#else  /* !CONFIG_DCACHE_WORD_ACCESS: Slow, byte-at-a-time version */
 
-unsigned int full_name_hash(const unsigned char *name, unsigned int len)
+/* Return the hash of a string of known length */
+unsigned int full_name_hash(const void *salt, const char *name, unsigned int len)
 {
-       unsigned long hash = init_name_hash();
+       unsigned long hash = init_name_hash(salt);
        while (len--)
-               hash = partial_name_hash(*name++, hash);
+               hash = partial_name_hash((unsigned char)*name++, hash);
        return end_name_hash(hash);
 }
 EXPORT_SYMBOL(full_name_hash);
 
+/* Return the "hash_len" (hash and length) of a null-terminated string */
+u64 hashlen_string(const void *salt, const char *name)
+{
+       unsigned long hash = init_name_hash(salt);
+       unsigned long len = 0, c;
+
+       c = (unsigned char)*name;
+       while (c) {
+               len++;
+               hash = partial_name_hash(c, hash);
+               c = (unsigned char)name[len];
+       }
+       return hashlen_create(end_name_hash(hash), len);
+}
+EXPORT_SYMBOL(hashlen_string);
+
 /*
  * We know there's a real path component here of at least
  * one character.
  */
-static inline u64 hash_name(const char *name)
+static inline u64 hash_name(const void *salt, const char *name)
 {
-       unsigned long hash = init_name_hash();
+       unsigned long hash = init_name_hash(salt);
        unsigned long len = 0, c;
 
        c = (unsigned char)*name;
@@ -1905,10 +2051,10 @@ static int link_path_walk(const char *name, struct nameidata *nd)
                int type;
 
                err = may_lookup(nd);
-               if (err)
+               if (err)
                        return err;
 
-               hash_len = hash_name(name);
+               hash_len = hash_name(nd->path.dentry, name);
 
                type = LAST_NORM;
                if (name[0] == '.') switch (hashlen_len(hash_len)) {
@@ -2266,33 +2412,6 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
 }
 EXPORT_SYMBOL(vfs_path_lookup);
 
-/**
- * lookup_hash - lookup single pathname component on already hashed name
- * @name:      name and hash to lookup
- * @base:      base directory to lookup from
- *
- * The name must have been verified and hashed (see lookup_one_len()).  Using
- * this after just full_name_hash() is unsafe.
- *
- * This function also doesn't check for search permission on base directory.
- *
- * Use lookup_one_len_unlocked() instead, unless you really know what you are
- * doing.
- *
- * Do not hold i_mutex; this helper takes i_mutex if necessary.
- */
-struct dentry *lookup_hash(const struct qstr *name, struct dentry *base)
-{
-       struct dentry *ret;
-
-       ret = lookup_dcache(name, base, 0);
-       if (!ret)
-               ret = lookup_slow(name, base, 0);
-
-       return ret;
-}
-EXPORT_SYMBOL(lookup_hash);
-
 /**
  * lookup_one_len - filesystem helper to lookup single pathname component
  * @name:      pathname component to lookup
@@ -2314,7 +2433,7 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
 
        this.name = name;
        this.len = len;
-       this.hash = full_name_hash(name, len);
+       this.hash = full_name_hash(base, name, len);
        if (!len)
                return ERR_PTR(-EACCES);
 
@@ -2364,10 +2483,11 @@ struct dentry *lookup_one_len_unlocked(const char *name,
        struct qstr this;
        unsigned int c;
        int err;
+       struct dentry *ret;
 
        this.name = name;
        this.len = len;
-       this.hash = full_name_hash(name, len);
+       this.hash = full_name_hash(base, name, len);
        if (!len)
                return ERR_PTR(-EACCES);
 
@@ -2395,10 +2515,41 @@ struct dentry *lookup_one_len_unlocked(const char *name,
        if (err)
                return ERR_PTR(err);
 
-       return lookup_hash(&this, base);
+       ret = lookup_dcache(&this, base, 0);
+       if (!ret)
+               ret = lookup_slow(&this, base, 0);
+       return ret;
 }
 EXPORT_SYMBOL(lookup_one_len_unlocked);
 
+#ifdef CONFIG_UNIX98_PTYS
+int path_pts(struct path *path)
+{
+       /* Find something mounted on "pts" in the same directory as
+        * the input path.
+        */
+       struct dentry *child, *parent;
+       struct qstr this;
+       int ret;
+
+       ret = path_parent_directory(path);
+       if (ret)
+               return ret;
+
+       parent = path->dentry;
+       this.name = "pts";
+       this.len = 3;
+       child = d_hash_and_lookup(parent, &this);
+       if (!child)
+               return -ENOENT;
+
+       path->dentry = child;
+       dput(parent);
+       follow_mount(path);
+       return 0;
+}
+#endif
+
 int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
                 struct path *path, int *empty)
 {
@@ -2607,10 +2758,11 @@ EXPORT_SYMBOL(__check_sticky);
  *     c. have CAP_FOWNER capability
  *  6. If the victim is append-only or immutable we can't do antyhing with
  *     links pointing to it.
- *  7. If we were asked to remove a directory and victim isn't one - ENOTDIR.
- *  8. If we were asked to remove a non-directory and victim isn't one - EISDIR.
- *  9. We can't remove a root or mountpoint.
- * 10. We don't allow removal of NFS sillyrenamed files; it's handled by
+ *  7. If the victim has an unknown uid or gid we can't change the inode.
+ *  8. If we were asked to remove a directory and victim isn't one - ENOTDIR.
+ *  9. If we were asked to remove a non-directory and victim isn't one - EISDIR.
+ * 10. We can't remove a root or mountpoint.
+ * 11. We don't allow removal of NFS sillyrenamed files; it's handled by
  *     nfs_async_unlink().
  */
 static int may_delete(struct inode *dir, struct dentry *victim, bool isdir)
@@ -2632,7 +2784,7 @@ static int may_delete(struct inode *dir, struct dentry *victim, bool isdir)
                return -EPERM;
 
        if (check_sticky(dir, inode) || IS_APPEND(inode) ||
-           IS_IMMUTABLE(inode) || IS_SWAPFILE(inode))
+           IS_IMMUTABLE(inode) || IS_SWAPFILE(inode) || HAS_UNMAPPED_ID(inode))
                return -EPERM;
        if (isdir) {
                if (!d_is_dir(victim))
@@ -2653,16 +2805,22 @@ static int may_delete(struct inode *dir, struct dentry *victim, bool isdir)
  *  1. We can't do it if child already exists (open has special treatment for
  *     this case, but since we are inlined it's OK)
  *  2. We can't do it if dir is read-only (done in permission())
- *  3. We should have write and exec permissions on dir
- *  4. We can't do it if dir is immutable (done in permission())
+ *  3. We can't do it if the fs can't represent the fsuid or fsgid.
+ *  4. We should have write and exec permissions on dir
+ *  5. We can't do it if dir is immutable (done in permission())
  */
 static inline int may_create(struct inode *dir, struct dentry *child)
 {
+       struct user_namespace *s_user_ns;
        audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE);
        if (child->d_inode)
                return -EEXIST;
        if (IS_DEADDIR(dir))
                return -ENOENT;
+       s_user_ns = dir->i_sb->s_user_ns;
+       if (!kuid_has_mapping(s_user_ns, current_fsuid()) ||
+           !kgid_has_mapping(s_user_ns, current_fsgid()))
+               return -EOVERFLOW;
        return inode_permission(dir, MAY_WRITE | MAY_EXEC);
 }
 
@@ -2678,7 +2836,7 @@ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
                return NULL;
        }
 
-       mutex_lock(&p1->d_inode->i_sb->s_vfs_rename_mutex);
+       mutex_lock(&p1->d_sb->s_vfs_rename_mutex);
 
        p = d_ancestor(p2, p1);
        if (p) {
@@ -2705,7 +2863,7 @@ void unlock_rename(struct dentry *p1, struct dentry *p2)
        inode_unlock(p1->d_inode);
        if (p1 != p2) {
                inode_unlock(p2->d_inode);
-               mutex_unlock(&p1->d_inode->i_sb->s_vfs_rename_mutex);
+               mutex_unlock(&p1->d_sb->s_vfs_rename_mutex);
        }
 }
 EXPORT_SYMBOL(unlock_rename);
@@ -2731,6 +2889,12 @@ int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
 }
 EXPORT_SYMBOL(vfs_create);
 
+bool may_open_dev(const struct path *path)
+{
+       return !(path->mnt->mnt_flags & MNT_NODEV) &&
+               !(path->mnt->mnt_sb->s_iflags & SB_I_NODEV);
+}
+
 static int may_open(struct path *path, int acc_mode, int flag)
 {
        struct dentry *dentry = path->dentry;
@@ -2749,7 +2913,7 @@ static int may_open(struct path *path, int acc_mode, int flag)
                break;
        case S_IFBLK:
        case S_IFCHR:
-               if (path->mnt->mnt_flags & MNT_NODEV)
+               if (!may_open_dev(path))
                        return -EACCES;
                /*FALLTHRU*/
        case S_IFIFO:
@@ -2808,7 +2972,7 @@ static inline int open_to_namei_flags(int flag)
        return flag;
 }
 
-static int may_o_create(struct path *dir, struct dentry *dentry, umode_t mode)
+static int may_o_create(const struct path *dir, struct dentry *dentry, umode_t mode)
 {
        int error = security_path_mknod(dir, dentry, mode, 0);
        if (error)
@@ -2837,143 +3001,60 @@ static int may_o_create(struct path *dir, struct dentry *dentry, umode_t mode)
 static int atomic_open(struct nameidata *nd, struct dentry *dentry,
                        struct path *path, struct file *file,
                        const struct open_flags *op,
-                       bool got_write, bool need_lookup,
+                       int open_flag, umode_t mode,
                        int *opened)
 {
+       struct dentry *const DENTRY_NOT_SET = (void *) -1UL;
        struct inode *dir =  nd->path.dentry->d_inode;
-       unsigned open_flag = open_to_namei_flags(op->open_flag);
-       umode_t mode;
        int error;
-       int acc_mode;
-       int create_error = 0;
-       struct dentry *const DENTRY_NOT_SET = (void *) -1UL;
-       bool excl;
-
-       BUG_ON(dentry->d_inode);
-
-       /* Don't create child dentry for a dead directory. */
-       if (unlikely(IS_DEADDIR(dir))) {
-               error = -ENOENT;
-               goto out;
-       }
-
-       mode = op->mode;
-       if ((open_flag & O_CREAT) && !IS_POSIXACL(dir))
-               mode &= ~current_umask();
 
-       excl = (open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT);
-       if (excl)
+       if (!(~open_flag & (O_EXCL | O_CREAT))) /* both O_EXCL and O_CREAT */
                open_flag &= ~O_TRUNC;
 
-       /*
-        * Checking write permission is tricky, bacuse we don't know if we are
-        * going to actually need it: O_CREAT opens should work as long as the
-        * file exists.  But checking existence breaks atomicity.  The trick is
-        * to check access and if not granted clear O_CREAT from the flags.
-        *
-        * Another problem is returing the "right" error value (e.g. for an
-        * O_EXCL open we want to return EEXIST not EROFS).
-        */
-       if (((open_flag & (O_CREAT | O_TRUNC)) ||
-           (open_flag & O_ACCMODE) != O_RDONLY) && unlikely(!got_write)) {
-               if (!(open_flag & O_CREAT)) {
-                       /*
-                        * No O_CREATE -> atomicity not a requirement -> fall
-                        * back to lookup + open
-                        */
-                       goto no_open;
-               } else if (open_flag & (O_EXCL | O_TRUNC)) {
-                       /* Fall back and fail with the right error */
-                       create_error = -EROFS;
-                       goto no_open;
-               } else {
-                       /* No side effects, safe to clear O_CREAT */
-                       create_error = -EROFS;
-                       open_flag &= ~O_CREAT;
-               }
-       }
-
-       if (open_flag & O_CREAT) {
-               error = may_o_create(&nd->path, dentry, mode);
-               if (error) {
-                       create_error = error;
-                       if (open_flag & O_EXCL)
-                               goto no_open;
-                       open_flag &= ~O_CREAT;
-               }
-       }
-
        if (nd->flags & LOOKUP_DIRECTORY)
                open_flag |= O_DIRECTORY;
 
        file->f_path.dentry = DENTRY_NOT_SET;
        file->f_path.mnt = nd->path.mnt;
-       error = dir->i_op->atomic_open(dir, dentry, file, open_flag, mode,
-                                     opened);
-       if (error < 0) {
-               if (create_error && error == -ENOENT)
-                       error = create_error;
-               goto out;
-       }
-
-       if (error) {    /* returned 1, that is */
+       error = dir->i_op->atomic_open(dir, dentry, file,
+                                      open_to_namei_flags(open_flag),
+                                      mode, opened);
+       d_lookup_done(dentry);
+       if (!error) {
+               /*
+                * We didn't have the inode before the open, so check open
+                * permission here.
+                */
+               int acc_mode = op->acc_mode;
+               if (*opened & FILE_CREATED) {
+                       WARN_ON(!(open_flag & O_CREAT));
+                       fsnotify_create(dir, dentry);
+                       acc_mode = 0;
+               }
+               error = may_open(&file->f_path, acc_mode, open_flag);
+               if (WARN_ON(error > 0))
+                       error = -EINVAL;
+       } else if (error > 0) {
                if (WARN_ON(file->f_path.dentry == DENTRY_NOT_SET)) {
                        error = -EIO;
-                       goto out;
-               }
-               if (file->f_path.dentry) {
-                       dput(dentry);
-                       dentry = file->f_path.dentry;
-               }
-               if (*opened & FILE_CREATED)
-                       fsnotify_create(dir, dentry);
-               if (!dentry->d_inode) {
-                       WARN_ON(*opened & FILE_CREATED);
-                       if (create_error) {
-                               error = create_error;
-                               goto out;
-                       }
                } else {
-                       if (excl && !(*opened & FILE_CREATED)) {
-                               error = -EEXIST;
-                               goto out;
+                       if (file->f_path.dentry) {
+                               dput(dentry);
+                               dentry = file->f_path.dentry;
+                       }
+                       if (*opened & FILE_CREATED)
+                               fsnotify_create(dir, dentry);
+                       if (unlikely(d_is_negative(dentry))) {
+                               error = -ENOENT;
+                       } else {
+                               path->dentry = dentry;
+                               path->mnt = nd->path.mnt;
+                               return 1;
                        }
                }
-               goto looked_up;
-       }
-
-       /*
-        * We didn't have the inode before the open, so check open permission
-        * here.
-        */
-       acc_mode = op->acc_mode;
-       if (*opened & FILE_CREATED) {
-               WARN_ON(!(open_flag & O_CREAT));
-               fsnotify_create(dir, dentry);
-               acc_mode = 0;
        }
-       error = may_open(&file->f_path, acc_mode, open_flag);
-       if (error)
-               fput(file);
-
-out:
        dput(dentry);
        return error;
-
-no_open:
-       if (need_lookup) {
-               dentry = lookup_real(dir, dentry, nd->flags);
-               if (IS_ERR(dentry))
-                       return PTR_ERR(dentry);
-       }
-       if (create_error && !dentry->d_inode) {
-               error = create_error;
-               goto out;
-       }
-looked_up:
-       path->dentry = dentry;
-       path->mnt = nd->path.mnt;
-       return 1;
 }
 
 /*
@@ -3001,62 +3082,118 @@ static int lookup_open(struct nameidata *nd, struct path *path,
 {
        struct dentry *dir = nd->path.dentry;
        struct inode *dir_inode = dir->d_inode;
+       int open_flag = op->open_flag;
        struct dentry *dentry;
-       int error;
-       bool need_lookup = false;
+       int error, create_error = 0;
+       umode_t mode = op->mode;
+       DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
+
+       if (unlikely(IS_DEADDIR(dir_inode)))
+               return -ENOENT;
 
        *opened &= ~FILE_CREATED;
-       dentry = lookup_dcache(&nd->last, dir, nd->flags);
-       if (IS_ERR(dentry))
-               return PTR_ERR(dentry);
+       dentry = d_lookup(dir, &nd->last);
+       for (;;) {
+               if (!dentry) {
+                       dentry = d_alloc_parallel(dir, &nd->last, &wq);
+                       if (IS_ERR(dentry))
+                               return PTR_ERR(dentry);
+               }
+               if (d_in_lookup(dentry))
+                       break;
 
-       if (!dentry) {
-               dentry = d_alloc(dir, &nd->last);
-               if (unlikely(!dentry))
-                       return -ENOMEM;
-               need_lookup = true;
-       } else if (dentry->d_inode) {
+               if (!(dentry->d_flags & DCACHE_OP_REVALIDATE))
+                       break;
+
+               error = d_revalidate(dentry, nd->flags);
+               if (likely(error > 0))
+                       break;
+               if (error)
+                       goto out_dput;
+               d_invalidate(dentry);
+               dput(dentry);
+               dentry = NULL;
+       }
+       if (dentry->d_inode) {
                /* Cached positive dentry: will open in f_op->open */
                goto out_no_open;
        }
 
-       if ((nd->flags & LOOKUP_OPEN) && dir_inode->i_op->atomic_open) {
-               return atomic_open(nd, dentry, path, file, op, got_write,
-                                  need_lookup, opened);
+       /*
+        * Checking write permission is tricky, because we don't know if we are
+        * going to actually need it: O_CREAT opens should work as long as the
+        * file exists.  But checking existence breaks atomicity.  The trick is
+        * to check access and if not granted clear O_CREAT from the flags.
+        *
+        * Another problem is returning the "right" error value (e.g. for an
+        * O_EXCL open we want to return EEXIST not EROFS).
+        */
+       if (open_flag & O_CREAT) {
+               if (!IS_POSIXACL(dir->d_inode))
+                       mode &= ~current_umask();
+               if (unlikely(!got_write)) {
+                       create_error = -EROFS;
+                       open_flag &= ~O_CREAT;
+                       if (open_flag & (O_EXCL | O_TRUNC))
+                               goto no_open;
+                       /* No side effects, safe to clear O_CREAT */
+               } else {
+                       create_error = may_o_create(&nd->path, dentry, mode);
+                       if (create_error) {
+                               open_flag &= ~O_CREAT;
+                               if (open_flag & O_EXCL)
+                                       goto no_open;
+                       }
+               }
+       } else if ((open_flag & (O_TRUNC|O_WRONLY|O_RDWR)) &&
+                  unlikely(!got_write)) {
+               /*
+                * No O_CREAT -> atomicity not a requirement -> fall
+                * back to lookup + open
+                */
+               goto no_open;
        }
 
-       if (need_lookup) {
-               BUG_ON(dentry->d_inode);
+       if (dir_inode->i_op->atomic_open) {
+               error = atomic_open(nd, dentry, path, file, op, open_flag,
+                                   mode, opened);
+               if (unlikely(error == -ENOENT) && create_error)
+                       error = create_error;
+               return error;
+       }
 
-               dentry = lookup_real(dir_inode, dentry, nd->flags);
-               if (IS_ERR(dentry))
-                       return PTR_ERR(dentry);
+no_open:
+       if (d_in_lookup(dentry)) {
+               struct dentry *res = dir_inode->i_op->lookup(dir_inode, dentry,
+                                                            nd->flags);
+               d_lookup_done(dentry);
+               if (unlikely(res)) {
+                       if (IS_ERR(res)) {
+                               error = PTR_ERR(res);
+                               goto out_dput;
+                       }
+                       dput(dentry);
+                       dentry = res;
+               }
        }
 
        /* Negative dentry, just create the file */
-       if (!dentry->d_inode && (op->open_flag & O_CREAT)) {
-               umode_t mode = op->mode;
-               if (!IS_POSIXACL(dir->d_inode))
-                       mode &= ~current_umask();
-               /*
-                * This write is needed to ensure that a
-                * rw->ro transition does not occur between
-                * the time when the file is created and when
-                * a permanent write count is taken through
-                * the 'struct file' in finish_open().
-                */
-               if (!got_write) {
-                       error = -EROFS;
-                       goto out_dput;
-               }
+       if (!dentry->d_inode && (open_flag & O_CREAT)) {
                *opened |= FILE_CREATED;
-               error = security_path_mknod(&nd->path, dentry, mode, 0);
-               if (error)
+               audit_inode_child(dir_inode, dentry, AUDIT_TYPE_CHILD_CREATE);
+               if (!dir_inode->i_op->create) {
+                       error = -EACCES;
                        goto out_dput;
-               error = vfs_create(dir->d_inode, dentry, mode,
-                                  nd->flags & LOOKUP_EXCL);
+               }
+               error = dir_inode->i_op->create(dir_inode, dentry, mode,
+                                               open_flag & O_EXCL);
                if (error)
                        goto out_dput;
+               fsnotify_create(dir_inode, dentry);
+       }
+       if (unlikely(create_error) && !dentry->d_inode) {
+               error = create_error;
+               goto out_dput;
        }
 out_no_open:
        path->dentry = dentry;
@@ -3082,9 +3219,7 @@ static int do_last(struct nameidata *nd,
        int acc_mode = op->acc_mode;
        unsigned seq;
        struct inode *inode;
-       struct path save_parent = { .dentry = NULL, .mnt = NULL };
        struct path path;
-       bool retried = false;
        int error;
 
        nd->flags &= ~LOOKUP_PARENT;
@@ -3127,8 +3262,7 @@ static int do_last(struct nameidata *nd,
                        return -EISDIR;
        }
 
-retry_lookup:
-       if (op->open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) {
+       if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) {
                error = mnt_want_write(nd->path.mnt);
                if (!error)
                        got_write = true;
@@ -3138,9 +3272,15 @@ retry_lookup:
                 * dropping this one anyway.
                 */
        }
-       inode_lock(dir->d_inode);
+       if (open_flag & O_CREAT)
+               inode_lock(dir->d_inode);
+       else
+               inode_lock_shared(dir->d_inode);
        error = lookup_open(nd, &path, file, op, got_write, opened);
-       inode_unlock(dir->d_inode);
+       if (open_flag & O_CREAT)
+               inode_unlock(dir->d_inode);
+       else
+               inode_unlock_shared(dir->d_inode);
 
        if (error <= 0) {
                if (error)
@@ -3173,6 +3313,10 @@ retry_lookup:
                got_write = false;
        }
 
+       error = follow_managed(&path, nd);
+       if (unlikely(error < 0))
+               return error;
+
        if (unlikely(d_is_negative(path.dentry))) {
                path_to_nameidata(&path, nd);
                return -ENOENT;
@@ -3188,10 +3332,6 @@ retry_lookup:
                return -EEXIST;
        }
 
-       error = follow_managed(&path, nd);
-       if (unlikely(error < 0))
-               return error;
-
        seq = 0;        /* out of RCU mode, so the value doesn't matter */
        inode = d_backing_inode(path.dentry);
 finish_lookup:
@@ -3202,28 +3342,15 @@ finish_lookup:
        if (unlikely(error))
                return error;
 
-       if ((nd->flags & LOOKUP_RCU) || nd->path.mnt != path.mnt) {
-               path_to_nameidata(&path, nd);
-       } else {
-               save_parent.dentry = nd->path.dentry;
-               save_parent.mnt = mntget(path.mnt);
-               nd->path.dentry = path.dentry;
-
-       }
+       path_to_nameidata(&path, nd);
        nd->inode = inode;
        nd->seq = seq;
        /* Why this, you ask?  _Now_ we might have grown LOOKUP_JUMPED... */
 finish_open:
        error = complete_walk(nd);
-       if (error) {
-               path_put(&save_parent);
+       if (error)
                return error;
-       }
        audit_inode(nd->name, nd->path.dentry, 0);
-       if (unlikely(d_is_symlink(nd->path.dentry)) && !(open_flag & O_PATH)) {
-               error = -ELOOP;
-               goto out;
-       }
        error = -EISDIR;
        if ((open_flag & O_CREAT) && d_is_dir(nd->path.dentry))
                goto out;
@@ -3240,64 +3367,30 @@ finish_open:
                got_write = true;
        }
 finish_open_created:
-       if (likely(!(open_flag & O_PATH))) {
-               error = may_open(&nd->path, acc_mode, open_flag);
-               if (error)
-                       goto out;
-       }
+       error = may_open(&nd->path, acc_mode, open_flag);
+       if (error)
+               goto out;
        BUG_ON(*opened & FILE_OPENED); /* once it's opened, it's opened */
        error = vfs_open(&nd->path, file, current_cred());
-       if (!error) {
-               *opened |= FILE_OPENED;
-       } else {
-               if (error == -EOPENSTALE)
-                       goto stale_open;
+       if (error)
                goto out;
-       }
+       *opened |= FILE_OPENED;
 opened:
        error = open_check_o_direct(file);
-       if (error)
-               goto exit_fput;
-       error = ima_file_check(file, op->acc_mode, *opened);
-       if (error)
-               goto exit_fput;
-
-       if (will_truncate) {
+       if (!error)
+               error = ima_file_check(file, op->acc_mode, *opened);
+       if (!error && will_truncate)
                error = handle_truncate(file);
-               if (error)
-                       goto exit_fput;
-       }
 out:
+       if (unlikely(error) && (*opened & FILE_OPENED))
+               fput(file);
        if (unlikely(error > 0)) {
                WARN_ON(1);
                error = -EINVAL;
        }
        if (got_write)
                mnt_drop_write(nd->path.mnt);
-       path_put(&save_parent);
        return error;
-
-exit_fput:
-       fput(file);
-       goto out;
-
-stale_open:
-       /* If no saved parent or already retried then can't retry */
-       if (!save_parent.dentry || retried)
-               goto out;
-
-       BUG_ON(save_parent.dentry != dir);
-       path_put(&nd->path);
-       nd->path = save_parent;
-       nd->inode = dir->d_inode;
-       save_parent.mnt = NULL;
-       save_parent.dentry = NULL;
-       if (got_write) {
-               mnt_drop_write(nd->path.mnt);
-               got_write = false;
-       }
-       retried = true;
-       goto retry_lookup;
 }
 
 static int do_tmpfile(struct nameidata *nd, unsigned flags,
@@ -3358,6 +3451,18 @@ out:
        return error;
 }
 
+static int do_o_path(struct nameidata *nd, unsigned flags, struct file *file)
+{
+       struct path path;
+       int error = path_lookupat(nd, flags, &path);
+       if (!error) {
+               audit_inode(nd->name, path.dentry, 0);
+               error = vfs_open(&path, file, current_cred());
+               path_put(&path);
+       }
+       return error;
+}
+
 static struct file *path_openat(struct nameidata *nd,
                        const struct open_flags *op, unsigned flags)
 {
@@ -3377,6 +3482,13 @@ static struct file *path_openat(struct nameidata *nd,
                goto out2;
        }
 
+       if (unlikely(file->f_flags & O_PATH)) {
+               error = do_o_path(nd, flags, file);
+               if (!error)
+                       opened |= FILE_OPENED;
+               goto out2;
+       }
+
        s = path_init(nd, flags);
        if (IS_ERR(s)) {
                put_filp(file);
@@ -3619,6 +3731,8 @@ retry:
        switch (mode & S_IFMT) {
                case 0: case S_IFREG:
                        error = vfs_create(path.dentry->d_inode,dentry,mode,true);
+                       if (!error)
+                               ima_post_path_mknod(dentry);
                        break;
                case S_IFCHR: case S_IFBLK:
                        error = vfs_mknod(path.dentry->d_inode,dentry,mode,
@@ -4051,6 +4165,13 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
         */
        if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
                return -EPERM;
+       /*
+        * Updating the link count will likely cause i_uid and i_gid to
+        * be written back improperly if their true value is unknown to
+        * the vfs.
+        */
+       if (HAS_UNMAPPED_ID(inode))
+               return -EPERM;
        if (!dir->i_op->link)
                return -EPERM;
        if (S_ISDIR(inode->i_mode))
@@ -4228,7 +4349,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
         * Check source == target.
         * On overlayfs need to look at underlying inodes.
         */
-       if (vfs_select_inode(old_dentry, 0) == vfs_select_inode(new_dentry, 0))
+       if (d_real_inode(old_dentry) == d_real_inode(new_dentry))
                return 0;
 
        error = may_delete(old_dir, old_dentry, is_dir);
@@ -4532,7 +4653,6 @@ int readlink_copy(char __user *buffer, int buflen, const char *link)
 out:
        return len;
 }
-EXPORT_SYMBOL(readlink_copy);
 
 /*
  * A helper for ->readlink().  This should be used *ONLY* for symlinks that