fs/namei.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  *  linux/fs/namei.c
   4  *
   5  *  Copyright (C) 1991, 1992  Linus Torvalds
   6  */
   7
   8 /*
   9  * Some corrections by tytso.
  10  */
  11
  12 /* [Feb 1997 T. Schoebel-Theuer] Complete rewrite of the pathname
  13  * lookup logic.
  14  */
  15 /* [Feb-Apr 2000, AV] Rewrite to the new namespace architecture.
  16  */
  17
  18 #include <linux/init.h>
  19 #include <linux/export.h>
  20 #include <linux/kernel.h>
  21 #include <linux/slab.h>
  22 #include <linux/fs.h>
  23 #include <linux/namei.h>
  24 #include <linux/pagemap.h>
  25 #include <linux/fsnotify.h>
  26 #include <linux/personality.h>
  27 #include <linux/security.h>
  28 #include <linux/ima.h>
  29 #include <linux/syscalls.h>
  30 #include <linux/mount.h>
  31 #include <linux/audit.h>
  32 #include <linux/capability.h>
  33 #include <linux/file.h>
  34 #include <linux/fcntl.h>
  35 #include <linux/device_cgroup.h>
  36 #include <linux/fs_struct.h>
  37 #include <linux/posix_acl.h>
  38 #include <linux/hash.h>
  39 #include <linux/bitops.h>
  40 #include <linux/init_task.h>
  41 #include <linux/uaccess.h>
  42
  43 #include "internal.h"
  44 #include "mount.h"
  45
  46 /* [Feb-1997 T. Schoebel-Theuer]
  47  * Fundamental changes in the pathname lookup mechanisms (namei)
  48  * were necessary because of omirr.  The reason is that omirr needs
  49  * to know the _real_ pathname, not the user-supplied one, in case
  50  * of symlinks (and also when transname replacements occur).
  51  *
  52  * The new code replaces the old recursive symlink resolution with
  53  * an iterative one (in case of non-nested symlink chains).  It does
  54  * this with calls to <fs>_follow_link().
  55  * As a side effect, dir_namei(), _namei() and follow_link() are now
  56  * replaced with a single function lookup_dentry() that can handle all
  57  * the special cases of the former code.
  58  *
  59  * With the new dcache, the pathname is stored at each inode, at least as
  60  * long as the refcount of the inode is positive.  As a side effect, the
  61  * size of the dcache depends on the inode cache and thus is dynamic.
  62  *
  63  * [29-Apr-1998 C. Scott Ananian] Updated above description of symlink
  64  * resolution to correspond with current state of the code.
  65  *
  66  * Note that the symlink resolution is not *completely* iterative.
  67  * There is still a significant amount of tail- and mid- recursion in
  68  * the algorithm.  Also, note that <fs>_readlink() is not used in
  69  * lookup_dentry(): lookup_dentry() on the result of <fs>_readlink()
  70  * may return different results than <fs>_follow_link().  Many virtual
  71  * filesystems (including /proc) exhibit this behavior.
  72  */
  73
  74 /* [24-Feb-97 T. Schoebel-Theuer] Side effects caused by new implementation:
  75  * New symlink semantics: when open() is called with flags O_CREAT | O_EXCL
  76  * and the name already exists in form of a symlink, try to create the new
  77  * name indicated by the symlink. The old code always complained that the
  78  * name already exists, due to not following the symlink even if its target
  79  * is nonexistent.  The new semantics affects also mknod() and link() when
  80  * the name is a symlink pointing to a non-existent name.
  81  *
  82  * I don't know which semantics is the right one, since I have no access
  83  * to standards. But I found by trial that HP-UX 9.0 has the full "new"
  84  * semantics implemented, while SunOS 4.1.1 and Solaris (SunOS 5.4) have the
  85  * "old" one. Personally, I think the new semantics is much more logical.
  86  * Note that "ln old new" where "new" is a symlink pointing to a non-existing
  87  * file does succeed in both HP-UX and SunOS, but not in Solaris
  88  * and in the old Linux semantics.
  89  */
  90
  91 /* [16-Dec-97 Kevin Buhr] For security reasons, we change some symlink
  92  * semantics.  See the comments in "open_namei" and "do_link" below.
  93  *
  94  * [10-Sep-98 Alan Modra] Another symlink change.
  95  */
  96
  97 /* [Feb-Apr 2000 AV] Complete rewrite. Rules for symlinks:
  98  *      inside the path - always follow.
  99  *      in the last component in creation/removal/renaming - never follow.
 100  *      if LOOKUP_FOLLOW passed - follow.
 101  *      if the pathname has trailing slashes - follow.
 102  *      otherwise - don't follow.
 103  * (applied in that order).
 104  *
 105  * [Jun 2000 AV] Inconsistent behaviour of open() in case if flags==O_CREAT
 106  * restored for 2.4. This is the last surviving part of old 4.2BSD bug.
 107  * During the 2.4 we need to fix the userland stuff depending on it -
 108  * hopefully we will be able to get rid of that wart in 2.5. So far only
 109  * XEmacs seems to be relying on it...
 110  */
 111 /*
 112  * [Sep 2001 AV] Single-semaphore locking scheme (kudos to David Holland)
 113  * implemented.  Let's see if raised priority of ->s_vfs_rename_mutex gives
 114  * any extra contention...
 115  */
 116
 117 /* In order to reduce some races, while at the same time doing additional
 118  * checking and hopefully speeding things up, we copy filenames to the
 119  * kernel data space before using them.
 120  *
 121  * POSIX.1 2.4: an empty pathname is invalid (ENOENT).
 122  * PATH_MAX includes the nul terminator --RR.
 123  */
 124
 125 #define EMBEDDED_NAME_MAX       (PATH_MAX - offsetof(struct filename, iname))
 126
 127 struct filename *
 128 getname_flags(const char __user *filename, int flags, int *empty)
 129 {
 130         struct filename *result;
 131         char *kname;
 132         int len;
 133
 134         result = audit_reusename(filename);
 135         if (result)
 136                 return result;
 137
 138         result = __getname();
 139         if (unlikely(!result))
 140                 return ERR_PTR(-ENOMEM);
 141
 142         /*
 143          * First, try to embed the struct filename inside the names_cache
 144          * allocation
 145          */
 146         kname = (char *)result->iname;
 147         result->name = kname;
 148
 149         len = strncpy_from_user(kname, filename, EMBEDDED_NAME_MAX);
 150         if (unlikely(len < 0)) {
 151                 __putname(result);
 152                 return ERR_PTR(len);
 153         }
 154
 155         /*
 156          * Uh-oh. We have a name that's approaching PATH_MAX. Allocate a
 157          * separate struct filename so we can dedicate the entire
 158          * names_cache allocation for the pathname, and re-do the copy from
 159          * userland.
 160          */
 161         if (unlikely(len == EMBEDDED_NAME_MAX)) {
 162                 const size_t size = offsetof(struct filename, iname[1]);
 163                 kname = (char *)result;
 164
 165                 /*
 166                  * size is chosen that way we to guarantee that
 167                  * result->iname[0] is within the same object and that
 168                  * kname can't be equal to result->iname, no matter what.
 169                  */
 170                 result = kzalloc(size, GFP_KERNEL);
 171                 if (unlikely(!result)) {
 172                         __putname(kname);
 173                         return ERR_PTR(-ENOMEM);
 174                 }
 175                 result->name = kname;
 176                 len = strncpy_from_user(kname, filename, PATH_MAX);
 177                 if (unlikely(len < 0)) {
 178                         __putname(kname);
 179                         kfree(result);
 180                         return ERR_PTR(len);
 181                 }
 182                 if (unlikely(len == PATH_MAX)) {
 183                         __putname(kname);
 184                         kfree(result);
 185                         return ERR_PTR(-ENAMETOOLONG);
 186                 }
 187         }
 188
 189         result->refcnt = 1;
 190         /* The empty path is special. */
 191         if (unlikely(!len)) {
 192                 if (empty)
 193                         *empty = 1;
 194                 if (!(flags & LOOKUP_EMPTY)) {
 195                         putname(result);
 196                         return ERR_PTR(-ENOENT);
 197                 }
 198         }
 199
 200         result->uptr = filename;
 201         result->aname = NULL;
 202         audit_getname(result);
 203         return result;
 204 }
 205
 206 struct filename *
 207 getname(const char __user * filename)
 208 {
 209         return getname_flags(filename, 0, NULL);
 210 }
 211
 212 struct filename *
 213 getname_kernel(const char * filename)
 214 {
 215         struct filename *result;
 216         int len = strlen(filename) + 1;
 217
 218         result = __getname();
 219         if (unlikely(!result))
 220                 return ERR_PTR(-ENOMEM);
 221
 222         if (len <= EMBEDDED_NAME_MAX) {
 223                 result->name = (char *)result->iname;
 224         } else if (len <= PATH_MAX) {
 225                 const size_t size = offsetof(struct filename, iname[1]);
 226                 struct filename *tmp;
 227
 228                 tmp = kmalloc(size, GFP_KERNEL);
 229                 if (unlikely(!tmp)) {
 230                         __putname(result);
 231                         return ERR_PTR(-ENOMEM);
 232                 }
 233                 tmp->name = (char *)result;
 234                 result = tmp;
 235         } else {
 236                 __putname(result);
 237                 return ERR_PTR(-ENAMETOOLONG);
 238         }
 239         memcpy((char *)result->name, filename, len);
 240         result->uptr = NULL;
 241         result->aname = NULL;
 242         result->refcnt = 1;
 243         audit_getname(result);
 244
 245         return result;
 246 }
 247
 248 void putname(struct filename *name)
 249 {
 250         if (IS_ERR_OR_NULL(name))
 251                 return;
 252
 253         BUG_ON(name->refcnt <= 0);
 254
 255         if (--name->refcnt > 0)
 256                 return;
 257
 258         if (name->name != name->iname) {
 259                 __putname(name->name);
 260                 kfree(name);
 261         } else
 262                 __putname(name);
 263 }
 264
 /**
  * check_acl - perform ACL permission checking
  * @mnt_userns: user namespace of the mount the inode was found from
  * @inode:      inode to check permissions on
  * @mask:       right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...)
  *
  * Looks up the access ACL of @inode and evaluates @mask against it.  With
  * %MAY_NOT_BLOCK set only the cached ACL is consulted.
  *
  * If the inode has been found through an idmapped mount the user namespace
  * of the vfsmount must be passed through @mnt_userns, which is handed on to
  * posix_acl_permission().  On non-idmapped mounts, or if permission checking
  * is to be performed on the raw inode, simply pass init_user_ns.
  *
  * Return: the result of posix_acl_permission() when an ACL was found;
  * -EAGAIN when there is no ACL or CONFIG_FS_POSIX_ACL is off; -ECHILD when
  * %MAY_NOT_BLOCK is set and the ACL is not cached; or the error encoded in
  * the pointer returned by get_acl().
  */
 static int check_acl(struct user_namespace *mnt_userns,
                      struct inode *inode, int mask)
 {
 #ifdef CONFIG_FS_POSIX_ACL
         struct posix_acl *acl;
         int rc;

         if (mask & MAY_NOT_BLOCK) {
                 acl = get_cached_acl_rcu(inode, ACL_TYPE_ACCESS);
                 if (!acl)
                         return -EAGAIN;
                 /* no ->get_acl() calls in RCU mode... */
                 if (is_uncached_acl(acl))
                         return -ECHILD;
                 return posix_acl_permission(mnt_userns, inode, acl, mask);
         }

         acl = get_acl(inode, ACL_TYPE_ACCESS);
         if (IS_ERR(acl))
                 return PTR_ERR(acl);
         if (!acl)
                 return -EAGAIN;

         rc = posix_acl_permission(mnt_userns, inode, acl, mask);
         posix_acl_release(acl);
         return rc;
 #else
         return -EAGAIN;
 #endif
 }
 309
 310 /**
 311  * acl_permission_check - perform basic UNIX permission checking
 312  * @mnt_userns: user namespace of the mount the inode was found from
 313  * @inode:      inode to check permissions on
 314  * @mask:       right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...)
 315  *
 316  * This function performs the basic UNIX permission checking. Since this
 317  * function may retrieve POSIX acls it needs to know whether it is called from a
 318  * blocking or non-blocking context and thus cares about the MAY_NOT_BLOCK bit.
 319  *
 320  * If the inode has been found through an idmapped mount the user namespace of
 321  * the vfsmount must be passed through @mnt_userns. This function will then take
 322  * care to map the inode according to @mnt_userns before checking permissions.
 323  * On non-idmapped mounts or if permission checking is to be performed on the
 324  * raw inode simply passs init_user_ns.
 325  */
 326 static int acl_permission_check(struct user_namespace *mnt_userns,
 327                                 struct inode *inode, int mask)
 328 {
 329         unsigned int mode = inode->i_mode;
 330         kuid_t i_uid;
 331
 332         /* Are we the owner? If so, ACL's don't matter */
 333         i_uid = i_uid_into_mnt(mnt_userns, inode);
 334         if (likely(uid_eq(current_fsuid(), i_uid))) {
 335                 mask &= 7;
 336                 mode >>= 6;
 337                 return (mask & ~mode) ? -EACCES : 0;
 338         }
 339
 340         /* Do we have ACL's? */
 341         if (IS_POSIXACL(inode) && (mode & S_IRWXG)) {
 342                 int error = check_acl(mnt_userns, inode, mask);
 343                 if (error != -EAGAIN)
 344                         return error;
 345         }
 346
 347         /* Only RWX matters for group/other mode bits */
 348         mask &= 7;
 349
 350         /*
 351          * Are the group permissions different from
 352          * the other permissions in the bits we care
 353          * about? Need to check group ownership if so.
 354          */
 355         if (mask & (mode ^ (mode >> 3))) {
 356                 kgid_t kgid = i_gid_into_mnt(mnt_userns, inode);
 357                 if (in_group_p(kgid))
 358                         mode >>= 3;
 359         }
 360
 361         /* Bits in 'mode' clear that we require? */
 362         return (mask & ~mode) ? -EACCES : 0;
 363 }
 364
 365 /**
 366  * generic_permission -  check for access rights on a Posix-like filesystem
 367  * @mnt_userns: user namespace of the mount the inode was found from
 368  * @inode:      inode to check access rights for
 369  * @mask:       right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC,
 370  *              %MAY_NOT_BLOCK ...)
 371  *
 372  * Used to check for read/write/execute permissions on a file.
 373  * We use "fsuid" for this, letting us set arbitrary permissions
 374  * for filesystem access without changing the "normal" uids which
 375  * are used for other things.
 376  *
 377  * generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk
 378  * request cannot be satisfied (eg. requires blocking or too much complexity).
 379  * It would then be called again in ref-walk mode.
 380  *
 381  * If the inode has been found through an idmapped mount the user namespace of
 382  * the vfsmount must be passed through @mnt_userns. This function will then take
 383  * care to map the inode according to @mnt_userns before checking permissions.
 384  * On non-idmapped mounts or if permission checking is to be performed on the
 385  * raw inode simply passs init_user_ns.
 386  */
 387 int generic_permission(struct user_namespace *mnt_userns, struct inode *inode,
 388                        int mask)
 389 {
 390         int ret;
 391
 392         /*
 393          * Do the basic permission checks.
 394          */
 395         ret = acl_permission_check(mnt_userns, inode, mask);
 396         if (ret != -EACCES)
 397                 return ret;
 398
 399         if (S_ISDIR(inode->i_mode)) {
 400                 /* DACs are overridable for directories */
 401                 if (!(mask & MAY_WRITE))
 402                         if (capable_wrt_inode_uidgid(mnt_userns, inode,
 403                                                      CAP_DAC_READ_SEARCH))
 404                                 return 0;
 405                 if (capable_wrt_inode_uidgid(mnt_userns, inode,
 406                                              CAP_DAC_OVERRIDE))
 407                         return 0;
 408                 return -EACCES;
 409         }
 410
 411         /*
 412          * Searching includes executable on directories, else just read.
 413          */
 414         mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
 415         if (mask == MAY_READ)
 416                 if (capable_wrt_inode_uidgid(mnt_userns, inode,
 417                                              CAP_DAC_READ_SEARCH))
 418                         return 0;
 419         /*
 420          * Read/write DACs are always overridable.
 421          * Executable DACs are overridable when there is
 422          * at least one exec bit set.
 423          */
 424         if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO))
 425                 if (capable_wrt_inode_uidgid(mnt_userns, inode,
 426                                              CAP_DAC_OVERRIDE))
 427                         return 0;
 428
 429         return -EACCES;
 430 }
 431 EXPORT_SYMBOL(generic_permission);
 432
 433 /**
 434  * do_inode_permission - UNIX permission checking
 435  * @mnt_userns: user namespace of the mount the inode was found from
 436  * @inode:      inode to check permissions on
 437  * @mask:       right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...)
 438  *
 439  * We _really_ want to just do "generic_permission()" without
 440  * even looking at the inode->i_op values. So we keep a cache
 441  * flag in inode->i_opflags, that says "this has not special
 442  * permission function, use the fast case".
 443  */
 444 static inline int do_inode_permission(struct user_namespace *mnt_userns,
 445                                       struct inode *inode, int mask)
 446 {
 447         if (unlikely(!(inode->i_opflags & IOP_FASTPERM))) {
 448                 if (likely(inode->i_op->permission))
 449                         return inode->i_op->permission(mnt_userns, inode, mask);
 450
 451                 /* This gets set once for the inode lifetime */
 452                 spin_lock(&inode->i_lock);
 453                 inode->i_opflags |= IOP_FASTPERM;
 454                 spin_unlock(&inode->i_lock);
 455         }
 456         return generic_permission(mnt_userns, inode, mask);
 457 }
 458
 459 /**
 460  * sb_permission - Check superblock-level permissions
 461  * @sb: Superblock of inode to check permission on
 462  * @inode: Inode to check permission on
 463  * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
 464  *
 465  * Separate out file-system wide checks from inode-specific permission checks.
 466  */
 467 static int sb_permission(struct super_block *sb, struct inode *inode, int mask)
 468 {
 469         if (unlikely(mask & MAY_WRITE)) {
 470                 umode_t mode = inode->i_mode;
 471
 472                 /* Nobody gets write access to a read-only fs. */
 473                 if (sb_rdonly(sb) && (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
 474                         return -EROFS;
 475         }
 476         return 0;
 477 }
 478
 479 /**
 480  * inode_permission - Check for access rights to a given inode
 481  * @mnt_userns: User namespace of the mount the inode was found from
 482  * @inode:      Inode to check permission on
 483  * @mask:       Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
 484  *
 485  * Check for read/write/execute permissions on an inode.  We use fs[ug]id for
 486  * this, letting us set arbitrary permissions for filesystem access without
 487  * changing the "normal" UIDs which are used for other things.
 488  *
 489  * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask.
 490  */
 491 int inode_permission(struct user_namespace *mnt_userns,
 492                      struct inode *inode, int mask)
 493 {
 494         int retval;
 495
 496         retval = sb_permission(inode->i_sb, inode, mask);
 497         if (retval)
 498                 return retval;
 499
 500         if (unlikely(mask & MAY_WRITE)) {
 501                 /*
 502                  * Nobody gets write access to an immutable file.
 503                  */
 504                 if (IS_IMMUTABLE(inode))
 505                         return -EPERM;
 506
 507                 /*
 508                  * Updating mtime will likely cause i_uid and i_gid to be
 509                  * written back improperly if their true value is unknown
 510                  * to the vfs.
 511                  */
 512                 if (HAS_UNMAPPED_ID(mnt_userns, inode))
 513                         return -EACCES;
 514         }
 515
 516         retval = do_inode_permission(mnt_userns, inode, mask);
 517         if (retval)
 518                 return retval;
 519
 520         retval = devcgroup_inode_permission(inode, mask);
 521         if (retval)
 522                 return retval;
 523
 524         return security_inode_permission(inode, mask);
 525 }
 526 EXPORT_SYMBOL(inode_permission);
 527
 528 /**
 529  * path_get - get a reference to a path
 530  * @path: path to get the reference to
 531  *
 532  * Given a path increment the reference count to the dentry and the vfsmount.
 533  */
 534 void path_get(const struct path *path)
 535 {
 536         mntget(path->mnt);
 537         dget(path->dentry);
 538 }
 539 EXPORT_SYMBOL(path_get);
 540
 541 /**
 542  * path_put - put a reference to a path
 543  * @path: path to put the reference to
 544  *
 545  * Given a path decrement the reference count to the dentry and the vfsmount.
 546  */
 547 void path_put(const struct path *path)
 548 {
 549         dput(path->dentry);
 550         mntput(path->mnt);
 551 }
 552 EXPORT_SYMBOL(path_put);
 553
 554 #define EMBEDDED_LEVELS 2
 555 struct nameidata {
 556         struct path     path;
 557         struct qstr     last;
 558         struct path     root;
 559         struct inode    *inode; /* path.dentry.d_inode */
 560         unsigned int    flags, state;
 561         unsigned        seq, m_seq, r_seq;
 562         int             last_type;
 563         unsigned        depth;
 564         int             total_link_count;
 565         struct saved {
 566                 struct path link;
 567                 struct delayed_call done;
 568                 const char *name;
 569                 unsigned seq;
 570         } *stack, internal[EMBEDDED_LEVELS];
 571         struct filename *name;
 572         struct nameidata *saved;
 573         unsigned        root_seq;
 574         int             dfd;
 575         kuid_t          dir_uid;
 576         umode_t         dir_mode;
 577 } __randomize_layout;
 578
 579 #define ND_ROOT_PRESET 1
 580 #define ND_ROOT_GRABBED 2
 581 #define ND_JUMPED 4
 582
 583 static void __set_nameidata(struct nameidata *p, int dfd, struct filename *name)
 584 {
 585         struct nameidata *old = current->nameidata;
 586         p->stack = p->internal;
 587         p->depth = 0;
 588         p->dfd = dfd;
 589         p->name = name;
 590         p->path.mnt = NULL;
 591         p->path.dentry = NULL;
 592         p->total_link_count = old ? old->total_link_count : 0;
 593         p->saved = old;
 594         current->nameidata = p;
 595 }
 596
 597 static inline void set_nameidata(struct nameidata *p, int dfd, struct filename *name,
 598                           const struct path *root)
 599 {
 600         __set_nameidata(p, dfd, name);
 601         p->state = 0;
 602         if (unlikely(root)) {
 603                 p->state = ND_ROOT_PRESET;
 604                 p->root = *root;
 605         }
 606 }
 607
 608 static void restore_nameidata(void)
 609 {
 610         struct nameidata *now = current->nameidata, *old = now->saved;
 611
 612         current->nameidata = old;
 613         if (old)
 614                 old->total_link_count = now->total_link_count;
 615         if (now->stack != now->internal)
 616                 kfree(now->stack);
 617 }
 618
 619 static bool nd_alloc_stack(struct nameidata *nd)
 620 {
 621         struct saved *p;
 622
 623         p= kmalloc_array(MAXSYMLINKS, sizeof(struct saved),
 624                          nd->flags & LOOKUP_RCU ? GFP_ATOMIC : GFP_KERNEL);
 625         if (unlikely(!p))
 626                 return false;
 627         memcpy(p, nd->internal, sizeof(nd->internal));
 628         nd->stack = p;
 629         return true;
 630 }
 631
 632 /**
 633  * path_connected - Verify that a dentry is below mnt.mnt_root
 634  *
 635  * Rename can sometimes move a file or directory outside of a bind
 636  * mount, path_connected allows those cases to be detected.
 637  */
 638 static bool path_connected(struct vfsmount *mnt, struct dentry *dentry)
 639 {
 640         struct super_block *sb = mnt->mnt_sb;
 641
 642         /* Bind mounts can have disconnected paths */
 643         if (mnt->mnt_root == sb->s_root)
 644                 return true;
 645
 646         return is_subdir(dentry, mnt->mnt_root);
 647 }
 648
 649 static void drop_links(struct nameidata *nd)
 650 {
 651         int i = nd->depth;
 652         while (i--) {
 653                 struct saved *last = nd->stack + i;
 654                 do_delayed_call(&last->done);
 655                 clear_delayed_call(&last->done);
 656         }
 657 }
 658
 659 static void terminate_walk(struct nameidata *nd)
 660 {
 661         drop_links(nd);
 662         if (!(nd->flags & LOOKUP_RCU)) {
 663                 int i;
 664                 path_put(&nd->path);
 665                 for (i = 0; i < nd->depth; i++)
 666                         path_put(&nd->stack[i].link);
 667                 if (nd->state & ND_ROOT_GRABBED) {
 668                         path_put(&nd->root);
 669                         nd->state &= ~ND_ROOT_GRABBED;
 670                 }
 671         } else {
 672                 nd->flags &= ~LOOKUP_RCU;
 673                 rcu_read_unlock();
 674         }
 675         nd->depth = 0;
 676         nd->path.mnt = NULL;
 677         nd->path.dentry = NULL;
 678 }
 679
 680 /* path_put is needed afterwards regardless of success or failure */
 681 static bool __legitimize_path(struct path *path, unsigned seq, unsigned mseq)
 682 {
 683         int res = __legitimize_mnt(path->mnt, mseq);
 684         if (unlikely(res)) {
 685                 if (res > 0)
 686                         path->mnt = NULL;
 687                 path->dentry = NULL;
 688                 return false;
 689         }
 690         if (unlikely(!lockref_get_not_dead(&path->dentry->d_lockref))) {
 691                 path->dentry = NULL;
 692                 return false;
 693         }
 694         return !read_seqcount_retry(&path->dentry->d_seq, seq);
 695 }
 696
 697 static inline bool legitimize_path(struct nameidata *nd,
 698                             struct path *path, unsigned seq)
 699 {
 700         return __legitimize_path(path, seq, nd->m_seq);
 701 }
 702
 703 static bool legitimize_links(struct nameidata *nd)
 704 {
 705         int i;
 706         if (unlikely(nd->flags & LOOKUP_CACHED)) {
 707                 drop_links(nd);
 708                 nd->depth = 0;
 709                 return false;
 710         }
 711         for (i = 0; i < nd->depth; i++) {
 712                 struct saved *last = nd->stack + i;
 713                 if (unlikely(!legitimize_path(nd, &last->link, last->seq))) {
 714                         drop_links(nd);
 715                         nd->depth = i + 1;
 716                         return false;
 717                 }
 718         }
 719         return true;
 720 }
 721
 722 static bool legitimize_root(struct nameidata *nd)
 723 {
 724         /*
 725          * For scoped-lookups (where nd->root has been zeroed), we need to
 726          * restart the whole lookup from scratch -- because set_root() is wrong
 727          * for these lookups (nd->dfd is the root, not the filesystem root).
 728          */
 729         if (!nd->root.mnt && (nd->flags & LOOKUP_IS_SCOPED))
 730                 return false;
 731         /* Nothing to do if nd->root is zero or is managed by the VFS user. */
 732         if (!nd->root.mnt || (nd->state & ND_ROOT_PRESET))
 733                 return true;
 734         nd->state |= ND_ROOT_GRABBED;
 735         return legitimize_path(nd, &nd->root, nd->root_seq);
 736 }
 737
 738 /*
 739  * Path walking has 2 modes, rcu-walk and ref-walk (see
 740  * Documentation/filesystems/path-lookup.txt).  In situations when we can't
 741  * continue in RCU mode, we attempt to drop out of rcu-walk mode and grab
 742  * normal reference counts on dentries and vfsmounts to transition to ref-walk
 743  * mode.  Refcounts are grabbed at the last known good point before rcu-walk
 744  * got stuck, so ref-walk may continue from there. If this is not successful
 745  * (eg. a seqcount has changed), then failure is returned and it's up to caller
 746  * to restart the path walk from the beginning in ref-walk mode.
 747  */
 748
 749 /**
 750  * try_to_unlazy - try to switch to ref-walk mode.
 751  * @nd: nameidata pathwalk data
 752  * Returns: true on success, false on failure
 753  *
 754  * try_to_unlazy attempts to legitimize the current nd->path and nd->root
 755  * for ref-walk mode.
 756  * Must be called from rcu-walk context.
 757  * Nothing should touch nameidata between try_to_unlazy() failure and
 758  * terminate_walk().
 759  */
 760 static bool try_to_unlazy(struct nameidata *nd)
 761 {
 762         struct dentry *parent = nd->path.dentry;
 763
 764         BUG_ON(!(nd->flags & LOOKUP_RCU));
 765
 766         nd->flags &= ~LOOKUP_RCU;
 767         if (unlikely(!legitimize_links(nd)))
 768                 goto out1;
 769         if (unlikely(!legitimize_path(nd, &nd->path, nd->seq)))
 770                 goto out;
 771         if (unlikely(!legitimize_root(nd)))
 772                 goto out;
 773         rcu_read_unlock();
 774         BUG_ON(nd->inode != parent->d_inode);
 775         return true;
 776
 777 out1:
 778         nd->path.mnt = NULL;
 779         nd->path.dentry = NULL;
 780 out:
 781         rcu_read_unlock();
 782         return false;
 783 }
 784
 785 /**
 786  * try_to_unlazy_next - try to switch to ref-walk mode.
 787  * @nd: nameidata pathwalk data
 788  * @dentry: next dentry to step into
 789  * @seq: seq number to check @dentry against
 790  * Returns: true on success, false on failure
 791  *
 792  * Similar to to try_to_unlazy(), but here we have the next dentry already
 793  * picked by rcu-walk and want to legitimize that in addition to the current
 794  * nd->path and nd->root for ref-walk mode.  Must be called from rcu-walk context.
 795  * Nothing should touch nameidata between try_to_unlazy_next() failure and
 796  * terminate_walk().
 797  */
 798 static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry, unsigned seq)
 799 {
 800         BUG_ON(!(nd->flags & LOOKUP_RCU));
 801
 802         nd->flags &= ~LOOKUP_RCU;
 803         if (unlikely(!legitimize_links(nd)))
 804                 goto out2;
 805         if (unlikely(!legitimize_mnt(nd->path.mnt, nd->m_seq)))
 806                 goto out2;
 807         if (unlikely(!lockref_get_not_dead(&nd->path.dentry->d_lockref)))
 808                 goto out1;
 809
 810         /*
 811          * We need to move both the parent and the dentry from the RCU domain
 812          * to be properly refcounted. And the sequence number in the dentry
 813          * validates *both* dentry counters, since we checked the sequence
 814          * number of the parent after we got the child sequence number. So we
 815          * know the parent must still be valid if the child sequence number is
 816          */
 817         if (unlikely(!lockref_get_not_dead(&dentry->d_lockref)))
 818                 goto out;
 819         if (unlikely(read_seqcount_retry(&dentry->d_seq, seq)))
 820                 goto out_dput;
 821         /*
 822          * Sequence counts matched. Now make sure that the root is
 823          * still valid and get it if required.
 824          */
 825         if (unlikely(!legitimize_root(nd)))
 826                 goto out_dput;
 827         rcu_read_unlock();
 828         return true;
 829
 830 out2:
 831         nd->path.mnt = NULL;
 832 out1:
 833         nd->path.dentry = NULL;
 834 out:
 835         rcu_read_unlock();
 836         return false;
 837 out_dput:
 838         rcu_read_unlock();
 839         dput(dentry);
 840         return false;
 841 }
 842
 843 static inline int d_revalidate(struct dentry *dentry, unsigned int flags)
 844 {
 845         if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE))
 846                 return dentry->d_op->d_revalidate(dentry, flags);
 847         else
 848                 return 1;
 849 }
 850
 851 /**
 852  * complete_walk - successful completion of path walk
 853  * @nd:  pointer nameidata
 854  *
 855  * If we had been in RCU mode, drop out of it and legitimize nd->path.
 856  * Revalidate the final result, unless we'd already done that during
 857  * the path walk or the filesystem doesn't ask for it.  Return 0 on
 858  * success, -error on failure.  In case of failure caller does not
 859  * need to drop nd->path.
 860  */
 861 static int complete_walk(struct nameidata *nd)
 862 {
 863         struct dentry *dentry = nd->path.dentry;
 864         int status;
 865
 866         if (nd->flags & LOOKUP_RCU) {
 867                 /*
 868                  * We don't want to zero nd->root for scoped-lookups or
 869                  * externally-managed nd->root.
 870                  */
 871                 if (!(nd->state & ND_ROOT_PRESET))
 872                         if (!(nd->flags & LOOKUP_IS_SCOPED))
 873                                 nd->root.mnt = NULL;
 874                 nd->flags &= ~LOOKUP_CACHED;
 875                 if (!try_to_unlazy(nd))
 876                         return -ECHILD;
 877         }
 878
 879         if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) {
 880                 /*
 881                  * While the guarantee of LOOKUP_IS_SCOPED is (roughly) "don't
 882                  * ever step outside the root during lookup" and should already
 883                  * be guaranteed by the rest of namei, we want to avoid a namei
 884                  * BUG resulting in userspace being given a path that was not
 885                  * scoped within the root at some point during the lookup.
 886                  *
 887                  * So, do a final sanity-check to make sure that in the
 888                  * worst-case scenario (a complete bypass of LOOKUP_IS_SCOPED)
 889                  * we won't silently return an fd completely outside of the
 890                  * requested root to userspace.
 891                  *
 892                  * Userspace could move the path outside the root after this
 893                  * check, but as discussed elsewhere this is not a concern (the
 894                  * resolved file was inside the root at some point).
 895                  */
 896                 if (!path_is_under(&nd->path, &nd->root))
 897                         return -EXDEV;
 898         }
 899
 900         if (likely(!(nd->state & ND_JUMPED)))
 901                 return 0;
 902
 903         if (likely(!(dentry->d_flags & DCACHE_OP_WEAK_REVALIDATE)))
 904                 return 0;
 905
 906         status = dentry->d_op->d_weak_revalidate(dentry, nd->flags);
 907         if (status > 0)
 908                 return 0;
 909
 910         if (!status)
 911                 status = -ESTALE;
 912
 913         return status;
 914 }
 915
 916 static int set_root(struct nameidata *nd)
 917 {
 918         struct fs_struct *fs = current->fs;
 919
 920         /*
 921          * Jumping to the real root in a scoped-lookup is a BUG in namei, but we
 922          * still have to ensure it doesn't happen because it will cause a breakout
 923          * from the dirfd.
 924          */
 925         if (WARN_ON(nd->flags & LOOKUP_IS_SCOPED))
 926                 return -ENOTRECOVERABLE;
 927
 928         if (nd->flags & LOOKUP_RCU) {
 929                 unsigned seq;
 930
 931                 do {
 932                         seq = read_seqcount_begin(&fs->seq);
 933                         nd->root = fs->root;
 934                         nd->root_seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
 935                 } while (read_seqcount_retry(&fs->seq, seq));
 936         } else {
 937                 get_fs_root(fs, &nd->root);
 938                 nd->state |= ND_ROOT_GRABBED;
 939         }
 940         return 0;
 941 }
 942
 943 static int nd_jump_root(struct nameidata *nd)
 944 {
 945         if (unlikely(nd->flags & LOOKUP_BENEATH))
 946                 return -EXDEV;
 947         if (unlikely(nd->flags & LOOKUP_NO_XDEV)) {
 948                 /* Absolute path arguments to path_init() are allowed. */
 949                 if (nd->path.mnt != NULL && nd->path.mnt != nd->root.mnt)
 950                         return -EXDEV;
 951         }
 952         if (!nd->root.mnt) {
 953                 int error = set_root(nd);
 954                 if (error)
 955                         return error;
 956         }
 957         if (nd->flags & LOOKUP_RCU) {
 958                 struct dentry *d;
 959                 nd->path = nd->root;
 960                 d = nd->path.dentry;
 961                 nd->inode = d->d_inode;
 962                 nd->seq = nd->root_seq;
 963                 if (unlikely(read_seqcount_retry(&d->d_seq, nd->seq)))
 964                         return -ECHILD;
 965         } else {
 966                 path_put(&nd->path);
 967                 nd->path = nd->root;
 968                 path_get(&nd->path);
 969                 nd->inode = nd->path.dentry->d_inode;
 970         }
 971         nd->state |= ND_JUMPED;
 972         return 0;
 973 }
 974
 975 /*
 976  * Helper to directly jump to a known parsed path from ->get_link,
 977  * caller must have taken a reference to path beforehand.
 978  */
 979 int nd_jump_link(struct path *path)
 980 {
 981         int error = -ELOOP;
 982         struct nameidata *nd = current->nameidata;
 983
 984         if (unlikely(nd->flags & LOOKUP_NO_MAGICLINKS))
 985                 goto err;
 986
 987         error = -EXDEV;
 988         if (unlikely(nd->flags & LOOKUP_NO_XDEV)) {
 989                 if (nd->path.mnt != path->mnt)
 990                         goto err;
 991         }
 992         /* Not currently safe for scoped-lookups. */
 993         if (unlikely(nd->flags & LOOKUP_IS_SCOPED))
 994                 goto err;
 995
 996         path_put(&nd->path);
 997         nd->path = *path;
 998         nd->inode = nd->path.dentry->d_inode;
 999         nd->state |= ND_JUMPED;
1000         return 0;
1001
1002 err:
1003         path_put(path);
1004         return error;
1005 }
1006
1007 static inline void put_link(struct nameidata *nd)
1008 {
1009         struct saved *last = nd->stack + --nd->depth;
1010         do_delayed_call(&last->done);
1011         if (!(nd->flags & LOOKUP_RCU))
1012                 path_put(&last->link);
1013 }
1014
1015 int sysctl_protected_symlinks __read_mostly = 0;
1016 int sysctl_protected_hardlinks __read_mostly = 0;
1017 int sysctl_protected_fifos __read_mostly;
1018 int sysctl_protected_regular __read_mostly;
1019
1020 /**
1021  * may_follow_link - Check symlink following for unsafe situations
1022  * @nd: nameidata pathwalk data
1023  *
1024  * In the case of the sysctl_protected_symlinks sysctl being enabled,
1025  * CAP_DAC_OVERRIDE needs to be specifically ignored if the symlink is
1026  * in a sticky world-writable directory. This is to protect privileged
1027  * processes from failing races against path names that may change out
1028  * from under them by way of other users creating malicious symlinks.
1029  * It will permit symlinks to be followed only when outside a sticky
1030  * world-writable directory, or when the uid of the symlink and follower
1031  * match, or when the directory owner matches the symlink's owner.
1032  *
1033  * Returns 0 if following the symlink is allowed, -ve on error.
1034  */
1035 static inline int may_follow_link(struct nameidata *nd, const struct inode *inode)
1036 {
1037         struct user_namespace *mnt_userns;
1038         kuid_t i_uid;
1039
1040         if (!sysctl_protected_symlinks)
1041                 return 0;
1042
1043         mnt_userns = mnt_user_ns(nd->path.mnt);
1044         i_uid = i_uid_into_mnt(mnt_userns, inode);
1045         /* Allowed if owner and follower match. */
1046         if (uid_eq(current_cred()->fsuid, i_uid))
1047                 return 0;
1048
1049         /* Allowed if parent directory not sticky and world-writable. */
1050         if ((nd->dir_mode & (S_ISVTX|S_IWOTH)) != (S_ISVTX|S_IWOTH))
1051                 return 0;
1052
1053         /* Allowed if parent directory and link owner match. */
1054         if (uid_valid(nd->dir_uid) && uid_eq(nd->dir_uid, i_uid))
1055                 return 0;
1056
1057         if (nd->flags & LOOKUP_RCU)
1058                 return -ECHILD;
1059
1060         audit_inode(nd->name, nd->stack[0].link.dentry, 0);
1061         audit_log_path_denied(AUDIT_ANOM_LINK, "follow_link");
1062         return -EACCES;
1063 }
1064
1065 /**
1066  * safe_hardlink_source - Check for safe hardlink conditions
1067  * @mnt_userns: user namespace of the mount the inode was found from
1068  * @inode: the source inode to hardlink from
1069  *
1070  * Return false if at least one of the following conditions:
1071  *    - inode is not a regular file
1072  *    - inode is setuid
1073  *    - inode is setgid and group-exec
1074  *    - access failure for read and write
1075  *
1076  * Otherwise returns true.
1077  */
1078 static bool safe_hardlink_source(struct user_namespace *mnt_userns,
1079                                  struct inode *inode)
1080 {
1081         umode_t mode = inode->i_mode;
1082
1083         /* Special files should not get pinned to the filesystem. */
1084         if (!S_ISREG(mode))
1085                 return false;
1086
1087         /* Setuid files should not get pinned to the filesystem. */
1088         if (mode & S_ISUID)
1089                 return false;
1090
1091         /* Executable setgid files should not get pinned to the filesystem. */
1092         if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
1093                 return false;
1094
1095         /* Hardlinking to unreadable or unwritable sources is dangerous. */
1096         if (inode_permission(mnt_userns, inode, MAY_READ | MAY_WRITE))
1097                 return false;
1098
1099         return true;
1100 }
1101
1102 /**
1103  * may_linkat - Check permissions for creating a hardlink
1104  * @mnt_userns: user namespace of the mount the inode was found from
1105  * @link: the source to hardlink from
1106  *
1107  * Block hardlink when all of:
1108  *  - sysctl_protected_hardlinks enabled
1109  *  - fsuid does not match inode
1110  *  - hardlink source is unsafe (see safe_hardlink_source() above)
1111  *  - not CAP_FOWNER in a namespace with the inode owner uid mapped
1112  *
1113  * If the inode has been found through an idmapped mount the user namespace of
1114  * the vfsmount must be passed through @mnt_userns. This function will then take
1115  * care to map the inode according to @mnt_userns before checking permissions.
1116  * On non-idmapped mounts or if permission checking is to be performed on the
1117  * raw inode simply passs init_user_ns.
1118  *
1119  * Returns 0 if successful, -ve on error.
1120  */
1121 int may_linkat(struct user_namespace *mnt_userns, struct path *link)
1122 {
1123         struct inode *inode = link->dentry->d_inode;
1124
1125         /* Inode writeback is not safe when the uid or gid are invalid. */
1126         if (!uid_valid(i_uid_into_mnt(mnt_userns, inode)) ||
1127             !gid_valid(i_gid_into_mnt(mnt_userns, inode)))
1128                 return -EOVERFLOW;
1129
1130         if (!sysctl_protected_hardlinks)
1131                 return 0;
1132
1133         /* Source inode owner (or CAP_FOWNER) can hardlink all they like,
1134          * otherwise, it must be a safe source.
1135          */
1136         if (safe_hardlink_source(mnt_userns, inode) ||
1137             inode_owner_or_capable(mnt_userns, inode))
1138                 return 0;
1139
1140         audit_log_path_denied(AUDIT_ANOM_LINK, "linkat");
1141         return -EPERM;
1142 }
1143
1144 /**
1145  * may_create_in_sticky - Check whether an O_CREAT open in a sticky directory
1146  *                        should be allowed, or not, on files that already
1147  *                        exist.
1148  * @mnt_userns: user namespace of the mount the inode was found from
1149  * @nd: nameidata pathwalk data
1150  * @inode: the inode of the file to open
1151  *
1152  * Block an O_CREAT open of a FIFO (or a regular file) when:
1153  *   - sysctl_protected_fifos (or sysctl_protected_regular) is enabled
1154  *   - the file already exists
1155  *   - we are in a sticky directory
1156  *   - we don't own the file
1157  *   - the owner of the directory doesn't own the file
1158  *   - the directory is world writable
1159  * If the sysctl_protected_fifos (or sysctl_protected_regular) is set to 2
1160  * the directory doesn't have to be world writable: being group writable will
1161  * be enough.
1162  *
1163  * If the inode has been found through an idmapped mount the user namespace of
1164  * the vfsmount must be passed through @mnt_userns. This function will then take
1165  * care to map the inode according to @mnt_userns before checking permissions.
1166  * On non-idmapped mounts or if permission checking is to be performed on the
1167  * raw inode simply passs init_user_ns.
1168  *
1169  * Returns 0 if the open is allowed, -ve on error.
1170  */
1171 static int may_create_in_sticky(struct user_namespace *mnt_userns,
1172                                 struct nameidata *nd, struct inode *const inode)
1173 {
1174         umode_t dir_mode = nd->dir_mode;
1175         kuid_t dir_uid = nd->dir_uid;
1176
1177         if ((!sysctl_protected_fifos && S_ISFIFO(inode->i_mode)) ||
1178             (!sysctl_protected_regular && S_ISREG(inode->i_mode)) ||
1179             likely(!(dir_mode & S_ISVTX)) ||
1180             uid_eq(i_uid_into_mnt(mnt_userns, inode), dir_uid) ||
1181             uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode)))
1182                 return 0;
1183
1184         if (likely(dir_mode & 0002) ||
1185             (dir_mode & 0020 &&
1186              ((sysctl_protected_fifos >= 2 && S_ISFIFO(inode->i_mode)) ||
1187               (sysctl_protected_regular >= 2 && S_ISREG(inode->i_mode))))) {
1188                 const char *operation = S_ISFIFO(inode->i_mode) ?
1189                                         "sticky_create_fifo" :
1190                                         "sticky_create_regular";
1191                 audit_log_path_denied(AUDIT_ANOM_CREAT, operation);
1192                 return -EACCES;
1193         }
1194         return 0;
1195 }
1196
1197 /*
1198  * follow_up - Find the mountpoint of path's vfsmount
1199  *
1200  * Given a path, find the mountpoint of its source file system.
1201  * Replace @path with the path of the mountpoint in the parent mount.
1202  * Up is towards /.
1203  *
1204  * Return 1 if we went up a level and 0 if we were already at the
1205  * root.
1206  */
1207 int follow_up(struct path *path)
1208 {
1209         struct mount *mnt = real_mount(path->mnt);
1210         struct mount *parent;
1211         struct dentry *mountpoint;
1212
1213         read_seqlock_excl(&mount_lock);
1214         parent = mnt->mnt_parent;
1215         if (parent == mnt) {
1216                 read_sequnlock_excl(&mount_lock);
1217                 return 0;
1218         }
1219         mntget(&parent->mnt);
1220         mountpoint = dget(mnt->mnt_mountpoint);
1221         read_sequnlock_excl(&mount_lock);
1222         dput(path->dentry);
1223         path->dentry = mountpoint;
1224         mntput(path->mnt);
1225         path->mnt = &parent->mnt;
1226         return 1;
1227 }
1228 EXPORT_SYMBOL(follow_up);
1229
1230 static bool choose_mountpoint_rcu(struct mount *m, const struct path *root,
1231                                   struct path *path, unsigned *seqp)
1232 {
1233         while (mnt_has_parent(m)) {
1234                 struct dentry *mountpoint = m->mnt_mountpoint;
1235
1236                 m = m->mnt_parent;
1237                 if (unlikely(root->dentry == mountpoint &&
1238                              root->mnt == &m->mnt))
1239                         break;
1240                 if (mountpoint != m->mnt.mnt_root) {
1241                         path->mnt = &m->mnt;
1242                         path->dentry = mountpoint;
1243                         *seqp = read_seqcount_begin(&mountpoint->d_seq);
1244                         return true;
1245                 }
1246         }
1247         return false;
1248 }
1249
1250 static bool choose_mountpoint(struct mount *m, const struct path *root,
1251                               struct path *path)
1252 {
1253         bool found;
1254
1255         rcu_read_lock();
1256         while (1) {
1257                 unsigned seq, mseq = read_seqbegin(&mount_lock);
1258
1259                 found = choose_mountpoint_rcu(m, root, path, &seq);
1260                 if (unlikely(!found)) {
1261                         if (!read_seqretry(&mount_lock, mseq))
1262                                 break;
1263                 } else {
1264                         if (likely(__legitimize_path(path, seq, mseq)))
1265                                 break;
1266                         rcu_read_unlock();
1267                         path_put(path);
1268                         rcu_read_lock();
1269                 }
1270         }
1271         rcu_read_unlock();
1272         return found;
1273 }
1274
1275 /*
1276  * Perform an automount
1277  * - return -EISDIR to tell follow_managed() to stop and return the path we
1278  *   were called with.
1279  */
1280 static int follow_automount(struct path *path, int *count, unsigned lookup_flags)
1281 {
1282         struct dentry *dentry = path->dentry;
1283
1284         /* We don't want to mount if someone's just doing a stat -
1285          * unless they're stat'ing a directory and appended a '/' to
1286          * the name.
1287          *
1288          * We do, however, want to mount if someone wants to open or
1289          * create a file of any type under the mountpoint, wants to
1290          * traverse through the mountpoint or wants to open the
1291          * mounted directory.  Also, autofs may mark negative dentries
1292          * as being automount points.  These will need the attentions
1293          * of the daemon to instantiate them before they can be used.
1294          */
1295         if (!(lookup_flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY |
1296                            LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) &&
1297             dentry->d_inode)
1298                 return -EISDIR;
1299
1300         if (count && (*count)++ >= MAXSYMLINKS)
1301                 return -ELOOP;
1302
1303         return finish_automount(dentry->d_op->d_automount(path), path);
1304 }
1305
1306 /*
1307  * mount traversal - out-of-line part.  One note on ->d_flags accesses -
1308  * dentries are pinned but not locked here, so negative dentry can go
1309  * positive right under us.  Use of smp_load_acquire() provides a barrier
1310  * sufficient for ->d_inode and ->d_flags consistency.
1311  */
1312 static int __traverse_mounts(struct path *path, unsigned flags, bool *jumped,
1313                              int *count, unsigned lookup_flags)
1314 {
1315         struct vfsmount *mnt = path->mnt;
1316         bool need_mntput = false;
1317         int ret = 0;
1318
1319         while (flags & DCACHE_MANAGED_DENTRY) {
1320                 /* Allow the filesystem to manage the transit without i_mutex
1321                  * being held. */
1322                 if (flags & DCACHE_MANAGE_TRANSIT) {
1323                         ret = path->dentry->d_op->d_manage(path, false);
1324                         flags = smp_load_acquire(&path->dentry->d_flags);
1325                         if (ret < 0)
1326                                 break;
1327                 }
1328
1329                 if (flags & DCACHE_MOUNTED) {   // something's mounted on it..
1330                         struct vfsmount *mounted = lookup_mnt(path);
1331                         if (mounted) {          // ... in our namespace
1332                                 dput(path->dentry);
1333                                 if (need_mntput)
1334                                         mntput(path->mnt);
1335                                 path->mnt = mounted;
1336                                 path->dentry = dget(mounted->mnt_root);
1337                                 // here we know it's positive
1338                                 flags = path->dentry->d_flags;
1339                                 need_mntput = true;
1340                                 continue;
1341                         }
1342                 }
1343
1344                 if (!(flags & DCACHE_NEED_AUTOMOUNT))
1345                         break;
1346
1347                 // uncovered automount point
1348                 ret = follow_automount(path, count, lookup_flags);
1349                 flags = smp_load_acquire(&path->dentry->d_flags);
1350                 if (ret < 0)
1351                         break;
1352         }
1353
1354         if (ret == -EISDIR)
1355                 ret = 0;
1356         // possible if you race with several mount --move
1357         if (need_mntput && path->mnt == mnt)
1358                 mntput(path->mnt);
1359         if (!ret && unlikely(d_flags_negative(flags)))
1360                 ret = -ENOENT;
1361         *jumped = need_mntput;
1362         return ret;
1363 }
1364
1365 static inline int traverse_mounts(struct path *path, bool *jumped,
1366                                   int *count, unsigned lookup_flags)
1367 {
1368         unsigned flags = smp_load_acquire(&path->dentry->d_flags);
1369
1370         /* fastpath */
1371         if (likely(!(flags & DCACHE_MANAGED_DENTRY))) {
1372                 *jumped = false;
1373                 if (unlikely(d_flags_negative(flags)))
1374                         return -ENOENT;
1375                 return 0;
1376         }
1377         return __traverse_mounts(path, flags, jumped, count, lookup_flags);
1378 }
1379
1380 int follow_down_one(struct path *path)
1381 {
1382         struct vfsmount *mounted;
1383
1384         mounted = lookup_mnt(path);
1385         if (mounted) {
1386                 dput(path->dentry);
1387                 mntput(path->mnt);
1388                 path->mnt = mounted;
1389                 path->dentry = dget(mounted->mnt_root);
1390                 return 1;
1391         }
1392         return 0;
1393 }
1394 EXPORT_SYMBOL(follow_down_one);
1395
1396 /*
1397  * Follow down to the covering mount currently visible to userspace.  At each
1398  * point, the filesystem owning that dentry may be queried as to whether the
1399  * caller is permitted to proceed or not.
1400  */
1401 int follow_down(struct path *path)
1402 {
1403         struct vfsmount *mnt = path->mnt;
1404         bool jumped;
1405         int ret = traverse_mounts(path, &jumped, NULL, 0);
1406
1407         if (path->mnt != mnt)
1408                 mntput(mnt);
1409         return ret;
1410 }
1411 EXPORT_SYMBOL(follow_down);
1412
1413 /*
1414  * Try to skip to top of mountpoint pile in rcuwalk mode.  Fail if
1415  * we meet a managed dentry that would need blocking.
1416  */
1417 static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
1418                                struct inode **inode, unsigned *seqp)
1419 {
1420         struct dentry *dentry = path->dentry;
1421         unsigned int flags = dentry->d_flags;
1422
1423         if (likely(!(flags & DCACHE_MANAGED_DENTRY)))
1424                 return true;
1425
1426         if (unlikely(nd->flags & LOOKUP_NO_XDEV))
1427                 return false;
1428
1429         for (;;) {
1430                 /*
1431                  * Don't forget we might have a non-mountpoint managed dentry
1432                  * that wants to block transit.
1433                  */
1434                 if (unlikely(flags & DCACHE_MANAGE_TRANSIT)) {
1435                         int res = dentry->d_op->d_manage(path, true);
1436                         if (res)
1437                                 return res == -EISDIR;
1438                         flags = dentry->d_flags;
1439                 }
1440
1441                 if (flags & DCACHE_MOUNTED) {
1442                         struct mount *mounted = __lookup_mnt(path->mnt, dentry);
1443                         if (mounted) {
1444                                 path->mnt = &mounted->mnt;
1445                                 dentry = path->dentry = mounted->mnt.mnt_root;
1446                                 nd->state |= ND_JUMPED;
1447                                 *seqp = read_seqcount_begin(&dentry->d_seq);
1448                                 *inode = dentry->d_inode;
1449                                 /*
1450                                  * We don't need to re-check ->d_seq after this
1451                                  * ->d_inode read - there will be an RCU delay
1452                                  * between mount hash removal and ->mnt_root
1453                                  * becoming unpinned.
1454                                  */
1455                                 flags = dentry->d_flags;
1456                                 continue;
1457                         }
1458                         if (read_seqretry(&mount_lock, nd->m_seq))
1459                                 return false;
1460                 }
1461                 return !(flags & DCACHE_NEED_AUTOMOUNT);
1462         }
1463 }
1464
1465 static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry,
1466                           struct path *path, struct inode **inode,
1467                           unsigned int *seqp)
1468 {
1469         bool jumped;
1470         int ret;
1471
1472         path->mnt = nd->path.mnt;
1473         path->dentry = dentry;
1474         if (nd->flags & LOOKUP_RCU) {
1475                 unsigned int seq = *seqp;
1476                 if (unlikely(!*inode))
1477                         return -ENOENT;
1478                 if (likely(__follow_mount_rcu(nd, path, inode, seqp)))
1479                         return 0;
1480                 if (!try_to_unlazy_next(nd, dentry, seq))
1481                         return -ECHILD;
1482                 // *path might've been clobbered by __follow_mount_rcu()
1483                 path->mnt = nd->path.mnt;
1484                 path->dentry = dentry;
1485         }
1486         ret = traverse_mounts(path, &jumped, &nd->total_link_count, nd->flags);
1487         if (jumped) {
1488                 if (unlikely(nd->flags & LOOKUP_NO_XDEV))
1489                         ret = -EXDEV;
1490                 else
1491                         nd->state |= ND_JUMPED;
1492         }
1493         if (unlikely(ret)) {
1494                 dput(path->dentry);
1495                 if (path->mnt != nd->path.mnt)
1496                         mntput(path->mnt);
1497         } else {
1498                 *inode = d_backing_inode(path->dentry);
1499                 *seqp = 0; /* out of RCU mode, so the value doesn't matter */
1500         }
1501         return ret;
1502 }
1503
1504 /*
1505  * This looks up the name in dcache and possibly revalidates the found dentry.
1506  * NULL is returned if the dentry does not exist in the cache.
1507  */
1508 static struct dentry *lookup_dcache(const struct qstr *name,
1509                                     struct dentry *dir,
1510                                     unsigned int flags)
1511 {
1512         struct dentry *dentry = d_lookup(dir, name);
1513         if (dentry) {
1514                 int error = d_revalidate(dentry, flags);
1515                 if (unlikely(error <= 0)) {
1516                         if (!error)
1517                                 d_invalidate(dentry);
1518                         dput(dentry);
1519                         return ERR_PTR(error);
1520                 }
1521         }
1522         return dentry;
1523 }
1524
1525 /*
1526  * Parent directory has inode locked exclusive.  This is one
1527  * and only case when ->lookup() gets called on non in-lookup
1528  * dentries - as the matter of fact, this only gets called
1529  * when directory is guaranteed to have no in-lookup children
1530  * at all.
1531  */
1532 static struct dentry *__lookup_hash(const struct qstr *name,
1533                 struct dentry *base, unsigned int flags)
1534 {
1535         struct dentry *dentry = lookup_dcache(name, base, flags);
1536         struct dentry *old;
1537         struct inode *dir = base->d_inode;
1538
1539         if (dentry)
1540                 return dentry;
1541
1542         /* Don't create child dentry for a dead directory. */
1543         if (unlikely(IS_DEADDIR(dir)))
1544                 return ERR_PTR(-ENOENT);
1545
1546         dentry = d_alloc(base, name);
1547         if (unlikely(!dentry))
1548                 return ERR_PTR(-ENOMEM);
1549
1550         old = dir->i_op->lookup(dir, dentry, flags);
1551         if (unlikely(old)) {
1552                 dput(dentry);
1553                 dentry = old;
1554         }
1555         return dentry;
1556 }
1557
/*
 * Try to find the child named nd->last of nd->path.dentry using only the
 * dcache.
 *
 * Returns the dentry on success, NULL when nothing usable was found and the
 * caller should fall back to the slow path, or an ERR_PTR() on failure
 * (-ECHILD whenever the RCU walk cannot be validated or left).
 *
 * In RCU mode *inode and *seqp are filled in for the dentry that was found;
 * the non-RCU branch does not write to them.
 */
static struct dentry *lookup_fast(struct nameidata *nd,
                                  struct inode **inode,
                                  unsigned *seqp)
{
        struct dentry *dentry, *parent = nd->path.dentry;
        int status = 1;

        /*
         * Rename seqlock is not required here because in the off chance
         * of a false negative due to a concurrent rename, the caller is
         * going to fall back to non-racy lookup.
         */
        if (nd->flags & LOOKUP_RCU) {
                unsigned seq;
                dentry = __d_lookup_rcu(parent, &nd->last, &seq);
                if (unlikely(!dentry)) {
                        /*
                         * Not in the dcache: leave RCU mode so that the
                         * caller's fallback can run; if that is not
                         * possible the whole walk has to be retried.
                         */
                        if (!try_to_unlazy(nd))
                                return ERR_PTR(-ECHILD);
                        return NULL;
                }

                /*
                 * This sequence count validates that the inode matches
                 * the dentry name information from lookup.
                 */
                *inode = d_backing_inode(dentry);
                if (unlikely(read_seqcount_retry(&dentry->d_seq, seq)))
                        return ERR_PTR(-ECHILD);

                /*
                 * This sequence count validates that the parent had no
                 * changes while we did the lookup of the dentry above.
                 *
                 * The memory barrier in read_seqcount_begin of child is
                 *  enough, we can use __read_seqcount_retry here.
                 */
                if (unlikely(__read_seqcount_retry(&parent->d_seq, nd->seq)))
                        return ERR_PTR(-ECHILD);

                *seqp = seq;
                status = d_revalidate(dentry, nd->flags);
                if (likely(status > 0))
                        return dentry;
                /*
                 * Revalidation did not succeed in RCU mode; leave RCU mode
                 * together with the child dentry before deciding what to do.
                 */
                if (!try_to_unlazy_next(nd, dentry, seq))
                        return ERR_PTR(-ECHILD);
                if (status == -ECHILD)
                        /* we'd been told to redo it in non-rcu mode */
                        status = d_revalidate(dentry, nd->flags);
        } else {
                dentry = __d_lookup(parent, &nd->last);
                if (unlikely(!dentry))
                        return NULL;
                status = d_revalidate(dentry, nd->flags);
        }
        if (unlikely(status <= 0)) {
                /* status == 0: the dentry is stale, kick it out of the dcache */
                if (!status)
                        d_invalidate(dentry);
                dput(dentry);
                /*
                 * NOTE(review): for status == 0 this is ERR_PTR(0), which with
                 * the usual ERR_PTR() definition is NULL, i.e. "fall back to
                 * the slow path" -- confirm.
                 */
                return ERR_PTR(status);
        }
        return dentry;
}
1620
1621 /* Fast lookup failed, do it the slow way */
1622 static struct dentry *__lookup_slow(const struct qstr *name,
1623                                     struct dentry *dir,
1624                                     unsigned int flags)
1625 {
1626         struct dentry *dentry, *old;
1627         struct inode *inode = dir->d_inode;
1628         DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
1629
1630         /* Don't go there if it's already dead */
1631         if (unlikely(IS_DEADDIR(inode)))
1632                 return ERR_PTR(-ENOENT);
1633 again:
1634         dentry = d_alloc_parallel(dir, name, &wq);
1635         if (IS_ERR(dentry))
1636                 return dentry;
1637         if (unlikely(!d_in_lookup(dentry))) {
1638                 int error = d_revalidate(dentry, flags);
1639                 if (unlikely(error <= 0)) {
1640                         if (!error) {
1641                                 d_invalidate(dentry);
1642                                 dput(dentry);
1643                                 goto again;
1644                         }
1645                         dput(dentry);
1646                         dentry = ERR_PTR(error);
1647                 }
1648         } else {
1649                 old = inode->i_op->lookup(inode, dentry, flags);
1650                 d_lookup_done(dentry);
1651                 if (unlikely(old)) {
1652                         dput(dentry);
1653                         dentry = old;
1654                 }
1655         }
1656         return dentry;
1657 }
1658
1659 static struct dentry *lookup_slow(const struct qstr *name,
1660                                   struct dentry *dir,
1661                                   unsigned int flags)
1662 {
1663         struct inode *inode = dir->d_inode;
1664         struct dentry *res;
1665         inode_lock_shared(inode);
1666         res = __lookup_slow(name, dir, flags);
1667         inode_unlock_shared(inode);
1668         return res;
1669 }
1670
1671 static inline int may_lookup(struct user_namespace *mnt_userns,
1672                              struct nameidata *nd)
1673 {
1674         if (nd->flags & LOOKUP_RCU) {
1675                 int err = inode_permission(mnt_userns, nd->inode, MAY_EXEC|MAY_NOT_BLOCK);
1676                 if (err != -ECHILD || !try_to_unlazy(nd))
1677                         return err;
1678         }
1679         return inode_permission(mnt_userns, nd->inode, MAY_EXEC);
1680 }
1681
1682 static int reserve_stack(struct nameidata *nd, struct path *link, unsigned seq)
1683 {
1684         if (unlikely(nd->total_link_count++ >= MAXSYMLINKS))
1685                 return -ELOOP;
1686
1687         if (likely(nd->depth != EMBEDDED_LEVELS))
1688                 return 0;
1689         if (likely(nd->stack != nd->internal))
1690                 return 0;
1691         if (likely(nd_alloc_stack(nd)))
1692                 return 0;
1693
1694         if (nd->flags & LOOKUP_RCU) {
1695                 // we need to grab link before we do unlazy.  And we can't skip
1696                 // unlazy even if we fail to grab the link - cleanup needs it
1697                 bool grabbed_link = legitimize_path(nd, link, seq);
1698
1699                 if (!try_to_unlazy(nd) != 0 || !grabbed_link)
1700                         return -ECHILD;
1701
1702                 if (nd_alloc_stack(nd))
1703                         return 0;
1704         }
1705         return -ENOMEM;
1706 }
1707
/*
 * Flag bits passed to walk_component(), step_into() and pick_link():
 *
 *  WALK_TRAILING - used by lookup_last(); step_into() then follows a
 *                  symlink only when LOOKUP_FOLLOW is set, and pick_link()
 *                  consults may_follow_link() first.
 *  WALK_MORE     - passed by link_path_walk() for a component that is not
 *                  the last one; walk_component() then leaves the current
 *                  symlink stack entry alone instead of put_link()ing it.
 *  WALK_NOFOLLOW - step_into() must not follow a symlink.
 */
enum {WALK_TRAILING = 1, WALK_MORE = 2, WALK_NOFOLLOW = 4};
1709
/*
 * Push the symlink @link onto nd->stack and obtain its body.
 *
 * @inode and @seq describe @link as sampled by the caller; @flags are the
 * WALK_* flags of the current step.
 *
 * Returns the body of the link to be walked next (leading slashes already
 * skipped if the link was absolute and nd has been moved to the root),
 * NULL if there is nothing left to walk (the stack entry has then been
 * released again with put_link()), or an ERR_PTR() on failure.
 */
static const char *pick_link(struct nameidata *nd, struct path *link,
                     struct inode *inode, unsigned seq, int flags)
{
        struct saved *last;
        const char *res;
        int error = reserve_stack(nd, link, seq);

        if (unlikely(error)) {
                /* outside of RCU mode we hold references on link; drop them */
                if (!(nd->flags & LOOKUP_RCU))
                        path_put(link);
                return ERR_PTR(error);
        }
        /* claim the next stack slot and record the link in it */
        last = nd->stack + nd->depth++;
        last->link = *link;
        clear_delayed_call(&last->done);
        last->seq = seq;

        if (flags & WALK_TRAILING) {
                error = may_follow_link(nd, inode);
                if (unlikely(error))
                        return ERR_PTR(error);
        }

        /* symlink traversal forbidden by the lookup flags or by the mount */
        if (unlikely(nd->flags & LOOKUP_NO_SYMLINKS) ||
                        unlikely(link->mnt->mnt_flags & MNT_NOSYMFOLLOW))
                return ERR_PTR(-ELOOP);

        if (!(nd->flags & LOOKUP_RCU)) {
                touch_atime(&last->link);
                cond_resched();
        } else if (atime_needs_update(&last->link, inode)) {
                /* the atime update is only done after leaving RCU mode */
                if (!try_to_unlazy(nd))
                        return ERR_PTR(-ECHILD);
                touch_atime(&last->link);
        }

        error = security_inode_follow_link(link->dentry, inode,
                                           nd->flags & LOOKUP_RCU);
        if (unlikely(error))
                return ERR_PTR(error);

        /* use the body cached in the inode if there is one */
        res = READ_ONCE(inode->i_link);
        if (!res) {
                const char * (*get)(struct dentry *, struct inode *,
                                struct delayed_call *);
                get = inode->i_op->get_link;
                if (nd->flags & LOOKUP_RCU) {
                        /*
                         * In RCU mode ->get_link() is called with a NULL
                         * dentry; if it answers -ECHILD, retry with the
                         * dentry once RCU mode has been left.
                         */
                        res = get(NULL, inode, &last->done);
                        if (res == ERR_PTR(-ECHILD) && try_to_unlazy(nd))
                                res = get(link->dentry, inode, &last->done);
                } else {
                        res = get(link->dentry, inode, &last->done);
                }
                if (!res)
                        goto all_done;
                if (IS_ERR(res))
                        return res;
        }
        if (*res == '/') {
                /* absolute link: restart from the root, skip the slashes */
                error = nd_jump_root(nd);
                if (unlikely(error))
                        return ERR_PTR(error);
                while (unlikely(*++res == '/'))
                        ;
        }
        if (*res)
                return res;
all_done: // pure jump
        put_link(nd);
        return NULL;
}
1781
/*
 * Do we need to follow links? We _really_ want to be able
 * to do this check without having to look at inode->i_op,
 * so we keep a cache of "no, this doesn't need follow_link"
 * for the common case.
 *
 * step_into() moves nd onto @dentry (after handle_mounts() has dealt with
 * anything mounted there), unless the result is a symlink that has to be
 * followed, in which case it is handed to pick_link().
 *
 * Returns NULL when nd now points at the new location, a symlink body to
 * walk (from pick_link()), or an ERR_PTR() on failure.
 */
static const char *step_into(struct nameidata *nd, int flags,
                     struct dentry *dentry, struct inode *inode, unsigned seq)
{
        struct path path;
        int err = handle_mounts(nd, dentry, &path, &inode, &seq);

        if (err < 0)
                return ERR_PTR(err);
        if (likely(!d_is_symlink(path.dentry)) ||
           ((flags & WALK_TRAILING) && !(nd->flags & LOOKUP_FOLLOW)) ||
           (flags & WALK_NOFOLLOW)) {
                /* not a symlink or should not follow */
                if (!(nd->flags & LOOKUP_RCU)) {
                        /*
                         * Drop the references held on the old location; the
                         * mount reference is kept when the mount is unchanged.
                         */
                        dput(nd->path.dentry);
                        if (nd->path.mnt != path.mnt)
                                mntput(nd->path.mnt);
                }
                nd->path = path;
                nd->inode = inode;
                nd->seq = seq;
                return NULL;
        }
        if (nd->flags & LOOKUP_RCU) {
                /* make sure that d_is_symlink above matches inode */
                if (read_seqcount_retry(&path.dentry->d_seq, seq))
                        return ERR_PTR(-ECHILD);
        } else {
                /*
                 * nd keeps its current location, so when the link is on the
                 * same mount take a separate mount reference for it.
                 */
                if (path.mnt == nd->path.mnt)
                        mntget(path.mnt);
        }
        return pick_link(nd, &path, inode, seq, flags);
}
1820
/*
 * RCU-mode handling of "..": find the parent of nd->path.
 *
 * If nd->path is the root of its mount, nd is first moved to the
 * mountpoint chosen by choose_mountpoint_rcu().
 *
 * Returns the parent dentry, with its inode stored in *inodep and a freshly
 * sampled d_seq in *seqp; NULL when nd is already at nd->root (or no
 * mountpoint could be chosen), in which case *inodep and *seqp are not
 * written; or ERR_PTR(-ECHILD) when something could not be validated or a
 * LOOKUP_NO_XDEV / LOOKUP_BENEATH restriction would be hit.
 * NOTE(review): presumably -ECHILD is used for the restrictions so that the
 * non-RCU retry reports the real error -- confirm.
 */
static struct dentry *follow_dotdot_rcu(struct nameidata *nd,
                                        struct inode **inodep,
                                        unsigned *seqp)
{
        struct dentry *parent, *old;

        if (path_equal(&nd->path, &nd->root))
                goto in_root;
        if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) {
                /* at the root of a mount: step over to its mountpoint */
                struct path path;
                unsigned seq;
                if (!choose_mountpoint_rcu(real_mount(nd->path.mnt),
                                           &nd->root, &path, &seq))
                        goto in_root;
                if (unlikely(nd->flags & LOOKUP_NO_XDEV))
                        return ERR_PTR(-ECHILD);
                nd->path = path;
                nd->inode = path.dentry->d_inode;
                nd->seq = seq;
                if (unlikely(read_seqretry(&mount_lock, nd->m_seq)))
                        return ERR_PTR(-ECHILD);
                /* we know that mountpoint was pinned */
        }
        old = nd->path.dentry;
        parent = old->d_parent;
        *inodep = parent->d_inode;
        *seqp = read_seqcount_begin(&parent->d_seq);
        /* the child must not have changed while its parent was sampled */
        if (unlikely(read_seqcount_retry(&old->d_seq, nd->seq)))
                return ERR_PTR(-ECHILD);
        if (unlikely(!path_connected(nd->path.mnt, parent)))
                return ERR_PTR(-ECHILD);
        return parent;
in_root:
        if (unlikely(read_seqretry(&mount_lock, nd->m_seq)))
                return ERR_PTR(-ECHILD);
        if (unlikely(nd->flags & LOOKUP_BENEATH))
                return ERR_PTR(-ECHILD);
        return NULL;
}
1860
/*
 * Non-RCU handling of "..": find the parent of nd->path.
 *
 * If nd->path is the root of its mount, nd is first moved to the
 * mountpoint chosen by choose_mountpoint() (dropping the references on the
 * old path).
 *
 * Returns the parent dentry obtained with dget_parent(), with *inodep set
 * to its inode and *seqp set to 0; NULL when nd is already at nd->root (or
 * no mountpoint could be chosen), after taking an extra reference on
 * nd->path.dentry; or an ERR_PTR(): -EXDEV for a LOOKUP_NO_XDEV or
 * LOOKUP_BENEATH violation, -ENOENT if the parent is not connected.
 */
static struct dentry *follow_dotdot(struct nameidata *nd,
                                 struct inode **inodep,
                                 unsigned *seqp)
{
        struct dentry *parent;

        if (path_equal(&nd->path, &nd->root))
                goto in_root;
        if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) {
                struct path path;

                if (!choose_mountpoint(real_mount(nd->path.mnt),
                                       &nd->root, &path))
                        goto in_root;
                path_put(&nd->path);
                nd->path = path;
                nd->inode = path.dentry->d_inode;
                if (unlikely(nd->flags & LOOKUP_NO_XDEV))
                        return ERR_PTR(-EXDEV);
        }
        /* rare case of legitimate dget_parent()... */
        parent = dget_parent(nd->path.dentry);
        if (unlikely(!path_connected(nd->path.mnt, parent))) {
                dput(parent);
                return ERR_PTR(-ENOENT);
        }
        *seqp = 0;
        *inodep = parent->d_inode;
        return parent;

in_root:
        if (unlikely(nd->flags & LOOKUP_BENEATH))
                return ERR_PTR(-EXDEV);
        /*
         * The caller steps into nd->path.dentry itself; step_into() drops a
         * reference on the old nd->path.dentry in non-RCU mode, so take one
         * here to keep the count balanced.
         */
        dget(nd->path.dentry);
        return NULL;
}
1897
1898 static const char *handle_dots(struct nameidata *nd, int type)
1899 {
1900         if (type == LAST_DOTDOT) {
1901                 const char *error = NULL;
1902                 struct dentry *parent;
1903                 struct inode *inode;
1904                 unsigned seq;
1905
1906                 if (!nd->root.mnt) {
1907                         error = ERR_PTR(set_root(nd));
1908                         if (error)
1909                                 return error;
1910                 }
1911                 if (nd->flags & LOOKUP_RCU)
1912                         parent = follow_dotdot_rcu(nd, &inode, &seq);
1913                 else
1914                         parent = follow_dotdot(nd, &inode, &seq);
1915                 if (IS_ERR(parent))
1916                         return ERR_CAST(parent);
1917                 if (unlikely(!parent))
1918                         error = step_into(nd, WALK_NOFOLLOW,
1919                                          nd->path.dentry, nd->inode, nd->seq);
1920                 else
1921                         error = step_into(nd, WALK_NOFOLLOW,
1922                                          parent, inode, seq);
1923                 if (unlikely(error))
1924                         return error;
1925
1926                 if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) {
1927                         /*
1928                          * If there was a racing rename or mount along our
1929                          * path, then we can't be sure that ".." hasn't jumped
1930                          * above nd->root (and so userspace should retry or use
1931                          * some fallback).
1932                          */
1933                         smp_rmb();
1934                         if (unlikely(__read_seqcount_retry(&mount_lock.seqcount, nd->m_seq)))
1935                                 return ERR_PTR(-EAGAIN);
1936                         if (unlikely(__read_seqcount_retry(&rename_lock.seqcount, nd->r_seq)))
1937                                 return ERR_PTR(-EAGAIN);
1938                 }
1939         }
1940         return NULL;
1941 }
1942
/*
 * Advance the walk by the single component described by nd->last and
 * nd->last_type.
 *
 * Returns whatever handle_dots() or step_into() return (NULL once nd has
 * been moved, a symlink body to walk next, or an ERR_PTR()), or the error
 * from the dentry lookup.  Unless WALK_MORE is set in @flags, the current
 * symlink stack entry (if nd->depth is non-zero) is released with
 * put_link() before stepping.
 */
static const char *walk_component(struct nameidata *nd, int flags)
{
        struct dentry *dentry;
        struct inode *inode;
        unsigned seq;
        /*
         * "." and ".." are special - ".." especially so because it has
         * to be able to know about the current root directory and
         * parent relationships.
         */
        if (unlikely(nd->last_type != LAST_NORM)) {
                if (!(flags & WALK_MORE) && nd->depth)
                        put_link(nd);
                return handle_dots(nd, nd->last_type);
        }
        dentry = lookup_fast(nd, &inode, &seq);
        if (IS_ERR(dentry))
                return ERR_CAST(dentry);
        if (unlikely(!dentry)) {
                /* dcache miss (or stale entry): ask the filesystem */
                dentry = lookup_slow(&nd->last, nd->path.dentry, nd->flags);
                if (IS_ERR(dentry))
                        return ERR_CAST(dentry);
        }
        if (!(flags & WALK_MORE) && nd->depth)
                put_link(nd);
        /*
         * NOTE(review): inode and seq are only set by lookup_fast() in RCU
         * mode; on the other paths they are passed on uninitialised.
         * Presumably handle_mounts() (called by step_into()) recomputes them
         * outside RCU mode -- confirm.
         */
        return step_into(nd, flags, dentry, inode, seq);
}
1970
1971 /*
1972  * We can do the critical dentry name comparison and hashing
1973  * operations one word at a time, but we are limited to:
1974  *
1975  * - Architectures with fast unaligned word accesses. We could
1976  *   do a "get_unaligned()" if this helps and is sufficiently
1977  *   fast.
1978  *
1979  * - non-CONFIG_DEBUG_PAGEALLOC configurations (so that we
1980  *   do not trap on the (extremely unlikely) case of a page
 *   crossing operation).
1982  *
1983  * - Furthermore, we need an efficient 64-bit compile for the
1984  *   64-bit case in order to generate the "number of bytes in
1985  *   the final mask". Again, that could be replaced with a
1986  *   efficient population count instruction or similar.
1987  */
1988 #ifdef CONFIG_DCACHE_WORD_ACCESS
1989
1990 #include <asm/word-at-a-time.h>
1991
1992 #ifdef HASH_MIX
1993
1994 /* Architecture provides HASH_MIX and fold_hash() in <asm/hash.h> */
1995
1996 #elif defined(CONFIG_64BIT)
/*
 * Register pressure in the mixing function is an issue, particularly
 * on 32-bit x86, but almost any function requires one state value and
 * one temporary.  Instead, use a function designed for two state values
 * and no temporaries.
 *
 * This function cannot create a collision in only two iterations, so
 * we have two iterations to achieve avalanche.  In those two iterations,
 * we have six layers of mixing, which is enough to spread one bit's
 * influence out to 2^6 = 64 state bits.
 *
 * Rotate constants are scored by considering either 64 one-bit input
 * deltas or 64*63/2 = 2016 two-bit input deltas, and finding the
 * probability of that delta causing a change to each of the 128 output
 * bits, using a sample of random initial states.
 *
 * The Shannon entropy of the computed probabilities is then summed
 * to produce a score.  Ideally, any input change has a 50% chance of
 * toggling any given output bit.
 *
 * Mixing scores (in bits) for (12,45):
 * Input delta: 1-bit      2-bit
 * 1 round:     713.3    42542.6
 * 2 rounds:   2753.7   140389.8
 * 3 rounds:   5954.1   233458.2
 * 4 rounds:   7862.6   256672.2
 * Perfect:    8192     258048
 *            (64*128) (64*63/2 * 128)
 *
 * HASH_MIX(x, y, a) folds the input word 'a' into the two state words
 * 'x' and 'y'.  Both are updated in place, so they must be lvalues and
 * each is named several times in the expansion; 'a' is evaluated once.
 * The whole macro is a single comma expression.
 */
#define HASH_MIX(x, y, a)       \
        (       x ^= (a),       \
        y ^= x, x = rol64(x,12),\
        x += y, y = rol64(y,45),\
        y *= 9                  )
2031
2032 /*
2033  * Fold two longs into one 32-bit hash value.  This must be fast, but
2034  * latency isn't quite as critical, as there is a fair bit of additional
2035  * work done before the hash value is used.
2036  */
2037 static inline unsigned int fold_hash(unsigned long x, unsigned long y)
2038 {
2039         y ^= x * GOLDEN_RATIO_64;
2040         y *= GOLDEN_RATIO_64;
2041         return y >> 32;
2042 }
2043
2044 #else   /* 32-bit case */
2045
/*
 * Mixing scores (in bits) for (7,20):
 * Input delta: 1-bit      2-bit
 * 1 round:     330.3     9201.6
 * 2 rounds:   1246.4    25475.4
 * 3 rounds:   1907.1    31295.1
 * 4 rounds:   2042.3    31718.6
 * Perfect:    2048      31744
 *            (32*64)   (32*31/2 * 64)
 *
 * 32-bit HASH_MIX(x, y, a): folds the input word 'a' into the two state
 * words 'x' and 'y', which are updated in place (they must be lvalues and
 * are named several times in the expansion); 'a' is evaluated once.
 */
#define HASH_MIX(x, y, a)       \
        (       x ^= (a),       \
        y ^= x, x = rol32(x, 7),\
        x += y, y = rol32(y,20),\
        y *= 9                  )
2061
/* Collapse the two state words into one 32-bit hash value (32-bit case). */
static inline unsigned int fold_hash(unsigned long x, unsigned long y)
{
        /* Use arch-optimized multiply if one exists */
        unsigned int hashed_x = __hash_32(x);

        return __hash_32(y ^ hashed_x);
}
2067
2068 #endif
2069
/*
 * Return the hash of a string of known length.  This is carefully
 * designed to match hash_name(), which is the more critical function.
 * In particular, a trailing partial word (fewer than sizeof(long) payload
 * bytes) is not run through HASH_MIX() but masked and xor-ed into the
 * state, matching the way hash_name() treats the word in which it finds
 * the delimiter after the name.
 */
unsigned int full_name_hash(const void *salt, const char *name, unsigned int len)
{
        unsigned long a, x = 0, y = (unsigned long)salt;

        while (len) {
                a = load_unaligned_zeropad(name);
                if (len < sizeof(unsigned long)) {
                        /* final partial word: keep only the payload bytes */
                        x ^= a & bytemask_from_count(len);
                        break;
                }
                HASH_MIX(x, y, a);
                name += sizeof(unsigned long);
                len -= sizeof(unsigned long);
        }
        return fold_hash(x, y);
}
EXPORT_SYMBOL(full_name_hash);
2096
/*
 * Return the "hash_len" (hash and length) of a null-terminated string.
 *
 * The string is consumed one unsigned long at a time.  Every word that
 * contains no zero byte is mixed into the state with HASH_MIX(); the word
 * in which the terminator is found is masked down to the bytes before the
 * terminator and xor-ed in.  @salt seeds the state.
 */
u64 hashlen_string(const void *salt, const char *name)
{
        unsigned long a = 0, x = 0, y = (unsigned long)salt;
        unsigned long adata, mask, len;
        const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;

        len = 0;
        /* enter the loop at the load so the first word is not mixed twice */
        goto inside;

        do {
                HASH_MIX(x, y, a);
                len += sizeof(unsigned long);
inside:
                a = load_unaligned_zeropad(name+len);
        } while (!has_zero(a, &adata, &constants));

        /* 'a' holds the terminator: keep only the bytes in front of it */
        adata = prep_zero_mask(a, adata, &constants);
        mask = create_zero_mask(adata);
        x ^= a & zero_bytemask(mask);

        /* len counts the full words; find_zero() adds the bytes of the last one */
        return hashlen_create(fold_hash(x, y), len + find_zero(mask));
}
EXPORT_SYMBOL(hashlen_string);
2121
/*
 * Calculate the length and hash of the path component, and
 * return the "hash_len" as the result.
 *
 * Works like the word-at-a-time hashlen_string(), except that the scan
 * stops at the first word containing either a NUL or a '/' byte.
 */
static inline u64 hash_name(const void *salt, const char *name)
{
        unsigned long a = 0, b, x = 0, y = (unsigned long)salt;
        unsigned long adata, bdata, mask, len;
        const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;

        len = 0;
        /* enter the loop at the load so the first word is not mixed twice */
        goto inside;

        do {
                HASH_MIX(x, y, a);
                len += sizeof(unsigned long);
inside:
                a = load_unaligned_zeropad(name+len);
                /*
                 * presumably REPEAT_BYTE('/') is a word of '/' bytes, so
                 * that b has a zero byte wherever a has a '/' -- confirm
                 */
                b = a ^ REPEAT_BYTE('/');
        } while (!(has_zero(a, &adata, &constants) | has_zero(b, &bdata, &constants)));

        /* combine both delimiter masks and keep only the bytes before the first */
        adata = prep_zero_mask(a, adata, &constants);
        bdata = prep_zero_mask(b, bdata, &constants);
        mask = create_zero_mask(adata | bdata);
        x ^= a & zero_bytemask(mask);

        return hashlen_create(fold_hash(x, y), len + find_zero(mask));
}
2150
2151 #else   /* !CONFIG_DCACHE_WORD_ACCESS: Slow, byte-at-a-time version */
2152
/*
 * Return the hash of a string of known length (byte-at-a-time version).
 * @salt seeds the hash; exactly @len bytes of @name are consumed.
 */
unsigned int full_name_hash(const void *salt, const char *name, unsigned int len)
{
        unsigned long hash = init_name_hash(salt);
        unsigned int i;

        for (i = 0; i < len; i++)
                hash = partial_name_hash((unsigned char)name[i], hash);

        return end_name_hash(hash);
}
EXPORT_SYMBOL(full_name_hash);
2162
2163 /* Return the "hash_len" (hash and length) of a null-terminated string */
2164 u64 hashlen_string(const void *salt, const char *name)
2165 {
2166         unsigned long hash = init_name_hash(salt);
2167         unsigned long len = 0, c;
2168
2169         c = (unsigned char)*name;
2170         while (c) {
2171                 len++;
2172                 hash = partial_name_hash(c, hash);
2173                 c = (unsigned char)name[len];
2174         }
2175         return hashlen_create(end_name_hash(hash), len);
2176 }
2177 EXPORT_SYMBOL(hashlen_string);
2178
2179 /*
2180  * We know there's a real path component here of at least
2181  * one character.
2182  */
2183 static inline u64 hash_name(const void *salt, const char *name)
2184 {
2185         unsigned long hash = init_name_hash(salt);
2186         unsigned long len = 0, c;
2187
2188         c = (unsigned char)*name;
2189         do {
2190                 len++;
2191                 hash = partial_name_hash(c, hash);
2192                 c = (unsigned char)name[len];
2193         } while (c && c != '/');
2194         return hashlen_create(end_name_hash(hash), len);
2195 }
2196
2197 #endif
2198
/*
 * Name resolution.
 * This is the basic name resolution function, turning a pathname into
 * the final dentry. We expect 'base' to be positive and a directory.
 *
 * Every component except the last one is walked here; the last component
 * is left described by nd->last / nd->last_type for the caller.  Symlinks
 * met on the way are walked in place: the rest of the interrupted name is
 * parked in nd->stack[].name while the link body is being walked.
 *
 * @name may be an ERR_PTR(), in which case its error is returned.
 *
 * Returns 0 and nd will have valid dentry and mnt on success.
 * Returns error and drops reference to input namei data on failure.
 */
static int link_path_walk(const char *name, struct nameidata *nd)
{
        int depth = 0; // depth <= nd->depth
        int err;

        nd->last_type = LAST_ROOT;
        nd->flags |= LOOKUP_PARENT;
        if (IS_ERR(name))
                return PTR_ERR(name);
        while (*name=='/')
                name++;
        if (!*name) {
                /* nothing but slashes (or empty): last_type stays LAST_ROOT */
                nd->dir_mode = 0; // short-circuit the 'hardening' idiocy
                return 0;
        }

        /* At this point we know we have a real path component. */
        for(;;) {
                struct user_namespace *mnt_userns;
                const char *link;
                u64 hash_len;
                int type;

                /* we must be allowed to search the directory we are in */
                mnt_userns = mnt_user_ns(nd->path.mnt);
                err = may_lookup(mnt_userns, nd);
                if (err)
                        return err;

                hash_len = hash_name(nd->path.dentry, name);

                /* classify the component: ".", ".." or a normal name */
                type = LAST_NORM;
                if (name[0] == '.') switch (hashlen_len(hash_len)) {
                        case 2:
                                if (name[1] == '.') {
                                        type = LAST_DOTDOT;
                                        nd->state |= ND_JUMPED;
                                }
                                break;
                        case 1:
                                type = LAST_DOT;
                }
                if (likely(type == LAST_NORM)) {
                        struct dentry *parent = nd->path.dentry;
                        nd->state &= ~ND_JUMPED;
                        if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
                                /* the filesystem supplies its own hash for names */
                                struct qstr this = { { .hash_len = hash_len }, .name = name };
                                err = parent->d_op->d_hash(parent, &this);
                                if (err < 0)
                                        return err;
                                hash_len = this.hash_len;
                                name = this.name;
                        }
                }

                nd->last.hash_len = hash_len;
                nd->last.name = name;
                nd->last_type = type;

                name += hashlen_len(hash_len);
                if (!*name)
                        goto OK;
                /*
                 * If it wasn't NUL, we know it was '/'. Skip that
                 * slash, and continue until no more slashes.
                 */
                do {
                        name++;
                } while (unlikely(*name == '/'));
                if (unlikely(!*name)) {
OK:
                        /* pathname or trailing symlink, done */
                        if (!depth) {
                                nd->dir_uid = i_uid_into_mnt(mnt_userns, nd->inode);
                                nd->dir_mode = nd->inode->i_mode;
                                nd->flags &= ~LOOKUP_PARENT;
                                return 0;
                        }
                        /* last component of nested symlink */
                        /* resume the name that was interrupted by this link */
                        name = nd->stack[--depth].name;
                        link = walk_component(nd, 0);
                } else {
                        /* not the last component */
                        link = walk_component(nd, WALK_MORE);
                }
                if (unlikely(link)) {
                        if (IS_ERR(link))
                                return PTR_ERR(link);
                        /* a symlink to follow */
                        /* park the rest of the current name, walk the body */
                        nd->stack[depth++].name = name;
                        name = link;
                        continue;
                }
                /* more components follow, so we must be in a directory now */
                if (unlikely(!d_can_lookup(nd->path.dentry))) {
                        if (nd->flags & LOOKUP_RCU) {
                                if (!try_to_unlazy(nd))
                                        return -ECHILD;
                        }
                        return -ENOTDIR;
                }
        }
}
2308
/*
 * Prepare nd for walking nd->name with the given lookup flags and choose
 * the starting point: a preset root (ND_ROOT_PRESET), the root for an
 * absolute pathname, the current working directory for AT_FDCWD, or the
 * file that nd->dfd refers to.
 *
 * Returns the pathname string to be walked, or an ERR_PTR() on failure.
 * Several of the error returns happen after the RCU read lock has been
 * taken.
 *
 * must be paired with terminate_walk()
 */
static const char *path_init(struct nameidata *nd, unsigned flags)
{
        int error;
        const char *s = nd->name->name;

        /* LOOKUP_CACHED requires RCU, ask caller to retry */
        if ((flags & (LOOKUP_RCU | LOOKUP_CACHED)) == LOOKUP_CACHED)
                return ERR_PTR(-EAGAIN);

        /* an empty pathname is never walked in RCU mode */
        if (!*s)
                flags &= ~LOOKUP_RCU;
        if (flags & LOOKUP_RCU)
                rcu_read_lock();

        nd->flags = flags;
        nd->state |= ND_JUMPED;

        /*
         * Sample the mount and rename sequence counts up front; they are
         * compared again later (e.g. when handling ".." in scoped lookups).
         */
        nd->m_seq = __read_seqcount_begin(&mount_lock.seqcount);
        nd->r_seq = __read_seqcount_begin(&rename_lock.seqcount);
        smp_rmb();

        if (nd->state & ND_ROOT_PRESET) {
                /* the caller supplied nd->root: start from there */
                struct dentry *root = nd->root.dentry;
                struct inode *inode = root->d_inode;
                if (*s && unlikely(!d_can_lookup(root)))
                        return ERR_PTR(-ENOTDIR);
                nd->path = nd->root;
                nd->inode = inode;
                if (flags & LOOKUP_RCU) {
                        nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
                        nd->root_seq = nd->seq;
                } else {
                        path_get(&nd->path);
                }
                return s;
        }

        /* no root chosen yet */
        nd->root.mnt = NULL;

        /* Absolute pathname -- fetch the root (LOOKUP_IN_ROOT uses nd->dfd). */
        if (*s == '/' && !(flags & LOOKUP_IN_ROOT)) {
                error = nd_jump_root(nd);
                if (unlikely(error))
                        return ERR_PTR(error);
                return s;
        }

        /* Relative pathname -- get the starting-point it is relative to. */
        if (nd->dfd == AT_FDCWD) {
                if (flags & LOOKUP_RCU) {
                        struct fs_struct *fs = current->fs;
                        unsigned seq;

                        /*
                         * No references are taken in RCU mode: take a
                         * consistent snapshot of fs->pwd under fs->seq.
                         */
                        do {
                                seq = read_seqcount_begin(&fs->seq);
                                nd->path = fs->pwd;
                                nd->inode = nd->path.dentry->d_inode;
                                nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
                        } while (read_seqcount_retry(&fs->seq, seq));
                } else {
                        get_fs_pwd(current->fs, &nd->path);
                        nd->inode = nd->path.dentry->d_inode;
                }
        } else {
                /* Caller must check execute permissions on the starting path component */
                struct fd f = fdget_raw(nd->dfd);
                struct dentry *dentry;

                if (!f.file)
                        return ERR_PTR(-EBADF);

                dentry = f.file->f_path.dentry;

                /* a non-empty name can only be looked up relative to a directory */
                if (*s && unlikely(!d_can_lookup(dentry))) {
                        fdput(f);
                        return ERR_PTR(-ENOTDIR);
                }

                nd->path = f.file->f_path;
                if (flags & LOOKUP_RCU) {
                        nd->inode = nd->path.dentry->d_inode;
                        nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
                } else {
                        path_get(&nd->path);
                        nd->inode = nd->path.dentry->d_inode;
                }
                fdput(f);
        }

        /* For scoped-lookups we need to set the root to the dirfd as well. */
        if (flags & LOOKUP_IS_SCOPED) {
                nd->root = nd->path;
                if (flags & LOOKUP_RCU) {
                        nd->root_seq = nd->seq;
                } else {
                        /* remember that we hold a reference on nd->root */
                        path_get(&nd->root);
                        nd->state |= ND_ROOT_GRABBED;
                }
        }
        return s;
}
2411
2412 static inline const char *lookup_last(struct nameidata *nd)
2413 {
2414         if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len])
2415                 nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
2416
2417         return walk_component(nd, WALK_TRAILING);
2418 }
2419
2420 static int handle_lookup_down(struct nameidata *nd)
2421 {
2422         if (!(nd->flags & LOOKUP_RCU))
2423                 dget(nd->path.dentry);
2424         return PTR_ERR(step_into(nd, WALK_NOFOLLOW,
2425                         nd->path.dentry, nd->inode, nd->seq));
2426 }
2427
2428 /* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
2429 static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path)
2430 {
2431         const char *s = path_init(nd, flags);
2432         int err;
2433
2434         if (unlikely(flags & LOOKUP_DOWN) && !IS_ERR(s)) {
2435                 err = handle_lookup_down(nd);
2436                 if (unlikely(err < 0))
2437                         s = ERR_PTR(err);
2438         }
2439
2440         while (!(err = link_path_walk(s, nd)) &&
2441                (s = lookup_last(nd)) != NULL)
2442                 ;
2443         if (!err && unlikely(nd->flags & LOOKUP_MOUNTPOINT)) {
2444                 err = handle_lookup_down(nd);
2445                 nd->state &= ~ND_JUMPED; // no d_weak_revalidate(), please...
2446         }
2447         if (!err)
2448                 err = complete_walk(nd);
2449
2450         if (!err && nd->flags & LOOKUP_DIRECTORY)
2451                 if (!d_can_lookup(nd->path.dentry))
2452                         err = -ENOTDIR;
2453         if (!err) {
2454                 *path = nd->path;
2455                 nd->path.mnt = NULL;
2456                 nd->path.dentry = NULL;
2457         }
2458         terminate_walk(nd);
2459         return err;
2460 }
2461
2462 int filename_lookup(int dfd, struct filename *name, unsigned flags,
2463                     struct path *path, struct path *root)
2464 {
2465         int retval;
2466         struct nameidata nd;
2467         if (IS_ERR(name))
2468                 return PTR_ERR(name);
2469         set_nameidata(&nd, dfd, name, root);
2470         retval = path_lookupat(&nd, flags | LOOKUP_RCU, path);
2471         if (unlikely(retval == -ECHILD))
2472                 retval = path_lookupat(&nd, flags, path);
2473         if (unlikely(retval == -ESTALE))
2474                 retval = path_lookupat(&nd, flags | LOOKUP_REVAL, path);
2475
2476         if (likely(!retval))
2477                 audit_inode(name, path->dentry,
2478                             flags & LOOKUP_MOUNTPOINT ? AUDIT_INODE_NOEVAL : 0);
2479         restore_nameidata();
2480         putname(name);
2481         return retval;
2482 }
2483
2484 /* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
2485 static int path_parentat(struct nameidata *nd, unsigned flags,
2486                                 struct path *parent)
2487 {
2488         const char *s = path_init(nd, flags);
2489         int err = link_path_walk(s, nd);
2490         if (!err)
2491                 err = complete_walk(nd);
2492         if (!err) {
2493                 *parent = nd->path;
2494                 nd->path.mnt = NULL;
2495                 nd->path.dentry = NULL;
2496         }
2497         terminate_walk(nd);
2498         return err;
2499 }
2500
2501 static int __filename_parentat(int dfd, struct filename *name,
2502                                 unsigned int flags, struct path *parent,
2503                                 struct qstr *last, int *type)
2504 {
2505         int retval;
2506         struct nameidata nd;
2507
2508         if (IS_ERR(name))
2509                 return PTR_ERR(name);
2510         set_nameidata(&nd, dfd, name, NULL);
2511         retval = path_parentat(&nd, flags | LOOKUP_RCU, parent);
2512         if (unlikely(retval == -ECHILD))
2513                 retval = path_parentat(&nd, flags, parent);
2514         if (unlikely(retval == -ESTALE))
2515                 retval = path_parentat(&nd, flags | LOOKUP_REVAL, parent);
2516         if (likely(!retval)) {
2517                 *last = nd.last;
2518                 *type = nd.last_type;
2519                 audit_inode(name, parent->dentry, AUDIT_INODE_PARENT);
2520         }
2521         restore_nameidata();
2522         return retval;
2523 }
2524
/*
 * Same as __filename_parentat(), but always hands @name to putname()
 * afterwards, whatever the outcome.
 */
static int filename_parentat(int dfd, struct filename *name,
				unsigned int flags, struct path *parent,
				struct qstr *last, int *type)
{
	int err;

	err = __filename_parentat(dfd, name, flags, parent, last, type);
	putname(name);

	return err;
}
2534
2535 /* does lookup, returns the object with parent locked */
2536 struct dentry *kern_path_locked(const char *name, struct path *path)
2537 {
2538         struct dentry *d;
2539         struct qstr last;
2540         int type, error;
2541
2542         error = filename_parentat(AT_FDCWD, getname_kernel(name), 0, path,
2543                                     &last, &type);
2544         if (error)
2545                 return ERR_PTR(error);
2546         if (unlikely(type != LAST_NORM)) {
2547                 path_put(path);
2548                 return ERR_PTR(-EINVAL);
2549         }
2550         inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT);
2551         d = __lookup_hash(&last, path->dentry, 0);
2552         if (IS_ERR(d)) {
2553                 inode_unlock(path->dentry->d_inode);
2554                 path_put(path);
2555         }
2556         return d;
2557 }
2558
2559 int kern_path(const char *name, unsigned int flags, struct path *path)
2560 {
2561         return filename_lookup(AT_FDCWD, getname_kernel(name),
2562                                flags, path, NULL);
2563 }
2564 EXPORT_SYMBOL(kern_path);
2565
2566 /**
2567  * vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair
2568  * @dentry:  pointer to dentry of the base directory
2569  * @mnt: pointer to vfs mount of the base directory
2570  * @name: pointer to file name
2571  * @flags: lookup flags
2572  * @path: pointer to struct path to fill
2573  */
2574 int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
2575                     const char *name, unsigned int flags,
2576                     struct path *path)
2577 {
2578         struct path root = {.mnt = mnt, .dentry = dentry};
2579         /* the first argument of filename_lookup() is ignored with root */
2580         return filename_lookup(AT_FDCWD, getname_kernel(name),
2581                                flags , path, &root);
2582 }
2583 EXPORT_SYMBOL(vfs_path_lookup);
2584
2585 static int lookup_one_len_common(const char *name, struct dentry *base,
2586                                  int len, struct qstr *this)
2587 {
2588         this->name = name;
2589         this->len = len;
2590         this->hash = full_name_hash(base, name, len);
2591         if (!len)
2592                 return -EACCES;
2593
2594         if (unlikely(name[0] == '.')) {
2595                 if (len < 2 || (len == 2 && name[1] == '.'))
2596                         return -EACCES;
2597         }
2598
2599         while (len--) {
2600                 unsigned int c = *(const unsigned char *)name++;
2601                 if (c == '/' || c == '\0')
2602                         return -EACCES;
2603         }
2604         /*
2605          * See if the low-level filesystem might want
2606          * to use its own hash..
2607          */
2608         if (base->d_flags & DCACHE_OP_HASH) {
2609                 int err = base->d_op->d_hash(base, this);
2610                 if (err < 0)
2611                         return err;
2612         }
2613
2614         return inode_permission(&init_user_ns, base->d_inode, MAY_EXEC);
2615 }
2616
2617 /**
2618  * try_lookup_one_len - filesystem helper to lookup single pathname component
2619  * @name:       pathname component to lookup
2620  * @base:       base directory to lookup from
2621  * @len:        maximum length @len should be interpreted to
2622  *
2623  * Look up a dentry by name in the dcache, returning NULL if it does not
2624  * currently exist.  The function does not try to create a dentry.
2625  *
2626  * Note that this routine is purely a helper for filesystem usage and should
2627  * not be called by generic code.
2628  *
2629  * The caller must hold base->i_mutex.
2630  */
2631 struct dentry *try_lookup_one_len(const char *name, struct dentry *base, int len)
2632 {
2633         struct qstr this;
2634         int err;
2635
2636         WARN_ON_ONCE(!inode_is_locked(base->d_inode));
2637
2638         err = lookup_one_len_common(name, base, len, &this);
2639         if (err)
2640                 return ERR_PTR(err);
2641
2642         return lookup_dcache(&this, base, 0);
2643 }
2644 EXPORT_SYMBOL(try_lookup_one_len);
2645
2646 /**
2647  * lookup_one_len - filesystem helper to lookup single pathname component
2648  * @name:       pathname component to lookup
2649  * @base:       base directory to lookup from
2650  * @len:        maximum length @len should be interpreted to
2651  *
2652  * Note that this routine is purely a helper for filesystem usage and should
2653  * not be called by generic code.
2654  *
2655  * The caller must hold base->i_mutex.
2656  */
2657 struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
2658 {
2659         struct dentry *dentry;
2660         struct qstr this;
2661         int err;
2662
2663         WARN_ON_ONCE(!inode_is_locked(base->d_inode));
2664
2665         err = lookup_one_len_common(name, base, len, &this);
2666         if (err)
2667                 return ERR_PTR(err);
2668
2669         dentry = lookup_dcache(&this, base, 0);
2670         return dentry ? dentry : __lookup_slow(&this, base, 0);
2671 }
2672 EXPORT_SYMBOL(lookup_one_len);
2673
2674 /**
2675  * lookup_one_len_unlocked - filesystem helper to lookup single pathname component
2676  * @name:       pathname component to lookup
2677  * @base:       base directory to lookup from
2678  * @len:        maximum length @len should be interpreted to
2679  *
2680  * Note that this routine is purely a helper for filesystem usage and should
2681  * not be called by generic code.
2682  *
2683  * Unlike lookup_one_len, it should be called without the parent
2684  * i_mutex held, and will take the i_mutex itself if necessary.
2685  */
2686 struct dentry *lookup_one_len_unlocked(const char *name,
2687                                        struct dentry *base, int len)
2688 {
2689         struct qstr this;
2690         int err;
2691         struct dentry *ret;
2692
2693         err = lookup_one_len_common(name, base, len, &this);
2694         if (err)
2695                 return ERR_PTR(err);
2696
2697         ret = lookup_dcache(&this, base, 0);
2698         if (!ret)
2699                 ret = lookup_slow(&this, base, 0);
2700         return ret;
2701 }
2702 EXPORT_SYMBOL(lookup_one_len_unlocked);
2703
2704 /*
2705  * Like lookup_one_len_unlocked(), except that it yields ERR_PTR(-ENOENT)
2706  * on negatives.  Returns known positive or ERR_PTR(); that's what
2707  * most of the users want.  Note that pinned negative with unlocked parent
2708  * _can_ become positive at any time, so callers of lookup_one_len_unlocked()
2709  * need to be very careful; pinned positives have ->d_inode stable, so
2710  * this one avoids such problems.
2711  */
2712 struct dentry *lookup_positive_unlocked(const char *name,
2713                                        struct dentry *base, int len)
2714 {
2715         struct dentry *ret = lookup_one_len_unlocked(name, base, len);
2716         if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) {
2717                 dput(ret);
2718                 ret = ERR_PTR(-ENOENT);
2719         }
2720         return ret;
2721 }
2722 EXPORT_SYMBOL(lookup_positive_unlocked);
2723
2724 #ifdef CONFIG_UNIX98_PTYS
2725 int path_pts(struct path *path)
2726 {
2727         /* Find something mounted on "pts" in the same directory as
2728          * the input path.
2729          */
2730         struct dentry *parent = dget_parent(path->dentry);
2731         struct dentry *child;
2732         struct qstr this = QSTR_INIT("pts", 3);
2733
2734         if (unlikely(!path_connected(path->mnt, parent))) {
2735                 dput(parent);
2736                 return -ENOENT;
2737         }
2738         dput(path->dentry);
2739         path->dentry = parent;
2740         child = d_hash_and_lookup(parent, &this);
2741         if (!child)
2742                 return -ENOENT;
2743
2744         path->dentry = child;
2745         dput(parent);
2746         follow_down(path);
2747         return 0;
2748 }
2749 #endif
2750
2751 int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
2752                  struct path *path, int *empty)
2753 {
2754         return filename_lookup(dfd, getname_flags(name, flags, empty),
2755                                flags, path, NULL);
2756 }
2757 EXPORT_SYMBOL(user_path_at_empty);
2758
2759 int __check_sticky(struct user_namespace *mnt_userns, struct inode *dir,
2760                    struct inode *inode)
2761 {
2762         kuid_t fsuid = current_fsuid();
2763
2764         if (uid_eq(i_uid_into_mnt(mnt_userns, inode), fsuid))
2765                 return 0;
2766         if (uid_eq(i_uid_into_mnt(mnt_userns, dir), fsuid))
2767                 return 0;
2768         return !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FOWNER);
2769 }
2770 EXPORT_SYMBOL(__check_sticky);
2771
2772 /*
2773  *      Check whether we can remove a link victim from directory dir, check
2774  *  whether the type of victim is right.
2775  *  1. We can't do it if dir is read-only (done in permission())
2776  *  2. We should have write and exec permissions on dir
2777  *  3. We can't remove anything from append-only dir
2778  *  4. We can't do anything with immutable dir (done in permission())
2779  *  5. If the sticky bit on dir is set we should either
2780  *      a. be owner of dir, or
2781  *      b. be owner of victim, or
2782  *      c. have CAP_FOWNER capability
2783  *  6. If the victim is append-only or immutable we can't do antyhing with
2784  *     links pointing to it.
2785  *  7. If the victim has an unknown uid or gid we can't change the inode.
2786  *  8. If we were asked to remove a directory and victim isn't one - ENOTDIR.
2787  *  9. If we were asked to remove a non-directory and victim isn't one - EISDIR.
2788  * 10. We can't remove a root or mountpoint.
2789  * 11. We don't allow removal of NFS sillyrenamed files; it's handled by
2790  *     nfs_async_unlink().
2791  */
2792 static int may_delete(struct user_namespace *mnt_userns, struct inode *dir,
2793                       struct dentry *victim, bool isdir)
2794 {
2795         struct inode *inode = d_backing_inode(victim);
2796         int error;
2797
2798         if (d_is_negative(victim))
2799                 return -ENOENT;
2800         BUG_ON(!inode);
2801
2802         BUG_ON(victim->d_parent->d_inode != dir);
2803
2804         /* Inode writeback is not safe when the uid or gid are invalid. */
2805         if (!uid_valid(i_uid_into_mnt(mnt_userns, inode)) ||
2806             !gid_valid(i_gid_into_mnt(mnt_userns, inode)))
2807                 return -EOVERFLOW;
2808
2809         audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
2810
2811         error = inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC);
2812         if (error)
2813                 return error;
2814         if (IS_APPEND(dir))
2815                 return -EPERM;
2816
2817         if (check_sticky(mnt_userns, dir, inode) || IS_APPEND(inode) ||
2818             IS_IMMUTABLE(inode) || IS_SWAPFILE(inode) ||
2819             HAS_UNMAPPED_ID(mnt_userns, inode))
2820                 return -EPERM;
2821         if (isdir) {
2822                 if (!d_is_dir(victim))
2823                         return -ENOTDIR;
2824                 if (IS_ROOT(victim))
2825                         return -EBUSY;
2826         } else if (d_is_dir(victim))
2827                 return -EISDIR;
2828         if (IS_DEADDIR(dir))
2829                 return -ENOENT;
2830         if (victim->d_flags & DCACHE_NFSFS_RENAMED)
2831                 return -EBUSY;
2832         return 0;
2833 }
2834
2835 /*      Check whether we can create an object with dentry child in directory
2836  *  dir.
2837  *  1. We can't do it if child already exists (open has special treatment for
2838  *     this case, but since we are inlined it's OK)
2839  *  2. We can't do it if dir is read-only (done in permission())
2840  *  3. We can't do it if the fs can't represent the fsuid or fsgid.
2841  *  4. We should have write and exec permissions on dir
2842  *  5. We can't do it if dir is immutable (done in permission())
2843  */
2844 static inline int may_create(struct user_namespace *mnt_userns,
2845                              struct inode *dir, struct dentry *child)
2846 {
2847         audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE);
2848         if (child->d_inode)
2849                 return -EEXIST;
2850         if (IS_DEADDIR(dir))
2851                 return -ENOENT;
2852         if (!fsuidgid_has_mapping(dir->i_sb, mnt_userns))
2853                 return -EOVERFLOW;
2854
2855         return inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC);
2856 }
2857
2858 /*
2859  * p1 and p2 should be directories on the same fs.
2860  */
2861 struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
2862 {
2863         struct dentry *p;
2864
2865         if (p1 == p2) {
2866                 inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
2867                 return NULL;
2868         }
2869
2870         mutex_lock(&p1->d_sb->s_vfs_rename_mutex);
2871
2872         p = d_ancestor(p2, p1);
2873         if (p) {
2874                 inode_lock_nested(p2->d_inode, I_MUTEX_PARENT);
2875                 inode_lock_nested(p1->d_inode, I_MUTEX_CHILD);
2876                 return p;
2877         }
2878
2879         p = d_ancestor(p1, p2);
2880         if (p) {
2881                 inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
2882                 inode_lock_nested(p2->d_inode, I_MUTEX_CHILD);
2883                 return p;
2884         }
2885
2886         inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
2887         inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2);
2888         return NULL;
2889 }
2890 EXPORT_SYMBOL(lock_rename);
2891
2892 void unlock_rename(struct dentry *p1, struct dentry *p2)
2893 {
2894         inode_unlock(p1->d_inode);
2895         if (p1 != p2) {
2896                 inode_unlock(p2->d_inode);
2897                 mutex_unlock(&p1->d_sb->s_vfs_rename_mutex);
2898         }
2899 }
2900 EXPORT_SYMBOL(unlock_rename);
2901
2902 /**
2903  * vfs_create - create new file
2904  * @mnt_userns: user namespace of the mount the inode was found from
2905  * @dir:        inode of @dentry
2906  * @dentry:     pointer to dentry of the base directory
2907  * @mode:       mode of the new file
2908  * @want_excl:  whether the file must not yet exist
2909  *
2910  * Create a new file.
2911  *
2912  * If the inode has been found through an idmapped mount the user namespace of
2913  * the vfsmount must be passed through @mnt_userns. This function will then take
2914  * care to map the inode according to @mnt_userns before checking permissions.
2915  * On non-idmapped mounts or if permission checking is to be performed on the
2916  * raw inode simply passs init_user_ns.
2917  */
2918 int vfs_create(struct user_namespace *mnt_userns, struct inode *dir,
2919                struct dentry *dentry, umode_t mode, bool want_excl)
2920 {
2921         int error = may_create(mnt_userns, dir, dentry);
2922         if (error)
2923                 return error;
2924
2925         if (!dir->i_op->create)
2926                 return -EACCES; /* shouldn't it be ENOSYS? */
2927         mode &= S_IALLUGO;
2928         mode |= S_IFREG;
2929         error = security_inode_create(dir, dentry, mode);
2930         if (error)
2931                 return error;
2932         error = dir->i_op->create(mnt_userns, dir, dentry, mode, want_excl);
2933         if (!error)
2934                 fsnotify_create(dir, dentry);
2935         return error;
2936 }
2937 EXPORT_SYMBOL(vfs_create);
2938
2939 int vfs_mkobj(struct dentry *dentry, umode_t mode,
2940                 int (*f)(struct dentry *, umode_t, void *),
2941                 void *arg)
2942 {
2943         struct inode *dir = dentry->d_parent->d_inode;
2944         int error = may_create(&init_user_ns, dir, dentry);
2945         if (error)
2946                 return error;
2947
2948         mode &= S_IALLUGO;
2949         mode |= S_IFREG;
2950         error = security_inode_create(dir, dentry, mode);
2951         if (error)
2952                 return error;
2953         error = f(dentry, mode, arg);
2954         if (!error)
2955                 fsnotify_create(dir, dentry);
2956         return error;
2957 }
2958 EXPORT_SYMBOL(vfs_mkobj);
2959
2960 bool may_open_dev(const struct path *path)
2961 {
2962         return !(path->mnt->mnt_flags & MNT_NODEV) &&
2963                 !(path->mnt->mnt_sb->s_iflags & SB_I_NODEV);
2964 }
2965
2966 static int may_open(struct user_namespace *mnt_userns, const struct path *path,
2967                     int acc_mode, int flag)
2968 {
2969         struct dentry *dentry = path->dentry;
2970         struct inode *inode = dentry->d_inode;
2971         int error;
2972
2973         if (!inode)
2974                 return -ENOENT;
2975
2976         switch (inode->i_mode & S_IFMT) {
2977         case S_IFLNK:
2978                 return -ELOOP;
2979         case S_IFDIR:
2980                 if (acc_mode & MAY_WRITE)
2981                         return -EISDIR;
2982                 if (acc_mode & MAY_EXEC)
2983                         return -EACCES;
2984                 break;
2985         case S_IFBLK:
2986         case S_IFCHR:
2987                 if (!may_open_dev(path))
2988                         return -EACCES;
2989                 fallthrough;
2990         case S_IFIFO:
2991         case S_IFSOCK:
2992                 if (acc_mode & MAY_EXEC)
2993                         return -EACCES;
2994                 flag &= ~O_TRUNC;
2995                 break;
2996         case S_IFREG:
2997                 if ((acc_mode & MAY_EXEC) && path_noexec(path))
2998                         return -EACCES;
2999                 break;
3000         }
3001
3002         error = inode_permission(mnt_userns, inode, MAY_OPEN | acc_mode);
3003         if (error)
3004                 return error;
3005
3006         /*
3007          * An append-only file must be opened in append mode for writing.
3008          */
3009         if (IS_APPEND(inode)) {
3010                 if  ((flag & O_ACCMODE) != O_RDONLY && !(flag & O_APPEND))
3011                         return -EPERM;
3012                 if (flag & O_TRUNC)
3013                         return -EPERM;
3014         }
3015
3016         /* O_NOATIME can only be set by the owner or superuser */
3017         if (flag & O_NOATIME && !inode_owner_or_capable(mnt_userns, inode))
3018                 return -EPERM;
3019
3020         return 0;
3021 }
3022
3023 static int handle_truncate(struct user_namespace *mnt_userns, struct file *filp)
3024 {
3025         const struct path *path = &filp->f_path;
3026         struct inode *inode = path->dentry->d_inode;
3027         int error = get_write_access(inode);
3028         if (error)
3029                 return error;
3030         /*
3031          * Refuse to truncate files with mandatory locks held on them.
3032          */
3033         error = locks_verify_locked(filp);
3034         if (!error)
3035                 error = security_path_truncate(path);
3036         if (!error) {
3037                 error = do_truncate(mnt_userns, path->dentry, 0,
3038                                     ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
3039                                     filp);
3040         }
3041         put_write_access(inode);
3042         return error;
3043 }
3044
/*
 * Adjust open(2) flags for use by the namei code: when both access-mode
 * bits are set (access mode value 3) the value is lowered by one, so the
 * access mode becomes 2.  All other flag values pass through unchanged.
 */
static inline int open_to_namei_flags(int flag)
{
	return ((flag & O_ACCMODE) == 3) ? flag - 1 : flag;
}
3051
3052 static int may_o_create(struct user_namespace *mnt_userns,
3053                         const struct path *dir, struct dentry *dentry,
3054                         umode_t mode)
3055 {
3056         int error = security_path_mknod(dir, dentry, mode, 0);
3057         if (error)
3058                 return error;
3059
3060         if (!fsuidgid_has_mapping(dir->dentry->d_sb, mnt_userns))
3061                 return -EOVERFLOW;
3062
3063         error = inode_permission(mnt_userns, dir->dentry->d_inode,
3064                                  MAY_WRITE | MAY_EXEC);
3065         if (error)
3066                 return error;
3067
3068         return security_inode_create(dir->dentry->d_inode, dentry, mode);
3069 }
3070
3071 /*
3072  * Attempt to atomically look up, create and open a file from a negative
3073  * dentry.
3074  *
3075  * Returns 0 if successful.  The file will have been created and attached to
3076  * @file by the filesystem calling finish_open().
3077  *
3078  * If the file was looked up only or didn't need creating, FMODE_OPENED won't
3079  * be set.  The caller will need to perform the open themselves.  @path will
3080  * have been updated to point to the new dentry.  This may be negative.
3081  *
3082  * Returns an error code otherwise.
3083  */
3084 static struct dentry *atomic_open(struct nameidata *nd, struct dentry *dentry,
3085                                   struct file *file,
3086                                   int open_flag, umode_t mode)
3087 {
3088         struct dentry *const DENTRY_NOT_SET = (void *) -1UL;
3089         struct inode *dir =  nd->path.dentry->d_inode;
3090         int error;
3091
3092         if (nd->flags & LOOKUP_DIRECTORY)
3093                 open_flag |= O_DIRECTORY;
3094
3095         file->f_path.dentry = DENTRY_NOT_SET;
3096         file->f_path.mnt = nd->path.mnt;
3097         error = dir->i_op->atomic_open(dir, dentry, file,
3098                                        open_to_namei_flags(open_flag), mode);
3099         d_lookup_done(dentry);
3100         if (!error) {
3101                 if (file->f_mode & FMODE_OPENED) {
3102                         if (unlikely(dentry != file->f_path.dentry)) {
3103                                 dput(dentry);
3104                                 dentry = dget(file->f_path.dentry);
3105                         }
3106                 } else if (WARN_ON(file->f_path.dentry == DENTRY_NOT_SET)) {
3107                         error = -EIO;
3108                 } else {
3109                         if (file->f_path.dentry) {
3110                                 dput(dentry);
3111                                 dentry = file->f_path.dentry;
3112                         }
3113                         if (unlikely(d_is_negative(dentry)))
3114                                 error = -ENOENT;
3115                 }
3116         }
3117         if (error) {
3118                 dput(dentry);
3119                 dentry = ERR_PTR(error);
3120         }
3121         return dentry;
3122 }
3123
3124 /*
3125  * Look up and maybe create and open the last component.
3126  *
3127  * Must be called with parent locked (exclusive in O_CREAT case).
3128  *
3129  * Returns 0 on success, that is, if
3130  *  the file was successfully atomically created (if necessary) and opened, or
3131  *  the file was not completely opened at this time, though lookups and
3132  *  creations were performed.
3133  * These case are distinguished by presence of FMODE_OPENED on file->f_mode.
3134  * In the latter case dentry returned in @path might be negative if O_CREAT
3135  * hadn't been specified.
3136  *
3137  * An error code is returned on failure.
3138  */
3139 static struct dentry *lookup_open(struct nameidata *nd, struct file *file,
3140                                   const struct open_flags *op,
3141                                   bool got_write)
3142 {
3143         struct user_namespace *mnt_userns;
3144         struct dentry *dir = nd->path.dentry;
3145         struct inode *dir_inode = dir->d_inode;
3146         int open_flag = op->open_flag;
3147         struct dentry *dentry;
3148         int error, create_error = 0;
3149         umode_t mode = op->mode;
3150         DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
3151
3152         if (unlikely(IS_DEADDIR(dir_inode)))
3153                 return ERR_PTR(-ENOENT);
3154
3155         file->f_mode &= ~FMODE_CREATED;
3156         dentry = d_lookup(dir, &nd->last);
3157         for (;;) {
3158                 if (!dentry) {
3159                         dentry = d_alloc_parallel(dir, &nd->last, &wq);
3160                         if (IS_ERR(dentry))
3161                                 return dentry;
3162                 }
3163                 if (d_in_lookup(dentry))
3164                         break;
3165
3166                 error = d_revalidate(dentry, nd->flags);
3167                 if (likely(error > 0))
3168                         break;
3169                 if (error)
3170                         goto out_dput;
3171                 d_invalidate(dentry);
3172                 dput(dentry);
3173                 dentry = NULL;
3174         }
3175         if (dentry->d_inode) {
3176                 /* Cached positive dentry: will open in f_op->open */
3177                 return dentry;
3178         }
3179
3180         /*
3181          * Checking write permission is tricky, bacuse we don't know if we are
3182          * going to actually need it: O_CREAT opens should work as long as the
3183          * file exists.  But checking existence breaks atomicity.  The trick is
3184          * to check access and if not granted clear O_CREAT from the flags.
3185          *
3186          * Another problem is returing the "right" error value (e.g. for an
3187          * O_EXCL open we want to return EEXIST not EROFS).
3188          */
3189         if (unlikely(!got_write))
3190                 open_flag &= ~O_TRUNC;
3191         mnt_userns = mnt_user_ns(nd->path.mnt);
3192         if (open_flag & O_CREAT) {
3193                 if (open_flag & O_EXCL)
3194                         open_flag &= ~O_TRUNC;
3195                 if (!IS_POSIXACL(dir->d_inode))
3196                         mode &= ~current_umask();
3197                 if (likely(got_write))
3198                         create_error = may_o_create(mnt_userns, &nd->path,
3199                                                     dentry, mode);
3200                 else
3201                         create_error = -EROFS;
3202         }
3203         if (create_error)
3204                 open_flag &= ~O_CREAT;
3205         if (dir_inode->i_op->atomic_open) {
3206                 dentry = atomic_open(nd, dentry, file, open_flag, mode);
3207                 if (unlikely(create_error) && dentry == ERR_PTR(-ENOENT))
3208                         dentry = ERR_PTR(create_error);
3209                 return dentry;
3210         }
3211
3212         if (d_in_lookup(dentry)) {
3213                 struct dentry *res = dir_inode->i_op->lookup(dir_inode, dentry,
3214                                                              nd->flags);
3215                 d_lookup_done(dentry);
3216                 if (unlikely(res)) {
3217                         if (IS_ERR(res)) {
3218                                 error = PTR_ERR(res);
3219                                 goto out_dput;
3220                         }
3221                         dput(dentry);
3222                         dentry = res;
3223                 }
3224         }
3225
3226         /* Negative dentry, just create the file */
3227         if (!dentry->d_inode && (open_flag & O_CREAT)) {
3228                 file->f_mode |= FMODE_CREATED;
3229                 audit_inode_child(dir_inode, dentry, AUDIT_TYPE_CHILD_CREATE);
3230                 if (!dir_inode->i_op->create) {
3231                         error = -EACCES;
3232                         goto out_dput;
3233                 }
3234
3235                 error = dir_inode->i_op->create(mnt_userns, dir_inode, dentry,
3236                                                 mode, open_flag & O_EXCL);
3237                 if (error)
3238                         goto out_dput;
3239         }
3240         if (unlikely(create_error) && !dentry->d_inode) {
3241                 error = create_error;
3242                 goto out_dput;
3243         }
3244         return dentry;
3245
3246 out_dput:
3247         dput(dentry);
3248         return ERR_PTR(error);
3249 }
3250
/*
 * Handle the last component of the pathname on the open() path.
 *
 * op->intent is added to nd->flags.  Without O_CREAT the dentry is first
 * tried via lookup_fast(); otherwise, or if that finds nothing, the parent
 * directory's inode lock is taken (exclusive for O_CREAT, shared if not)
 * and lookup_open() is called.
 *
 * Returns NULL, an ERR_PTR(), or the non-NULL result of handle_dots() or
 * step_into().  path_openat() passes every non-NULL return back to
 * link_path_walk() and calls do_open() once NULL is returned.
 */
3251 static const char *open_last_lookups(struct nameidata *nd,
3252                    struct file *file, const struct open_flags *op)
3253 {
3254         struct dentry *dir = nd->path.dentry;
3255         int open_flag = op->open_flag;
3256         bool got_write = false;
3257         unsigned seq;
3258         struct inode *inode;
3259         struct dentry *dentry;
3260         const char *res;
3261
3262         nd->flags |= op->intent;
3263
             /* Last component is not a normal name: left to handle_dots(). */
3264         if (nd->last_type != LAST_NORM) {
3265                 if (nd->depth)
3266                         put_link(nd);
3267                 return handle_dots(nd, nd->last_type);
3268         }
3269
3270         if (!(open_flag & O_CREAT)) {
                     /* the name does not end right after the last component */
3271                 if (nd->last.name[nd->last.len])
3272                         nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
3273                 /* we _can_ be in RCU mode here */
3274                 dentry = lookup_fast(nd, &inode, &seq);
3275                 if (IS_ERR(dentry))
3276                         return ERR_CAST(dentry);
3277                 if (likely(dentry))
3278                         goto finish_lookup;
3279
                     /* lookup_fast() found nothing; still being in RCU mode here is a bug */
3280                 BUG_ON(nd->flags & LOOKUP_RCU);
3281         } else {
3282                 /* create side of things */
3283                 if (nd->flags & LOOKUP_RCU) {
3284                         if (!try_to_unlazy(nd))
3285                                 return ERR_PTR(-ECHILD);
3286                 }
3287                 audit_inode(nd->name, dir, AUDIT_INODE_PARENT);
3288                 /* trailing slashes? */
3289                 if (unlikely(nd->last.name[nd->last.len]))
3290                         return ERR_PTR(-EISDIR);
3291         }
3292
3293         if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) {
3294                 got_write = !mnt_want_write(nd->path.mnt);
3295                 /*
3296                  * do _not_ fail yet - we might not need that or fail with
3297                  * a different error; let lookup_open() decide; we'll be
3298                  * dropping this one anyway.
3299                  */
3300         }
             /* Exclusive lock on the parent for O_CREAT, shared lock otherwise. */
3301         if (open_flag & O_CREAT)
3302                 inode_lock(dir->d_inode);
3303         else
3304                 inode_lock_shared(dir->d_inode);
3305         dentry = lookup_open(nd, file, op, got_write);
             /* The creation notification is sent while the parent is still locked. */
3306         if (!IS_ERR(dentry) && (file->f_mode & FMODE_CREATED))
3307                 fsnotify_create(dir->d_inode, dentry);
3308         if (open_flag & O_CREAT)
3309                 inode_unlock(dir->d_inode);
3310         else
3311                 inode_unlock_shared(dir->d_inode);
3312
3313         if (got_write)
3314                 mnt_drop_write(nd->path.mnt);
3315
3316         if (IS_ERR(dentry))
3317                 return ERR_CAST(dentry);
3318
             /*
              * The file was already opened or created by lookup_open():
              * make nd->path refer to it, dropping the reference on the
              * dentry nd->path held so far, and stop walking.
              */
3319         if (file->f_mode & (FMODE_OPENED | FMODE_CREATED)) {
3320                 dput(nd->path.dentry);
3321                 nd->path.dentry = dentry;
3322                 return NULL;
3323         }
3324
             /*
              * NOTE(review): inode and seq are only written through
              * lookup_fast(); when this label is reached after lookup_open()
              * they are passed to step_into() unassigned - presumably
              * step_into() does not read them in that case; confirm.
              */
3325 finish_lookup:
3326         if (nd->depth)
3327                 put_link(nd);
3328         res = step_into(nd, WALK_TRAILING, dentry, inode, seq);
             /* A non-NULL result goes back to the caller's walk: clear the open/create intent bits. */
3329         if (unlikely(res))
3330                 nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
3331         return res;
3332 }
3333
3334 /*
3335  * Handle the last step of open()
3336  */
3337 static int do_open(struct nameidata *nd,
3338                    struct file *file, const struct open_flags *op)
3339 {
3340         struct user_namespace *mnt_userns;
3341         int open_flag = op->open_flag;
3342         bool do_truncate;
3343         int acc_mode;
3344         int error;
3345
3346         if (!(file->f_mode & (FMODE_OPENED | FMODE_CREATED))) {
3347                 error = complete_walk(nd);
3348                 if (error)
3349                         return error;
3350         }
3351         if (!(file->f_mode & FMODE_CREATED))
3352                 audit_inode(nd->name, nd->path.dentry, 0);
3353         mnt_userns = mnt_user_ns(nd->path.mnt);
3354         if (open_flag & O_CREAT) {
3355                 if ((open_flag & O_EXCL) && !(file->f_mode & FMODE_CREATED))
3356                         return -EEXIST;
3357                 if (d_is_dir(nd->path.dentry))
3358                         return -EISDIR;
3359                 error = may_create_in_sticky(mnt_userns, nd,
3360                                              d_backing_inode(nd->path.dentry));
3361                 if (unlikely(error))
3362                         return error;
3363         }
3364         if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry))
3365                 return -ENOTDIR;
3366
3367         do_truncate = false;
3368         acc_mode = op->acc_mode;
3369         if (file->f_mode & FMODE_CREATED) {
3370                 /* Don't check for write permission, don't truncate */
3371                 open_flag &= ~O_TRUNC;
3372                 acc_mode = 0;
3373         } else if (d_is_reg(nd->path.dentry) && open_flag & O_TRUNC) {
3374                 error = mnt_want_write(nd->path.mnt);
3375                 if (error)
3376                         return error;
3377                 do_truncate = true;
3378         }
3379         error = may_open(mnt_userns, &nd->path, acc_mode, open_flag);
3380         if (!error && !(file->f_mode & FMODE_OPENED))
3381                 error = vfs_open(&nd->path, file);
3382         if (!error)
3383                 error = ima_file_check(file, op->acc_mode);
3384         if (!error && do_truncate)
3385                 error = handle_truncate(mnt_userns, file);
3386         if (unlikely(error > 0)) {
3387                 WARN_ON(1);
3388                 error = -EINVAL;
3389         }
3390         if (do_truncate)
3391                 mnt_drop_write(nd->path.mnt);
3392         return error;
3393 }
3394
3395 /**
3396  * vfs_tmpfile - create tmpfile
3397  * @mnt_userns: user namespace of the mount the inode was found from
3398  * @dentry:     pointer to dentry of the base directory
3399  * @mode:       mode of the new tmpfile
3400  * @open_flag:  flags
3401  *
3402  * Create a temporary file.
3403  *
3404  * If the inode has been found through an idmapped mount the user namespace of
3405  * the vfsmount must be passed through @mnt_userns. This function will then take
3406  * care to map the inode according to @mnt_userns before checking permissions.
3407  * On non-idmapped mounts or if permission checking is to be performed on the
3408  * raw inode simply passs init_user_ns.
3409  */
3410 struct dentry *vfs_tmpfile(struct user_namespace *mnt_userns,
3411                            struct dentry *dentry, umode_t mode, int open_flag)
3412 {
3413         struct dentry *child = NULL;
3414         struct inode *dir = dentry->d_inode;
3415         struct inode *inode;
3416         int error;
3417
3418         /* we want directory to be writable */
3419         error = inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC);
3420         if (error)
3421                 goto out_err;
3422         error = -EOPNOTSUPP;
3423         if (!dir->i_op->tmpfile)
3424                 goto out_err;
3425         error = -ENOMEM;
3426         child = d_alloc(dentry, &slash_name);
3427         if (unlikely(!child))
3428                 goto out_err;
3429         error = dir->i_op->tmpfile(mnt_userns, dir, child, mode);
3430         if (error)
3431                 goto out_err;
3432         error = -ENOENT;
3433         inode = child->d_inode;
3434         if (unlikely(!inode))
3435                 goto out_err;
3436         if (!(open_flag & O_EXCL)) {
3437                 spin_lock(&inode->i_lock);
3438                 inode->i_state |= I_LINKABLE;
3439                 spin_unlock(&inode->i_lock);
3440         }
3441         ima_post_create_tmpfile(mnt_userns, inode);
3442         return child;
3443
3444 out_err:
3445         dput(child);
3446         return ERR_PTR(error);
3447 }
3448 EXPORT_SYMBOL(vfs_tmpfile);
3449
3450 static int do_tmpfile(struct nameidata *nd, unsigned flags,
3451                 const struct open_flags *op,
3452                 struct file *file)
3453 {
3454         struct user_namespace *mnt_userns;
3455         struct dentry *child;
3456         struct path path;
3457         int error = path_lookupat(nd, flags | LOOKUP_DIRECTORY, &path);
3458         if (unlikely(error))
3459                 return error;
3460         error = mnt_want_write(path.mnt);
3461         if (unlikely(error))
3462                 goto out;
3463         mnt_userns = mnt_user_ns(path.mnt);
3464         child = vfs_tmpfile(mnt_userns, path.dentry, op->mode, op->open_flag);
3465         error = PTR_ERR(child);
3466         if (IS_ERR(child))
3467                 goto out2;
3468         dput(path.dentry);
3469         path.dentry = child;
3470         audit_inode(nd->name, child, 0);
3471         /* Don't check for other permissions, the inode was just created */
3472         error = may_open(mnt_userns, &path, 0, op->open_flag);
3473         if (!error)
3474                 error = vfs_open(&path, file);
3475 out2:
3476         mnt_drop_write(path.mnt);
3477 out:
3478         path_put(&path);
3479         return error;
3480 }
3481
3482 static int do_o_path(struct nameidata *nd, unsigned flags, struct file *file)
3483 {
3484         struct path path;
3485         int error = path_lookupat(nd, flags, &path);
3486         if (!error) {
3487                 audit_inode(nd->name, path.dentry, 0);
3488                 error = vfs_open(&path, file);
3489                 path_put(&path);
3490         }
3491         return error;
3492 }
3493
3494 static struct file *path_openat(struct nameidata *nd,
3495                         const struct open_flags *op, unsigned flags)
3496 {
3497         struct file *file;
3498         int error;
3499
3500         file = alloc_empty_file(op->open_flag, current_cred());
3501         if (IS_ERR(file))
3502                 return file;
3503
3504         if (unlikely(file->f_flags & __O_TMPFILE)) {
3505                 error = do_tmpfile(nd, flags, op, file);
3506         } else if (unlikely(file->f_flags & O_PATH)) {
3507                 error = do_o_path(nd, flags, file);
3508         } else {
3509                 const char *s = path_init(nd, flags);
3510                 while (!(error = link_path_walk(s, nd)) &&
3511                        (s = open_last_lookups(nd, file, op)) != NULL)
3512                         ;
3513                 if (!error)
3514                         error = do_open(nd, file, op);
3515                 terminate_walk(nd);
3516         }
3517         if (likely(!error)) {
3518                 if (likely(file->f_mode & FMODE_OPENED))
3519                         return file;
3520                 WARN_ON(1);
3521                 error = -EINVAL;
3522         }
3523         fput(file);
3524         if (error == -EOPENSTALE) {
3525                 if (flags & LOOKUP_RCU)
3526                         error = -ECHILD;
3527                 else
3528                         error = -ESTALE;
3529         }
3530         return ERR_PTR(error);
3531 }
3532
3533 struct file *do_filp_open(int dfd, struct filename *pathname,
3534                 const struct open_flags *op)
3535 {
3536         struct nameidata nd;
3537         int flags = op->lookup_flags;
3538         struct file *filp;
3539
3540         set_nameidata(&nd, dfd, pathname, NULL);
3541         filp = path_openat(&nd, op, flags | LOOKUP_RCU);
3542         if (unlikely(filp == ERR_PTR(-ECHILD)))
3543                 filp = path_openat(&nd, op, flags);
3544         if (unlikely(filp == ERR_PTR(-ESTALE)))
3545                 filp = path_openat(&nd, op, flags | LOOKUP_REVAL);
3546         restore_nameidata();
3547         return filp;
3548 }
3549
3550 struct file *do_file_open_root(const struct path *root,
3551                 const char *name, const struct open_flags *op)
3552 {
3553         struct nameidata nd;
3554         struct file *file;
3555         struct filename *filename;
3556         int flags = op->lookup_flags;
3557
3558         if (d_is_symlink(root->dentry) && op->intent & LOOKUP_OPEN)
3559                 return ERR_PTR(-ELOOP);
3560
3561         filename = getname_kernel(name);
3562         if (IS_ERR(filename))
3563                 return ERR_CAST(filename);
3564
3565         set_nameidata(&nd, -1, filename, root);
3566         file = path_openat(&nd, op, flags | LOOKUP_RCU);
3567         if (unlikely(file == ERR_PTR(-ECHILD)))
3568                 file = path_openat(&nd, op, flags);
3569         if (unlikely(file == ERR_PTR(-ESTALE)))
3570                 file = path_openat(&nd, op, flags | LOOKUP_REVAL);
3571         restore_nameidata();
3572         putname(filename);
3573         return file;
3574 }
3575
3576 static struct dentry *filename_create(int dfd, struct filename *name,
3577                                 struct path *path, unsigned int lookup_flags)
3578 {
3579         struct dentry *dentry = ERR_PTR(-EEXIST);
3580         struct qstr last;
3581         int type;
3582         int err2;
3583         int error;
3584         bool is_dir = (lookup_flags & LOOKUP_DIRECTORY);
3585
3586         /*
3587          * Note that only LOOKUP_REVAL and LOOKUP_DIRECTORY matter here. Any
3588          * other flags passed in are ignored!
3589          */
3590         lookup_flags &= LOOKUP_REVAL;
3591
3592         error = filename_parentat(dfd, name, lookup_flags, path, &last, &type);
3593         if (error)
3594                 return ERR_PTR(error);
3595
3596         /*
3597          * Yucky last component or no last component at all?
3598          * (foo/., foo/.., /////)
3599          */
3600         if (unlikely(type != LAST_NORM))
3601                 goto out;
3602
3603         /* don't fail immediately if it's r/o, at least try to report other errors */
3604         err2 = mnt_want_write(path->mnt);
3605         /*
3606          * Do the final lookup.
3607          */
3608         lookup_flags |= LOOKUP_CREATE | LOOKUP_EXCL;
3609         inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT);
3610         dentry = __lookup_hash(&last, path->dentry, lookup_flags);
3611         if (IS_ERR(dentry))
3612                 goto unlock;
3613
3614         error = -EEXIST;
3615         if (d_is_positive(dentry))
3616                 goto fail;
3617
3618         /*
3619          * Special case - lookup gave negative, but... we had foo/bar/
3620          * From the vfs_mknod() POV we just have a negative dentry -
3621          * all is fine. Let's be bastards - you had / on the end, you've
3622          * been asking for (non-existent) directory. -ENOENT for you.
3623          */
3624         if (unlikely(!is_dir && last.name[last.len])) {
3625                 error = -ENOENT;
3626                 goto fail;
3627         }
3628         if (unlikely(err2)) {
3629                 error = err2;
3630                 goto fail;
3631         }
3632         return dentry;
3633 fail:
3634         dput(dentry);
3635         dentry = ERR_PTR(error);
3636 unlock:
3637         inode_unlock(path->dentry->d_inode);
3638         if (!err2)
3639                 mnt_drop_write(path->mnt);
3640 out:
3641         path_put(path);
3642         return dentry;
3643 }
3644
/*
 * filename_create() for a pathname that lives in kernel memory.  The result
 * of getname_kernel() is handed to filename_create() as is; it is neither
 * checked for errors nor released here.
 */
struct dentry *kern_path_create(int dfd, const char *pathname,
				struct path *path, unsigned int lookup_flags)
{
	struct filename *name = getname_kernel(pathname);

	return filename_create(dfd, name, path, lookup_flags);
}
EXPORT_SYMBOL(kern_path_create);
3652
3653 void done_path_create(struct path *path, struct dentry *dentry)
3654 {
3655         dput(dentry);
3656         inode_unlock(path->dentry->d_inode);
3657         mnt_drop_write(path->mnt);
3658         path_put(path);
3659 }
3660 EXPORT_SYMBOL(done_path_create);
3661
3662 inline struct dentry *user_path_create(int dfd, const char __user *pathname,
3663                                 struct path *path, unsigned int lookup_flags)
3664 {
3665         return filename_create(dfd, getname(pathname), path, lookup_flags);
3666 }
3667 EXPORT_SYMBOL(user_path_create);
3668
3669 /**
3670  * vfs_mknod - create device node or file
3671  * @mnt_userns: user namespace of the mount the inode was found from
3672  * @dir:        inode of @dentry
3673  * @dentry:     pointer to dentry of the base directory
3674  * @mode:       mode of the new device node or file
3675  * @dev:        device number of device to create
3676  *
3677  * Create a device node or file.
3678  *
3679  * If the inode has been found through an idmapped mount the user namespace of
3680  * the vfsmount must be passed through @mnt_userns. This function will then take
3681  * care to map the inode according to @mnt_userns before checking permissions.
3682  * On non-idmapped mounts or if permission checking is to be performed on the
3683  * raw inode simply passs init_user_ns.
3684  */
3685 int vfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
3686               struct dentry *dentry, umode_t mode, dev_t dev)
3687 {
3688         bool is_whiteout = S_ISCHR(mode) && dev == WHITEOUT_DEV;
3689         int error = may_create(mnt_userns, dir, dentry);
3690
3691         if (error)
3692                 return error;
3693
3694         if ((S_ISCHR(mode) || S_ISBLK(mode)) && !is_whiteout &&
3695             !capable(CAP_MKNOD))
3696                 return -EPERM;
3697
3698         if (!dir->i_op->mknod)
3699                 return -EPERM;
3700
3701         error = devcgroup_inode_mknod(mode, dev);
3702         if (error)
3703                 return error;
3704
3705         error = security_inode_mknod(dir, dentry, mode, dev);
3706         if (error)
3707                 return error;
3708
3709         error = dir->i_op->mknod(mnt_userns, dir, dentry, mode, dev);
3710         if (!error)
3711                 fsnotify_create(dir, dentry);
3712         return error;
3713 }
3714 EXPORT_SYMBOL(vfs_mknod);
3715
3716 static int may_mknod(umode_t mode)
3717 {
3718         switch (mode & S_IFMT) {
3719         case S_IFREG:
3720         case S_IFCHR:
3721         case S_IFBLK:
3722         case S_IFIFO:
3723         case S_IFSOCK:
3724         case 0: /* zero mode translates to S_IFREG */
3725                 return 0;
3726         case S_IFDIR:
3727                 return -EPERM;
3728         default:
3729                 return -EINVAL;
3730         }
3731 }
3732
3733 static long do_mknodat(int dfd, const char __user *filename, umode_t mode,
3734                 unsigned int dev)
3735 {
3736         struct user_namespace *mnt_userns;
3737         struct dentry *dentry;
3738         struct path path;
3739         int error;
3740         unsigned int lookup_flags = 0;
3741
3742         error = may_mknod(mode);
3743         if (error)
3744                 return error;
3745 retry:
3746         dentry = user_path_create(dfd, filename, &path, lookup_flags);
3747         if (IS_ERR(dentry))
3748                 return PTR_ERR(dentry);
3749
3750         if (!IS_POSIXACL(path.dentry->d_inode))
3751                 mode &= ~current_umask();
3752         error = security_path_mknod(&path, dentry, mode, dev);
3753         if (error)
3754                 goto out;
3755
3756         mnt_userns = mnt_user_ns(path.mnt);
3757         switch (mode & S_IFMT) {
3758                 case 0: case S_IFREG:
3759                         error = vfs_create(mnt_userns, path.dentry->d_inode,
3760                                            dentry, mode, true);
3761                         if (!error)
3762                                 ima_post_path_mknod(mnt_userns, dentry);
3763                         break;
3764                 case S_IFCHR: case S_IFBLK:
3765                         error = vfs_mknod(mnt_userns, path.dentry->d_inode,
3766                                           dentry, mode, new_decode_dev(dev));
3767                         break;
3768                 case S_IFIFO: case S_IFSOCK:
3769                         error = vfs_mknod(mnt_userns, path.dentry->d_inode,
3770                                           dentry, mode, 0);
3771                         break;
3772         }
3773 out:
3774         done_path_create(&path, dentry);
3775         if (retry_estale(error, lookup_flags)) {
3776                 lookup_flags |= LOOKUP_REVAL;
3777                 goto retry;
3778         }
3779         return error;
3780 }
3781
/* mknodat() system call entry: passes its arguments unchanged to do_mknodat(). */
3782 SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode,
3783                 unsigned int, dev)
3784 {
3785         return do_mknodat(dfd, filename, mode, dev);
3786 }
3787
/* mknod() system call entry: do_mknodat() with AT_FDCWD as the directory fd. */
3788 SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, dev)
3789 {
3790         return do_mknodat(AT_FDCWD, filename, mode, dev);
3791 }
3792
3793 /**
3794  * vfs_mkdir - create directory
3795  * @mnt_userns: user namespace of the mount the inode was found from
3796  * @dir:        inode of @dentry
3797  * @dentry:     pointer to dentry of the base directory
3798  * @mode:       mode of the new directory
3799  *
3800  * Create a directory.
3801  *
3802  * If the inode has been found through an idmapped mount the user namespace of
3803  * the vfsmount must be passed through @mnt_userns. This function will then take
3804  * care to map the inode according to @mnt_userns before checking permissions.
3805  * On non-idmapped mounts or if permission checking is to be performed on the
3806  * raw inode simply passs init_user_ns.
3807  */
3808 int vfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
3809               struct dentry *dentry, umode_t mode)
3810 {
3811         int error = may_create(mnt_userns, dir, dentry);
3812         unsigned max_links = dir->i_sb->s_max_links;
3813
3814         if (error)
3815                 return error;
3816
3817         if (!dir->i_op->mkdir)
3818                 return -EPERM;
3819
3820         mode &= (S_IRWXUGO|S_ISVTX);
3821         error = security_inode_mkdir(dir, dentry, mode);
3822         if (error)
3823                 return error;
3824
3825         if (max_links && dir->i_nlink >= max_links)
3826                 return -EMLINK;
3827
3828         error = dir->i_op->mkdir(mnt_userns, dir, dentry, mode);
3829         if (!error)
3830                 fsnotify_mkdir(dir, dentry);
3831         return error;
3832 }
3833 EXPORT_SYMBOL(vfs_mkdir);
3834
3835 static long do_mkdirat(int dfd, const char __user *pathname, umode_t mode)
3836 {
3837         struct dentry *dentry;
3838         struct path path;
3839         int error;
3840         unsigned int lookup_flags = LOOKUP_DIRECTORY;
3841
3842 retry:
3843         dentry = user_path_create(dfd, pathname, &path, lookup_flags);
3844         if (IS_ERR(dentry))
3845                 return PTR_ERR(dentry);
3846
3847         if (!IS_POSIXACL(path.dentry->d_inode))
3848                 mode &= ~current_umask();
3849         error = security_path_mkdir(&path, dentry, mode);
3850         if (!error) {
3851                 struct user_namespace *mnt_userns;
3852                 mnt_userns = mnt_user_ns(path.mnt);
3853                 error = vfs_mkdir(mnt_userns, path.dentry->d_inode, dentry,
3854                                   mode);
3855         }
3856         done_path_create(&path, dentry);
3857         if (retry_estale(error, lookup_flags)) {
3858                 lookup_flags |= LOOKUP_REVAL;
3859                 goto retry;
3860         }
3861         return error;
3862 }
3863
/* mkdirat() system call entry: passes its arguments unchanged to do_mkdirat(). */
3864 SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode)
3865 {
3866         return do_mkdirat(dfd, pathname, mode);
3867 }
3868
/* mkdir() system call entry: do_mkdirat() with AT_FDCWD as the directory fd. */
3869 SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode)
3870 {
3871         return do_mkdirat(AT_FDCWD, pathname, mode);
3872 }
3873
3874 /**
3875  * vfs_rmdir - remove directory
3876  * @mnt_userns: user namespace of the mount the inode was found from
3877  * @dir:        inode of @dentry
3878  * @dentry:     pointer to dentry of the base directory
3879  *
3880  * Remove a directory.
3881  *
3882  * If the inode has been found through an idmapped mount the user namespace of
3883  * the vfsmount must be passed through @mnt_userns. This function will then take
3884  * care to map the inode according to @mnt_userns before checking permissions.
3885  * On non-idmapped mounts or if permission checking is to be performed on the
3886  * raw inode simply passs init_user_ns.
3887  */
3888 int vfs_rmdir(struct user_namespace *mnt_userns, struct inode *dir,
3889                      struct dentry *dentry)
3890 {
3891         int error = may_delete(mnt_userns, dir, dentry, 1);
3892
3893         if (error)
3894                 return error;
3895
3896         if (!dir->i_op->rmdir)
3897                 return -EPERM;
3898
3899         dget(dentry);
3900         inode_lock(dentry->d_inode);
3901
3902         error = -EBUSY;
3903         if (is_local_mountpoint(dentry))
3904                 goto out;
3905
3906         error = security_inode_rmdir(dir, dentry);
3907         if (error)
3908                 goto out;
3909
3910         error = dir->i_op->rmdir(dir, dentry);
3911         if (error)
3912                 goto out;
3913
3914         shrink_dcache_parent(dentry);
3915         dentry->d_inode->i_flags |= S_DEAD;
3916         dont_mount(dentry);
3917         detach_mounts(dentry);
3918         fsnotify_rmdir(dir, dentry);
3919
3920 out:
3921         inode_unlock(dentry->d_inode);
3922         dput(dentry);
3923         if (!error)
3924                 d_delete(dentry);
3925         return error;
3926 }
3927 EXPORT_SYMBOL(vfs_rmdir);
3928
/*
 * Remove the directory named by @name, looked up relative to @dfd.
 *
 * The parent is found with __filename_parentat(), the last component is
 * looked up under the parent's inode lock and removed with vfs_rmdir().
 * The sequence is repeated once with LOOKUP_REVAL when retry_estale() asks
 * for it.  @name is released with putname() on every return path.
 *
 * Returns 0 or a negative errno.
 */
3929 long do_rmdir(int dfd, struct filename *name)
3930 {
3931         struct user_namespace *mnt_userns;
3932         int error;
3933         struct dentry *dentry;
3934         struct path path;
3935         struct qstr last;
3936         int type;
3937         unsigned int lookup_flags = 0;
3938 retry:
3939         error = __filename_parentat(dfd, name, lookup_flags, &path, &last, &type);
3940         if (error)
3941                 goto exit1;
3942
             /* "..", "." and the root are refused, each with its own error. */
3943         switch (type) {
3944         case LAST_DOTDOT:
3945                 error = -ENOTEMPTY;
3946                 goto exit2;
3947         case LAST_DOT:
3948                 error = -EINVAL;
3949                 goto exit2;
3950         case LAST_ROOT:
3951                 error = -EBUSY;
3952                 goto exit2;
3953         }
3954
3955         error = mnt_want_write(path.mnt);
3956         if (error)
3957                 goto exit2;
3958
             /* Look the victim up with the parent's inode locked. */
3959         inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT);
3960         dentry = __lookup_hash(&last, path.dentry, lookup_flags);
3961         error = PTR_ERR(dentry);
3962         if (IS_ERR(dentry))
3963                 goto exit3;
             /* negative dentry: nothing to remove */
3964         if (!dentry->d_inode) {
3965                 error = -ENOENT;
3966                 goto exit4;
3967         }
3968         error = security_path_rmdir(&path, dentry);
3969         if (error)
3970                 goto exit4;
3971         mnt_userns = mnt_user_ns(path.mnt);
3972         error = vfs_rmdir(mnt_userns, path.dentry->d_inode, dentry);
             /* The labels below undo the steps above in reverse order. */
3973 exit4:
3974         dput(dentry);
3975 exit3:
3976         inode_unlock(path.dentry->d_inode);
3977         mnt_drop_write(path.mnt);
3978 exit2:
3979         path_put(&path);
             /* Everything but the name has been released: safe to start over. */
3980         if (retry_estale(error, lookup_flags)) {
3981                 lookup_flags |= LOOKUP_REVAL;
3982                 goto retry;
3983         }
3984 exit1:
3985         putname(name);
3986         return error;
3987 }
3988
/*
 * rmdir() system call entry: do_rmdir() relative to AT_FDCWD.  The result of
 * getname() is passed on unchecked; do_rmdir() calls putname() on it.
 */
3989 SYSCALL_DEFINE1(rmdir, const char __user *, pathname)
3990 {
3991         return do_rmdir(AT_FDCWD, getname(pathname));
3992 }
3993
3994 /**
3995  * vfs_unlink - unlink a filesystem object
3996  * @mnt_userns: user namespace of the mount the inode was found from
3997  * @dir:        parent directory
3998  * @dentry:     victim
3999  * @delegated_inode: returns victim inode, if the inode is delegated.
4000  *
4001  * The caller must hold dir->i_mutex.
4002  *
4003  * If vfs_unlink discovers a delegation, it will return -EWOULDBLOCK and
4004  * return a reference to the inode in delegated_inode.  The caller
4005  * should then break the delegation on that inode and retry.  Because
4006  * breaking a delegation may take a long time, the caller should drop
4007  * dir->i_mutex before doing so.
4008  *
4009  * Alternatively, a caller may pass NULL for delegated_inode.  This may
4010  * be appropriate for callers that expect the underlying filesystem not
4011  * to be NFS exported.
4012  *
4013  * If the inode has been found through an idmapped mount the user namespace of
4014  * the vfsmount must be passed through @mnt_userns. This function will then take
4015  * care to map the inode according to @mnt_userns before checking permissions.
4016  * On non-idmapped mounts or if permission checking is to be performed on the
4017  * raw inode simply passs init_user_ns.
4018  */
4019 int vfs_unlink(struct user_namespace *mnt_userns, struct inode *dir,
4020                struct dentry *dentry, struct inode **delegated_inode)
4021 {
4022         struct inode *target = dentry->d_inode;
4023         int error = may_delete(mnt_userns, dir, dentry, 0);
4024
4025         if (error)
4026                 return error;
4027
4028         if (!dir->i_op->unlink)
4029                 return -EPERM;
4030
4031         inode_lock(target);
4032         if (is_local_mountpoint(dentry))
4033                 error = -EBUSY;
4034         else {
4035                 error = security_inode_unlink(dir, dentry);
4036                 if (!error) {
4037                         error = try_break_deleg(target, delegated_inode);
4038                         if (error)
4039                                 goto out;
4040                         error = dir->i_op->unlink(dir, dentry);
4041                         if (!error) {
4042                                 dont_mount(dentry);
4043                                 detach_mounts(dentry);
4044                                 fsnotify_unlink(dir, dentry);
4045                         }
4046                 }
4047         }
4048 out:
4049         inode_unlock(target);
4050
4051         /* We don't d_delete() NFS sillyrenamed files--they still exist. */
4052         if (!error && !(dentry->d_flags & DCACHE_NFSFS_RENAMED)) {
4053                 fsnotify_link_count(target);
4054                 d_delete(dentry);
4055         }
4056
4057         return error;
4058 }
4059 EXPORT_SYMBOL(vfs_unlink);
4060
4061 /*
      * do_unlinkat() - remove a non-directory name, resolved from @dfd and @name.
      *
      * Every return path goes through putname(@name).  The return value is 0
      * or a negative errno taken from the first step that failed.
      *
4062  * Make sure that the actual truncation of the file will occur outside its
4063  * directory's i_mutex.  Truncate can take a long time if there is a lot of
4064  * writeout happening, and we don't want to prevent access to the directory
4065  * while waiting on the I/O.
4066  */
4067 long do_unlinkat(int dfd, struct filename *name)
4068 {
4069         int error;
4070         struct dentry *dentry;
4071         struct path path;
4072         struct qstr last;
4073         int type;
4074         struct inode *inode = NULL;
4075         struct inode *delegated_inode = NULL;
4076         unsigned int lookup_flags = 0;
4077 retry:
             /* Resolve the parent directory into @path and the last component into @last. */
4078         error = __filename_parentat(dfd, name, lookup_flags, &path, &last, &type);
4079         if (error)
4080                 goto exit1;
4081
             /* Anything other than an ordinary last component is refused with -EISDIR. */
4082         error = -EISDIR;
4083         if (type != LAST_NORM)
4084                 goto exit2;
4085
             /* Paired with mnt_drop_write() once the unlink attempt is finished. */
4086         error = mnt_want_write(path.mnt);
4087         if (error)
4088                 goto exit2;
4089 retry_deleg:
             /* Lock the parent directory, then look the victim up in it. */
4090         inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT);
4091         dentry = __lookup_hash(&last, path.dentry, lookup_flags);
4092         error = PTR_ERR(dentry);
4093         if (!IS_ERR(dentry)) {
4094                 struct user_namespace *mnt_userns;
4095
4096                 /* Why not before? Because we want correct error value */
4097                 if (last.name[last.len])
4098                         goto slashes;
4099                 inode = dentry->d_inode;
4100                 if (d_is_negative(dentry))
4101                         goto slashes;
                     /*
                      * Hold our own reference so that the iput() below, issued
                      * after the parent is unlocked, is where the inode can be
                      * truncated.
                      */
4102                 ihold(inode);
4103                 error = security_path_unlink(&path, dentry);
4104                 if (error)
4105                         goto exit3;
4106                 mnt_userns = mnt_user_ns(path.mnt);
4107                 error = vfs_unlink(mnt_userns, path.dentry->d_inode, dentry,
4108                                    &delegated_inode);
4109 exit3:
4110                 dput(dentry);
4111         }
4112         inode_unlock(path.dentry->d_inode);
4113         if (inode)
4114                 iput(inode);    /* truncate the inode here */
4115         inode = NULL;
             /*
              * vfs_unlink() reported a delegation: wait for it to be broken with
              * the parent unlocked, then redo the lookup and the unlink.
              */
4116         if (delegated_inode) {
4117                 error = break_deleg_wait(&delegated_inode);
4118                 if (!error)
4119                         goto retry_deleg;
4120         }
4121         mnt_drop_write(path.mnt);
4122 exit2:
4123         path_put(&path);
             /* retry_estale() decides whether to redo the whole walk with LOOKUP_REVAL. */
4124         if (retry_estale(error, lookup_flags)) {
4125                 lookup_flags |= LOOKUP_REVAL;
4126                 inode = NULL;
4127                 goto retry;
4128         }
4129 exit1:
4130         putname(name);
4131         return error;
4132
             /*
              * Entered from inside the lookup block above when the name has
              * trailing characters or the dentry is negative; choose the errno
              * and jump back to exit3 so the dentry is still released.
              */
4133 slashes:
4134         if (d_is_negative(dentry))
4135                 error = -ENOENT;
4136         else if (d_is_dir(dentry))
4137                 error = -EISDIR;
4138         else
4139                 error = -ENOTDIR;
4140         goto exit3;
4141 }
4142
4143 SYSCALL_DEFINE3(unlinkat, int, dfd, const char __user *, pathname, int, flag)
4144 {
4145         if ((flag & ~AT_REMOVEDIR) != 0)
4146                 return -EINVAL;
4147
4148         if (flag & AT_REMOVEDIR)
4149                 return do_rmdir(dfd, getname(pathname));
4150         return do_unlinkat(dfd, getname(pathname));
4151 }
4152
4153 SYSCALL_DEFINE1(unlink, const char __user *, pathname)
4154 {
4155         return do_unlinkat(AT_FDCWD, getname(pathname));
4156 }
4157
4158 /**
4159  * vfs_symlink - create symlink
4160  * @mnt_userns: user namespace of the mount the inode was found from
4161  * @dir:        inode of @dentry
4162  * @dentry:     pointer to dentry of the base directory
4163  * @oldname:    name of the file to link to
4164  *
4165  * Create a symlink.
4166  *
4167  * If the inode has been found through an idmapped mount the user namespace of
4168  * the vfsmount must be passed through @mnt_userns. This function will then take
4169  * care to map the inode according to @mnt_userns before checking permissions.
4170  * On non-idmapped mounts or if permission checking is to be performed on the
4171  * raw inode simply passs init_user_ns.
4172  */
4173 int vfs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
4174                 struct dentry *dentry, const char *oldname)
4175 {
4176         int error = may_create(mnt_userns, dir, dentry);
4177
4178         if (error)
4179                 return error;
4180
4181         if (!dir->i_op->symlink)
4182                 return -EPERM;
4183
4184         error = security_inode_symlink(dir, dentry, oldname);
4185         if (error)
4186                 return error;
4187
4188         error = dir->i_op->symlink(mnt_userns, dir, dentry, oldname);
4189         if (!error)
4190                 fsnotify_create(dir, dentry);
4191         return error;
4192 }
4193 EXPORT_SYMBOL(vfs_symlink);
4194
4195 static long do_symlinkat(const char __user *oldname, int newdfd,
4196                   const char __user *newname)
4197 {
4198         int error;
4199         struct filename *from;
4200         struct dentry *dentry;
4201         struct path path;
4202         unsigned int lookup_flags = 0;
4203
4204         from = getname(oldname);
4205         if (IS_ERR(from))
4206                 return PTR_ERR(from);
4207 retry:
4208         dentry = user_path_create(newdfd, newname, &path, lookup_flags);
4209         error = PTR_ERR(dentry);
4210         if (IS_ERR(dentry))
4211                 goto out_putname;
4212
4213         error = security_path_symlink(&path, dentry, from->name);
4214         if (!error) {
4215                 struct user_namespace *mnt_userns;
4216
4217                 mnt_userns = mnt_user_ns(path.mnt);
4218                 error = vfs_symlink(mnt_userns, path.dentry->d_inode, dentry,
4219                                     from->name);
4220         }
4221         done_path_create(&path, dentry);
4222         if (retry_estale(error, lookup_flags)) {
4223                 lookup_flags |= LOOKUP_REVAL;
4224                 goto retry;
4225         }
4226 out_putname:
4227         putname(from);
4228         return error;
4229 }
4230
4231 SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
4232                 int, newdfd, const char __user *, newname)
4233 {
4234         return do_symlinkat(oldname, newdfd, newname);
4235 }
4236
4237 SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newname)
4238 {
4239         return do_symlinkat(oldname, AT_FDCWD, newname);
4240 }
4241
4242 /**
4243  * vfs_link - create a new link
4244  * @old_dentry: object to be linked
4245  * @mnt_userns: the user namespace of the mount
4246  * @dir:        new parent
4247  * @new_dentry: where to create the new link
4248  * @delegated_inode: returns inode needing a delegation break
4249  *
4250  * The caller must hold dir->i_mutex
4251  *
4252  * If vfs_link discovers a delegation on the to-be-linked file in need
4253  * of breaking, it will return -EWOULDBLOCK and return a reference to the
4254  * inode in delegated_inode.  The caller should then break the delegation
4255  * and retry.  Because breaking a delegation may take a long time, the
4256  * caller should drop the i_mutex before doing so.
4257  *
4258  * Alternatively, a caller may pass NULL for delegated_inode.  This may
4259  * be appropriate for callers that expect the underlying filesystem not
4260  * to be NFS exported.
4261  *
4262  * If the inode has been found through an idmapped mount the user namespace of
4263  * the vfsmount must be passed through @mnt_userns. This function will then take
4264  * care to map the inode according to @mnt_userns before checking permissions.
4265  * On non-idmapped mounts or if permission checking is to be performed on the
4266  * raw inode simply passs init_user_ns.
4267  */
4268 int vfs_link(struct dentry *old_dentry, struct user_namespace *mnt_userns,
4269              struct inode *dir, struct dentry *new_dentry,
4270              struct inode **delegated_inode)
4271 {
4272         struct inode *inode = old_dentry->d_inode;
4273         unsigned max_links = dir->i_sb->s_max_links;
4274         int error;
4275
4276         if (!inode)
4277                 return -ENOENT;
4278
4279         error = may_create(mnt_userns, dir, new_dentry);
4280         if (error)
4281                 return error;
4282
4283         if (dir->i_sb != inode->i_sb)
4284                 return -EXDEV;
4285
4286         /*
4287          * A link to an append-only or immutable file cannot be created.
4288          */
4289         if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
4290                 return -EPERM;
4291         /*
4292          * Updating the link count will likely cause i_uid and i_gid to
4293          * be writen back improperly if their true value is unknown to
4294          * the vfs.
4295          */
4296         if (HAS_UNMAPPED_ID(mnt_userns, inode))
4297                 return -EPERM;
4298         if (!dir->i_op->link)
4299                 return -EPERM;
4300         if (S_ISDIR(inode->i_mode))
4301                 return -EPERM;
4302
4303         error = security_inode_link(old_dentry, dir, new_dentry);
4304         if (error)
4305                 return error;
4306
4307         inode_lock(inode);
4308         /* Make sure we don't allow creating hardlink to an unlinked file */
4309         if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE))
4310                 error =  -ENOENT;
4311         else if (max_links && inode->i_nlink >= max_links)
4312                 error = -EMLINK;
4313         else {
4314                 error = try_break_deleg(inode, delegated_inode);
4315                 if (!error)
4316                         error = dir->i_op->link(old_dentry, dir, new_dentry);
4317         }
4318
4319         if (!error && (inode->i_state & I_LINKABLE)) {
4320                 spin_lock(&inode->i_lock);
4321                 inode->i_state &= ~I_LINKABLE;
4322                 spin_unlock(&inode->i_lock);
4323         }
4324         inode_unlock(inode);
4325         if (!error)
4326                 fsnotify_link(dir, inode, new_dentry);
4327         return error;
4328 }
4329 EXPORT_SYMBOL(vfs_link);
4330
4331 /*
      * do_linkat() - common implementation of link(2) and linkat(2).
      *
      * Creates @newname (relative to @newdfd) as a hard link to @oldname
      * (relative to @olddfd).  Only AT_SYMLINK_FOLLOW and AT_EMPTY_PATH are
      * accepted in @flags.  Returns 0 or a negative errno.
      *
4332  * Hardlinks are often used in delicate situations.  We avoid
4333  * security-related surprises by not following symlinks on the
4334  * newname.  --KAB
4335  *
4336  * We don't follow them on the oldname either to be compatible
4337  * with linux 2.0, and to avoid hard-linking to directories
4338  * and other special files.  --ADM
4339  */
4340 static int do_linkat(int olddfd, const char __user *oldname, int newdfd,
4341               const char __user *newname, int flags)
4342 {
4343         struct user_namespace *mnt_userns;
4344         struct dentry *new_dentry;
4345         struct path old_path, new_path;
4346         struct inode *delegated_inode = NULL;
4347         int how = 0;
4348         int error;
4349
4350         if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0)
4351                 return -EINVAL;
4352         /*
4353          * To use null names we require CAP_DAC_READ_SEARCH
4354          * This ensures that not everyone will be able to create
4355          * hardlink using the passed filedescriptor.
4356          */
4357         if (flags & AT_EMPTY_PATH) {
4358                 if (!capable(CAP_DAC_READ_SEARCH))
4359                         return -ENOENT;
4360                 how = LOOKUP_EMPTY;
4361         }
4362
4363         if (flags & AT_SYMLINK_FOLLOW)
4364                 how |= LOOKUP_FOLLOW;
4365 retry:
             /* Resolve the existing object; old_path is dropped on every way out. */
4366         error = user_path_at(olddfd, oldname, how, &old_path);
4367         if (error)
4368                 return error;
4369
             /* Only the LOOKUP_REVAL bit of @how is forwarded to the create lookup. */
4370         new_dentry = user_path_create(newdfd, newname, &new_path,
4371                                         (how & LOOKUP_REVAL));
4372         error = PTR_ERR(new_dentry);
4373         if (IS_ERR(new_dentry))
4374                 goto out;
4375
             /* Source and destination must be on the same mount. */
4376         error = -EXDEV;
4377         if (old_path.mnt != new_path.mnt)
4378                 goto out_dput;
4379         mnt_userns = mnt_user_ns(new_path.mnt);
4380         error = may_linkat(mnt_userns, &old_path);
4381         if (unlikely(error))
4382                 goto out_dput;
4383         error = security_path_link(old_path.dentry, &new_path, new_dentry);
4384         if (error)
4385                 goto out_dput;
4386         error = vfs_link(old_path.dentry, mnt_userns, new_path.dentry->d_inode,
4387                          new_dentry, &delegated_inode);
4388 out_dput:
             /* Finish the user_path_create() started above before any retry. */
4389         done_path_create(&new_path, new_dentry);
             /*
              * vfs_link() reported a delegation: wait for it to be broken, then
              * drop old_path and start over from the first lookup.
              */
4390         if (delegated_inode) {
4391                 error = break_deleg_wait(&delegated_inode);
4392                 if (!error) {
4393                         path_put(&old_path);
4394                         goto retry;
4395                 }
4396         }
             /* retry_estale() decides whether to redo both lookups with LOOKUP_REVAL. */
4397         if (retry_estale(error, how)) {
4398                 path_put(&old_path);
4399                 how |= LOOKUP_REVAL;
4400                 goto retry;
4401         }
4402 out:
4403         path_put(&old_path);
4404
4405         return error;
4406 }
4407
4408 SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
4409                 int, newdfd, const char __user *, newname, int, flags)
4410 {
4411         return do_linkat(olddfd, oldname, newdfd, newname, flags);
4412 }
4413
4414 SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname)
4415 {
4416         return do_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0);
4417 }
4418
4419 /**
4420  * vfs_rename - rename a filesystem object
4421  * @rd:         pointer to &struct renamedata info
4422  *
4423  * The caller must hold multiple mutexes--see lock_rename().
4424  *
4425  * If vfs_rename discovers a delegation in need of breaking at either
4426  * the source or destination, it will return -EWOULDBLOCK and return a
4427  * reference to the inode in delegated_inode.  The caller should then
4428  * break the delegation and retry.  Because breaking a delegation may
4429  * take a long time, the caller should drop all locks before doing
4430  * so.
4431  *
4432  * Alternatively, a caller may pass NULL for delegated_inode.  This may
4433  * be appropriate for callers that expect the underlying filesystem not
4434  * to be NFS exported.
4435  *
4436  * The worst of all namespace operations - renaming directory. "Perverted"
4437  * doesn't even start to describe it. Somebody in UCB had a heck of a trip...
4438  * Problems:
4439  *
4440  *      a) we can get into loop creation.
4441  *      b) race potential - two innocent renames can create a loop together.
4442  *         That's where 4.4 screws up. Current fix: serialization on
4443  *         sb->s_vfs_rename_mutex. We might be more accurate, but that's another
4444  *         story.
4445  *      c) we have to lock _four_ objects - parents and victim (if it exists),
4446  *         and source (if it is not a directory).
4447  *         And that - after we got ->i_mutex on parents (until then we don't know
4448  *         whether the target exists).  Solution: try to be smart with locking
4449  *         order for inodes.  We rely on the fact that tree topology may change
4450  *         only under ->s_vfs_rename_mutex _and_ that parent of the object we
4451  *         move will be locked.  Thus we can rank directories by the tree
4452  *         (ancestors first) and rank all non-directories after them.
4453  *         That works since everybody except rename does "lock parent, lookup,
4454  *         lock child" and rename is under ->s_vfs_rename_mutex.
4455  *         HOWEVER, it relies on the assumption that any object with ->lookup()
4456  *         has no more than 1 dentry.  If "hybrid" objects will ever appear,
4457  *         we'd better make sure that there's no link(2) for them.
4458  *      d) conversion from fhandle to dentry may come in the wrong moment - when
4459  *         we are removing the target. Solution: we will have to grab ->i_mutex
4460  *         in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on
4461  *         ->i_mutex on parents, which works but leads to some truly excessive
4462  *         locking].
4463  */
4464 int vfs_rename(struct renamedata *rd)
4465 {
4466         int error;
4467         struct inode *old_dir = rd->old_dir, *new_dir = rd->new_dir;
4468         struct dentry *old_dentry = rd->old_dentry;
4469         struct dentry *new_dentry = rd->new_dentry;
4470         struct inode **delegated_inode = rd->delegated_inode;
4471         unsigned int flags = rd->flags;
4472         bool is_dir = d_is_dir(old_dentry);
4473         struct inode *source = old_dentry->d_inode;
4474         struct inode *target = new_dentry->d_inode;
4475         bool new_is_dir = false;
4476         unsigned max_links = new_dir->i_sb->s_max_links;
4477         struct name_snapshot old_name;
4478
             /* Both dentries already refer to the same inode: nothing to do. */
4479         if (source == target)
4480                 return 0;
4481
4482         error = may_delete(rd->old_mnt_userns, old_dir, old_dentry, is_dir);
4483         if (error)
4484                 return error;
4485
             /*
              * No target: the new name is checked with may_create().  With a
              * target it is checked with may_delete(), using the source's type
              * for a plain rename and the target's own type for an exchange.
              */
4486         if (!target) {
4487                 error = may_create(rd->new_mnt_userns, new_dir, new_dentry);
4488         } else {
4489                 new_is_dir = d_is_dir(new_dentry);
4490
4491                 if (!(flags & RENAME_EXCHANGE))
4492                         error = may_delete(rd->new_mnt_userns, new_dir,
4493                                            new_dentry, is_dir);
4494                 else
4495                         error = may_delete(rd->new_mnt_userns, new_dir,
4496                                            new_dentry, new_is_dir);
4497         }
4498         if (error)
4499                 return error;
4500
4501         if (!old_dir->i_op->rename)
4502                 return -EPERM;
4503
4504         /*
4505          * If we are going to change the parent - check write permissions,
4506          * we'll need to flip '..'.
4507          */
4508         if (new_dir != old_dir) {
4509                 if (is_dir) {
4510                         error = inode_permission(rd->old_mnt_userns, source,
4511                                                  MAY_WRITE);
4512                         if (error)
4513                                 return error;
4514                 }
4515                 if ((flags & RENAME_EXCHANGE) && new_is_dir) {
4516                         error = inode_permission(rd->new_mnt_userns, target,
4517                                                  MAY_WRITE);
4518                         if (error)
4519                                 return error;
4520                 }
4521         }
4522
4523         error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry,
4524                                       flags);
4525         if (error)
4526                 return error;
4527
             /*
              * From here on every exit goes through "out": the old name is
              * snapshotted for fsnotify_move() and new_dentry is pinned until the
              * dput() after the unlock.
              */
4528         take_dentry_name_snapshot(&old_name, old_dentry);
4529         dget(new_dentry);
             /*
              * Non-directory source, or any exchange: lock source and target
              * together.  Directory source without exchange: only an existing
              * target is locked here.
              */
4530         if (!is_dir || (flags & RENAME_EXCHANGE))
4531                 lock_two_nondirectories(source, target);
4532         else if (target)
4533                 inode_lock(target);
4534
4535         error = -EBUSY;
4536         if (is_local_mountpoint(old_dentry) || is_local_mountpoint(new_dentry))
4537                 goto out;
4538
             /*
              * Link-count limit, checked only when the parent changes and the
              * superblock sets one: a directory arriving in a parent that loses
              * no directory in return must not find that parent already at
              * max_links.
              */
4539         if (max_links && new_dir != old_dir) {
4540                 error = -EMLINK;
4541                 if (is_dir && !new_is_dir && new_dir->i_nlink >= max_links)
4542                         goto out;
4543                 if ((flags & RENAME_EXCHANGE) && !is_dir && new_is_dir &&
4544                     old_dir->i_nlink >= max_links)
4545                         goto out;
4546         }
             /* Delegations are broken only on non-directory source and target. */
4547         if (!is_dir) {
4548                 error = try_break_deleg(source, delegated_inode);
4549                 if (error)
4550                         goto out;
4551         }
4552         if (target && !new_is_dir) {
4553                 error = try_break_deleg(target, delegated_inode);
4554                 if (error)
4555                         goto out;
4556         }
4557         error = old_dir->i_op->rename(rd->new_mnt_userns, old_dir, old_dentry,
4558                                       new_dir, new_dentry, flags);
4559         if (error)
4560                 goto out;
4561
             /*
              * A plain rename over an existing target replaced it: for a
              * directory prune its dcache children and mark it S_DEAD, and in
              * every case drop anything mounted on the old target dentry.
              */
4562         if (!(flags & RENAME_EXCHANGE) && target) {
4563                 if (is_dir) {
4564                         shrink_dcache_parent(new_dentry);
4565                         target->i_flags |= S_DEAD;
4566                 }
4567                 dont_mount(new_dentry);
4568                 detach_mounts(new_dentry);
4569         }
             /* Update the dcache here unless the filesystem type says it does so itself. */
4570         if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) {
4571                 if (!(flags & RENAME_EXCHANGE))
4572                         d_move(old_dentry, new_dentry);
4573                 else
4574                         d_exchange(old_dentry, new_dentry);
4575         }
4576 out:
             /* Mirror of the locking decision taken before the checks above. */
4577         if (!is_dir || (flags & RENAME_EXCHANGE))
4578                 unlock_two_nondirectories(source, target);
4579         else if (target)
4580                 inode_unlock(target);
4581         dput(new_dentry);
             /*
              * Notifications go out after the locks are dropped; an exchange
              * reports a second move for the object that went the other way.
              */
4582         if (!error) {
4583                 fsnotify_move(old_dir, new_dir, &old_name.name, is_dir,
4584                               !(flags & RENAME_EXCHANGE) ? target : NULL, old_dentry);
4585                 if (flags & RENAME_EXCHANGE) {
4586                         fsnotify_move(new_dir, old_dir, &old_dentry->d_name,
4587                                       new_is_dir, NULL, new_dentry);
4588                 }
4589         }
4590         release_dentry_name_snapshot(&old_name);
4591
4592         return error;
4593 }
4594 EXPORT_SYMBOL(vfs_rename);
4595
/*
 * do_renameat2() - common implementation of rename(2), renameat(2) and
 * renameat2(2).
 *
 * Renames @from (relative to @olddfd) to @to (relative to @newdfd).  @flags
 * may contain RENAME_NOREPLACE, RENAME_EXCHANGE and RENAME_WHITEOUT;
 * RENAME_EXCHANGE cannot be combined with either of the other two.  Both
 * names are released with putname() on every return path.  Returns 0 or a
 * negative errno.
 */
4596 int do_renameat2(int olddfd, struct filename *from, int newdfd,
4597                  struct filename *to, unsigned int flags)
4598 {
4599         struct renamedata rd;
4600         struct dentry *old_dentry, *new_dentry;
4601         struct dentry *trap;
4602         struct path old_path, new_path;
4603         struct qstr old_last, new_last;
4604         int old_type, new_type;
4605         struct inode *delegated_inode = NULL;
4606         unsigned int lookup_flags = 0, target_flags = LOOKUP_RENAME_TARGET;
4607         bool should_retry = false;
4608         int error = -EINVAL;
4609
             /* Unknown flags and forbidden combinations leave with -EINVAL. */
4610         if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
4611                 goto put_names;
4612
4613         if ((flags & (RENAME_NOREPLACE | RENAME_WHITEOUT)) &&
4614             (flags & RENAME_EXCHANGE))
4615                 goto put_names;
4616
             /* An exchange looks the new name up without LOOKUP_RENAME_TARGET. */
4617         if (flags & RENAME_EXCHANGE)
4618                 target_flags = 0;
4619
4620 retry:
             /* Resolve both parent directories and both last components. */
4621         error = __filename_parentat(olddfd, from, lookup_flags, &old_path,
4622                                         &old_last, &old_type);
4623         if (error)
4624                 goto put_names;
4625
4626         error = __filename_parentat(newdfd, to, lookup_flags, &new_path, &new_last,
4627                                 &new_type);
4628         if (error)
4629                 goto exit1;
4630
4631         error = -EXDEV;
4632         if (old_path.mnt != new_path.mnt)
4633                 goto exit2;
4634
4635         error = -EBUSY;
4636         if (old_type != LAST_NORM)
4637                 goto exit2;
4638
             /* A non-ordinary new name yields -EEXIST under RENAME_NOREPLACE, else -EBUSY. */
4639         if (flags & RENAME_NOREPLACE)
4640                 error = -EEXIST;
4641         if (new_type != LAST_NORM)
4642                 goto exit2;
4643
             /* Paired with mnt_drop_write() after the rename attempt. */
4644         error = mnt_want_write(old_path.mnt);
4645         if (error)
4646                 goto exit2;
4647
4648 retry_deleg:
             /*
              * Lock both parents.  The returned trap dentry is compared against
              * the source and the target further down.
              */
4649         trap = lock_rename(new_path.dentry, old_path.dentry);
4650
4651         old_dentry = __lookup_hash(&old_last, old_path.dentry, lookup_flags);
4652         error = PTR_ERR(old_dentry);
4653         if (IS_ERR(old_dentry))
4654                 goto exit3;
4655         /* source must exist */
4656         error = -ENOENT;
4657         if (d_is_negative(old_dentry))
4658                 goto exit4;
4659         new_dentry = __lookup_hash(&new_last, new_path.dentry, lookup_flags | target_flags);
4660         error = PTR_ERR(new_dentry);
4661         if (IS_ERR(new_dentry))
4662                 goto exit4;
4663         error = -EEXIST;
4664         if ((flags & RENAME_NOREPLACE) && d_is_positive(new_dentry))
4665                 goto exit5;
             /*
              * An exchange needs an existing target, and a non-directory target
              * must not be named with trailing characters.
              */
4666         if (flags & RENAME_EXCHANGE) {
4667                 error = -ENOENT;
4668                 if (d_is_negative(new_dentry))
4669                         goto exit5;
4670
4671                 if (!d_is_dir(new_dentry)) {
4672                         error = -ENOTDIR;
4673                         if (new_last.name[new_last.len])
4674                                 goto exit5;
4675                 }
4676         }
4677         /* unless the source is a directory trailing slashes give -ENOTDIR */
4678         if (!d_is_dir(old_dentry)) {
4679                 error = -ENOTDIR;
4680                 if (old_last.name[old_last.len])
4681                         goto exit5;
4682                 if (!(flags & RENAME_EXCHANGE) && new_last.name[new_last.len])
4683                         goto exit5;
4684         }
4685         /* source should not be ancestor of target */
4686         error = -EINVAL;
4687         if (old_dentry == trap)
4688                 goto exit5;
4689         /* target should not be an ancestor of source */
4690         if (!(flags & RENAME_EXCHANGE))
4691                 error = -ENOTEMPTY;
4692         if (new_dentry == trap)
4693                 goto exit5;
4694
4695         error = security_path_rename(&old_path, old_dentry,
4696                                      &new_path, new_dentry, flags);
4697         if (error)
4698                 goto exit5;
4699
             /* Bundle everything vfs_rename() needs into the renamedata. */
4700         rd.old_dir         = old_path.dentry->d_inode;
4701         rd.old_dentry      = old_dentry;
4702         rd.old_mnt_userns  = mnt_user_ns(old_path.mnt);
4703         rd.new_dir         = new_path.dentry->d_inode;
4704         rd.new_dentry      = new_dentry;
4705         rd.new_mnt_userns  = mnt_user_ns(new_path.mnt);
4706         rd.delegated_inode = &delegated_inode;
4707         rd.flags           = flags;
4708         error = vfs_rename(&rd);
             /* Unwind in reverse order of acquisition. */
4709 exit5:
4710         dput(new_dentry);
4711 exit4:
4712         dput(old_dentry);
4713 exit3:
4714         unlock_rename(new_path.dentry, old_path.dentry);
             /*
              * vfs_rename() reported a delegation: wait for it to be broken with
              * the parents unlocked, then redo the locking and both lookups.
              */
4715         if (delegated_inode) {
4716                 error = break_deleg_wait(&delegated_inode);
4717                 if (!error)
4718                         goto retry_deleg;
4719         }
4720         mnt_drop_write(old_path.mnt);
4721 exit2:
             /*
              * The retry decision is only recorded here; the jump happens after
              * both paths have been put.
              */
4722         if (retry_estale(error, lookup_flags))
4723                 should_retry = true;
4724         path_put(&new_path);
4725 exit1:
4726         path_put(&old_path);
4727         if (should_retry) {
4728                 should_retry = false;
4729                 lookup_flags |= LOOKUP_REVAL;
4730                 goto retry;
4731         }
4732 put_names:
4733         putname(from);
4734         putname(to);
4735         return error;
4736 }
4737
4738 SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname,
4739                 int, newdfd, const char __user *, newname, unsigned int, flags)
4740 {
4741         return do_renameat2(olddfd, getname(oldname), newdfd, getname(newname),
4742                                 flags);
4743 }
4744
4745 SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
4746                 int, newdfd, const char __user *, newname)
4747 {
4748         return do_renameat2(olddfd, getname(oldname), newdfd, getname(newname),
4749                                 0);
4750 }
4751
4752 SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname)
4753 {
4754         return do_renameat2(AT_FDCWD, getname(oldname), AT_FDCWD,
4755                                 getname(newname), 0);
4756 }
4757
4758 int readlink_copy(char __user *buffer, int buflen, const char *link)
4759 {
4760         int len = PTR_ERR(link);
4761         if (IS_ERR(link))
4762                 goto out;
4763
4764         len = strlen(link);
4765         if (len > (unsigned) buflen)
4766                 len = buflen;
4767         if (copy_to_user(buffer, link, len))
4768                 len = -EFAULT;
4769 out:
4770         return len;
4771 }
4772
4773 /**
4774  * vfs_readlink - copy symlink body into userspace buffer
4775  * @dentry: dentry on which to get symbolic link
4776  * @buffer: user memory pointer
4777  * @buflen: size of buffer
4778  *
4779  * Does not touch atime.  That's up to the caller if necessary
4780  *
4781  * Does not call security hook.
4782  */
4783 int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen)
4784 {
4785         struct inode *inode = d_inode(dentry);
4786         DEFINE_DELAYED_CALL(done);
4787         const char *link;
4788         int res;
4789
4790         if (unlikely(!(inode->i_opflags & IOP_DEFAULT_READLINK))) {
4791                 if (unlikely(inode->i_op->readlink))
4792                         return inode->i_op->readlink(dentry, buffer, buflen);
4793
4794                 if (!d_is_symlink(dentry))
4795                         return -EINVAL;
4796
4797                 spin_lock(&inode->i_lock);
4798                 inode->i_opflags |= IOP_DEFAULT_READLINK;
4799                 spin_unlock(&inode->i_lock);
4800         }
4801
4802         link = READ_ONCE(inode->i_link);
4803         if (!link) {
4804                 link = inode->i_op->get_link(dentry, inode, &done);
4805                 if (IS_ERR(link))
4806                         return PTR_ERR(link);
4807         }
4808         res = readlink_copy(buffer, buflen, link);
4809         do_delayed_call(&done);
4810         return res;
4811 }
4812 EXPORT_SYMBOL(vfs_readlink);
4813
4814 /**
4815  * vfs_get_link - get symlink body
4816  * @dentry: dentry on which to get symbolic link
4817  * @done: caller needs to free returned data with this
4818  *
4819  * Calls security hook and i_op->get_link() on the supplied inode.
4820  *
4821  * It does not touch atime.  That's up to the caller if necessary.
4822  *
4823  * Does not work on "special" symlinks like /proc/$$/fd/N
4824  */
4825 const char *vfs_get_link(struct dentry *dentry, struct delayed_call *done)
4826 {
4827         const char *res = ERR_PTR(-EINVAL);
4828         struct inode *inode = d_inode(dentry);
4829
4830         if (d_is_symlink(dentry)) {
4831                 res = ERR_PTR(security_inode_readlink(dentry));
4832                 if (!res)
4833                         res = inode->i_op->get_link(dentry, inode, done);
4834         }
4835         return res;
4836 }
4837 EXPORT_SYMBOL(vfs_get_link);
4838
4839 /* get the link contents into pagecache */
4840 const char *page_get_link(struct dentry *dentry, struct inode *inode,
4841                           struct delayed_call *callback)
4842 {
4843         char *kaddr;
4844         struct page *page;
4845         struct address_space *mapping = inode->i_mapping;
4846
4847         if (!dentry) {
4848                 page = find_get_page(mapping, 0);
4849                 if (!page)
4850                         return ERR_PTR(-ECHILD);
4851                 if (!PageUptodate(page)) {
4852                         put_page(page);
4853                         return ERR_PTR(-ECHILD);
4854                 }
4855         } else {
4856                 page = read_mapping_page(mapping, 0, NULL);
4857                 if (IS_ERR(page))
4858                         return (char*)page;
4859         }
4860         set_delayed_call(callback, page_put_link, page);
4861         BUG_ON(mapping_gfp_mask(mapping) & __GFP_HIGHMEM);
4862         kaddr = page_address(page);
4863         nd_terminate_link(kaddr, inode->i_size, PAGE_SIZE - 1);
4864         return kaddr;
4865 }
4866
4867 EXPORT_SYMBOL(page_get_link);
4868
/*
 * Delayed-call destructor installed by page_get_link(): @arg is the page
 * whose reference is dropped here.
 */
void page_put_link(void *arg)
{
	struct page *page = arg;

	put_page(page);
}
EXPORT_SYMBOL(page_put_link);
4874
4875 int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
4876 {
4877         DEFINE_DELAYED_CALL(done);
4878         int res = readlink_copy(buffer, buflen,
4879                                 page_get_link(dentry, d_inode(dentry),
4880                                               &done));
4881         do_delayed_call(&done);
4882         return res;
4883 }
4884 EXPORT_SYMBOL(page_readlink);
4885
4886 /*
4887  * The nofs argument instructs pagecache_write_begin to pass AOP_FLAG_NOFS
4888  */
4889 int __page_symlink(struct inode *inode, const char *symname, int len, int nofs)
4890 {
4891         struct address_space *mapping = inode->i_mapping;
4892         struct page *page;
4893         void *fsdata;
4894         int err;
4895         unsigned int flags = 0;
4896         if (nofs)
4897                 flags |= AOP_FLAG_NOFS;
4898
4899 retry:
4900         err = pagecache_write_begin(NULL, mapping, 0, len-1,
4901                                 flags, &page, &fsdata);
4902         if (err)
4903                 goto fail;
4904
4905         memcpy(page_address(page), symname, len-1);
4906
4907         err = pagecache_write_end(NULL, mapping, 0, len-1, len-1,
4908                                                         page, fsdata);
4909         if (err < 0)
4910                 goto fail;
4911         if (err < len-1)
4912                 goto retry;
4913
4914         mark_inode_dirty(inode);
4915         return 0;
4916 fail:
4917         return err;
4918 }
4919 EXPORT_SYMBOL(__page_symlink);
4920
4921 int page_symlink(struct inode *inode, const char *symname, int len)
4922 {
4923         return __page_symlink(inode, symname, len,
4924                         !mapping_gfp_constraint(inode->i_mapping, __GFP_FS));
4925 }
4926 EXPORT_SYMBOL(page_symlink);
4927
4928 const struct inode_operations page_symlink_inode_operations = {
4929         .get_link       = page_get_link,
4930 };
4931 EXPORT_SYMBOL(page_symlink_inode_operations);