fs/namei.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  *  linux/fs/namei.c
   4  *
   5  *  Copyright (C) 1991, 1992  Linus Torvalds
   6  */
   7
   8 /*
   9  * Some corrections by tytso.
  10  */
  11
  12 /* [Feb 1997 T. Schoebel-Theuer] Complete rewrite of the pathname
  13  * lookup logic.
  14  */
  15 /* [Feb-Apr 2000, AV] Rewrite to the new namespace architecture.
  16  */
  17
  18 #include <linux/init.h>
  19 #include <linux/export.h>
  20 #include <linux/kernel.h>
  21 #include <linux/slab.h>
  22 #include <linux/fs.h>
  23 #include <linux/namei.h>
  24 #include <linux/pagemap.h>
  25 #include <linux/fsnotify.h>
  26 #include <linux/personality.h>
  27 #include <linux/security.h>
  28 #include <linux/ima.h>
  29 #include <linux/syscalls.h>
  30 #include <linux/mount.h>
  31 #include <linux/audit.h>
  32 #include <linux/capability.h>
  33 #include <linux/file.h>
  34 #include <linux/fcntl.h>
  35 #include <linux/device_cgroup.h>
  36 #include <linux/fs_struct.h>
  37 #include <linux/posix_acl.h>
  38 #include <linux/hash.h>
  39 #include <linux/bitops.h>
  40 #include <linux/init_task.h>
  41 #include <linux/uaccess.h>
  42
  43 #include "internal.h"
  44 #include "mount.h"
  45
  46 /* [Feb-1997 T. Schoebel-Theuer]
  47  * Fundamental changes in the pathname lookup mechanisms (namei)
  48  * were necessary because of omirr.  The reason is that omirr needs
  49  * to know the _real_ pathname, not the user-supplied one, in case
  50  * of symlinks (and also when transname replacements occur).
  51  *
  52  * The new code replaces the old recursive symlink resolution with
  53  * an iterative one (in case of non-nested symlink chains).  It does
  54  * this with calls to <fs>_follow_link().
  55  * As a side effect, dir_namei(), _namei() and follow_link() are now
  56  * replaced with a single function lookup_dentry() that can handle all
  57  * the special cases of the former code.
  58  *
  59  * With the new dcache, the pathname is stored at each inode, at least as
  60  * long as the refcount of the inode is positive.  As a side effect, the
  61  * size of the dcache depends on the inode cache and thus is dynamic.
  62  *
  63  * [29-Apr-1998 C. Scott Ananian] Updated above description of symlink
  64  * resolution to correspond with current state of the code.
  65  *
  66  * Note that the symlink resolution is not *completely* iterative.
  67  * There is still a significant amount of tail- and mid- recursion in
  68  * the algorithm.  Also, note that <fs>_readlink() is not used in
  69  * lookup_dentry(): lookup_dentry() on the result of <fs>_readlink()
  70  * may return different results than <fs>_follow_link().  Many virtual
  71  * filesystems (including /proc) exhibit this behavior.
  72  */
  73
  74 /* [24-Feb-97 T. Schoebel-Theuer] Side effects caused by new implementation:
  75  * New symlink semantics: when open() is called with flags O_CREAT | O_EXCL
  76  * and the name already exists in form of a symlink, try to create the new
  77  * name indicated by the symlink. The old code always complained that the
  78  * name already exists, due to not following the symlink even if its target
  79  * is nonexistent.  The new semantics affects also mknod() and link() when
  80  * the name is a symlink pointing to a non-existent name.
  81  *
  82  * I don't know which semantics is the right one, since I have no access
  83  * to standards. But I found by trial that HP-UX 9.0 has the full "new"
  84  * semantics implemented, while SunOS 4.1.1 and Solaris (SunOS 5.4) have the
  85  * "old" one. Personally, I think the new semantics is much more logical.
 * Note that "ln old new" where "new" is a symlink pointing to a non-existing
 * file does succeed in both HP-UX and SunOS, but not in Solaris
 * and in the old Linux semantics.
  89  */
  90
  91 /* [16-Dec-97 Kevin Buhr] For security reasons, we change some symlink
  92  * semantics.  See the comments in "open_namei" and "do_link" below.
  93  *
  94  * [10-Sep-98 Alan Modra] Another symlink change.
  95  */
  96
  97 /* [Feb-Apr 2000 AV] Complete rewrite. Rules for symlinks:
  98  *      inside the path - always follow.
  99  *      in the last component in creation/removal/renaming - never follow.
 100  *      if LOOKUP_FOLLOW passed - follow.
 101  *      if the pathname has trailing slashes - follow.
 102  *      otherwise - don't follow.
 103  * (applied in that order).
 104  *
 105  * [Jun 2000 AV] Inconsistent behaviour of open() in case if flags==O_CREAT
 106  * restored for 2.4. This is the last surviving part of old 4.2BSD bug.
 107  * During the 2.4 we need to fix the userland stuff depending on it -
 108  * hopefully we will be able to get rid of that wart in 2.5. So far only
 109  * XEmacs seems to be relying on it...
 110  */
 111 /*
 112  * [Sep 2001 AV] Single-semaphore locking scheme (kudos to David Holland)
 113  * implemented.  Let's see if raised priority of ->s_vfs_rename_mutex gives
 114  * any extra contention...
 115  */
 116
/* In order to reduce some races, while at the same time doing additional
 * checking and hopefully speeding things up, we copy filenames to the
 * kernel data space before using them.
 *
 * POSIX.1 2.4: an empty pathname is invalid (ENOENT).
 * PATH_MAX includes the nul terminator --RR.
 */

/*
 * Number of pathname bytes that fit behind the struct filename header when
 * the header and the name share one buffer.  getname_flags() and
 * getname_kernel() use it to decide whether the name can live in ->iname.
 * NOTE(review): this presumes the __getname() buffer is PATH_MAX bytes --
 * confirm against the names_cache definition.
 */
#define EMBEDDED_NAME_MAX       (PATH_MAX - offsetof(struct filename, iname))
 126
 127 struct filename *
 128 getname_flags(const char __user *filename, int flags, int *empty)
 129 {
 130         struct filename *result;
 131         char *kname;
 132         int len;
 133
 134         result = audit_reusename(filename);
 135         if (result)
 136                 return result;
 137
 138         result = __getname();
 139         if (unlikely(!result))
 140                 return ERR_PTR(-ENOMEM);
 141
 142         /*
 143          * First, try to embed the struct filename inside the names_cache
 144          * allocation
 145          */
 146         kname = (char *)result->iname;
 147         result->name = kname;
 148
 149         len = strncpy_from_user(kname, filename, EMBEDDED_NAME_MAX);
 150         if (unlikely(len < 0)) {
 151                 __putname(result);
 152                 return ERR_PTR(len);
 153         }
 154
 155         /*
 156          * Uh-oh. We have a name that's approaching PATH_MAX. Allocate a
 157          * separate struct filename so we can dedicate the entire
 158          * names_cache allocation for the pathname, and re-do the copy from
 159          * userland.
 160          */
 161         if (unlikely(len == EMBEDDED_NAME_MAX)) {
 162                 const size_t size = offsetof(struct filename, iname[1]);
 163                 kname = (char *)result;
 164
 165                 /*
 166                  * size is chosen that way we to guarantee that
 167                  * result->iname[0] is within the same object and that
 168                  * kname can't be equal to result->iname, no matter what.
 169                  */
 170                 result = kzalloc(size, GFP_KERNEL);
 171                 if (unlikely(!result)) {
 172                         __putname(kname);
 173                         return ERR_PTR(-ENOMEM);
 174                 }
 175                 result->name = kname;
 176                 len = strncpy_from_user(kname, filename, PATH_MAX);
 177                 if (unlikely(len < 0)) {
 178                         __putname(kname);
 179                         kfree(result);
 180                         return ERR_PTR(len);
 181                 }
 182                 if (unlikely(len == PATH_MAX)) {
 183                         __putname(kname);
 184                         kfree(result);
 185                         return ERR_PTR(-ENAMETOOLONG);
 186                 }
 187         }
 188
 189         result->refcnt = 1;
 190         /* The empty path is special. */
 191         if (unlikely(!len)) {
 192                 if (empty)
 193                         *empty = 1;
 194                 if (!(flags & LOOKUP_EMPTY)) {
 195                         putname(result);
 196                         return ERR_PTR(-ENOENT);
 197                 }
 198         }
 199
 200         result->uptr = filename;
 201         result->aname = NULL;
 202         audit_getname(result);
 203         return result;
 204 }
 205
/*
 * getname - copy a pathname from user space with no lookup flags.
 *
 * Shorthand for getname_flags(filename, 0, NULL): the caller is not told
 * separately about an empty pathname and, since LOOKUP_EMPTY is not passed,
 * an empty pathname yields ERR_PTR(-ENOENT).
 */
struct filename *
getname(const char __user * filename)
{
        return getname_flags(filename, 0, NULL);
}
 211
 212 struct filename *
 213 getname_kernel(const char * filename)
 214 {
 215         struct filename *result;
 216         int len = strlen(filename) + 1;
 217
 218         result = __getname();
 219         if (unlikely(!result))
 220                 return ERR_PTR(-ENOMEM);
 221
 222         if (len <= EMBEDDED_NAME_MAX) {
 223                 result->name = (char *)result->iname;
 224         } else if (len <= PATH_MAX) {
 225                 const size_t size = offsetof(struct filename, iname[1]);
 226                 struct filename *tmp;
 227
 228                 tmp = kmalloc(size, GFP_KERNEL);
 229                 if (unlikely(!tmp)) {
 230                         __putname(result);
 231                         return ERR_PTR(-ENOMEM);
 232                 }
 233                 tmp->name = (char *)result;
 234                 result = tmp;
 235         } else {
 236                 __putname(result);
 237                 return ERR_PTR(-ENAMETOOLONG);
 238         }
 239         memcpy((char *)result->name, filename, len);
 240         result->uptr = NULL;
 241         result->aname = NULL;
 242         result->refcnt = 1;
 243         audit_getname(result);
 244
 245         return result;
 246 }
 247
 248 void putname(struct filename *name)
 249 {
 250         if (IS_ERR_OR_NULL(name))
 251                 return;
 252
 253         BUG_ON(name->refcnt <= 0);
 254
 255         if (--name->refcnt > 0)
 256                 return;
 257
 258         if (name->name != name->iname) {
 259                 __putname(name->name);
 260                 kfree(name);
 261         } else
 262                 __putname(name);
 263 }
 264
 265 /**
 266  * check_acl - perform ACL permission checking
 267  * @mnt_userns: user namespace of the mount the inode was found from
 268  * @inode:      inode to check permissions on
 269  * @mask:       right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...)
 270  *
 271  * This function performs the ACL permission checking. Since this function
 272  * retrieve POSIX acls it needs to know whether it is called from a blocking or
 273  * non-blocking context and thus cares about the MAY_NOT_BLOCK bit.
 274  *
 275  * If the inode has been found through an idmapped mount the user namespace of
 276  * the vfsmount must be passed through @mnt_userns. This function will then take
 277  * care to map the inode according to @mnt_userns before checking permissions.
 278  * On non-idmapped mounts or if permission checking is to be performed on the
 279  * raw inode simply passs init_user_ns.
 280  */
 281 static int check_acl(struct user_namespace *mnt_userns,
 282                      struct inode *inode, int mask)
 283 {
 284 #ifdef CONFIG_FS_POSIX_ACL
 285         struct posix_acl *acl;
 286
 287         if (mask & MAY_NOT_BLOCK) {
 288                 acl = get_cached_acl_rcu(inode, ACL_TYPE_ACCESS);
 289                 if (!acl)
 290                         return -EAGAIN;
 291                 /* no ->get_acl() calls in RCU mode... */
 292                 if (is_uncached_acl(acl))
 293                         return -ECHILD;
 294                 return posix_acl_permission(mnt_userns, inode, acl, mask);
 295         }
 296
 297         acl = get_acl(inode, ACL_TYPE_ACCESS);
 298         if (IS_ERR(acl))
 299                 return PTR_ERR(acl);
 300         if (acl) {
 301                 int error = posix_acl_permission(mnt_userns, inode, acl, mask);
 302                 posix_acl_release(acl);
 303                 return error;
 304         }
 305 #endif
 306
 307         return -EAGAIN;
 308 }
 309
 310 /**
 311  * acl_permission_check - perform basic UNIX permission checking
 312  * @mnt_userns: user namespace of the mount the inode was found from
 313  * @inode:      inode to check permissions on
 314  * @mask:       right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...)
 315  *
 316  * This function performs the basic UNIX permission checking. Since this
 317  * function may retrieve POSIX acls it needs to know whether it is called from a
 318  * blocking or non-blocking context and thus cares about the MAY_NOT_BLOCK bit.
 319  *
 320  * If the inode has been found through an idmapped mount the user namespace of
 321  * the vfsmount must be passed through @mnt_userns. This function will then take
 322  * care to map the inode according to @mnt_userns before checking permissions.
 323  * On non-idmapped mounts or if permission checking is to be performed on the
 324  * raw inode simply passs init_user_ns.
 325  */
 326 static int acl_permission_check(struct user_namespace *mnt_userns,
 327                                 struct inode *inode, int mask)
 328 {
 329         unsigned int mode = inode->i_mode;
 330         kuid_t i_uid;
 331
 332         /* Are we the owner? If so, ACL's don't matter */
 333         i_uid = i_uid_into_mnt(mnt_userns, inode);
 334         if (likely(uid_eq(current_fsuid(), i_uid))) {
 335                 mask &= 7;
 336                 mode >>= 6;
 337                 return (mask & ~mode) ? -EACCES : 0;
 338         }
 339
 340         /* Do we have ACL's? */
 341         if (IS_POSIXACL(inode) && (mode & S_IRWXG)) {
 342                 int error = check_acl(mnt_userns, inode, mask);
 343                 if (error != -EAGAIN)
 344                         return error;
 345         }
 346
 347         /* Only RWX matters for group/other mode bits */
 348         mask &= 7;
 349
 350         /*
 351          * Are the group permissions different from
 352          * the other permissions in the bits we care
 353          * about? Need to check group ownership if so.
 354          */
 355         if (mask & (mode ^ (mode >> 3))) {
 356                 kgid_t kgid = i_gid_into_mnt(mnt_userns, inode);
 357                 if (in_group_p(kgid))
 358                         mode >>= 3;
 359         }
 360
 361         /* Bits in 'mode' clear that we require? */
 362         return (mask & ~mode) ? -EACCES : 0;
 363 }
 364
 365 /**
 366  * generic_permission -  check for access rights on a Posix-like filesystem
 367  * @mnt_userns: user namespace of the mount the inode was found from
 368  * @inode:      inode to check access rights for
 369  * @mask:       right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC,
 370  *              %MAY_NOT_BLOCK ...)
 371  *
 372  * Used to check for read/write/execute permissions on a file.
 373  * We use "fsuid" for this, letting us set arbitrary permissions
 374  * for filesystem access without changing the "normal" uids which
 375  * are used for other things.
 376  *
 377  * generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk
 378  * request cannot be satisfied (eg. requires blocking or too much complexity).
 379  * It would then be called again in ref-walk mode.
 380  *
 381  * If the inode has been found through an idmapped mount the user namespace of
 382  * the vfsmount must be passed through @mnt_userns. This function will then take
 383  * care to map the inode according to @mnt_userns before checking permissions.
 384  * On non-idmapped mounts or if permission checking is to be performed on the
 385  * raw inode simply passs init_user_ns.
 386  */
 387 int generic_permission(struct user_namespace *mnt_userns, struct inode *inode,
 388                        int mask)
 389 {
 390         int ret;
 391
 392         /*
 393          * Do the basic permission checks.
 394          */
 395         ret = acl_permission_check(mnt_userns, inode, mask);
 396         if (ret != -EACCES)
 397                 return ret;
 398
 399         if (S_ISDIR(inode->i_mode)) {
 400                 /* DACs are overridable for directories */
 401                 if (!(mask & MAY_WRITE))
 402                         if (capable_wrt_inode_uidgid(mnt_userns, inode,
 403                                                      CAP_DAC_READ_SEARCH))
 404                                 return 0;
 405                 if (capable_wrt_inode_uidgid(mnt_userns, inode,
 406                                              CAP_DAC_OVERRIDE))
 407                         return 0;
 408                 return -EACCES;
 409         }
 410
 411         /*
 412          * Searching includes executable on directories, else just read.
 413          */
 414         mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
 415         if (mask == MAY_READ)
 416                 if (capable_wrt_inode_uidgid(mnt_userns, inode,
 417                                              CAP_DAC_READ_SEARCH))
 418                         return 0;
 419         /*
 420          * Read/write DACs are always overridable.
 421          * Executable DACs are overridable when there is
 422          * at least one exec bit set.
 423          */
 424         if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO))
 425                 if (capable_wrt_inode_uidgid(mnt_userns, inode,
 426                                              CAP_DAC_OVERRIDE))
 427                         return 0;
 428
 429         return -EACCES;
 430 }
 431 EXPORT_SYMBOL(generic_permission);
 432
 433 /**
 434  * do_inode_permission - UNIX permission checking
 435  * @mnt_userns: user namespace of the mount the inode was found from
 436  * @inode:      inode to check permissions on
 437  * @mask:       right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...)
 438  *
 439  * We _really_ want to just do "generic_permission()" without
 440  * even looking at the inode->i_op values. So we keep a cache
 441  * flag in inode->i_opflags, that says "this has not special
 442  * permission function, use the fast case".
 443  */
 444 static inline int do_inode_permission(struct user_namespace *mnt_userns,
 445                                       struct inode *inode, int mask)
 446 {
 447         if (unlikely(!(inode->i_opflags & IOP_FASTPERM))) {
 448                 if (likely(inode->i_op->permission))
 449                         return inode->i_op->permission(mnt_userns, inode, mask);
 450
 451                 /* This gets set once for the inode lifetime */
 452                 spin_lock(&inode->i_lock);
 453                 inode->i_opflags |= IOP_FASTPERM;
 454                 spin_unlock(&inode->i_lock);
 455         }
 456         return generic_permission(mnt_userns, inode, mask);
 457 }
 458
 459 /**
 460  * sb_permission - Check superblock-level permissions
 461  * @sb: Superblock of inode to check permission on
 462  * @inode: Inode to check permission on
 463  * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
 464  *
 465  * Separate out file-system wide checks from inode-specific permission checks.
 466  */
 467 static int sb_permission(struct super_block *sb, struct inode *inode, int mask)
 468 {
 469         if (unlikely(mask & MAY_WRITE)) {
 470                 umode_t mode = inode->i_mode;
 471
 472                 /* Nobody gets write access to a read-only fs. */
 473                 if (sb_rdonly(sb) && (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
 474                         return -EROFS;
 475         }
 476         return 0;
 477 }
 478
 479 /**
 480  * inode_permission - Check for access rights to a given inode
 481  * @mnt_userns: User namespace of the mount the inode was found from
 482  * @inode:      Inode to check permission on
 483  * @mask:       Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
 484  *
 485  * Check for read/write/execute permissions on an inode.  We use fs[ug]id for
 486  * this, letting us set arbitrary permissions for filesystem access without
 487  * changing the "normal" UIDs which are used for other things.
 488  *
 489  * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask.
 490  */
 491 int inode_permission(struct user_namespace *mnt_userns,
 492                      struct inode *inode, int mask)
 493 {
 494         int retval;
 495
 496         retval = sb_permission(inode->i_sb, inode, mask);
 497         if (retval)
 498                 return retval;
 499
 500         if (unlikely(mask & MAY_WRITE)) {
 501                 /*
 502                  * Nobody gets write access to an immutable file.
 503                  */
 504                 if (IS_IMMUTABLE(inode))
 505                         return -EPERM;
 506
 507                 /*
 508                  * Updating mtime will likely cause i_uid and i_gid to be
 509                  * written back improperly if their true value is unknown
 510                  * to the vfs.
 511                  */
 512                 if (HAS_UNMAPPED_ID(mnt_userns, inode))
 513                         return -EACCES;
 514         }
 515
 516         retval = do_inode_permission(mnt_userns, inode, mask);
 517         if (retval)
 518                 return retval;
 519
 520         retval = devcgroup_inode_permission(inode, mask);
 521         if (retval)
 522                 return retval;
 523
 524         return security_inode_permission(inode, mask);
 525 }
 526 EXPORT_SYMBOL(inode_permission);
 527
/**
 * path_get - get a reference to a path
 * @path: path to get the reference to
 *
 * Given a path, increment the reference count of both its vfsmount and its
 * dentry.  Each call is undone by one path_put().
 */
void path_get(const struct path *path)
{
        /* Mount first, then dentry; path_put() drops them in reverse order. */
        mntget(path->mnt);
        dget(path->dentry);
}
EXPORT_SYMBOL(path_get);
 540
/**
 * path_put - put a reference to a path
 * @path: path to put the reference to
 *
 * Given a path, decrement the reference count of both its dentry and its
 * vfsmount.  This is the counterpart of path_get().
 */
void path_put(const struct path *path)
{
        /* Dentry first, then mount: the reverse of path_get(). */
        dput(path->dentry);
        mntput(path->mnt);
}
EXPORT_SYMBOL(path_put);
 553
/* Number of struct saved entries embedded directly in struct nameidata. */
#define EMBEDDED_LEVELS 2

/*
 * State of one pathname walk.  __set_nameidata() initialises an instance and
 * installs it in current->nameidata; restore_nameidata() removes it again.
 */
struct nameidata {
        /* Current position; NULL/NULL after __set_nameidata() and terminate_walk(). */
        struct path     path;
        struct qstr     last;
        /* Copied from the caller by set_nameidata() when a root is supplied. */
        struct path     root;
        struct inode    *inode; /* path.dentry.d_inode */
        /* flags holds LOOKUP_* bits, state holds ND_* bits. */
        unsigned int    flags, state;
        /*
         * seq is checked against path.dentry->d_seq and m_seq is handed to
         * the mount legitimisation helpers when leaving rcu-walk.
         * NOTE(review): r_seq is not used in this part of the file.
         */
        unsigned        seq, m_seq, r_seq;
        int             last_type;
        /* Number of entries of stack[] currently in use. */
        unsigned        depth;
        /* Inherited from and handed back to the enclosing nameidata. */
        int             total_link_count;
        struct saved {
                /* Legitimised together with ->seq by legitimize_links(). */
                struct path link;
                /* Run and cleared by drop_links(). */
                struct delayed_call done;
                const char *name;
                unsigned seq;
        /* stack points at internal[] until nd_alloc_stack() replaces it. */
        } *stack, internal[EMBEDDED_LEVELS];
        struct filename *name;
        /* The nameidata that was current before this one was installed. */
        struct nameidata *saved;
        /* Sequence number used when legitimising ->root. */
        unsigned        root_seq;
        int             dfd;
        kuid_t          dir_uid;
        umode_t         dir_mode;
} __randomize_layout;
 578
/* Bits kept in nameidata->state. */
/* ->root was supplied by the caller of set_nameidata(); legitimize_root() leaves it alone. */
#define ND_ROOT_PRESET 1
/* Set by legitimize_root(); terminate_walk() then drops ->root with path_put(). */
#define ND_ROOT_GRABBED 2
/* NOTE(review): set and tested outside this part of the file -- meaning to be confirmed there. */
#define ND_JUMPED 4
 582
 583 static void __set_nameidata(struct nameidata *p, int dfd, struct filename *name)
 584 {
 585         struct nameidata *old = current->nameidata;
 586         p->stack = p->internal;
 587         p->depth = 0;
 588         p->dfd = dfd;
 589         p->name = name;
 590         p->path.mnt = NULL;
 591         p->path.dentry = NULL;
 592         p->total_link_count = old ? old->total_link_count : 0;
 593         p->saved = old;
 594         current->nameidata = p;
 595 }
 596
 597 static inline void set_nameidata(struct nameidata *p, int dfd, struct filename *name,
 598                           const struct path *root)
 599 {
 600         __set_nameidata(p, dfd, name);
 601         p->state = 0;
 602         if (unlikely(root)) {
 603                 p->state = ND_ROOT_PRESET;
 604                 p->root = *root;
 605         }
 606 }
 607
 608 static void restore_nameidata(void)
 609 {
 610         struct nameidata *now = current->nameidata, *old = now->saved;
 611
 612         current->nameidata = old;
 613         if (old)
 614                 old->total_link_count = now->total_link_count;
 615         if (now->stack != now->internal)
 616                 kfree(now->stack);
 617 }
 618
 619 static bool nd_alloc_stack(struct nameidata *nd)
 620 {
 621         struct saved *p;
 622
 623         p= kmalloc_array(MAXSYMLINKS, sizeof(struct saved),
 624                          nd->flags & LOOKUP_RCU ? GFP_ATOMIC : GFP_KERNEL);
 625         if (unlikely(!p))
 626                 return false;
 627         memcpy(p, nd->internal, sizeof(nd->internal));
 628         nd->stack = p;
 629         return true;
 630 }
 631
 632 /**
 633  * path_connected - Verify that a dentry is below mnt.mnt_root
 634  *
 635  * Rename can sometimes move a file or directory outside of a bind
 636  * mount, path_connected allows those cases to be detected.
 637  */
 638 static bool path_connected(struct vfsmount *mnt, struct dentry *dentry)
 639 {
 640         struct super_block *sb = mnt->mnt_sb;
 641
 642         /* Bind mounts can have disconnected paths */
 643         if (mnt->mnt_root == sb->s_root)
 644                 return true;
 645
 646         return is_subdir(dentry, mnt->mnt_root);
 647 }
 648
 649 static void drop_links(struct nameidata *nd)
 650 {
 651         int i = nd->depth;
 652         while (i--) {
 653                 struct saved *last = nd->stack + i;
 654                 do_delayed_call(&last->done);
 655                 clear_delayed_call(&last->done);
 656         }
 657 }
 658
 659 static void terminate_walk(struct nameidata *nd)
 660 {
 661         drop_links(nd);
 662         if (!(nd->flags & LOOKUP_RCU)) {
 663                 int i;
 664                 path_put(&nd->path);
 665                 for (i = 0; i < nd->depth; i++)
 666                         path_put(&nd->stack[i].link);
 667                 if (nd->state & ND_ROOT_GRABBED) {
 668                         path_put(&nd->root);
 669                         nd->state &= ~ND_ROOT_GRABBED;
 670                 }
 671         } else {
 672                 nd->flags &= ~LOOKUP_RCU;
 673                 rcu_read_unlock();
 674         }
 675         nd->depth = 0;
 676         nd->path.mnt = NULL;
 677         nd->path.dentry = NULL;
 678 }
 679
 680 /* path_put is needed afterwards regardless of success or failure */
 681 static bool __legitimize_path(struct path *path, unsigned seq, unsigned mseq)
 682 {
 683         int res = __legitimize_mnt(path->mnt, mseq);
 684         if (unlikely(res)) {
 685                 if (res > 0)
 686                         path->mnt = NULL;
 687                 path->dentry = NULL;
 688                 return false;
 689         }
 690         if (unlikely(!lockref_get_not_dead(&path->dentry->d_lockref))) {
 691                 path->dentry = NULL;
 692                 return false;
 693         }
 694         return !read_seqcount_retry(&path->dentry->d_seq, seq);
 695 }
 696
/*
 * Legitimize @path against the dentry sequence number @seq, using the mount
 * sequence number recorded in @nd.  Thin wrapper around __legitimize_path();
 * as there, path_put() is needed afterwards regardless of the result.
 */
static inline bool legitimize_path(struct nameidata *nd,
                            struct path *path, unsigned seq)
{
        return __legitimize_path(path, seq, nd->m_seq);
}
 702
 703 static bool legitimize_links(struct nameidata *nd)
 704 {
 705         int i;
 706         if (unlikely(nd->flags & LOOKUP_CACHED)) {
 707                 drop_links(nd);
 708                 nd->depth = 0;
 709                 return false;
 710         }
 711         for (i = 0; i < nd->depth; i++) {
 712                 struct saved *last = nd->stack + i;
 713                 if (unlikely(!legitimize_path(nd, &last->link, last->seq))) {
 714                         drop_links(nd);
 715                         nd->depth = i + 1;
 716                         return false;
 717                 }
 718         }
 719         return true;
 720 }
 721
 722 static bool legitimize_root(struct nameidata *nd)
 723 {
 724         /*
 725          * For scoped-lookups (where nd->root has been zeroed), we need to
 726          * restart the whole lookup from scratch -- because set_root() is wrong
 727          * for these lookups (nd->dfd is the root, not the filesystem root).
 728          */
 729         if (!nd->root.mnt && (nd->flags & LOOKUP_IS_SCOPED))
 730                 return false;
 731         /* Nothing to do if nd->root is zero or is managed by the VFS user. */
 732         if (!nd->root.mnt || (nd->state & ND_ROOT_PRESET))
 733                 return true;
 734         nd->state |= ND_ROOT_GRABBED;
 735         return legitimize_path(nd, &nd->root, nd->root_seq);
 736 }
 737
 738 /*
 739  * Path walking has 2 modes, rcu-walk and ref-walk (see
 740  * Documentation/filesystems/path-lookup.txt).  In situations when we can't
 741  * continue in RCU mode, we attempt to drop out of rcu-walk mode and grab
 742  * normal reference counts on dentries and vfsmounts to transition to ref-walk
 743  * mode.  Refcounts are grabbed at the last known good point before rcu-walk
 744  * got stuck, so ref-walk may continue from there. If this is not successful
 745  * (eg. a seqcount has changed), then failure is returned and it's up to caller
 746  * to restart the path walk from the beginning in ref-walk mode.
 747  */
 748
 749 /**
 750  * try_to_unlazy - try to switch to ref-walk mode.
 751  * @nd: nameidata pathwalk data
 752  * Returns: true on success, false on failure
 753  *
 754  * try_to_unlazy attempts to legitimize the current nd->path and nd->root
 755  * for ref-walk mode.
 756  * Must be called from rcu-walk context.
 757  * Nothing should touch nameidata between try_to_unlazy() failure and
 758  * terminate_walk().
 759  */
 760 static bool try_to_unlazy(struct nameidata *nd)
 761 {
 762         struct dentry *parent = nd->path.dentry;
 763
 764         BUG_ON(!(nd->flags & LOOKUP_RCU));
 765
 766         nd->flags &= ~LOOKUP_RCU;
 767         if (unlikely(!legitimize_links(nd)))
 768                 goto out1;
 769         if (unlikely(!legitimize_path(nd, &nd->path, nd->seq)))
 770                 goto out;
 771         if (unlikely(!legitimize_root(nd)))
 772                 goto out;
 773         rcu_read_unlock();
 774         BUG_ON(nd->inode != parent->d_inode);
 775         return true;
 776
 777 out1:
 778         nd->path.mnt = NULL;
 779         nd->path.dentry = NULL;
 780 out:
 781         rcu_read_unlock();
 782         return false;
 783 }
 784
/**
 * try_to_unlazy_next - try to switch to ref-walk mode.
 * @nd: nameidata pathwalk data
 * @dentry: next dentry to step into
 * @seq: seq number to check @dentry against
 * Returns: true on success, false on failure
 *
 * Similar to try_to_unlazy(), but here we have the next dentry already
 * picked by rcu-walk and want to legitimize that in addition to the current
 * nd->path and nd->root for ref-walk mode.  Must be called from rcu-walk context.
 * Nothing should touch nameidata between try_to_unlazy_next() failure and
 * terminate_walk().
 */
static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry, unsigned seq)
{
        BUG_ON(!(nd->flags & LOOKUP_RCU));

        nd->flags &= ~LOOKUP_RCU;
        if (unlikely(!legitimize_links(nd)))
                goto out2;
        if (unlikely(!legitimize_mnt(nd->path.mnt, nd->m_seq)))
                goto out2;
        if (unlikely(!lockref_get_not_dead(&nd->path.dentry->d_lockref)))
                goto out1;

        /*
         * We need to move both the parent and the dentry from the RCU domain
         * to be properly refcounted. And the sequence number in the dentry
         * validates *both* dentry counters, since we checked the sequence
         * number of the parent after we got the child sequence number. So we
         * know the parent must still be valid if the child sequence number is
         * still valid.
         */
        if (unlikely(!lockref_get_not_dead(&dentry->d_lockref)))
                goto out;
        if (unlikely(read_seqcount_retry(&dentry->d_seq, seq)))
                goto out_dput;
        /*
         * Sequence counts matched. Now make sure that the root is
         * still valid and get it if required.
         */
        if (unlikely(!legitimize_root(nd)))
                goto out_dput;
        rcu_read_unlock();
        return true;

        /*
         * Failure exits.  The parts of nd->path that were not legitimized
         * are cleared so that the path_put() in terminate_walk() skips them.
         */
out2:
        /* Neither the mount nor the dentry of nd->path is held. */
        nd->path.mnt = NULL;
out1:
        /* The dentry of nd->path is not held. */
        nd->path.dentry = NULL;
out:
        /* No reference on the next dentry was obtained. */
        rcu_read_unlock();
        return false;
out_dput:
        /* A reference on the next dentry is held; drop it outside RCU. */
        rcu_read_unlock();
        dput(dentry);
        return false;
}
 842
 843 static inline int d_revalidate(struct dentry *dentry, unsigned int flags)
 844 {
             /*
              * Ask the filesystem whether @dentry is still valid, but only when
              * the dentry is flagged DCACHE_OP_REVALIDATE; otherwise report it
              * as valid (1).  Callers in this file treat a positive value as
              * "valid", 0 as "invalidate the dentry" and a negative value as
              * an error code.
              */
 845         if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE))
 846                 return dentry->d_op->d_revalidate(dentry, flags);
 847         else
 848                 return 1;
 849 }
 850
 851 /**
 852  * complete_walk - successful completion of path walk
 853  * @nd:  pointer nameidata
 854  *
 855  * If we had been in RCU mode, drop out of it and legitimize nd->path.
 856  * Revalidate the final result, unless we'd already done that during
 857  * the path walk or the filesystem doesn't ask for it.  Return 0 on
 858  * success, -error on failure.  In case of failure caller does not
 859  * need to drop nd->path.
      *
      * Errors produced here: -ECHILD if leaving RCU mode fails, -EXDEV if a
      * scoped lookup ended up outside nd->root, and -ESTALE (or the negative
      * value returned by the filesystem) if ->d_weak_revalidate() rejects the
      * final dentry.
 860  */
 861 static int complete_walk(struct nameidata *nd)
 862 {
             /* sampled before leaving RCU mode; used for the weak revalidation below */
 863         struct dentry *dentry = nd->path.dentry;
 864         int status;
 865
 866         if (nd->flags & LOOKUP_RCU) {
 867                 /*
 868                  * We don't want to zero nd->root for scoped-lookups or
 869                  * externally-managed nd->root.
 870                  */
 871                 if (!(nd->state & ND_ROOT_PRESET))
 872                         if (!(nd->flags & LOOKUP_IS_SCOPED))
 873                                 nd->root.mnt = NULL;
 874                 nd->flags &= ~LOOKUP_CACHED;
 875                 if (!try_to_unlazy(nd))
 876                         return -ECHILD;
 877         }
 878
 879         if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) {
 880                 /*
 881                  * While the guarantee of LOOKUP_IS_SCOPED is (roughly) "don't
 882                  * ever step outside the root during lookup" and should already
 883                  * be guaranteed by the rest of namei, we want to avoid a namei
 884                  * BUG resulting in userspace being given a path that was not
 885                  * scoped within the root at some point during the lookup.
 886                  *
 887                  * So, do a final sanity-check to make sure that in the
 888                  * worst-case scenario (a complete bypass of LOOKUP_IS_SCOPED)
 889                  * we won't silently return an fd completely outside of the
 890                  * requested root to userspace.
 891                  *
 892                  * Userspace could move the path outside the root after this
 893                  * check, but as discussed elsewhere this is not a concern (the
 894                  * resolved file was inside the root at some point).
 895                  */
 896                 if (!path_is_under(&nd->path, &nd->root))
 897                         return -EXDEV;
 898         }
 899
             /*
              * Weak revalidation is attempted only when the walk was marked
              * ND_JUMPED and the filesystem flags the dentry with
              * DCACHE_OP_WEAK_REVALIDATE.
              */
 900         if (likely(!(nd->state & ND_JUMPED)))
 901                 return 0;
 902
 903         if (likely(!(dentry->d_flags & DCACHE_OP_WEAK_REVALIDATE)))
 904                 return 0;
 905
 906         status = dentry->d_op->d_weak_revalidate(dentry, nd->flags);
 907         if (status > 0)
 908                 return 0;
 909
             /* a zero result is reported to the caller as -ESTALE */
 910         if (!status)
 911                 status = -ESTALE;
 912
 913         return status;
 914 }
 915
 916 static int set_root(struct nameidata *nd)
 917 {
             /*
              * Fill nd->root from current->fs->root.  Returns 0 on success or
              * -ENOTRECOVERABLE (with a WARN) when called for a scoped lookup.
              */
 918         struct fs_struct *fs = current->fs;
 919
 920         /*
 921          * Jumping to the real root in a scoped-lookup is a BUG in namei, but we
 922          * still have to ensure it doesn't happen because it will cause a breakout
 923          * from the dirfd.
 924          */
 925         if (WARN_ON(nd->flags & LOOKUP_IS_SCOPED))
 926                 return -ENOTRECOVERABLE;
 927
 928         if (nd->flags & LOOKUP_RCU) {
 929                 unsigned seq;
 930
                     /*
                      * RCU mode: plain struct copy of fs->root, repeated until
                      * fs->seq is stable; the root dentry's d_seq is sampled
                      * into nd->root_seq.  ND_ROOT_GRABBED is not set here.
                      */
 931                 do {
 932                         seq = read_seqcount_begin(&fs->seq);
 933                         nd->root = fs->root;
 934                         nd->root_seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
 935                 } while (read_seqcount_retry(&fs->seq, seq));
 936         } else {
                     /* ref-walk mode: fetch the root and record that it was grabbed */
 937                 get_fs_root(fs, &nd->root);
 938                 nd->state |= ND_ROOT_GRABBED;
 939         }
 940         return 0;
 941 }
 942
 943 static int nd_jump_root(struct nameidata *nd)
 944 {
             /*
              * Move the current walk position (nd->path, nd->inode) to nd->root,
              * setting nd->root first via set_root() if it is not set yet.
              * Returns 0 on success, -EXDEV when LOOKUP_BENEATH is set or
              * LOOKUP_NO_XDEV forbids the jump, -ECHILD when the root dentry
              * changed under an RCU walk, or the error from set_root().
              */
 945         if (unlikely(nd->flags & LOOKUP_BENEATH))
 946                 return -EXDEV;
 947         if (unlikely(nd->flags & LOOKUP_NO_XDEV)) {
 948                 /* Absolute path arguments to path_init() are allowed. */
 949                 if (nd->path.mnt != NULL && nd->path.mnt != nd->root.mnt)
 950                         return -EXDEV;
 951         }
 952         if (!nd->root.mnt) {
 953                 int error = set_root(nd);
 954                 if (error)
 955                         return error;
 956         }
 957         if (nd->flags & LOOKUP_RCU) {
 958                 struct dentry *d;
                     /* RCU mode: no refcounting, validate against the saved root_seq */
 959                 nd->path = nd->root;
 960                 d = nd->path.dentry;
 961                 nd->inode = d->d_inode;
 962                 nd->seq = nd->root_seq;
 963                 if (unlikely(read_seqcount_retry(&d->d_seq, nd->seq)))
 964                         return -ECHILD;
 965         } else {
                     /* ref-walk mode: drop the old position, take a reference on the root */
 966                 path_put(&nd->path);
 967                 nd->path = nd->root;
 968                 path_get(&nd->path);
 969                 nd->inode = nd->path.dentry->d_inode;
 970         }
 971         nd->state |= ND_JUMPED;
 972         return 0;
 973 }
 974
 975 /*
 976  * Helper to directly jump to a known parsed path from ->get_link,
 977  * caller must have taken a reference to path beforehand.
      *
      * On success (0) the old nd->path is dropped and *path is copied into
      * nd->path without taking a further reference.  On failure the reference
      * the caller took on @path is dropped here and a negative errno is
      * returned: -ELOOP under LOOKUP_NO_MAGICLINKS, -EXDEV when
      * LOOKUP_NO_XDEV is set and the mounts differ, or when the lookup is
      * scoped.
 978  */
 979 int nd_jump_link(struct path *path)
 980 {
 981         int error = -ELOOP;
 982         struct nameidata *nd = current->nameidata;
 983
 984         if (unlikely(nd->flags & LOOKUP_NO_MAGICLINKS))
 985                 goto err;
 986
             /* both remaining refusals report -EXDEV */
 987         error = -EXDEV;
 988         if (unlikely(nd->flags & LOOKUP_NO_XDEV)) {
 989                 if (nd->path.mnt != path->mnt)
 990                         goto err;
 991         }
 992         /* Not currently safe for scoped-lookups. */
 993         if (unlikely(nd->flags & LOOKUP_IS_SCOPED))
 994                 goto err;
 995
 996         path_put(&nd->path);
 997         nd->path = *path;
 998         nd->inode = nd->path.dentry->d_inode;
 999         nd->state |= ND_JUMPED;
1000         return 0;
1001
1002 err:
1003         path_put(path);
1004         return error;
1005 }
1006
1007 static inline void put_link(struct nameidata *nd)
1008 {
             /*
              * Pop the topmost entry of nd->stack (nd->depth is decremented
              * first), run its delayed call, and drop the reference on the
              * saved link path unless the walk is in RCU mode.
              */
1009         struct saved *last = nd->stack + --nd->depth;
1010         do_delayed_call(&last->done);
1011         if (!(nd->flags & LOOKUP_RCU))
1012                 path_put(&last->link);
1013 }
1014
1015 int sysctl_protected_symlinks __read_mostly = 0;	/* 0: may_follow_link() allows every symlink */
1016 int sysctl_protected_hardlinks __read_mostly = 0;	/* 0: may_linkat() skips the safe-source check */
1017 int sysctl_protected_fifos __read_mostly;	/* see may_create_in_sticky(); >= 2 also covers group-writable dirs */
1018 int sysctl_protected_regular __read_mostly;	/* see may_create_in_sticky(); >= 2 also covers group-writable dirs */
1019
1020 /**
1021  * may_follow_link - Check symlink following for unsafe situations
1022  * @nd: nameidata pathwalk data
      * @inode: inode of the symlink; its owner is compared against the
      *         follower and the parent directory owner
1023  *
1024  * In the case of the sysctl_protected_symlinks sysctl being enabled,
1025  * CAP_DAC_OVERRIDE needs to be specifically ignored if the symlink is
1026  * in a sticky world-writable directory. This is to protect privileged
1027  * processes from failing races against path names that may change out
1028  * from under them by way of other users creating malicious symlinks.
1029  * It will permit symlinks to be followed only when outside a sticky
1030  * world-writable directory, or when the uid of the symlink and follower
1031  * match, or when the directory owner matches the symlink's owner.
1032  *
1033  * Returns 0 if following the symlink is allowed, -ve on error.
      * A refusal in RCU mode is reported as -ECHILD without any audit
      * record; otherwise the denial is audited and -EACCES is returned.
1034  */
1035 static inline int may_follow_link(struct nameidata *nd, const struct inode *inode)
1036 {
1037         struct user_namespace *mnt_userns;
1038         kuid_t i_uid;
1039
1040         if (!sysctl_protected_symlinks)
1041                 return 0;
1042
             /* map the symlink owner through the mount's user namespace */
1043         mnt_userns = mnt_user_ns(nd->path.mnt);
1044         i_uid = i_uid_into_mnt(mnt_userns, inode);
1045         /* Allowed if owner and follower match. */
1046         if (uid_eq(current_cred()->fsuid, i_uid))
1047                 return 0;
1048
1049         /* Allowed if parent directory not sticky and world-writable. */
1050         if ((nd->dir_mode & (S_ISVTX|S_IWOTH)) != (S_ISVTX|S_IWOTH))
1051                 return 0;
1052
1053         /* Allowed if parent directory and link owner match. */
1054         if (uid_valid(nd->dir_uid) && uid_eq(nd->dir_uid, i_uid))
1055                 return 0;
1056
             /* denied: in RCU mode bail out with -ECHILD before doing any audit work */
1057         if (nd->flags & LOOKUP_RCU)
1058                 return -ECHILD;
1059
1060         audit_inode(nd->name, nd->stack[0].link.dentry, 0);
1061         audit_log_path_denied(AUDIT_ANOM_LINK, "follow_link");
1062         return -EACCES;
1063 }
1064
1065 /**
1066  * safe_hardlink_source - Check for safe hardlink conditions
1067  * @mnt_userns: user namespace of the mount the inode was found from
1068  * @inode: the source inode to hardlink from
1069  *
1070  * Return false if at least one of the following conditions:
1071  *    - inode is not a regular file
1072  *    - inode is setuid
1073  *    - inode is setgid and group-exec
1074  *    - access failure for read and write
1075  *
1076  * Otherwise returns true.
1077  */
1078 static bool safe_hardlink_source(struct user_namespace *mnt_userns,
1079                                  struct inode *inode)
1080 {
1081         umode_t mode = inode->i_mode;
1082
1083         /* Special files should not get pinned to the filesystem. */
1084         if (!S_ISREG(mode))
1085                 return false;
1086
1087         /* Setuid files should not get pinned to the filesystem. */
1088         if (mode & S_ISUID)
1089                 return false;
1090
1091         /* Executable setgid files should not get pinned to the filesystem. */
             /* (both S_ISGID and S_IXGRP must be set for this to trigger) */
1092         if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
1093                 return false;
1094
1095         /* Hardlinking to unreadable or unwritable sources is dangerous. */
             /* (a non-zero result from inode_permission() is treated as failure) */
1096         if (inode_permission(mnt_userns, inode, MAY_READ | MAY_WRITE))
1097                 return false;
1098
1099         return true;
1100 }
1101
1102 /**
1103  * may_linkat - Check permissions for creating a hardlink
1104  * @mnt_userns: user namespace of the mount the inode was found from
1105  * @link: the source to hardlink from
1106  *
1107  * Block hardlink when all of:
1108  *  - sysctl_protected_hardlinks enabled
1109  *  - fsuid does not match inode
1110  *  - hardlink source is unsafe (see safe_hardlink_source() above)
1111  *  - not CAP_FOWNER in a namespace with the inode owner uid mapped
1112  *
      * Independently of the sysctl, the link is refused with -EOVERFLOW when
      * the inode's uid or gid is not valid after mapping through @mnt_userns.
      *
1113  * If the inode has been found through an idmapped mount the user namespace of
1114  * the vfsmount must be passed through @mnt_userns. This function will then take
1115  * care to map the inode according to @mnt_userns before checking permissions.
1116  * On non-idmapped mounts or if permission checking is to be performed on the
1117  * raw inode simply pass init_user_ns.
1118  *
1119  * Returns 0 if successful, -ve on error.
1120  */
1121 int may_linkat(struct user_namespace *mnt_userns, struct path *link)
1122 {
1123         struct inode *inode = link->dentry->d_inode;
1124
1125         /* Inode writeback is not safe when the uid or gid are invalid. */
1126         if (!uid_valid(i_uid_into_mnt(mnt_userns, inode)) ||
1127             !gid_valid(i_gid_into_mnt(mnt_userns, inode)))
1128                 return -EOVERFLOW;
1129
1130         if (!sysctl_protected_hardlinks)
1131                 return 0;
1132
1133         /* Source inode owner (or CAP_FOWNER) can hardlink all they like,
1134          * otherwise, it must be a safe source.
1135          */
1136         if (safe_hardlink_source(mnt_userns, inode) ||
1137             inode_owner_or_capable(mnt_userns, inode))
1138                 return 0;
1139
             /* denied: leave an audit record and fail with -EPERM */
1140         audit_log_path_denied(AUDIT_ANOM_LINK, "linkat");
1141         return -EPERM;
1142 }
1143
1144 /**
1145  * may_create_in_sticky - Check whether an O_CREAT open in a sticky directory
1146  *                        should be allowed, or not, on files that already
1147  *                        exist.
1148  * @mnt_userns: user namespace of the mount the inode was found from
1149  * @nd: nameidata pathwalk data
1150  * @inode: the inode of the file to open
1151  *
1152  * Block an O_CREAT open of a FIFO (or a regular file) when:
1153  *   - sysctl_protected_fifos (or sysctl_protected_regular) is enabled
1154  *   - the file already exists
1155  *   - we are in a sticky directory
1156  *   - we don't own the file
1157  *   - the owner of the directory doesn't own the file
1158  *   - the directory is world writable
1159  * If the sysctl_protected_fifos (or sysctl_protected_regular) is set to 2
1160  * the directory doesn't have to be world writable: being group writable will
1161  * be enough.
1162  *
1163  * If the inode has been found through an idmapped mount the user namespace of
1164  * the vfsmount must be passed through @mnt_userns. This function will then take
1165  * care to map the inode according to @mnt_userns before checking permissions.
1166  * On non-idmapped mounts or if permission checking is to be performed on the
1167  * raw inode simply pass init_user_ns.
1168  *
1169  * Returns 0 if the open is allowed, -ve on error.
      * A refusal is audited and reported as -EACCES.
1170  */
1171 static int may_create_in_sticky(struct user_namespace *mnt_userns,
1172                                 struct nameidata *nd, struct inode *const inode)
1173 {
             /* mode and owner of the parent directory, as recorded in @nd */
1174         umode_t dir_mode = nd->dir_mode;
1175         kuid_t dir_uid = nd->dir_uid;
1176
             /*
              * Allowed outright if the relevant sysctl is off, the directory is
              * not sticky, or the file is owned by the directory owner or by
              * the caller.
              */
1177         if ((!sysctl_protected_fifos && S_ISFIFO(inode->i_mode)) ||
1178             (!sysctl_protected_regular && S_ISREG(inode->i_mode)) ||
1179             likely(!(dir_mode & S_ISVTX)) ||
1180             uid_eq(i_uid_into_mnt(mnt_userns, inode), dir_uid) ||
1181             uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode)))
1182                 return 0;
1183
             /*
              * 0002 is the "writable by others" bit, 0020 the "writable by
              * group" bit; the latter only counts when the sysctl is >= 2.
              */
1184         if (likely(dir_mode & 0002) ||
1185             (dir_mode & 0020 &&
1186              ((sysctl_protected_fifos >= 2 && S_ISFIFO(inode->i_mode)) ||
1187               (sysctl_protected_regular >= 2 && S_ISREG(inode->i_mode))))) {
1188                 const char *operation = S_ISFIFO(inode->i_mode) ?
1189                                         "sticky_create_fifo" :
1190                                         "sticky_create_regular";
1191                 audit_log_path_denied(AUDIT_ANOM_CREAT, operation);
1192                 return -EACCES;
1193         }
1194         return 0;
1195 }
1196
1197 /*
1198  * follow_up - Find the mountpoint of path's vfsmount
1199  *
1200  * Given a path, find the mountpoint of its source file system.
1201  * Replace @path with the path of the mountpoint in the parent mount.
1202  * Up is towards /.
1203  *
1204  * Return 1 if we went up a level and 0 if we were already at the
1205  * root.
      *
      * When 1 is returned, the references previously held through @path
      * have been dropped and @path holds new references on the parent
      * mount and the mountpoint dentry.  When 0 is returned @path is
      * left untouched.
1206  */
1207 int follow_up(struct path *path)
1208 {
1209         struct mount *mnt = real_mount(path->mnt);
1210         struct mount *parent;
1211         struct dentry *mountpoint;
1212
             /* sample parent and mountpoint, and pin both, under mount_lock */
1213         read_seqlock_excl(&mount_lock);
1214         parent = mnt->mnt_parent;
1215         if (parent == mnt) {
                     /* a mount that is its own parent has nothing above it */
1216                 read_sequnlock_excl(&mount_lock);
1217                 return 0;
1218         }
1219         mntget(&parent->mnt);
1220         mountpoint = dget(mnt->mnt_mountpoint);
1221         read_sequnlock_excl(&mount_lock);
             /* swap the old references for the new ones outside the lock */
1222         dput(path->dentry);
1223         path->dentry = mountpoint;
1224         mntput(path->mnt);
1225         path->mnt = &parent->mnt;
1226         return 1;
1227 }
1228 EXPORT_SYMBOL(follow_up);
1229
1230 static bool choose_mountpoint_rcu(struct mount *m, const struct path *root,
1231                                   struct path *path, unsigned *seqp)
1232 {
             /*
              * Climb from @m towards its ancestors looking for a mountpoint
              * that is not itself the root dentry of the mount it sits in.
              * On success fill @path with that (mount, mountpoint) pair,
              * store the mountpoint's d_seq in @seqp and return true; no
              * references are taken here.  Return false if the top of the
              * mount tree or @root is reached first.
              */
1233         while (mnt_has_parent(m)) {
1234                 struct dentry *mountpoint = m->mnt_mountpoint;
1235
1236                 m = m->mnt_parent;
                     /* never climb past the caller-supplied root */
1237                 if (unlikely(root->dentry == mountpoint &&
1238                              root->mnt == &m->mnt))
1239                         break;
                     /* mounted on the parent's own root: keep climbing */
1240                 if (mountpoint != m->mnt.mnt_root) {
1241                         path->mnt = &m->mnt;
1242                         path->dentry = mountpoint;
1243                         *seqp = read_seqcount_begin(&mountpoint->d_seq);
1244                         return true;
1245                 }
1246         }
1247         return false;
1248 }
1249
1250 static bool choose_mountpoint(struct mount *m, const struct path *root,
1251                               struct path *path)
1252 {
             /*
              * Wrapper around choose_mountpoint_rcu() that runs the scan
              * under rcu_read_lock() and retries until it gets a stable
              * answer.  Returns true with @path legitimized via
              * __legitimize_path(), or false if no mountpoint was found.
              */
1253         bool found;
1254
1255         rcu_read_lock();
1256         while (1) {
1257                 unsigned seq, mseq = read_seqbegin(&mount_lock);
1258
1259                 found = choose_mountpoint_rcu(m, root, path, &seq);
1260                 if (unlikely(!found)) {
                             /* a negative answer is final only if mount_lock did not change */
1261                         if (!read_seqretry(&mount_lock, mseq))
1262                                 break;
1263                 } else {
1264                         if (likely(__legitimize_path(path, seq, mseq)))
1265                                 break;
                             /*
                              * Legitimization failed: release @path outside the
                              * RCU section and rescan.
                              * NOTE(review): relies on __legitimize_path()
                              * leaving *path safe to path_put() on failure;
                              * it is defined outside this chunk - confirm.
                              */
1266                         rcu_read_unlock();
1267                         path_put(path);
1268                         rcu_read_lock();
1269                 }
1270         }
1271         rcu_read_unlock();
1272         return found;
1273 }
1274
1275 /*
1276  * Perform an automount
1277  * - return -EISDIR to tell follow_managed() to stop and return the path we
1278  *   were called with.
      * - return -ELOOP once *count has reached MAXSYMLINKS (only checked when
      *   @count is non-NULL; the counter is incremented on every check).
      * - otherwise return whatever finish_automount() makes of the result of
      *   the dentry's ->d_automount().
1279  */
1280 static int follow_automount(struct path *path, int *count, unsigned lookup_flags)
1281 {
1282         struct dentry *dentry = path->dentry;
1283
1284         /* We don't want to mount if someone's just doing a stat -
1285          * unless they're stat'ing a directory and appended a '/' to
1286          * the name.
1287          *
1288          * We do, however, want to mount if someone wants to open or
1289          * create a file of any type under the mountpoint, wants to
1290          * traverse through the mountpoint or wants to open the
1291          * mounted directory.  Also, autofs may mark negative dentries
1292          * as being automount points.  These will need the attentions
1293          * of the daemon to instantiate them before they can be used.
1294          */
1295         if (!(lookup_flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY |
1296                            LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) &&
1297             dentry->d_inode)
1298                 return -EISDIR;
1299
1300         if (count && (*count)++ >= MAXSYMLINKS)
1301                 return -ELOOP;
1302
1303         return finish_automount(dentry->d_op->d_automount(path), path);
1304 }
1305
1306 /*
1307  * mount traversal - out-of-line part.  One note on ->d_flags accesses -
1308  * dentries are pinned but not locked here, so negative dentry can go
1309  * positive right under us.  Use of smp_load_acquire() provides a barrier
1310  * sufficient for ->d_inode and ->d_flags consistency.
      *
      * @flags is the caller's snapshot of path->dentry->d_flags.  *@jumped is
      * set to true if at least one mount was crossed.  Returns 0 on success
      * (-EISDIR from the helpers is turned into 0), -ENOENT if the final
      * dentry is negative, or the negative value from ->d_manage() or
      * follow_automount().
1311  */
1312 static int __traverse_mounts(struct path *path, unsigned flags, bool *jumped,
1313                              int *count, unsigned lookup_flags)
1314 {
             /* mount we started on; its reference is not dropped by this function */
1315         struct vfsmount *mnt = path->mnt;
             /* true once path->mnt has been replaced by a result of lookup_mnt() */
1316         bool need_mntput = false;
1317         int ret = 0;
1318
1319         while (flags & DCACHE_MANAGED_DENTRY) {
1320                 /* Allow the filesystem to manage the transit without i_mutex
1321                  * being held. */
1322                 if (flags & DCACHE_MANAGE_TRANSIT) {
1323                         ret = path->dentry->d_op->d_manage(path, false);
1324                         flags = smp_load_acquire(&path->dentry->d_flags);
1325                         if (ret < 0)
1326                                 break;
1327                 }
1328
1329                 if (flags & DCACHE_MOUNTED) {   // something's mounted on it..
1330                         struct vfsmount *mounted = lookup_mnt(path);
1331                         if (mounted) {          // ... in our namespace
                                     /* step onto the root of the mount found there */
1332                                 dput(path->dentry);
1333                                 if (need_mntput)
1334                                         mntput(path->mnt);
1335                                 path->mnt = mounted;
1336                                 path->dentry = dget(mounted->mnt_root);
1337                                 // here we know it's positive
1338                                 flags = path->dentry->d_flags;
1339                                 need_mntput = true;
1340                                 continue;
1341                         }
1342                 }
1343
1344                 if (!(flags & DCACHE_NEED_AUTOMOUNT))
1345                         break;
1346
1347                 // uncovered automount point
1348                 ret = follow_automount(path, count, lookup_flags);
1349                 flags = smp_load_acquire(&path->dentry->d_flags);
1350                 if (ret < 0)
1351                         break;
1352         }
1353
             /* -EISDIR means "stop here and use this path", not a failure */
1354         if (ret == -EISDIR)
1355                 ret = 0;
1356         // possible if you race with several mount --move
1357         if (need_mntput && path->mnt == mnt)
1358                 mntput(path->mnt);
1359         if (!ret && unlikely(d_flags_negative(flags)))
1360                 ret = -ENOENT;
1361         *jumped = need_mntput;
1362         return ret;
1363 }
1364
1365 static inline int traverse_mounts(struct path *path, bool *jumped,
1366                                   int *count, unsigned lookup_flags)
1367 {
             /*
              * Inline fast path for mount traversal: when the dentry is not
              * a managed one there is nothing to cross, so only report
              * -ENOENT for a negative dentry.  Everything else is handed to
              * __traverse_mounts() together with the d_flags snapshot.
              */
1368         unsigned flags = smp_load_acquire(&path->dentry->d_flags);
1369
1370         /* fastpath */
1371         if (likely(!(flags & DCACHE_MANAGED_DENTRY))) {
1372                 *jumped = false;
1373                 if (unlikely(d_flags_negative(flags)))
1374                         return -ENOENT;
1375                 return 0;
1376         }
1377         return __traverse_mounts(path, flags, jumped, count, lookup_flags);
1378 }
1379
1380 int follow_down_one(struct path *path)
1381 {
             /*
              * Step down exactly one mount: if lookup_mnt() finds something
              * mounted on @path, drop the references held through @path,
              * point it at the root of that mount and return 1.  Return 0
              * and leave @path untouched when nothing is found.
              */
1382         struct vfsmount *mounted;
1383
1384         mounted = lookup_mnt(path);
1385         if (mounted) {
1386                 dput(path->dentry);
1387                 mntput(path->mnt);
1388                 path->mnt = mounted;
1389                 path->dentry = dget(mounted->mnt_root);
1390                 return 1;
1391         }
1392         return 0;
1393 }
1394 EXPORT_SYMBOL(follow_down_one);
1395
1396 /*
1397  * Follow down to the covering mount currently visible to userspace.  At each
1398  * point, the filesystem owning that dentry may be queried as to whether the
1399  * caller is permitted to proceed or not.
      *
      * Returns the result of traverse_mounts(), called without a link counter
      * and with no lookup flags.
1400  */
1401 int follow_down(struct path *path)
1402 {
1403         struct vfsmount *mnt = path->mnt;
             /* required by traverse_mounts(); the value is not used here */
1404         bool jumped;
1405         int ret = traverse_mounts(path, &jumped, NULL, 0);
1406
             /* we ended up on a different mount: drop the reference on the original one */
1407         if (path->mnt != mnt)
1408                 mntput(mnt);
1409         return ret;
1410 }
1411 EXPORT_SYMBOL(follow_down);
1412
1413 /*
1414  * Try to skip to top of mountpoint pile in rcuwalk mode.  Fail if
1415  * we meet a managed dentry that would need blocking.
      *
      * Returns true when the walk may continue in RCU mode; @path, *@inode and
      * *@seqp are updated for every mount crossed.  Returns false when the
      * caller has to fall back (handle_mounts() then tries to leave RCU mode);
      * @path may already have been modified in that case.
1416  */
1417 static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
1418                                struct inode **inode, unsigned *seqp)
1419 {
1420         struct dentry *dentry = path->dentry;
1421         unsigned int flags = dentry->d_flags;
1422
1423         if (likely(!(flags & DCACHE_MANAGED_DENTRY)))
1424                 return true;
1425
             /* a managed dentry under LOOKUP_NO_XDEV is not handled in RCU mode */
1426         if (unlikely(nd->flags & LOOKUP_NO_XDEV))
1427                 return false;
1428
1429         for (;;) {
1430                 /*
1431                  * Don't forget we might have a non-mountpoint managed dentry
1432                  * that wants to block transit.
1433                  */
1434                 if (unlikely(flags & DCACHE_MANAGE_TRANSIT)) {
1435                         int res = dentry->d_op->d_manage(path, true);
                             /* -EISDIR: stop here and carry on in RCU mode; other non-zero: fall back */
1436                         if (res)
1437                                 return res == -EISDIR;
1438                         flags = dentry->d_flags;
1439                 }
1440
1441                 if (flags & DCACHE_MOUNTED) {
1442                         struct mount *mounted = __lookup_mnt(path->mnt, dentry);
1443                         if (mounted) {
1444                                 path->mnt = &mounted->mnt;
1445                                 dentry = path->dentry = mounted->mnt.mnt_root;
1446                                 nd->state |= ND_JUMPED;
1447                                 *seqp = read_seqcount_begin(&dentry->d_seq);
1448                                 *inode = dentry->d_inode;
1449                                 /*
1450                                  * We don't need to re-check ->d_seq after this
1451                                  * ->d_inode read - there will be an RCU delay
1452                                  * between mount hash removal and ->mnt_root
1453                                  * becoming unpinned.
1454                                  */
1455                                 flags = dentry->d_flags;
1456                                 continue;
1457                         }
                             /* nothing found: only trust that if mount_lock has not changed */
1458                         if (read_seqretry(&mount_lock, nd->m_seq))
1459                                 return false;
1460                 }
                     /* an automount point cannot be handled in RCU mode */
1461                 return !(flags & DCACHE_NEED_AUTOMOUNT);
1462         }
1463 }
1464
1465 static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry,
1466                           struct path *path, struct inode **inode,
1467                           unsigned int *seqp)
1468 {
             /*
              * Build @path from nd->path.mnt and @dentry and cross whatever is
              * mounted on it.  In RCU mode this is first tried without
              * blocking; if that is not possible the walk leaves RCU mode and
              * the ref-walk traversal is used.  Returns 0 with @path, *@inode
              * and *@seqp set, or a negative errno: -ENOENT for a NULL
              * *@inode in RCU mode, -ECHILD if RCU mode cannot be left,
              * -EXDEV if a mount was crossed under LOOKUP_NO_XDEV, or the
              * error from traverse_mounts().
              */
1469         bool jumped;
1470         int ret;
1471
1472         path->mnt = nd->path.mnt;
1473         path->dentry = dentry;
1474         if (nd->flags & LOOKUP_RCU) {
                     /* keep the caller's seq: __follow_mount_rcu() may overwrite *seqp */
1475                 unsigned int seq = *seqp;
1476                 if (unlikely(!*inode))
1477                         return -ENOENT;
1478                 if (likely(__follow_mount_rcu(nd, path, inode, seqp)))
1479                         return 0;
1480                 if (!try_to_unlazy_next(nd, dentry, seq))
1481                         return -ECHILD;
1482                 // *path might've been clobbered by __follow_mount_rcu()
1483                 path->mnt = nd->path.mnt;
1484                 path->dentry = dentry;
1485         }
1486         ret = traverse_mounts(path, &jumped, &nd->total_link_count, nd->flags);
1487         if (jumped) {
1488                 if (unlikely(nd->flags & LOOKUP_NO_XDEV))
1489                         ret = -EXDEV;
1490                 else
1491                         nd->state |= ND_JUMPED;
1492         }
1493         if (unlikely(ret)) {
                     /* failure: drop the dentry, and the mount too if it is not nd's own */
1494                 dput(path->dentry);
1495                 if (path->mnt != nd->path.mnt)
1496                         mntput(path->mnt);
1497         } else {
1498                 *inode = d_backing_inode(path->dentry);
1499                 *seqp = 0; /* out of RCU mode, so the value doesn't matter */
1500         }
1501         return ret;
1502 }
1503
1504 /*
1505  * This looks up the name in dcache and possibly revalidates the found dentry.
1506  * NULL is returned if the dentry does not exist in the cache.
      *
      * If revalidation returns 0 the dentry is invalidated and dropped, and
      * ERR_PTR(0) is returned - presumably indistinguishable from the "not in
      * cache" NULL (ERR_PTR() is defined outside this file; confirm).  A
      * negative revalidation result is returned as ERR_PTR() after dropping
      * the dentry.
1507  */
1508 static struct dentry *lookup_dcache(const struct qstr *name,
1509                                     struct dentry *dir,
1510                                     unsigned int flags)
1511 {
1512         struct dentry *dentry = d_lookup(dir, name);
1513         if (dentry) {
1514                 int error = d_revalidate(dentry, flags);
1515                 if (unlikely(error <= 0)) {
1516                         if (!error)
1517                                 d_invalidate(dentry);
1518                         dput(dentry);
1519                         return ERR_PTR(error);
1520                 }
1521         }
1522         return dentry;
1523 }
1524
1525 /*
1526  * Parent directory has inode locked exclusive.  This is one
1527  * and only case when ->lookup() gets called on non in-lookup
1528  * dentries - as the matter of fact, this only gets called
1529  * when directory is guaranteed to have no in-lookup children
1530  * at all.
      *
      * Returns whatever lookup_dcache() produced if that is non-NULL
      * (a dentry or an ERR_PTR).  Otherwise a new dentry is allocated and
      * passed to the directory's ->lookup(); if ->lookup() returns a non-NULL
      * value, the new dentry is dropped and that value is returned instead.
      * -ENOENT is returned for a dead directory and -ENOMEM if allocation
      * fails.
1531  */
1532 static struct dentry *__lookup_hash(const struct qstr *name,
1533                 struct dentry *base, unsigned int flags)
1534 {
1535         struct dentry *dentry = lookup_dcache(name, base, flags);
1536         struct dentry *old;
1537         struct inode *dir = base->d_inode;
1538
1539         if (dentry)
1540                 return dentry;
1541
1542         /* Don't create child dentry for a dead directory. */
1543         if (unlikely(IS_DEADDIR(dir)))
1544                 return ERR_PTR(-ENOENT);
1545
1546         dentry = d_alloc(base, name);
1547         if (unlikely(!dentry))
1548                 return ERR_PTR(-ENOMEM);
1549
1550         old = dir->i_op->lookup(dir, dentry, flags);
1551         if (unlikely(old)) {
                     /* ->lookup() supplied its own result: use it in place of ours */
1552                 dput(dentry);
1553                 dentry = old;
1554         }
1555         return dentry;
1556 }
1557
1558 static struct dentry *lookup_fast(struct nameidata *nd,
1559                                   struct inode **inode,
1560                                   unsigned *seqp)
1561 {
             /*
              * Look up nd->last under nd->path.dentry in the dcache only.
              * Returns the dentry on success, NULL when it is not cached
              * (the caller is expected to fall back to a slower lookup),
              * or an ERR_PTR: -ECHILD when an RCU-mode check fails or RCU
              * mode cannot be left, otherwise the d_revalidate() result.
              * *@inode and *@seqp are written only on the RCU branch.
              */
1562         struct dentry *dentry, *parent = nd->path.dentry;
1563         int status = 1;
1564
1565         /*
1566          * Rename seqlock is not required here because in the off chance
1567          * of a false negative due to a concurrent rename, the caller is
1568          * going to fall back to non-racy lookup.
1569          */
1570         if (nd->flags & LOOKUP_RCU) {
1571                 unsigned seq;
1572                 dentry = __d_lookup_rcu(parent, &nd->last, &seq);
1573                 if (unlikely(!dentry)) {
                             /* cache miss: leave RCU mode so the caller can do a blocking lookup */
1574                         if (!try_to_unlazy(nd))
1575                                 return ERR_PTR(-ECHILD);
1576                         return NULL;
1577                 }
1578
1579                 /*
1580                  * This sequence count validates that the inode matches
1581                  * the dentry name information from lookup.
1582                  */
1583                 *inode = d_backing_inode(dentry);
1584                 if (unlikely(read_seqcount_retry(&dentry->d_seq, seq)))
1585                         return ERR_PTR(-ECHILD);
1586
1587                 /*
1588                  * This sequence count validates that the parent had no
1589                  * changes while we did the lookup of the dentry above.
1590                  *
1591                  * The memory barrier in read_seqcount_begin of child is
1592                  *  enough, we can use __read_seqcount_retry here.
1593                  */
1594                 if (unlikely(__read_seqcount_retry(&parent->d_seq, nd->seq)))
1595                         return ERR_PTR(-ECHILD);
1596
1597                 *seqp = seq;
1598                 status = d_revalidate(dentry, nd->flags);
1599                 if (likely(status > 0))
1600                         return dentry;
                     /* not clearly valid: take references and finish the job in ref-walk mode */
1601                 if (!try_to_unlazy_next(nd, dentry, seq))
1602                         return ERR_PTR(-ECHILD);
1603                 if (status == -ECHILD)
1604                         /* we'd been told to redo it in non-rcu mode */
1605                         status = d_revalidate(dentry, nd->flags);
1606         } else {
1607                 dentry = __d_lookup(parent, &nd->last);
1608                 if (unlikely(!dentry))
1609                         return NULL;
1610                 status = d_revalidate(dentry, nd->flags);
1611         }
             /*
              * Revalidation did not succeed: 0 means invalidate the dentry,
              * a negative value is an error.  Either way the dentry is
              * dropped and ERR_PTR(status) is returned.
              */
1612         if (unlikely(status <= 0)) {
1613                 if (!status)
1614                         d_invalidate(dentry);
1615                 dput(dentry);
1616                 return ERR_PTR(status);
1617         }
1618         return dentry;
1619 }
1620
1621 /* Fast lookup failed, do it the slow way */
/*
 * __lookup_slow - look up @name in directory @dir, asking the filesystem
 * if needed.
 *
 * @name:  component to look up.
 * @dir:   parent directory dentry; dir->d_inode supplies ->lookup().
 * @flags: lookup flags passed on to d_revalidate() and ->lookup().
 *
 * Returns the resulting dentry or an ERR_PTR: -ENOENT if the directory
 * inode is IS_DEADDIR(), whatever d_alloc_parallel() failed with, or a
 * negative d_revalidate() result.
 *
 * lookup_slow() calls this with the directory inode locked shared.
 */
1622 static struct dentry *__lookup_slow(const struct qstr *name,
1623                                     struct dentry *dir,
1624                                     unsigned int flags)
1625 {
1626         struct dentry *dentry, *old;
1627         struct inode *inode = dir->d_inode;
1628         DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
1629
1630         /* Don't go there if it's already dead */
1631         if (unlikely(IS_DEADDIR(inode)))
1632                 return ERR_PTR(-ENOENT);
1633 again:
1634         dentry = d_alloc_parallel(dir, name, &wq);
1635         if (IS_ERR(dentry))
1636                 return dentry;
1637         if (unlikely(!d_in_lookup(dentry))) {
                     /*
                      * The dentry we got back is not in the "in lookup"
                      * state, so ->lookup() is not called for it; it only
                      * gets revalidated.
                      */
1638                 int error = d_revalidate(dentry, flags);
1639                 if (unlikely(error <= 0)) {
1640                         if (!error) {
                                     /* invalid: throw it away and start over */
1641                                 d_invalidate(dentry);
1642                                 dput(dentry);
1643                                 goto again;
1644                         }
1645                         dput(dentry);
1646                         dentry = ERR_PTR(error);
1647                 }
1648         } else {
                     /*
                      * Fresh in-lookup dentry: let the filesystem fill it in.
                      * A non-NULL result from ->lookup() replaces our dentry
                      * (presumably it may also be an ERR_PTR -- verify
                      * against the ->lookup() contract).
                      */
1649                 old = inode->i_op->lookup(inode, dentry, flags);
1650                 d_lookup_done(dentry);
1651                 if (unlikely(old)) {
1652                         dput(dentry);
1653                         dentry = old;
1654                 }
1655         }
1656         return dentry;
1657 }
1658
/*
 * lookup_slow - locked wrapper around __lookup_slow().
 *
 * Takes the shared lock on dir->d_inode, performs the lookup and releases
 * the lock again.  The return value is whatever __lookup_slow() returned
 * (a dentry or an ERR_PTR).
 */
1659 static struct dentry *lookup_slow(const struct qstr *name,
1660                                   struct dentry *dir,
1661                                   unsigned int flags)
1662 {
1663         struct inode *inode = dir->d_inode;
1664         struct dentry *res;
1665         inode_lock_shared(inode);
1666         res = __lookup_slow(name, dir, flags);
1667         inode_unlock_shared(inode);
1668         return res;
1669 }
1670
/*
 * may_lookup - check MAY_EXEC permission on nd->inode (the directory that
 * is about to be searched).
 *
 * In RCU mode the check is first made with MAY_NOT_BLOCK.  Any result
 * other than -ECHILD is final.  On -ECHILD the walk tries to leave RCU
 * mode: if that fails, -ECHILD is returned; if it succeeds, the check is
 * repeated below without MAY_NOT_BLOCK.
 *
 * Returns the inode_permission() result.
 */
1671 static inline int may_lookup(struct user_namespace *mnt_userns,
1672                              struct nameidata *nd)
1673 {
1674         if (nd->flags & LOOKUP_RCU) {
1675                 int err = inode_permission(mnt_userns, nd->inode, MAY_EXEC|MAY_NOT_BLOCK);
1676                 if (err != -ECHILD || !try_to_unlazy(nd))
1677                         return err;
1678         }
1679         return inode_permission(mnt_userns, nd->inode, MAY_EXEC);
1680 }
1681
/*
 * reserve_stack - account for one more symlink and make sure nd->stack
 * has room for it.
 *
 * @nd:   walk state.
 * @link: path of the symlink about to be pushed (used only if RCU mode
 *        has to be left).
 * @seq:  sequence number that goes with @link in RCU mode.
 *
 * Returns 0 when a slot is available, -ELOOP once total_link_count has
 * reached MAXSYMLINKS, -ECHILD if RCU mode could not be left cleanly,
 * and -ENOMEM if the stack could not be allocated.
 */
1682 static int reserve_stack(struct nameidata *nd, struct path *link, unsigned seq)
1683 {
             /* the counter is bumped on every call, even when -ELOOP is returned */
1684         if (unlikely(nd->total_link_count++ >= MAXSYMLINKS))
1685                 return -ELOOP;
1686
             /*
              * Only when depth has reached EMBEDDED_LEVELS while nd->stack
              * still points at nd->internal is nd_alloc_stack() called.
              */
1687         if (likely(nd->depth != EMBEDDED_LEVELS))
1688                 return 0;
1689         if (likely(nd->stack != nd->internal))
1690                 return 0;
1691         if (likely(nd_alloc_stack(nd)))
1692                 return 0;
1693
1694         if (nd->flags & LOOKUP_RCU) {
1695                 // we need to grab link before we do unlazy.  And we can't skip
1696                 // unlazy even if we fail to grab the link - cleanup needs it
1697                 bool grabbed_link = legitimize_path(nd, link, seq);
1698
                     /*
                      * "!try_to_unlazy(nd) != 0" parses as
                      * "(!try_to_unlazy(nd)) != 0", i.e. it is simply true
                      * when try_to_unlazy() failed.
                      */
1699                 if (!try_to_unlazy(nd) != 0 || !grabbed_link)
1700                         return -ECHILD;
1701
                     /* second allocation attempt, now outside RCU mode */
1702                 if (nd_alloc_stack(nd))
1703                         return 0;
1704         }
1705         return -ENOMEM;
1706 }
1707
/*
 * Bits for the 'flags' argument of walk_component(), step_into() and
 * pick_link():
 *   WALK_TRAILING - passed by lookup_last(); step_into() then follows a
 *                   symlink only if LOOKUP_FOLLOW is set, and pick_link()
 *                   calls may_follow_link().
 *   WALK_MORE     - passed by link_path_walk() for a component that is not
 *                   the last one; walk_component() then skips put_link().
 *   WALK_NOFOLLOW - step_into() never follows a symlink.
 */
1708 enum {WALK_TRAILING = 1, WALK_MORE = 2, WALK_NOFOLLOW = 4};
1709
/*
 * pick_link - push symlink @link on nd->stack and fetch its body.
 *
 * @nd:    walk state.
 * @link:  path of the symlink.
 * @inode: inode of the symlink.
 * @seq:   sequence number stored with the stack entry.
 * @flags: WALK_* bits; only WALK_TRAILING is examined here.
 *
 * Returns:
 *   - the link body still to be walked (leading '/' characters removed
 *     after nd_jump_root() has been called for an absolute body);
 *   - NULL when nothing is left to walk, in which case put_link() has
 *     already been called for the entry;
 *   - an ERR_PTR on failure.
 */
1710 static const char *pick_link(struct nameidata *nd, struct path *link,
1711                      struct inode *inode, unsigned seq, int flags)
1712 {
1713         struct saved *last;
1714         const char *res;
1715         int error = reserve_stack(nd, link, seq);
1716
1717         if (unlikely(error)) {
                     /* @link is dropped here only when not in RCU mode */
1718                 if (!(nd->flags & LOOKUP_RCU))
1719                         path_put(link);
1720                 return ERR_PTR(error);
1721         }
             /* claim the next stack slot and record the link in it */
1722         last = nd->stack + nd->depth++;
1723         last->link = *link;
1724         clear_delayed_call(&last->done);
1725         last->seq = seq;
1726
1727         if (flags & WALK_TRAILING) {
1728                 error = may_follow_link(nd, inode);
1729                 if (unlikely(error))
1730                         return ERR_PTR(error);
1731         }
1732
             /* symlink following disabled for this lookup or this mount */
1733         if (unlikely(nd->flags & LOOKUP_NO_SYMLINKS) ||
1734                         unlikely(link->mnt->mnt_flags & MNT_NOSYMFOLLOW))
1735                 return ERR_PTR(-ELOOP);
1736
             /*
              * atime: outside RCU mode it is touched directly; in RCU mode
              * RCU mode is left first, and only if an update is needed.
              */
1737         if (!(nd->flags & LOOKUP_RCU)) {
1738                 touch_atime(&last->link);
1739                 cond_resched();
1740         } else if (atime_needs_update(&last->link, inode)) {
1741                 if (!try_to_unlazy(nd))
1742                         return ERR_PTR(-ECHILD);
1743                 touch_atime(&last->link);
1744         }
1745
1746         error = security_inode_follow_link(link->dentry, inode,
1747                                            nd->flags & LOOKUP_RCU);
1748         if (unlikely(error))
1749                 return ERR_PTR(error);
1750
             /* use inode->i_link if set, otherwise ask ->get_link() */
1751         res = READ_ONCE(inode->i_link);
1752         if (!res) {
1753                 const char * (*get)(struct dentry *, struct inode *,
1754                                 struct delayed_call *);
1755                 get = inode->i_op->get_link;
1756                 if (nd->flags & LOOKUP_RCU) {
                             /*
                              * RCU mode: call with a NULL dentry first; on
                              * -ECHILD leave RCU mode and call again with
                              * the real dentry.
                              */
1757                         res = get(NULL, inode, &last->done);
1758                         if (res == ERR_PTR(-ECHILD) && try_to_unlazy(nd))
1759                                 res = get(link->dentry, inode, &last->done);
1760                 } else {
1761                         res = get(link->dentry, inode, &last->done);
1762                 }
1763                 if (!res)
1764                         goto all_done;
1765                 if (IS_ERR(res))
1766                         return res;
1767         }
1768         if (*res == '/') {
                     /* absolute body: restart from the root, skip all slashes */
1769                 error = nd_jump_root(nd);
1770                 if (unlikely(error))
1771                         return ERR_PTR(error);
1772                 while (unlikely(*++res == '/'))
1773                         ;
1774         }
1775         if (*res)
1776                 return res;
1777 all_done: // pure jump
1778         put_link(nd);
1779         return NULL;
1780 }
1781
1782 /*
1783  * Do we need to follow links? We _really_ want to be able
1784  * to do this check without having to look at inode->i_op,
1785  * so we keep a cache of "no, this doesn't need follow_link"
1786  * for the common case.
 *
 * step_into() first lets handle_mounts() turn @dentry into a path (it may
 * also update inode and seq).  If the result is not a symlink, or the
 * symlink must not be followed (WALK_NOFOLLOW, or WALK_TRAILING without
 * LOOKUP_FOLLOW), nd is moved to that path and NULL is returned.
 * Otherwise the result of pick_link() is returned.  A negative
 * handle_mounts() result is returned as ERR_PTR.
1787  */
1788 static const char *step_into(struct nameidata *nd, int flags,
1789                      struct dentry *dentry, struct inode *inode, unsigned seq)
1790 {
1791         struct path path;
1792         int err = handle_mounts(nd, dentry, &path, &inode, &seq);
1793
1794         if (err < 0)
1795                 return ERR_PTR(err);
1796         if (likely(!d_is_symlink(path.dentry)) ||
1797            ((flags & WALK_TRAILING) && !(nd->flags & LOOKUP_FOLLOW)) ||
1798            (flags & WALK_NOFOLLOW)) {
1799                 /* not a symlink or should not follow */
1800                 if (!(nd->flags & LOOKUP_RCU)) {
                             /*
                              * Drop the old position; the mount is put only
                              * if the new path is on a different mount.
                              */
1801                         dput(nd->path.dentry);
1802                         if (nd->path.mnt != path.mnt)
1803                                 mntput(nd->path.mnt);
1804                 }
1805                 nd->path = path;
1806                 nd->inode = inode;
1807                 nd->seq = seq;
1808                 return NULL;
1809         }
1810         if (nd->flags & LOOKUP_RCU) {
1811                 /* make sure that d_is_symlink above matches inode */
1812                 if (read_seqcount_retry(&path.dentry->d_seq, seq))
1813                         return ERR_PTR(-ECHILD);
1814         } else {
                     /*
                      * NOTE(review): presumably path has no mount reference
                      * of its own when it is on nd's mount, so one is taken
                      * before the path is handed to pick_link() -- confirm
                      * against handle_mounts().
                      */
1815                 if (path.mnt == nd->path.mnt)
1816                         mntget(path.mnt);
1817         }
1818         return pick_link(nd, &path, inode, seq, flags);
1819 }
1820
/*
 * follow_dotdot_rcu - find the parent for ".." while in RCU mode.
 *
 * @nd:     walk state; nd->path/inode/seq are updated if a mount boundary
 *          is crossed upwards.
 * @inodep: set to the parent's inode when a parent is returned.
 * @seqp:   set to the parent's d_seq sample when a parent is returned.
 *
 * Returns the parent dentry, NULL when the walk is already at nd->root (or
 * choose_mountpoint_rcu() finds nothing to go up to), or ERR_PTR(-ECHILD)
 * whenever a check fails.  Every failure here is reported as -ECHILD.
 */
1821 static struct dentry *follow_dotdot_rcu(struct nameidata *nd,
1822                                         struct inode **inodep,
1823                                         unsigned *seqp)
1824 {
1825         struct dentry *parent, *old;
1826
1827         if (path_equal(&nd->path, &nd->root))
1828                 goto in_root;
1829         if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) {
                     /* at the root of a mount: step to its mountpoint first */
1830                 struct path path;
1831                 unsigned seq;
1832                 if (!choose_mountpoint_rcu(real_mount(nd->path.mnt),
1833                                            &nd->root, &path, &seq))
1834                         goto in_root;
1835                 if (unlikely(nd->flags & LOOKUP_NO_XDEV))
1836                         return ERR_PTR(-ECHILD);
1837                 nd->path = path;
1838                 nd->inode = path.dentry->d_inode;
1839                 nd->seq = seq;
1840                 if (unlikely(read_seqretry(&mount_lock, nd->m_seq)))
1841                         return ERR_PTR(-ECHILD);
1842                 /* we know that mountpoint was pinned */
1843         }
1844         old = nd->path.dentry;
1845         parent = old->d_parent;
1846         *inodep = parent->d_inode;
1847         *seqp = read_seqcount_begin(&parent->d_seq);
             /* the child must not have changed while d_parent was read */
1848         if (unlikely(read_seqcount_retry(&old->d_seq, nd->seq)))
1849                 return ERR_PTR(-ECHILD);
1850         if (unlikely(!path_connected(nd->path.mnt, parent)))
1851                 return ERR_PTR(-ECHILD);
1852         return parent;
1853 in_root:
1854         if (unlikely(read_seqretry(&mount_lock, nd->m_seq)))
1855                 return ERR_PTR(-ECHILD);
1856         if (unlikely(nd->flags & LOOKUP_BENEATH))
1857                 return ERR_PTR(-ECHILD);
1858         return NULL;
1859 }
1860
/*
 * follow_dotdot - find the parent for ".." outside RCU mode.
 *
 * @nd:     walk state; nd->path/inode are replaced if a mount boundary is
 *          crossed upwards (the old path is put).
 * @inodep: set to the parent's inode when a parent is returned.
 * @seqp:   set to 0 when a parent is returned.
 *
 * Returns:
 *   - the parent dentry obtained with dget_parent();
 *   - NULL when the walk stays at the root; a dget() is done on
 *     nd->path.dentry first (handle_dots() then passes that dentry to
 *     step_into());
 *   - ERR_PTR(-EXDEV) for LOOKUP_NO_XDEV / LOOKUP_BENEATH violations;
 *   - ERR_PTR(-ENOENT) if the parent fails path_connected().
 */
1861 static struct dentry *follow_dotdot(struct nameidata *nd,
1862                                  struct inode **inodep,
1863                                  unsigned *seqp)
1864 {
1865         struct dentry *parent;
1866
1867         if (path_equal(&nd->path, &nd->root))
1868                 goto in_root;
1869         if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) {
1870                 struct path path;
1871
1872                 if (!choose_mountpoint(real_mount(nd->path.mnt),
1873                                        &nd->root, &path))
1874                         goto in_root;
                     /* nd is moved to the mountpoint before the XDEV check */
1875                 path_put(&nd->path);
1876                 nd->path = path;
1877                 nd->inode = path.dentry->d_inode;
1878                 if (unlikely(nd->flags & LOOKUP_NO_XDEV))
1879                         return ERR_PTR(-EXDEV);
1880         }
1881         /* rare case of legitimate dget_parent()... */
1882         parent = dget_parent(nd->path.dentry);
1883         if (unlikely(!path_connected(nd->path.mnt, parent))) {
1884                 dput(parent);
1885                 return ERR_PTR(-ENOENT);
1886         }
1887         *seqp = 0;
1888         *inodep = parent->d_inode;
1889         return parent;
1890
1891 in_root:
1892         if (unlikely(nd->flags & LOOKUP_BENEATH))
1893                 return ERR_PTR(-EXDEV);
1894         dget(nd->path.dentry);
1895         return NULL;
1896 }
1897
/*
 * handle_dots - process a "." or ".." component.
 *
 * @nd:   walk state.
 * @type: the component type; only LAST_DOTDOT causes any work, every
 *        other value returns NULL immediately.
 *
 * For "..", the root is set up if nd->root.mnt is still NULL, the parent
 * is found with follow_dotdot_rcu() or follow_dotdot(), and the walk steps
 * into it with WALK_NOFOLLOW.  When the helper returns NULL the walk steps
 * into the current nd->path.dentry instead.
 *
 * Returns NULL on success or an ERR_PTR.
 */
1898 static const char *handle_dots(struct nameidata *nd, int type)
1899 {
1900         if (type == LAST_DOTDOT) {
1901                 const char *error = NULL;
1902                 struct dentry *parent;
1903                 struct inode *inode;
1904                 unsigned seq;
1905
1906                 if (!nd->root.mnt) {
                             /* a non-zero set_root() result is returned as ERR_PTR */
1907                         error = ERR_PTR(set_root(nd));
1908                         if (error)
1909                                 return error;
1910                 }
1911                 if (nd->flags & LOOKUP_RCU)
1912                         parent = follow_dotdot_rcu(nd, &inode, &seq);
1913                 else
1914                         parent = follow_dotdot(nd, &inode, &seq);
1915                 if (IS_ERR(parent))
1916                         return ERR_CAST(parent);
1917                 if (unlikely(!parent))
1918                         error = step_into(nd, WALK_NOFOLLOW,
1919                                          nd->path.dentry, nd->inode, nd->seq);
1920                 else
1921                         error = step_into(nd, WALK_NOFOLLOW,
1922                                          parent, inode, seq);
1923                 if (unlikely(error))
1924                         return error;
1925
1926                 if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) {
1927                         /*
1928                          * If there was a racing rename or mount along our
1929                          * path, then we can't be sure that ".." hasn't jumped
1930                          * above nd->root (and so userspace should retry or use
1931                          * some fallback).
1932                          */
1933                         smp_rmb();
1934                         if (unlikely(__read_seqcount_retry(&mount_lock.seqcount, nd->m_seq)))
1935                                 return ERR_PTR(-EAGAIN);
1936                         if (unlikely(__read_seqcount_retry(&rename_lock.seqcount, nd->r_seq)))
1937                                 return ERR_PTR(-EAGAIN);
1938                 }
1939         }
1940         return NULL;
1941 }
1942
/*
 * walk_component - advance the walk by the component stored in nd->last.
 *
 * @nd:    walk state (nd->last, nd->last_type describe the component).
 * @flags: WALK_* bits.  Without WALK_MORE, and if nd->depth is non-zero,
 *         put_link() is called before stepping on.
 *
 * "." and ".." go to handle_dots().  Anything else is looked up with
 * lookup_fast() and, if that returns NULL, with lookup_slow(); the result
 * is passed to step_into().
 *
 * Returns what handle_dots()/step_into() return (NULL, a symlink body to
 * follow, or an ERR_PTR), or the lookup error as ERR_PTR.
 */
1943 static const char *walk_component(struct nameidata *nd, int flags)
1944 {
1945         struct dentry *dentry;
1946         struct inode *inode;
1947         unsigned seq;
1948         /*
1949          * "." and ".." are special - ".." especially so because it has
1950          * to be able to know about the current root directory and
1951          * parent relationships.
1952          */
1953         if (unlikely(nd->last_type != LAST_NORM)) {
1954                 if (!(flags & WALK_MORE) && nd->depth)
1955                         put_link(nd);
1956                 return handle_dots(nd, nd->last_type);
1957         }
1958         dentry = lookup_fast(nd, &inode, &seq);
1959         if (IS_ERR(dentry))
1960                 return ERR_CAST(dentry);
1961         if (unlikely(!dentry)) {
                     /*
                      * NOTE(review): on this path neither lookup_fast() nor
                      * lookup_slow() has written inode/seq; step_into()
                      * passes their addresses to handle_mounts(), which
                      * presumably fills them in -- confirm.
                      */
1962                 dentry = lookup_slow(&nd->last, nd->path.dentry, nd->flags);
1963                 if (IS_ERR(dentry))
1964                         return ERR_CAST(dentry);
1965         }
1966         if (!(flags & WALK_MORE) && nd->depth)
1967                 put_link(nd);
1968         return step_into(nd, flags, dentry, inode, seq);
1969 }
1970
1971 /*
1972  * We can do the critical dentry name comparison and hashing
1973  * operations one word at a time, but we are limited to:
1974  *
1975  * - Architectures with fast unaligned word accesses. We could
1976  *   do a "get_unaligned()" if this helps and is sufficiently
1977  *   fast.
1978  *
1979  * - non-CONFIG_DEBUG_PAGEALLOC configurations (so that we
1980  *   do not trap on the (extremely unlikely) case of a page
1981  *   crossing operation).
1982  *
1983  * - Furthermore, we need an efficient 64-bit compile for the
1984  *   64-bit case in order to generate the "number of bytes in
1985  *   the final mask". Again, that could be replaced with an
1986  *   efficient population count instruction or similar.
1987  */
1988 #ifdef CONFIG_DCACHE_WORD_ACCESS
1989
1990 #include <asm/word-at-a-time.h>
1991
1992 #ifdef HASH_MIX
1993
1994 /* Architecture provides HASH_MIX and fold_hash() in <asm/hash.h> */
1995
1996 #elif defined(CONFIG_64BIT)
1997 /*
1998  * Register pressure in the mixing function is an issue, particularly
1999  * on 32-bit x86, but almost any function requires one state value and
2000  * one temporary.  Instead, use a function designed for two state values
2001  * and no temporaries.
2002  *
2003  * This function cannot create a collision in only two iterations, so
2004  * we have two iterations to achieve avalanche.  In those two iterations,
2005  * we have six layers of mixing, which is enough to spread one bit's
2006  * influence out to 2^6 = 64 state bits.
2007  *
2008  * Rotate constants are scored by considering either 64 one-bit input
2009  * deltas or 64*63/2 = 2016 two-bit input deltas, and finding the
2010  * probability of that delta causing a change to each of the 128 output
2011  * bits, using a sample of random initial states.
2012  *
2013  * The Shannon entropy of the computed probabilities is then summed
2014  * to produce a score.  Ideally, any input change has a 50% chance of
2015  * toggling any given output bit.
2016  *
2017  * Mixing scores (in bits) for (12,45):
2018  * Input delta: 1-bit      2-bit
2019  * 1 round:     713.3    42542.6
2020  * 2 rounds:   2753.7   140389.8
2021  * 3 rounds:   5954.1   233458.2
2022  * 4 rounds:   7862.6   256672.2
2023  * Perfect:    8192     258048
2024  *            (64*128) (64*63/2 * 128)
2025  */
/*
 * HASH_MIX(x, y, a) - fold input word 'a' into the two state words x, y
 * (64-bit variant).  In order: x ^= a; y ^= x; x is rotated left by 12;
 * x += y; y is rotated left by 45; y *= 9.  x and y are modified in place
 * and are each evaluated several times, so they must be plain lvalues.
 */
2026 #define HASH_MIX(x, y, a)       \
2027         (       x ^= (a),       \
2028         y ^= x, x = rol64(x,12),\
2029         x += y, y = rol64(y,45),\
2030         y *= 9                  )
2031
2032 /*
2033  * Fold two longs into one 32-bit hash value.  This must be fast, but
2034  * latency isn't quite as critical, as there is a fair bit of additional
2035  * work done before the hash value is used.
 *
 * 64-bit variant: x is multiplied by GOLDEN_RATIO_64 and xor-ed into y,
 * y is multiplied by GOLDEN_RATIO_64 again, and the upper 32 bits of the
 * result are returned.
2036  */
2037 static inline unsigned int fold_hash(unsigned long x, unsigned long y)
2038 {
2039         y ^= x * GOLDEN_RATIO_64;
2040         y *= GOLDEN_RATIO_64;
2041         return y >> 32;
2042 }
2043
2044 #else   /* 32-bit case */
2045
2046 /*
2047  * Mixing scores (in bits) for (7,20):
2048  * Input delta: 1-bit      2-bit
2049  * 1 round:     330.3     9201.6
2050  * 2 rounds:   1246.4    25475.4
2051  * 3 rounds:   1907.1    31295.1
2052  * 4 rounds:   2042.3    31718.6
2053  * Perfect:    2048      31744
2054  *            (32*64)   (32*31/2 * 64)
2055  */
/*
 * HASH_MIX(x, y, a) - 32-bit variant of the mixing step: same sequence of
 * operations as the 64-bit one, but with rol32() and the rotate counts
 * 7 and 20.  x and y are modified in place.
 */
2056 #define HASH_MIX(x, y, a)       \
2057         (       x ^= (a),       \
2058         y ^= x, x = rol32(x, 7),\
2059         x += y, y = rol32(y,20),\
2060         y *= 9                  )
2061
/*
 * Fold the two state words into one hash value (32-bit variant):
 * __hash_32() is applied to x, the result is xor-ed with y, and
 * __hash_32() is applied once more.
 */
2062 static inline unsigned int fold_hash(unsigned long x, unsigned long y)
2063 {
2064         /* Use arch-optimized multiply if one exists */
2065         return __hash_32(y ^ __hash_32(x));
2066 }
2067
2068 #endif
2069
2070 /*
2071  * Return the hash of a string of known length.  This is carefully
2072  * designed to match hash_name(), which is the more critical function.
2073  * In particular, we must end by hashing a final word containing 0..7
2074  * payload bytes, to match the way that hash_name() iterates until it
2075  * finds the delimiter after the name.
 *
 * @salt: pointer value used as the initial content of the y state word.
 * @name: bytes to hash (need not be NUL-terminated; @len bounds them).
 * @len:  number of bytes of @name to hash.
 *
 * Returns fold_hash() of the two state words.
2076  */
2077 unsigned int full_name_hash(const void *salt, const char *name, unsigned int len)
2078 {
2079         unsigned long a, x = 0, y = (unsigned long)salt;
2080
2081         for (;;) {
                     /* length was a multiple of the word size: no tail to mix */
2082                 if (!len)
2083                         goto done;
2084                 a = load_unaligned_zeropad(name);
                     /* fewer bytes than a word remain: handle the tail below */
2085                 if (len < sizeof(unsigned long))
2086                         break;
2087                 HASH_MIX(x, y, a);
2088                 name += sizeof(unsigned long);
2089                 len -= sizeof(unsigned long);
2090         }
             /* keep only the 'len' payload bytes of the last word */
2091         x ^= a & bytemask_from_count(len);
2092 done:
2093         return fold_hash(x, y);
2094 }
2095 EXPORT_SYMBOL(full_name_hash);
2096
2097 /* Return the "hash_len" (hash and length) of a null-terminated string */
/*
 * Word-at-a-time variant.  Words are loaded with load_unaligned_zeropad()
 * until one contains a zero byte; every earlier word is mixed in full with
 * HASH_MIX(), the last one is masked with zero_bytemask() first.  The
 * length is the number of bytes in the full words plus find_zero(mask).
 *
 * @salt: pointer value used as the initial content of the y state word.
 * @name: NUL-terminated string.
 */
2098 u64 hashlen_string(const void *salt, const char *name)
2099 {
2100         unsigned long a = 0, x = 0, y = (unsigned long)salt;
2101         unsigned long adata, mask, len;
2102         const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
2103
2104         len = 0;
             /* enter the loop at the load, so the first pass mixes nothing */
2105         goto inside;
2106
2107         do {
2108                 HASH_MIX(x, y, a);
2109                 len += sizeof(unsigned long);
2110 inside:
2111                 a = load_unaligned_zeropad(name+len);
2112         } while (!has_zero(a, &adata, &constants));
2113
2114         adata = prep_zero_mask(a, adata, &constants);
2115         mask = create_zero_mask(adata);
2116         x ^= a & zero_bytemask(mask);
2117
2118         return hashlen_create(fold_hash(x, y), len + find_zero(mask));
2119 }
2120 EXPORT_SYMBOL(hashlen_string);
2121
2122 /*
2123  * Calculate the length and hash of the path component, and
2124  * return the "hash_len" as the result.
 *
 * Word-at-a-time variant.  The component ends at the first NUL or '/'
 * byte: 'b' is the loaded word xor-ed with a word of repeated '/' bytes,
 * so a zero byte in 'b' marks a '/' in 'a'.
 *
 * @salt: pointer value used as the initial content of the y state word.
 * @name: start of the component.
2125  */
2126 static inline u64 hash_name(const void *salt, const char *name)
2127 {
2128         unsigned long a = 0, b, x = 0, y = (unsigned long)salt;
2129         unsigned long adata, bdata, mask, len;
2130         const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
2131
2132         len = 0;
             /* enter the loop at the load, so the first pass mixes nothing */
2133         goto inside;
2134
2135         do {
2136                 HASH_MIX(x, y, a);
2137                 len += sizeof(unsigned long);
2138 inside:
2139                 a = load_unaligned_zeropad(name+len);
2140                 b = a ^ REPEAT_BYTE('/');
             /*
              * Bitwise '|' rather than '||': both has_zero() calls run, so
              * both adata and bdata are written before they are used below.
              */
2141         } while (!(has_zero(a, &adata, &constants) | has_zero(b, &bdata, &constants)));
2142
2143         adata = prep_zero_mask(a, adata, &constants);
2144         bdata = prep_zero_mask(b, bdata, &constants);
2145         mask = create_zero_mask(adata | bdata);
2146         x ^= a & zero_bytemask(mask);
2147
2148         return hashlen_create(fold_hash(x, y), len + find_zero(mask));
2149 }
2150
2151 #else   /* !CONFIG_DCACHE_WORD_ACCESS: Slow, byte-at-a-time version */
2152
2153 /* Return the hash of a string of known length */
/*
 * Byte-at-a-time variant: starts from init_name_hash(salt), feeds each of
 * the @len bytes (as unsigned char) through partial_name_hash() and
 * finishes with end_name_hash().  With @len == 0 no byte is read.
 */
2154 unsigned int full_name_hash(const void *salt, const char *name, unsigned int len)
2155 {
2156         unsigned long hash = init_name_hash(salt);
2157         while (len--)
2158                 hash = partial_name_hash((unsigned char)*name++, hash);
2159         return end_name_hash(hash);
2160 }
2161 EXPORT_SYMBOL(full_name_hash);
2162
2163 /* Return the "hash_len" (hash and length) of a null-terminated string */
/*
 * Byte-at-a-time variant: hashes every byte up to, but not including, the
 * terminating NUL and counts them in 'len'.  An empty string yields the
 * hash of no bytes and a length of 0.
 */
2164 u64 hashlen_string(const void *salt, const char *name)
2165 {
2166         unsigned long hash = init_name_hash(salt);
2167         unsigned long len = 0, c;
2168
2169         c = (unsigned char)*name;
2170         while (c) {
2171                 len++;
2172                 hash = partial_name_hash(c, hash);
2173                 c = (unsigned char)name[len];
2174         }
2175         return hashlen_create(end_name_hash(hash), len);
2176 }
2177 EXPORT_SYMBOL(hashlen_string);
2178
2179 /*
2180  * We know there's a real path component here of at least
2181  * one character.
 *
 * Byte-at-a-time variant of hash_name(): the do/while loop always hashes
 * the first byte and then continues until the next byte is NUL or '/'.
 * Returns the "hash_len" (hash and length) of the component.
2182  */
2183 static inline u64 hash_name(const void *salt, const char *name)
2184 {
2185         unsigned long hash = init_name_hash(salt);
2186         unsigned long len = 0, c;
2187
2188         c = (unsigned char)*name;
2189         do {
2190                 len++;
2191                 hash = partial_name_hash(c, hash);
2192                 c = (unsigned char)name[len];
2193         } while (c && c != '/');
2194         return hashlen_create(end_name_hash(hash), len);
2195 }
2196
2197 #endif
2198
2199 /*
2200  * Name resolution.
2201  * This is the basic name resolution function, turning a pathname into
2202  * the final dentry. We expect 'base' to be positive and a directory.
2203  *
2204  * Returns 0 and nd will have valid dentry and mnt on success.
2205  * Returns error and drops reference to input namei data on failure.
 *
 * As written below, the loop walks every component except the final one
 * of the outermost name: when the end of @name is reached with no nested
 * symlink pending (depth == 0) it returns 0 and leaves the last component
 * described by nd->last / nd->last_type.  LOOKUP_PARENT is set on entry
 * and cleared only on that return path.  If @name is an ERR_PTR, its
 * error code is returned.
2206  */
2207 static int link_path_walk(const char *name, struct nameidata *nd)
2208 {
2209         int depth = 0; // depth <= nd->depth
2210         int err;
2211
2212         nd->last_type = LAST_ROOT;
2213         nd->flags |= LOOKUP_PARENT;
2214         if (IS_ERR(name))
2215                 return PTR_ERR(name);
2216         while (*name=='/')
2217                 name++;
2218         if (!*name) {
2219                 nd->dir_mode = 0; // short-circuit the 'hardening' idiocy
2220                 return 0;
2221         }
2222
2223         /* At this point we know we have a real path component. */
2224         for(;;) {
2225                 struct user_namespace *mnt_userns;
2226                 const char *link;
2227                 u64 hash_len;
2228                 int type;
2229
                     /* search permission on the current directory */
2230                 mnt_userns = mnt_user_ns(nd->path.mnt);
2231                 err = may_lookup(mnt_userns, nd);
2232                 if (err)
2233                         return err;
2234
                     /* hash and measure the component; the parent dentry is the salt */
2235                 hash_len = hash_name(nd->path.dentry, name);
2236
                     /* classify: "." -> LAST_DOT, ".." -> LAST_DOTDOT, else LAST_NORM */
2237                 type = LAST_NORM;
2238                 if (name[0] == '.') switch (hashlen_len(hash_len)) {
2239                         case 2:
2240                                 if (name[1] == '.') {
2241                                         type = LAST_DOTDOT;
2242                                         nd->state |= ND_JUMPED;
2243                                 }
2244                                 break;
2245                         case 1:
2246                                 type = LAST_DOT;
2247                 }
2248                 if (likely(type == LAST_NORM)) {
2249                         struct dentry *parent = nd->path.dentry;
2250                         nd->state &= ~ND_JUMPED;
2251                         if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
                                     /*
                                      * The parent provides its own ->d_hash();
                                      * hash_len and name are taken back from
                                      * the qstr it was given.
                                      */
2252                                 struct qstr this = { { .hash_len = hash_len }, .name = name };
2253                                 err = parent->d_op->d_hash(parent, &this);
2254                                 if (err < 0)
2255                                         return err;
2256                                 hash_len = this.hash_len;
2257                                 name = this.name;
2258                         }
2259                 }
2260
2261                 nd->last.hash_len = hash_len;
2262                 nd->last.name = name;
2263                 nd->last_type = type;
2264
2265                 name += hashlen_len(hash_len);
2266                 if (!*name)
2267                         goto OK;
2268                 /*
2269                  * If it wasn't NUL, we know it was '/'. Skip that
2270                  * slash, and continue until no more slashes.
2271                  */
2272                 do {
2273                         name++;
2274                 } while (unlikely(*name == '/'));
2275                 if (unlikely(!*name)) {
2276 OK:
2277                         /* pathname or trailing symlink, done */
2278                         if (!depth) {
2279                                 nd->dir_uid = i_uid_into_mnt(mnt_userns, nd->inode);
2280                                 nd->dir_mode = nd->inode->i_mode;
2281                                 nd->flags &= ~LOOKUP_PARENT;
2282                                 return 0;
2283                         }
2284                         /* last component of nested symlink */
                             /* resume the name saved when that symlink was entered */
2285                         name = nd->stack[--depth].name;
2286                         link = walk_component(nd, 0);
2287                 } else {
2288                         /* not the last component */
2289                         link = walk_component(nd, WALK_MORE);
2290                 }
2291                 if (unlikely(link)) {
2292                         if (IS_ERR(link))
2293                                 return PTR_ERR(link);
2294                         /* a symlink to follow */
                             /* save the rest of the current name, continue in the link body */
2295                         nd->stack[depth++].name = name;
2296                         name = link;
2297                         continue;
2298                 }
2299                 if (unlikely(!d_can_lookup(nd->path.dentry))) {
                             /* more components follow but this is not a searchable directory */
2300                         if (nd->flags & LOOKUP_RCU) {
2301                                 if (!try_to_unlazy(nd))
2302                                         return -ECHILD;
2303                         }
2304                         return -ENOTDIR;
2305                 }
2306         }
2307 }
2308
2309 /* must be paired with terminate_walk() */
/*
 * path_init - set up @nd for a walk of nd->name->name.
 *
 * @nd:    walk state to initialise.
 * @flags: LOOKUP_* flags; stored in nd->flags (LOOKUP_RCU is dropped
 *         first if the pathname is empty).
 *
 * Chooses the starting point, in this order:
 *   1. nd->root, if ND_ROOT_PRESET is set in nd->state;
 *   2. the root via nd_jump_root(), for a name starting with '/' unless
 *      LOOKUP_IN_ROOT is set;
 *   3. the current working directory, if nd->dfd == AT_FDCWD;
 *   4. the path of the file behind descriptor nd->dfd.
 * In RCU mode only sequence numbers are sampled; otherwise references
 * are taken with path_get()/get_fs_pwd().
 *
 * Returns the pathname string to walk, or an ERR_PTR: -EAGAIN for
 * LOOKUP_CACHED without LOOKUP_RCU, -ENOTDIR if a non-empty name starts
 * at something that fails d_can_lookup(), -EBADF for a bad descriptor, or
 * the nd_jump_root() error.
 */
2310 static const char *path_init(struct nameidata *nd, unsigned flags)
2311 {
2312         int error;
2313         const char *s = nd->name->name;
2314
2315         /* LOOKUP_CACHED requires RCU, ask caller to retry */
2316         if ((flags & (LOOKUP_RCU | LOOKUP_CACHED)) == LOOKUP_CACHED)
2317                 return ERR_PTR(-EAGAIN);
2318
             /* an empty pathname is never walked in RCU mode */
2319         if (!*s)
2320                 flags &= ~LOOKUP_RCU;
2321         if (flags & LOOKUP_RCU)
2322                 rcu_read_lock();
2323
2324         nd->flags = flags;
2325         nd->state |= ND_JUMPED;
2326
             /*
              * Sample the mount and rename sequence counters; handle_dots()
              * compares against them for scoped lookups.
              */
2327         nd->m_seq = __read_seqcount_begin(&mount_lock.seqcount);
2328         nd->r_seq = __read_seqcount_begin(&rename_lock.seqcount);
2329         smp_rmb();
2330
2331         if (nd->state & ND_ROOT_PRESET) {
2332                 struct dentry *root = nd->root.dentry;
2333                 struct inode *inode = root->d_inode;
2334                 if (*s && unlikely(!d_can_lookup(root)))
2335                         return ERR_PTR(-ENOTDIR);
2336                 nd->path = nd->root;
2337                 nd->inode = inode;
2338                 if (flags & LOOKUP_RCU) {
2339                         nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
2340                         nd->root_seq = nd->seq;
2341                 } else {
2342                         path_get(&nd->path);
2343                 }
2344                 return s;
2345         }
2346
2347         nd->root.mnt = NULL;
2348
2349         /* Absolute pathname -- fetch the root (LOOKUP_IN_ROOT uses nd->dfd). */
2350         if (*s == '/' && !(flags & LOOKUP_IN_ROOT)) {
2351                 error = nd_jump_root(nd);
2352                 if (unlikely(error))
2353                         return ERR_PTR(error);
2354                 return s;
2355         }
2356
2357         /* Relative pathname -- get the starting-point it is relative to. */
2358         if (nd->dfd == AT_FDCWD) {
2359                 if (flags & LOOKUP_RCU) {
2360                         struct fs_struct *fs = current->fs;
2361                         unsigned seq;
2362
                             /* retry until pwd was read without fs->seq changing */
2363                         do {
2364                                 seq = read_seqcount_begin(&fs->seq);
2365                                 nd->path = fs->pwd;
2366                                 nd->inode = nd->path.dentry->d_inode;
2367                                 nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
2368                         } while (read_seqcount_retry(&fs->seq, seq));
2369                 } else {
2370                         get_fs_pwd(current->fs, &nd->path);
2371                         nd->inode = nd->path.dentry->d_inode;
2372                 }
2373         } else {
2374                 /* Caller must check execute permissions on the starting path component */
2375                 struct fd f = fdget_raw(nd->dfd);
2376                 struct dentry *dentry;
2377
2378                 if (!f.file)
2379                         return ERR_PTR(-EBADF);
2380
2381                 dentry = f.file->f_path.dentry;
2382
                     /* a non-empty name needs a directory to start from */
2383                 if (*s && unlikely(!d_can_lookup(dentry))) {
2384                         fdput(f);
2385                         return ERR_PTR(-ENOTDIR);
2386                 }
2387
2388                 nd->path = f.file->f_path;
2389                 if (flags & LOOKUP_RCU) {
2390                         nd->inode = nd->path.dentry->d_inode;
2391                         nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
2392                 } else {
2393                         path_get(&nd->path);
2394                         nd->inode = nd->path.dentry->d_inode;
2395                 }
2396                 fdput(f);
2397         }
2398
2399         /* For scoped-lookups we need to set the root to the dirfd as well. */
2400         if (flags & LOOKUP_IS_SCOPED) {
2401                 nd->root = nd->path;
2402                 if (flags & LOOKUP_RCU) {
2403                         nd->root_seq = nd->seq;
2404                 } else {
                             /* the root reference taken here is flagged in nd->state */
2405                         path_get(&nd->root);
2406                         nd->state |= ND_ROOT_GRABBED;
2407                 }
2408         }
2409         return s;
2410 }
2411
2412 static inline const char *lookup_last(struct nameidata *nd)
2413 {
2414         if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len])
2415                 nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
2416
2417         return walk_component(nd, WALK_TRAILING);
2418 }
2419
2420 static int handle_lookup_down(struct nameidata *nd)
2421 {
2422         if (!(nd->flags & LOOKUP_RCU))
2423                 dget(nd->path.dentry);
2424         return PTR_ERR(step_into(nd, WALK_NOFOLLOW,
2425                         nd->path.dentry, nd->inode, nd->seq));
2426 }
2427
2428 /* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
2429 static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path)
2430 {
2431         const char *s = path_init(nd, flags);
2432         int err;
2433
2434         if (unlikely(flags & LOOKUP_DOWN) && !IS_ERR(s)) {
2435                 err = handle_lookup_down(nd);
2436                 if (unlikely(err < 0))
2437                         s = ERR_PTR(err);
2438         }
2439
2440         while (!(err = link_path_walk(s, nd)) &&
2441                (s = lookup_last(nd)) != NULL)
2442                 ;
2443         if (!err && unlikely(nd->flags & LOOKUP_MOUNTPOINT)) {
2444                 err = handle_lookup_down(nd);
2445                 nd->state &= ~ND_JUMPED; // no d_weak_revalidate(), please...
2446         }
2447         if (!err)
2448                 err = complete_walk(nd);
2449
2450         if (!err && nd->flags & LOOKUP_DIRECTORY)
2451                 if (!d_can_lookup(nd->path.dentry))
2452                         err = -ENOTDIR;
2453         if (!err) {
2454                 *path = nd->path;
2455                 nd->path.mnt = NULL;
2456                 nd->path.dentry = NULL;
2457         }
2458         terminate_walk(nd);
2459         return err;
2460 }
2461
2462 int filename_lookup(int dfd, struct filename *name, unsigned flags,
2463                     struct path *path, struct path *root)
2464 {
2465         int retval;
2466         struct nameidata nd;
2467         if (IS_ERR(name))
2468                 return PTR_ERR(name);
2469         set_nameidata(&nd, dfd, name, root);
2470         retval = path_lookupat(&nd, flags | LOOKUP_RCU, path);
2471         if (unlikely(retval == -ECHILD))
2472                 retval = path_lookupat(&nd, flags, path);
2473         if (unlikely(retval == -ESTALE))
2474                 retval = path_lookupat(&nd, flags | LOOKUP_REVAL, path);
2475
2476         if (likely(!retval))
2477                 audit_inode(name, path->dentry,
2478                             flags & LOOKUP_MOUNTPOINT ? AUDIT_INODE_NOEVAL : 0);
2479         restore_nameidata();
2480         putname(name);
2481         return retval;
2482 }
2483
2484 /* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
2485 static int path_parentat(struct nameidata *nd, unsigned flags,
2486                                 struct path *parent)
2487 {
2488         const char *s = path_init(nd, flags);
2489         int err = link_path_walk(s, nd);
2490         if (!err)
2491                 err = complete_walk(nd);
2492         if (!err) {
2493                 *parent = nd->path;
2494                 nd->path.mnt = NULL;
2495                 nd->path.dentry = NULL;
2496         }
2497         terminate_walk(nd);
2498         return err;
2499 }
2500
2501 static struct filename *filename_parentat(int dfd, struct filename *name,
2502                                 unsigned int flags, struct path *parent,
2503                                 struct qstr *last, int *type)
2504 {
2505         int retval;
2506         struct nameidata nd;
2507
2508         if (IS_ERR(name))
2509                 return name;
2510         set_nameidata(&nd, dfd, name, NULL);
2511         retval = path_parentat(&nd, flags | LOOKUP_RCU, parent);
2512         if (unlikely(retval == -ECHILD))
2513                 retval = path_parentat(&nd, flags, parent);
2514         if (unlikely(retval == -ESTALE))
2515                 retval = path_parentat(&nd, flags | LOOKUP_REVAL, parent);
2516         if (likely(!retval)) {
2517                 *last = nd.last;
2518                 *type = nd.last_type;
2519                 audit_inode(name, parent->dentry, AUDIT_INODE_PARENT);
2520         } else {
2521                 putname(name);
2522                 name = ERR_PTR(retval);
2523         }
2524         restore_nameidata();
2525         return name;
2526 }
2527
2528 /* does lookup, returns the object with parent locked */
2529 struct dentry *kern_path_locked(const char *name, struct path *path)
2530 {
2531         struct filename *filename;
2532         struct dentry *d;
2533         struct qstr last;
2534         int type;
2535
2536         filename = filename_parentat(AT_FDCWD, getname_kernel(name), 0, path,
2537                                     &last, &type);
2538         if (IS_ERR(filename))
2539                 return ERR_CAST(filename);
2540         if (unlikely(type != LAST_NORM)) {
2541                 path_put(path);
2542                 putname(filename);
2543                 return ERR_PTR(-EINVAL);
2544         }
2545         inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT);
2546         d = __lookup_hash(&last, path->dentry, 0);
2547         if (IS_ERR(d)) {
2548                 inode_unlock(path->dentry->d_inode);
2549                 path_put(path);
2550         }
2551         putname(filename);
2552         return d;
2553 }
2554
2555 int kern_path(const char *name, unsigned int flags, struct path *path)
2556 {
2557         return filename_lookup(AT_FDCWD, getname_kernel(name),
2558                                flags, path, NULL);
2559 }
2560 EXPORT_SYMBOL(kern_path);
2561
2562 /**
2563  * vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair
2564  * @dentry:  pointer to dentry of the base directory
2565  * @mnt: pointer to vfs mount of the base directory
2566  * @name: pointer to file name
2567  * @flags: lookup flags
2568  * @path: pointer to struct path to fill
2569  */
2570 int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
2571                     const char *name, unsigned int flags,
2572                     struct path *path)
2573 {
2574         struct path root = {.mnt = mnt, .dentry = dentry};
2575         /* the first argument of filename_lookup() is ignored with root */
2576         return filename_lookup(AT_FDCWD, getname_kernel(name),
2577                                flags , path, &root);
2578 }
2579 EXPORT_SYMBOL(vfs_path_lookup);
2580
2581 static int lookup_one_len_common(const char *name, struct dentry *base,
2582                                  int len, struct qstr *this)
2583 {
2584         this->name = name;
2585         this->len = len;
2586         this->hash = full_name_hash(base, name, len);
2587         if (!len)
2588                 return -EACCES;
2589
2590         if (unlikely(name[0] == '.')) {
2591                 if (len < 2 || (len == 2 && name[1] == '.'))
2592                         return -EACCES;
2593         }
2594
2595         while (len--) {
2596                 unsigned int c = *(const unsigned char *)name++;
2597                 if (c == '/' || c == '\0')
2598                         return -EACCES;
2599         }
2600         /*
2601          * See if the low-level filesystem might want
2602          * to use its own hash..
2603          */
2604         if (base->d_flags & DCACHE_OP_HASH) {
2605                 int err = base->d_op->d_hash(base, this);
2606                 if (err < 0)
2607                         return err;
2608         }
2609
2610         return inode_permission(&init_user_ns, base->d_inode, MAY_EXEC);
2611 }
2612
2613 /**
2614  * try_lookup_one_len - filesystem helper to lookup single pathname component
2615  * @name:       pathname component to lookup
2616  * @base:       base directory to lookup from
2617  * @len:        maximum length @len should be interpreted to
2618  *
2619  * Look up a dentry by name in the dcache, returning NULL if it does not
2620  * currently exist.  The function does not try to create a dentry.
2621  *
2622  * Note that this routine is purely a helper for filesystem usage and should
2623  * not be called by generic code.
2624  *
2625  * The caller must hold base->i_mutex.
2626  */
2627 struct dentry *try_lookup_one_len(const char *name, struct dentry *base, int len)
2628 {
2629         struct qstr this;
2630         int err;
2631
2632         WARN_ON_ONCE(!inode_is_locked(base->d_inode));
2633
2634         err = lookup_one_len_common(name, base, len, &this);
2635         if (err)
2636                 return ERR_PTR(err);
2637
2638         return lookup_dcache(&this, base, 0);
2639 }
2640 EXPORT_SYMBOL(try_lookup_one_len);
2641
2642 /**
2643  * lookup_one_len - filesystem helper to lookup single pathname component
2644  * @name:       pathname component to lookup
2645  * @base:       base directory to lookup from
2646  * @len:        maximum length @len should be interpreted to
2647  *
2648  * Note that this routine is purely a helper for filesystem usage and should
2649  * not be called by generic code.
2650  *
2651  * The caller must hold base->i_mutex.
2652  */
2653 struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
2654 {
2655         struct dentry *dentry;
2656         struct qstr this;
2657         int err;
2658
2659         WARN_ON_ONCE(!inode_is_locked(base->d_inode));
2660
2661         err = lookup_one_len_common(name, base, len, &this);
2662         if (err)
2663                 return ERR_PTR(err);
2664
2665         dentry = lookup_dcache(&this, base, 0);
2666         return dentry ? dentry : __lookup_slow(&this, base, 0);
2667 }
2668 EXPORT_SYMBOL(lookup_one_len);
2669
2670 /**
2671  * lookup_one_len_unlocked - filesystem helper to lookup single pathname component
2672  * @name:       pathname component to lookup
2673  * @base:       base directory to lookup from
2674  * @len:        maximum length @len should be interpreted to
2675  *
2676  * Note that this routine is purely a helper for filesystem usage and should
2677  * not be called by generic code.
2678  *
2679  * Unlike lookup_one_len, it should be called without the parent
2680  * i_mutex held, and will take the i_mutex itself if necessary.
2681  */
2682 struct dentry *lookup_one_len_unlocked(const char *name,
2683                                        struct dentry *base, int len)
2684 {
2685         struct qstr this;
2686         int err;
2687         struct dentry *ret;
2688
2689         err = lookup_one_len_common(name, base, len, &this);
2690         if (err)
2691                 return ERR_PTR(err);
2692
2693         ret = lookup_dcache(&this, base, 0);
2694         if (!ret)
2695                 ret = lookup_slow(&this, base, 0);
2696         return ret;
2697 }
2698 EXPORT_SYMBOL(lookup_one_len_unlocked);
2699
2700 /*
2701  * Like lookup_one_len_unlocked(), except that it yields ERR_PTR(-ENOENT)
2702  * on negatives.  Returns known positive or ERR_PTR(); that's what
2703  * most of the users want.  Note that pinned negative with unlocked parent
2704  * _can_ become positive at any time, so callers of lookup_one_len_unlocked()
2705  * need to be very careful; pinned positives have ->d_inode stable, so
2706  * this one avoids such problems.
2707  */
2708 struct dentry *lookup_positive_unlocked(const char *name,
2709                                        struct dentry *base, int len)
2710 {
2711         struct dentry *ret = lookup_one_len_unlocked(name, base, len);
2712         if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) {
2713                 dput(ret);
2714                 ret = ERR_PTR(-ENOENT);
2715         }
2716         return ret;
2717 }
2718 EXPORT_SYMBOL(lookup_positive_unlocked);
2719
2720 #ifdef CONFIG_UNIX98_PTYS
2721 int path_pts(struct path *path)
2722 {
2723         /* Find something mounted on "pts" in the same directory as
2724          * the input path.
2725          */
2726         struct dentry *parent = dget_parent(path->dentry);
2727         struct dentry *child;
2728         struct qstr this = QSTR_INIT("pts", 3);
2729
2730         if (unlikely(!path_connected(path->mnt, parent))) {
2731                 dput(parent);
2732                 return -ENOENT;
2733         }
2734         dput(path->dentry);
2735         path->dentry = parent;
2736         child = d_hash_and_lookup(parent, &this);
2737         if (!child)
2738                 return -ENOENT;
2739
2740         path->dentry = child;
2741         dput(parent);
2742         follow_down(path);
2743         return 0;
2744 }
2745 #endif
2746
2747 int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
2748                  struct path *path, int *empty)
2749 {
2750         return filename_lookup(dfd, getname_flags(name, flags, empty),
2751                                flags, path, NULL);
2752 }
2753 EXPORT_SYMBOL(user_path_at_empty);
2754
2755 int __check_sticky(struct user_namespace *mnt_userns, struct inode *dir,
2756                    struct inode *inode)
2757 {
2758         kuid_t fsuid = current_fsuid();
2759
2760         if (uid_eq(i_uid_into_mnt(mnt_userns, inode), fsuid))
2761                 return 0;
2762         if (uid_eq(i_uid_into_mnt(mnt_userns, dir), fsuid))
2763                 return 0;
2764         return !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FOWNER);
2765 }
2766 EXPORT_SYMBOL(__check_sticky);
2767
2768 /*
2769  *      Check whether we can remove a link victim from directory dir, check
2770  *  whether the type of victim is right.
2771  *  1. We can't do it if dir is read-only (done in permission())
2772  *  2. We should have write and exec permissions on dir
2773  *  3. We can't remove anything from append-only dir
2774  *  4. We can't do anything with immutable dir (done in permission())
2775  *  5. If the sticky bit on dir is set we should either
2776  *      a. be owner of dir, or
2777  *      b. be owner of victim, or
2778  *      c. have CAP_FOWNER capability
2779  *  6. If the victim is append-only or immutable we can't do antyhing with
2780  *     links pointing to it.
2781  *  7. If the victim has an unknown uid or gid we can't change the inode.
2782  *  8. If we were asked to remove a directory and victim isn't one - ENOTDIR.
2783  *  9. If we were asked to remove a non-directory and victim isn't one - EISDIR.
2784  * 10. We can't remove a root or mountpoint.
2785  * 11. We don't allow removal of NFS sillyrenamed files; it's handled by
2786  *     nfs_async_unlink().
2787  */
2788 static int may_delete(struct user_namespace *mnt_userns, struct inode *dir,
2789                       struct dentry *victim, bool isdir)
2790 {
2791         struct inode *inode = d_backing_inode(victim);
2792         int error;
2793
2794         if (d_is_negative(victim))
2795                 return -ENOENT;
2796         BUG_ON(!inode);
2797
2798         BUG_ON(victim->d_parent->d_inode != dir);
2799
2800         /* Inode writeback is not safe when the uid or gid are invalid. */
2801         if (!uid_valid(i_uid_into_mnt(mnt_userns, inode)) ||
2802             !gid_valid(i_gid_into_mnt(mnt_userns, inode)))
2803                 return -EOVERFLOW;
2804
2805         audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
2806
2807         error = inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC);
2808         if (error)
2809                 return error;
2810         if (IS_APPEND(dir))
2811                 return -EPERM;
2812
2813         if (check_sticky(mnt_userns, dir, inode) || IS_APPEND(inode) ||
2814             IS_IMMUTABLE(inode) || IS_SWAPFILE(inode) ||
2815             HAS_UNMAPPED_ID(mnt_userns, inode))
2816                 return -EPERM;
2817         if (isdir) {
2818                 if (!d_is_dir(victim))
2819                         return -ENOTDIR;
2820                 if (IS_ROOT(victim))
2821                         return -EBUSY;
2822         } else if (d_is_dir(victim))
2823                 return -EISDIR;
2824         if (IS_DEADDIR(dir))
2825                 return -ENOENT;
2826         if (victim->d_flags & DCACHE_NFSFS_RENAMED)
2827                 return -EBUSY;
2828         return 0;
2829 }
2830
2831 /*      Check whether we can create an object with dentry child in directory
2832  *  dir.
2833  *  1. We can't do it if child already exists (open has special treatment for
2834  *     this case, but since we are inlined it's OK)
2835  *  2. We can't do it if dir is read-only (done in permission())
2836  *  3. We can't do it if the fs can't represent the fsuid or fsgid.
2837  *  4. We should have write and exec permissions on dir
2838  *  5. We can't do it if dir is immutable (done in permission())
2839  */
2840 static inline int may_create(struct user_namespace *mnt_userns,
2841                              struct inode *dir, struct dentry *child)
2842 {
2843         audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE);
2844         if (child->d_inode)
2845                 return -EEXIST;
2846         if (IS_DEADDIR(dir))
2847                 return -ENOENT;
2848         if (!fsuidgid_has_mapping(dir->i_sb, mnt_userns))
2849                 return -EOVERFLOW;
2850
2851         return inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC);
2852 }
2853
/*
 * Lock the two directories involved in a rename.
 *
 * p1 and p2 should be directories on the same fs.
 *
 * If p1 == p2 only that directory's inode is locked (I_MUTEX_PARENT) and
 * NULL is returned.  Otherwise ->s_vfs_rename_mutex of p1's superblock is
 * taken first and then both inodes are locked.  When d_ancestor() reports a
 * relationship between the two, the dentry passed as its first argument is
 * locked first (I_MUTEX_PARENT), the other one second (I_MUTEX_CHILD), and
 * d_ancestor()'s result is returned.  When there is no such relationship,
 * p1 is locked with I_MUTEX_PARENT, p2 with I_MUTEX_PARENT2, and NULL is
 * returned.
 *
 * The locks are released by unlock_rename().
 */
struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
{
        struct dentry *p;

        /* Same directory: a single inode lock, no rename mutex. */
        if (p1 == p2) {
                inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
                return NULL;
        }

        mutex_lock(&p1->d_sb->s_vfs_rename_mutex);

        /* presumably non-NULL when p2 is an ancestor of p1 -- see d_ancestor() */
        p = d_ancestor(p2, p1);
        if (p) {
                inode_lock_nested(p2->d_inode, I_MUTEX_PARENT);
                inode_lock_nested(p1->d_inode, I_MUTEX_CHILD);
                return p;
        }

        /* ... and the other way round. */
        p = d_ancestor(p1, p2);
        if (p) {
                inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
                inode_lock_nested(p2->d_inode, I_MUTEX_CHILD);
                return p;
        }

        /* No d_ancestor() match in either direction. */
        inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
        inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2);
        return NULL;
}
EXPORT_SYMBOL(lock_rename);
2887
2888 void unlock_rename(struct dentry *p1, struct dentry *p2)
2889 {
2890         inode_unlock(p1->d_inode);
2891         if (p1 != p2) {
2892                 inode_unlock(p2->d_inode);
2893                 mutex_unlock(&p1->d_sb->s_vfs_rename_mutex);
2894         }
2895 }
2896 EXPORT_SYMBOL(unlock_rename);
2897
2898 /**
2899  * vfs_create - create new file
2900  * @mnt_userns: user namespace of the mount the inode was found from
2901  * @dir:        inode of @dentry
2902  * @dentry:     pointer to dentry of the base directory
2903  * @mode:       mode of the new file
2904  * @want_excl:  whether the file must not yet exist
2905  *
2906  * Create a new file.
2907  *
2908  * If the inode has been found through an idmapped mount the user namespace of
2909  * the vfsmount must be passed through @mnt_userns. This function will then take
2910  * care to map the inode according to @mnt_userns before checking permissions.
2911  * On non-idmapped mounts or if permission checking is to be performed on the
2912  * raw inode simply passs init_user_ns.
2913  */
2914 int vfs_create(struct user_namespace *mnt_userns, struct inode *dir,
2915                struct dentry *dentry, umode_t mode, bool want_excl)
2916 {
2917         int error = may_create(mnt_userns, dir, dentry);
2918         if (error)
2919                 return error;
2920
2921         if (!dir->i_op->create)
2922                 return -EACCES; /* shouldn't it be ENOSYS? */
2923         mode &= S_IALLUGO;
2924         mode |= S_IFREG;
2925         error = security_inode_create(dir, dentry, mode);
2926         if (error)
2927                 return error;
2928         error = dir->i_op->create(mnt_userns, dir, dentry, mode, want_excl);
2929         if (!error)
2930                 fsnotify_create(dir, dentry);
2931         return error;
2932 }
2933 EXPORT_SYMBOL(vfs_create);
2934
2935 int vfs_mkobj(struct dentry *dentry, umode_t mode,
2936                 int (*f)(struct dentry *, umode_t, void *),
2937                 void *arg)
2938 {
2939         struct inode *dir = dentry->d_parent->d_inode;
2940         int error = may_create(&init_user_ns, dir, dentry);
2941         if (error)
2942                 return error;
2943
2944         mode &= S_IALLUGO;
2945         mode |= S_IFREG;
2946         error = security_inode_create(dir, dentry, mode);
2947         if (error)
2948                 return error;
2949         error = f(dentry, mode, arg);
2950         if (!error)
2951                 fsnotify_create(dir, dentry);
2952         return error;
2953 }
2954 EXPORT_SYMBOL(vfs_mkobj);
2955
2956 bool may_open_dev(const struct path *path)
2957 {
2958         return !(path->mnt->mnt_flags & MNT_NODEV) &&
2959                 !(path->mnt->mnt_sb->s_iflags & SB_I_NODEV);
2960 }
2961
2962 static int may_open(struct user_namespace *mnt_userns, const struct path *path,
2963                     int acc_mode, int flag)
2964 {
2965         struct dentry *dentry = path->dentry;
2966         struct inode *inode = dentry->d_inode;
2967         int error;
2968
2969         if (!inode)
2970                 return -ENOENT;
2971
2972         switch (inode->i_mode & S_IFMT) {
2973         case S_IFLNK:
2974                 return -ELOOP;
2975         case S_IFDIR:
2976                 if (acc_mode & MAY_WRITE)
2977                         return -EISDIR;
2978                 if (acc_mode & MAY_EXEC)
2979                         return -EACCES;
2980                 break;
2981         case S_IFBLK:
2982         case S_IFCHR:
2983                 if (!may_open_dev(path))
2984                         return -EACCES;
2985                 fallthrough;
2986         case S_IFIFO:
2987         case S_IFSOCK:
2988                 if (acc_mode & MAY_EXEC)
2989                         return -EACCES;
2990                 flag &= ~O_TRUNC;
2991                 break;
2992         case S_IFREG:
2993                 if ((acc_mode & MAY_EXEC) && path_noexec(path))
2994                         return -EACCES;
2995                 break;
2996         }
2997
2998         error = inode_permission(mnt_userns, inode, MAY_OPEN | acc_mode);
2999         if (error)
3000                 return error;
3001
3002         /*
3003          * An append-only file must be opened in append mode for writing.
3004          */
3005         if (IS_APPEND(inode)) {
3006                 if  ((flag & O_ACCMODE) != O_RDONLY && !(flag & O_APPEND))
3007                         return -EPERM;
3008                 if (flag & O_TRUNC)
3009                         return -EPERM;
3010         }
3011
3012         /* O_NOATIME can only be set by the owner or superuser */
3013         if (flag & O_NOATIME && !inode_owner_or_capable(mnt_userns, inode))
3014                 return -EPERM;
3015
3016         return 0;
3017 }
3018
3019 static int handle_truncate(struct user_namespace *mnt_userns, struct file *filp)
3020 {
3021         const struct path *path = &filp->f_path;
3022         struct inode *inode = path->dentry->d_inode;
3023         int error = get_write_access(inode);
3024         if (error)
3025                 return error;
3026         /*
3027          * Refuse to truncate files with mandatory locks held on them.
3028          */
3029         error = locks_verify_locked(filp);
3030         if (!error)
3031                 error = security_path_truncate(path);
3032         if (!error) {
3033                 error = do_truncate(mnt_userns, path->dentry, 0,
3034                                     ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
3035                                     filp);
3036         }
3037         put_write_access(inode);
3038         return error;
3039 }
3040
/*
 * Convert open(2) flags for use by ->atomic_open(): an access mode of 3
 * (both low access-mode bits set) is decremented by one; every other
 * value is passed through unchanged.
 */
static inline int open_to_namei_flags(int flag)
{
        return (flag & O_ACCMODE) == 3 ? flag - 1 : flag;
}
3047
3048 static int may_o_create(struct user_namespace *mnt_userns,
3049                         const struct path *dir, struct dentry *dentry,
3050                         umode_t mode)
3051 {
3052         int error = security_path_mknod(dir, dentry, mode, 0);
3053         if (error)
3054                 return error;
3055
3056         if (!fsuidgid_has_mapping(dir->dentry->d_sb, mnt_userns))
3057                 return -EOVERFLOW;
3058
3059         error = inode_permission(mnt_userns, dir->dentry->d_inode,
3060                                  MAY_WRITE | MAY_EXEC);
3061         if (error)
3062                 return error;
3063
3064         return security_inode_create(dir->dentry->d_inode, dentry, mode);
3065 }
3066
/*
 * Attempt to atomically look up, create and open a file from a negative
 * dentry, using the directory's ->atomic_open() method.
 *
 * Returns the dentry to continue with on success:
 *  - if the filesystem opened the file (FMODE_OPENED set in file->f_mode),
 *    this is file->f_path.dentry;
 *  - if the file was looked up only or didn't need creating, FMODE_OPENED
 *    won't be set and the caller will need to perform the open themselves.
 *    The dentry the filesystem left in file->f_path.dentry, if any,
 *    replaces @dentry.  A negative result is reported as -ENOENT.
 *
 * Returns an ERR_PTR() otherwise; the reference on the dentry has been
 * dropped in that case.
 */
static struct dentry *atomic_open(struct nameidata *nd, struct dentry *dentry,
                                  struct file *file,
                                  int open_flag, umode_t mode)
{
        /* Sentinel used to detect that ->atomic_open() never set the dentry. */
        struct dentry *const DENTRY_NOT_SET = (void *) -1UL;
        struct inode *dir =  nd->path.dentry->d_inode;
        int error;

        if (nd->flags & LOOKUP_DIRECTORY)
                open_flag |= O_DIRECTORY;

        file->f_path.dentry = DENTRY_NOT_SET;
        file->f_path.mnt = nd->path.mnt;
        error = dir->i_op->atomic_open(dir, dentry, file,
                                       open_to_namei_flags(open_flag), mode);
        d_lookup_done(dentry);
        if (!error) {
                if (file->f_mode & FMODE_OPENED) {
                        /* Opened: continue with the dentry the file refers to. */
                        if (unlikely(dentry != file->f_path.dentry)) {
                                dput(dentry);
                                dentry = dget(file->f_path.dentry);
                        }
                } else if (WARN_ON(file->f_path.dentry == DENTRY_NOT_SET)) {
                        /* Neither opened nor a dentry reported back. */
                        error = -EIO;
                } else {
                        /*
                         * Lookup only: a non-NULL dentry left in the file
                         * replaces ours (no extra reference is taken here).
                         */
                        if (file->f_path.dentry) {
                                dput(dentry);
                                dentry = file->f_path.dentry;
                        }
                        if (unlikely(d_is_negative(dentry)))
                                error = -ENOENT;
                }
        }
        if (error) {
                dput(dentry);
                dentry = ERR_PTR(error);
        }
        return dentry;
}
3119
/*
 * Look up and maybe create and open the last component.
 *
 * Must be called with parent locked (exclusive in O_CREAT case).
 *
 * Returns the dentry of the last component on success, that is, if
 *  the file was successfully atomically created (if necessary) and opened, or
 *  the file was not completely opened at this time, though lookups and
 *  creations were performed.
 * These cases are distinguished by the presence of FMODE_OPENED on
 * file->f_mode.  In the latter case the returned dentry might be negative
 * if O_CREAT hadn't been specified.
 *
 * An ERR_PTR() is returned on failure.
 */
static struct dentry *lookup_open(struct nameidata *nd, struct file *file,
                                  const struct open_flags *op,
                                  bool got_write)
{
        struct user_namespace *mnt_userns;
        struct dentry *dir = nd->path.dentry;
        struct inode *dir_inode = dir->d_inode;
        int open_flag = op->open_flag;
        struct dentry *dentry;
        int error, create_error = 0;
        umode_t mode = op->mode;
        DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);

        if (unlikely(IS_DEADDIR(dir_inode)))
                return ERR_PTR(-ENOENT);

        file->f_mode &= ~FMODE_CREATED;
        dentry = d_lookup(dir, &nd->last);
        /*
         * Loop until we hold a dentry that is either still in lookup or
         * has passed revalidation.
         */
        for (;;) {
                if (!dentry) {
                        dentry = d_alloc_parallel(dir, &nd->last, &wq);
                        if (IS_ERR(dentry))
                                return dentry;
                }
                if (d_in_lookup(dentry))
                        break;

                /* > 0: use it; < 0: fail; 0: invalidate it and start over. */
                error = d_revalidate(dentry, nd->flags);
                if (likely(error > 0))
                        break;
                if (error)
                        goto out_dput;
                d_invalidate(dentry);
                dput(dentry);
                dentry = NULL;
        }
        if (dentry->d_inode) {
                /* Cached positive dentry: will open in f_op->open */
                return dentry;
        }

        /*
         * Checking write permission is tricky, because we don't know if we are
         * going to actually need it: O_CREAT opens should work as long as the
         * file exists.  But checking existence breaks atomicity.  The trick is
         * to check access and if not granted clear O_CREAT from the flags.
         *
         * Another problem is returning the "right" error value (e.g. for an
         * O_EXCL open we want to return EEXIST not EROFS).
         */
        if (unlikely(!got_write))
                open_flag &= ~O_TRUNC;
        mnt_userns = mnt_user_ns(nd->path.mnt);
        if (open_flag & O_CREAT) {
                if (open_flag & O_EXCL)
                        open_flag &= ~O_TRUNC;
                if (!IS_POSIXACL(dir->d_inode))
                        mode &= ~current_umask();
                if (likely(got_write))
                        create_error = may_o_create(mnt_userns, &nd->path,
                                                    dentry, mode);
                else
                        create_error = -EROFS;
        }
        /* Creation is not allowed: carry on as a plain lookup/open. */
        if (create_error)
                open_flag &= ~O_CREAT;
        if (dir_inode->i_op->atomic_open) {
                dentry = atomic_open(nd, dentry, file, open_flag, mode);
                /*
                 * The file does not exist and we were not allowed to create
                 * it: report why creation was refused rather than -ENOENT.
                 */
                if (unlikely(create_error) && dentry == ERR_PTR(-ENOENT))
                        dentry = ERR_PTR(create_error);
                return dentry;
        }

        if (d_in_lookup(dentry)) {
                struct dentry *res = dir_inode->i_op->lookup(dir_inode, dentry,
                                                             nd->flags);
                d_lookup_done(dentry);
                if (unlikely(res)) {
                        if (IS_ERR(res)) {
                                error = PTR_ERR(res);
                                goto out_dput;
                        }
                        /* ->lookup() handed back a different dentry: use it. */
                        dput(dentry);
                        dentry = res;
                }
        }

        /* Negative dentry, just create the file */
        if (!dentry->d_inode && (open_flag & O_CREAT)) {
                file->f_mode |= FMODE_CREATED;
                audit_inode_child(dir_inode, dentry, AUDIT_TYPE_CHILD_CREATE);
                if (!dir_inode->i_op->create) {
                        error = -EACCES;
                        goto out_dput;
                }

                error = dir_inode->i_op->create(mnt_userns, dir_inode, dentry,
                                                mode, open_flag & O_EXCL);
                if (error)
                        goto out_dput;
        }
        /* Still negative and creation had been refused: report that error. */
        if (unlikely(create_error) && !dentry->d_inode) {
                error = create_error;
                goto out_dput;
        }
        return dentry;

out_dput:
        dput(dentry);
        return ERR_PTR(error);
}
3246
/*
 * Look up the final component of the pathname for an open().
 *
 * op->intent is added to nd->flags, then nd->last is resolved under
 * nd->path:
 *  - a final component that is not LAST_NORM is handed to handle_dots();
 *  - without O_CREAT, lookup_fast() is tried first and, if it yields a
 *    dentry, we go straight to step_into();
 *  - otherwise the parent directory is locked (exclusively for O_CREAT,
 *    shared if not) and lookup_open() does the lookup and possibly the
 *    create/open.
 *
 * Returns NULL when the walk is finished (path_openat() then calls
 * do_open()), or whatever non-NULL value handle_dots()/step_into()/the
 * error paths produce; path_openat() passes any non-NULL return back into
 * link_path_walk().
 */
static const char *open_last_lookups(struct nameidata *nd,
                   struct file *file, const struct open_flags *op)
{
        struct dentry *dir = nd->path.dentry;
        int open_flag = op->open_flag;
        bool got_write = false;
        /*
         * NOTE(review): inode and seq are only written via lookup_fast();
         * on the lookup_open() path they reach step_into() without being
         * assigned here - presumably step_into() does not rely on them
         * outside RCU mode; confirm.
         */
        unsigned seq;
        struct inode *inode;
        struct dentry *dentry;
        const char *res;

        nd->flags |= op->intent;

        /* Final component is not a normal name: handle_dots() deals with it */
        if (nd->last_type != LAST_NORM) {
                if (nd->depth)
                        put_link(nd);
                return handle_dots(nd, nd->last_type);
        }

        if (!(open_flag & O_CREAT)) {
                /* something follows the component (trailing slashes) */
                if (nd->last.name[nd->last.len])
                        nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
                /* we _can_ be in RCU mode here */
                dentry = lookup_fast(nd, &inode, &seq);
                if (IS_ERR(dentry))
                        return ERR_CAST(dentry);
                if (likely(dentry))
                        goto finish_lookup;

                /* lookup_fast() returned NULL: we must not be in RCU mode */
                BUG_ON(nd->flags & LOOKUP_RCU);
        } else {
                /* create side of things */
                if (nd->flags & LOOKUP_RCU) {
                        if (!try_to_unlazy(nd))
                                return ERR_PTR(-ECHILD);
                }
                audit_inode(nd->name, dir, AUDIT_INODE_PARENT);
                /* trailing slashes? */
                if (unlikely(nd->last.name[nd->last.len]))
                        return ERR_PTR(-EISDIR);
        }

        if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) {
                /* got_write is true iff mnt_want_write() returned 0 */
                got_write = !mnt_want_write(nd->path.mnt);
                /*
                 * do _not_ fail yet - we might not need that or fail with
                 * a different error; let lookup_open() decide; we'll be
                 * dropping this one anyway.
                 */
        }
        /* Exclusive lock on the parent when creating, shared otherwise */
        if (open_flag & O_CREAT)
                inode_lock(dir->d_inode);
        else
                inode_lock_shared(dir->d_inode);
        dentry = lookup_open(nd, file, op, got_write);
        /* Send the create notification while the parent is still locked */
        if (!IS_ERR(dentry) && (file->f_mode & FMODE_CREATED))
                fsnotify_create(dir->d_inode, dentry);
        if (open_flag & O_CREAT)
                inode_unlock(dir->d_inode);
        else
                inode_unlock_shared(dir->d_inode);

        if (got_write)
                mnt_drop_write(nd->path.mnt);

        if (IS_ERR(dentry))
                return ERR_CAST(dentry);

        /*
         * lookup_open() already opened or created the file: make its dentry
         * the current one in nd->path and report that the walk is done.
         */
        if (file->f_mode & (FMODE_OPENED | FMODE_CREATED)) {
                dput(nd->path.dentry);
                nd->path.dentry = dentry;
                return NULL;
        }

finish_lookup:
        if (nd->depth)
                put_link(nd);
        res = step_into(nd, WALK_TRAILING, dentry, inode, seq);
        /* A non-NULL result is not final: drop the open intent bits */
        if (unlikely(res))
                nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
        return res;
}
3329
/*
 * Handle the last step of open()
 *
 * Called by path_openat() once open_last_lookups() has returned NULL, with
 * nd->path naming the object to open.  Unless the file has already been
 * opened or created, the walk is completed first.  Then the O_CREAT/O_EXCL
 * and directory constraints are checked, may_open() is consulted, the file
 * is opened with vfs_open() if it is not open yet, and O_TRUNC is applied
 * to regular files.
 *
 * Returns 0 on success or a negative errno.
 */
static int do_open(struct nameidata *nd,
                   struct file *file, const struct open_flags *op)
{
        struct user_namespace *mnt_userns;
        int open_flag = op->open_flag;
        bool do_truncate;
        int acc_mode;
        int error;

        /* Only finish the walk if nothing has been opened or created yet */
        if (!(file->f_mode & (FMODE_OPENED | FMODE_CREATED))) {
                error = complete_walk(nd);
                if (error)
                        return error;
        }
        if (!(file->f_mode & FMODE_CREATED))
                audit_inode(nd->name, nd->path.dentry, 0);
        mnt_userns = mnt_user_ns(nd->path.mnt);
        if (open_flag & O_CREAT) {
                /* O_EXCL demands that this very open created the file */
                if ((open_flag & O_EXCL) && !(file->f_mode & FMODE_CREATED))
                        return -EEXIST;
                if (d_is_dir(nd->path.dentry))
                        return -EISDIR;
                error = may_create_in_sticky(mnt_userns, nd,
                                             d_backing_inode(nd->path.dentry));
                if (unlikely(error))
                        return error;
        }
        if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry))
                return -ENOTDIR;

        do_truncate = false;
        acc_mode = op->acc_mode;
        if (file->f_mode & FMODE_CREATED) {
                /* Don't check for write permission, don't truncate */
                open_flag &= ~O_TRUNC;
                acc_mode = 0;
        } else if (d_is_reg(nd->path.dentry) && open_flag & O_TRUNC) {
                /* Write access to the mount is held until the end of do_open() */
                error = mnt_want_write(nd->path.mnt);
                if (error)
                        return error;
                do_truncate = true;
        }
        error = may_open(mnt_userns, &nd->path, acc_mode, open_flag);
        if (!error && !(file->f_mode & FMODE_OPENED))
                error = vfs_open(&nd->path, file);
        /* Note: IMA gets op->acc_mode, not the possibly zeroed local copy */
        if (!error)
                error = ima_file_check(file, op->acc_mode);
        if (!error && do_truncate)
                error = handle_truncate(mnt_userns, file);
        /* A positive value is not a valid result here: warn and use -EINVAL */
        if (unlikely(error > 0)) {
                WARN_ON(1);
                error = -EINVAL;
        }
        /* Pairs with the mnt_want_write() taken for truncation above */
        if (do_truncate)
                mnt_drop_write(nd->path.mnt);
        return error;
}
3390
3391 /**
3392  * vfs_tmpfile - create tmpfile
3393  * @mnt_userns: user namespace of the mount the inode was found from
3394  * @dentry:     pointer to dentry of the base directory
3395  * @mode:       mode of the new tmpfile
3396  * @open_flag:  flags
3397  *
3398  * Create a temporary file.
3399  *
3400  * If the inode has been found through an idmapped mount the user namespace of
3401  * the vfsmount must be passed through @mnt_userns. This function will then take
3402  * care to map the inode according to @mnt_userns before checking permissions.
3403  * On non-idmapped mounts or if permission checking is to be performed on the
3404  * raw inode simply passs init_user_ns.
3405  */
3406 struct dentry *vfs_tmpfile(struct user_namespace *mnt_userns,
3407                            struct dentry *dentry, umode_t mode, int open_flag)
3408 {
3409         struct dentry *child = NULL;
3410         struct inode *dir = dentry->d_inode;
3411         struct inode *inode;
3412         int error;
3413
3414         /* we want directory to be writable */
3415         error = inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC);
3416         if (error)
3417                 goto out_err;
3418         error = -EOPNOTSUPP;
3419         if (!dir->i_op->tmpfile)
3420                 goto out_err;
3421         error = -ENOMEM;
3422         child = d_alloc(dentry, &slash_name);
3423         if (unlikely(!child))
3424                 goto out_err;
3425         error = dir->i_op->tmpfile(mnt_userns, dir, child, mode);
3426         if (error)
3427                 goto out_err;
3428         error = -ENOENT;
3429         inode = child->d_inode;
3430         if (unlikely(!inode))
3431                 goto out_err;
3432         if (!(open_flag & O_EXCL)) {
3433                 spin_lock(&inode->i_lock);
3434                 inode->i_state |= I_LINKABLE;
3435                 spin_unlock(&inode->i_lock);
3436         }
3437         ima_post_create_tmpfile(mnt_userns, inode);
3438         return child;
3439
3440 out_err:
3441         dput(child);
3442         return ERR_PTR(error);
3443 }
3444 EXPORT_SYMBOL(vfs_tmpfile);
3445
3446 static int do_tmpfile(struct nameidata *nd, unsigned flags,
3447                 const struct open_flags *op,
3448                 struct file *file)
3449 {
3450         struct user_namespace *mnt_userns;
3451         struct dentry *child;
3452         struct path path;
3453         int error = path_lookupat(nd, flags | LOOKUP_DIRECTORY, &path);
3454         if (unlikely(error))
3455                 return error;
3456         error = mnt_want_write(path.mnt);
3457         if (unlikely(error))
3458                 goto out;
3459         mnt_userns = mnt_user_ns(path.mnt);
3460         child = vfs_tmpfile(mnt_userns, path.dentry, op->mode, op->open_flag);
3461         error = PTR_ERR(child);
3462         if (IS_ERR(child))
3463                 goto out2;
3464         dput(path.dentry);
3465         path.dentry = child;
3466         audit_inode(nd->name, child, 0);
3467         /* Don't check for other permissions, the inode was just created */
3468         error = may_open(mnt_userns, &path, 0, op->open_flag);
3469         if (!error)
3470                 error = vfs_open(&path, file);
3471 out2:
3472         mnt_drop_write(path.mnt);
3473 out:
3474         path_put(&path);
3475         return error;
3476 }
3477
3478 static int do_o_path(struct nameidata *nd, unsigned flags, struct file *file)
3479 {
3480         struct path path;
3481         int error = path_lookupat(nd, flags, &path);
3482         if (!error) {
3483                 audit_inode(nd->name, path.dentry, 0);
3484                 error = vfs_open(&path, file);
3485                 path_put(&path);
3486         }
3487         return error;
3488 }
3489
3490 static struct file *path_openat(struct nameidata *nd,
3491                         const struct open_flags *op, unsigned flags)
3492 {
3493         struct file *file;
3494         int error;
3495
3496         file = alloc_empty_file(op->open_flag, current_cred());
3497         if (IS_ERR(file))
3498                 return file;
3499
3500         if (unlikely(file->f_flags & __O_TMPFILE)) {
3501                 error = do_tmpfile(nd, flags, op, file);
3502         } else if (unlikely(file->f_flags & O_PATH)) {
3503                 error = do_o_path(nd, flags, file);
3504         } else {
3505                 const char *s = path_init(nd, flags);
3506                 while (!(error = link_path_walk(s, nd)) &&
3507                        (s = open_last_lookups(nd, file, op)) != NULL)
3508                         ;
3509                 if (!error)
3510                         error = do_open(nd, file, op);
3511                 terminate_walk(nd);
3512         }
3513         if (likely(!error)) {
3514                 if (likely(file->f_mode & FMODE_OPENED))
3515                         return file;
3516                 WARN_ON(1);
3517                 error = -EINVAL;
3518         }
3519         fput(file);
3520         if (error == -EOPENSTALE) {
3521                 if (flags & LOOKUP_RCU)
3522                         error = -ECHILD;
3523                 else
3524                         error = -ESTALE;
3525         }
3526         return ERR_PTR(error);
3527 }
3528
3529 struct file *do_filp_open(int dfd, struct filename *pathname,
3530                 const struct open_flags *op)
3531 {
3532         struct nameidata nd;
3533         int flags = op->lookup_flags;
3534         struct file *filp;
3535
3536         set_nameidata(&nd, dfd, pathname, NULL);
3537         filp = path_openat(&nd, op, flags | LOOKUP_RCU);
3538         if (unlikely(filp == ERR_PTR(-ECHILD)))
3539                 filp = path_openat(&nd, op, flags);
3540         if (unlikely(filp == ERR_PTR(-ESTALE)))
3541                 filp = path_openat(&nd, op, flags | LOOKUP_REVAL);
3542         restore_nameidata();
3543         return filp;
3544 }
3545
3546 struct file *do_file_open_root(const struct path *root,
3547                 const char *name, const struct open_flags *op)
3548 {
3549         struct nameidata nd;
3550         struct file *file;
3551         struct filename *filename;
3552         int flags = op->lookup_flags;
3553
3554         if (d_is_symlink(root->dentry) && op->intent & LOOKUP_OPEN)
3555                 return ERR_PTR(-ELOOP);
3556
3557         filename = getname_kernel(name);
3558         if (IS_ERR(filename))
3559                 return ERR_CAST(filename);
3560
3561         set_nameidata(&nd, -1, filename, root);
3562         file = path_openat(&nd, op, flags | LOOKUP_RCU);
3563         if (unlikely(file == ERR_PTR(-ECHILD)))
3564                 file = path_openat(&nd, op, flags);
3565         if (unlikely(file == ERR_PTR(-ESTALE)))
3566                 file = path_openat(&nd, op, flags | LOOKUP_REVAL);
3567         restore_nameidata();
3568         putname(filename);
3569         return file;
3570 }
3571
/*
 * Resolve the parent directory of @name and look up its final component
 * in preparation for creating it.
 *
 * On success a negative dentry for the final component is returned; the
 * parent is left in *@path with its inode locked and with write access to
 * path->mnt held.  done_path_create() undoes all of that.
 *
 * On failure an ERR_PTR() is returned and nothing is left held.
 *
 * @name is released with putname() on every path visible here except when
 * filename_parentat() itself fails - presumably it has already dropped the
 * name in that case (TODO confirm).
 */
static struct dentry *filename_create(int dfd, struct filename *name,
                                struct path *path, unsigned int lookup_flags)
{
        /* returned as is if the last component turns out not to be LAST_NORM */
        struct dentry *dentry = ERR_PTR(-EEXIST);
        struct qstr last;
        int type;
        int err2;
        int error;
        bool is_dir = (lookup_flags & LOOKUP_DIRECTORY);

        /*
         * Note that only LOOKUP_REVAL and LOOKUP_DIRECTORY matter here. Any
         * other flags passed in are ignored!
         */
        lookup_flags &= LOOKUP_REVAL;

        name = filename_parentat(dfd, name, lookup_flags, path, &last, &type);
        if (IS_ERR(name))
                return ERR_CAST(name);

        /*
         * Yucky last component or no last component at all?
         * (foo/., foo/.., /////)
         */
        if (unlikely(type != LAST_NORM))
                goto out;

        /* don't fail immediately if it's r/o, at least try to report other errors */
        err2 = mnt_want_write(path->mnt);
        /*
         * Do the final lookup.
         */
        lookup_flags |= LOOKUP_CREATE | LOOKUP_EXCL;
        inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT);
        dentry = __lookup_hash(&last, path->dentry, lookup_flags);
        if (IS_ERR(dentry))
                goto unlock;

        error = -EEXIST;
        if (d_is_positive(dentry))
                goto fail;

        /*
         * Special case - lookup gave negative, but... we had foo/bar/
         * From the vfs_mknod() POV we just have a negative dentry -
         * all is fine. Let's be bastards - you had / on the end, you've
         * been asking for (non-existent) directory. -ENOENT for you.
         */
        if (unlikely(!is_dir && last.name[last.len])) {
                error = -ENOENT;
                goto fail;
        }
        /* Only now report a failed mnt_want_write() */
        if (unlikely(err2)) {
                error = err2;
                goto fail;
        }
        /* Success: parent stays locked and write access stays held */
        putname(name);
        return dentry;
fail:
        dput(dentry);
        dentry = ERR_PTR(error);
unlock:
        inode_unlock(path->dentry->d_inode);
        /* drop write access only if mnt_want_write() had succeeded */
        if (!err2)
                mnt_drop_write(path->mnt);
out:
        path_put(path);
        putname(name);
        return dentry;
}
3642
/*
 * filename_create() for a pathname held in kernel memory.
 *
 * The result of getname_kernel() is handed to filename_create() unchecked;
 * presumably an ERR_PTR() name is dealt with further down (TODO confirm).
 */
struct dentry *kern_path_create(int dfd, const char *pathname,
                                struct path *path, unsigned int lookup_flags)
{
        struct filename *name = getname_kernel(pathname);

        return filename_create(dfd, name, path, lookup_flags);
}
EXPORT_SYMBOL(kern_path_create);
3650
3651 void done_path_create(struct path *path, struct dentry *dentry)
3652 {
3653         dput(dentry);
3654         inode_unlock(path->dentry->d_inode);
3655         mnt_drop_write(path->mnt);
3656         path_put(path);
3657 }
3658 EXPORT_SYMBOL(done_path_create);
3659
3660 inline struct dentry *user_path_create(int dfd, const char __user *pathname,
3661                                 struct path *path, unsigned int lookup_flags)
3662 {
3663         return filename_create(dfd, getname(pathname), path, lookup_flags);
3664 }
3665 EXPORT_SYMBOL(user_path_create);
3666
3667 /**
3668  * vfs_mknod - create device node or file
3669  * @mnt_userns: user namespace of the mount the inode was found from
3670  * @dir:        inode of @dentry
3671  * @dentry:     pointer to dentry of the base directory
3672  * @mode:       mode of the new device node or file
3673  * @dev:        device number of device to create
3674  *
3675  * Create a device node or file.
3676  *
3677  * If the inode has been found through an idmapped mount the user namespace of
3678  * the vfsmount must be passed through @mnt_userns. This function will then take
3679  * care to map the inode according to @mnt_userns before checking permissions.
3680  * On non-idmapped mounts or if permission checking is to be performed on the
3681  * raw inode simply passs init_user_ns.
3682  */
3683 int vfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
3684               struct dentry *dentry, umode_t mode, dev_t dev)
3685 {
3686         bool is_whiteout = S_ISCHR(mode) && dev == WHITEOUT_DEV;
3687         int error = may_create(mnt_userns, dir, dentry);
3688
3689         if (error)
3690                 return error;
3691
3692         if ((S_ISCHR(mode) || S_ISBLK(mode)) && !is_whiteout &&
3693             !capable(CAP_MKNOD))
3694                 return -EPERM;
3695
3696         if (!dir->i_op->mknod)
3697                 return -EPERM;
3698
3699         error = devcgroup_inode_mknod(mode, dev);
3700         if (error)
3701                 return error;
3702
3703         error = security_inode_mknod(dir, dentry, mode, dev);
3704         if (error)
3705                 return error;
3706
3707         error = dir->i_op->mknod(mnt_userns, dir, dentry, mode, dev);
3708         if (!error)
3709                 fsnotify_create(dir, dentry);
3710         return error;
3711 }
3712 EXPORT_SYMBOL(vfs_mknod);
3713
3714 static int may_mknod(umode_t mode)
3715 {
3716         switch (mode & S_IFMT) {
3717         case S_IFREG:
3718         case S_IFCHR:
3719         case S_IFBLK:
3720         case S_IFIFO:
3721         case S_IFSOCK:
3722         case 0: /* zero mode translates to S_IFREG */
3723                 return 0;
3724         case S_IFDIR:
3725                 return -EPERM;
3726         default:
3727                 return -EINVAL;
3728         }
3729 }
3730
3731 static long do_mknodat(int dfd, const char __user *filename, umode_t mode,
3732                 unsigned int dev)
3733 {
3734         struct user_namespace *mnt_userns;
3735         struct dentry *dentry;
3736         struct path path;
3737         int error;
3738         unsigned int lookup_flags = 0;
3739
3740         error = may_mknod(mode);
3741         if (error)
3742                 return error;
3743 retry:
3744         dentry = user_path_create(dfd, filename, &path, lookup_flags);
3745         if (IS_ERR(dentry))
3746                 return PTR_ERR(dentry);
3747
3748         if (!IS_POSIXACL(path.dentry->d_inode))
3749                 mode &= ~current_umask();
3750         error = security_path_mknod(&path, dentry, mode, dev);
3751         if (error)
3752                 goto out;
3753
3754         mnt_userns = mnt_user_ns(path.mnt);
3755         switch (mode & S_IFMT) {
3756                 case 0: case S_IFREG:
3757                         error = vfs_create(mnt_userns, path.dentry->d_inode,
3758                                            dentry, mode, true);
3759                         if (!error)
3760                                 ima_post_path_mknod(mnt_userns, dentry);
3761                         break;
3762                 case S_IFCHR: case S_IFBLK:
3763                         error = vfs_mknod(mnt_userns, path.dentry->d_inode,
3764                                           dentry, mode, new_decode_dev(dev));
3765                         break;
3766                 case S_IFIFO: case S_IFSOCK:
3767                         error = vfs_mknod(mnt_userns, path.dentry->d_inode,
3768                                           dentry, mode, 0);
3769                         break;
3770         }
3771 out:
3772         done_path_create(&path, dentry);
3773         if (retry_estale(error, lookup_flags)) {
3774                 lookup_flags |= LOOKUP_REVAL;
3775                 goto retry;
3776         }
3777         return error;
3778 }
3779
/*
 * mknodat() system call entry point: passes its arguments straight
 * through to do_mknodat().
 */
SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode,
                unsigned int, dev)
{
        return do_mknodat(dfd, filename, mode, dev);
}
3785
/*
 * mknod() system call entry point: do_mknodat() with AT_FDCWD as the
 * directory file descriptor.
 */
SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, dev)
{
        return do_mknodat(AT_FDCWD, filename, mode, dev);
}
3790
3791 /**
3792  * vfs_mkdir - create directory
3793  * @mnt_userns: user namespace of the mount the inode was found from
3794  * @dir:        inode of @dentry
3795  * @dentry:     pointer to dentry of the base directory
3796  * @mode:       mode of the new directory
3797  *
3798  * Create a directory.
3799  *
3800  * If the inode has been found through an idmapped mount the user namespace of
3801  * the vfsmount must be passed through @mnt_userns. This function will then take
3802  * care to map the inode according to @mnt_userns before checking permissions.
3803  * On non-idmapped mounts or if permission checking is to be performed on the
3804  * raw inode simply passs init_user_ns.
3805  */
3806 int vfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
3807               struct dentry *dentry, umode_t mode)
3808 {
3809         int error = may_create(mnt_userns, dir, dentry);
3810         unsigned max_links = dir->i_sb->s_max_links;
3811
3812         if (error)
3813                 return error;
3814
3815         if (!dir->i_op->mkdir)
3816                 return -EPERM;
3817
3818         mode &= (S_IRWXUGO|S_ISVTX);
3819         error = security_inode_mkdir(dir, dentry, mode);
3820         if (error)
3821                 return error;
3822
3823         if (max_links && dir->i_nlink >= max_links)
3824                 return -EMLINK;
3825
3826         error = dir->i_op->mkdir(mnt_userns, dir, dentry, mode);
3827         if (!error)
3828                 fsnotify_mkdir(dir, dentry);
3829         return error;
3830 }
3831 EXPORT_SYMBOL(vfs_mkdir);
3832
3833 static long do_mkdirat(int dfd, const char __user *pathname, umode_t mode)
3834 {
3835         struct dentry *dentry;
3836         struct path path;
3837         int error;
3838         unsigned int lookup_flags = LOOKUP_DIRECTORY;
3839
3840 retry:
3841         dentry = user_path_create(dfd, pathname, &path, lookup_flags);
3842         if (IS_ERR(dentry))
3843                 return PTR_ERR(dentry);
3844
3845         if (!IS_POSIXACL(path.dentry->d_inode))
3846                 mode &= ~current_umask();
3847         error = security_path_mkdir(&path, dentry, mode);
3848         if (!error) {
3849                 struct user_namespace *mnt_userns;
3850                 mnt_userns = mnt_user_ns(path.mnt);
3851                 error = vfs_mkdir(mnt_userns, path.dentry->d_inode, dentry,
3852                                   mode);
3853         }
3854         done_path_create(&path, dentry);
3855         if (retry_estale(error, lookup_flags)) {
3856                 lookup_flags |= LOOKUP_REVAL;
3857                 goto retry;
3858         }
3859         return error;
3860 }
3861
/*
 * mkdirat() system call entry point: passes its arguments straight
 * through to do_mkdirat().
 */
SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode)
{
        return do_mkdirat(dfd, pathname, mode);
}
3866
/*
 * mkdir() system call entry point: do_mkdirat() with AT_FDCWD as the
 * directory file descriptor.
 */
SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode)
{
        return do_mkdirat(AT_FDCWD, pathname, mode);
}
3871
/**
 * vfs_rmdir - remove directory
 * @mnt_userns: user namespace of the mount the inode was found from
 * @dir:        inode of the parent directory
 * @dentry:     dentry of the directory to remove
 *
 * Remove a directory.
 *
 * If the inode has been found through an idmapped mount the user namespace of
 * the vfsmount must be passed through @mnt_userns. This function will then take
 * care to map the inode according to @mnt_userns before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply pass init_user_ns.
 *
 * Return: 0 on success, a negative errno otherwise (-EBUSY if @dentry is a
 * local mountpoint, -EPERM if the filesystem has no ->rmdir).
 */
int vfs_rmdir(struct user_namespace *mnt_userns, struct inode *dir,
                     struct dentry *dentry)
{
        int error = may_delete(mnt_userns, dir, dentry, 1);

        if (error)
                return error;

        if (!dir->i_op->rmdir)
                return -EPERM;

        /* Extra reference on the victim, dropped again at out: */
        dget(dentry);
        /* The victim's own inode stays locked for the whole removal */
        inode_lock(dentry->d_inode);

        error = -EBUSY;
        if (is_local_mountpoint(dentry))
                goto out;

        error = security_inode_rmdir(dir, dentry);
        if (error)
                goto out;

        error = dir->i_op->rmdir(dir, dentry);
        if (error)
                goto out;

        /* The directory is gone: mark it dead and get rid of any mounts */
        shrink_dcache_parent(dentry);
        dentry->d_inode->i_flags |= S_DEAD;
        dont_mount(dentry);
        detach_mounts(dentry);
        fsnotify_rmdir(dir, dentry);

out:
        inode_unlock(dentry->d_inode);
        dput(dentry);
        /*
         * NOTE(review): d_delete() runs after our extra reference has been
         * dropped, so this relies on the caller still holding its own
         * reference to @dentry (do_rmdir() does).
         */
        if (!error)
                d_delete(dentry);
        return error;
}
EXPORT_SYMBOL(vfs_rmdir);
3926
/*
 * Remove the directory named by @name, looked up relative to @dfd.
 *
 * The parent is resolved with filename_parentat(); a final component of
 * "..", "." or the root is rejected with -ENOTEMPTY, -EINVAL and -EBUSY
 * respectively.  Otherwise write access to the mount is taken, the parent
 * is locked, the victim is looked up and handed to vfs_rmdir().  The whole
 * operation is redone with LOOKUP_REVAL when retry_estale() asks for it.
 *
 * @name is released with putname() before returning, except when
 * filename_parentat() itself fails - presumably it has already dropped the
 * name in that case (TODO confirm).
 *
 * Returns 0 or a negative errno.
 */
long do_rmdir(int dfd, struct filename *name)
{
        struct user_namespace *mnt_userns;
        int error = 0;
        struct dentry *dentry;
        struct path path;
        struct qstr last;
        int type;
        unsigned int lookup_flags = 0;
retry:
        name = filename_parentat(dfd, name, lookup_flags,
                                &path, &last, &type);
        if (IS_ERR(name))
                return PTR_ERR(name);

        /* Any other type (e.g. LAST_NORM) carries on with the lookup below */
        switch (type) {
        case LAST_DOTDOT:
                error = -ENOTEMPTY;
                goto exit1;
        case LAST_DOT:
                error = -EINVAL;
                goto exit1;
        case LAST_ROOT:
                error = -EBUSY;
                goto exit1;
        }

        error = mnt_want_write(path.mnt);
        if (error)
                goto exit1;

        inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT);
        dentry = __lookup_hash(&last, path.dentry, lookup_flags);
        error = PTR_ERR(dentry);
        if (IS_ERR(dentry))
                goto exit2;
        /* Negative dentry: there is nothing to remove */
        if (!dentry->d_inode) {
                error = -ENOENT;
                goto exit3;
        }
        error = security_path_rmdir(&path, dentry);
        if (error)
                goto exit3;
        mnt_userns = mnt_user_ns(path.mnt);
        error = vfs_rmdir(mnt_userns, path.dentry->d_inode, dentry);
        /* Cleanup labels undo the steps above in reverse order */
exit3:
        dput(dentry);
exit2:
        inode_unlock(path.dentry->d_inode);
        mnt_drop_write(path.mnt);
exit1:
        path_put(&path);
        if (retry_estale(error, lookup_flags)) {
                lookup_flags |= LOOKUP_REVAL;
                goto retry;
        }
        putname(name);
        return error;
}
3986
/*
 * rmdir() system call entry point: obtains a struct filename for the user
 * pathname with getname() and hands it to do_rmdir() with AT_FDCWD;
 * do_rmdir() does the putname().
 */
SYSCALL_DEFINE1(rmdir, const char __user *, pathname)
{
        return do_rmdir(AT_FDCWD, getname(pathname));
}
3991
/**
 * vfs_unlink - unlink a filesystem object
 * @mnt_userns: user namespace of the mount the inode was found from
 * @dir:        parent directory
 * @dentry:     victim
 * @delegated_inode: returns victim inode, if the inode is delegated.
 *
 * The caller must hold dir->i_mutex.
 *
 * If vfs_unlink discovers a delegation, it will return -EWOULDBLOCK and
 * return a reference to the inode in delegated_inode.  The caller
 * should then break the delegation on that inode and retry.  Because
 * breaking a delegation may take a long time, the caller should drop
 * dir->i_mutex before doing so.
 *
 * Alternatively, a caller may pass NULL for delegated_inode.  This may
 * be appropriate for callers that expect the underlying filesystem not
 * to be NFS exported.
 *
 * If the inode has been found through an idmapped mount the user namespace of
 * the vfsmount must be passed through @mnt_userns. This function will then take
 * care to map the inode according to @mnt_userns before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply pass init_user_ns.
 */
int vfs_unlink(struct user_namespace *mnt_userns, struct inode *dir,
               struct dentry *dentry, struct inode **delegated_inode)
{
        struct inode *target = dentry->d_inode;
        int error = may_delete(mnt_userns, dir, dentry, 0);

        if (error)
                return error;

        if (!dir->i_op->unlink)
                return -EPERM;

        /* The victim's inode is locked around the checks and the unlink */
        inode_lock(target);
        if (is_local_mountpoint(dentry))
                error = -EBUSY;
        else {
                error = security_inode_unlink(dir, dentry);
                if (!error) {
                        /* Give up here if the delegation check fails */
                        error = try_break_deleg(target, delegated_inode);
                        if (error)
                                goto out;
                        error = dir->i_op->unlink(dir, dentry);
                        if (!error) {
                                dont_mount(dentry);
                                detach_mounts(dentry);
                                fsnotify_unlink(dir, dentry);
                        }
                }
        }
out:
        inode_unlock(target);

        /* We don't d_delete() NFS sillyrenamed files--they still exist. */
        if (!error && !(dentry->d_flags & DCACHE_NFSFS_RENAMED)) {
                fsnotify_link_count(target);
                d_delete(dentry);
        }

        return error;
}
EXPORT_SYMBOL(vfs_unlink);
4058
4059 /*
4060  * Make sure that the actual truncation of the file will occur outside its
4061  * directory's i_mutex.  Truncate can take a long time if there is a lot of
4062  * writeout happening, and we don't want to prevent access to the directory
4063  * while waiting on the I/O.
      *
      * do_unlinkat() resolves the parent of @name relative to @dfd, locks that
      * parent, looks up the last component and passes it to vfs_unlink().  An
      * extra inode reference taken with ihold() is dropped with iput() only
      * after the parent has been unlocked again.
      *
      * @name is released with putname() before returning, except when
      * filename_parentat() hands back an error pointer; that one is converted
      * with PTR_ERR() and returned directly.
      *
      * Returns 0 or a negative errno: -EISDIR when the last component is not
      * LAST_NORM, and for trailing slashes or a negative dentry whichever of
      * -ENOENT, -EISDIR or -ENOTDIR the "slashes" label picks.
4064  */
4065 long do_unlinkat(int dfd, struct filename *name)
4066 {
4067         int error;
4068         struct dentry *dentry;
4069         struct path path;
4070         struct qstr last;
4071         int type;
4072         struct inode *inode = NULL;
4073         struct inode *delegated_inode = NULL;
4074         unsigned int lookup_flags = 0;
4075 retry:
4076         name = filename_parentat(dfd, name, lookup_flags, &path, &last, &type);
4077         if (IS_ERR(name))
4078                 return PTR_ERR(name);
4079
4080         error = -EISDIR;
4081         if (type != LAST_NORM)
4082                 goto exit1;
4083
4084         error = mnt_want_write(path.mnt);
4085         if (error)
4086                 goto exit1;
4087 retry_deleg:
             /* Re-entered after a successful break_deleg_wait(); lookup is redone. */
4088         inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT);
4089         dentry = __lookup_hash(&last, path.dentry, lookup_flags);
4090         error = PTR_ERR(dentry);
4091         if (!IS_ERR(dentry)) {
4092                 struct user_namespace *mnt_userns;
4093
4094                 /* Why not before? Because we want correct error value */
4095                 if (last.name[last.len])
4096                         goto slashes;
4097                 inode = dentry->d_inode;
4098                 if (d_is_negative(dentry))
4099                         goto slashes;
                     /* Pin the inode so the iput() below is the one run outside the lock. */
4100                 ihold(inode);
4101                 error = security_path_unlink(&path, dentry);
4102                 if (error)
4103                         goto exit2;
4104                 mnt_userns = mnt_user_ns(path.mnt);
4105                 error = vfs_unlink(mnt_userns, path.dentry->d_inode, dentry,
4106                                    &delegated_inode);
4107 exit2:
                     /* Also the landing point of the "slashes" handler at the bottom. */
4108                 dput(dentry);
4109         }
4110         inode_unlock(path.dentry->d_inode);
4111         if (inode)
4112                 iput(inode);    /* truncate the inode here */
4113         inode = NULL;
4114         if (delegated_inode) {
                     /* Parent is unlocked here; on success go back and redo the lookup. */
4115                 error = break_deleg_wait(&delegated_inode);
4116                 if (!error)
4117                         goto retry_deleg;
4118         }
4119         mnt_drop_write(path.mnt);
4120 exit1:
4121         path_put(&path);
             /* Redo the whole walk with LOOKUP_REVAL added when retry_estale() asks for it. */
4122         if (retry_estale(error, lookup_flags)) {
4123                 lookup_flags |= LOOKUP_REVAL;
4124                 inode = NULL;
4125                 goto retry;
4126         }
4127         putname(name);
4128         return error;
4129
4130 slashes:
             /* Only reached by goto from inside the lookup block; picks the errno and rejoins at exit2. */
4131         if (d_is_negative(dentry))
4132                 error = -ENOENT;
4133         else if (d_is_dir(dentry))
4134                 error = -EISDIR;
4135         else
4136                 error = -ENOTDIR;
4137         goto exit2;
4138 }
4139
/*
 * unlinkat(2) entry point.  Any flag bit other than AT_REMOVEDIR is rejected
 * with -EINVAL; AT_REMOVEDIR dispatches to do_rmdir(), everything else to
 * do_unlinkat().  The getname() result is passed on without being checked
 * here.
 */
4140 SYSCALL_DEFINE3(unlinkat, int, dfd, const char __user *, pathname, int, flag)
4141 {
4142         if ((flag & ~AT_REMOVEDIR) != 0)
4143                 return -EINVAL;
4144
4145         if (flag & AT_REMOVEDIR)
4146                 return do_rmdir(dfd, getname(pathname));
4147         return do_unlinkat(dfd, getname(pathname));
4148 }
4149
/* unlink(2) entry point: do_unlinkat() with AT_FDCWD as the directory fd. */
4150 SYSCALL_DEFINE1(unlink, const char __user *, pathname)
4151 {
4152         return do_unlinkat(AT_FDCWD, getname(pathname));
4153 }
4154
4155 /**
4156  * vfs_symlink - create a symbolic link
4157  * @mnt_userns: user namespace of the mount the inode was found from
4158  * @dir:        inode of the directory the symlink is created in
4159  * @dentry:     dentry of the symlink being created
4160  * @oldname:    string the new symlink will contain
4161  *
4162  * Checks may_create(), that @dir implements ->symlink and the security
4163  * hook, in that order, then calls ->symlink.  Returns 0 on success (after
4164  * sending fsnotify_create()) or the first error hit; -EPERM if @dir has
4165  * no ->symlink.
4166  *
4167  * For an inode reached through an idmapped mount, @mnt_userns must be that
4168  * mount's user namespace; otherwise pass init_user_ns.
4169  */
4170 int vfs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
4171                 struct dentry *dentry, const char *oldname)
4172 {
4173         int err;
4174
4175         err = may_create(mnt_userns, dir, dentry);
4176         if (err)
4177                 return err;
4178         if (!dir->i_op->symlink)
4179                 return -EPERM;
4180         err = security_inode_symlink(dir, dentry, oldname);
4181         if (err)
4182                 return err;
4183
4184         err = dir->i_op->symlink(mnt_userns, dir, dentry, oldname);
4185         if (err)
4186                 return err;
4187         fsnotify_create(dir, dentry);
4188         return 0;
4189 }
4190 EXPORT_SYMBOL(vfs_symlink);
4191
/*
 * do_symlinkat - common body of symlink(2) and symlinkat(2)
 *
 * Copies @oldname in with getname(), creates the dentry for @newname
 * (relative to @newdfd) with user_path_create() and, once the path security
 * hook agrees, calls vfs_symlink() with the mount's user namespace.
 *
 * Returns 0 or a negative errno.  The getname() result is released with
 * putname() on every path except the early return taken when getname()
 * itself failed.
 */
4192 static long do_symlinkat(const char __user *oldname, int newdfd,
4193                   const char __user *newname)
4194 {
4195         int error;
4196         struct filename *from;
4197         struct dentry *dentry;
4198         struct path path;
4199         unsigned int lookup_flags = 0;
4200
4201         from = getname(oldname);
4202         if (IS_ERR(from))
4203                 return PTR_ERR(from);
4204 retry:
4205         dentry = user_path_create(newdfd, newname, &path, lookup_flags);
4206         error = PTR_ERR(dentry);
             /* Nothing to undo with done_path_create() when the create lookup failed. */
4207         if (IS_ERR(dentry))
4208                 goto out_putname;
4209
4210         error = security_path_symlink(&path, dentry, from->name);
4211         if (!error) {
4212                 struct user_namespace *mnt_userns;
4213
4214                 mnt_userns = mnt_user_ns(path.mnt);
4215                 error = vfs_symlink(mnt_userns, path.dentry->d_inode, dentry,
4216                                     from->name);
4217         }
             /* Pairs with the successful user_path_create() above, whatever 'error' is. */
4218         done_path_create(&path, dentry);
             /* Only @newname is looked up again on retry; 'from' is reused. */
4219         if (retry_estale(error, lookup_flags)) {
4220                 lookup_flags |= LOOKUP_REVAL;
4221                 goto retry;
4222         }
4223 out_putname:
4224         putname(from);
4225         return error;
4226 }
4227
/* symlinkat(2) entry point: forwards all three arguments to do_symlinkat(). */
4228 SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
4229                 int, newdfd, const char __user *, newname)
4230 {
4231         return do_symlinkat(oldname, newdfd, newname);
4232 }
4233
/* symlink(2) entry point: do_symlinkat() with AT_FDCWD as the directory fd. */
4234 SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newname)
4235 {
4236         return do_symlinkat(oldname, AT_FDCWD, newname);
4237 }
4238
4239 /**
4240  * vfs_link - create a new link
4241  * @old_dentry: object to be linked
4242  * @mnt_userns: the user namespace of the mount
4243  * @dir:        new parent
4244  * @new_dentry: where to create the new link
4245  * @delegated_inode: returns inode needing a delegation break
4246  *
4247  * The caller must hold dir->i_mutex
4248  *
4249  * If vfs_link discovers a delegation on the to-be-linked file in need
4250  * of breaking, it will return -EWOULDBLOCK and return a reference to the
4251  * inode in delegated_inode.  The caller should then break the delegation
4252  * and retry.  Because breaking a delegation may take a long time, the
4253  * caller should drop the i_mutex before doing so.
4254  *
4255  * Alternatively, a caller may pass NULL for delegated_inode.  This may
4256  * be appropriate for callers that expect the underlying filesystem not
4257  * to be NFS exported.
4258  *
4259  * If the inode has been found through an idmapped mount the user namespace of
4260  * the vfsmount must be passed through @mnt_userns. This function will then take
4261  * care to map the inode according to @mnt_userns before checking permissions.
4262  * On non-idmapped mounts or if permission checking is to be performed on the
4263  * raw inode simply pass init_user_ns.
      *
      * Return: 0 on success.  -ENOENT if @old_dentry has no inode, or the inode
      * has a zero link count and is not I_LINKABLE; -EXDEV if @dir and the inode
      * are on different superblocks; -EPERM for an append-only or immutable
      * inode, an unmapped id, a directory, or a @dir without ->link; -EMLINK if
      * the superblock's s_max_links is set and already reached; otherwise the
      * error of whichever check or call failed.
4264  */
4265 int vfs_link(struct dentry *old_dentry, struct user_namespace *mnt_userns,
4266              struct inode *dir, struct dentry *new_dentry,
4267              struct inode **delegated_inode)
4268 {
4269         struct inode *inode = old_dentry->d_inode;
4270         unsigned max_links = dir->i_sb->s_max_links;
4271         int error;
4272
4273         if (!inode)
4274                 return -ENOENT;
4275
4276         error = may_create(mnt_userns, dir, new_dentry);
4277         if (error)
4278                 return error;
4279
             /* Source and destination must share a superblock. */
4280         if (dir->i_sb != inode->i_sb)
4281                 return -EXDEV;
4282
4283         /*
4284          * A link to an append-only or immutable file cannot be created.
4285          */
4286         if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
4287                 return -EPERM;
4288         /*
4289          * Updating the link count will likely cause i_uid and i_gid to
4290          * be written back improperly if their true value is unknown to
4291          * the vfs.
4292          */
4293         if (HAS_UNMAPPED_ID(mnt_userns, inode))
4294                 return -EPERM;
4295         if (!dir->i_op->link)
4296                 return -EPERM;
             /* Directories are never hard-linked through this path. */
4297         if (S_ISDIR(inode->i_mode))
4298                 return -EPERM;
4299
4300         error = security_inode_link(old_dentry, dir, new_dentry);
4301         if (error)
4302                 return error;
4303
             /* The link-count checks and ->link run with the source inode locked. */
4304         inode_lock(inode);
4305         /* Make sure we don't allow creating hardlink to an unlinked file */
4306         if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE))
4307                 error =  -ENOENT;
4308         else if (max_links && inode->i_nlink >= max_links)
4309                 error = -EMLINK;
4310         else {
4311                 error = try_break_deleg(inode, delegated_inode);
4312                 if (!error)
4313                         error = dir->i_op->link(old_dentry, dir, new_dentry);
4314         }
4315
             /* After a successful link, I_LINKABLE is cleared under i_lock. */
4316         if (!error && (inode->i_state & I_LINKABLE)) {
4317                 spin_lock(&inode->i_lock);
4318                 inode->i_state &= ~I_LINKABLE;
4319                 spin_unlock(&inode->i_lock);
4320         }
4321         inode_unlock(inode);
             /* The notification is sent after the inode lock has been dropped. */
4322         if (!error)
4323                 fsnotify_link(dir, inode, new_dentry);
4324         return error;
4325 }
4326 EXPORT_SYMBOL(vfs_link);
4327
4328 /*
4329  * Hardlinks are often used in delicate situations.  We avoid
4330  * security-related surprises by not following symlinks on the
4331  * newname.  --KAB
4332  *
4333  * We don't follow them on the oldname either to be compatible
4334  * with linux 2.0, and to avoid hard-linking to directories
4335  * and other special files.  --ADM
      *
      * do_linkat() is the common body of link(2) and linkat(2).  Only
      * AT_SYMLINK_FOLLOW and AT_EMPTY_PATH are accepted in @flags (-EINVAL
      * otherwise); AT_SYMLINK_FOLLOW turns on LOOKUP_FOLLOW for the old name.
      * Both paths must be on the same mount (-EXDEV otherwise).  Returns 0 or
      * a negative errno.
4336  */
4337 static int do_linkat(int olddfd, const char __user *oldname, int newdfd,
4338               const char __user *newname, int flags)
4339 {
4340         struct user_namespace *mnt_userns;
4341         struct dentry *new_dentry;
4342         struct path old_path, new_path;
4343         struct inode *delegated_inode = NULL;
4344         int how = 0;
4345         int error;
4346
4347         if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0)
4348                 return -EINVAL;
4349         /*
4350          * To use null names we require CAP_DAC_READ_SEARCH
4351          * This ensures that not everyone will be able to create
4352          * hardlink using the passed filedescriptor.
4353          */
4354         if (flags & AT_EMPTY_PATH) {
4355                 if (!capable(CAP_DAC_READ_SEARCH))
4356                         return -ENOENT;
4357                 how = LOOKUP_EMPTY;
4358         }
4359
4360         if (flags & AT_SYMLINK_FOLLOW)
4361                 how |= LOOKUP_FOLLOW;
4362 retry:
             /* Both retry paths below drop old_path before jumping back here. */
4363         error = user_path_at(olddfd, oldname, how, &old_path);
4364         if (error)
4365                 return error;
4366
             /* Only the LOOKUP_REVAL bit of 'how' is forwarded for the new name. */
4367         new_dentry = user_path_create(newdfd, newname, &new_path,
4368                                         (how & LOOKUP_REVAL));
4369         error = PTR_ERR(new_dentry);
4370         if (IS_ERR(new_dentry))
4371                 goto out;
4372
4373         error = -EXDEV;
4374         if (old_path.mnt != new_path.mnt)
4375                 goto out_dput;
4376         mnt_userns = mnt_user_ns(new_path.mnt);
4377         error = may_linkat(mnt_userns, &old_path);
4378         if (unlikely(error))
4379                 goto out_dput;
4380         error = security_path_link(old_path.dentry, &new_path, new_dentry);
4381         if (error)
4382                 goto out_dput;
4383         error = vfs_link(old_path.dentry, mnt_userns, new_path.dentry->d_inode,
4384                          new_dentry, &delegated_inode);
4385 out_dput:
             /* Pairs with the successful user_path_create() above. */
4386         done_path_create(&new_path, new_dentry);
4387         if (delegated_inode) {
                     /* vfs_link() reported a delegation; wait for the break, then start over. */
4388                 error = break_deleg_wait(&delegated_inode);
4389                 if (!error) {
4390                         path_put(&old_path);
4391                         goto retry;
4392                 }
4393         }
4394         if (retry_estale(error, how)) {
4395                 path_put(&old_path);
4396                 how |= LOOKUP_REVAL;
4397                 goto retry;
4398         }
4399 out:
4400         path_put(&old_path);
4401
4402         return error;
4403 }
4404
/* linkat(2) entry point: forwards all five arguments to do_linkat(). */
4405 SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
4406                 int, newdfd, const char __user *, newname, int, flags)
4407 {
4408         return do_linkat(olddfd, oldname, newdfd, newname, flags);
4409 }
4410
/* link(2) entry point: do_linkat() with AT_FDCWD for both names and no flags. */
4411 SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname)
4412 {
4413         return do_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0);
4414 }
4415
4416 /**
4417  * vfs_rename - rename a filesystem object
4418  * @rd:         pointer to &struct renamedata info
4419  *
4420  * The caller must hold multiple mutexes--see lock_rename().
4421  *
4422  * If vfs_rename discovers a delegation in need of breaking at either
4423  * the source or destination, it will return -EWOULDBLOCK and return a
4424  * reference to the inode in delegated_inode.  The caller should then
4425  * break the delegation and retry.  Because breaking a delegation may
4426  * take a long time, the caller should drop all locks before doing
4427  * so.
4428  *
4429  * Alternatively, a caller may pass NULL for delegated_inode.  This may
4430  * be appropriate for callers that expect the underlying filesystem not
4431  * to be NFS exported.
4432  *
4433  * The worst of all namespace operations - renaming directory. "Perverted"
4434  * doesn't even start to describe it. Somebody in UCB had a heck of a trip...
4435  * Problems:
4436  *
4437  *      a) we can get into loop creation.
4438  *      b) race potential - two innocent renames can create a loop together.
4439  *         That's where 4.4 screws up. Current fix: serialization on
4440  *         sb->s_vfs_rename_mutex. We might be more accurate, but that's another
4441  *         story.
4442  *      c) we have to lock _four_ objects - parents and victim (if it exists),
4443  *         and source (if it is not a directory).
4444  *         And that - after we got ->i_mutex on parents (until then we don't know
4445  *         whether the target exists).  Solution: try to be smart with locking
4446  *         order for inodes.  We rely on the fact that tree topology may change
4447  *         only under ->s_vfs_rename_mutex _and_ that parent of the object we
4448  *         move will be locked.  Thus we can rank directories by the tree
4449  *         (ancestors first) and rank all non-directories after them.
4450  *         That works since everybody except rename does "lock parent, lookup,
4451  *         lock child" and rename is under ->s_vfs_rename_mutex.
4452  *         HOWEVER, it relies on the assumption that any object with ->lookup()
4453  *         has no more than 1 dentry.  If "hybrid" objects will ever appear,
4454  *         we'd better make sure that there's no link(2) for them.
4455  *      d) conversion from fhandle to dentry may come in the wrong moment - when
4456  *         we are removing the target. Solution: we will have to grab ->i_mutex
4457  *         in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on
4458  *         ->i_mutex on parents, which works but leads to some truly excessive
4459  *         locking].
      *
      * Return: 0 on success, and also immediately when source and target are
      * the same inode.  -EPERM if the old directory has no ->rename, -EBUSY if
      * either dentry is a local mountpoint, -EMLINK if a cross-directory move
      * would exceed the superblock's s_max_links, otherwise the error of the
      * failing check or of ->rename itself.
4460  */
4461 int vfs_rename(struct renamedata *rd)
4462 {
4463         int error;
4464         struct inode *old_dir = rd->old_dir, *new_dir = rd->new_dir;
4465         struct dentry *old_dentry = rd->old_dentry;
4466         struct dentry *new_dentry = rd->new_dentry;
4467         struct inode **delegated_inode = rd->delegated_inode;
4468         unsigned int flags = rd->flags;
4469         bool is_dir = d_is_dir(old_dentry);
4470         struct inode *source = old_dentry->d_inode;
4471         struct inode *target = new_dentry->d_inode;
4472         bool new_is_dir = false;
4473         unsigned max_links = new_dir->i_sb->s_max_links;
4474         struct name_snapshot old_name;
4475
             /* Both names already refer to the same inode: nothing to do. */
4476         if (source == target)
4477                 return 0;
4478
4479         error = may_delete(rd->old_mnt_userns, old_dir, old_dentry, is_dir);
4480         if (error)
4481                 return error;
4482
             /*
              * No target: the new name must be creatable.  Existing target: it
              * must be deletable, checked against the source's type for a plain
              * rename and against its own type for an exchange.
              */
4483         if (!target) {
4484                 error = may_create(rd->new_mnt_userns, new_dir, new_dentry);
4485         } else {
4486                 new_is_dir = d_is_dir(new_dentry);
4487
4488                 if (!(flags & RENAME_EXCHANGE))
4489                         error = may_delete(rd->new_mnt_userns, new_dir,
4490                                            new_dentry, is_dir);
4491                 else
4492                         error = may_delete(rd->new_mnt_userns, new_dir,
4493                                            new_dentry, new_is_dir);
4494         }
4495         if (error)
4496                 return error;
4497
4498         if (!old_dir->i_op->rename)
4499                 return -EPERM;
4500
4501         /*
4502          * If we are going to change the parent - check write permissions,
4503          * we'll need to flip '..'.
4504          */
4505         if (new_dir != old_dir) {
4506                 if (is_dir) {
4507                         error = inode_permission(rd->old_mnt_userns, source,
4508                                                  MAY_WRITE);
4509                         if (error)
4510                                 return error;
4511                 }
4512                 if ((flags & RENAME_EXCHANGE) && new_is_dir) {
4513                         error = inode_permission(rd->new_mnt_userns, target,
4514                                                  MAY_WRITE);
4515                         if (error)
4516                                 return error;
4517                 }
4518         }
4519
4520         error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry,
4521                                       flags);
4522         if (error)
4523                 return error;
4524
             /*
              * From here on every exit goes through 'out', which undoes the
              * dget() and the inode locks and releases the name snapshot.
              */
4525         take_dentry_name_snapshot(&old_name, old_dentry);
4526         dget(new_dentry);
             /*
              * Non-directory source, or an exchange: lock source and target
              * together.  Directory source in a plain rename: lock only an
              * existing target.
              */
4527         if (!is_dir || (flags & RENAME_EXCHANGE))
4528                 lock_two_nondirectories(source, target);
4529         else if (target)
4530                 inode_lock(target);
4531
4532         error = -EBUSY;
4533         if (is_local_mountpoint(old_dentry) || is_local_mountpoint(new_dentry))
4534                 goto out;
4535
             /* Link-count limit matters only when a directory changes parent. */
4536         if (max_links && new_dir != old_dir) {
4537                 error = -EMLINK;
4538                 if (is_dir && !new_is_dir && new_dir->i_nlink >= max_links)
4539                         goto out;
4540                 if ((flags & RENAME_EXCHANGE) && !is_dir && new_is_dir &&
4541                     old_dir->i_nlink >= max_links)
4542                         goto out;
4543         }
             /* Delegations are broken on non-directories only. */
4544         if (!is_dir) {
4545                 error = try_break_deleg(source, delegated_inode);
4546                 if (error)
4547                         goto out;
4548         }
4549         if (target && !new_is_dir) {
4550                 error = try_break_deleg(target, delegated_inode);
4551                 if (error)
4552                         goto out;
4553         }
4554         error = old_dir->i_op->rename(rd->new_mnt_userns, old_dir, old_dentry,
4555                                       new_dir, new_dentry, flags);
4556         if (error)
4557                 goto out;
4558
             /*
              * Plain rename over an existing target: a replaced directory is
              * shrunk and marked S_DEAD, and mounts on the replaced dentry are
              * detached.
              */
4559         if (!(flags & RENAME_EXCHANGE) && target) {
4560                 if (is_dir) {
4561                         shrink_dcache_parent(new_dentry);
4562                         target->i_flags |= S_DEAD;
4563                 }
4564                 dont_mount(new_dentry);
4565                 detach_mounts(new_dentry);
4566         }
             /*
              * NOTE(review): filesystems flagged FS_RENAME_DOES_D_MOVE presumably
              * update the dcache themselves inside ->rename -- confirm.
              */
4567         if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) {
4568                 if (!(flags & RENAME_EXCHANGE))
4569                         d_move(old_dentry, new_dentry);
4570                 else
4571                         d_exchange(old_dentry, new_dentry);
4572         }
4573 out:
             /* Unlock with the same condition that was used to lock above. */
4574         if (!is_dir || (flags & RENAME_EXCHANGE))
4575                 unlock_two_nondirectories(source, target);
4576         else if (target)
4577                 inode_unlock(target);
4578         dput(new_dentry);
             /*
              * Notifications use the name snapshot taken before ->rename; an
              * exchange sends a second event for the opposite direction.
              */
4579         if (!error) {
4580                 fsnotify_move(old_dir, new_dir, &old_name.name, is_dir,
4581                               !(flags & RENAME_EXCHANGE) ? target : NULL, old_dentry);
4582                 if (flags & RENAME_EXCHANGE) {
4583                         fsnotify_move(new_dir, old_dir, &old_dentry->d_name,
4584                                       new_is_dir, NULL, new_dentry);
4585                 }
4586         }
4587         release_dentry_name_snapshot(&old_name);
4588
4589         return error;
4590 }
4591 EXPORT_SYMBOL(vfs_rename);
4592
/*
 * do_renameat2 - common body of rename(2), renameat(2) and renameat2(2)
 * @olddfd: directory fd the source name is resolved against
 * @from:   source name, as returned by getname()
 * @newdfd: directory fd the destination name is resolved against
 * @to:     destination name, as returned by getname()
 * @flags:  RENAME_NOREPLACE, RENAME_EXCHANGE and/or RENAME_WHITEOUT; any
 *          other bit, or RENAME_EXCHANGE combined with either of the other
 *          two, is rejected with -EINVAL
 *
 * Resolves both parents, takes lock_rename() on them, looks up both last
 * components, applies the trailing-slash and ancestor checks and finally
 * calls vfs_rename().  The lookup is redone after a successful delegation
 * break, and the whole walk is redone with LOOKUP_REVAL when retry_estale()
 * asks for it.
 *
 * @from and @to may be error pointers on entry (the syscall wrappers in this
 * file pass getname() results in unchecked) and are overwritten with an error
 * pointer when filename_parentat() fails, so putname() is applied only to
 * names that are not IS_ERR().
 * NOTE(review): this assumes filename_parentat() has already disposed of a
 * name it returns an error pointer for, as do_unlinkat() also relies on --
 * confirm.
 *
 * Returns 0 or a negative errno.
 */
4593 int do_renameat2(int olddfd, struct filename *from, int newdfd,
4594                  struct filename *to, unsigned int flags)
4595 {
4596         struct renamedata rd;
4597         struct dentry *old_dentry, *new_dentry;
4598         struct dentry *trap;
4599         struct path old_path, new_path;
4600         struct qstr old_last, new_last;
4601         int old_type, new_type;
4602         struct inode *delegated_inode = NULL;
4603         unsigned int lookup_flags = 0, target_flags = LOOKUP_RENAME_TARGET;
4604         bool should_retry = false;
4605         int error = -EINVAL;
4606
             /* Neither name has been validated yet on these two exits. */
4607         if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
4608                 goto put_both;
4609
4610         if ((flags & (RENAME_NOREPLACE | RENAME_WHITEOUT)) &&
4611             (flags & RENAME_EXCHANGE))
4612                 goto put_both;
4613
             /* An exchange looks the target up without LOOKUP_RENAME_TARGET. */
4614         if (flags & RENAME_EXCHANGE)
4615                 target_flags = 0;
4616
4617 retry:
4618         from = filename_parentat(olddfd, from, lookup_flags, &old_path,
4619                                         &old_last, &old_type);
4620         if (IS_ERR(from)) {
4621                 error = PTR_ERR(from);
                     /* 'from' is an error pointer now; only 'to' may need releasing. */
4622                 goto put_new;
4623         }
4624
4625         to = filename_parentat(newdfd, to, lookup_flags, &new_path, &new_last,
4626                                 &new_type);
4627         if (IS_ERR(to)) {
4628                 error = PTR_ERR(to);
4629                 goto exit1;
4630         }
4631
4632         error = -EXDEV;
4633         if (old_path.mnt != new_path.mnt)
4634                 goto exit2;
4635
4636         error = -EBUSY;
4637         if (old_type != LAST_NORM)
4638                 goto exit2;
4639
             /* A non-LAST_NORM target gives -EEXIST under NOREPLACE, -EBUSY otherwise. */
4640         if (flags & RENAME_NOREPLACE)
4641                 error = -EEXIST;
4642         if (new_type != LAST_NORM)
4643                 goto exit2;
4644
4645         error = mnt_want_write(old_path.mnt);
4646         if (error)
4647                 goto exit2;
4648
4649 retry_deleg:
             /* Re-entered after a successful break_deleg_wait(); both lookups are redone. */
4650         trap = lock_rename(new_path.dentry, old_path.dentry);
4651
4652         old_dentry = __lookup_hash(&old_last, old_path.dentry, lookup_flags);
4653         error = PTR_ERR(old_dentry);
4654         if (IS_ERR(old_dentry))
4655                 goto exit3;
4656         /* source must exist */
4657         error = -ENOENT;
4658         if (d_is_negative(old_dentry))
4659                 goto exit4;
4660         new_dentry = __lookup_hash(&new_last, new_path.dentry, lookup_flags | target_flags);
4661         error = PTR_ERR(new_dentry);
4662         if (IS_ERR(new_dentry))
4663                 goto exit4;
4664         error = -EEXIST;
4665         if ((flags & RENAME_NOREPLACE) && d_is_positive(new_dentry))
4666                 goto exit5;
             /* An exchange needs an existing target; a non-directory one may not end in '/'. */
4667         if (flags & RENAME_EXCHANGE) {
4668                 error = -ENOENT;
4669                 if (d_is_negative(new_dentry))
4670                         goto exit5;
4671
4672                 if (!d_is_dir(new_dentry)) {
4673                         error = -ENOTDIR;
4674                         if (new_last.name[new_last.len])
4675                                 goto exit5;
4676                 }
4677         }
4678         /* unless the source is a directory trailing slashes give -ENOTDIR */
4679         if (!d_is_dir(old_dentry)) {
4680                 error = -ENOTDIR;
4681                 if (old_last.name[old_last.len])
4682                         goto exit5;
4683                 if (!(flags & RENAME_EXCHANGE) && new_last.name[new_last.len])
4684                         goto exit5;
4685         }
4686         /* source should not be ancestor of target */
4687         error = -EINVAL;
4688         if (old_dentry == trap)
4689                 goto exit5;
4690         /* target should not be an ancestor of source */
4691         if (!(flags & RENAME_EXCHANGE))
4692                 error = -ENOTEMPTY;
4693         if (new_dentry == trap)
4694                 goto exit5;
4695
4696         error = security_path_rename(&old_path, old_dentry,
4697                                      &new_path, new_dentry, flags);
4698         if (error)
4699                 goto exit5;
4700
4701         rd.old_dir         = old_path.dentry->d_inode;
4702         rd.old_dentry      = old_dentry;
4703         rd.old_mnt_userns  = mnt_user_ns(old_path.mnt);
4704         rd.new_dir         = new_path.dentry->d_inode;
4705         rd.new_dentry      = new_dentry;
4706         rd.new_mnt_userns  = mnt_user_ns(new_path.mnt);
4707         rd.delegated_inode = &delegated_inode;
4708         rd.flags           = flags;
4709         error = vfs_rename(&rd);
4710 exit5:
4711         dput(new_dentry);
4712 exit4:
4713         dput(old_dentry);
4714 exit3:
4715         unlock_rename(new_path.dentry, old_path.dentry);
4716         if (delegated_inode) {
                     /* All rename locks are dropped here; wait for the break, then redo the lookups. */
4717                 error = break_deleg_wait(&delegated_inode);
4718                 if (!error)
4719                         goto retry_deleg;
4720         }
4721         mnt_drop_write(old_path.mnt);
4722 exit2:
             /* Remember the decision; both paths are put before jumping back. */
4723         if (retry_estale(error, lookup_flags))
4724                 should_retry = true;
4725         path_put(&new_path);
4726 exit1:
4727         path_put(&old_path);
4728         if (should_retry) {
4729                 should_retry = false;
4730                 lookup_flags |= LOOKUP_REVAL;
4731                 goto retry;
4732         }
4733 put_both:
             /*
              * Either name can be an error pointer at this point: on the flag
              * checks above nothing was validated, and 'to' is one after a
              * failed filename_parentat().  Never hand those to putname().
              */
             if (!IS_ERR(from))
4734                 putname(from);
4735 put_new:
             if (!IS_ERR(to))
4736                 putname(to);
4737         return error;
4738 }
4739
/*
 * renameat2(2) entry point: both names go through getname() and are passed,
 * unchecked, to do_renameat2() together with @flags.
 */
4740 SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname,
4741                 int, newdfd, const char __user *, newname, unsigned int, flags)
4742 {
4743         return do_renameat2(olddfd, getname(oldname), newdfd, getname(newname),
4744                                 flags);
4745 }
4746
/* renameat(2) entry point: do_renameat2() with no flags. */
4747 SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
4748                 int, newdfd, const char __user *, newname)
4749 {
4750         return do_renameat2(olddfd, getname(oldname), newdfd, getname(newname),
4751                                 0);
4752 }
4753
/* rename(2) entry point: do_renameat2() with AT_FDCWD for both names and no flags. */
4754 SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname)
4755 {
4756         return do_renameat2(AT_FDCWD, getname(oldname), AT_FDCWD,
4757                                 getname(newname), 0);
4758 }
4759
/*
 * readlink_copy - copy a link body to a user buffer
 *
 * Returns the number of bytes copied, the errno carried by @link when it is
 * an error pointer, or -EFAULT when copy_to_user() reports a failure.
 */
4760 int readlink_copy(char __user *buffer, int buflen, const char *link)
4761 {
4762         int nbytes;
4763
4764         /* An error pointer instead of a link body: hand back its errno. */
4765         if (IS_ERR(link))
4766                 return PTR_ERR(link);
4767
4768         /* Clamp to buflen (compared as unsigned); the NUL is never copied. */
4769         nbytes = strlen(link);
4770         if (nbytes > (unsigned) buflen)
4771                 nbytes = buflen;
4772         return copy_to_user(buffer, link, nbytes) ? -EFAULT : nbytes;
4773 }
4774
4775 /**
4776  * vfs_readlink - copy symlink body into userspace buffer
4777  * @dentry: dentry on which to get symbolic link
4778  * @buffer: user memory pointer
4779  * @buflen: size of buffer
4780  *
4781  * Does not touch atime.  That's up to the caller if necessary
4782  *
4783  * Does not call security hook.
      *
      * Return: the result of the inode's own ->readlink when it has one and
      * IOP_DEFAULT_READLINK is not yet set; -EINVAL if @dentry is not a
      * symlink; the error from ->get_link; otherwise whatever readlink_copy()
      * returns.
4784  */
4785 int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen)
4786 {
4787         struct inode *inode = d_inode(dentry);
4788         DEFINE_DELAYED_CALL(done);
4789         const char *link;
4790         int res;
4791
             /*
              * First pass for this inode: prefer a filesystem-specific
              * ->readlink, otherwise record in i_opflags (under i_lock) that
              * the generic path below applies, so later calls skip this block.
              */
4792         if (unlikely(!(inode->i_opflags & IOP_DEFAULT_READLINK))) {
4793                 if (unlikely(inode->i_op->readlink))
4794                         return inode->i_op->readlink(dentry, buffer, buflen);
4795
4796                 if (!d_is_symlink(dentry))
4797                         return -EINVAL;
4798
4799                 spin_lock(&inode->i_lock);
4800                 inode->i_opflags |= IOP_DEFAULT_READLINK;
4801                 spin_unlock(&inode->i_lock);
4802         }
4803
             /* Use the body stored in inode->i_link when set, else ask ->get_link. */
4804         link = READ_ONCE(inode->i_link);
4805         if (!link) {
4806                 link = inode->i_op->get_link(dentry, inode, &done);
4807                 if (IS_ERR(link))
4808                         return PTR_ERR(link);
4809         }
4810         res = readlink_copy(buffer, buflen, link);
             /* 'done' is run only after the body has been copied out. */
4811         do_delayed_call(&done);
4812         return res;
4813 }
4814 EXPORT_SYMBOL(vfs_readlink);
4815
4816 /**
4817  * vfs_get_link - fetch the body of a symlink
4818  * @dentry: dentry of the symbolic link
4819  * @done: delayed call the caller runs to release the returned data
4820  *
4821  * Gives ERR_PTR(-EINVAL) unless @dentry is a symlink.  Otherwise asks the
4822  * security hook and, if that passes, returns what i_op->get_link() returns.
4823  *
4824  * atime is not touched; that is left to the caller.  "Special" symlinks
4825  * like /proc/$$/fd/N are not handled.
4826  */
4827 const char *vfs_get_link(struct dentry *dentry, struct delayed_call *done)
4828 {
4829         struct inode *inode = d_inode(dentry);
4830         int err;
4831
4832         if (!d_is_symlink(dentry))
4833                 return ERR_PTR(-EINVAL);
4834         err = security_inode_readlink(dentry);
4835         if (err)
4836                 return ERR_PTR(err);
4837         return inode->i_op->get_link(dentry, inode, done);
4838 }
4839 EXPORT_SYMBOL(vfs_get_link);
4840
4841 /*
      * get the link contents into pagecache
      *
      * With a NULL @dentry only a page 0 that is already present and up to
      * date is used (find_get_page()); otherwise ERR_PTR(-ECHILD) is returned.
      * With a dentry the page is obtained through read_mapping_page() and its
      * error pointer, if any, is returned as is.
      * NOTE(review): a NULL dentry presumably marks a lookup that must not
      * block -- confirm against the callers.
      *
      * On success page_put_link() is registered in @callback with the page as
      * its argument, and the page's kernel address is returned.
      */
4842 const char *page_get_link(struct dentry *dentry, struct inode *inode,
4843                           struct delayed_call *callback)
4844 {
4845         char *kaddr;
4846         struct page *page;
4847         struct address_space *mapping = inode->i_mapping;
4848
4849         if (!dentry) {
4850                 page = find_get_page(mapping, 0);
4851                 if (!page)
4852                         return ERR_PTR(-ECHILD);
                     /* Present but not up to date: drop the reference just taken. */
4853                 if (!PageUptodate(page)) {
4854                         put_page(page);
4855                         return ERR_PTR(-ECHILD);
4856                 }
4857         } else {
4858                 page = read_mapping_page(mapping, 0, NULL);
4859                 if (IS_ERR(page))
4860                         return (char*)page;
4861         }
4862         set_delayed_call(callback, page_put_link, page);
             /* page_address() is used directly below, so highmem mappings are a bug here. */
4863         BUG_ON(mapping_gfp_mask(mapping) & __GFP_HIGHMEM);
4864         kaddr = page_address(page);
             /* NOTE(review): presumably NUL-terminates the text at i_size, capped at PAGE_SIZE - 1 -- confirm. */
4865         nd_terminate_link(kaddr, inode->i_size, PAGE_SIZE - 1);
4866         return kaddr;
4867 }
4868
4869 EXPORT_SYMBOL(page_get_link);
4870
/*
 * Delayed-call handler installed by page_get_link(): @arg is the page
 * whose reference is released here.
 */
void page_put_link(void *arg)
{
	struct page *page = arg;

	put_page(page);
}
EXPORT_SYMBOL(page_put_link);
4876
4877 int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
4878 {
4879         DEFINE_DELAYED_CALL(done);
4880         int res = readlink_copy(buffer, buflen,
4881                                 page_get_link(dentry, d_inode(dentry),
4882                                               &done));
4883         do_delayed_call(&done);
4884         return res;
4885 }
4886 EXPORT_SYMBOL(page_readlink);
4887
4888 /*
4889  * The nofs argument instructs pagecache_write_begin to pass AOP_FLAG_NOFS
4890  */
4891 int __page_symlink(struct inode *inode, const char *symname, int len, int nofs)
4892 {
4893         struct address_space *mapping = inode->i_mapping;
4894         struct page *page;
4895         void *fsdata;
4896         int err;
4897         unsigned int flags = 0;
4898         if (nofs)
4899                 flags |= AOP_FLAG_NOFS;
4900
4901 retry:
4902         err = pagecache_write_begin(NULL, mapping, 0, len-1,
4903                                 flags, &page, &fsdata);
4904         if (err)
4905                 goto fail;
4906
4907         memcpy(page_address(page), symname, len-1);
4908
4909         err = pagecache_write_end(NULL, mapping, 0, len-1, len-1,
4910                                                         page, fsdata);
4911         if (err < 0)
4912                 goto fail;
4913         if (err < len-1)
4914                 goto retry;
4915
4916         mark_inode_dirty(inode);
4917         return 0;
4918 fail:
4919         return err;
4920 }
4921 EXPORT_SYMBOL(__page_symlink);
4922
4923 int page_symlink(struct inode *inode, const char *symname, int len)
4924 {
4925         return __page_symlink(inode, symname, len,
4926                         !mapping_gfp_constraint(inode->i_mapping, __GFP_FS));
4927 }
4928 EXPORT_SYMBOL(page_symlink);
4929
4930 const struct inode_operations page_symlink_inode_operations = {
4931         .get_link       = page_get_link,
4932 };
4933 EXPORT_SYMBOL(page_symlink_inode_operations);