Merge branch 'work.init' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs

author Linus Torvalds <torvalds@linux-foundation.org>

Thu, 9 Sep 2021 19:38:18 +0000 (12:38 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Thu, 9 Sep 2021 19:38:18 +0000 (12:38 -0700)
author Linus Torvalds <torvalds@linux-foundation.org>
Thu, 9 Sep 2021 19:38:18 +0000 (12:38 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Thu, 9 Sep 2021 19:38:18 +0000 (12:38 -0700)
diff --combined include/linux/fs.h

index 37ad9a7,c76dfc0..29b3550
--- 1/include/linux/fs.h
--- 2/include/linux/fs.h
+++ b/include/linux/fs.h
@@@ -319,8 -319,6 +319,8 @@@ enum rw_hint 
   /* iocb->ki_waitq is valid */
   #define IOCB_WAITQ            (1 << 19)
   #define IOCB_NOIO             (1 << 20)
+ +/* can use bio alloc cache */
+ +#define IOCB_ALLOC_CACHE      (1 << 21)
   
   struct kiocb {
         struct file             *ki_filp;
@@@ -438,10 -436,6 +438,10 @@@ int pagecache_write_end(struct file *, 
    * struct address_space - Contents of a cacheable, mappable object.
    * @host: Owner, either the inode or the block_device.
    * @i_pages: Cached pages.
+ + * @invalidate_lock: Guards coherency between page cache contents and
+ + *   file offset->disk block mappings in the filesystem during invalidates.
+ + *   It is also used to block modification of page cache contents through
+ + *   memory mappings.
    * @gfp_mask: Memory allocation flags to use for allocating pages.
    * @i_mmap_writable: Number of VM_SHARED mappings.
    * @nr_thps: Number of THPs in the pagecache (non-shmem only).
@@@ -459,7 -453,6 +459,7 @@@
   struct address_space {
         struct inode            *host;
         struct xarray           i_pages;
+ +      struct rw_semaphore     invalidate_lock;
         gfp_t                   gfp_mask;
         atomic_t                i_mmap_writable;
   #ifdef CONFIG_READ_ONLY_THP_FOR_FS
@@@ -588,11 -581,6 +588,11 @@@ static inline void mapping_allow_writab
   
   struct posix_acl;
   #define ACL_NOT_CACHED ((void *)(-1))
+ +/*
+ + * ACL_DONT_CACHE is for stacked filesystems, that rely on underlying fs to
+ + * cache the ACL.  This also means that ->get_acl() can be called in RCU mode
+ + * with the LOOKUP_RCU flag.
+ + */
   #define ACL_DONT_CACHE ((void *)(-3))
   
   static inline struct posix_acl *
@@@ -826,42 -814,9 +826,42 @@@ static inline void inode_lock_shared_ne
         down_read_nested(&inode->i_rwsem, subclass);
   }
   
+ +static inline void filemap_invalidate_lock(struct address_space *mapping)
+ +{
+ +      down_write(&mapping->invalidate_lock);
+ +}
+ +
+ +static inline void filemap_invalidate_unlock(struct address_space *mapping)
+ +{
+ +      up_write(&mapping->invalidate_lock);
+ +}
+ +
+ +static inline void filemap_invalidate_lock_shared(struct address_space *mapping)
+ +{
+ +      down_read(&mapping->invalidate_lock);
+ +}
+ +
+ +static inline int filemap_invalidate_trylock_shared(
+ +                                      struct address_space *mapping)
+ +{
+ +      return down_read_trylock(&mapping->invalidate_lock);
+ +}
+ +
+ +static inline void filemap_invalidate_unlock_shared(
+ +                                      struct address_space *mapping)
+ +{
+ +      up_read(&mapping->invalidate_lock);
+ +}
+ +
   void lock_two_nondirectories(struct inode *, struct inode*);
   void unlock_two_nondirectories(struct inode *, struct inode*);
   
+ +void filemap_invalidate_lock_two(struct address_space *mapping1,
+ +                               struct address_space *mapping2);
+ +void filemap_invalidate_unlock_two(struct address_space *mapping1,
+ +                                 struct address_space *mapping2);
+ +
+ +
   /*
    * NOTE: in a 32bit arch with a preemptable kernel and
    * an UP compile the i_size_read/write must be atomic
@@@ -1042,7 -997,6 +1042,7 @@@ static inline struct file *get_file(str
   #define FL_UNLOCK_PENDING     512 /* Lease is being broken */
   #define FL_OFDLCK     1024    /* lock is "owned" by struct file */
   #define FL_LAYOUT     2048    /* outstanding pNFS layout */
+ +#define FL_RECLAIM    4096    /* reclaiming from a reboot server */
   
   #define FL_CLOSE_POSIX (FL_POSIX | FL_CLOSE)
   
@@@ -1553,11 -1507,8 +1553,11 @@@ struct super_block 
         /* Number of inodes with nlink == 0 but still referenced */
         atomic_long_t s_remove_count;
   
- -      /* Pending fsnotify inode refs */
- -      atomic_long_t s_fsnotify_inode_refs;
+ +      /*
+ +       * Number of inode/mount/sb objects that are being watched, note that
+ +       * inodes objects are currently double-accounted.
+ +       */
+ +      atomic_long_t s_fsnotify_connectors;
   
         /* Being remounted read-only */
         int s_readonly_remount;
@@@ -2114,7 -2065,7 +2114,7 @@@ struct inode_operations 
         struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int);
         const char * (*get_link) (struct dentry *, struct inode *, struct delayed_call *);
         int (*permission) (struct user_namespace *, struct inode *, int);
- -      struct posix_acl * (*get_acl)(struct inode *, int);
+ +      struct posix_acl * (*get_acl)(struct inode *, int, bool);
   
         int (*readlink) (struct dentry *, char __user *,int);
   
@@@ -2506,6 -2457,7 +2506,6 @@@ static inline void file_accessed(struc
   
   extern int file_modified(struct file *file);
   
- -int sync_inode(struct inode *inode, struct writeback_control *wbc);
   int sync_inode_metadata(struct inode *inode, int wait);
   
   struct file_system_type {
@@@ -2535,7 -2487,6 +2535,7 @@@
   
         struct lock_class_key i_lock_key;
         struct lock_class_key i_mutex_key;
+ +      struct lock_class_key invalidate_lock_key;
         struct lock_class_key i_mutex_dir_key;
   };
   
@@@ -2619,6 -2570,90 +2619,6 @@@ extern struct kobject *fs_kobj
   
   #define MAX_RW_COUNT (INT_MAX & PAGE_MASK)
   
- -#ifdef CONFIG_MANDATORY_FILE_LOCKING
- -extern int locks_mandatory_locked(struct file *);
- -extern int locks_mandatory_area(struct inode *, struct file *, loff_t, loff_t, unsigned char);
- -
- -/*
- - * Candidates for mandatory locking have the setgid bit set
- - * but no group execute bit -  an otherwise meaningless combination.
- - */
- -
- -static inline int __mandatory_lock(struct inode *ino)
- -{
- -      return (ino->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID;
- -}
- -
- -/*
- - * ... and these candidates should be on SB_MANDLOCK mounted fs,
- - * otherwise these will be advisory locks
- - */
- -
- -static inline int mandatory_lock(struct inode *ino)
- -{
- -      return IS_MANDLOCK(ino) && __mandatory_lock(ino);
- -}
- -
- -static inline int locks_verify_locked(struct file *file)
- -{
- -      if (mandatory_lock(locks_inode(file)))
- -              return locks_mandatory_locked(file);
- -      return 0;
- -}
- -
- -static inline int locks_verify_truncate(struct inode *inode,
- -                                  struct file *f,
- -                                  loff_t size)
- -{
- -      if (!inode->i_flctx || !mandatory_lock(inode))
- -              return 0;
- -
- -      if (size < inode->i_size) {
- -              return locks_mandatory_area(inode, f, size, inode->i_size - 1,
- -                              F_WRLCK);
- -      } else {
- -              return locks_mandatory_area(inode, f, inode->i_size, size - 1,
- -                              F_WRLCK);
- -      }
- -}
- -
- -#else /* !CONFIG_MANDATORY_FILE_LOCKING */
- -
- -static inline int locks_mandatory_locked(struct file *file)
- -{
- -      return 0;
- -}
- -
- -static inline int locks_mandatory_area(struct inode *inode, struct file *filp,
- -                                       loff_t start, loff_t end, unsigned char type)
- -{
- -      return 0;
- -}
- -
- -static inline int __mandatory_lock(struct inode *inode)
- -{
- -      return 0;
- -}
- -
- -static inline int mandatory_lock(struct inode *inode)
- -{
- -      return 0;
- -}
- -
- -static inline int locks_verify_locked(struct file *file)
- -{
- -      return 0;
- -}
- -
- -static inline int locks_verify_truncate(struct inode *inode, struct file *filp,
- -                                      size_t size)
- -{
- -      return 0;
- -}
- -
- -#endif /* CONFIG_MANDATORY_FILE_LOCKING */
- -
- -
   #ifdef CONFIG_FILE_LOCKING
   static inline int break_lease(struct inode *inode, unsigned int mode)
   {
@@@ -2751,7 -2786,6 +2751,7 @@@ static inline struct file *file_clone_o
   extern int filp_close(struct file *, fl_owner_t id);
   
   extern struct filename *getname_flags(const char __user *, int, int *);
+ +extern struct filename *getname_uflags(const char __user *, int);
   extern struct filename *getname(const char __user *);
   extern struct filename *getname_kernel(const char *);
   extern void putname(struct filename *name);
@@@ -2857,8 -2891,6 +2857,8 @@@ extern int filemap_fdatawrite_range(str
                                 loff_t start, loff_t end);
   extern int filemap_check_errors(struct address_space *mapping);
   extern void __filemap_set_wb_err(struct address_space *mapping, int err);
+ +int filemap_fdatawrite_wbc(struct address_space *mapping,
+ +                         struct writeback_control *wbc);
   
   static inline int filemap_write_and_wait(struct address_space *mapping)
   {
@@@ -3023,20 -3055,15 +3023,20 @@@ static inline void file_end_write(struc
   }
   
   /*
+ + * This is used for regular files where some users -- especially the
+ + * currently executed binary in a process, previously handled via
+ + * VM_DENYWRITE -- cannot handle concurrent write (and maybe mmap
+ + * read-write shared) accesses.
+ + *
    * get_write_access() gets write permission for a file.
    * put_write_access() releases this write permission.
- - * This is used for regular files.
- - * We cannot support write (and maybe mmap read-write shared) accesses and
- - * MAP_DENYWRITE mmappings simultaneously. The i_writecount field of an inode
- - * can have the following values:
- - * 0: no writers, no VM_DENYWRITE mappings
- - * < 0: (-i_writecount) vm_area_structs with VM_DENYWRITE set exist
- - * > 0: (i_writecount) users are writing to the file.
+ + * deny_write_access() denies write access to a file.
+ + * allow_write_access() re-enables write access to a file.
+ + *
+ + * The i_writecount field of an inode can have the following values:
+ + * 0: no write access, no denied write access
+ + * < 0: (-i_writecount) users that denied write access to the file.
+ + * > 0: (i_writecount) users that have write access to the file.
    *
    * Normally we operate on that counter with atomic_{inc,dec} and it's safe
    * except for the cases where we don't hold i_writecount yet. Then we need to
@@@ -3219,6 -3246,10 +3219,6 @@@ ssize_t vfs_iocb_iter_read(struct file 
   ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb,
                             struct iov_iter *iter);
   
- -/* fs/block_dev.c */
- -extern int blkdev_fsync(struct file *filp, loff_t start, loff_t end,
- -                      int datasync);
- -
   /* fs/splice.c */
   extern ssize_t generic_file_splice_read(struct file *, loff_t *,
                 struct pipe_inode_info *, size_t, unsigned int);
@@@ -3324,7 -3355,6 +3324,7 @@@ extern int page_symlink(struct inode *i
   extern const struct inode_operations page_symlink_inode_operations;
   extern void kfree_link(void *);
   void generic_fillattr(struct user_namespace *, struct inode *, struct kstat *);
+ +void generic_fill_statx_attr(struct inode *inode, struct kstat *stat);
   extern int vfs_getattr_nosec(const struct path *, struct kstat *, u32, unsigned int);
   extern int vfs_getattr(const struct path *, struct kstat *, u32, unsigned int);
   void __inode_add_bytes(struct inode *inode, loff_t bytes);
@@@ -3592,7 -3622,7 +3592,7 @@@ int proc_nr_dentry(struct ctl_table *ta
                   void *buffer, size_t *lenp, loff_t *ppos);
   int proc_nr_inodes(struct ctl_table *table, int write,
                    void *buffer, size_t *lenp, loff_t *ppos);
- int __init get_filesystem_list(char *buf);
+ int __init list_bdev_fs_names(char *buf, size_t size);
   
   #define __FMODE_EXEC          ((__force int) FMODE_EXEC)
   #define __FMODE_NONOTIFY      ((__force int) FMODE_NONOTIFY)
diff --combined init/do_mounts.c

index b691d68,9b4a1f8..2ed30ff
--- 1/init/do_mounts.c
--- 2/init/do_mounts.c
+++ b/init/do_mounts.c
@@@ -338,31 -338,22 +338,22 @@@ __setup("rootflags=", root_data_setup)
   __setup("rootfstype=", fs_names_setup);
   __setup("rootdelay=", root_delay_setup);
   
- static void __init get_fs_names(char *page)
+ static int __init split_fs_names(char *page, char *names)
   {
-       char *s = page;
+       int count = 0;
+       char *p = page;
   
-       if (root_fs_names) {
-               strcpy(page, root_fs_names);
-               while (*s++) {
-                       if (s[-1] == ',')
-                               s[-1] = '\0';
-               }
-       } else {
-               int len = get_filesystem_list(page);
-               char *p, *next;
- 
-               page[len] = '\0';
-               for (p = page-1; p; p = next) {
-                       next = strchr(++p, '\n');
-                       if (*p++ != '\t')
-                               continue;
-                       while ((*s++ = *p++) != '\n')
-                               ;
-                       s[-1] = '\0';
-               }
+       strcpy(p, root_fs_names);
+       while (*p++) {
+               if (p[-1] == ',')
+                       p[-1] = '\0';
         }
-       *s = '\0';
+       *p = '\0';
+ 
+       for (p = page; *p; p += strlen(p)+1)
+               count++;
+ 
+       return count;
   }
   
   static int __init do_mount_root(const char *name, const char *fs,
@@@ -408,12 -399,16 +399,16 @@@ void __init mount_block_root(char *name
         char *fs_names = page_address(page);
         char *p;
         char b[BDEVNAME_SIZE];
+       int num_fs, i;
   
         scnprintf(b, BDEVNAME_SIZE, "unknown-block(%u,%u)",
                   MAJOR(ROOT_DEV), MINOR(ROOT_DEV));
-       get_fs_names(fs_names);
+       if (root_fs_names)
+               num_fs = split_fs_names(fs_names, root_fs_names);
+       else
+               num_fs = list_bdev_fs_names(fs_names, PAGE_SIZE);
   retry:
-       for (p = fs_names; *p; p += strlen(p)+1) {
+       for (i = 0, p = fs_names; i < num_fs; i++, p += strlen(p)+1) {
                 int err = do_mount_root(name, p, flags, root_mount_data);
                 switch (err) {
                         case 0:
@@@ -432,6 -427,10 +427,6 @@@
                 printk("Please append a correct \"root=\" boot option; here are the available partitions:\n");
   
                 printk_all_partitions();
- -#ifdef CONFIG_DEBUG_BLOCK_EXT_DEVT
- -              printk("DEBUG_BLOCK_EXT_DEVT is enabled, you need to specify "
- -                     "explicit textual name for \"root=\" boot option.\n");
- -#endif
                 panic("VFS: Unable to mount root fs on %s", b);
         }
         if (!(flags & SB_RDONLY)) {
@@@ -442,7 -441,7 +437,7 @@@
         printk("List of all partitions:\n");
         printk_all_partitions();
         printk("No filesystem could mount root, tried: ");
-       for (p = fs_names; *p; p += strlen(p)+1)
+       for (i = 0, p = fs_names; i < num_fs; i++, p += strlen(p)+1)
                 printk(" %s", p);
         printk("\n");
         panic("VFS: Unable to mount root fs on %s", b);
@@@ -526,6 -525,47 +521,47 @@@ static int __init mount_cifs_root(void
   }
   #endif
   
+ static bool __init fs_is_nodev(char *fstype)
+ {
+       struct file_system_type *fs = get_fs_type(fstype);
+       bool ret = false;
+ 
+       if (fs) {
+               ret = !(fs->fs_flags & FS_REQUIRES_DEV);
+               put_filesystem(fs);
+       }
+ 
+       return ret;
+ }
+ 
+ static int __init mount_nodev_root(void)
+ {
+       char *fs_names, *fstype;
+       int err = -EINVAL;
+       int num_fs, i;
+ 
+       fs_names = (void *)__get_free_page(GFP_KERNEL);
+       if (!fs_names)
+               return -EINVAL;
+       num_fs = split_fs_names(fs_names, root_fs_names);
+ 
+       for (i = 0, fstype = fs_names; i < num_fs;
+            i++, fstype += strlen(fstype) + 1) {
+               if (!fs_is_nodev(fstype))
+                       continue;
+               err = do_mount_root(root_device_name, fstype, root_mountflags,
+                                   root_mount_data);
+               if (!err)
+                       break;
+               if (err != -EACCES && err != -EINVAL)
+                       panic("VFS: Unable to mount root \"%s\" (%s), err=%d\n",
+                             root_device_name, fstype, err);
+       }
+ 
+       free_page((unsigned long)fs_names);
+       return err;
+ }
+ 
   void __init mount_root(void)
   {
   #ifdef CONFIG_ROOT_NFS
@@@ -542,6 -582,10 +578,10 @@@
                 return;
         }
   #endif
+       if (ROOT_DEV == 0 && root_device_name && root_fs_names) {
+               if (mount_nodev_root() == 0)
+                       return;
+       }
   #ifdef CONFIG_BLOCK
         {
                 int err = create_dev("/dev/root", ROOT_DEV);
author	Linus Torvalds <torvalds@linux-foundation.org>
	Thu, 9 Sep 2021 19:38:18 +0000 (12:38 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Thu, 9 Sep 2021 19:38:18 +0000 (12:38 -0700)
		1	2
include/linux/fs.h	patch \|	diff1 \|	diff2 \|	blob \| history
init/do_mounts.c	patch \|	diff1 \|	diff2 \|	blob \| history