Merge tag 'denywrite-for-5.15' of git://github.com/davidhildenbrand/linux

author Linus Torvalds <torvalds@linux-foundation.org>
Sat, 4 Sep 2021 18:35:47 +0000 (11:35 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Sat, 4 Sep 2021 18:35:47 +0000 (11:35 -0700)
diff --combined fs/exec.c

index 2dc489c,9294049..a098c13
--- 1/fs/exec.c
--- 2/fs/exec.c
+++ b/fs/exec.c
@@@ -217,10 -217,8 +217,10 @@@ static struct page *get_arg_page(struc
          * We are doing an exec().  'current' is the process
          * doing the exec and bprm->mm is the new process's mm.
          */
+ +      mmap_read_lock(bprm->mm);
         ret = get_user_pages_remote(bprm->mm, pos, 1, gup_flags,
                         &page, NULL, NULL);
+ +      mmap_read_unlock(bprm->mm);
         if (ret <= 0)
                 return NULL;
   
@@@ -576,7 -574,7 +576,7 @@@ static int copy_strings(int argc, struc
                                 }
   
                                 if (kmapped_page) {
- -                                      flush_kernel_dcache_page(kmapped_page);
+ +                                      flush_dcache_page(kmapped_page);
                                         kunmap(kmapped_page);
                                         put_arg_page(kmapped_page);
                                 }
@@@ -594,7 -592,7 +594,7 @@@
         ret = 0;
   out:
         if (kmapped_page) {
- -              flush_kernel_dcache_page(kmapped_page);
+ +              flush_dcache_page(kmapped_page);
                 kunmap(kmapped_page);
                 put_arg_page(kmapped_page);
         }
@@@ -636,7 -634,7 +636,7 @@@ int copy_string_kernel(const char *arg
                 kaddr = kmap_atomic(page);
                 flush_arg_page(bprm, pos & PAGE_MASK, page);
                 memcpy(kaddr + offset_in_page(pos), arg, bytes_to_copy);
- -              flush_kernel_dcache_page(page);
+ +              flush_dcache_page(page);
                 kunmap_atomic(kaddr);
                 put_arg_page(page);
         }
@@@ -1272,7 -1270,9 +1272,9 @@@ int begin_new_exec(struct linux_binprm 
          * not visible until then. This also enables the update
          * to be lockless.
          */
-       set_mm_exe_file(bprm->mm, bprm->file);
+       retval = set_mm_exe_file(bprm->mm, bprm->file);
+       if (retval)
+               goto out;
   
         /* If the binary is not readable then enforce mm->dumpable=0 */
         would_dump(bprm, bprm->file);
@@@ -2072,8 -2072,10 +2074,8 @@@ SYSCALL_DEFINE5(execveat
                 const char __user *const __user *, envp,
                 int, flags)
   {
- -      int lookup_flags = (flags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0;
- -
         return do_execveat(fd,
- -                         getname_flags(filename, lookup_flags, NULL),
+ +                         getname_uflags(filename, flags),
                            argv, envp, flags);
   }
   
@@@ -2091,8 -2093,10 +2093,8 @@@ COMPAT_SYSCALL_DEFINE5(execveat, int, f
                        const compat_uptr_t __user *, envp,
                        int,  flags)
   {
- -      int lookup_flags = (flags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0;
- -
         return compat_do_execveat(fd,
- -                                getname_flags(filename, lookup_flags, NULL),
+ +                                getname_uflags(filename, flags),
                                   argv, envp, flags);
   }
   #endif
diff --combined include/linux/fs.h

index a6074cd,e0dc3e9..37ad9a7
--- 1/include/linux/fs.h
--- 2/include/linux/fs.h
+++ b/include/linux/fs.h
@@@ -319,8 -319,6 +319,8 @@@ enum rw_hint 
   /* iocb->ki_waitq is valid */
   #define IOCB_WAITQ            (1 << 19)
   #define IOCB_NOIO             (1 << 20)
+ +/* can use bio alloc cache */
+ +#define IOCB_ALLOC_CACHE      (1 << 21)
   
   struct kiocb {
         struct file             *ki_filp;
@@@ -438,10 -436,6 +438,10 @@@ int pagecache_write_end(struct file *, 
    * struct address_space - Contents of a cacheable, mappable object.
    * @host: Owner, either the inode or the block_device.
    * @i_pages: Cached pages.
+ + * @invalidate_lock: Guards coherency between page cache contents and
+ + *   file offset->disk block mappings in the filesystem during invalidates.
+ + *   It is also used to block modification of page cache contents through
+ + *   memory mappings.
    * @gfp_mask: Memory allocation flags to use for allocating pages.
    * @i_mmap_writable: Number of VM_SHARED mappings.
    * @nr_thps: Number of THPs in the pagecache (non-shmem only).
@@@ -459,7 -453,6 +459,7 @@@
   struct address_space {
         struct inode            *host;
         struct xarray           i_pages;
+ +      struct rw_semaphore     invalidate_lock;
         gfp_t                   gfp_mask;
         atomic_t                i_mmap_writable;
   #ifdef CONFIG_READ_ONLY_THP_FOR_FS
@@@ -588,11 -581,6 +588,11 @@@ static inline void mapping_allow_writab
   
   struct posix_acl;
   #define ACL_NOT_CACHED ((void *)(-1))
+ +/*
+ + * ACL_DONT_CACHE is for stacked filesystems, that rely on underlying fs to
+ + * cache the ACL.  This also means that ->get_acl() can be called in RCU mode
+ + * with the LOOKUP_RCU flag.
+ + */
   #define ACL_DONT_CACHE ((void *)(-3))
   
   static inline struct posix_acl *
@@@ -826,42 -814,9 +826,42 @@@ static inline void inode_lock_shared_ne
         down_read_nested(&inode->i_rwsem, subclass);
   }
   
+ +static inline void filemap_invalidate_lock(struct address_space *mapping)
+ +{
+ +      down_write(&mapping->invalidate_lock);
+ +}
+ +
+ +static inline void filemap_invalidate_unlock(struct address_space *mapping)
+ +{
+ +      up_write(&mapping->invalidate_lock);
+ +}
+ +
+ +static inline void filemap_invalidate_lock_shared(struct address_space *mapping)
+ +{
+ +      down_read(&mapping->invalidate_lock);
+ +}
+ +
+ +static inline int filemap_invalidate_trylock_shared(
+ +                                      struct address_space *mapping)
+ +{
+ +      return down_read_trylock(&mapping->invalidate_lock);
+ +}
+ +
+ +static inline void filemap_invalidate_unlock_shared(
+ +                                      struct address_space *mapping)
+ +{
+ +      up_read(&mapping->invalidate_lock);
+ +}
+ +
   void lock_two_nondirectories(struct inode *, struct inode*);
   void unlock_two_nondirectories(struct inode *, struct inode*);
   
+ +void filemap_invalidate_lock_two(struct address_space *mapping1,
+ +                               struct address_space *mapping2);
+ +void filemap_invalidate_unlock_two(struct address_space *mapping1,
+ +                                 struct address_space *mapping2);
+ +
+ +
   /*
    * NOTE: in a 32bit arch with a preemptable kernel and
    * an UP compile the i_size_read/write must be atomic
@@@ -1042,7 -997,6 +1042,7 @@@ static inline struct file *get_file(str
   #define FL_UNLOCK_PENDING     512 /* Lease is being broken */
   #define FL_OFDLCK     1024    /* lock is "owned" by struct file */
   #define FL_LAYOUT     2048    /* outstanding pNFS layout */
+ +#define FL_RECLAIM    4096    /* reclaiming from a reboot server */
   
   #define FL_CLOSE_POSIX (FL_POSIX | FL_CLOSE)
   
@@@ -1553,11 -1507,8 +1553,11 @@@ struct super_block 
         /* Number of inodes with nlink == 0 but still referenced */
         atomic_long_t s_remove_count;
   
- -      /* Pending fsnotify inode refs */
- -      atomic_long_t s_fsnotify_inode_refs;
+ +      /*
+ +       * Number of inode/mount/sb objects that are being watched. Note that
+ +       * inode objects are currently double-accounted.
+ +       */
+ +      atomic_long_t s_fsnotify_connectors;
   
         /* Being remounted read-only */
         int s_readonly_remount;
@@@ -2114,7 -2065,7 +2114,7 @@@ struct inode_operations 
         struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int);
         const char * (*get_link) (struct dentry *, struct inode *, struct delayed_call *);
         int (*permission) (struct user_namespace *, struct inode *, int);
- -      struct posix_acl * (*get_acl)(struct inode *, int);
+ +      struct posix_acl * (*get_acl)(struct inode *, int, bool);
   
         int (*readlink) (struct dentry *, char __user *,int);
   
@@@ -2506,6 -2457,7 +2506,6 @@@ static inline void file_accessed(struc
   
   extern int file_modified(struct file *file);
   
- -int sync_inode(struct inode *inode, struct writeback_control *wbc);
   int sync_inode_metadata(struct inode *inode, int wait);
   
   struct file_system_type {
@@@ -2535,7 -2487,6 +2535,7 @@@
   
         struct lock_class_key i_lock_key;
         struct lock_class_key i_mutex_key;
+ +      struct lock_class_key invalidate_lock_key;
         struct lock_class_key i_mutex_dir_key;
   };
   
@@@ -2619,6 -2570,90 +2619,6 @@@ extern struct kobject *fs_kobj
   
   #define MAX_RW_COUNT (INT_MAX & PAGE_MASK)
   
- -#ifdef CONFIG_MANDATORY_FILE_LOCKING
- -extern int locks_mandatory_locked(struct file *);
- -extern int locks_mandatory_area(struct inode *, struct file *, loff_t, loff_t, unsigned char);
- -
- -/*
- - * Candidates for mandatory locking have the setgid bit set
- - * but no group execute bit -  an otherwise meaningless combination.
- - */
- -
- -static inline int __mandatory_lock(struct inode *ino)
- -{
- -      return (ino->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID;
- -}
- -
- -/*
- - * ... and these candidates should be on SB_MANDLOCK mounted fs,
- - * otherwise these will be advisory locks
- - */
- -
- -static inline int mandatory_lock(struct inode *ino)
- -{
- -      return IS_MANDLOCK(ino) && __mandatory_lock(ino);
- -}
- -
- -static inline int locks_verify_locked(struct file *file)
- -{
- -      if (mandatory_lock(locks_inode(file)))
- -              return locks_mandatory_locked(file);
- -      return 0;
- -}
- -
- -static inline int locks_verify_truncate(struct inode *inode,
- -                                  struct file *f,
- -                                  loff_t size)
- -{
- -      if (!inode->i_flctx || !mandatory_lock(inode))
- -              return 0;
- -
- -      if (size < inode->i_size) {
- -              return locks_mandatory_area(inode, f, size, inode->i_size - 1,
- -                              F_WRLCK);
- -      } else {
- -              return locks_mandatory_area(inode, f, inode->i_size, size - 1,
- -                              F_WRLCK);
- -      }
- -}
- -
- -#else /* !CONFIG_MANDATORY_FILE_LOCKING */
- -
- -static inline int locks_mandatory_locked(struct file *file)
- -{
- -      return 0;
- -}
- -
- -static inline int locks_mandatory_area(struct inode *inode, struct file *filp,
- -                                       loff_t start, loff_t end, unsigned char type)
- -{
- -      return 0;
- -}
- -
- -static inline int __mandatory_lock(struct inode *inode)
- -{
- -      return 0;
- -}
- -
- -static inline int mandatory_lock(struct inode *inode)
- -{
- -      return 0;
- -}
- -
- -static inline int locks_verify_locked(struct file *file)
- -{
- -      return 0;
- -}
- -
- -static inline int locks_verify_truncate(struct inode *inode, struct file *filp,
- -                                      size_t size)
- -{
- -      return 0;
- -}
- -
- -#endif /* CONFIG_MANDATORY_FILE_LOCKING */
- -
- -
   #ifdef CONFIG_FILE_LOCKING
   static inline int break_lease(struct inode *inode, unsigned int mode)
   {
@@@ -2751,7 -2786,6 +2751,7 @@@ static inline struct file *file_clone_o
   extern int filp_close(struct file *, fl_owner_t id);
   
   extern struct filename *getname_flags(const char __user *, int, int *);
+ +extern struct filename *getname_uflags(const char __user *, int);
   extern struct filename *getname(const char __user *);
   extern struct filename *getname_kernel(const char *);
   extern void putname(struct filename *name);
@@@ -2857,8 -2891,6 +2857,8 @@@ extern int filemap_fdatawrite_range(str
                                 loff_t start, loff_t end);
   extern int filemap_check_errors(struct address_space *mapping);
   extern void __filemap_set_wb_err(struct address_space *mapping, int err);
+ +int filemap_fdatawrite_wbc(struct address_space *mapping,
+ +                         struct writeback_control *wbc);
   
   static inline int filemap_write_and_wait(struct address_space *mapping)
   {
@@@ -3023,15 -3055,20 +3023,20 @@@ static inline void file_end_write(struc
   }
   
   /*
+  * This is used for regular files where some users -- especially the
+  * currently executed binary in a process, previously handled via
+  * VM_DENYWRITE -- cannot handle concurrent write (and maybe mmap
+  * read-write shared) accesses.
+  *
    * get_write_access() gets write permission for a file.
    * put_write_access() releases this write permission.
-  * This is used for regular files.
-  * We cannot support write (and maybe mmap read-write shared) accesses and
-  * MAP_DENYWRITE mmappings simultaneously. The i_writecount field of an inode
-  * can have the following values:
-  * 0: no writers, no VM_DENYWRITE mappings
-  * < 0: (-i_writecount) vm_area_structs with VM_DENYWRITE set exist
-  * > 0: (i_writecount) users are writing to the file.
+  * deny_write_access() denies write access to a file.
+  * allow_write_access() re-enables write access to a file.
+  *
+  * The i_writecount field of an inode can have the following values:
+  * 0: no write access, no denied write access
+  * < 0: (-i_writecount) users that denied write access to the file.
+  * > 0: (i_writecount) users that have write access to the file.
    *
    * Normally we operate on that counter with atomic_{inc,dec} and it's safe
    * except for the cases where we don't hold i_writecount yet. Then we need to
@@@ -3214,6 -3251,10 +3219,6 @@@ ssize_t vfs_iocb_iter_read(struct file 
   ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb,
                             struct iov_iter *iter);
   
- -/* fs/block_dev.c */
- -extern int blkdev_fsync(struct file *filp, loff_t start, loff_t end,
- -                      int datasync);
- -
   /* fs/splice.c */
   extern ssize_t generic_file_splice_read(struct file *, loff_t *,
                 struct pipe_inode_info *, size_t, unsigned int);
@@@ -3319,7 -3360,6 +3324,7 @@@ extern int page_symlink(struct inode *i
   extern const struct inode_operations page_symlink_inode_operations;
   extern void kfree_link(void *);
   void generic_fillattr(struct user_namespace *, struct inode *, struct kstat *);
+ +void generic_fill_statx_attr(struct inode *inode, struct kstat *stat);
   extern int vfs_getattr_nosec(const struct path *, struct kstat *, u32, unsigned int);
   extern int vfs_getattr(const struct path *, struct kstat *, u32, unsigned int);
   void __inode_add_bytes(struct inode *inode, loff_t bytes);
diff --combined include/linux/mm.h

index ed2552c,257995f..50e2c29
--- 1/include/linux/mm.h
--- 2/include/linux/mm.h
+++ b/include/linux/mm.h
@@@ -281,7 -281,6 +281,6 @@@ extern unsigned int kobjsize(const voi
   #define VM_GROWSDOWN  0x00000100      /* general info on the segment */
   #define VM_UFFD_MISSING       0x00000200      /* missing pages tracking */
   #define VM_PFNMAP     0x00000400      /* Page-ranges managed without "struct page", just pure PFN */
- #define VM_DENYWRITE  0x00000800      /* ETXTBSY on write attempts.. */
   #define VM_UFFD_WP    0x00001000      /* wrprotect pages tracking */
   
   #define VM_LOCKED     0x00002000
@@@ -829,8 -828,6 +828,8 @@@ static inline void *kvcalloc(size_t n, 
         return kvmalloc_array(n, size, flags | __GFP_ZERO);
   }
   
+ +extern void *kvrealloc(const void *p, size_t oldsize, size_t newsize,
+ +              gfp_t flags);
   extern void kvfree(const void *addr);
   extern void kvfree_sensitive(const void *addr, size_t len);
   
@@@ -1216,10 -1213,18 +1215,10 @@@ static inline void get_page(struct pag
   }
   
   bool __must_check try_grab_page(struct page *page, unsigned int flags);
- -__maybe_unused struct page *try_grab_compound_head(struct page *page, int refs,
- -                                                 unsigned int flags);
+ +struct page *try_grab_compound_head(struct page *page, int refs,
+ +                                  unsigned int flags);
   
- -
- -static inline __must_check bool try_get_page(struct page *page)
- -{
- -      page = compound_head(page);
- -      if (WARN_ON_ONCE(page_ref_count(page) <= 0))
- -              return false;
- -      page_ref_inc(page);
- -      return true;
- -}
+ +struct page *try_get_compound_head(struct page *page, int refs);
   
   static inline void put_page(struct page *page)
   {
@@@ -1841,6 -1846,7 +1840,6 @@@ int __account_locked_vm(struct mm_struc
   struct kvec;
   int get_kernel_pages(const struct kvec *iov, int nr_pages, int write,
                         struct page **pages);
- -int get_kernel_page(unsigned long start, int write, struct page **pages);
   struct page *get_dump_page(unsigned long addr);
   
   extern int try_to_release_page(struct page * page, gfp_t gfp_mask);
@@@ -2573,7 -2579,8 +2572,8 @@@ static inline int check_data_rlimit(uns
   extern int mm_take_all_locks(struct mm_struct *mm);
   extern void mm_drop_all_locks(struct mm_struct *mm);
   
- extern void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file);
+ extern int set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file);
+ extern int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file);
   extern struct file *get_mm_exe_file(struct mm_struct *mm);
   extern struct file *get_task_exe_file(struct task_struct *task);
   
@@@ -3112,7 -3119,7 +3112,7 @@@ extern void memory_failure_queue_kick(i
   extern int unpoison_memory(unsigned long pfn);
   extern int sysctl_memory_failure_early_kill;
   extern int sysctl_memory_failure_recovery;
- -extern void shake_page(struct page *p, int access);
+ +extern void shake_page(struct page *p);
   extern atomic_long_t num_poisoned_pages __read_mostly;
   extern int soft_offline_page(unsigned long pfn, int flags);
   
diff --combined kernel/events/core.c

index 011cc50,19767bb..744e872
--- 1/kernel/events/core.c
--- 2/kernel/events/core.c
+++ b/kernel/events/core.c
@@@ -4697,6 -4697,7 +4697,6 @@@ errout
   }
   
   static void perf_event_free_filter(struct perf_event *event);
- -static void perf_event_free_bpf_prog(struct perf_event *event);
   
   static void free_event_rcu(struct rcu_head *head)
   {
@@@ -5573,6 -5574,7 +5573,6 @@@ static inline int perf_fget_light(int f
   static int perf_event_set_output(struct perf_event *event,
                                  struct perf_event *output_event);
   static int perf_event_set_filter(struct perf_event *event, void __user *arg);
- -static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd);
   static int perf_copy_attr(struct perf_event_attr __user *uattr,
                           struct perf_event_attr *attr);
   
@@@ -5635,22 -5637,7 +5635,22 @@@ static long _perf_ioctl(struct perf_eve
                 return perf_event_set_filter(event, (void __user *)arg);
   
         case PERF_EVENT_IOC_SET_BPF:
- -              return perf_event_set_bpf_prog(event, arg);
+ +      {
+ +              struct bpf_prog *prog;
+ +              int err;
+ +
+ +              prog = bpf_prog_get(arg);
+ +              if (IS_ERR(prog))
+ +                      return PTR_ERR(prog);
+ +
+ +              err = perf_event_set_bpf_prog(event, prog, 0);
+ +              if (err) {
+ +                      bpf_prog_put(prog);
+ +                      return err;
+ +              }
+ +
+ +              return 0;
+ +      }
   
         case PERF_EVENT_IOC_PAUSE_OUTPUT: {
                 struct perf_buffer *rb;
@@@ -8320,8 -8307,6 +8320,6 @@@ static void perf_event_mmap_event(struc
         else
                 flags = MAP_PRIVATE;
   
-       if (vma->vm_flags & VM_DENYWRITE)
-               flags |= MAP_DENYWRITE;
         if (vma->vm_flags & VM_LOCKED)
                 flags |= MAP_LOCKED;
         if (is_vm_hugetlb_page(vma))
@@@ -9920,16 -9905,13 +9918,16 @@@ static void bpf_overflow_handler(struc
                 .data = data,
                 .event = event,
         };
+ +      struct bpf_prog *prog;
         int ret = 0;
   
         ctx.regs = perf_arch_bpf_user_pt_regs(regs);
         if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1))
                 goto out;
         rcu_read_lock();
- -      ret = BPF_PROG_RUN(event->prog, &ctx);
+ +      prog = READ_ONCE(event->prog);
+ +      if (prog)
+ +              ret = bpf_prog_run(prog, &ctx);
         rcu_read_unlock();
   out:
         __this_cpu_dec(bpf_prog_active);
@@@ -9939,10 -9921,10 +9937,10 @@@
         event->orig_overflow_handler(event, data, regs);
   }
   
- -static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
+ +static int perf_event_set_bpf_handler(struct perf_event *event,
+ +                                    struct bpf_prog *prog,
+ +                                    u64 bpf_cookie)
   {
- -      struct bpf_prog *prog;
- -
         if (event->overflow_handler_context)
                 /* hw breakpoint or kernel counter */
                 return -EINVAL;
@@@ -9950,8 -9932,9 +9948,8 @@@
         if (event->prog)
                 return -EEXIST;
   
- -      prog = bpf_prog_get_type(prog_fd, BPF_PROG_TYPE_PERF_EVENT);
- -      if (IS_ERR(prog))
- -              return PTR_ERR(prog);
+ +      if (prog->type != BPF_PROG_TYPE_PERF_EVENT)
+ +              return -EINVAL;
   
         if (event->attr.precise_ip &&
             prog->call_get_stack &&
@@@ -9967,11 -9950,11 +9965,11 @@@
                  * attached to perf_sample_data, do not allow attaching BPF
                  * program that calls bpf_get_[stack|stackid].
                  */
- -              bpf_prog_put(prog);
                 return -EPROTO;
         }
   
         event->prog = prog;
+ +      event->bpf_cookie = bpf_cookie;
         event->orig_overflow_handler = READ_ONCE(event->overflow_handler);
         WRITE_ONCE(event->overflow_handler, bpf_overflow_handler);
         return 0;
@@@ -9989,9 -9972,7 +9987,9 @@@ static void perf_event_free_bpf_handler
         bpf_prog_put(prog);
   }
   #else
- -static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
+ +static int perf_event_set_bpf_handler(struct perf_event *event,
+ +                                    struct bpf_prog *prog,
+ +                                    u64 bpf_cookie)
   {
         return -EOPNOTSUPP;
   }
@@@ -10019,13 -10000,14 +10017,13 @@@ static inline bool perf_event_is_tracin
         return false;
   }
   
- -static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
+ +int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog,
+ +                          u64 bpf_cookie)
   {
         bool is_kprobe, is_tracepoint, is_syscall_tp;
- -      struct bpf_prog *prog;
- -      int ret;
   
         if (!perf_event_is_tracing(event))
- -              return perf_event_set_bpf_handler(event, prog_fd);
+ +              return perf_event_set_bpf_handler(event, prog, bpf_cookie);
   
         is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE;
         is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT;
@@@ -10034,27 -10016,41 +10032,27 @@@
                 /* bpf programs can only be attached to u/kprobe or tracepoint */
                 return -EINVAL;
   
- -      prog = bpf_prog_get(prog_fd);
- -      if (IS_ERR(prog))
- -              return PTR_ERR(prog);
- -
         if ((is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE) ||
             (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT) ||
- -          (is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT)) {
- -              /* valid fd, but invalid bpf program type */
- -              bpf_prog_put(prog);
+ +          (is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT))
                 return -EINVAL;
- -      }
   
         /* Kprobe override only works for kprobes, not uprobes. */
         if (prog->kprobe_override &&
- -          !(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) {
- -              bpf_prog_put(prog);
+ +          !(event->tp_event->flags & TRACE_EVENT_FL_KPROBE))
                 return -EINVAL;
- -      }
   
         if (is_tracepoint || is_syscall_tp) {
                 int off = trace_event_get_offsets(event->tp_event);
   
- -              if (prog->aux->max_ctx_offset > off) {
- -                      bpf_prog_put(prog);
+ +              if (prog->aux->max_ctx_offset > off)
                         return -EACCES;
- -              }
         }
   
- -      ret = perf_event_attach_bpf_prog(event, prog);
- -      if (ret)
- -              bpf_prog_put(prog);
- -      return ret;
+ +      return perf_event_attach_bpf_prog(event, prog, bpf_cookie);
   }
   
- -static void perf_event_free_bpf_prog(struct perf_event *event)
+ +void perf_event_free_bpf_prog(struct perf_event *event)
   {
         if (!perf_event_is_tracing(event)) {
                 perf_event_free_bpf_handler(event);
@@@ -10073,13 -10069,12 +10071,13 @@@ static void perf_event_free_filter(stru
   {
   }
   
- -static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
+ +int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog,
+ +                          u64 bpf_cookie)
   {
         return -ENOENT;
   }
   
- -static void perf_event_free_bpf_prog(struct perf_event *event)
+ +void perf_event_free_bpf_prog(struct perf_event *event)
   {
   }
   #endif /* CONFIG_EVENT_TRACING */
diff --combined kernel/fork.c

index 695d134,feef105..6d2e10a
--- 1/kernel/fork.c
--- 2/kernel/fork.c
+++ b/kernel/fork.c
@@@ -446,7 -446,6 +446,7 @@@ void put_task_stack(struct task_struct 
   
   void free_task(struct task_struct *tsk)
   {
+ +      release_user_cpus_ptr(tsk);
         scs_release(tsk);
   
   #ifndef CONFIG_THREAD_INFO_IN_TASK
@@@ -471,6 -470,20 +471,20 @@@
   }
   EXPORT_SYMBOL(free_task);
   
+ static void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm)
+ {
+       struct file *exe_file;
+ 
+       exe_file = get_mm_exe_file(oldmm);
+       RCU_INIT_POINTER(mm->exe_file, exe_file);
+       /*
+        * We depend on the oldmm having properly denied write access to the
+        * exe_file already.
+        */
+       if (exe_file && deny_write_access(exe_file))
+               pr_warn_once("deny_write_access() failed in %s\n", __func__);
+ }
+ 
   #ifdef CONFIG_MMU
   static __latent_entropy int dup_mmap(struct mm_struct *mm,
                                         struct mm_struct *oldmm)
@@@ -494,7 -507,7 +508,7 @@@
         mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING);
   
         /* No ordering required: file already has been exposed. */
-       RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
+       dup_mm_exe_file(mm, oldmm);
   
         mm->total_vm = oldmm->total_vm;
         mm->data_vm = oldmm->data_vm;
@@@ -557,12 -570,9 +571,9 @@@
                 tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT);
                 file = tmp->vm_file;
                 if (file) {
-                       struct inode *inode = file_inode(file);
                         struct address_space *mapping = file->f_mapping;
   
                         get_file(file);
-                       if (tmp->vm_flags & VM_DENYWRITE)
-                               put_write_access(inode);
                         i_mmap_lock_write(mapping);
                         if (tmp->vm_flags & VM_SHARED)
                                 mapping_allow_writable(mapping);
@@@ -640,7 -650,7 +651,7 @@@ static inline void mm_free_pgd(struct m
   static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
   {
         mmap_write_lock(oldmm);
-       RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
+       dup_mm_exe_file(mm, oldmm);
         mmap_write_unlock(oldmm);
         return 0;
   }
@@@ -925,7 -935,6 +936,7 @@@ static struct task_struct *dup_task_str
   #endif
         if (orig->cpus_ptr == &orig->cpus_mask)
                 tsk->cpus_ptr = &tsk->cpus_mask;
+ +      dup_user_cpus_ptr(tsk, orig, node);
   
         /*
          * One for the user space visible state that goes away when reaped.
@@@ -1150,11 -1159,11 +1161,11 @@@ void mmput_async(struct mm_struct *mm
    *
    * Main users are mmput() and sys_execve(). Callers prevent concurrent
    * invocations: in mmput() nobody alive left, in execve task is single
-  * threaded. sys_prctl(PR_SET_MM_MAP/EXE_FILE) also needs to set the
-  * mm->exe_file, but does so without using set_mm_exe_file() in order
-  * to avoid the need for any locks.
+  * threaded.
+  *
+  * Can only fail if new_exe_file != NULL.
    */
- void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
+ int set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
   {
         struct file *old_exe_file;
   
@@@ -1165,11 -1174,73 +1176,73 @@@
          */
         old_exe_file = rcu_dereference_raw(mm->exe_file);
   
-       if (new_exe_file)
+       if (new_exe_file) {
+               /*
+                * We expect the caller (i.e., sys_execve) to have already
+                * denied write access, so this is unlikely to fail.
+                */
+               if (unlikely(deny_write_access(new_exe_file)))
+                       return -EACCES;
                 get_file(new_exe_file);
+       }
         rcu_assign_pointer(mm->exe_file, new_exe_file);
-       if (old_exe_file)
+       if (old_exe_file) {
+               allow_write_access(old_exe_file);
                 fput(old_exe_file);
+       }
+       return 0;
+ }
+ 
+ /**
+  * replace_mm_exe_file - replace a reference to the mm's executable file
+  *
+  * This changes mm's executable file (shown as symlink /proc/[pid]/exe),
+  * dealing with concurrent invocation and without grabbing the mmap lock in
+  * write mode.
+  *
+  * Main user is sys_prctl(PR_SET_MM_MAP/EXE_FILE).
+  */
+ int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
+ {
+       struct vm_area_struct *vma;
+       struct file *old_exe_file;
+       int ret = 0;
+ 
+       /* Forbid mm->exe_file change if old file still mapped. */
+       old_exe_file = get_mm_exe_file(mm);
+       if (old_exe_file) {
+               mmap_read_lock(mm);
+               for (vma = mm->mmap; vma && !ret; vma = vma->vm_next) {
+                       if (!vma->vm_file)
+                               continue;
+                       if (path_equal(&vma->vm_file->f_path,
+                                      &old_exe_file->f_path))
+                               ret = -EBUSY;
+               }
+               mmap_read_unlock(mm);
+               fput(old_exe_file);
+               if (ret)
+                       return ret;
+       }
+ 
+       /* set the new file, lockless */
+       ret = deny_write_access(new_exe_file);
+       if (ret)
+               return -EACCES;
+       get_file(new_exe_file);
+ 
+       old_exe_file = xchg(&mm->exe_file, new_exe_file);
+       if (old_exe_file) {
+               /*
+                * Don't race with dup_mmap() getting the file and disallowing
+                * write access while someone might open the file writable.
+                */
+               mmap_read_lock(mm);
+               allow_write_access(old_exe_file);
+               fput(old_exe_file);
+               mmap_read_unlock(mm);
+       }
+       return 0;
   }
   
   /**
@@@ -2085,7 -2156,6 +2158,7 @@@ static __latent_entropy struct task_str
   #endif
   #ifdef CONFIG_BPF_SYSCALL
         RCU_INIT_POINTER(p->bpf_storage, NULL);
+ +      p->bpf_ctx = NULL;
   #endif
   
         /* Perform scheduler related setup. Assign this task to a CPU. */
diff --combined kernel/sys.c

index 72c7639,30c12e5..b6aa704
--- 1/kernel/sys.c
--- 2/kernel/sys.c
+++ b/kernel/sys.c
@@@ -480,8 -480,7 +480,8 @@@ static int set_user(struct cred *new
          * failure to the execve() stage.
          */
         if (is_ucounts_overlimit(new->ucounts, UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC)) &&
- -                      new_user != INIT_USER)
+ +                      new_user != INIT_USER &&
+ +                      !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
                 current->flags |= PF_NPROC_EXCEEDED;
         else
                 current->flags &= ~PF_NPROC_EXCEEDED;
@@@ -1847,7 -1846,6 +1847,6 @@@ SYSCALL_DEFINE1(umask, int, mask
   static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
   {
         struct fd exe;
-       struct file *old_exe, *exe_file;
         struct inode *inode;
         int err;
   
@@@ -1870,40 -1868,10 +1869,10 @@@
         if (err)
                 goto exit;
   
-       /*
-        * Forbid mm->exe_file change if old file still mapped.
-        */
-       exe_file = get_mm_exe_file(mm);
-       err = -EBUSY;
-       if (exe_file) {
-               struct vm_area_struct *vma;
- 
-               mmap_read_lock(mm);
-               for (vma = mm->mmap; vma; vma = vma->vm_next) {
-                       if (!vma->vm_file)
-                               continue;
-                       if (path_equal(&vma->vm_file->f_path,
-                                      &exe_file->f_path))
-                               goto exit_err;
-               }
- 
-               mmap_read_unlock(mm);
-               fput(exe_file);
-       }
- 
-       err = 0;
-       /* set the new file, lockless */
-       get_file(exe.file);
-       old_exe = xchg(&mm->exe_file, exe.file);
-       if (old_exe)
-               fput(old_exe);
+       err = replace_mm_exe_file(mm, exe.file);
   exit:
         fdput(exe);
         return err;
- exit_err:
-       mmap_read_unlock(mm);
-       fput(exe_file);
-       goto exit;
   }
   
   /*
diff --combined mm/mmap.c

index dce4610,bf11fc6..88dcc5c
--- 1/mm/mmap.c
--- 2/mm/mmap.c
+++ b/mm/mmap.c
@@@ -148,8 -148,6 +148,6 @@@ void vma_set_page_prot(struct vm_area_s
   static void __remove_shared_vm_struct(struct vm_area_struct *vma,
                 struct file *file, struct address_space *mapping)
   {
-       if (vma->vm_flags & VM_DENYWRITE)
-               allow_write_access(file);
         if (vma->vm_flags & VM_SHARED)
                 mapping_unmap_writable(mapping);
   
@@@ -534,7 -532,6 +532,7 @@@ static int find_vma_links(struct mm_str
   {
         struct rb_node **__rb_link, *__rb_parent, *rb_prev;
   
+ +      mmap_assert_locked(mm);
         __rb_link = &mm->mm_rb.rb_node;
         rb_prev = __rb_parent = NULL;
   
@@@ -667,8 -664,6 +665,6 @@@ static void __vma_link_file(struct vm_a
         if (file) {
                 struct address_space *mapping = file->f_mapping;
   
-               if (vma->vm_flags & VM_DENYWRITE)
-                       put_write_access(file_inode(file));
                 if (vma->vm_flags & VM_SHARED)
                         mapping_allow_writable(mapping);
   
@@@ -1518,6 -1513,12 +1514,6 @@@ unsigned long do_mmap(struct file *file
                         if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE))
                                 return -EACCES;
   
- -                      /*
- -                       * Make sure there are no mandatory locks on the file.
- -                       */
- -                      if (locks_verify_locked(file))
- -                              return -EAGAIN;
- -
                         vm_flags |= VM_SHARED | VM_MAYSHARE;
                         if (!(file->f_mode & FMODE_WRITE))
                                 vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
@@@ -1625,8 -1626,6 +1621,6 @@@ unsigned long ksys_mmap_pgoff(unsigned 
                         return PTR_ERR(file);
         }
   
-       flags &= ~MAP_DENYWRITE;
- 
         retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
   out_fput:
         if (file)
@@@ -1783,22 -1782,12 +1777,12 @@@ unsigned long mmap_region(struct file *
         vma->vm_pgoff = pgoff;
   
         if (file) {
-               if (vm_flags & VM_DENYWRITE) {
-                       error = deny_write_access(file);
-                       if (error)
-                               goto free_vma;
-               }
                 if (vm_flags & VM_SHARED) {
                         error = mapping_map_writable(file->f_mapping);
                         if (error)
-                               goto allow_write_and_free_vma;
+                               goto free_vma;
                 }
   
-               /* ->mmap() can change vma->vm_file, but must guarantee that
-                * vma_link() below can deny write-access if VM_DENYWRITE is set
-                * and map writably if VM_SHARED is set. This usually means the
-                * new file must not have been exposed to user-space, yet.
-                */
                 vma->vm_file = get_file(file);
                 error = call_mmap(file, vma);
                 if (error)
@@@ -1855,13 -1844,9 +1839,9 @@@
   
         vma_link(mm, vma, prev, rb_link, rb_parent);
         /* Once vma denies write, undo our temporary denial count */
-       if (file) {
   unmap_writable:
-               if (vm_flags & VM_SHARED)
-                       mapping_unmap_writable(file->f_mapping);
-               if (vm_flags & VM_DENYWRITE)
-                       allow_write_access(file);
-       }
+       if (file && vm_flags & VM_SHARED)
+               mapping_unmap_writable(file->f_mapping);
         file = vma->vm_file;
   out:
         perf_event_mmap(vma);
@@@ -1901,9 -1886,6 +1881,6 @@@ unmap_and_free_vma
         charged = 0;
         if (vm_flags & VM_SHARED)
                 mapping_unmap_writable(file->f_mapping);
- allow_write_and_free_vma:
-       if (vm_flags & VM_DENYWRITE)
-               allow_write_access(file);
   free_vma:
         vm_area_free(vma);
   unacct_error:
@@@ -2298,7 -2280,6 +2275,7 @@@ struct vm_area_struct *find_vma(struct 
         struct rb_node *rb_node;
         struct vm_area_struct *vma;
   
+ +      mmap_assert_locked(mm);
         /* Check the cache first. */
         vma = vmacache_find(mm, addr);
         if (likely(vma))
@@@ -2988,11 -2969,14 +2965,11 @@@ SYSCALL_DEFINE5(remap_file_pages, unsig
         if (mmap_write_lock_killable(mm))
                 return -EINTR;
   
- -      vma = find_vma(mm, start);
+ +      vma = vma_lookup(mm, start);
   
         if (!vma || !(vma->vm_flags & VM_SHARED))
                 goto out;
   
- -      if (start < vma->vm_start)
- -              goto out;
- -
         if (start + size > vma->vm_end) {
                 struct vm_area_struct *next;
   
diff --combined mm/nommu.c

index 9d0ad98,0987d13..02d2427
--- 1/mm/nommu.c
--- 2/mm/nommu.c
+++ b/mm/nommu.c
@@@ -826,6 -826,9 +826,6 @@@ static int validate_mmap_request(struc
                             (file->f_mode & FMODE_WRITE))
                                 return -EACCES;
   
- -                      if (locks_verify_locked(file))
- -                              return -EAGAIN;
- -
                         if (!(capabilities & NOMMU_MAP_DIRECT))
                                 return -ENODEV;
   
@@@ -1293,8 -1296,6 +1293,6 @@@ unsigned long ksys_mmap_pgoff(unsigned 
                         goto out;
         }
   
-       flags &= ~MAP_DENYWRITE;
- 
         retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
   
         if (file)
author	Linus Torvalds <torvalds@linux-foundation.org>
	Sat, 4 Sep 2021 18:35:47 +0000 (11:35 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Sat, 4 Sep 2021 18:35:47 +0000 (11:35 -0700)
		1	2
fs/exec.c	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/fs.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/mm.h	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/events/core.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/fork.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sys.c	patch \|	diff1 \|	diff2 \|	blob \| history
mm/mmap.c	patch \|	diff1 \|	diff2 \|	blob \| history
mm/nommu.c	patch \|	diff1 \|	diff2 \|	blob \| history