Merge branch 'akpm' (patches from Andrew)
author     Linus Torvalds <torvalds@linux-foundation.org>
           Sat, 19 Dec 2020 19:39:50 +0000 (11:39 -0800)
committer  Linus Torvalds <torvalds@linux-foundation.org>
           Sat, 19 Dec 2020 19:39:50 +0000 (11:39 -0800)
Merge still more updates from Andrew Morton:
 "18 patches.

  Subsystems affected by this patch series: mm (memcg and cleanups) and
  epoll"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>:
  mm/Kconfig: fix spelling mistake "whats" -> "what's"
  selftests/filesystems: expand epoll with epoll_pwait2
  epoll: wire up syscall epoll_pwait2
  epoll: add syscall epoll_pwait2
  epoll: convert internal api to timespec64
  epoll: eliminate unnecessary lock for zero timeout
  epoll: replace gotos with a proper loop
  epoll: pull all code between fetch_events and send_event into the loop
  epoll: simplify and optimize busy loop logic
  epoll: move eavail next to the list_empty_careful check
  epoll: pull fatal signal checks into ep_send_events()
  epoll: simplify signal handling
  epoll: check for events when removing a timed out thread from the wait queue
  mm/memcontrol: rewrite mem_cgroup_page_lruvec()
  mm, kvm: account kvm_vcpu_mmap to kmemcg
  mm/memcg: remove unused definitions
  mm/memcg: warning on !memcg after readahead page charged
  mm/memcg: bail early from swap accounting if memcg disabled

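The headline change is the new epoll_pwait2() syscall: epoll_pwait() with a
struct __kernel_timespec timeout for nanosecond resolution instead of
milliseconds. A NULL timeout pointer blocks indefinitely and a zeroed
timespec polls without blocking. A minimal userspace sketch, assuming no
libc wrapper exists yet (the epoll_pwait2()/wait_brief() helpers here are
illustrative, mirroring the selftest further down):

    #define _GNU_SOURCE
    #include <linux/time_types.h>
    #include <sys/epoll.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    #ifndef __NR_epoll_pwait2
    #define __NR_epoll_pwait2 441   /* asm-generic/x86 value; e.g. alpha uses 551 */
    #endif

    /* NULL sigmask with sigsetsize 0 leaves the signal mask untouched. */
    static int epoll_pwait2(int epfd, struct epoll_event *events, int maxevents,
                            const struct __kernel_timespec *timeout)
    {
            return (int)syscall(__NR_epoll_pwait2, epfd, events, maxevents,
                                timeout, NULL, (size_t)0);
    }

    /* Wait up to 1.5 ms for a single event. */
    static int wait_brief(int epfd, struct epoll_event *ev)
    {
            struct __kernel_timespec ts = { .tv_sec = 0, .tv_nsec = 1500000 };

            return epoll_pwait2(epfd, ev, 1, &ts);
    }
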
31 files changed:
arch/alpha/kernel/syscalls/syscall.tbl
arch/arm/tools/syscall.tbl
arch/arm64/include/asm/unistd.h
arch/arm64/include/asm/unistd32.h
arch/ia64/kernel/syscalls/syscall.tbl
arch/m68k/kernel/syscalls/syscall.tbl
arch/microblaze/kernel/syscalls/syscall.tbl
arch/mips/kernel/syscalls/syscall_n32.tbl
arch/mips/kernel/syscalls/syscall_n64.tbl
arch/mips/kernel/syscalls/syscall_o32.tbl
arch/parisc/kernel/syscalls/syscall.tbl
arch/powerpc/kernel/syscalls/syscall.tbl
arch/s390/kernel/syscalls/syscall.tbl
arch/sh/kernel/syscalls/syscall.tbl
arch/sparc/kernel/syscalls/syscall.tbl
arch/x86/entry/syscalls/syscall_32.tbl
arch/x86/entry/syscalls/syscall_64.tbl
arch/x86/kvm/x86.c
arch/xtensa/kernel/syscalls/syscall.tbl
fs/eventpoll.c
include/linux/compat.h
include/linux/memcontrol.h
include/linux/mmdebug.h
include/linux/syscalls.h
include/uapi/asm-generic/unistd.h
kernel/sys_ni.c
mm/Kconfig
mm/memcontrol.c
tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c
virt/kvm/coalesced_mmio.c
virt/kvm/kvm_main.c

diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl
index ee7b01b..a661706 100644
 548    common  pidfd_getfd                     sys_pidfd_getfd
 549    common  faccessat2                      sys_faccessat2
 550    common  process_madvise                 sys_process_madvise
+551    common  epoll_pwait2                    sys_epoll_pwait2
diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl
index d056a54..20e1170 100644
 438    common  pidfd_getfd                     sys_pidfd_getfd
 439    common  faccessat2                      sys_faccessat2
 440    common  process_madvise                 sys_process_madvise
+441    common  epoll_pwait2                    sys_epoll_pwait2
diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h
index b3b2019..86a9d7b 100644
@@ -38,7 +38,7 @@
 #define __ARM_NR_compat_set_tls                (__ARM_NR_COMPAT_BASE + 5)
 #define __ARM_NR_COMPAT_END            (__ARM_NR_COMPAT_BASE + 0x800)
 
-#define __NR_compat_syscalls           441
+#define __NR_compat_syscalls           442
 #endif
 
 #define __ARCH_WANT_SYS_CLONE
diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h
index 107f08e..f4bca2b 100644
@@ -889,6 +889,8 @@ __SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd)
 __SYSCALL(__NR_faccessat2, sys_faccessat2)
 #define __NR_process_madvise 440
 __SYSCALL(__NR_process_madvise, sys_process_madvise)
+#define __NR_epoll_pwait2 441
+__SYSCALL(__NR_epoll_pwait2, sys_epoll_pwait2)
 
 /*
  * Please add new compat syscalls above this comment and update
diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl
index b96ed8b..bfc00f2 100644
 438    common  pidfd_getfd                     sys_pidfd_getfd
 439    common  faccessat2                      sys_faccessat2
 440    common  process_madvise                 sys_process_madvise
+441    common  epoll_pwait2                    sys_epoll_pwait2
diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl
index 625fb6d..7fe4e45 100644
 438    common  pidfd_getfd                     sys_pidfd_getfd
 439    common  faccessat2                      sys_faccessat2
 440    common  process_madvise                 sys_process_madvise
+441    common  epoll_pwait2                    sys_epoll_pwait2
diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl
index aae729c..a522adf 100644
 438    common  pidfd_getfd                     sys_pidfd_getfd
 439    common  faccessat2                      sys_faccessat2
 440    common  process_madvise                 sys_process_madvise
+441    common  epoll_pwait2                    sys_epoll_pwait2
diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl
index 32817c9..ad9c3dd 100644
 438    n32     pidfd_getfd                     sys_pidfd_getfd
 439    n32     faccessat2                      sys_faccessat2
 440    n32     process_madvise                 sys_process_madvise
+441    n32     epoll_pwait2                    sys_epoll_pwait2
diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl
index 9e4ea3c..9164969 100644
 438    n64     pidfd_getfd                     sys_pidfd_getfd
 439    n64     faccessat2                      sys_faccessat2
 440    n64     process_madvise                 sys_process_madvise
+441    n64     epoll_pwait2                    sys_epoll_pwait2
diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl
index 29f5f28..4bad0c4 100644
 438    o32     pidfd_getfd                     sys_pidfd_getfd
 439    o32     faccessat2                      sys_faccessat2
 440    o32     process_madvise                 sys_process_madvise
+441    o32     epoll_pwait2                    sys_epoll_pwait2                compat_sys_epoll_pwait2
diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl
index f375ea5..6bcc319 100644
 438    common  pidfd_getfd                     sys_pidfd_getfd
 439    common  faccessat2                      sys_faccessat2
 440    common  process_madvise                 sys_process_madvise
+441    common  epoll_pwait2                    sys_epoll_pwait2                compat_sys_epoll_pwait2
diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl
index 1275dae..f744eb5 100644
 438    common  pidfd_getfd                     sys_pidfd_getfd
 439    common  faccessat2                      sys_faccessat2
 440    common  process_madvise                 sys_process_madvise
+441    common  epoll_pwait2                    sys_epoll_pwait2                compat_sys_epoll_pwait2
diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl
index 28c1680..14f6525 100644
 438  common    pidfd_getfd             sys_pidfd_getfd                 sys_pidfd_getfd
 439  common    faccessat2              sys_faccessat2                  sys_faccessat2
 440  common    process_madvise         sys_process_madvise             sys_process_madvise
+441  common    epoll_pwait2            sys_epoll_pwait2                sys_epoll_pwait2
diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl
index 7837384..9df40ac 100644
 438    common  pidfd_getfd                     sys_pidfd_getfd
 439    common  faccessat2                      sys_faccessat2
 440    common  process_madvise                 sys_process_madvise
+441    common  epoll_pwait2                    sys_epoll_pwait2
diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl
index 7816026..c7da4c3 100644
 438    common  pidfd_getfd                     sys_pidfd_getfd
 439    common  faccessat2                      sys_faccessat2
 440    common  process_madvise                 sys_process_madvise
+441    common  epoll_pwait2                    sys_epoll_pwait2
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 0d0667a..874aeac 100644
 438    i386    pidfd_getfd             sys_pidfd_getfd
 439    i386    faccessat2              sys_faccessat2
 440    i386    process_madvise         sys_process_madvise
+441    i386    epoll_pwait2            sys_epoll_pwait2                compat_sys_epoll_pwait2
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 3798192..7867212 100644
 438    common  pidfd_getfd             sys_pidfd_getfd
 439    common  faccessat2              sys_faccessat2
 440    common  process_madvise         sys_process_madvise
+441    common  epoll_pwait2            sys_epoll_pwait2
 
 #
 # Due to a historical design error, certain syscalls are numbered differently
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e545a8a..b05aec1 100644
@@ -9869,7 +9869,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
 
        r = -ENOMEM;
 
-       page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+       page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
        if (!page)
                goto fail_free_lapic;
        vcpu->arch.pio_data = page_address(page);
diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl
index b070f27..46116a2 100644
 438    common  pidfd_getfd                     sys_pidfd_getfd
 439    common  faccessat2                      sys_faccessat2
 440    common  process_madvise                 sys_process_madvise
+441    common  epoll_pwait2                    sys_epoll_pwait2
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 10b81e6..a829af0 100644
@@ -389,19 +389,24 @@ static bool ep_busy_loop_end(void *p, unsigned long start_time)
  *
  * we must do our busy polling with irqs enabled
  */
-static void ep_busy_loop(struct eventpoll *ep, int nonblock)
+static bool ep_busy_loop(struct eventpoll *ep, int nonblock)
 {
        unsigned int napi_id = READ_ONCE(ep->napi_id);
 
-       if ((napi_id >= MIN_NAPI_ID) && net_busy_loop_on())
+       if ((napi_id >= MIN_NAPI_ID) && net_busy_loop_on()) {
                napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep, false,
                               BUSY_POLL_BUDGET);
-}
-
-static inline void ep_reset_busy_poll_napi_id(struct eventpoll *ep)
-{
-       if (ep->napi_id)
+               if (ep_events_available(ep))
+                       return true;
+               /*
+                * Busy poll timed out.  Drop NAPI ID for now, we can add
+                * it back in when we have moved a socket with a valid NAPI
+                * ID onto the ready list.
+                */
                ep->napi_id = 0;
+               return false;
+       }
+       return false;
 }
 
 /*
@@ -441,12 +446,9 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
 
 #else
 
-static inline void ep_busy_loop(struct eventpoll *ep, int nonblock)
-{
-}
-
-static inline void ep_reset_busy_poll_napi_id(struct eventpoll *ep)
+static inline bool ep_busy_loop(struct eventpoll *ep, int nonblock)
 {
+       return false;
 }
 
 static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
@@ -1625,6 +1627,14 @@ static int ep_send_events(struct eventpoll *ep,
        poll_table pt;
        int res = 0;
 
+       /*
+        * Always short-circuit for fatal signals to allow threads to make a
+        * timely exit without the chance of finding more events available and
+        * fetching repeatedly.
+        */
+       if (fatal_signal_pending(current))
+               return -EINTR;
+
        init_poll_funcptr(&pt, NULL);
 
        mutex_lock(&ep->mtx);
@@ -1702,15 +1712,25 @@ static int ep_send_events(struct eventpoll *ep,
        return res;
 }
 
-static inline struct timespec64 ep_set_mstimeout(long ms)
+static struct timespec64 *ep_timeout_to_timespec(struct timespec64 *to, long ms)
 {
-       struct timespec64 now, ts = {
-               .tv_sec = ms / MSEC_PER_SEC,
-               .tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC),
-       };
+       struct timespec64 now;
+
+       if (ms < 0)
+               return NULL;
+
+       if (!ms) {
+               to->tv_sec = 0;
+               to->tv_nsec = 0;
+               return to;
+       }
+
+       to->tv_sec = ms / MSEC_PER_SEC;
+       to->tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC);
 
        ktime_get_ts64(&now);
-       return timespec64_add_safe(now, ts);
+       *to = timespec64_add_safe(now, *to);
+       return to;
 }
 
 /**
@@ -1722,8 +1742,8 @@ static inline struct timespec64 ep_set_mstimeout(long ms)
  *          stored.
  * @maxevents: Size (in terms of number of events) of the caller event buffer.
  * @timeout: Maximum timeout for the ready events fetch operation, in
- *           milliseconds. If the @timeout is zero, the function will not block,
- *           while if the @timeout is less than zero, the function will block
+ *           timespec. If the timeout is zero, the function will not block,
+ *           while if the @timeout ptr is NULL, the function will block
  *           until at least one event has been retrieved (or an error
  *           occurred).
  *
@@ -1731,55 +1751,59 @@ static inline struct timespec64 ep_set_mstimeout(long ms)
  *          error code, in case of error.
  */
 static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
-                  int maxevents, long timeout)
+                  int maxevents, struct timespec64 *timeout)
 {
-       int res = 0, eavail, timed_out = 0;
+       int res, eavail, timed_out = 0;
        u64 slack = 0;
        wait_queue_entry_t wait;
        ktime_t expires, *to = NULL;
 
        lockdep_assert_irqs_enabled();
 
-       if (timeout > 0) {
-               struct timespec64 end_time = ep_set_mstimeout(timeout);
-
-               slack = select_estimate_accuracy(&end_time);
+       if (timeout && (timeout->tv_sec | timeout->tv_nsec)) {
+               slack = select_estimate_accuracy(timeout);
                to = &expires;
-               *to = timespec64_to_ktime(end_time);
-       } else if (timeout == 0) {
+               *to = timespec64_to_ktime(*timeout);
+       } else if (timeout) {
                /*
                 * Avoid the unnecessary trip to the wait queue loop, if the
-                * caller specified a non blocking operation. We still need
-                * lock because we could race and not see an epi being added
-                * to the ready list while in irq callback. Thus incorrectly
-                * returning 0 back to userspace.
+                * caller specified a non blocking operation.
                 */
                timed_out = 1;
-
-               write_lock_irq(&ep->lock);
-               eavail = ep_events_available(ep);
-               write_unlock_irq(&ep->lock);
-
-               goto send_events;
        }
 
-fetch_events:
+       /*
+        * This call is racy: We may or may not see events that are being added
+        * to the ready list under the lock (e.g., in IRQ callbacks). For cases
+        * with a non-zero timeout, this thread will check the ready list under
+        * lock and will be added to the wait queue.  For cases with a zero
+        * timeout, the user by definition should not care and will have to
+        * recheck again.
+        */
+       eavail = ep_events_available(ep);
+
+       while (1) {
+               if (eavail) {
+                       /*
+                        * Try to transfer events to user space. In case we get
+                        * 0 events and there's still timeout left over, we go
+                        * trying again in search of more luck.
+                        */
+                       res = ep_send_events(ep, events, maxevents);
+                       if (res)
+                               return res;
+               }
 
-       if (!ep_events_available(ep))
-               ep_busy_loop(ep, timed_out);
+               if (timed_out)
+                       return 0;
 
-       eavail = ep_events_available(ep);
-       if (eavail)
-               goto send_events;
+               eavail = ep_busy_loop(ep, timed_out);
+               if (eavail)
+                       continue;
 
-       /*
-        * Busy poll timed out.  Drop NAPI ID for now, we can add
-        * it back in when we have moved a socket with a valid NAPI
-        * ID onto the ready list.
-        */
-       ep_reset_busy_poll_napi_id(ep);
+               if (signal_pending(current))
+                       return -EINTR;
 
-       do {
                /*
                 * Internally init_wait() uses autoremove_wake_function(),
                 * thus wait entry is removed from the wait queue on each
@@ -1809,55 +1833,38 @@ fetch_events:
                 * important.
                 */
                eavail = ep_events_available(ep);
-               if (!eavail) {
-                       if (signal_pending(current))
-                               res = -EINTR;
-                       else
-                               __add_wait_queue_exclusive(&ep->wq, &wait);
-               }
-               write_unlock_irq(&ep->lock);
-
-               if (eavail || res)
-                       break;
-
-               if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) {
-                       timed_out = 1;
-                       break;
-               }
-
-               /* We were woken up, thus go and try to harvest some events */
-               eavail = 1;
-
-       } while (0);
+               if (!eavail)
+                       __add_wait_queue_exclusive(&ep->wq, &wait);
 
-       __set_current_state(TASK_RUNNING);
-
-       if (!list_empty_careful(&wait.entry)) {
-               write_lock_irq(&ep->lock);
-               __remove_wait_queue(&ep->wq, &wait);
                write_unlock_irq(&ep->lock);
-       }
 
-send_events:
-       if (fatal_signal_pending(current)) {
+               if (!eavail)
+                       timed_out = !schedule_hrtimeout_range(to, slack,
+                                                             HRTIMER_MODE_ABS);
+               __set_current_state(TASK_RUNNING);
+
                /*
-                * Always short-circuit for fatal signals to allow
-                * threads to make a timely exit without the chance of
-                * finding more events available and fetching
-                * repeatedly.
+                * We were woken up, thus go and try to harvest some events.
+                * If timed out and still on the wait queue, recheck eavail
+                * carefully under lock, below.
                 */
-               res = -EINTR;
-       }
-       /*
-        * Try to transfer events to user space. In case we get 0 events and
-        * there's still timeout left over, we go trying again in search of
-        * more luck.
-        */
-       if (!res && eavail &&
-           !(res = ep_send_events(ep, events, maxevents)) && !timed_out)
-               goto fetch_events;
+               eavail = 1;
 
-       return res;
+               if (!list_empty_careful(&wait.entry)) {
+                       write_lock_irq(&ep->lock);
+                       /*
+                        * If the thread timed out and is not on the wait queue,
+                        * it means that the thread was woken up after its
+                        * timeout expired before it could reacquire the lock.
+                        * Thus, when wait.entry is empty, it needs to harvest
+                        * events.
+                        */
+                       if (timed_out)
+                               eavail = list_empty(&wait.entry);
+                       __remove_wait_queue(&ep->wq, &wait);
+                       write_unlock_irq(&ep->lock);
+               }
+       }
 }
 
 /**
@@ -2176,7 +2183,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
  * part of the user space epoll_wait(2).
  */
 static int do_epoll_wait(int epfd, struct epoll_event __user *events,
-                        int maxevents, int timeout)
+                        int maxevents, struct timespec64 *to)
 {
        int error;
        struct fd f;
@@ -2210,7 +2217,7 @@ static int do_epoll_wait(int epfd, struct epoll_event __user *events,
        ep = f.file->private_data;
 
        /* Time to fish for events ... */
-       error = ep_poll(ep, events, maxevents, timeout);
+       error = ep_poll(ep, events, maxevents, to);
 
 error_fput:
        fdput(f);
@@ -2220,16 +2227,19 @@ error_fput:
 SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
                int, maxevents, int, timeout)
 {
-       return do_epoll_wait(epfd, events, maxevents, timeout);
+       struct timespec64 to;
+
+       return do_epoll_wait(epfd, events, maxevents,
+                            ep_timeout_to_timespec(&to, timeout));
 }
 
 /*
  * Implement the event wait interface for the eventpoll file. It is the kernel
  * part of the user space epoll_pwait(2).
  */
-SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
-               int, maxevents, int, timeout, const sigset_t __user *, sigmask,
-               size_t, sigsetsize)
+static int do_epoll_pwait(int epfd, struct epoll_event __user *events,
+                         int maxevents, struct timespec64 *to,
+                         const sigset_t __user *sigmask, size_t sigsetsize)
 {
        int error;
 
@@ -2241,18 +2251,47 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
        if (error)
                return error;
 
-       error = do_epoll_wait(epfd, events, maxevents, timeout);
+       error = do_epoll_wait(epfd, events, maxevents, to);
+
        restore_saved_sigmask_unless(error == -EINTR);
 
        return error;
 }
 
+SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
+               int, maxevents, int, timeout, const sigset_t __user *, sigmask,
+               size_t, sigsetsize)
+{
+       struct timespec64 to;
+
+       return do_epoll_pwait(epfd, events, maxevents,
+                             ep_timeout_to_timespec(&to, timeout),
+                             sigmask, sigsetsize);
+}
+
+SYSCALL_DEFINE6(epoll_pwait2, int, epfd, struct epoll_event __user *, events,
+               int, maxevents, const struct __kernel_timespec __user *, timeout,
+               const sigset_t __user *, sigmask, size_t, sigsetsize)
+{
+       struct timespec64 ts, *to = NULL;
+
+       if (timeout) {
+               if (get_timespec64(&ts, timeout))
+                       return -EFAULT;
+               to = &ts;
+               if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
+                       return -EINVAL;
+       }
+
+       return do_epoll_pwait(epfd, events, maxevents, to,
+                             sigmask, sigsetsize);
+}
+
 #ifdef CONFIG_COMPAT
-COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
-                       struct epoll_event __user *, events,
-                       int, maxevents, int, timeout,
-                       const compat_sigset_t __user *, sigmask,
-                       compat_size_t, sigsetsize)
+static int do_compat_epoll_pwait(int epfd, struct epoll_event __user *events,
+                                int maxevents, struct timespec64 *timeout,
+                                const compat_sigset_t __user *sigmask,
+                                compat_size_t sigsetsize)
 {
        long err;
 
@@ -2265,10 +2304,46 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
                return err;
 
        err = do_epoll_wait(epfd, events, maxevents, timeout);
+
        restore_saved_sigmask_unless(err == -EINTR);
 
        return err;
 }
+
+COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
+                      struct epoll_event __user *, events,
+                      int, maxevents, int, timeout,
+                      const compat_sigset_t __user *, sigmask,
+                      compat_size_t, sigsetsize)
+{
+       struct timespec64 to;
+
+       return do_compat_epoll_pwait(epfd, events, maxevents,
+                                    ep_timeout_to_timespec(&to, timeout),
+                                    sigmask, sigsetsize);
+}
+
+COMPAT_SYSCALL_DEFINE6(epoll_pwait2, int, epfd,
+                      struct epoll_event __user *, events,
+                      int, maxevents,
+                      const struct __kernel_timespec __user *, timeout,
+                      const compat_sigset_t __user *, sigmask,
+                      compat_size_t, sigsetsize)
+{
+       struct timespec64 ts, *to = NULL;
+
+       if (timeout) {
+               if (get_timespec64(&ts, timeout))
+                       return -EFAULT;
+               to = &ts;
+               if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
+                       return -EINVAL;
+       }
+
+       return do_compat_epoll_pwait(epfd, events, maxevents, to,
+                                    sigmask, sigsetsize);
+}
+
 #endif
 
 static int __init eventpoll_init(void)
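
Two behavioral notes on the eventpoll.c rewrite above: the internal API is
now timespec64-based, with a NULL timeout pointer meaning "block
indefinitely" (replacing the negative-milliseconds convention), and the
user-supplied epoll_pwait2() timeout is relative, converted to an absolute
deadline by poll_select_set_timeout() before ep_poll() arms its hrtimer.
Converting a legacy millisecond timeout in userspace is therefore plain
arithmetic; a hedged sketch (ms_to_timespec() is an illustrative helper,
not kernel or libc API):

    #include <stddef.h>
    #include <linux/time_types.h>

    /* Map an epoll_wait()-style millisecond timeout onto the pointer
     * convention epoll_pwait2() expects: NULL blocks forever, {0, 0}
     * polls, anything else is a relative timeout. */
    static struct __kernel_timespec *
    ms_to_timespec(struct __kernel_timespec *to, long ms)
    {
            if (ms < 0)
                    return NULL;            /* block until event or signal */

            to->tv_sec  = ms / 1000;
            to->tv_nsec = (ms % 1000) * 1000000LL;
            return to;                      /* ms == 0 -> {0, 0}, non-blocking */
    }
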
diff --git a/include/linux/compat.h b/include/linux/compat.h
index 400c094..6e65be7 100644
@@ -537,6 +537,12 @@ asmlinkage long compat_sys_epoll_pwait(int epfd,
                        int maxevents, int timeout,
                        const compat_sigset_t __user *sigmask,
                        compat_size_t sigsetsize);
+asmlinkage long compat_sys_epoll_pwait2(int epfd,
+                       struct epoll_event __user *events,
+                       int maxevents,
+                       const struct __kernel_timespec __user *timeout,
+                       const compat_sigset_t __user *sigmask,
+                       compat_size_t sigsetsize);
 
 /* fs/fcntl.c */
 asmlinkage long compat_sys_fcntl(unsigned int fd, unsigned int cmd,
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 08ed57e..d827bd7 100644
@@ -620,9 +620,10 @@ mem_cgroup_nodeinfo(struct mem_cgroup *memcg, int nid)
 /**
  * mem_cgroup_lruvec - get the lru list vector for a memcg & node
  * @memcg: memcg of the wanted lruvec
+ * @pgdat: pglist_data
  *
  * Returns the lru list vector holding pages for a given @memcg &
- * @node combination. This can be the node lruvec, if the memory
+ * @pgdat combination. This can be the node lruvec, if the memory
  * controller is disabled.
  */
 static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg,
@@ -652,7 +653,21 @@ out:
        return lruvec;
 }
 
-struct lruvec *mem_cgroup_page_lruvec(struct page *, struct pglist_data *);
+/**
+ * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
+ * @page: the page
+ * @pgdat: pgdat of the page
+ *
+ * This function relies on page->mem_cgroup being stable.
+ */
+static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page,
+                                               struct pglist_data *pgdat)
+{
+       struct mem_cgroup *memcg = page_memcg(page);
+
+       VM_WARN_ON_ONCE_PAGE(!memcg, page);
+       return mem_cgroup_lruvec(memcg, pgdat);
+}
 
 static inline bool lruvec_holds_page_lru_lock(struct page *page,
                                              struct lruvec *lruvec)
@@ -913,41 +928,6 @@ static inline void mod_memcg_state(struct mem_cgroup *memcg,
        local_irq_restore(flags);
 }
 
-/**
- * mod_memcg_page_state - update page state statistics
- * @page: the page
- * @idx: page state item to account
- * @val: number of pages (positive or negative)
- *
- * The @page must be locked or the caller must use lock_page_memcg()
- * to prevent double accounting when the page is concurrently being
- * moved to another memcg:
- *
- *   lock_page(page) or lock_page_memcg(page)
- *   if (TestClearPageState(page))
- *     mod_memcg_page_state(page, state, -1);
- *   unlock_page(page) or unlock_page_memcg(page)
- *
- * Kernel pages are an exception to this, since they'll never move.
- */
-static inline void __mod_memcg_page_state(struct page *page,
-                                         int idx, int val)
-{
-       struct mem_cgroup *memcg = page_memcg(page);
-
-       if (memcg)
-               __mod_memcg_state(memcg, idx, val);
-}
-
-static inline void mod_memcg_page_state(struct page *page,
-                                       int idx, int val)
-{
-       struct mem_cgroup *memcg = page_memcg(page);
-
-       if (memcg)
-               mod_memcg_state(memcg, idx, val);
-}
-
 static inline unsigned long lruvec_page_state(struct lruvec *lruvec,
                                              enum node_stat_item idx)
 {
@@ -1395,18 +1375,6 @@ static inline void mod_memcg_state(struct mem_cgroup *memcg,
 {
 }
 
-static inline void __mod_memcg_page_state(struct page *page,
-                                         int idx,
-                                         int nr)
-{
-}
-
-static inline void mod_memcg_page_state(struct page *page,
-                                       int idx,
-                                       int nr)
-{
-}
-
 static inline unsigned long lruvec_page_state(struct lruvec *lruvec,
                                              enum node_stat_item idx)
 {
@@ -1479,34 +1447,6 @@ static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page)
 }
 #endif /* CONFIG_MEMCG */
 
-/* idx can be of type enum memcg_stat_item or node_stat_item */
-static inline void __inc_memcg_state(struct mem_cgroup *memcg,
-                                    int idx)
-{
-       __mod_memcg_state(memcg, idx, 1);
-}
-
-/* idx can be of type enum memcg_stat_item or node_stat_item */
-static inline void __dec_memcg_state(struct mem_cgroup *memcg,
-                                    int idx)
-{
-       __mod_memcg_state(memcg, idx, -1);
-}
-
-/* idx can be of type enum memcg_stat_item or node_stat_item */
-static inline void __inc_memcg_page_state(struct page *page,
-                                         int idx)
-{
-       __mod_memcg_page_state(page, idx, 1);
-}
-
-/* idx can be of type enum memcg_stat_item or node_stat_item */
-static inline void __dec_memcg_page_state(struct page *page,
-                                         int idx)
-{
-       __mod_memcg_page_state(page, idx, -1);
-}
-
 static inline void __inc_lruvec_kmem_state(void *p, enum node_stat_item idx)
 {
        __mod_lruvec_kmem_state(p, idx, 1);
@@ -1517,34 +1457,6 @@ static inline void __dec_lruvec_kmem_state(void *p, enum node_stat_item idx)
        __mod_lruvec_kmem_state(p, idx, -1);
 }
 
-/* idx can be of type enum memcg_stat_item or node_stat_item */
-static inline void inc_memcg_state(struct mem_cgroup *memcg,
-                                  int idx)
-{
-       mod_memcg_state(memcg, idx, 1);
-}
-
-/* idx can be of type enum memcg_stat_item or node_stat_item */
-static inline void dec_memcg_state(struct mem_cgroup *memcg,
-                                  int idx)
-{
-       mod_memcg_state(memcg, idx, -1);
-}
-
-/* idx can be of type enum memcg_stat_item or node_stat_item */
-static inline void inc_memcg_page_state(struct page *page,
-                                       int idx)
-{
-       mod_memcg_page_state(page, idx, 1);
-}
-
-/* idx can be of type enum memcg_stat_item or node_stat_item */
-static inline void dec_memcg_page_state(struct page *page,
-                                       int idx)
-{
-       mod_memcg_page_state(page, idx, -1);
-}
-
 static inline struct lruvec *parent_lruvec(struct lruvec *lruvec)
 {
        struct mem_cgroup *memcg;
@@ -1733,21 +1645,6 @@ static inline void memcg_kmem_uncharge_page(struct page *page, int order)
                __memcg_kmem_uncharge_page(page, order);
 }
 
-static inline int memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp,
-                                   unsigned int nr_pages)
-{
-       if (memcg_kmem_enabled())
-               return __memcg_kmem_charge(memcg, gfp, nr_pages);
-       return 0;
-}
-
-static inline void memcg_kmem_uncharge(struct mem_cgroup *memcg,
-                                      unsigned int nr_pages)
-{
-       if (memcg_kmem_enabled())
-               __memcg_kmem_uncharge(memcg, nr_pages);
-}
-
 /*
  * A helper for accessing memcg's kmem_id, used for getting
  * corresponding LRU lists.
diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h
index 2ad72d2..5d0767c 100644
@@ -37,6 +37,18 @@ void dump_mm(const struct mm_struct *mm);
                        BUG();                                          \
                }                                                       \
        } while (0)
+#define VM_WARN_ON_ONCE_PAGE(cond, page)       ({                      \
+       static bool __section(".data.once") __warned;                   \
+       int __ret_warn_once = !!(cond);                                 \
+                                                                       \
+       if (unlikely(__ret_warn_once && !__warned)) {                   \
+               dump_page(page, "VM_WARN_ON_ONCE_PAGE(" __stringify(cond)")");\
+               __warned = true;                                        \
+               WARN_ON(1);                                             \
+       }                                                               \
+       unlikely(__ret_warn_once);                                      \
+})
+
 #define VM_WARN_ON(cond) (void)WARN_ON(cond)
 #define VM_WARN_ON_ONCE(cond) (void)WARN_ON_ONCE(cond)
 #define VM_WARN_ONCE(cond, format...) (void)WARN_ONCE(cond, format)
@@ -48,6 +60,7 @@ void dump_mm(const struct mm_struct *mm);
 #define VM_BUG_ON_MM(cond, mm) VM_BUG_ON(cond)
 #define VM_WARN_ON(cond) BUILD_BUG_ON_INVALID(cond)
 #define VM_WARN_ON_ONCE(cond) BUILD_BUG_ON_INVALID(cond)
+#define VM_WARN_ON_ONCE_PAGE(cond, page)  BUILD_BUG_ON_INVALID(cond)
 #define VM_WARN_ONCE(cond, format...) BUILD_BUG_ON_INVALID(cond)
 #define VM_WARN(cond, format...) BUILD_BUG_ON_INVALID(cond)
 #endif
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index df0c3c7..f3929af 100644
@@ -362,6 +362,11 @@ asmlinkage long sys_epoll_pwait(int epfd, struct epoll_event __user *events,
                                int maxevents, int timeout,
                                const sigset_t __user *sigmask,
                                size_t sigsetsize);
+asmlinkage long sys_epoll_pwait2(int epfd, struct epoll_event __user *events,
+                                int maxevents,
+                                const struct __kernel_timespec __user *timeout,
+                                const sigset_t __user *sigmask,
+                                size_t sigsetsize);
 
 /* fs/fcntl.c */
 asmlinkage long sys_dup(unsigned int fildes);
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index fc48c64..7287529 100644
@@ -859,9 +859,11 @@ __SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd)
 __SYSCALL(__NR_faccessat2, sys_faccessat2)
 #define __NR_process_madvise 440
 __SYSCALL(__NR_process_madvise, sys_process_madvise)
+#define __NR_epoll_pwait2 441
+__SC_COMP(__NR_epoll_pwait2, sys_epoll_pwait2, compat_sys_epoll_pwait2)
 
 #undef __NR_syscalls
-#define __NR_syscalls 441
+#define __NR_syscalls 442
 
 /*
  * 32 bit systems traditionally used different
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index f27ac94..19aa806 100644
@@ -68,6 +68,8 @@ COND_SYSCALL(epoll_create1);
 COND_SYSCALL(epoll_ctl);
 COND_SYSCALL(epoll_pwait);
 COND_SYSCALL_COMPAT(epoll_pwait);
+COND_SYSCALL(epoll_pwait2);
+COND_SYSCALL_COMPAT(epoll_pwait2);
 
 /* fs/fcntl.c */
 
diff --git a/mm/Kconfig b/mm/Kconfig
index 4275c25..f730605 100644
@@ -713,7 +713,7 @@ config ZSMALLOC_STAT
        select DEBUG_FS
        help
          This option enables code in the zsmalloc to collect various
-         statistics about whats happening in zsmalloc and exports that
+         statistics about what's happening in zsmalloc and exports that
          information to userspace via debugfs.
          If unsure, say N.
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e3c7ca7..605f671 100644
@@ -1342,46 +1342,6 @@ void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page)
 }
 #endif
 
-/**
- * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
- * @page: the page
- * @pgdat: pgdat of the page
- *
- * This function relies on page's memcg being stable - see the
- * access rules in commit_charge().
- */
-struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgdat)
-{
-       struct mem_cgroup_per_node *mz;
-       struct mem_cgroup *memcg;
-       struct lruvec *lruvec;
-
-       if (mem_cgroup_disabled()) {
-               lruvec = &pgdat->__lruvec;
-               goto out;
-       }
-
-       memcg = page_memcg(page);
-       /*
-        * Swapcache readahead pages are added to the LRU - and
-        * possibly migrated - before they are charged.
-        */
-       if (!memcg)
-               memcg = root_mem_cgroup;
-
-       mz = mem_cgroup_page_nodeinfo(memcg, page);
-       lruvec = &mz->lruvec;
-out:
-       /*
-        * Since a node can be onlined after the mem_cgroup was created,
-        * we have to be prepared to initialize lruvec->zone here;
-        * and if offlined then reonlined, we need to reinitialize it.
-        */
-       if (unlikely(lruvec->pgdat != pgdat))
-               lruvec->pgdat = pgdat;
-       return lruvec;
-}
-
 /**
  * lock_page_lruvec - lock and return lruvec for a given page.
  * @page: the page
@@ -6987,6 +6947,7 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
                return;
 
        memcg = page_memcg(oldpage);
+       VM_WARN_ON_ONCE_PAGE(!memcg, oldpage);
        if (!memcg)
                return;
 
@@ -7178,12 +7139,15 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
        VM_BUG_ON_PAGE(PageLRU(page), page);
        VM_BUG_ON_PAGE(page_count(page), page);
 
+       if (mem_cgroup_disabled())
+               return;
+
        if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
                return;
 
        memcg = page_memcg(page);
 
-       /* Readahead page, never charged */
+       VM_WARN_ON_ONCE_PAGE(!memcg, page);
        if (!memcg)
                return;
 
@@ -7242,12 +7206,15 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
        struct mem_cgroup *memcg;
        unsigned short oldid;
 
+       if (mem_cgroup_disabled())
+               return 0;
+
        if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
                return 0;
 
        memcg = page_memcg(page);
 
-       /* Readahead page, never charged */
+       VM_WARN_ON_ONCE_PAGE(!memcg, page);
        if (!memcg)
                return 0;
 
diff --git a/tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c b/tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c
index 8f82f99..ad7fabd 100644
@@ -1,6 +1,8 @@
 // SPDX-License-Identifier: GPL-2.0
 
 #define _GNU_SOURCE
+#include <asm/unistd.h>
+#include <linux/time_types.h>
 #include <poll.h>
 #include <unistd.h>
 #include <assert.h>
@@ -21,6 +23,19 @@ struct epoll_mtcontext
        pthread_t waiter;
 };
 
+#ifndef __NR_epoll_pwait2
+#define __NR_epoll_pwait2 -1
+#endif
+
+static inline int sys_epoll_pwait2(int fd, struct epoll_event *events,
+                                  int maxevents,
+                                  const struct __kernel_timespec *timeout,
+                                  const sigset_t *sigset, size_t sigsetsize)
+{
+       return syscall(__NR_epoll_pwait2, fd, events, maxevents, timeout,
+                      sigset, sigsetsize);
+}
+
 static void signal_handler(int signum)
 {
 }
@@ -3377,4 +3392,61 @@ TEST(epoll61)
        close(ctx.evfd);
 }
 
+/* Equivalent to basic test epoll1, but exercising epoll_pwait2. */
+TEST(epoll62)
+{
+       int efd;
+       int sfd[2];
+       struct epoll_event e;
+
+       ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sfd), 0);
+
+       efd = epoll_create(1);
+       ASSERT_GE(efd, 0);
+
+       e.events = EPOLLIN;
+       ASSERT_EQ(epoll_ctl(efd, EPOLL_CTL_ADD, sfd[0], &e), 0);
+
+       ASSERT_EQ(write(sfd[1], "w", 1), 1);
+
+       EXPECT_EQ(sys_epoll_pwait2(efd, &e, 1, NULL, NULL, 0), 1);
+       EXPECT_EQ(sys_epoll_pwait2(efd, &e, 1, NULL, NULL, 0), 1);
+
+       close(efd);
+       close(sfd[0]);
+       close(sfd[1]);
+}
+
+/* Epoll_pwait2 basic timeout test. */
+TEST(epoll63)
+{
+       const int cfg_delay_ms = 10;
+       unsigned long long tdiff;
+       struct __kernel_timespec ts;
+       int efd;
+       int sfd[2];
+       struct epoll_event e;
+
+       ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sfd), 0);
+
+       efd = epoll_create(1);
+       ASSERT_GE(efd, 0);
+
+       e.events = EPOLLIN;
+       ASSERT_EQ(epoll_ctl(efd, EPOLL_CTL_ADD, sfd[0], &e), 0);
+
+       ts.tv_sec = 0;
+       ts.tv_nsec = cfg_delay_ms * 1000 * 1000;
+
+       tdiff = msecs();
+       EXPECT_EQ(sys_epoll_pwait2(efd, &e, 1, &ts, NULL, 0), 0);
+       tdiff = msecs() - tdiff;
+
+       EXPECT_GE(tdiff, cfg_delay_ms);
+
+       close(efd);
+       close(sfd[0]);
+       close(sfd[1]);
+}
+
 TEST_HARNESS_MAIN
diff --git a/virt/kvm/coalesced_mmio.c b/virt/kvm/coalesced_mmio.c
index e2c197f..62bd908 100644
@@ -111,7 +111,7 @@ int kvm_coalesced_mmio_init(struct kvm *kvm)
 {
        struct page *page;
 
-       page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+       page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
        if (!page)
                return -ENOMEM;
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 2541a17..f69357a 100644
@@ -3116,7 +3116,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
        }
 
        BUILD_BUG_ON(sizeof(struct kvm_run) > PAGE_SIZE);
-       page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+       page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
        if (!page) {
                r = -ENOMEM;
                goto vcpu_free;
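
The three KVM hunks (arch/x86/kvm/x86.c plus the two virt/kvm files) make
the same one-flag substitution: GFP_KERNEL_ACCOUNT is GFP_KERNEL |
__GFP_ACCOUNT, so the per-vCPU kvm_run/pio_data pages and the coalesced
MMIO ring are now charged to the memory cgroup of the task that creates
them. A minimal sketch of the pattern (my_dev_alloc() is a hypothetical
helper, not kernel API):

    #include <linux/gfp.h>
    #include <linux/mm.h>

    /* Allocate one zeroed page charged to the calling task's memcg;
     * illustrative of the call sites changed above. */
    static void *my_dev_alloc(void)
    {
            struct page *page;

            page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
            if (!page)
                    return NULL;

            return page_address(page);      /* free later via free_page() */
    }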