548 common pidfd_getfd sys_pidfd_getfd
549 common faccessat2 sys_faccessat2
550 common process_madvise sys_process_madvise
+551 common epoll_pwait2 sys_epoll_pwait2
438 common pidfd_getfd sys_pidfd_getfd
439 common faccessat2 sys_faccessat2
440 common process_madvise sys_process_madvise
+441 common epoll_pwait2 sys_epoll_pwait2
#define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5)
#define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800)
-#define __NR_compat_syscalls 441
+#define __NR_compat_syscalls 442
#endif
#define __ARCH_WANT_SYS_CLONE
__SYSCALL(__NR_faccessat2, sys_faccessat2)
#define __NR_process_madvise 440
__SYSCALL(__NR_process_madvise, sys_process_madvise)
+#define __NR_epoll_pwait2 441
+__SYSCALL(__NR_epoll_pwait2, sys_epoll_pwait2)
/*
* Please add new compat syscalls above this comment and update
438 common pidfd_getfd sys_pidfd_getfd
439 common faccessat2 sys_faccessat2
440 common process_madvise sys_process_madvise
+441 common epoll_pwait2 sys_epoll_pwait2
438 common pidfd_getfd sys_pidfd_getfd
439 common faccessat2 sys_faccessat2
440 common process_madvise sys_process_madvise
+441 common epoll_pwait2 sys_epoll_pwait2
438 common pidfd_getfd sys_pidfd_getfd
439 common faccessat2 sys_faccessat2
440 common process_madvise sys_process_madvise
+441 common epoll_pwait2 sys_epoll_pwait2
438 n32 pidfd_getfd sys_pidfd_getfd
439 n32 faccessat2 sys_faccessat2
440 n32 process_madvise sys_process_madvise
+441 n32 epoll_pwait2 sys_epoll_pwait2
438 n64 pidfd_getfd sys_pidfd_getfd
439 n64 faccessat2 sys_faccessat2
440 n64 process_madvise sys_process_madvise
+441 n64 epoll_pwait2 sys_epoll_pwait2
438 o32 pidfd_getfd sys_pidfd_getfd
439 o32 faccessat2 sys_faccessat2
440 o32 process_madvise sys_process_madvise
+441 o32 epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2
438 common pidfd_getfd sys_pidfd_getfd
439 common faccessat2 sys_faccessat2
440 common process_madvise sys_process_madvise
+441 common epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2
438 common pidfd_getfd sys_pidfd_getfd
439 common faccessat2 sys_faccessat2
440 common process_madvise sys_process_madvise
+441 common epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2
438 common pidfd_getfd sys_pidfd_getfd sys_pidfd_getfd
439 common faccessat2 sys_faccessat2 sys_faccessat2
440 common process_madvise sys_process_madvise sys_process_madvise
+441 common epoll_pwait2 sys_epoll_pwait2 sys_epoll_pwait2
438 common pidfd_getfd sys_pidfd_getfd
439 common faccessat2 sys_faccessat2
440 common process_madvise sys_process_madvise
+441 common epoll_pwait2 sys_epoll_pwait2
438 common pidfd_getfd sys_pidfd_getfd
439 common faccessat2 sys_faccessat2
440 common process_madvise sys_process_madvise
+441 common epoll_pwait2 sys_epoll_pwait2
438 i386 pidfd_getfd sys_pidfd_getfd
439 i386 faccessat2 sys_faccessat2
440 i386 process_madvise sys_process_madvise
+441 i386 epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2
438 common pidfd_getfd sys_pidfd_getfd
439 common faccessat2 sys_faccessat2
440 common process_madvise sys_process_madvise
+441 common epoll_pwait2 sys_epoll_pwait2
#
# Due to a historical design error, certain syscalls are numbered differently
r = -ENOMEM;
- page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
if (!page)
goto fail_free_lapic;
vcpu->arch.pio_data = page_address(page);
438 common pidfd_getfd sys_pidfd_getfd
439 common faccessat2 sys_faccessat2
440 common process_madvise sys_process_madvise
+441 common epoll_pwait2 sys_epoll_pwait2
*
* we must do our busy polling with irqs enabled
*/
-static void ep_busy_loop(struct eventpoll *ep, int nonblock)
+static bool ep_busy_loop(struct eventpoll *ep, int nonblock)
{
unsigned int napi_id = READ_ONCE(ep->napi_id);
- if ((napi_id >= MIN_NAPI_ID) && net_busy_loop_on())
+ if ((napi_id >= MIN_NAPI_ID) && net_busy_loop_on()) {
napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep, false,
BUSY_POLL_BUDGET);
-}
-
-static inline void ep_reset_busy_poll_napi_id(struct eventpoll *ep)
-{
- if (ep->napi_id)
+ if (ep_events_available(ep))
+ return true;
+ /*
+ * Busy poll timed out. Drop NAPI ID for now, we can add
+ * it back in when we have moved a socket with a valid NAPI
+ * ID onto the ready list.
+ */
ep->napi_id = 0;
+ return false;
+ }
+ return false;
}
/*
#else
-static inline void ep_busy_loop(struct eventpoll *ep, int nonblock)
-{
-}
-
-static inline void ep_reset_busy_poll_napi_id(struct eventpoll *ep)
+static inline bool ep_busy_loop(struct eventpoll *ep, int nonblock)
{
+ return false;
}
static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
poll_table pt;
int res = 0;
+ /*
+ * Always short-circuit for fatal signals to allow threads to make a
+ * timely exit without the chance of finding more events available and
+ * fetching repeatedly.
+ */
+ if (fatal_signal_pending(current))
+ return -EINTR;
+
init_poll_funcptr(&pt, NULL);
mutex_lock(&ep->mtx);
return res;
}
-static inline struct timespec64 ep_set_mstimeout(long ms)
+static struct timespec64 *ep_timeout_to_timespec(struct timespec64 *to, long ms)
{
- struct timespec64 now, ts = {
- .tv_sec = ms / MSEC_PER_SEC,
- .tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC),
- };
+ struct timespec64 now;
+
+ if (ms < 0)
+ return NULL;
+
+ if (!ms) {
+ to->tv_sec = 0;
+ to->tv_nsec = 0;
+ return to;
+ }
+
+ to->tv_sec = ms / MSEC_PER_SEC;
+ to->tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC);
ktime_get_ts64(&now);
- return timespec64_add_safe(now, ts);
+ *to = timespec64_add_safe(now, *to);
+ return to;
}
/**
* stored.
* @maxevents: Size (in terms of number of events) of the caller event buffer.
* @timeout: Maximum timeout for the ready events fetch operation, in
- * milliseconds. If the @timeout is zero, the function will not block,
- * while if the @timeout is less than zero, the function will block
+ * timespec. If the timeout is zero, the function will not block,
+ * while if the @timeout ptr is NULL, the function will block
* until at least one event has been retrieved (or an error
* occurred).
*
* error code, in case of error.
*/
static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
- int maxevents, long timeout)
+ int maxevents, struct timespec64 *timeout)
{
- int res = 0, eavail, timed_out = 0;
+ int res, eavail, timed_out = 0;
u64 slack = 0;
wait_queue_entry_t wait;
ktime_t expires, *to = NULL;
lockdep_assert_irqs_enabled();
- if (timeout > 0) {
- struct timespec64 end_time = ep_set_mstimeout(timeout);
-
- slack = select_estimate_accuracy(&end_time);
+ if (timeout && (timeout->tv_sec | timeout->tv_nsec)) {
+ slack = select_estimate_accuracy(timeout);
to = &expires;
- *to = timespec64_to_ktime(end_time);
- } else if (timeout == 0) {
+ *to = timespec64_to_ktime(*timeout);
+ } else if (timeout) {
/*
* Avoid the unnecessary trip to the wait queue loop, if the
- * caller specified a non blocking operation. We still need
- * lock because we could race and not see an epi being added
- * to the ready list while in irq callback. Thus incorrectly
- * returning 0 back to userspace.
+ * caller specified a non blocking operation.
*/
timed_out = 1;
-
- write_lock_irq(&ep->lock);
- eavail = ep_events_available(ep);
- write_unlock_irq(&ep->lock);
-
- goto send_events;
}
-fetch_events:
+ /*
+ * This call is racy: We may or may not see events that are being added
+ * to the ready list under the lock (e.g., in IRQ callbacks). For, cases
+ * with a non-zero timeout, this thread will check the ready list under
+ * lock and will added to the wait queue. For, cases with a zero
+ * timeout, the user by definition should not care and will have to
+ * recheck again.
+ */
+ eavail = ep_events_available(ep);
+
+ while (1) {
+ if (eavail) {
+ /*
+ * Try to transfer events to user space. In case we get
+ * 0 events and there's still timeout left over, we go
+ * trying again in search of more luck.
+ */
+ res = ep_send_events(ep, events, maxevents);
+ if (res)
+ return res;
+ }
- if (!ep_events_available(ep))
- ep_busy_loop(ep, timed_out);
+ if (timed_out)
+ return 0;
- eavail = ep_events_available(ep);
- if (eavail)
- goto send_events;
+ eavail = ep_busy_loop(ep, timed_out);
+ if (eavail)
+ continue;
- /*
- * Busy poll timed out. Drop NAPI ID for now, we can add
- * it back in when we have moved a socket with a valid NAPI
- * ID onto the ready list.
- */
- ep_reset_busy_poll_napi_id(ep);
+ if (signal_pending(current))
+ return -EINTR;
- do {
/*
* Internally init_wait() uses autoremove_wake_function(),
* thus wait entry is removed from the wait queue on each
* important.
*/
eavail = ep_events_available(ep);
- if (!eavail) {
- if (signal_pending(current))
- res = -EINTR;
- else
- __add_wait_queue_exclusive(&ep->wq, &wait);
- }
- write_unlock_irq(&ep->lock);
-
- if (eavail || res)
- break;
-
- if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) {
- timed_out = 1;
- break;
- }
-
- /* We were woken up, thus go and try to harvest some events */
- eavail = 1;
-
- } while (0);
+ if (!eavail)
+ __add_wait_queue_exclusive(&ep->wq, &wait);
- __set_current_state(TASK_RUNNING);
-
- if (!list_empty_careful(&wait.entry)) {
- write_lock_irq(&ep->lock);
- __remove_wait_queue(&ep->wq, &wait);
write_unlock_irq(&ep->lock);
- }
-send_events:
- if (fatal_signal_pending(current)) {
+ if (!eavail)
+ timed_out = !schedule_hrtimeout_range(to, slack,
+ HRTIMER_MODE_ABS);
+ __set_current_state(TASK_RUNNING);
+
/*
- * Always short-circuit for fatal signals to allow
- * threads to make a timely exit without the chance of
- * finding more events available and fetching
- * repeatedly.
+ * We were woken up, thus go and try to harvest some events.
+ * If timed out and still on the wait queue, recheck eavail
+ * carefully under lock, below.
*/
- res = -EINTR;
- }
- /*
- * Try to transfer events to user space. In case we get 0 events and
- * there's still timeout left over, we go trying again in search of
- * more luck.
- */
- if (!res && eavail &&
- !(res = ep_send_events(ep, events, maxevents)) && !timed_out)
- goto fetch_events;
+ eavail = 1;
- return res;
+ if (!list_empty_careful(&wait.entry)) {
+ write_lock_irq(&ep->lock);
+ /*
+ * If the thread timed out and is not on the wait queue,
+ * it means that the thread was woken up after its
+ * timeout expired before it could reacquire the lock.
+ * Thus, when wait.entry is empty, it needs to harvest
+ * events.
+ */
+ if (timed_out)
+ eavail = list_empty(&wait.entry);
+ __remove_wait_queue(&ep->wq, &wait);
+ write_unlock_irq(&ep->lock);
+ }
+ }
}
/**
* part of the user space epoll_wait(2).
*/
static int do_epoll_wait(int epfd, struct epoll_event __user *events,
- int maxevents, int timeout)
+ int maxevents, struct timespec64 *to)
{
int error;
struct fd f;
ep = f.file->private_data;
/* Time to fish for events ... */
- error = ep_poll(ep, events, maxevents, timeout);
+ error = ep_poll(ep, events, maxevents, to);
error_fput:
fdput(f);
SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
int, maxevents, int, timeout)
{
- return do_epoll_wait(epfd, events, maxevents, timeout);
+ struct timespec64 to;
+
+ return do_epoll_wait(epfd, events, maxevents,
+ ep_timeout_to_timespec(&to, timeout));
}
/*
* Implement the event wait interface for the eventpoll file. It is the kernel
* part of the user space epoll_pwait(2).
*/
-SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
- int, maxevents, int, timeout, const sigset_t __user *, sigmask,
- size_t, sigsetsize)
+static int do_epoll_pwait(int epfd, struct epoll_event __user *events,
+ int maxevents, struct timespec64 *to,
+ const sigset_t __user *sigmask, size_t sigsetsize)
{
int error;
if (error)
return error;
- error = do_epoll_wait(epfd, events, maxevents, timeout);
+ error = do_epoll_wait(epfd, events, maxevents, to);
+
restore_saved_sigmask_unless(error == -EINTR);
return error;
}
+SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
+ int, maxevents, int, timeout, const sigset_t __user *, sigmask,
+ size_t, sigsetsize)
+{
+ struct timespec64 to;
+
+ return do_epoll_pwait(epfd, events, maxevents,
+ ep_timeout_to_timespec(&to, timeout),
+ sigmask, sigsetsize);
+}
+
+SYSCALL_DEFINE6(epoll_pwait2, int, epfd, struct epoll_event __user *, events,
+ int, maxevents, const struct __kernel_timespec __user *, timeout,
+ const sigset_t __user *, sigmask, size_t, sigsetsize)
+{
+ struct timespec64 ts, *to = NULL;
+
+ if (timeout) {
+ if (get_timespec64(&ts, timeout))
+ return -EFAULT;
+ to = &ts;
+ if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
+ return -EINVAL;
+ }
+
+ return do_epoll_pwait(epfd, events, maxevents, to,
+ sigmask, sigsetsize);
+}
+
#ifdef CONFIG_COMPAT
-COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
- struct epoll_event __user *, events,
- int, maxevents, int, timeout,
- const compat_sigset_t __user *, sigmask,
- compat_size_t, sigsetsize)
+static int do_compat_epoll_pwait(int epfd, struct epoll_event __user *events,
+ int maxevents, struct timespec64 *timeout,
+ const compat_sigset_t __user *sigmask,
+ compat_size_t sigsetsize)
{
long err;
return err;
err = do_epoll_wait(epfd, events, maxevents, timeout);
+
restore_saved_sigmask_unless(err == -EINTR);
return err;
}
+
+COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
+ struct epoll_event __user *, events,
+ int, maxevents, int, timeout,
+ const compat_sigset_t __user *, sigmask,
+ compat_size_t, sigsetsize)
+{
+ struct timespec64 to;
+
+ return do_compat_epoll_pwait(epfd, events, maxevents,
+ ep_timeout_to_timespec(&to, timeout),
+ sigmask, sigsetsize);
+}
+
+COMPAT_SYSCALL_DEFINE6(epoll_pwait2, int, epfd,
+ struct epoll_event __user *, events,
+ int, maxevents,
+ const struct __kernel_timespec __user *, timeout,
+ const compat_sigset_t __user *, sigmask,
+ compat_size_t, sigsetsize)
+{
+ struct timespec64 ts, *to = NULL;
+
+ if (timeout) {
+ if (get_timespec64(&ts, timeout))
+ return -EFAULT;
+ to = &ts;
+ if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
+ return -EINVAL;
+ }
+
+ return do_compat_epoll_pwait(epfd, events, maxevents, to,
+ sigmask, sigsetsize);
+}
+
#endif
static int __init eventpoll_init(void)
int maxevents, int timeout,
const compat_sigset_t __user *sigmask,
compat_size_t sigsetsize);
+asmlinkage long compat_sys_epoll_pwait2(int epfd,
+ struct epoll_event __user *events,
+ int maxevents,
+ const struct __kernel_timespec __user *timeout,
+ const compat_sigset_t __user *sigmask,
+ compat_size_t sigsetsize);
/* fs/fcntl.c */
asmlinkage long compat_sys_fcntl(unsigned int fd, unsigned int cmd,
/**
* mem_cgroup_lruvec - get the lru list vector for a memcg & node
* @memcg: memcg of the wanted lruvec
+ * @pgdat: pglist_data
*
* Returns the lru list vector holding pages for a given @memcg &
- * @node combination. This can be the node lruvec, if the memory
+ * @pgdat combination. This can be the node lruvec, if the memory
* controller is disabled.
*/
static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg,
return lruvec;
}
-struct lruvec *mem_cgroup_page_lruvec(struct page *, struct pglist_data *);
+/**
+ * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
+ * @page: the page
+ * @pgdat: pgdat of the page
+ *
+ * This function relies on page->mem_cgroup being stable.
+ */
+static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page,
+ struct pglist_data *pgdat)
+{
+ struct mem_cgroup *memcg = page_memcg(page);
+
+ VM_WARN_ON_ONCE_PAGE(!memcg, page);
+ return mem_cgroup_lruvec(memcg, pgdat);
+}
static inline bool lruvec_holds_page_lru_lock(struct page *page,
struct lruvec *lruvec)
local_irq_restore(flags);
}
-/**
- * mod_memcg_page_state - update page state statistics
- * @page: the page
- * @idx: page state item to account
- * @val: number of pages (positive or negative)
- *
- * The @page must be locked or the caller must use lock_page_memcg()
- * to prevent double accounting when the page is concurrently being
- * moved to another memcg:
- *
- * lock_page(page) or lock_page_memcg(page)
- * if (TestClearPageState(page))
- * mod_memcg_page_state(page, state, -1);
- * unlock_page(page) or unlock_page_memcg(page)
- *
- * Kernel pages are an exception to this, since they'll never move.
- */
-static inline void __mod_memcg_page_state(struct page *page,
- int idx, int val)
-{
- struct mem_cgroup *memcg = page_memcg(page);
-
- if (memcg)
- __mod_memcg_state(memcg, idx, val);
-}
-
-static inline void mod_memcg_page_state(struct page *page,
- int idx, int val)
-{
- struct mem_cgroup *memcg = page_memcg(page);
-
- if (memcg)
- mod_memcg_state(memcg, idx, val);
-}
-
static inline unsigned long lruvec_page_state(struct lruvec *lruvec,
enum node_stat_item idx)
{
{
}
-static inline void __mod_memcg_page_state(struct page *page,
- int idx,
- int nr)
-{
-}
-
-static inline void mod_memcg_page_state(struct page *page,
- int idx,
- int nr)
-{
-}
-
static inline unsigned long lruvec_page_state(struct lruvec *lruvec,
enum node_stat_item idx)
{
}
#endif /* CONFIG_MEMCG */
-/* idx can be of type enum memcg_stat_item or node_stat_item */
-static inline void __inc_memcg_state(struct mem_cgroup *memcg,
- int idx)
-{
- __mod_memcg_state(memcg, idx, 1);
-}
-
-/* idx can be of type enum memcg_stat_item or node_stat_item */
-static inline void __dec_memcg_state(struct mem_cgroup *memcg,
- int idx)
-{
- __mod_memcg_state(memcg, idx, -1);
-}
-
-/* idx can be of type enum memcg_stat_item or node_stat_item */
-static inline void __inc_memcg_page_state(struct page *page,
- int idx)
-{
- __mod_memcg_page_state(page, idx, 1);
-}
-
-/* idx can be of type enum memcg_stat_item or node_stat_item */
-static inline void __dec_memcg_page_state(struct page *page,
- int idx)
-{
- __mod_memcg_page_state(page, idx, -1);
-}
-
static inline void __inc_lruvec_kmem_state(void *p, enum node_stat_item idx)
{
__mod_lruvec_kmem_state(p, idx, 1);
__mod_lruvec_kmem_state(p, idx, -1);
}
-/* idx can be of type enum memcg_stat_item or node_stat_item */
-static inline void inc_memcg_state(struct mem_cgroup *memcg,
- int idx)
-{
- mod_memcg_state(memcg, idx, 1);
-}
-
-/* idx can be of type enum memcg_stat_item or node_stat_item */
-static inline void dec_memcg_state(struct mem_cgroup *memcg,
- int idx)
-{
- mod_memcg_state(memcg, idx, -1);
-}
-
-/* idx can be of type enum memcg_stat_item or node_stat_item */
-static inline void inc_memcg_page_state(struct page *page,
- int idx)
-{
- mod_memcg_page_state(page, idx, 1);
-}
-
-/* idx can be of type enum memcg_stat_item or node_stat_item */
-static inline void dec_memcg_page_state(struct page *page,
- int idx)
-{
- mod_memcg_page_state(page, idx, -1);
-}
-
static inline struct lruvec *parent_lruvec(struct lruvec *lruvec)
{
struct mem_cgroup *memcg;
__memcg_kmem_uncharge_page(page, order);
}
-static inline int memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp,
- unsigned int nr_pages)
-{
- if (memcg_kmem_enabled())
- return __memcg_kmem_charge(memcg, gfp, nr_pages);
- return 0;
-}
-
-static inline void memcg_kmem_uncharge(struct mem_cgroup *memcg,
- unsigned int nr_pages)
-{
- if (memcg_kmem_enabled())
- __memcg_kmem_uncharge(memcg, nr_pages);
-}
-
/*
* A helper for accessing memcg's kmem_id, used for getting
* corresponding LRU lists.
BUG(); \
} \
} while (0)
+#define VM_WARN_ON_ONCE_PAGE(cond, page) ({ \
+ static bool __section(".data.once") __warned; \
+ int __ret_warn_once = !!(cond); \
+ \
+ if (unlikely(__ret_warn_once && !__warned)) { \
+ dump_page(page, "VM_WARN_ON_ONCE_PAGE(" __stringify(cond)")");\
+ __warned = true; \
+ WARN_ON(1); \
+ } \
+ unlikely(__ret_warn_once); \
+})
+
#define VM_WARN_ON(cond) (void)WARN_ON(cond)
#define VM_WARN_ON_ONCE(cond) (void)WARN_ON_ONCE(cond)
#define VM_WARN_ONCE(cond, format...) (void)WARN_ONCE(cond, format)
#define VM_BUG_ON_MM(cond, mm) VM_BUG_ON(cond)
#define VM_WARN_ON(cond) BUILD_BUG_ON_INVALID(cond)
#define VM_WARN_ON_ONCE(cond) BUILD_BUG_ON_INVALID(cond)
+#define VM_WARN_ON_ONCE_PAGE(cond, page) BUILD_BUG_ON_INVALID(cond)
#define VM_WARN_ONCE(cond, format...) BUILD_BUG_ON_INVALID(cond)
#define VM_WARN(cond, format...) BUILD_BUG_ON_INVALID(cond)
#endif
int maxevents, int timeout,
const sigset_t __user *sigmask,
size_t sigsetsize);
+asmlinkage long sys_epoll_pwait2(int epfd, struct epoll_event __user *events,
+ int maxevents,
+ const struct __kernel_timespec __user *timeout,
+ const sigset_t __user *sigmask,
+ size_t sigsetsize);
/* fs/fcntl.c */
asmlinkage long sys_dup(unsigned int fildes);
__SYSCALL(__NR_faccessat2, sys_faccessat2)
#define __NR_process_madvise 440
__SYSCALL(__NR_process_madvise, sys_process_madvise)
+#define __NR_epoll_pwait2 441
+__SC_COMP(__NR_epoll_pwait2, sys_epoll_pwait2, compat_sys_epoll_pwait2)
#undef __NR_syscalls
-#define __NR_syscalls 441
+#define __NR_syscalls 442
/*
* 32 bit systems traditionally used different
COND_SYSCALL(epoll_ctl);
COND_SYSCALL(epoll_pwait);
COND_SYSCALL_COMPAT(epoll_pwait);
+COND_SYSCALL(epoll_pwait2);
+COND_SYSCALL_COMPAT(epoll_pwait2);
/* fs/fcntl.c */
select DEBUG_FS
help
This option enables code in the zsmalloc to collect various
- statistics about whats happening in zsmalloc and exports that
+ statistics about what's happening in zsmalloc and exports that
information to userspace via debugfs.
If unsure, say N.
}
#endif
-/**
- * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
- * @page: the page
- * @pgdat: pgdat of the page
- *
- * This function relies on page's memcg being stable - see the
- * access rules in commit_charge().
- */
-struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgdat)
-{
- struct mem_cgroup_per_node *mz;
- struct mem_cgroup *memcg;
- struct lruvec *lruvec;
-
- if (mem_cgroup_disabled()) {
- lruvec = &pgdat->__lruvec;
- goto out;
- }
-
- memcg = page_memcg(page);
- /*
- * Swapcache readahead pages are added to the LRU - and
- * possibly migrated - before they are charged.
- */
- if (!memcg)
- memcg = root_mem_cgroup;
-
- mz = mem_cgroup_page_nodeinfo(memcg, page);
- lruvec = &mz->lruvec;
-out:
- /*
- * Since a node can be onlined after the mem_cgroup was created,
- * we have to be prepared to initialize lruvec->zone here;
- * and if offlined then reonlined, we need to reinitialize it.
- */
- if (unlikely(lruvec->pgdat != pgdat))
- lruvec->pgdat = pgdat;
- return lruvec;
-}
-
/**
* lock_page_lruvec - lock and return lruvec for a given page.
* @page: the page
return;
memcg = page_memcg(oldpage);
+ VM_WARN_ON_ONCE_PAGE(!memcg, oldpage);
if (!memcg)
return;
VM_BUG_ON_PAGE(PageLRU(page), page);
VM_BUG_ON_PAGE(page_count(page), page);
+ if (mem_cgroup_disabled())
+ return;
+
if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
return;
memcg = page_memcg(page);
- /* Readahead page, never charged */
+ VM_WARN_ON_ONCE_PAGE(!memcg, page);
if (!memcg)
return;
struct mem_cgroup *memcg;
unsigned short oldid;
+ if (mem_cgroup_disabled())
+ return 0;
+
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
return 0;
memcg = page_memcg(page);
- /* Readahead page, never charged */
+ VM_WARN_ON_ONCE_PAGE(!memcg, page);
if (!memcg)
return 0;
// SPDX-License-Identifier: GPL-2.0
#define _GNU_SOURCE
+#include <asm/unistd.h>
+#include <linux/time_types.h>
#include <poll.h>
#include <unistd.h>
#include <assert.h>
pthread_t waiter;
};
+#ifndef __NR_epoll_pwait2
+#define __NR_epoll_pwait2 -1
+#endif
+
+static inline int sys_epoll_pwait2(int fd, struct epoll_event *events,
+ int maxevents,
+ const struct __kernel_timespec *timeout,
+ const sigset_t *sigset, size_t sigsetsize)
+{
+ return syscall(__NR_epoll_pwait2, fd, events, maxevents, timeout,
+ sigset, sigsetsize);
+}
+
static void signal_handler(int signum)
{
}
close(ctx.evfd);
}
+/* Equivalent to basic test epoll1, but exercising epoll_pwait2. */
+TEST(epoll62)
+{
+ int efd;
+ int sfd[2];
+ struct epoll_event e;
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sfd), 0);
+
+ efd = epoll_create(1);
+ ASSERT_GE(efd, 0);
+
+ e.events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(efd, EPOLL_CTL_ADD, sfd[0], &e), 0);
+
+ ASSERT_EQ(write(sfd[1], "w", 1), 1);
+
+ EXPECT_EQ(sys_epoll_pwait2(efd, &e, 1, NULL, NULL, 0), 1);
+ EXPECT_EQ(sys_epoll_pwait2(efd, &e, 1, NULL, NULL, 0), 1);
+
+ close(efd);
+ close(sfd[0]);
+ close(sfd[1]);
+}
+
+/* Epoll_pwait2 basic timeout test. */
+TEST(epoll63)
+{
+ const int cfg_delay_ms = 10;
+ unsigned long long tdiff;
+ struct __kernel_timespec ts;
+ int efd;
+ int sfd[2];
+ struct epoll_event e;
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sfd), 0);
+
+ efd = epoll_create(1);
+ ASSERT_GE(efd, 0);
+
+ e.events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(efd, EPOLL_CTL_ADD, sfd[0], &e), 0);
+
+ ts.tv_sec = 0;
+ ts.tv_nsec = cfg_delay_ms * 1000 * 1000;
+
+ tdiff = msecs();
+ EXPECT_EQ(sys_epoll_pwait2(efd, &e, 1, &ts, NULL, 0), 0);
+ tdiff = msecs() - tdiff;
+
+ EXPECT_GE(tdiff, cfg_delay_ms);
+
+ close(efd);
+ close(sfd[0]);
+ close(sfd[1]);
+}
+
TEST_HARNESS_MAIN
{
struct page *page;
- page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
if (!page)
return -ENOMEM;
}
BUILD_BUG_ON(sizeof(struct kvm_run) > PAGE_SIZE);
- page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
if (!page) {
r = -ENOMEM;
goto vcpu_free;