Merge branches 'clk-range', 'clk-uniphier', 'clk-apple' and 'clk-qcom' into clk-next
[linux-2.6-microblaze.git] / tools / testing / selftests / vm / userfaultfd.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Stress userfaultfd syscall.
4  *
5  *  Copyright (C) 2015  Red Hat, Inc.
6  *
7  * This test allocates two virtual areas and bounces the physical
8  * memory across the two virtual areas (from area_src to area_dst)
9  * using userfaultfd.
10  *
11  * There are three threads running per CPU:
12  *
13  * 1) one per-CPU thread takes a per-page pthread_mutex in a random
14  *    page of the area_dst (while the physical page may still be in
15  *    area_src), and increments a per-page counter in the same page,
16  *    and checks its value against a verification region.
17  *
18  * 2) another per-CPU thread handles the userfaults generated by
19  *    thread 1 above. userfaultfd blocking reads or poll() modes are
20  *    exercised interleaved.
21  *
22  * 3) one last per-CPU thread transfers the memory in the background
23  *    at maximum bandwidth (if not already transferred by thread
24  *    2). Each cpu thread takes care of transferring a portion of the
25  *    area.
26  *
27  * When all threads of type 3 completed the transfer, one bounce is
28  * complete. area_src and area_dst are then swapped. All threads are
29  * respawned and so the bounce is immediately restarted in the
30  * opposite direction.
31  *
32  * per-CPU threads 1 by triggering userfaults inside
33  * pthread_mutex_lock will also verify the atomicity of the memory
34  * transfer (UFFDIO_COPY).
35  */
36
37 #define _GNU_SOURCE
38 #include <stdio.h>
39 #include <errno.h>
40 #include <unistd.h>
41 #include <stdlib.h>
42 #include <sys/types.h>
43 #include <sys/stat.h>
44 #include <fcntl.h>
45 #include <time.h>
46 #include <signal.h>
47 #include <poll.h>
48 #include <string.h>
49 #include <sys/mman.h>
50 #include <sys/syscall.h>
51 #include <sys/ioctl.h>
52 #include <sys/wait.h>
53 #include <pthread.h>
54 #include <linux/userfaultfd.h>
55 #include <setjmp.h>
56 #include <stdbool.h>
57 #include <assert.h>
58 #include <inttypes.h>
59 #include <stdint.h>
60 #include <sys/random.h>
61
62 #include "../kselftest.h"
63
64 #ifdef __NR_userfaultfd
65
/*
 * Test geometry: nr_cpus sizes the per-CPU thread and pipe arrays, each
 * area spans nr_pages * page_size bytes, and every CPU is handed a slice
 * of nr_pages_per_cpu pages.
 */
static unsigned long nr_cpus;
static unsigned long nr_pages;
static unsigned long nr_pages_per_cpu;
static unsigned long page_size;

/* Flag bits tested against "bounces" to pick the behaviour of a bounce. */
#define BOUNCE_RANDOM           (1 << 0)
#define BOUNCE_RACINGFAULTS     (1 << 1)
#define BOUNCE_VERIFY           (1 << 2)
#define BOUNCE_POLL             (1 << 3)
static int bounces;

/* Kind of memory under test; one of these is stored in test_type. */
#define TEST_ANON       1
#define TEST_HUGETLB    2
#define TEST_SHMEM      3
static int test_type;

/* exercise the test_uffdio_*_eexist every ALARM_INTERVAL_SECS */
#define ALARM_INTERVAL_SECS 10
static volatile bool test_uffdio_copy_eexist = true;
static volatile bool test_uffdio_zeropage_eexist = true;
/* Whether to test uffd write-protection */
static bool test_uffdio_wp = false;
/* Whether to test uffd minor faults */
static bool test_uffdio_minor = false;

/* Selects MAP_SHARED instead of MAP_PRIVATE for the hugetlb mappings. */
static bool map_shared;
/* File descriptors that back the shmem and hugetlb mappings. */
static int shm_fd;
static int huge_fd;
/* The hugetlb mapping that starts at offset 0 of huge_fd. */
static char *huge_fd_off0;
/* Expected value of every per-page counter, indexed by page number. */
static unsigned long long *count_verify;
/* The userfaultfd descriptor, or -1 while none is open. */
static int uffd = -1;
/* Result of fcntl(uffd, F_GETFD) taken right after opening uffd. */
static int uffd_flags;
/* Set non-zero to make the locking threads leave their loop. */
static int finished;
/* nr_cpus pipes (two descriptors each) used to stop the poll threads. */
static int *pipefd;
/* The two bounce areas and their alias mappings (when the backend has one). */
static char *area_src;
static char *area_src_alias;
static char *area_dst;
static char *area_dst_alias;
static char *zeropage;
/* Attributes handed to every pthread_create() call. */
pthread_attr_t attr;
98
/*
 * Userfaultfd test statistics.
 *
 * One instance per CPU: "cpu" is the index of the owning CPU and the three
 * counters record how many faults of each kind its handler thread resolved.
 */
struct uffd_stats {
        int cpu;
        unsigned long missing_faults;
        unsigned long wp_faults;
        unsigned long minor_faults;
};
106
/*
 * pthread_mutex_t starts at page offset 0.
 *
 * Yields a pointer to the mutex stored at the beginning of page ___nr
 * of ___area.
 */
#define area_mutex(___area, ___nr)                                      \
        ((pthread_mutex_t *) ((___area) + (___nr)*page_size))
/*
 * count is placed in the page after pthread_mutex_t naturally aligned
 * to avoid non alignment faults on non-x86 archs.
 *
 * The address is the start of page ___nr plus sizeof(pthread_mutex_t),
 * rounded up to the next multiple of sizeof(unsigned long long).
 */
#define area_count(___area, ___nr)                                      \
        ((volatile unsigned long long *) ((unsigned long)               \
                                 ((___area) + (___nr)*page_size +       \
                                  sizeof(pthread_mutex_t) +             \
                                  sizeof(unsigned long long) - 1) &     \
                                 ~(unsigned long)(sizeof(unsigned long long) \
                                                  -  1)))
121
/* Example command lines, printed verbatim by usage(). */
const char *examples =
    "# Run anonymous memory test on 100MiB region with 99999 bounces:\n"
    "./userfaultfd anon 100 99999\n\n"
    "# Run share memory test on 1GiB region with 99 bounces:\n"
    "./userfaultfd shmem 1000 99\n\n"
    "# Run hugetlb memory test on 256MiB region with 50 bounces (using /dev/hugepages/hugefile):\n"
    "./userfaultfd hugetlb 256 50 /dev/hugepages/hugefile\n\n"
    "# Run the same hugetlb test but using shmem:\n"
    "./userfaultfd hugetlb_shared 256 50 /dev/hugepages/hugefile\n\n"
    "# 10MiB-~6GiB 999 bounces anonymous test, "
    "continue forever unless an error triggers\n"
    "while ./userfaultfd anon $[RANDOM % 6000 + 10] 999; do true; done\n\n";
134
135 static void usage(void)
136 {
137         fprintf(stderr, "\nUsage: ./userfaultfd <test type> <MiB> <bounces> "
138                 "[hugetlbfs_file]\n\n");
139         fprintf(stderr, "Supported <test type>: anon, hugetlb, "
140                 "hugetlb_shared, shmem\n\n");
141         fprintf(stderr, "Examples:\n\n");
142         fprintf(stderr, "%s", examples);
143         exit(1);
144 }
145
/*
 * Print "ERROR: <formatted message> (errno=N, line=L)" to stderr without
 * terminating the program.
 *
 * errno is captured before the first fprintf() because that call may
 * overwrite it.  The capture variable deliberately has a macro-private
 * name: the format arguments are expanded inside this block, so a generic
 * name such as "ret" would shadow a caller's own "ret" and the message
 * would silently print errno in its place.
 */
#define _err(fmt, ...)                                          \
        do {                                                    \
                int _err_saved_errno = errno;                   \
                fprintf(stderr, "ERROR: " fmt, ##__VA_ARGS__);  \
                fprintf(stderr, " (errno=%d, line=%d)\n",       \
                        _err_saved_errno, __LINE__);            \
        } while (0)
153
/*
 * Fatal variant of _err(): report the message through _err() and then
 * terminate the process with exit status 1.
 */
#define err(fmt, ...)                           \
        do {                                    \
                _err(fmt, ##__VA_ARGS__);       \
                exit(1);                        \
        } while (0)
159
160 static void uffd_stats_reset(struct uffd_stats *uffd_stats,
161                              unsigned long n_cpus)
162 {
163         int i;
164
165         for (i = 0; i < n_cpus; i++) {
166                 uffd_stats[i].cpu = i;
167                 uffd_stats[i].missing_faults = 0;
168                 uffd_stats[i].wp_faults = 0;
169                 uffd_stats[i].minor_faults = 0;
170         }
171 }
172
173 static void uffd_stats_report(struct uffd_stats *stats, int n_cpus)
174 {
175         int i;
176         unsigned long long miss_total = 0, wp_total = 0, minor_total = 0;
177
178         for (i = 0; i < n_cpus; i++) {
179                 miss_total += stats[i].missing_faults;
180                 wp_total += stats[i].wp_faults;
181                 minor_total += stats[i].minor_faults;
182         }
183
184         printf("userfaults: ");
185         if (miss_total) {
186                 printf("%llu missing (", miss_total);
187                 for (i = 0; i < n_cpus; i++)
188                         printf("%lu+", stats[i].missing_faults);
189                 printf("\b) ");
190         }
191         if (wp_total) {
192                 printf("%llu wp (", wp_total);
193                 for (i = 0; i < n_cpus; i++)
194                         printf("%lu+", stats[i].wp_faults);
195                 printf("\b) ");
196         }
197         if (minor_total) {
198                 printf("%llu minor (", minor_total);
199                 for (i = 0; i < n_cpus; i++)
200                         printf("%lu+", stats[i].minor_faults);
201                 printf("\b)");
202         }
203         printf("\n");
204 }
205
206 static void anon_release_pages(char *rel_area)
207 {
208         if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED))
209                 err("madvise(MADV_DONTNEED) failed");
210 }
211
212 static void anon_allocate_area(void **alloc_area)
213 {
214         *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
215                            MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
216         if (*alloc_area == MAP_FAILED)
217                 err("mmap of anonymous memory failed");
218 }
219
220 static void noop_alias_mapping(__u64 *start, size_t len, unsigned long offset)
221 {
222 }
223
224 static void hugetlb_release_pages(char *rel_area)
225 {
226         if (fallocate(huge_fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
227                       rel_area == huge_fd_off0 ? 0 : nr_pages * page_size,
228                       nr_pages * page_size))
229                 err("fallocate() failed");
230 }
231
232 static void hugetlb_allocate_area(void **alloc_area)
233 {
234         void *area_alias = NULL;
235         char **alloc_area_alias;
236
237         *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
238                            (map_shared ? MAP_SHARED : MAP_PRIVATE) |
239                            MAP_HUGETLB |
240                            (*alloc_area == area_src ? 0 : MAP_NORESERVE),
241                            huge_fd, *alloc_area == area_src ? 0 :
242                            nr_pages * page_size);
243         if (*alloc_area == MAP_FAILED)
244                 err("mmap of hugetlbfs file failed");
245
246         if (map_shared) {
247                 area_alias = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
248                                   MAP_SHARED | MAP_HUGETLB,
249                                   huge_fd, *alloc_area == area_src ? 0 :
250                                   nr_pages * page_size);
251                 if (area_alias == MAP_FAILED)
252                         err("mmap of hugetlb file alias failed");
253         }
254
255         if (*alloc_area == area_src) {
256                 huge_fd_off0 = *alloc_area;
257                 alloc_area_alias = &area_src_alias;
258         } else {
259                 alloc_area_alias = &area_dst_alias;
260         }
261         if (area_alias)
262                 *alloc_area_alias = area_alias;
263 }
264
265 static void hugetlb_alias_mapping(__u64 *start, size_t len, unsigned long offset)
266 {
267         if (!map_shared)
268                 return;
269         /*
270          * We can't zap just the pagetable with hugetlbfs because
271          * MADV_DONTEED won't work. So exercise -EEXIST on a alias
272          * mapping where the pagetables are not established initially,
273          * this way we'll exercise the -EEXEC at the fs level.
274          */
275         *start = (unsigned long) area_dst_alias + offset;
276 }
277
278 static void shmem_release_pages(char *rel_area)
279 {
280         if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE))
281                 err("madvise(MADV_REMOVE) failed");
282 }
283
284 static void shmem_allocate_area(void **alloc_area)
285 {
286         void *area_alias = NULL;
287         bool is_src = alloc_area == (void **)&area_src;
288         unsigned long offset = is_src ? 0 : nr_pages * page_size;
289
290         *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
291                            MAP_SHARED, shm_fd, offset);
292         if (*alloc_area == MAP_FAILED)
293                 err("mmap of memfd failed");
294
295         area_alias = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
296                           MAP_SHARED, shm_fd, offset);
297         if (area_alias == MAP_FAILED)
298                 err("mmap of memfd alias failed");
299
300         if (is_src)
301                 area_src_alias = area_alias;
302         else
303                 area_dst_alias = area_alias;
304 }
305
306 static void shmem_alias_mapping(__u64 *start, size_t len, unsigned long offset)
307 {
308         *start = (unsigned long)area_dst_alias + offset;
309 }
310
/*
 * Per-memory-type hooks:
 *  - allocate_area: map one test area and store it in *alloc_area
 *  - release_pages: drop the pages backing the area at rel_area
 *  - alias_mapping: optionally redirect *start to an alias mapping
 */
struct uffd_test_ops {
        void (*allocate_area)(void **alloc_area);
        void (*release_pages)(char *rel_area);
        void (*alias_mapping)(__u64 *start, size_t len, unsigned long offset);
};
316
317 static struct uffd_test_ops anon_uffd_test_ops = {
318         .allocate_area  = anon_allocate_area,
319         .release_pages  = anon_release_pages,
320         .alias_mapping = noop_alias_mapping,
321 };
322
323 static struct uffd_test_ops shmem_uffd_test_ops = {
324         .allocate_area  = shmem_allocate_area,
325         .release_pages  = shmem_release_pages,
326         .alias_mapping = shmem_alias_mapping,
327 };
328
329 static struct uffd_test_ops hugetlb_uffd_test_ops = {
330         .allocate_area  = hugetlb_allocate_area,
331         .release_pages  = hugetlb_release_pages,
332         .alias_mapping = hugetlb_alias_mapping,
333 };
334
/* Hook table in use; the helpers call allocate/release/alias through it. */
static struct uffd_test_ops *uffd_test_ops;
336
337 static inline uint64_t uffd_minor_feature(void)
338 {
339         if (test_type == TEST_HUGETLB && map_shared)
340                 return UFFD_FEATURE_MINOR_HUGETLBFS;
341         else if (test_type == TEST_SHMEM)
342                 return UFFD_FEATURE_MINOR_SHMEM;
343         else
344                 return 0;
345 }
346
347 static uint64_t get_expected_ioctls(uint64_t mode)
348 {
349         uint64_t ioctls = UFFD_API_RANGE_IOCTLS;
350
351         if (test_type == TEST_HUGETLB)
352                 ioctls &= ~(1 << _UFFDIO_ZEROPAGE);
353
354         if (!((mode & UFFDIO_REGISTER_MODE_WP) && test_uffdio_wp))
355                 ioctls &= ~(1 << _UFFDIO_WRITEPROTECT);
356
357         if (!((mode & UFFDIO_REGISTER_MODE_MINOR) && test_uffdio_minor))
358                 ioctls &= ~(1 << _UFFDIO_CONTINUE);
359
360         return ioctls;
361 }
362
/*
 * Abort unless every ioctl bit expected for the given mode is set in
 * ioctls; extra bits in ioctls are ignored.
 */
static void assert_expected_ioctls_present(uint64_t mode, uint64_t ioctls)
{
        uint64_t wanted = get_expected_ioctls(mode);
        uint64_t present = ioctls & wanted;

        if (present == wanted)
                return;

        err("missing ioctl(s): expected %"PRIx64" actual: %"PRIx64,
            wanted, present);
}
373
374 static void userfaultfd_open(uint64_t *features)
375 {
376         struct uffdio_api uffdio_api;
377
378         uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY);
379         if (uffd < 0)
380                 err("userfaultfd syscall not available in this kernel");
381         uffd_flags = fcntl(uffd, F_GETFD, NULL);
382
383         uffdio_api.api = UFFD_API;
384         uffdio_api.features = *features;
385         if (ioctl(uffd, UFFDIO_API, &uffdio_api))
386                 err("UFFDIO_API failed.\nPlease make sure to "
387                     "run with either root or ptrace capability.");
388         if (uffdio_api.api != UFFD_API)
389                 err("UFFDIO_API error: %" PRIu64, (uint64_t)uffdio_api.api);
390
391         *features = uffdio_api.features;
392 }
393
394 static inline void munmap_area(void **area)
395 {
396         if (*area)
397                 if (munmap(*area, nr_pages * page_size))
398                         err("munmap");
399
400         *area = NULL;
401 }
402
403 static void uffd_test_ctx_clear(void)
404 {
405         size_t i;
406
407         if (pipefd) {
408                 for (i = 0; i < nr_cpus * 2; ++i) {
409                         if (close(pipefd[i]))
410                                 err("close pipefd");
411                 }
412                 free(pipefd);
413                 pipefd = NULL;
414         }
415
416         if (count_verify) {
417                 free(count_verify);
418                 count_verify = NULL;
419         }
420
421         if (uffd != -1) {
422                 if (close(uffd))
423                         err("close uffd");
424                 uffd = -1;
425         }
426
427         huge_fd_off0 = NULL;
428         munmap_area((void **)&area_src);
429         munmap_area((void **)&area_src_alias);
430         munmap_area((void **)&area_dst);
431         munmap_area((void **)&area_dst_alias);
432 }
433
434 static void uffd_test_ctx_init(uint64_t features)
435 {
436         unsigned long nr, cpu;
437
438         uffd_test_ctx_clear();
439
440         uffd_test_ops->allocate_area((void **)&area_src);
441         uffd_test_ops->allocate_area((void **)&area_dst);
442
443         userfaultfd_open(&features);
444
445         count_verify = malloc(nr_pages * sizeof(unsigned long long));
446         if (!count_verify)
447                 err("count_verify");
448
449         for (nr = 0; nr < nr_pages; nr++) {
450                 *area_mutex(area_src, nr) =
451                         (pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER;
452                 count_verify[nr] = *area_count(area_src, nr) = 1;
453                 /*
454                  * In the transition between 255 to 256, powerpc will
455                  * read out of order in my_bcmp and see both bytes as
456                  * zero, so leave a placeholder below always non-zero
457                  * after the count, to avoid my_bcmp to trigger false
458                  * positives.
459                  */
460                 *(area_count(area_src, nr) + 1) = 1;
461         }
462
463         /*
464          * After initialization of area_src, we must explicitly release pages
465          * for area_dst to make sure it's fully empty.  Otherwise we could have
466          * some area_dst pages be errornously initialized with zero pages,
467          * hence we could hit memory corruption later in the test.
468          *
469          * One example is when THP is globally enabled, above allocate_area()
470          * calls could have the two areas merged into a single VMA (as they
471          * will have the same VMA flags so they're mergeable).  When we
472          * initialize the area_src above, it's possible that some part of
473          * area_dst could have been faulted in via one huge THP that will be
474          * shared between area_src and area_dst.  It could cause some of the
475          * area_dst won't be trapped by missing userfaults.
476          *
477          * This release_pages() will guarantee even if that happened, we'll
478          * proactively split the thp and drop any accidentally initialized
479          * pages within area_dst.
480          */
481         uffd_test_ops->release_pages(area_dst);
482
483         pipefd = malloc(sizeof(int) * nr_cpus * 2);
484         if (!pipefd)
485                 err("pipefd");
486         for (cpu = 0; cpu < nr_cpus; cpu++)
487                 if (pipe2(&pipefd[cpu * 2], O_CLOEXEC | O_NONBLOCK))
488                         err("pipe");
489 }
490
/*
 * Compare the first n bytes of str1 and str2 one byte at a time, in
 * ascending address order.  Returns 0 when they are all equal and 1 at the
 * first mismatch.
 */
static int my_bcmp(char *str1, char *str2, size_t n)
{
        const char *lhs = str1;
        const char *rhs = str2;
        size_t left = n;

        while (left > 0) {
                if (*lhs != *rhs)
                        return 1;
                lhs++;
                rhs++;
                left--;
        }
        return 0;
}
499
500 static void wp_range(int ufd, __u64 start, __u64 len, bool wp)
501 {
502         struct uffdio_writeprotect prms;
503
504         /* Write protection page faults */
505         prms.range.start = start;
506         prms.range.len = len;
507         /* Undo write-protect, do wakeup after that */
508         prms.mode = wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0;
509
510         if (ioctl(ufd, UFFDIO_WRITEPROTECT, &prms))
511                 err("clear WP failed: address=0x%"PRIx64, (uint64_t)start);
512 }
513
514 static void continue_range(int ufd, __u64 start, __u64 len)
515 {
516         struct uffdio_continue req;
517         int ret;
518
519         req.range.start = start;
520         req.range.len = len;
521         req.mode = 0;
522
523         if (ioctl(ufd, UFFDIO_CONTINUE, &req))
524                 err("UFFDIO_CONTINUE failed for address 0x%" PRIx64,
525                     (uint64_t)start);
526
527         /*
528          * Error handling within the kernel for continue is subtly different
529          * from copy or zeropage, so it may be a source of bugs. Trigger an
530          * error (-EEXIST) on purpose, to verify doing so doesn't cause a BUG.
531          */
532         req.mapped = 0;
533         ret = ioctl(ufd, UFFDIO_CONTINUE, &req);
534         if (ret >= 0 || req.mapped != -EEXIST)
535                 err("failed to exercise UFFDIO_CONTINUE error handling, ret=%d, mapped=%" PRId64,
536                     ret, (int64_t) req.mapped);
537 }
538
539 static void *locking_thread(void *arg)
540 {
541         unsigned long cpu = (unsigned long) arg;
542         unsigned long page_nr = *(&(page_nr)); /* uninitialized warning */
543         unsigned long long count;
544
545         if (!(bounces & BOUNCE_RANDOM)) {
546                 page_nr = -bounces;
547                 if (!(bounces & BOUNCE_RACINGFAULTS))
548                         page_nr += cpu * nr_pages_per_cpu;
549         }
550
551         while (!finished) {
552                 if (bounces & BOUNCE_RANDOM) {
553                         if (getrandom(&page_nr, sizeof(page_nr), 0) != sizeof(page_nr))
554                                 err("getrandom failed");
555                 } else
556                         page_nr += 1;
557                 page_nr %= nr_pages;
558                 pthread_mutex_lock(area_mutex(area_dst, page_nr));
559                 count = *area_count(area_dst, page_nr);
560                 if (count != count_verify[page_nr])
561                         err("page_nr %lu memory corruption %llu %llu",
562                             page_nr, count, count_verify[page_nr]);
563                 count++;
564                 *area_count(area_dst, page_nr) = count_verify[page_nr] = count;
565                 pthread_mutex_unlock(area_mutex(area_dst, page_nr));
566         }
567
568         return NULL;
569 }
570
571 static void retry_copy_page(int ufd, struct uffdio_copy *uffdio_copy,
572                             unsigned long offset)
573 {
574         uffd_test_ops->alias_mapping(&uffdio_copy->dst,
575                                      uffdio_copy->len,
576                                      offset);
577         if (ioctl(ufd, UFFDIO_COPY, uffdio_copy)) {
578                 /* real retval in ufdio_copy.copy */
579                 if (uffdio_copy->copy != -EEXIST)
580                         err("UFFDIO_COPY retry error: %"PRId64,
581                             (int64_t)uffdio_copy->copy);
582         } else {
583                 err("UFFDIO_COPY retry unexpected: %"PRId64,
584                     (int64_t)uffdio_copy->copy);
585         }
586 }
587
588 static void wake_range(int ufd, unsigned long addr, unsigned long len)
589 {
590         struct uffdio_range uffdio_wake;
591
592         uffdio_wake.start = addr;
593         uffdio_wake.len = len;
594
595         if (ioctl(ufd, UFFDIO_WAKE, &uffdio_wake))
596                 fprintf(stderr, "error waking %lu\n",
597                         addr), exit(1);
598 }
599
600 static int __copy_page(int ufd, unsigned long offset, bool retry)
601 {
602         struct uffdio_copy uffdio_copy;
603
604         if (offset >= nr_pages * page_size)
605                 err("unexpected offset %lu\n", offset);
606         uffdio_copy.dst = (unsigned long) area_dst + offset;
607         uffdio_copy.src = (unsigned long) area_src + offset;
608         uffdio_copy.len = page_size;
609         if (test_uffdio_wp)
610                 uffdio_copy.mode = UFFDIO_COPY_MODE_WP;
611         else
612                 uffdio_copy.mode = 0;
613         uffdio_copy.copy = 0;
614         if (ioctl(ufd, UFFDIO_COPY, &uffdio_copy)) {
615                 /* real retval in ufdio_copy.copy */
616                 if (uffdio_copy.copy != -EEXIST)
617                         err("UFFDIO_COPY error: %"PRId64,
618                             (int64_t)uffdio_copy.copy);
619                 wake_range(ufd, uffdio_copy.dst, page_size);
620         } else if (uffdio_copy.copy != page_size) {
621                 err("UFFDIO_COPY error: %"PRId64, (int64_t)uffdio_copy.copy);
622         } else {
623                 if (test_uffdio_copy_eexist && retry) {
624                         test_uffdio_copy_eexist = false;
625                         retry_copy_page(ufd, &uffdio_copy, offset);
626                 }
627                 return 1;
628         }
629         return 0;
630 }
631
/* __copy_page() with the -EEXIST retry exercise enabled. */
static int copy_page_retry(int ufd, unsigned long offset)
{
        const bool exercise_retry = true;

        return __copy_page(ufd, offset, exercise_retry);
}
636
/* __copy_page() without the -EEXIST retry exercise. */
static int copy_page(int ufd, unsigned long offset)
{
        const bool exercise_retry = false;

        return __copy_page(ufd, offset, exercise_retry);
}
641
642 static int uffd_read_msg(int ufd, struct uffd_msg *msg)
643 {
644         int ret = read(uffd, msg, sizeof(*msg));
645
646         if (ret != sizeof(*msg)) {
647                 if (ret < 0) {
648                         if (errno == EAGAIN || errno == EINTR)
649                                 return 1;
650                         err("blocking read error");
651                 } else {
652                         err("short read");
653                 }
654         }
655
656         return 0;
657 }
658
659 static void uffd_handle_page_fault(struct uffd_msg *msg,
660                                    struct uffd_stats *stats)
661 {
662         unsigned long offset;
663
664         if (msg->event != UFFD_EVENT_PAGEFAULT)
665                 err("unexpected msg event %u", msg->event);
666
667         if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) {
668                 /* Write protect page faults */
669                 wp_range(uffd, msg->arg.pagefault.address, page_size, false);
670                 stats->wp_faults++;
671         } else if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR) {
672                 uint8_t *area;
673                 int b;
674
675                 /*
676                  * Minor page faults
677                  *
678                  * To prove we can modify the original range for testing
679                  * purposes, we're going to bit flip this range before
680                  * continuing.
681                  *
682                  * Note that this requires all minor page fault tests operate on
683                  * area_dst (non-UFFD-registered) and area_dst_alias
684                  * (UFFD-registered).
685                  */
686
687                 area = (uint8_t *)(area_dst +
688                                    ((char *)msg->arg.pagefault.address -
689                                     area_dst_alias));
690                 for (b = 0; b < page_size; ++b)
691                         area[b] = ~area[b];
692                 continue_range(uffd, msg->arg.pagefault.address, page_size);
693                 stats->minor_faults++;
694         } else {
695                 /* Missing page faults */
696                 if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
697                         err("unexpected write fault");
698
699                 offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst;
700                 offset &= ~(page_size-1);
701
702                 if (copy_page(uffd, offset))
703                         stats->missing_faults++;
704         }
705 }
706
707 static void *uffd_poll_thread(void *arg)
708 {
709         struct uffd_stats *stats = (struct uffd_stats *)arg;
710         unsigned long cpu = stats->cpu;
711         struct pollfd pollfd[2];
712         struct uffd_msg msg;
713         struct uffdio_register uffd_reg;
714         int ret;
715         char tmp_chr;
716
717         pollfd[0].fd = uffd;
718         pollfd[0].events = POLLIN;
719         pollfd[1].fd = pipefd[cpu*2];
720         pollfd[1].events = POLLIN;
721
722         for (;;) {
723                 ret = poll(pollfd, 2, -1);
724                 if (ret <= 0) {
725                         if (errno == EINTR || errno == EAGAIN)
726                                 continue;
727                         err("poll error: %d", ret);
728                 }
729                 if (pollfd[1].revents & POLLIN) {
730                         if (read(pollfd[1].fd, &tmp_chr, 1) != 1)
731                                 err("read pipefd error");
732                         break;
733                 }
734                 if (!(pollfd[0].revents & POLLIN))
735                         err("pollfd[0].revents %d", pollfd[0].revents);
736                 if (uffd_read_msg(uffd, &msg))
737                         continue;
738                 switch (msg.event) {
739                 default:
740                         err("unexpected msg event %u\n", msg.event);
741                         break;
742                 case UFFD_EVENT_PAGEFAULT:
743                         uffd_handle_page_fault(&msg, stats);
744                         break;
745                 case UFFD_EVENT_FORK:
746                         close(uffd);
747                         uffd = msg.arg.fork.ufd;
748                         pollfd[0].fd = uffd;
749                         break;
750                 case UFFD_EVENT_REMOVE:
751                         uffd_reg.range.start = msg.arg.remove.start;
752                         uffd_reg.range.len = msg.arg.remove.end -
753                                 msg.arg.remove.start;
754                         if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_reg.range))
755                                 err("remove failure");
756                         break;
757                 case UFFD_EVENT_REMAP:
758                         area_dst = (char *)(unsigned long)msg.arg.remap.to;
759                         break;
760                 }
761         }
762
763         return NULL;
764 }
765
/*
 * Start-up handshake for uffd_read_thread(): the thread unlocks this mutex
 * as its first action, and stress() locks it after creating the thread.
 */
pthread_mutex_t uffd_read_mutex = PTHREAD_MUTEX_INITIALIZER;
767
768 static void *uffd_read_thread(void *arg)
769 {
770         struct uffd_stats *stats = (struct uffd_stats *)arg;
771         struct uffd_msg msg;
772
773         pthread_mutex_unlock(&uffd_read_mutex);
774         /* from here cancellation is ok */
775
776         for (;;) {
777                 if (uffd_read_msg(uffd, &msg))
778                         continue;
779                 uffd_handle_page_fault(&msg, stats);
780         }
781
782         return NULL;
783 }
784
785 static void *background_thread(void *arg)
786 {
787         unsigned long cpu = (unsigned long) arg;
788         unsigned long page_nr, start_nr, mid_nr, end_nr;
789
790         start_nr = cpu * nr_pages_per_cpu;
791         end_nr = (cpu+1) * nr_pages_per_cpu;
792         mid_nr = (start_nr + end_nr) / 2;
793
794         /* Copy the first half of the pages */
795         for (page_nr = start_nr; page_nr < mid_nr; page_nr++)
796                 copy_page_retry(uffd, page_nr * page_size);
797
798         /*
799          * If we need to test uffd-wp, set it up now.  Then we'll have
800          * at least the first half of the pages mapped already which
801          * can be write-protected for testing
802          */
803         if (test_uffdio_wp)
804                 wp_range(uffd, (unsigned long)area_dst + start_nr * page_size,
805                         nr_pages_per_cpu * page_size, true);
806
807         /*
808          * Continue the 2nd half of the page copying, handling write
809          * protection faults if any
810          */
811         for (page_nr = mid_nr; page_nr < end_nr; page_nr++)
812                 copy_page_retry(uffd, page_nr * page_size);
813
814         return NULL;
815 }
816
817 static int stress(struct uffd_stats *uffd_stats)
818 {
819         unsigned long cpu;
820         pthread_t locking_threads[nr_cpus];
821         pthread_t uffd_threads[nr_cpus];
822         pthread_t background_threads[nr_cpus];
823
824         finished = 0;
825         for (cpu = 0; cpu < nr_cpus; cpu++) {
826                 if (pthread_create(&locking_threads[cpu], &attr,
827                                    locking_thread, (void *)cpu))
828                         return 1;
829                 if (bounces & BOUNCE_POLL) {
830                         if (pthread_create(&uffd_threads[cpu], &attr,
831                                            uffd_poll_thread,
832                                            (void *)&uffd_stats[cpu]))
833                                 return 1;
834                 } else {
835                         if (pthread_create(&uffd_threads[cpu], &attr,
836                                            uffd_read_thread,
837                                            (void *)&uffd_stats[cpu]))
838                                 return 1;
839                         pthread_mutex_lock(&uffd_read_mutex);
840                 }
841                 if (pthread_create(&background_threads[cpu], &attr,
842                                    background_thread, (void *)cpu))
843                         return 1;
844         }
845         for (cpu = 0; cpu < nr_cpus; cpu++)
846                 if (pthread_join(background_threads[cpu], NULL))
847                         return 1;
848
849         /*
850          * Be strict and immediately zap area_src, the whole area has
851          * been transferred already by the background treads. The
852          * area_src could then be faulted in in a racy way by still
853          * running uffdio_threads reading zeropages after we zapped
854          * area_src (but they're guaranteed to get -EEXIST from
855          * UFFDIO_COPY without writing zero pages into area_dst
856          * because the background threads already completed).
857          */
858         uffd_test_ops->release_pages(area_src);
859
860         finished = 1;
861         for (cpu = 0; cpu < nr_cpus; cpu++)
862                 if (pthread_join(locking_threads[cpu], NULL))
863                         return 1;
864
865         for (cpu = 0; cpu < nr_cpus; cpu++) {
866                 char c;
867                 if (bounces & BOUNCE_POLL) {
868                         if (write(pipefd[cpu*2+1], &c, 1) != 1)
869                                 err("pipefd write error");
870                         if (pthread_join(uffd_threads[cpu],
871                                          (void *)&uffd_stats[cpu]))
872                                 return 1;
873                 } else {
874                         if (pthread_cancel(uffd_threads[cpu]))
875                                 return 1;
876                         if (pthread_join(uffd_threads[cpu], NULL))
877                                 return 1;
878                 }
879         }
880
881         return 0;
882 }
883
/*
 * jbuf is the jump buffer used to recover from SIGBUS; sigbuf points to
 * it once a caller has armed the recovery path.
 */
sigjmp_buf jbuf, *sigbuf;

/*
 * SIGBUS handler: jump back to the armed sigsetjmp() point, or abort()
 * when no jump buffer has been armed.  Any other signal is ignored.
 */
static void sighndl(int sig, siginfo_t *siginfo, void *ptr)
{
        if (sig != SIGBUS)
                return;

        if (!sigbuf)
                abort();

        siglongjmp(*sigbuf, 1);
}
894
895 /*
896  * For non-cooperative userfaultfd test we fork() a process that will
897  * generate pagefaults, will mremap the area monitored by the
898  * userfaultfd and at last this process will release the monitored
899  * area.
900  * For the anonymous and shared memory the area is divided into two
901  * parts, the first part is accessed before mremap, and the second
902  * part is accessed after mremap. Since hugetlbfs does not support
903  * mremap, the entire monitored area is accessed in a single pass for
904  * HUGETLB_TEST.
905  * The release of the pages currently generates event for shmem and
906  * anonymous memory (UFFD_EVENT_REMOVE), hence it is not checked
907  * for hugetlb.
908  * For signal test(UFFD_FEATURE_SIGBUS), signal_test = 1, we register
909  * monitored area, generate pagefaults and test that signal is delivered.
910  * Use UFFDIO_COPY to allocate missing page and retry. For signal_test = 2
911  * test robustness use case - we release monitored area, fork a process
912  * that will generate pagefaults and verify signal is generated.
913  * This also tests UFFD_FEATURE_EVENT_FORK event along with the signal
914  * feature. Using monitor thread, verify no userfault events are generated.
915  */
916 static int faulting_process(int signal_test)
917 {
918         unsigned long nr;
919         unsigned long long count;
920         unsigned long split_nr_pages;
921         unsigned long lastnr;
922         struct sigaction act;
923         unsigned long signalled = 0;
924
925         if (test_type != TEST_HUGETLB)
926                 split_nr_pages = (nr_pages + 1) / 2;
927         else
928                 split_nr_pages = nr_pages;
929
930         if (signal_test) {
931                 sigbuf = &jbuf;
932                 memset(&act, 0, sizeof(act));
933                 act.sa_sigaction = sighndl;
934                 act.sa_flags = SA_SIGINFO;
935                 if (sigaction(SIGBUS, &act, 0))
936                         err("sigaction");
937                 lastnr = (unsigned long)-1;
938         }
939
940         for (nr = 0; nr < split_nr_pages; nr++) {
941                 int steps = 1;
942                 unsigned long offset = nr * page_size;
943
944                 if (signal_test) {
945                         if (sigsetjmp(*sigbuf, 1) != 0) {
946                                 if (steps == 1 && nr == lastnr)
947                                         err("Signal repeated");
948
949                                 lastnr = nr;
950                                 if (signal_test == 1) {
951                                         if (steps == 1) {
952                                                 /* This is a MISSING request */
953                                                 steps++;
954                                                 if (copy_page(uffd, offset))
955                                                         signalled++;
956                                         } else {
957                                                 /* This is a WP request */
958                                                 assert(steps == 2);
959                                                 wp_range(uffd,
960                                                          (__u64)area_dst +
961                                                          offset,
962                                                          page_size, false);
963                                         }
964                                 } else {
965                                         signalled++;
966                                         continue;
967                                 }
968                         }
969                 }
970
971                 count = *area_count(area_dst, nr);
972                 if (count != count_verify[nr])
973                         err("nr %lu memory corruption %llu %llu\n",
974                             nr, count, count_verify[nr]);
975                 /*
976                  * Trigger write protection if there is by writing
977                  * the same value back.
978                  */
979                 *area_count(area_dst, nr) = count;
980         }
981
982         if (signal_test)
983                 return signalled != split_nr_pages;
984
985         if (test_type == TEST_HUGETLB)
986                 return 0;
987
988         area_dst = mremap(area_dst, nr_pages * page_size,  nr_pages * page_size,
989                           MREMAP_MAYMOVE | MREMAP_FIXED, area_src);
990         if (area_dst == MAP_FAILED)
991                 err("mremap");
992         /* Reset area_src since we just clobbered it */
993         area_src = NULL;
994
995         for (; nr < nr_pages; nr++) {
996                 count = *area_count(area_dst, nr);
997                 if (count != count_verify[nr]) {
998                         err("nr %lu memory corruption %llu %llu\n",
999                             nr, count, count_verify[nr]);
1000                 }
1001                 /*
1002                  * Trigger write protection if there is by writing
1003                  * the same value back.
1004                  */
1005                 *area_count(area_dst, nr) = count;
1006         }
1007
1008         uffd_test_ops->release_pages(area_dst);
1009
1010         for (nr = 0; nr < nr_pages; nr++)
1011                 if (my_bcmp(area_dst + nr * page_size, zeropage, page_size))
1012                         err("nr %lu is not zero", nr);
1013
1014         return 0;
1015 }
1016
1017 static void retry_uffdio_zeropage(int ufd,
1018                                   struct uffdio_zeropage *uffdio_zeropage,
1019                                   unsigned long offset)
1020 {
1021         uffd_test_ops->alias_mapping(&uffdio_zeropage->range.start,
1022                                      uffdio_zeropage->range.len,
1023                                      offset);
1024         if (ioctl(ufd, UFFDIO_ZEROPAGE, uffdio_zeropage)) {
1025                 if (uffdio_zeropage->zeropage != -EEXIST)
1026                         err("UFFDIO_ZEROPAGE error: %"PRId64,
1027                             (int64_t)uffdio_zeropage->zeropage);
1028         } else {
1029                 err("UFFDIO_ZEROPAGE error: %"PRId64,
1030                     (int64_t)uffdio_zeropage->zeropage);
1031         }
1032 }
1033
1034 static int __uffdio_zeropage(int ufd, unsigned long offset, bool retry)
1035 {
1036         struct uffdio_zeropage uffdio_zeropage;
1037         int ret;
1038         bool has_zeropage = get_expected_ioctls(0) & (1 << _UFFDIO_ZEROPAGE);
1039         __s64 res;
1040
1041         if (offset >= nr_pages * page_size)
1042                 err("unexpected offset %lu", offset);
1043         uffdio_zeropage.range.start = (unsigned long) area_dst + offset;
1044         uffdio_zeropage.range.len = page_size;
1045         uffdio_zeropage.mode = 0;
1046         ret = ioctl(ufd, UFFDIO_ZEROPAGE, &uffdio_zeropage);
1047         res = uffdio_zeropage.zeropage;
1048         if (ret) {
1049                 /* real retval in ufdio_zeropage.zeropage */
1050                 if (has_zeropage)
1051                         err("UFFDIO_ZEROPAGE error: %"PRId64, (int64_t)res);
1052                 else if (res != -EINVAL)
1053                         err("UFFDIO_ZEROPAGE not -EINVAL");
1054         } else if (has_zeropage) {
1055                 if (res != page_size) {
1056                         err("UFFDIO_ZEROPAGE unexpected size");
1057                 } else {
1058                         if (test_uffdio_zeropage_eexist && retry) {
1059                                 test_uffdio_zeropage_eexist = false;
1060                                 retry_uffdio_zeropage(ufd, &uffdio_zeropage,
1061                                                       offset);
1062                         }
1063                         return 1;
1064                 }
1065         } else
1066                 err("UFFDIO_ZEROPAGE succeeded");
1067
1068         return 0;
1069 }
1070
/* Single-shot UFFDIO_ZEROPAGE at @offset: never exercises the retry path. */
static int uffdio_zeropage(int ufd, unsigned long offset)
{
        const bool retry = false;

        return __uffdio_zeropage(ufd, offset, retry);
}
1075
1076 /* exercise UFFDIO_ZEROPAGE */
1077 static int userfaultfd_zeropage_test(void)
1078 {
1079         struct uffdio_register uffdio_register;
1080
1081         printf("testing UFFDIO_ZEROPAGE: ");
1082         fflush(stdout);
1083
1084         uffd_test_ctx_init(0);
1085
1086         uffdio_register.range.start = (unsigned long) area_dst;
1087         uffdio_register.range.len = nr_pages * page_size;
1088         uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
1089         if (test_uffdio_wp)
1090                 uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
1091         if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
1092                 err("register failure");
1093
1094         assert_expected_ioctls_present(
1095                 uffdio_register.mode, uffdio_register.ioctls);
1096
1097         if (uffdio_zeropage(uffd, 0))
1098                 if (my_bcmp(area_dst, zeropage, page_size))
1099                         err("zeropage is not zero");
1100
1101         printf("done.\n");
1102         return 0;
1103 }
1104
1105 static int userfaultfd_events_test(void)
1106 {
1107         struct uffdio_register uffdio_register;
1108         pthread_t uffd_mon;
1109         int err, features;
1110         pid_t pid;
1111         char c;
1112         struct uffd_stats stats = { 0 };
1113
1114         printf("testing events (fork, remap, remove): ");
1115         fflush(stdout);
1116
1117         features = UFFD_FEATURE_EVENT_FORK | UFFD_FEATURE_EVENT_REMAP |
1118                 UFFD_FEATURE_EVENT_REMOVE;
1119         uffd_test_ctx_init(features);
1120
1121         fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
1122
1123         uffdio_register.range.start = (unsigned long) area_dst;
1124         uffdio_register.range.len = nr_pages * page_size;
1125         uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
1126         if (test_uffdio_wp)
1127                 uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
1128         if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
1129                 err("register failure");
1130
1131         assert_expected_ioctls_present(
1132                 uffdio_register.mode, uffdio_register.ioctls);
1133
1134         if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats))
1135                 err("uffd_poll_thread create");
1136
1137         pid = fork();
1138         if (pid < 0)
1139                 err("fork");
1140
1141         if (!pid)
1142                 exit(faulting_process(0));
1143
1144         waitpid(pid, &err, 0);
1145         if (err)
1146                 err("faulting process failed");
1147         if (write(pipefd[1], &c, sizeof(c)) != sizeof(c))
1148                 err("pipe write");
1149         if (pthread_join(uffd_mon, NULL))
1150                 return 1;
1151
1152         uffd_stats_report(&stats, 1);
1153
1154         return stats.missing_faults != nr_pages;
1155 }
1156
1157 static int userfaultfd_sig_test(void)
1158 {
1159         struct uffdio_register uffdio_register;
1160         unsigned long userfaults;
1161         pthread_t uffd_mon;
1162         int err, features;
1163         pid_t pid;
1164         char c;
1165         struct uffd_stats stats = { 0 };
1166
1167         printf("testing signal delivery: ");
1168         fflush(stdout);
1169
1170         features = UFFD_FEATURE_EVENT_FORK|UFFD_FEATURE_SIGBUS;
1171         uffd_test_ctx_init(features);
1172
1173         fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
1174
1175         uffdio_register.range.start = (unsigned long) area_dst;
1176         uffdio_register.range.len = nr_pages * page_size;
1177         uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
1178         if (test_uffdio_wp)
1179                 uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
1180         if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
1181                 err("register failure");
1182
1183         assert_expected_ioctls_present(
1184                 uffdio_register.mode, uffdio_register.ioctls);
1185
1186         if (faulting_process(1))
1187                 err("faulting process failed");
1188
1189         uffd_test_ops->release_pages(area_dst);
1190
1191         if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats))
1192                 err("uffd_poll_thread create");
1193
1194         pid = fork();
1195         if (pid < 0)
1196                 err("fork");
1197
1198         if (!pid)
1199                 exit(faulting_process(2));
1200
1201         waitpid(pid, &err, 0);
1202         if (err)
1203                 err("faulting process failed");
1204         if (write(pipefd[1], &c, sizeof(c)) != sizeof(c))
1205                 err("pipe write");
1206         if (pthread_join(uffd_mon, (void **)&userfaults))
1207                 return 1;
1208
1209         printf("done.\n");
1210         if (userfaults)
1211                 err("Signal test failed, userfaults: %ld", userfaults);
1212
1213         return userfaults != 0;
1214 }
1215
1216 static int userfaultfd_minor_test(void)
1217 {
1218         struct uffdio_register uffdio_register;
1219         unsigned long p;
1220         pthread_t uffd_mon;
1221         uint8_t expected_byte;
1222         void *expected_page;
1223         char c;
1224         struct uffd_stats stats = { 0 };
1225
1226         if (!test_uffdio_minor)
1227                 return 0;
1228
1229         printf("testing minor faults: ");
1230         fflush(stdout);
1231
1232         uffd_test_ctx_init(uffd_minor_feature());
1233
1234         uffdio_register.range.start = (unsigned long)area_dst_alias;
1235         uffdio_register.range.len = nr_pages * page_size;
1236         uffdio_register.mode = UFFDIO_REGISTER_MODE_MINOR;
1237         if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
1238                 err("register failure");
1239
1240         assert_expected_ioctls_present(
1241                 uffdio_register.mode, uffdio_register.ioctls);
1242
1243         /*
1244          * After registering with UFFD, populate the non-UFFD-registered side of
1245          * the shared mapping. This should *not* trigger any UFFD minor faults.
1246          */
1247         for (p = 0; p < nr_pages; ++p) {
1248                 memset(area_dst + (p * page_size), p % ((uint8_t)-1),
1249                        page_size);
1250         }
1251
1252         if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats))
1253                 err("uffd_poll_thread create");
1254
1255         /*
1256          * Read each of the pages back using the UFFD-registered mapping. We
1257          * expect that the first time we touch a page, it will result in a minor
1258          * fault. uffd_poll_thread will resolve the fault by bit-flipping the
1259          * page's contents, and then issuing a CONTINUE ioctl.
1260          */
1261
1262         if (posix_memalign(&expected_page, page_size, page_size))
1263                 err("out of memory");
1264
1265         for (p = 0; p < nr_pages; ++p) {
1266                 expected_byte = ~((uint8_t)(p % ((uint8_t)-1)));
1267                 memset(expected_page, expected_byte, page_size);
1268                 if (my_bcmp(expected_page, area_dst_alias + (p * page_size),
1269                             page_size))
1270                         err("unexpected page contents after minor fault");
1271         }
1272
1273         if (write(pipefd[1], &c, sizeof(c)) != sizeof(c))
1274                 err("pipe write");
1275         if (pthread_join(uffd_mon, NULL))
1276                 return 1;
1277
1278         uffd_stats_report(&stats, 1);
1279
1280         return stats.missing_faults != 0 || stats.minor_faults != nr_pages;
1281 }
1282
/*
 * Single-bit masks applied to the 64-bit values returned by
 * pagemap_read_vaddr().  Within this part of the file only
 * PM_MMAP_EXCLUSIVE and PM_UFFD_WP are tested.
 * NOTE(review): the bit positions presumably mirror the kernel's
 * /proc/<pid>/pagemap entry layout - confirm against
 * Documentation/admin-guide/mm/pagemap.rst.
 */
#define BIT_ULL(nr)                   (1ULL << (nr))
#define PM_SOFT_DIRTY                 BIT_ULL(55)
#define PM_MMAP_EXCLUSIVE             BIT_ULL(56)
#define PM_UFFD_WP                    BIT_ULL(57)
#define PM_FILE                       BIT_ULL(61)
#define PM_SWAP                       BIT_ULL(62)
#define PM_PRESENT                    BIT_ULL(63)
1290
/*
 * Open /proc/self/pagemap read-only and return the file descriptor.
 * A failed open is reported through err().
 */
static int pagemap_open(void)
{
        int pm_fd;

        pm_fd = open("/proc/self/pagemap", O_RDONLY);
        if (pm_fd >= 0)
                return pm_fd;

        err("open pagemap");
        return pm_fd;
}
1300
/*
 * Read the 64-bit pagemap entry that describes @vaddr.
 *
 * @fd:    descriptor returned by pagemap_open()
 * @vaddr: virtual address whose entry is wanted
 *
 * The pagemap file holds one entry per base page, so the entry index is
 * the virtual address divided by the system page size.  The size is
 * queried at run time instead of assuming 4K pages.
 *
 * Returns the raw entry; a short or failed read is reported through
 * err().
 */
static uint64_t pagemap_read_vaddr(int fd, void *vaddr)
{
        const uint64_t base_pgsize = sysconf(_SC_PAGESIZE);
        uint64_t value;
        ssize_t ret;

        ret = pread(fd, &value, sizeof(value),
                    ((uint64_t)vaddr / base_pgsize) * sizeof(value));
        if (ret != (ssize_t)sizeof(value))
                err("pread() on pagemap failed");

        return value;
}
1313
/*
 * Check that the uffd-wp bit of pagemap entry @value matches the
 * expected state @wp.  This is a macro rather than a function so that
 * __LINE__ in err() refers to the caller.  The arguments are
 * parenthesized so that compound expressions are evaluated as a whole.
 */
#define  pagemap_check_wp(value, wp) do {                               \
                if (!!((value) & PM_UFFD_WP) != (wp))                   \
                        err("pagemap uffd-wp bit error: 0x%"PRIx64, (value)); \
        } while (0)
1319
1320 static int pagemap_test_fork(bool present)
1321 {
1322         pid_t child = fork();
1323         uint64_t value;
1324         int fd, result;
1325
1326         if (!child) {
1327                 /* Open the pagemap fd of the child itself */
1328                 fd = pagemap_open();
1329                 value = pagemap_read_vaddr(fd, area_dst);
1330                 /*
1331                  * After fork() uffd-wp bit should be gone as long as we're
1332                  * without UFFD_FEATURE_EVENT_FORK
1333                  */
1334                 pagemap_check_wp(value, false);
1335                 /* Succeed */
1336                 exit(0);
1337         }
1338         waitpid(child, &result, 0);
1339         return result;
1340 }
1341
1342 static void userfaultfd_pagemap_test(unsigned int test_pgsize)
1343 {
1344         struct uffdio_register uffdio_register;
1345         int pagemap_fd;
1346         uint64_t value;
1347
1348         /* Pagemap tests uffd-wp only */
1349         if (!test_uffdio_wp)
1350                 return;
1351
1352         /* Not enough memory to test this page size */
1353         if (test_pgsize > nr_pages * page_size)
1354                 return;
1355
1356         printf("testing uffd-wp with pagemap (pgsize=%u): ", test_pgsize);
1357         /* Flush so it doesn't flush twice in parent/child later */
1358         fflush(stdout);
1359
1360         uffd_test_ctx_init(0);
1361
1362         if (test_pgsize > page_size) {
1363                 /* This is a thp test */
1364                 if (madvise(area_dst, nr_pages * page_size, MADV_HUGEPAGE))
1365                         err("madvise(MADV_HUGEPAGE) failed");
1366         } else if (test_pgsize == page_size) {
1367                 /* This is normal page test; force no thp */
1368                 if (madvise(area_dst, nr_pages * page_size, MADV_NOHUGEPAGE))
1369                         err("madvise(MADV_NOHUGEPAGE) failed");
1370         }
1371
1372         uffdio_register.range.start = (unsigned long) area_dst;
1373         uffdio_register.range.len = nr_pages * page_size;
1374         uffdio_register.mode = UFFDIO_REGISTER_MODE_WP;
1375         if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
1376                 err("register failed");
1377
1378         pagemap_fd = pagemap_open();
1379
1380         /* Touch the page */
1381         *area_dst = 1;
1382         wp_range(uffd, (uint64_t)area_dst, test_pgsize, true);
1383         value = pagemap_read_vaddr(pagemap_fd, area_dst);
1384         pagemap_check_wp(value, true);
1385         /* Make sure uffd-wp bit dropped when fork */
1386         if (pagemap_test_fork(true))
1387                 err("Detected stall uffd-wp bit in child");
1388
1389         /* Exclusive required or PAGEOUT won't work */
1390         if (!(value & PM_MMAP_EXCLUSIVE))
1391                 err("multiple mapping detected: 0x%"PRIx64, value);
1392
1393         if (madvise(area_dst, test_pgsize, MADV_PAGEOUT))
1394                 err("madvise(MADV_PAGEOUT) failed");
1395
1396         /* Uffd-wp should persist even swapped out */
1397         value = pagemap_read_vaddr(pagemap_fd, area_dst);
1398         pagemap_check_wp(value, true);
1399         /* Make sure uffd-wp bit dropped when fork */
1400         if (pagemap_test_fork(false))
1401                 err("Detected stall uffd-wp bit in child");
1402
1403         /* Unprotect; this tests swap pte modifications */
1404         wp_range(uffd, (uint64_t)area_dst, page_size, false);
1405         value = pagemap_read_vaddr(pagemap_fd, area_dst);
1406         pagemap_check_wp(value, false);
1407
1408         /* Fault in the page from disk */
1409         *area_dst = 2;
1410         value = pagemap_read_vaddr(pagemap_fd, area_dst);
1411         pagemap_check_wp(value, false);
1412
1413         close(pagemap_fd);
1414         printf("done\n");
1415 }
1416
1417 static int userfaultfd_stress(void)
1418 {
1419         void *area;
1420         char *tmp_area;
1421         unsigned long nr;
1422         struct uffdio_register uffdio_register;
1423         struct uffd_stats uffd_stats[nr_cpus];
1424
1425         uffd_test_ctx_init(0);
1426
1427         if (posix_memalign(&area, page_size, page_size))
1428                 err("out of memory");
1429         zeropage = area;
1430         bzero(zeropage, page_size);
1431
1432         pthread_mutex_lock(&uffd_read_mutex);
1433
1434         pthread_attr_init(&attr);
1435         pthread_attr_setstacksize(&attr, 16*1024*1024);
1436
1437         while (bounces--) {
1438                 printf("bounces: %d, mode:", bounces);
1439                 if (bounces & BOUNCE_RANDOM)
1440                         printf(" rnd");
1441                 if (bounces & BOUNCE_RACINGFAULTS)
1442                         printf(" racing");
1443                 if (bounces & BOUNCE_VERIFY)
1444                         printf(" ver");
1445                 if (bounces & BOUNCE_POLL)
1446                         printf(" poll");
1447                 else
1448                         printf(" read");
1449                 printf(", ");
1450                 fflush(stdout);
1451
1452                 if (bounces & BOUNCE_POLL)
1453                         fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
1454                 else
1455                         fcntl(uffd, F_SETFL, uffd_flags & ~O_NONBLOCK);
1456
1457                 /* register */
1458                 uffdio_register.range.start = (unsigned long) area_dst;
1459                 uffdio_register.range.len = nr_pages * page_size;
1460                 uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
1461                 if (test_uffdio_wp)
1462                         uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
1463                 if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
1464                         err("register failure");
1465                 assert_expected_ioctls_present(
1466                         uffdio_register.mode, uffdio_register.ioctls);
1467
1468                 if (area_dst_alias) {
1469                         uffdio_register.range.start = (unsigned long)
1470                                 area_dst_alias;
1471                         if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
1472                                 err("register failure alias");
1473                 }
1474
1475                 /*
1476                  * The madvise done previously isn't enough: some
1477                  * uffd_thread could have read userfaults (one of
1478                  * those already resolved by the background thread)
1479                  * and it may be in the process of calling
1480                  * UFFDIO_COPY. UFFDIO_COPY will read the zapped
1481                  * area_src and it would map a zero page in it (of
1482                  * course such a UFFDIO_COPY is perfectly safe as it'd
1483                  * return -EEXIST). The problem comes at the next
1484                  * bounce though: that racing UFFDIO_COPY would
1485                  * generate zeropages in the area_src, so invalidating
1486                  * the previous MADV_DONTNEED. Without this additional
1487                  * MADV_DONTNEED those zeropages leftovers in the
1488                  * area_src would lead to -EEXIST failure during the
1489                  * next bounce, effectively leaving a zeropage in the
1490                  * area_dst.
1491                  *
1492                  * Try to comment this out madvise to see the memory
1493                  * corruption being caught pretty quick.
1494                  *
1495                  * khugepaged is also inhibited to collapse THP after
1496                  * MADV_DONTNEED only after the UFFDIO_REGISTER, so it's
1497                  * required to MADV_DONTNEED here.
1498                  */
1499                 uffd_test_ops->release_pages(area_dst);
1500
1501                 uffd_stats_reset(uffd_stats, nr_cpus);
1502
1503                 /* bounce pass */
1504                 if (stress(uffd_stats))
1505                         return 1;
1506
1507                 /* Clear all the write protections if there is any */
1508                 if (test_uffdio_wp)
1509                         wp_range(uffd, (unsigned long)area_dst,
1510                                  nr_pages * page_size, false);
1511
1512                 /* unregister */
1513                 if (ioctl(uffd, UFFDIO_UNREGISTER, &uffdio_register.range))
1514                         err("unregister failure");
1515                 if (area_dst_alias) {
1516                         uffdio_register.range.start = (unsigned long) area_dst;
1517                         if (ioctl(uffd, UFFDIO_UNREGISTER,
1518                                   &uffdio_register.range))
1519                                 err("unregister failure alias");
1520                 }
1521
1522                 /* verification */
1523                 if (bounces & BOUNCE_VERIFY)
1524                         for (nr = 0; nr < nr_pages; nr++)
1525                                 if (*area_count(area_dst, nr) != count_verify[nr])
1526                                         err("error area_count %llu %llu %lu\n",
1527                                             *area_count(area_src, nr),
1528                                             count_verify[nr], nr);
1529
1530                 /* prepare next bounce */
1531                 tmp_area = area_src;
1532                 area_src = area_dst;
1533                 area_dst = tmp_area;
1534
1535                 tmp_area = area_src_alias;
1536                 area_src_alias = area_dst_alias;
1537                 area_dst_alias = tmp_area;
1538
1539                 uffd_stats_report(uffd_stats, nr_cpus);
1540         }
1541
1542         if (test_type == TEST_ANON) {
1543                 /*
1544                  * shmem/hugetlb won't be able to run since they have different
1545                  * behavior on fork() (file-backed memory normally drops ptes
1546                  * directly when fork), meanwhile the pagemap test will verify
1547                  * pgtable entry of fork()ed child.
1548                  */
1549                 userfaultfd_pagemap_test(page_size);
1550                 /*
1551                  * Hard-code for x86_64 for now for 2M THP, as x86_64 is
1552                  * currently the only one that supports uffd-wp
1553                  */
1554                 userfaultfd_pagemap_test(page_size * 512);
1555         }
1556
1557         return userfaultfd_zeropage_test() || userfaultfd_sig_test()
1558                 || userfaultfd_events_test() || userfaultfd_minor_test();
1559 }
1560
/*
 * Copied from mlock2-tests.c
 *
 * Scan /proc/meminfo for the "Hugepagesize:" line and return its value
 * converted from kB to bytes.  Returns 0 when the file cannot be opened
 * or no such line is found.
 */
unsigned long default_huge_page_size(void)
{
        FILE *meminfo = fopen("/proc/meminfo", "r");
        unsigned long bytes = 0;
        size_t cap = 0;
        char *buf = NULL;

        if (meminfo == NULL)
                return 0;

        for (;;) {
                if (getline(&buf, &cap, meminfo) <= 0)
                        break;
                if (sscanf(buf, "Hugepagesize:       %lu kB", &bytes) != 1)
                        continue;
                /* kB -> bytes */
                bytes <<= 10;
                break;
        }

        free(buf);
        fclose(meminfo);
        return bytes;
}
1584
1585 static void set_test_type(const char *type)
1586 {
1587         uint64_t features = UFFD_API_FEATURES;
1588
1589         if (!strcmp(type, "anon")) {
1590                 test_type = TEST_ANON;
1591                 uffd_test_ops = &anon_uffd_test_ops;
1592                 /* Only enable write-protect test for anonymous test */
1593                 test_uffdio_wp = true;
1594         } else if (!strcmp(type, "hugetlb")) {
1595                 test_type = TEST_HUGETLB;
1596                 uffd_test_ops = &hugetlb_uffd_test_ops;
1597         } else if (!strcmp(type, "hugetlb_shared")) {
1598                 map_shared = true;
1599                 test_type = TEST_HUGETLB;
1600                 uffd_test_ops = &hugetlb_uffd_test_ops;
1601                 /* Minor faults require shared hugetlb; only enable here. */
1602                 test_uffdio_minor = true;
1603         } else if (!strcmp(type, "shmem")) {
1604                 map_shared = true;
1605                 test_type = TEST_SHMEM;
1606                 uffd_test_ops = &shmem_uffd_test_ops;
1607                 test_uffdio_minor = true;
1608         } else {
1609                 err("Unknown test type: %s", type);
1610         }
1611
1612         if (test_type == TEST_HUGETLB)
1613                 page_size = default_huge_page_size();
1614         else
1615                 page_size = sysconf(_SC_PAGE_SIZE);
1616
1617         if (!page_size)
1618                 err("Unable to determine page size");
1619         if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) * 2
1620             > page_size)
1621                 err("Impossible to run this test");
1622
1623         /*
1624          * Whether we can test certain features depends not just on test type,
1625          * but also on whether or not this particular kernel supports the
1626          * feature.
1627          */
1628
1629         userfaultfd_open(&features);
1630
1631         test_uffdio_wp = test_uffdio_wp &&
1632                 (features & UFFD_FEATURE_PAGEFAULT_FLAG_WP);
1633         test_uffdio_minor = test_uffdio_minor &&
1634                 (features & uffd_minor_feature());
1635
1636         close(uffd);
1637         uffd = -1;
1638 }
1639
1640 static void sigalrm(int sig)
1641 {
1642         if (sig != SIGALRM)
1643                 abort();
1644         test_uffdio_copy_eexist = true;
1645         test_uffdio_zeropage_eexist = true;
1646         alarm(ALARM_INTERVAL_SECS);
1647 }
1648
1649 int main(int argc, char **argv)
1650 {
1651         if (argc < 4)
1652                 usage();
1653
1654         if (signal(SIGALRM, sigalrm) == SIG_ERR)
1655                 err("failed to arm SIGALRM");
1656         alarm(ALARM_INTERVAL_SECS);
1657
1658         set_test_type(argv[1]);
1659
1660         nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
1661         nr_pages_per_cpu = atol(argv[2]) * 1024*1024 / page_size /
1662                 nr_cpus;
1663         if (!nr_pages_per_cpu) {
1664                 _err("invalid MiB");
1665                 usage();
1666         }
1667
1668         bounces = atoi(argv[3]);
1669         if (bounces <= 0) {
1670                 _err("invalid bounces");
1671                 usage();
1672         }
1673         nr_pages = nr_pages_per_cpu * nr_cpus;
1674
1675         if (test_type == TEST_HUGETLB) {
1676                 if (argc < 5)
1677                         usage();
1678                 huge_fd = open(argv[4], O_CREAT | O_RDWR, 0755);
1679                 if (huge_fd < 0)
1680                         err("Open of %s failed", argv[4]);
1681                 if (ftruncate(huge_fd, 0))
1682                         err("ftruncate %s to size 0 failed", argv[4]);
1683         } else if (test_type == TEST_SHMEM) {
1684                 shm_fd = memfd_create(argv[0], 0);
1685                 if (shm_fd < 0)
1686                         err("memfd_create");
1687                 if (ftruncate(shm_fd, nr_pages * page_size * 2))
1688                         err("ftruncate");
1689                 if (fallocate(shm_fd,
1690                               FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0,
1691                               nr_pages * page_size * 2))
1692                         err("fallocate");
1693         }
1694         printf("nr_pages: %lu, nr_pages_per_cpu: %lu\n",
1695                nr_pages, nr_pages_per_cpu);
1696         return userfaultfd_stress();
1697 }
1698
1699 #else /* __NR_userfaultfd */
1700
1701 #warning "missing __NR_userfaultfd definition"
1702
1703 int main(void)
1704 {
1705         printf("skip: Skipping userfaultfd test (missing __NR_userfaultfd)\n");
1706         return KSFT_SKIP;
1707 }
1708
1709 #endif /* __NR_userfaultfd */