Merge tag 'x86_urgent_for_v5.13_rc1' of git://git.kernel.org/pub/scm/linux/kernel...
[linux-2.6-microblaze.git] / tools / testing / selftests / vm / userfaultfd.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Stress userfaultfd syscall.
4  *
5  *  Copyright (C) 2015  Red Hat, Inc.
6  *
7  * This test allocates two virtual areas and bounces the physical
8  * memory across the two virtual areas (from area_src to area_dst)
9  * using userfaultfd.
10  *
11  * There are three threads running per CPU:
12  *
13  * 1) one per-CPU thread takes a per-page pthread_mutex in a random
14  *    page of the area_dst (while the physical page may still be in
15  *    area_src), and increments a per-page counter in the same page,
16  *    and checks its value against a verification region.
17  *
18  * 2) another per-CPU thread handles the userfaults generated by
19  *    thread 1 above. userfaultfd blocking reads or poll() modes are
20  *    exercised interleaved.
21  *
22  * 3) one last per-CPU thread transfers the memory in the background
23  *    at maximum bandwidth (if not already transferred by thread
24  *    2). Each cpu thread takes care of transferring a portion of the
25  *    area.
26  *
27  * When all threads of type 3 completed the transfer, one bounce is
28  * complete. area_src and area_dst are then swapped. All threads are
29  * respawned and so the bounce is immediately restarted in the
30  * opposite direction.
31  *
32  * per-CPU threads 1 by triggering userfaults inside
33  * pthread_mutex_lock will also verify the atomicity of the memory
34  * transfer (UFFDIO_COPY).
35  */
36
37 #define _GNU_SOURCE
38 #include <stdio.h>
39 #include <errno.h>
40 #include <unistd.h>
41 #include <stdlib.h>
42 #include <sys/types.h>
43 #include <sys/stat.h>
44 #include <fcntl.h>
45 #include <time.h>
46 #include <signal.h>
47 #include <poll.h>
48 #include <string.h>
49 #include <sys/mman.h>
50 #include <sys/syscall.h>
51 #include <sys/ioctl.h>
52 #include <sys/wait.h>
53 #include <pthread.h>
54 #include <linux/userfaultfd.h>
55 #include <setjmp.h>
56 #include <stdbool.h>
57 #include <assert.h>
58 #include <inttypes.h>
59 #include <stdint.h>
60
61 #include "../kselftest.h"
62
63 #ifdef __NR_userfaultfd
64
/*
 * Sizing of the test: number of per-CPU thread sets, total pages in
 * each area, pages handled by each CPU and the page size in bytes.
 */
static unsigned long nr_cpus;
static unsigned long nr_pages;
static unsigned long nr_pages_per_cpu;
static unsigned long page_size;

/* Mode bits that the threads test against the value of "bounces" */
#define BOUNCE_RANDOM		(1<<0)
#define BOUNCE_RACINGFAULTS	(1<<1)
#define BOUNCE_VERIFY		(1<<2)
#define BOUNCE_POLL		(1<<3)
static int bounces;

/* Test type selectors (anon, hugetlb, shmem) */
#define TEST_ANON	1
#define TEST_HUGETLB	2
#define TEST_SHMEM	3
static int test_type;

/* exercise the test_uffdio_*_eexist every ALARM_INTERVAL_SECS */
#define ALARM_INTERVAL_SECS 10
static volatile bool test_uffdio_copy_eexist = true;
static volatile bool test_uffdio_zeropage_eexist = true;
/* Whether to test uffd write-protection */
static bool test_uffdio_wp = false;
/* Whether to test uffd minor faults */
static bool test_uffdio_minor = false;

/* hugetlb only: map the file MAP_SHARED instead of MAP_PRIVATE */
static bool map_shared;
/* hugetlb only: descriptor of the backing file */
static int huge_fd;
/* hugetlb only: the area that is mapped at file offset 0 */
static char *huge_fd_off0;
/* Expected value of every per-page counter, indexed by page number */
static unsigned long long *count_verify;
/* The userfaultfd descriptor and the flags read back from it */
static int uffd;
static int uffd_flags;
/* Set to 1 by stress() to make the locking threads leave their loop */
static int finished;
/* Per-CPU pipes: read end at [cpu*2], write end at [cpu*2+1] */
static int *pipefd;
/* The two areas being bounced, plus their optional alias mappings */
static char *area_src;
static char *area_src_alias;
static char *area_dst;
static char *area_dst_alias;
/* Reference page the verification pass compares each page against */
static char *zeropage;
/* Attributes handed to every pthread_create() */
pthread_attr_t attr;
95
/* Userfault counters gathered by one fault-handling thread */
struct uffd_stats {
	/* index of the CPU slot this entry belongs to */
	int cpu;
	/* missing-page faults resolved with a successful page copy */
	unsigned long missing_faults;
	/* write-protect faults resolved by dropping the protection */
	unsigned long wp_faults;
	/* minor faults resolved with UFFDIO_CONTINUE */
	unsigned long minor_faults;
};
103
/* Each page begins with its pthread_mutex_t, at page offset 0 */
#define area_mutex(___area, ___nr) \
	((pthread_mutex_t *) ((___area) + (___nr)*page_size))

/*
 * The per-page counter sits right behind the mutex. Its address is
 * rounded up to the natural alignment of unsigned long long so that
 * non-x86 archs do not take alignment faults on it.
 */
#define area_count(___area, ___nr) \
	((volatile unsigned long long *) ((unsigned long) \
		((___area) + (___nr)*page_size + \
		 sizeof(pthread_mutex_t) + \
		 sizeof(unsigned long long) - 1) & \
		~(unsigned long)(sizeof(unsigned long long) - 1)))
118
/* Example invocations printed by usage(); one group of literals per example */
const char *examples =
	"# Run anonymous memory test on 100MiB region with 99999 bounces:\n"
	"./userfaultfd anon 100 99999\n"
	"\n"

	"# Run share memory test on 1GiB region with 99 bounces:\n"
	"./userfaultfd shmem 1000 99\n"
	"\n"

	"# Run hugetlb memory test on 256MiB region with 50 bounces (using /dev/hugepages/hugefile):\n"
	"./userfaultfd hugetlb 256 50 /dev/hugepages/hugefile\n"
	"\n"

	"# Run the same hugetlb test but using shmem:\n"
	"./userfaultfd hugetlb_shared 256 50 /dev/hugepages/hugefile\n"
	"\n"

	"# 10MiB-~6GiB 999 bounces anonymous test, continue forever unless an error triggers\n"
	"while ./userfaultfd anon $[RANDOM % 6000 + 10] 999; do true; done\n"
	"\n";
131
132 static void usage(void)
133 {
134         fprintf(stderr, "\nUsage: ./userfaultfd <test type> <MiB> <bounces> "
135                 "[hugetlbfs_file]\n\n");
136         fprintf(stderr, "Supported <test type>: anon, hugetlb, "
137                 "hugetlb_shared, shmem\n\n");
138         fprintf(stderr, "Examples:\n\n");
139         fprintf(stderr, "%s", examples);
140         exit(1);
141 }
142
/*
 * Fatal error helper: prints the printf-style message to stderr, appends
 * ": <code>" (code widened to int64_t) and a newline, then exits with
 * status 1. Never returns.
 */
#define uffd_error(code, fmt, ...)					\
	do {								\
		fprintf(stderr, fmt, ##__VA_ARGS__);			\
		fprintf(stderr, ": %" PRId64 "\n", (int64_t)(code));	\
		exit(1);						\
	} while (0)
149
150 static void uffd_stats_reset(struct uffd_stats *uffd_stats,
151                              unsigned long n_cpus)
152 {
153         int i;
154
155         for (i = 0; i < n_cpus; i++) {
156                 uffd_stats[i].cpu = i;
157                 uffd_stats[i].missing_faults = 0;
158                 uffd_stats[i].wp_faults = 0;
159                 uffd_stats[i].minor_faults = 0;
160         }
161 }
162
163 static void uffd_stats_report(struct uffd_stats *stats, int n_cpus)
164 {
165         int i;
166         unsigned long long miss_total = 0, wp_total = 0, minor_total = 0;
167
168         for (i = 0; i < n_cpus; i++) {
169                 miss_total += stats[i].missing_faults;
170                 wp_total += stats[i].wp_faults;
171                 minor_total += stats[i].minor_faults;
172         }
173
174         printf("userfaults: %llu missing (", miss_total);
175         for (i = 0; i < n_cpus; i++)
176                 printf("%lu+", stats[i].missing_faults);
177         printf("\b), %llu wp (", wp_total);
178         for (i = 0; i < n_cpus; i++)
179                 printf("%lu+", stats[i].wp_faults);
180         printf("\b), %llu minor (", minor_total);
181         for (i = 0; i < n_cpus; i++)
182                 printf("%lu+", stats[i].minor_faults);
183         printf("\b)\n");
184 }
185
186 static int anon_release_pages(char *rel_area)
187 {
188         int ret = 0;
189
190         if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED)) {
191                 perror("madvise");
192                 ret = 1;
193         }
194
195         return ret;
196 }
197
198 static void anon_allocate_area(void **alloc_area)
199 {
200         if (posix_memalign(alloc_area, page_size, nr_pages * page_size)) {
201                 fprintf(stderr, "out of memory\n");
202                 *alloc_area = NULL;
203         }
204 }
205
/*
 * alias_mapping callback for test types without an alias mapping:
 * leaves *start untouched and ignores every argument.
 */
static void noop_alias_mapping(__u64 *start, size_t len, unsigned long offset)
{
	(void)start;
	(void)len;
	(void)offset;
}
209
210 /* HugeTLB memory */
211 static int hugetlb_release_pages(char *rel_area)
212 {
213         int ret = 0;
214
215         if (fallocate(huge_fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
216                                 rel_area == huge_fd_off0 ? 0 :
217                                 nr_pages * page_size,
218                                 nr_pages * page_size)) {
219                 perror("fallocate");
220                 ret = 1;
221         }
222
223         return ret;
224 }
225
226 static void hugetlb_allocate_area(void **alloc_area)
227 {
228         void *area_alias = NULL;
229         char **alloc_area_alias;
230
231         *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
232                            (map_shared ? MAP_SHARED : MAP_PRIVATE) |
233                            MAP_HUGETLB,
234                            huge_fd, *alloc_area == area_src ? 0 :
235                            nr_pages * page_size);
236         if (*alloc_area == MAP_FAILED) {
237                 perror("mmap of hugetlbfs file failed");
238                 goto fail;
239         }
240
241         if (map_shared) {
242                 area_alias = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
243                                   MAP_SHARED | MAP_HUGETLB,
244                                   huge_fd, *alloc_area == area_src ? 0 :
245                                   nr_pages * page_size);
246                 if (area_alias == MAP_FAILED) {
247                         perror("mmap of hugetlb file alias failed");
248                         goto fail_munmap;
249                 }
250         }
251
252         if (*alloc_area == area_src) {
253                 huge_fd_off0 = *alloc_area;
254                 alloc_area_alias = &area_src_alias;
255         } else {
256                 alloc_area_alias = &area_dst_alias;
257         }
258         if (area_alias)
259                 *alloc_area_alias = area_alias;
260
261         return;
262
263 fail_munmap:
264         if (munmap(*alloc_area, nr_pages * page_size) < 0) {
265                 perror("hugetlb munmap");
266                 exit(1);
267         }
268 fail:
269         *alloc_area = NULL;
270 }
271
272 static void hugetlb_alias_mapping(__u64 *start, size_t len, unsigned long offset)
273 {
274         if (!map_shared)
275                 return;
276         /*
277          * We can't zap just the pagetable with hugetlbfs because
278          * MADV_DONTEED won't work. So exercise -EEXIST on a alias
279          * mapping where the pagetables are not established initially,
280          * this way we'll exercise the -EEXEC at the fs level.
281          */
282         *start = (unsigned long) area_dst_alias + offset;
283 }
284
285 /* Shared memory */
286 static int shmem_release_pages(char *rel_area)
287 {
288         int ret = 0;
289
290         if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE)) {
291                 perror("madvise");
292                 ret = 1;
293         }
294
295         return ret;
296 }
297
298 static void shmem_allocate_area(void **alloc_area)
299 {
300         *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
301                            MAP_ANONYMOUS | MAP_SHARED, -1, 0);
302         if (*alloc_area == MAP_FAILED) {
303                 fprintf(stderr, "shared memory mmap failed\n");
304                 *alloc_area = NULL;
305         }
306 }
307
/* Operations and expectations that differ between the memory types tested */
struct uffd_test_ops {
	/* bitmask of ioctls expected for this memory type */
	unsigned long expected_ioctls;
	/* map/allocate one test area and store it in *alloc_area */
	void (*allocate_area)(void **alloc_area);
	/* drop the pages backing rel_area; returns 0 on success, 1 on error */
	int (*release_pages)(char *rel_area);
	/* optionally redirect *start to an alias mapping at the given offset */
	void (*alias_mapping)(__u64 *start, size_t len, unsigned long offset);
};
314
315 #define SHMEM_EXPECTED_IOCTLS           ((1 << _UFFDIO_WAKE) | \
316                                          (1 << _UFFDIO_COPY) | \
317                                          (1 << _UFFDIO_ZEROPAGE))
318
319 #define ANON_EXPECTED_IOCTLS            ((1 << _UFFDIO_WAKE) | \
320                                          (1 << _UFFDIO_COPY) | \
321                                          (1 << _UFFDIO_ZEROPAGE) | \
322                                          (1 << _UFFDIO_WRITEPROTECT))
323
324 static struct uffd_test_ops anon_uffd_test_ops = {
325         .expected_ioctls = ANON_EXPECTED_IOCTLS,
326         .allocate_area  = anon_allocate_area,
327         .release_pages  = anon_release_pages,
328         .alias_mapping = noop_alias_mapping,
329 };
330
331 static struct uffd_test_ops shmem_uffd_test_ops = {
332         .expected_ioctls = SHMEM_EXPECTED_IOCTLS,
333         .allocate_area  = shmem_allocate_area,
334         .release_pages  = shmem_release_pages,
335         .alias_mapping = noop_alias_mapping,
336 };
337
338 static struct uffd_test_ops hugetlb_uffd_test_ops = {
339         .expected_ioctls = UFFD_API_RANGE_IOCTLS_BASIC & ~(1 << _UFFDIO_CONTINUE),
340         .allocate_area  = hugetlb_allocate_area,
341         .release_pages  = hugetlb_release_pages,
342         .alias_mapping = hugetlb_alias_mapping,
343 };
344
345 static struct uffd_test_ops *uffd_test_ops;
346
/*
 * Byte-by-byte comparison of the first n bytes of str1 and str2.
 * Returns 0 when they are all equal and 1 at the first difference.
 */
static int my_bcmp(char *str1, char *str2, size_t n)
{
	char *end = str1 + n;

	while (str1 != end) {
		if (*str1++ != *str2++)
			return 1;
	}
	return 0;
}
355
/*
 * Set (wp == true) or clear (wp == false) userfaultfd write protection on
 * the range [start, start + len) through UFFDIO_WRITEPROTECT on ufd.
 *
 * Any ioctl failure is fatal: a message naming the attempted operation
 * and the start address is printed and the test exits with status 1.
 */
static void wp_range(int ufd, __u64 start, __u64 len, bool wp)
{
	struct uffdio_writeprotect prms;

	prms.range.start = start;
	prms.range.len = len;
	/*
	 * MODE_WP installs the protection, mode 0 removes it. The original
	 * author notes that removal is followed by a wakeup of the waiter.
	 */
	prms.mode = wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0;

	if (ioctl(ufd, UFFDIO_WRITEPROTECT, &prms)) {
		/* report the operation that was actually attempted */
		fprintf(stderr, "%s WP failed for address 0x%" PRIx64 "\n",
			wp ? "set" : "clear", (uint64_t)start);
		exit(1);
	}
}
372
/*
 * Issue UFFDIO_CONTINUE with mode 0 on [start, start + len) through ufd.
 * A failing ioctl is fatal: the address is printed and the test exits.
 */
static void continue_range(int ufd, __u64 start, __u64 len)
{
	struct uffdio_continue req;

	req.range.start = start;
	req.range.len = len;
	req.mode = 0;

	if (!ioctl(ufd, UFFDIO_CONTINUE, &req))
		return;

	fprintf(stderr,
		"UFFDIO_CONTINUE failed for address 0x%" PRIx64 "\n",
		(uint64_t)start);
	exit(1);
}
388
389 static void *locking_thread(void *arg)
390 {
391         unsigned long cpu = (unsigned long) arg;
392         struct random_data rand;
393         unsigned long page_nr = *(&(page_nr)); /* uninitialized warning */
394         int32_t rand_nr;
395         unsigned long long count;
396         char randstate[64];
397         unsigned int seed;
398         time_t start;
399
400         if (bounces & BOUNCE_RANDOM) {
401                 seed = (unsigned int) time(NULL) - bounces;
402                 if (!(bounces & BOUNCE_RACINGFAULTS))
403                         seed += cpu;
404                 bzero(&rand, sizeof(rand));
405                 bzero(&randstate, sizeof(randstate));
406                 if (initstate_r(seed, randstate, sizeof(randstate), &rand)) {
407                         fprintf(stderr, "srandom_r error\n");
408                         exit(1);
409                 }
410         } else {
411                 page_nr = -bounces;
412                 if (!(bounces & BOUNCE_RACINGFAULTS))
413                         page_nr += cpu * nr_pages_per_cpu;
414         }
415
416         while (!finished) {
417                 if (bounces & BOUNCE_RANDOM) {
418                         if (random_r(&rand, &rand_nr)) {
419                                 fprintf(stderr, "random_r 1 error\n");
420                                 exit(1);
421                         }
422                         page_nr = rand_nr;
423                         if (sizeof(page_nr) > sizeof(rand_nr)) {
424                                 if (random_r(&rand, &rand_nr)) {
425                                         fprintf(stderr, "random_r 2 error\n");
426                                         exit(1);
427                                 }
428                                 page_nr |= (((unsigned long) rand_nr) << 16) <<
429                                            16;
430                         }
431                 } else
432                         page_nr += 1;
433                 page_nr %= nr_pages;
434
435                 start = time(NULL);
436                 if (bounces & BOUNCE_VERIFY) {
437                         count = *area_count(area_dst, page_nr);
438                         if (!count) {
439                                 fprintf(stderr,
440                                         "page_nr %lu wrong count %Lu %Lu\n",
441                                         page_nr, count,
442                                         count_verify[page_nr]);
443                                 exit(1);
444                         }
445
446
447                         /*
448                          * We can't use bcmp (or memcmp) because that
449                          * returns 0 erroneously if the memory is
450                          * changing under it (even if the end of the
451                          * page is never changing and always
452                          * different).
453                          */
454 #if 1
455                         if (!my_bcmp(area_dst + page_nr * page_size, zeropage,
456                                      page_size)) {
457                                 fprintf(stderr,
458                                         "my_bcmp page_nr %lu wrong count %Lu %Lu\n",
459                                         page_nr, count, count_verify[page_nr]);
460                                 exit(1);
461                         }
462 #else
463                         unsigned long loops;
464
465                         loops = 0;
466                         /* uncomment the below line to test with mutex */
467                         /* pthread_mutex_lock(area_mutex(area_dst, page_nr)); */
468                         while (!bcmp(area_dst + page_nr * page_size, zeropage,
469                                      page_size)) {
470                                 loops += 1;
471                                 if (loops > 10)
472                                         break;
473                         }
474                         /* uncomment below line to test with mutex */
475                         /* pthread_mutex_unlock(area_mutex(area_dst, page_nr)); */
476                         if (loops) {
477                                 fprintf(stderr,
478                                         "page_nr %lu all zero thread %lu %p %lu\n",
479                                         page_nr, cpu, area_dst + page_nr * page_size,
480                                         loops);
481                                 if (loops > 10)
482                                         exit(1);
483                         }
484 #endif
485                 }
486
487                 pthread_mutex_lock(area_mutex(area_dst, page_nr));
488                 count = *area_count(area_dst, page_nr);
489                 if (count != count_verify[page_nr]) {
490                         fprintf(stderr,
491                                 "page_nr %lu memory corruption %Lu %Lu\n",
492                                 page_nr, count,
493                                 count_verify[page_nr]); exit(1);
494                 }
495                 count++;
496                 *area_count(area_dst, page_nr) = count_verify[page_nr] = count;
497                 pthread_mutex_unlock(area_mutex(area_dst, page_nr));
498
499                 if (time(NULL) - start > 1)
500                         fprintf(stderr,
501                                 "userfault too slow %ld "
502                                 "possible false positive with overcommit\n",
503                                 time(NULL) - start);
504         }
505
506         return NULL;
507 }
508
509 static void retry_copy_page(int ufd, struct uffdio_copy *uffdio_copy,
510                             unsigned long offset)
511 {
512         uffd_test_ops->alias_mapping(&uffdio_copy->dst,
513                                      uffdio_copy->len,
514                                      offset);
515         if (ioctl(ufd, UFFDIO_COPY, uffdio_copy)) {
516                 /* real retval in ufdio_copy.copy */
517                 if (uffdio_copy->copy != -EEXIST) {
518                         uffd_error(uffdio_copy->copy,
519                                    "UFFDIO_COPY retry error");
520                 }
521         } else
522                 uffd_error(uffdio_copy->copy, "UFFDIO_COPY retry unexpected");
523 }
524
525 static int __copy_page(int ufd, unsigned long offset, bool retry)
526 {
527         struct uffdio_copy uffdio_copy;
528
529         if (offset >= nr_pages * page_size) {
530                 fprintf(stderr, "unexpected offset %lu\n", offset);
531                 exit(1);
532         }
533         uffdio_copy.dst = (unsigned long) area_dst + offset;
534         uffdio_copy.src = (unsigned long) area_src + offset;
535         uffdio_copy.len = page_size;
536         if (test_uffdio_wp)
537                 uffdio_copy.mode = UFFDIO_COPY_MODE_WP;
538         else
539                 uffdio_copy.mode = 0;
540         uffdio_copy.copy = 0;
541         if (ioctl(ufd, UFFDIO_COPY, &uffdio_copy)) {
542                 /* real retval in ufdio_copy.copy */
543                 if (uffdio_copy.copy != -EEXIST)
544                         uffd_error(uffdio_copy.copy, "UFFDIO_COPY error");
545         } else if (uffdio_copy.copy != page_size) {
546                 uffd_error(uffdio_copy.copy, "UFFDIO_COPY unexpected copy");
547         } else {
548                 if (test_uffdio_copy_eexist && retry) {
549                         test_uffdio_copy_eexist = false;
550                         retry_copy_page(ufd, &uffdio_copy, offset);
551                 }
552                 return 1;
553         }
554         return 0;
555 }
556
/* Copy the page at offset, with the one-shot -EEXIST retry enabled */
static int copy_page_retry(int ufd, unsigned long offset)
{
	const bool retry = true;

	return __copy_page(ufd, offset, retry);
}
561
/* Copy the page at offset, without the -EEXIST retry */
static int copy_page(int ufd, unsigned long offset)
{
	const bool retry = false;

	return __copy_page(ufd, offset, retry);
}
566
/*
 * Read exactly one struct uffd_msg from ufd into *msg.
 *
 * Returns 0 when a complete message was read and 1 when the read failed
 * with EAGAIN (nothing available on a non-blocking descriptor). Any other
 * read error, and any short read, is fatal and ends the test.
 *
 * The descriptor passed by the caller is the one that is read; the
 * parameter used to be ignored in favour of the global uffd.
 */
static int uffd_read_msg(int ufd, struct uffd_msg *msg)
{
	ssize_t ret = read(ufd, msg, sizeof(*msg));

	if (ret == (ssize_t)sizeof(*msg))
		return 0;

	if (ret < 0) {
		if (errno == EAGAIN)
			return 1;
		perror("blocking read error");
	} else {
		fprintf(stderr, "short read\n");
	}
	exit(1);
}
584
585 static void uffd_handle_page_fault(struct uffd_msg *msg,
586                                    struct uffd_stats *stats)
587 {
588         unsigned long offset;
589
590         if (msg->event != UFFD_EVENT_PAGEFAULT) {
591                 fprintf(stderr, "unexpected msg event %u\n", msg->event);
592                 exit(1);
593         }
594
595         if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) {
596                 /* Write protect page faults */
597                 wp_range(uffd, msg->arg.pagefault.address, page_size, false);
598                 stats->wp_faults++;
599         } else if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR) {
600                 uint8_t *area;
601                 int b;
602
603                 /*
604                  * Minor page faults
605                  *
606                  * To prove we can modify the original range for testing
607                  * purposes, we're going to bit flip this range before
608                  * continuing.
609                  *
610                  * Note that this requires all minor page fault tests operate on
611                  * area_dst (non-UFFD-registered) and area_dst_alias
612                  * (UFFD-registered).
613                  */
614
615                 area = (uint8_t *)(area_dst +
616                                    ((char *)msg->arg.pagefault.address -
617                                     area_dst_alias));
618                 for (b = 0; b < page_size; ++b)
619                         area[b] = ~area[b];
620                 continue_range(uffd, msg->arg.pagefault.address, page_size);
621                 stats->minor_faults++;
622         } else {
623                 /* Missing page faults */
624                 if (bounces & BOUNCE_VERIFY &&
625                     msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE) {
626                         fprintf(stderr, "unexpected write fault\n");
627                         exit(1);
628                 }
629
630                 offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst;
631                 offset &= ~(page_size-1);
632
633                 if (copy_page(uffd, offset))
634                         stats->missing_faults++;
635         }
636 }
637
638 static void *uffd_poll_thread(void *arg)
639 {
640         struct uffd_stats *stats = (struct uffd_stats *)arg;
641         unsigned long cpu = stats->cpu;
642         struct pollfd pollfd[2];
643         struct uffd_msg msg;
644         struct uffdio_register uffd_reg;
645         int ret;
646         char tmp_chr;
647
648         pollfd[0].fd = uffd;
649         pollfd[0].events = POLLIN;
650         pollfd[1].fd = pipefd[cpu*2];
651         pollfd[1].events = POLLIN;
652
653         for (;;) {
654                 ret = poll(pollfd, 2, -1);
655                 if (!ret) {
656                         fprintf(stderr, "poll error %d\n", ret);
657                         exit(1);
658                 }
659                 if (ret < 0) {
660                         perror("poll");
661                         exit(1);
662                 }
663                 if (pollfd[1].revents & POLLIN) {
664                         if (read(pollfd[1].fd, &tmp_chr, 1) != 1) {
665                                 fprintf(stderr, "read pipefd error\n");
666                                 exit(1);
667                         }
668                         break;
669                 }
670                 if (!(pollfd[0].revents & POLLIN)) {
671                         fprintf(stderr, "pollfd[0].revents %d\n",
672                                 pollfd[0].revents);
673                         exit(1);
674                 }
675                 if (uffd_read_msg(uffd, &msg))
676                         continue;
677                 switch (msg.event) {
678                 default:
679                         fprintf(stderr, "unexpected msg event %u\n",
680                                 msg.event); exit(1);
681                         break;
682                 case UFFD_EVENT_PAGEFAULT:
683                         uffd_handle_page_fault(&msg, stats);
684                         break;
685                 case UFFD_EVENT_FORK:
686                         close(uffd);
687                         uffd = msg.arg.fork.ufd;
688                         pollfd[0].fd = uffd;
689                         break;
690                 case UFFD_EVENT_REMOVE:
691                         uffd_reg.range.start = msg.arg.remove.start;
692                         uffd_reg.range.len = msg.arg.remove.end -
693                                 msg.arg.remove.start;
694                         if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_reg.range)) {
695                                 fprintf(stderr, "remove failure\n");
696                                 exit(1);
697                         }
698                         break;
699                 case UFFD_EVENT_REMAP:
700                         area_dst = (char *)(unsigned long)msg.arg.remap.to;
701                         break;
702                 }
703         }
704
705         return NULL;
706 }
707
708 pthread_mutex_t uffd_read_mutex = PTHREAD_MUTEX_INITIALIZER;
709
710 static void *uffd_read_thread(void *arg)
711 {
712         struct uffd_stats *stats = (struct uffd_stats *)arg;
713         struct uffd_msg msg;
714
715         pthread_mutex_unlock(&uffd_read_mutex);
716         /* from here cancellation is ok */
717
718         for (;;) {
719                 if (uffd_read_msg(uffd, &msg))
720                         continue;
721                 uffd_handle_page_fault(&msg, stats);
722         }
723
724         return NULL;
725 }
726
727 static void *background_thread(void *arg)
728 {
729         unsigned long cpu = (unsigned long) arg;
730         unsigned long page_nr, start_nr, mid_nr, end_nr;
731
732         start_nr = cpu * nr_pages_per_cpu;
733         end_nr = (cpu+1) * nr_pages_per_cpu;
734         mid_nr = (start_nr + end_nr) / 2;
735
736         /* Copy the first half of the pages */
737         for (page_nr = start_nr; page_nr < mid_nr; page_nr++)
738                 copy_page_retry(uffd, page_nr * page_size);
739
740         /*
741          * If we need to test uffd-wp, set it up now.  Then we'll have
742          * at least the first half of the pages mapped already which
743          * can be write-protected for testing
744          */
745         if (test_uffdio_wp)
746                 wp_range(uffd, (unsigned long)area_dst + start_nr * page_size,
747                         nr_pages_per_cpu * page_size, true);
748
749         /*
750          * Continue the 2nd half of the page copying, handling write
751          * protection faults if any
752          */
753         for (page_nr = mid_nr; page_nr < end_nr; page_nr++)
754                 copy_page_retry(uffd, page_nr * page_size);
755
756         return NULL;
757 }
758
759 static int stress(struct uffd_stats *uffd_stats)
760 {
761         unsigned long cpu;
762         pthread_t locking_threads[nr_cpus];
763         pthread_t uffd_threads[nr_cpus];
764         pthread_t background_threads[nr_cpus];
765
766         finished = 0;
767         for (cpu = 0; cpu < nr_cpus; cpu++) {
768                 if (pthread_create(&locking_threads[cpu], &attr,
769                                    locking_thread, (void *)cpu))
770                         return 1;
771                 if (bounces & BOUNCE_POLL) {
772                         if (pthread_create(&uffd_threads[cpu], &attr,
773                                            uffd_poll_thread,
774                                            (void *)&uffd_stats[cpu]))
775                                 return 1;
776                 } else {
777                         if (pthread_create(&uffd_threads[cpu], &attr,
778                                            uffd_read_thread,
779                                            (void *)&uffd_stats[cpu]))
780                                 return 1;
781                         pthread_mutex_lock(&uffd_read_mutex);
782                 }
783                 if (pthread_create(&background_threads[cpu], &attr,
784                                    background_thread, (void *)cpu))
785                         return 1;
786         }
787         for (cpu = 0; cpu < nr_cpus; cpu++)
788                 if (pthread_join(background_threads[cpu], NULL))
789                         return 1;
790
791         /*
792          * Be strict and immediately zap area_src, the whole area has
793          * been transferred already by the background treads. The
794          * area_src could then be faulted in in a racy way by still
795          * running uffdio_threads reading zeropages after we zapped
796          * area_src (but they're guaranteed to get -EEXIST from
797          * UFFDIO_COPY without writing zero pages into area_dst
798          * because the background threads already completed).
799          */
800         if (uffd_test_ops->release_pages(area_src))
801                 return 1;
802
803
804         finished = 1;
805         for (cpu = 0; cpu < nr_cpus; cpu++)
806                 if (pthread_join(locking_threads[cpu], NULL))
807                         return 1;
808
809         for (cpu = 0; cpu < nr_cpus; cpu++) {
810                 char c;
811                 if (bounces & BOUNCE_POLL) {
812                         if (write(pipefd[cpu*2+1], &c, 1) != 1) {
813                                 fprintf(stderr, "pipefd write error\n");
814                                 return 1;
815                         }
816                         if (pthread_join(uffd_threads[cpu],
817                                          (void *)&uffd_stats[cpu]))
818                                 return 1;
819                 } else {
820                         if (pthread_cancel(uffd_threads[cpu]))
821                                 return 1;
822                         if (pthread_join(uffd_threads[cpu], NULL))
823                                 return 1;
824                 }
825         }
826
827         return 0;
828 }
829
830 static int userfaultfd_open_ext(uint64_t *features)
831 {
832         struct uffdio_api uffdio_api;
833
834         uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
835         if (uffd < 0) {
836                 fprintf(stderr,
837                         "userfaultfd syscall not available in this kernel\n");
838                 return 1;
839         }
840         uffd_flags = fcntl(uffd, F_GETFD, NULL);
841
842         uffdio_api.api = UFFD_API;
843         uffdio_api.features = *features;
844         if (ioctl(uffd, UFFDIO_API, &uffdio_api)) {
845                 fprintf(stderr, "UFFDIO_API failed.\nPlease make sure to "
846                         "run with either root or ptrace capability.\n");
847                 return 1;
848         }
849         if (uffdio_api.api != UFFD_API) {
850                 fprintf(stderr, "UFFDIO_API error: %" PRIu64 "\n",
851                         (uint64_t)uffdio_api.api);
852                 return 1;
853         }
854
855         *features = uffdio_api.features;
856         return 0;
857 }
858
/*
 * Convenience wrapper around userfaultfd_open_ext() for callers that do
 * not care which features the kernel reports back: the requested set is
 * passed through a local copy and the updated value is discarded.
 *
 * Returns whatever userfaultfd_open_ext() returns.
 */
static int userfaultfd_open(uint64_t features)
{
	uint64_t requested = features;

	return userfaultfd_open_ext(&requested);
}
863
864 sigjmp_buf jbuf, *sigbuf;
865
866 static void sighndl(int sig, siginfo_t *siginfo, void *ptr)
867 {
868         if (sig == SIGBUS) {
869                 if (sigbuf)
870                         siglongjmp(*sigbuf, 1);
871                 abort();
872         }
873 }
874
875 /*
876  * For non-cooperative userfaultfd test we fork() a process that will
877  * generate pagefaults, will mremap the area monitored by the
878  * userfaultfd and at last this process will release the monitored
879  * area.
880  * For the anonymous and shared memory the area is divided into two
881  * parts, the first part is accessed before mremap, and the second
882  * part is accessed after mremap. Since hugetlbfs does not support
883  * mremap, the entire monitored area is accessed in a single pass for
884  * HUGETLB_TEST.
885  * The release of the pages currently generates event for shmem and
886  * anonymous memory (UFFD_EVENT_REMOVE), hence it is not checked
887  * for hugetlb.
888  * For signal test(UFFD_FEATURE_SIGBUS), signal_test = 1, we register
889  * monitored area, generate pagefaults and test that signal is delivered.
890  * Use UFFDIO_COPY to allocate missing page and retry. For signal_test = 2
891  * test robustness use case - we release monitored area, fork a process
892  * that will generate pagefaults and verify signal is generated.
893  * This also tests UFFD_FEATURE_EVENT_FORK event along with the signal
894  * feature. Using monitor thread, verify no userfault events are generated.
895  */
896 static int faulting_process(int signal_test)
897 {
898         unsigned long nr;
899         unsigned long long count;
900         unsigned long split_nr_pages;
901         unsigned long lastnr;
902         struct sigaction act;
903         unsigned long signalled = 0;
904
905         if (test_type != TEST_HUGETLB)
906                 split_nr_pages = (nr_pages + 1) / 2;
907         else
908                 split_nr_pages = nr_pages;
909
910         if (signal_test) {
911                 sigbuf = &jbuf;
912                 memset(&act, 0, sizeof(act));
913                 act.sa_sigaction = sighndl;
914                 act.sa_flags = SA_SIGINFO;
915                 if (sigaction(SIGBUS, &act, 0)) {
916                         perror("sigaction");
917                         return 1;
918                 }
919                 lastnr = (unsigned long)-1;
920         }
921
922         for (nr = 0; nr < split_nr_pages; nr++) {
923                 int steps = 1;
924                 unsigned long offset = nr * page_size;
925
926                 if (signal_test) {
927                         if (sigsetjmp(*sigbuf, 1) != 0) {
928                                 if (steps == 1 && nr == lastnr) {
929                                         fprintf(stderr, "Signal repeated\n");
930                                         return 1;
931                                 }
932
933                                 lastnr = nr;
934                                 if (signal_test == 1) {
935                                         if (steps == 1) {
936                                                 /* This is a MISSING request */
937                                                 steps++;
938                                                 if (copy_page(uffd, offset))
939                                                         signalled++;
940                                         } else {
941                                                 /* This is a WP request */
942                                                 assert(steps == 2);
943                                                 wp_range(uffd,
944                                                          (__u64)area_dst +
945                                                          offset,
946                                                          page_size, false);
947                                         }
948                                 } else {
949                                         signalled++;
950                                         continue;
951                                 }
952                         }
953                 }
954
955                 count = *area_count(area_dst, nr);
956                 if (count != count_verify[nr]) {
957                         fprintf(stderr,
958                                 "nr %lu memory corruption %Lu %Lu\n",
959                                 nr, count,
960                                 count_verify[nr]);
961                 }
962                 /*
963                  * Trigger write protection if there is by writing
964                  * the same value back.
965                  */
966                 *area_count(area_dst, nr) = count;
967         }
968
969         if (signal_test)
970                 return signalled != split_nr_pages;
971
972         if (test_type == TEST_HUGETLB)
973                 return 0;
974
975         area_dst = mremap(area_dst, nr_pages * page_size,  nr_pages * page_size,
976                           MREMAP_MAYMOVE | MREMAP_FIXED, area_src);
977         if (area_dst == MAP_FAILED) {
978                 perror("mremap");
979                 exit(1);
980         }
981
982         for (; nr < nr_pages; nr++) {
983                 count = *area_count(area_dst, nr);
984                 if (count != count_verify[nr]) {
985                         fprintf(stderr,
986                                 "nr %lu memory corruption %Lu %Lu\n",
987                                 nr, count,
988                                 count_verify[nr]); exit(1);
989                 }
990                 /*
991                  * Trigger write protection if there is by writing
992                  * the same value back.
993                  */
994                 *area_count(area_dst, nr) = count;
995         }
996
997         if (uffd_test_ops->release_pages(area_dst))
998                 return 1;
999
1000         for (nr = 0; nr < nr_pages; nr++) {
1001                 if (my_bcmp(area_dst + nr * page_size, zeropage, page_size)) {
1002                         fprintf(stderr, "nr %lu is not zero\n", nr);
1003                         exit(1);
1004                 }
1005         }
1006
1007         return 0;
1008 }
1009
1010 static void retry_uffdio_zeropage(int ufd,
1011                                   struct uffdio_zeropage *uffdio_zeropage,
1012                                   unsigned long offset)
1013 {
1014         uffd_test_ops->alias_mapping(&uffdio_zeropage->range.start,
1015                                      uffdio_zeropage->range.len,
1016                                      offset);
1017         if (ioctl(ufd, UFFDIO_ZEROPAGE, uffdio_zeropage)) {
1018                 if (uffdio_zeropage->zeropage != -EEXIST) {
1019                         uffd_error(uffdio_zeropage->zeropage,
1020                                    "UFFDIO_ZEROPAGE retry error");
1021                 }
1022         } else {
1023                 uffd_error(uffdio_zeropage->zeropage,
1024                            "UFFDIO_ZEROPAGE retry unexpected");
1025         }
1026 }
1027
1028 static int __uffdio_zeropage(int ufd, unsigned long offset, bool retry)
1029 {
1030         struct uffdio_zeropage uffdio_zeropage;
1031         int ret;
1032         unsigned long has_zeropage;
1033         __s64 res;
1034
1035         has_zeropage = uffd_test_ops->expected_ioctls & (1 << _UFFDIO_ZEROPAGE);
1036
1037         if (offset >= nr_pages * page_size) {
1038                 fprintf(stderr, "unexpected offset %lu\n", offset);
1039                 exit(1);
1040         }
1041         uffdio_zeropage.range.start = (unsigned long) area_dst + offset;
1042         uffdio_zeropage.range.len = page_size;
1043         uffdio_zeropage.mode = 0;
1044         ret = ioctl(ufd, UFFDIO_ZEROPAGE, &uffdio_zeropage);
1045         res = uffdio_zeropage.zeropage;
1046         if (ret) {
1047                 /* real retval in ufdio_zeropage.zeropage */
1048                 if (has_zeropage) {
1049                         uffd_error(res, "UFFDIO_ZEROPAGE %s",
1050                                    res == -EEXIST ? "-EEXIST" : "error");
1051                 } else if (res != -EINVAL)
1052                         uffd_error(res, "UFFDIO_ZEROPAGE not -EINVAL");
1053         } else if (has_zeropage) {
1054                 if (res != page_size) {
1055                         uffd_error(res, "UFFDIO_ZEROPAGE unexpected");
1056                 } else {
1057                         if (test_uffdio_zeropage_eexist && retry) {
1058                                 test_uffdio_zeropage_eexist = false;
1059                                 retry_uffdio_zeropage(ufd, &uffdio_zeropage,
1060                                                       offset);
1061                         }
1062                         return 1;
1063                 }
1064         } else
1065                 uffd_error(res, "UFFDIO_ZEROPAGE succeeded");
1066
1067         return 0;
1068 }
1069
/*
 * UFFDIO_ZEROPAGE on the page at @offset of area_dst, without the
 * -EEXIST retry step. Returns the result of __uffdio_zeropage().
 */
static int uffdio_zeropage(int ufd, unsigned long offset)
{
	const bool want_retry = false;

	return __uffdio_zeropage(ufd, offset, want_retry);
}
1074
1075 /* exercise UFFDIO_ZEROPAGE */
1076 static int userfaultfd_zeropage_test(void)
1077 {
1078         struct uffdio_register uffdio_register;
1079         unsigned long expected_ioctls;
1080
1081         printf("testing UFFDIO_ZEROPAGE: ");
1082         fflush(stdout);
1083
1084         if (uffd_test_ops->release_pages(area_dst))
1085                 return 1;
1086
1087         if (userfaultfd_open(0))
1088                 return 1;
1089         uffdio_register.range.start = (unsigned long) area_dst;
1090         uffdio_register.range.len = nr_pages * page_size;
1091         uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
1092         if (test_uffdio_wp)
1093                 uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
1094         if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) {
1095                 fprintf(stderr, "register failure\n");
1096                 exit(1);
1097         }
1098
1099         expected_ioctls = uffd_test_ops->expected_ioctls;
1100         if ((uffdio_register.ioctls & expected_ioctls) !=
1101             expected_ioctls) {
1102                 fprintf(stderr,
1103                         "unexpected missing ioctl for anon memory\n");
1104                 exit(1);
1105         }
1106
1107         if (uffdio_zeropage(uffd, 0)) {
1108                 if (my_bcmp(area_dst, zeropage, page_size)) {
1109                         fprintf(stderr, "zeropage is not zero\n");
1110                         exit(1);
1111                 }
1112         }
1113
1114         close(uffd);
1115         printf("done.\n");
1116         return 0;
1117 }
1118
1119 static int userfaultfd_events_test(void)
1120 {
1121         struct uffdio_register uffdio_register;
1122         unsigned long expected_ioctls;
1123         pthread_t uffd_mon;
1124         int err, features;
1125         pid_t pid;
1126         char c;
1127         struct uffd_stats stats = { 0 };
1128
1129         printf("testing events (fork, remap, remove): ");
1130         fflush(stdout);
1131
1132         if (uffd_test_ops->release_pages(area_dst))
1133                 return 1;
1134
1135         features = UFFD_FEATURE_EVENT_FORK | UFFD_FEATURE_EVENT_REMAP |
1136                 UFFD_FEATURE_EVENT_REMOVE;
1137         if (userfaultfd_open(features))
1138                 return 1;
1139         fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
1140
1141         uffdio_register.range.start = (unsigned long) area_dst;
1142         uffdio_register.range.len = nr_pages * page_size;
1143         uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
1144         if (test_uffdio_wp)
1145                 uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
1146         if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) {
1147                 fprintf(stderr, "register failure\n");
1148                 exit(1);
1149         }
1150
1151         expected_ioctls = uffd_test_ops->expected_ioctls;
1152         if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls) {
1153                 fprintf(stderr, "unexpected missing ioctl for anon memory\n");
1154                 exit(1);
1155         }
1156
1157         if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats)) {
1158                 perror("uffd_poll_thread create");
1159                 exit(1);
1160         }
1161
1162         pid = fork();
1163         if (pid < 0) {
1164                 perror("fork");
1165                 exit(1);
1166         }
1167
1168         if (!pid)
1169                 exit(faulting_process(0));
1170
1171         waitpid(pid, &err, 0);
1172         if (err) {
1173                 fprintf(stderr, "faulting process failed\n");
1174                 exit(1);
1175         }
1176
1177         if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) {
1178                 perror("pipe write");
1179                 exit(1);
1180         }
1181         if (pthread_join(uffd_mon, NULL))
1182                 return 1;
1183
1184         close(uffd);
1185
1186         uffd_stats_report(&stats, 1);
1187
1188         return stats.missing_faults != nr_pages;
1189 }
1190
1191 static int userfaultfd_sig_test(void)
1192 {
1193         struct uffdio_register uffdio_register;
1194         unsigned long expected_ioctls;
1195         unsigned long userfaults;
1196         pthread_t uffd_mon;
1197         int err, features;
1198         pid_t pid;
1199         char c;
1200         struct uffd_stats stats = { 0 };
1201
1202         printf("testing signal delivery: ");
1203         fflush(stdout);
1204
1205         if (uffd_test_ops->release_pages(area_dst))
1206                 return 1;
1207
1208         features = UFFD_FEATURE_EVENT_FORK|UFFD_FEATURE_SIGBUS;
1209         if (userfaultfd_open(features))
1210                 return 1;
1211         fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
1212
1213         uffdio_register.range.start = (unsigned long) area_dst;
1214         uffdio_register.range.len = nr_pages * page_size;
1215         uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
1216         if (test_uffdio_wp)
1217                 uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
1218         if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) {
1219                 fprintf(stderr, "register failure\n");
1220                 exit(1);
1221         }
1222
1223         expected_ioctls = uffd_test_ops->expected_ioctls;
1224         if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls) {
1225                 fprintf(stderr, "unexpected missing ioctl for anon memory\n");
1226                 exit(1);
1227         }
1228
1229         if (faulting_process(1)) {
1230                 fprintf(stderr, "faulting process failed\n");
1231                 exit(1);
1232         }
1233
1234         if (uffd_test_ops->release_pages(area_dst))
1235                 return 1;
1236
1237         if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats)) {
1238                 perror("uffd_poll_thread create");
1239                 exit(1);
1240         }
1241
1242         pid = fork();
1243         if (pid < 0) {
1244                 perror("fork");
1245                 exit(1);
1246         }
1247
1248         if (!pid)
1249                 exit(faulting_process(2));
1250
1251         waitpid(pid, &err, 0);
1252         if (err) {
1253                 fprintf(stderr, "faulting process failed\n");
1254                 exit(1);
1255         }
1256
1257         if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) {
1258                 perror("pipe write");
1259                 exit(1);
1260         }
1261         if (pthread_join(uffd_mon, (void **)&userfaults))
1262                 return 1;
1263
1264         printf("done.\n");
1265         if (userfaults)
1266                 fprintf(stderr, "Signal test failed, userfaults: %ld\n",
1267                         userfaults);
1268         close(uffd);
1269         return userfaults != 0;
1270 }
1271
1272 static int userfaultfd_minor_test(void)
1273 {
1274         struct uffdio_register uffdio_register;
1275         unsigned long expected_ioctls;
1276         unsigned long p;
1277         pthread_t uffd_mon;
1278         uint8_t expected_byte;
1279         void *expected_page;
1280         char c;
1281         struct uffd_stats stats = { 0 };
1282         uint64_t features = UFFD_FEATURE_MINOR_HUGETLBFS;
1283
1284         if (!test_uffdio_minor)
1285                 return 0;
1286
1287         printf("testing minor faults: ");
1288         fflush(stdout);
1289
1290         if (uffd_test_ops->release_pages(area_dst))
1291                 return 1;
1292
1293         if (userfaultfd_open_ext(&features))
1294                 return 1;
1295         /* If kernel reports the feature isn't supported, skip the test. */
1296         if (!(features & UFFD_FEATURE_MINOR_HUGETLBFS)) {
1297                 printf("skipping test due to lack of feature support\n");
1298                 fflush(stdout);
1299                 return 0;
1300         }
1301
1302         uffdio_register.range.start = (unsigned long)area_dst_alias;
1303         uffdio_register.range.len = nr_pages * page_size;
1304         uffdio_register.mode = UFFDIO_REGISTER_MODE_MINOR;
1305         if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) {
1306                 fprintf(stderr, "register failure\n");
1307                 exit(1);
1308         }
1309
1310         expected_ioctls = uffd_test_ops->expected_ioctls;
1311         expected_ioctls |= 1 << _UFFDIO_CONTINUE;
1312         if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls) {
1313                 fprintf(stderr, "unexpected missing ioctl(s)\n");
1314                 exit(1);
1315         }
1316
1317         /*
1318          * After registering with UFFD, populate the non-UFFD-registered side of
1319          * the shared mapping. This should *not* trigger any UFFD minor faults.
1320          */
1321         for (p = 0; p < nr_pages; ++p) {
1322                 memset(area_dst + (p * page_size), p % ((uint8_t)-1),
1323                        page_size);
1324         }
1325
1326         if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats)) {
1327                 perror("uffd_poll_thread create");
1328                 exit(1);
1329         }
1330
1331         /*
1332          * Read each of the pages back using the UFFD-registered mapping. We
1333          * expect that the first time we touch a page, it will result in a minor
1334          * fault. uffd_poll_thread will resolve the fault by bit-flipping the
1335          * page's contents, and then issuing a CONTINUE ioctl.
1336          */
1337
1338         if (posix_memalign(&expected_page, page_size, page_size)) {
1339                 fprintf(stderr, "out of memory\n");
1340                 return 1;
1341         }
1342
1343         for (p = 0; p < nr_pages; ++p) {
1344                 expected_byte = ~((uint8_t)(p % ((uint8_t)-1)));
1345                 memset(expected_page, expected_byte, page_size);
1346                 if (my_bcmp(expected_page, area_dst_alias + (p * page_size),
1347                             page_size)) {
1348                         fprintf(stderr,
1349                                 "unexpected page contents after minor fault\n");
1350                         exit(1);
1351                 }
1352         }
1353
1354         if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) {
1355                 perror("pipe write");
1356                 exit(1);
1357         }
1358         if (pthread_join(uffd_mon, NULL))
1359                 return 1;
1360
1361         close(uffd);
1362
1363         uffd_stats_report(&stats, 1);
1364
1365         return stats.missing_faults != 0 || stats.minor_faults != nr_pages;
1366 }
1367
1368 static int userfaultfd_stress(void)
1369 {
1370         void *area;
1371         char *tmp_area;
1372         unsigned long nr;
1373         struct uffdio_register uffdio_register;
1374         unsigned long cpu;
1375         int err;
1376         struct uffd_stats uffd_stats[nr_cpus];
1377
1378         uffd_test_ops->allocate_area((void **)&area_src);
1379         if (!area_src)
1380                 return 1;
1381         uffd_test_ops->allocate_area((void **)&area_dst);
1382         if (!area_dst)
1383                 return 1;
1384
1385         if (userfaultfd_open(0))
1386                 return 1;
1387
1388         count_verify = malloc(nr_pages * sizeof(unsigned long long));
1389         if (!count_verify) {
1390                 perror("count_verify");
1391                 return 1;
1392         }
1393
1394         for (nr = 0; nr < nr_pages; nr++) {
1395                 *area_mutex(area_src, nr) = (pthread_mutex_t)
1396                         PTHREAD_MUTEX_INITIALIZER;
1397                 count_verify[nr] = *area_count(area_src, nr) = 1;
1398                 /*
1399                  * In the transition between 255 to 256, powerpc will
1400                  * read out of order in my_bcmp and see both bytes as
1401                  * zero, so leave a placeholder below always non-zero
1402                  * after the count, to avoid my_bcmp to trigger false
1403                  * positives.
1404                  */
1405                 *(area_count(area_src, nr) + 1) = 1;
1406         }
1407
1408         pipefd = malloc(sizeof(int) * nr_cpus * 2);
1409         if (!pipefd) {
1410                 perror("pipefd");
1411                 return 1;
1412         }
1413         for (cpu = 0; cpu < nr_cpus; cpu++) {
1414                 if (pipe2(&pipefd[cpu*2], O_CLOEXEC | O_NONBLOCK)) {
1415                         perror("pipe");
1416                         return 1;
1417                 }
1418         }
1419
1420         if (posix_memalign(&area, page_size, page_size)) {
1421                 fprintf(stderr, "out of memory\n");
1422                 return 1;
1423         }
1424         zeropage = area;
1425         bzero(zeropage, page_size);
1426
1427         pthread_mutex_lock(&uffd_read_mutex);
1428
1429         pthread_attr_init(&attr);
1430         pthread_attr_setstacksize(&attr, 16*1024*1024);
1431
1432         err = 0;
1433         while (bounces--) {
1434                 unsigned long expected_ioctls;
1435
1436                 printf("bounces: %d, mode:", bounces);
1437                 if (bounces & BOUNCE_RANDOM)
1438                         printf(" rnd");
1439                 if (bounces & BOUNCE_RACINGFAULTS)
1440                         printf(" racing");
1441                 if (bounces & BOUNCE_VERIFY)
1442                         printf(" ver");
1443                 if (bounces & BOUNCE_POLL)
1444                         printf(" poll");
1445                 else
1446                         printf(" read");
1447                 printf(", ");
1448                 fflush(stdout);
1449
1450                 if (bounces & BOUNCE_POLL)
1451                         fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
1452                 else
1453                         fcntl(uffd, F_SETFL, uffd_flags & ~O_NONBLOCK);
1454
1455                 /* register */
1456                 uffdio_register.range.start = (unsigned long) area_dst;
1457                 uffdio_register.range.len = nr_pages * page_size;
1458                 uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
1459                 if (test_uffdio_wp)
1460                         uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
1461                 if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) {
1462                         fprintf(stderr, "register failure\n");
1463                         return 1;
1464                 }
1465                 expected_ioctls = uffd_test_ops->expected_ioctls;
1466                 if ((uffdio_register.ioctls & expected_ioctls) !=
1467                     expected_ioctls) {
1468                         fprintf(stderr,
1469                                 "unexpected missing ioctl for anon memory\n");
1470                         return 1;
1471                 }
1472
1473                 if (area_dst_alias) {
1474                         uffdio_register.range.start = (unsigned long)
1475                                 area_dst_alias;
1476                         if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) {
1477                                 fprintf(stderr, "register failure alias\n");
1478                                 return 1;
1479                         }
1480                 }
1481
1482                 /*
1483                  * The madvise done previously isn't enough: some
1484                  * uffd_thread could have read userfaults (one of
1485                  * those already resolved by the background thread)
1486                  * and it may be in the process of calling
1487                  * UFFDIO_COPY. UFFDIO_COPY will read the zapped
1488                  * area_src and it would map a zero page in it (of
1489                  * course such a UFFDIO_COPY is perfectly safe as it'd
1490                  * return -EEXIST). The problem comes at the next
1491                  * bounce though: that racing UFFDIO_COPY would
1492                  * generate zeropages in the area_src, so invalidating
1493                  * the previous MADV_DONTNEED. Without this additional
1494                  * MADV_DONTNEED those zeropages leftovers in the
1495                  * area_src would lead to -EEXIST failure during the
1496                  * next bounce, effectively leaving a zeropage in the
1497                  * area_dst.
1498                  *
1499                  * Try to comment this out madvise to see the memory
1500                  * corruption being caught pretty quick.
1501                  *
1502                  * khugepaged is also inhibited to collapse THP after
1503                  * MADV_DONTNEED only after the UFFDIO_REGISTER, so it's
1504                  * required to MADV_DONTNEED here.
1505                  */
1506                 if (uffd_test_ops->release_pages(area_dst))
1507                         return 1;
1508
1509                 uffd_stats_reset(uffd_stats, nr_cpus);
1510
1511                 /* bounce pass */
1512                 if (stress(uffd_stats))
1513                         return 1;
1514
1515                 /* Clear all the write protections if there is any */
1516                 if (test_uffdio_wp)
1517                         wp_range(uffd, (unsigned long)area_dst,
1518                                  nr_pages * page_size, false);
1519
1520                 /* unregister */
1521                 if (ioctl(uffd, UFFDIO_UNREGISTER, &uffdio_register.range)) {
1522                         fprintf(stderr, "unregister failure\n");
1523                         return 1;
1524                 }
1525                 if (area_dst_alias) {
1526                         uffdio_register.range.start = (unsigned long) area_dst;
1527                         if (ioctl(uffd, UFFDIO_UNREGISTER,
1528                                   &uffdio_register.range)) {
1529                                 fprintf(stderr, "unregister failure alias\n");
1530                                 return 1;
1531                         }
1532                 }
1533
1534                 /* verification */
1535                 if (bounces & BOUNCE_VERIFY) {
1536                         for (nr = 0; nr < nr_pages; nr++) {
1537                                 if (*area_count(area_dst, nr) != count_verify[nr]) {
1538                                         fprintf(stderr,
1539                                                 "error area_count %Lu %Lu %lu\n",
1540                                                 *area_count(area_src, nr),
1541                                                 count_verify[nr],
1542                                                 nr);
1543                                         err = 1;
1544                                         bounces = 0;
1545                                 }
1546                         }
1547                 }
1548
1549                 /* prepare next bounce */
1550                 tmp_area = area_src;
1551                 area_src = area_dst;
1552                 area_dst = tmp_area;
1553
1554                 tmp_area = area_src_alias;
1555                 area_src_alias = area_dst_alias;
1556                 area_dst_alias = tmp_area;
1557
1558                 uffd_stats_report(uffd_stats, nr_cpus);
1559         }
1560
1561         if (err)
1562                 return err;
1563
1564         close(uffd);
1565         return userfaultfd_zeropage_test() || userfaultfd_sig_test()
1566                 || userfaultfd_events_test() || userfaultfd_minor_test();
1567 }
1568
/*
 * Copied from mlock2-tests.c
 *
 * Scan /proc/meminfo for the "Hugepagesize:" line and return the value
 * converted from kB to bytes. Returns 0 when the file cannot be opened
 * or no such line is found.
 */
unsigned long default_huge_page_size(void)
{
	FILE *meminfo;
	char *buf = NULL;
	size_t cap = 0;
	unsigned long size_kb = 0;
	unsigned long bytes = 0;

	meminfo = fopen("/proc/meminfo", "r");
	if (meminfo == NULL)
		return 0;

	for (;;) {
		if (getline(&buf, &cap, meminfo) <= 0)
			break;
		if (sscanf(buf, "Hugepagesize:       %lu kB", &size_kb) != 1)
			continue;
		/* kB -> bytes */
		bytes = size_kb << 10;
		break;
	}

	free(buf);
	fclose(meminfo);
	return bytes;
}
1592
1593 static void set_test_type(const char *type)
1594 {
1595         if (!strcmp(type, "anon")) {
1596                 test_type = TEST_ANON;
1597                 uffd_test_ops = &anon_uffd_test_ops;
1598                 /* Only enable write-protect test for anonymous test */
1599                 test_uffdio_wp = true;
1600         } else if (!strcmp(type, "hugetlb")) {
1601                 test_type = TEST_HUGETLB;
1602                 uffd_test_ops = &hugetlb_uffd_test_ops;
1603         } else if (!strcmp(type, "hugetlb_shared")) {
1604                 map_shared = true;
1605                 test_type = TEST_HUGETLB;
1606                 uffd_test_ops = &hugetlb_uffd_test_ops;
1607                 /* Minor faults require shared hugetlb; only enable here. */
1608                 test_uffdio_minor = true;
1609         } else if (!strcmp(type, "shmem")) {
1610                 map_shared = true;
1611                 test_type = TEST_SHMEM;
1612                 uffd_test_ops = &shmem_uffd_test_ops;
1613         } else {
1614                 fprintf(stderr, "Unknown test type: %s\n", type); exit(1);
1615         }
1616
1617         if (test_type == TEST_HUGETLB)
1618                 page_size = default_huge_page_size();
1619         else
1620                 page_size = sysconf(_SC_PAGE_SIZE);
1621
1622         if (!page_size) {
1623                 fprintf(stderr, "Unable to determine page size\n");
1624                 exit(2);
1625         }
1626         if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) * 2
1627             > page_size) {
1628                 fprintf(stderr, "Impossible to run this test\n");
1629                 exit(2);
1630         }
1631 }
1632
1633 static void sigalrm(int sig)
1634 {
1635         if (sig != SIGALRM)
1636                 abort();
1637         test_uffdio_copy_eexist = true;
1638         test_uffdio_zeropage_eexist = true;
1639         alarm(ALARM_INTERVAL_SECS);
1640 }
1641
1642 int main(int argc, char **argv)
1643 {
1644         if (argc < 4)
1645                 usage();
1646
1647         if (signal(SIGALRM, sigalrm) == SIG_ERR) {
1648                 fprintf(stderr, "failed to arm SIGALRM");
1649                 exit(1);
1650         }
1651         alarm(ALARM_INTERVAL_SECS);
1652
1653         set_test_type(argv[1]);
1654
1655         nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
1656         nr_pages_per_cpu = atol(argv[2]) * 1024*1024 / page_size /
1657                 nr_cpus;
1658         if (!nr_pages_per_cpu) {
1659                 fprintf(stderr, "invalid MiB\n");
1660                 usage();
1661         }
1662
1663         bounces = atoi(argv[3]);
1664         if (bounces <= 0) {
1665                 fprintf(stderr, "invalid bounces\n");
1666                 usage();
1667         }
1668         nr_pages = nr_pages_per_cpu * nr_cpus;
1669
1670         if (test_type == TEST_HUGETLB) {
1671                 if (argc < 5)
1672                         usage();
1673                 huge_fd = open(argv[4], O_CREAT | O_RDWR, 0755);
1674                 if (huge_fd < 0) {
1675                         fprintf(stderr, "Open of %s failed", argv[3]);
1676                         perror("open");
1677                         exit(1);
1678                 }
1679                 if (ftruncate(huge_fd, 0)) {
1680                         fprintf(stderr, "ftruncate %s to size 0 failed", argv[3]);
1681                         perror("ftruncate");
1682                         exit(1);
1683                 }
1684         }
1685         printf("nr_pages: %lu, nr_pages_per_cpu: %lu\n",
1686                nr_pages, nr_pages_per_cpu);
1687         return userfaultfd_stress();
1688 }
1689
1690 #else /* __NR_userfaultfd */
1691
1692 #warning "missing __NR_userfaultfd definition"
1693
1694 int main(void)
1695 {
1696         printf("skip: Skipping userfaultfd test (missing __NR_userfaultfd)\n");
1697         return KSFT_SKIP;
1698 }
1699
1700 #endif /* __NR_userfaultfd */