Merge branch 'akpm' (patches from Andrew)
[linux-2.6-microblaze.git] / tools / testing / selftests / vm / userfaultfd.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Stress userfaultfd syscall.
4  *
5  *  Copyright (C) 2015  Red Hat, Inc.
6  *
7  * This test allocates two virtual areas and bounces the physical
8  * memory across the two virtual areas (from area_src to area_dst)
9  * using userfaultfd.
10  *
11  * There are three threads running per CPU:
12  *
13  * 1) one per-CPU thread takes a per-page pthread_mutex in a random
14  *    page of the area_dst (while the physical page may still be in
15  *    area_src), and increments a per-page counter in the same page,
16  *    and checks its value against a verification region.
17  *
18  * 2) another per-CPU thread handles the userfaults generated by
19  *    thread 1 above. userfaultfd blocking reads or poll() modes are
20  *    exercised interleaved.
21  *
22  * 3) one last per-CPU thread transfers the memory in the background
23  *    at maximum bandwidth (if not already transferred by thread
24  *    2). Each cpu thread takes care of transferring a portion of the
25  *    area.
26  *
27  * When all threads of type 3 completed the transfer, one bounce is
28  * complete. area_src and area_dst are then swapped. All threads are
29  * respawned and so the bounce is immediately restarted in the
30  * opposite direction.
31  *
32  * per-CPU threads 1 by triggering userfaults inside
33  * pthread_mutex_lock will also verify the atomicity of the memory
34  * transfer (UFFDIO_COPY).
35  */
36
37 #define _GNU_SOURCE
38 #include <stdio.h>
39 #include <errno.h>
40 #include <unistd.h>
41 #include <stdlib.h>
42 #include <sys/types.h>
43 #include <sys/stat.h>
44 #include <fcntl.h>
45 #include <time.h>
46 #include <signal.h>
47 #include <poll.h>
48 #include <string.h>
49 #include <linux/mman.h>
50 #include <sys/mman.h>
51 #include <sys/syscall.h>
52 #include <sys/ioctl.h>
53 #include <sys/wait.h>
54 #include <pthread.h>
55 #include <linux/userfaultfd.h>
56 #include <setjmp.h>
57 #include <stdbool.h>
58 #include <assert.h>
59 #include <inttypes.h>
60 #include <stdint.h>
61 #include <sys/random.h>
62
63 #include "../kselftest.h"
64
65 #ifdef __NR_userfaultfd
66
/*
 * Test geometry.  nr_pages_per_cpu is the slice of pages each per-CPU
 * background thread transfers; page_size is the unit used for every
 * per-page offset computation in this file.
 */
static unsigned long nr_cpus;
static unsigned long nr_pages;
static unsigned long nr_pages_per_cpu;
static unsigned long page_size;

/* Bit flags tested against 'bounces' to select the current bounce mode. */
#define BOUNCE_RANDOM		(1<<0)
#define BOUNCE_RACINGFAULTS	(1<<1)
#define BOUNCE_VERIFY		(1<<2)
#define BOUNCE_POLL		(1<<3)
static int bounces;

/* Memory backend under test; the active value lives in test_type. */
#define TEST_ANON	1
#define TEST_HUGETLB	2
#define TEST_SHMEM	3
static int test_type;

/* exercise the test_uffdio_*_eexist every ALARM_INTERVAL_SECS */
#define ALARM_INTERVAL_SECS 10
static volatile bool test_uffdio_copy_eexist = true;
static volatile bool test_uffdio_zeropage_eexist = true;
/* Whether to test uffd write-protection */
static bool test_uffdio_wp = false;
/* Whether to test uffd minor faults */
static bool test_uffdio_minor = false;

/* True when the hugetlb areas are file-backed MAP_SHARED mappings. */
static bool map_shared;
/* File descriptors backing the shmem and shared-hugetlb areas. */
static int shm_fd;
static int huge_fd;
/* Expected per-page counter values, indexed by page number. */
static unsigned long long *count_verify;
/* The userfaultfd descriptor; -1 while no context is open. */
static int uffd = -1;
static int uffd_flags;
/* Set to non-zero to make the locking threads leave their loop. */
static int finished;
/* Two pipe descriptors per CPU (read end at 2*cpu, write end at 2*cpu+1). */
static int *pipefd;
/* Source/destination areas and their alias mappings (NULL when unmapped). */
static char *area_src;
static char *area_src_alias;
static char *area_dst;
static char *area_dst_alias;
static char *zeropage;
pthread_attr_t attr;
98
/*
 * Userfaultfd test statistics.
 *
 * One instance is handed to each fault-handling thread, which bumps the
 * counter matching every fault it resolves.
 */
struct uffd_stats {
	int cpu;			/* index of the owning per-CPU slot */
	unsigned long missing_faults;	/* missing faults resolved by copy */
	unsigned long wp_faults;	/* write-protect faults resolved */
	unsigned long minor_faults;	/* minor faults resolved by continue */
};
106
/*
 * Address of the per-page mutex for page ___nr of ___area: the
 * pthread_mutex_t sits at offset 0 of every page.
 */
#define area_mutex(___area, ___nr)					\
	((pthread_mutex_t *)((___area) + (___nr) * page_size))
/*
 * Address of the per-page counter for page ___nr of ___area.  The
 * counter follows the pthread_mutex_t within the page and is rounded up
 * to the natural alignment of unsigned long long, to avoid unaligned
 * access faults on non-x86 archs.
 */
#define area_count(___area, ___nr)					\
	((volatile unsigned long long *)				\
	 ((unsigned long)((___area) + (___nr) * page_size +		\
			  sizeof(pthread_mutex_t) +			\
			  sizeof(unsigned long long) - 1) &		\
	  ~(unsigned long)(sizeof(unsigned long long) - 1)))
121
/* Exchange the values of two lvalues of the same type. */
#define swap(a, b)				\
	do {					\
		__typeof__(a) __tmp = (a);	\
		(a) = (b);			\
		(b) = __tmp;			\
	} while (0)
124
/* Example invocations printed verbatim by usage(). */
const char *examples =
	/* anonymous memory */
	"# Run anonymous memory test on 100MiB region with 99999 bounces:\n"
	"./userfaultfd anon 100 99999\n"
	"\n"
	/* shmem */
	"# Run share memory test on 1GiB region with 99 bounces:\n"
	"./userfaultfd shmem 1000 99\n"
	"\n"
	/* private hugetlb */
	"# Run hugetlb memory test on 256MiB region with 50 bounces:\n"
	"./userfaultfd hugetlb 256 50\n"
	"\n"
	/* shared hugetlb, backed by a hugetlbfs file */
	"# Run the same hugetlb test but using shared file:\n"
	"./userfaultfd hugetlb_shared 256 50 /dev/hugepages/hugefile\n"
	"\n"
	/* endless randomized-size loop */
	"# 10MiB-~6GiB 999 bounces anonymous test, continue forever unless an error triggers\n"
	"while ./userfaultfd anon $[RANDOM % 6000 + 10] 999; do true; done\n"
	"\n";
137
138 static void usage(void)
139 {
140         fprintf(stderr, "\nUsage: ./userfaultfd <test type> <MiB> <bounces> "
141                 "[hugetlbfs_file]\n\n");
142         fprintf(stderr, "Supported <test type>: anon, hugetlb, "
143                 "hugetlb_shared, shmem\n\n");
144         fprintf(stderr, "Examples:\n\n");
145         fprintf(stderr, "%s", examples);
146         exit(1);
147 }
148
/*
 * _err() - print "ERROR: <message> (errno=N, line=L)" to stderr.
 *
 * errno is captured before the first fprintf() so that the reported value
 * is the one the failing call left behind.  The temporary has a
 * deliberately unusual name: macro arguments are expanded inside this
 * block, so a plain name such as 'ret' would shadow a caller's variable of
 * the same name and make err("... %d", ret) print errno instead of the
 * caller's value.
 */
#define _err(fmt, ...)						\
	do {							\
		int _err_saved_errno = errno;			\
		fprintf(stderr, "ERROR: " fmt, ##__VA_ARGS__);	\
		fprintf(stderr, " (errno=%d, line=%d)\n",	\
			_err_saved_errno, __LINE__);		\
	} while (0)

/* err() - report like _err(), then terminate the process with status 1. */
#define err(fmt, ...)				\
	do {					\
		_err(fmt, ##__VA_ARGS__);	\
		exit(1);			\
	} while (0)
162
163 static void uffd_stats_reset(struct uffd_stats *uffd_stats,
164                              unsigned long n_cpus)
165 {
166         int i;
167
168         for (i = 0; i < n_cpus; i++) {
169                 uffd_stats[i].cpu = i;
170                 uffd_stats[i].missing_faults = 0;
171                 uffd_stats[i].wp_faults = 0;
172                 uffd_stats[i].minor_faults = 0;
173         }
174 }
175
176 static void uffd_stats_report(struct uffd_stats *stats, int n_cpus)
177 {
178         int i;
179         unsigned long long miss_total = 0, wp_total = 0, minor_total = 0;
180
181         for (i = 0; i < n_cpus; i++) {
182                 miss_total += stats[i].missing_faults;
183                 wp_total += stats[i].wp_faults;
184                 minor_total += stats[i].minor_faults;
185         }
186
187         printf("userfaults: ");
188         if (miss_total) {
189                 printf("%llu missing (", miss_total);
190                 for (i = 0; i < n_cpus; i++)
191                         printf("%lu+", stats[i].missing_faults);
192                 printf("\b) ");
193         }
194         if (wp_total) {
195                 printf("%llu wp (", wp_total);
196                 for (i = 0; i < n_cpus; i++)
197                         printf("%lu+", stats[i].wp_faults);
198                 printf("\b) ");
199         }
200         if (minor_total) {
201                 printf("%llu minor (", minor_total);
202                 for (i = 0; i < n_cpus; i++)
203                         printf("%lu+", stats[i].minor_faults);
204                 printf("\b)");
205         }
206         printf("\n");
207 }
208
209 static void anon_release_pages(char *rel_area)
210 {
211         if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED))
212                 err("madvise(MADV_DONTNEED) failed");
213 }
214
215 static void anon_allocate_area(void **alloc_area)
216 {
217         *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
218                            MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
219         if (*alloc_area == MAP_FAILED)
220                 err("mmap of anonymous memory failed");
221 }
222
/*
 * alias_mapping hook for backends without an alias mapping: *start is
 * left untouched.
 */
static void noop_alias_mapping(__u64 *start, size_t len, unsigned long offset)
{
	(void)start;
	(void)len;
	(void)offset;
}
226
227 static void hugetlb_release_pages(char *rel_area)
228 {
229         if (!map_shared) {
230                 if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED))
231                         err("madvise(MADV_DONTNEED) failed");
232         } else {
233                 if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE))
234                         err("madvise(MADV_REMOVE) failed");
235         }
236 }
237
238 static void hugetlb_allocate_area(void **alloc_area)
239 {
240         void *area_alias = NULL;
241         char **alloc_area_alias;
242
243         if (!map_shared)
244                 *alloc_area = mmap(NULL,
245                         nr_pages * page_size,
246                         PROT_READ | PROT_WRITE,
247                         MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB |
248                                 (*alloc_area == area_src ? 0 : MAP_NORESERVE),
249                         -1,
250                         0);
251         else
252                 *alloc_area = mmap(NULL,
253                         nr_pages * page_size,
254                         PROT_READ | PROT_WRITE,
255                         MAP_SHARED |
256                                 (*alloc_area == area_src ? 0 : MAP_NORESERVE),
257                         huge_fd,
258                         *alloc_area == area_src ? 0 : nr_pages * page_size);
259         if (*alloc_area == MAP_FAILED)
260                 err("mmap of hugetlbfs file failed");
261
262         if (map_shared) {
263                 area_alias = mmap(NULL,
264                         nr_pages * page_size,
265                         PROT_READ | PROT_WRITE,
266                         MAP_SHARED,
267                         huge_fd,
268                         *alloc_area == area_src ? 0 : nr_pages * page_size);
269                 if (area_alias == MAP_FAILED)
270                         err("mmap of hugetlb file alias failed");
271         }
272
273         if (*alloc_area == area_src) {
274                 alloc_area_alias = &area_src_alias;
275         } else {
276                 alloc_area_alias = &area_dst_alias;
277         }
278         if (area_alias)
279                 *alloc_area_alias = area_alias;
280 }
281
282 static void hugetlb_alias_mapping(__u64 *start, size_t len, unsigned long offset)
283 {
284         if (!map_shared)
285                 return;
286
287         *start = (unsigned long) area_dst_alias + offset;
288 }
289
290 static void shmem_release_pages(char *rel_area)
291 {
292         if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE))
293                 err("madvise(MADV_REMOVE) failed");
294 }
295
296 static void shmem_allocate_area(void **alloc_area)
297 {
298         void *area_alias = NULL;
299         bool is_src = alloc_area == (void **)&area_src;
300         unsigned long offset = is_src ? 0 : nr_pages * page_size;
301
302         *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
303                            MAP_SHARED, shm_fd, offset);
304         if (*alloc_area == MAP_FAILED)
305                 err("mmap of memfd failed");
306
307         area_alias = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
308                           MAP_SHARED, shm_fd, offset);
309         if (area_alias == MAP_FAILED)
310                 err("mmap of memfd alias failed");
311
312         if (is_src)
313                 area_src_alias = area_alias;
314         else
315                 area_dst_alias = area_alias;
316 }
317
318 static void shmem_alias_mapping(__u64 *start, size_t len, unsigned long offset)
319 {
320         *start = (unsigned long)area_dst_alias + offset;
321 }
322
323 struct uffd_test_ops {
324         void (*allocate_area)(void **alloc_area);
325         void (*release_pages)(char *rel_area);
326         void (*alias_mapping)(__u64 *start, size_t len, unsigned long offset);
327 };
328
329 static struct uffd_test_ops anon_uffd_test_ops = {
330         .allocate_area  = anon_allocate_area,
331         .release_pages  = anon_release_pages,
332         .alias_mapping = noop_alias_mapping,
333 };
334
335 static struct uffd_test_ops shmem_uffd_test_ops = {
336         .allocate_area  = shmem_allocate_area,
337         .release_pages  = shmem_release_pages,
338         .alias_mapping = shmem_alias_mapping,
339 };
340
341 static struct uffd_test_ops hugetlb_uffd_test_ops = {
342         .allocate_area  = hugetlb_allocate_area,
343         .release_pages  = hugetlb_release_pages,
344         .alias_mapping = hugetlb_alias_mapping,
345 };
346
347 static struct uffd_test_ops *uffd_test_ops;
348
349 static inline uint64_t uffd_minor_feature(void)
350 {
351         if (test_type == TEST_HUGETLB && map_shared)
352                 return UFFD_FEATURE_MINOR_HUGETLBFS;
353         else if (test_type == TEST_SHMEM)
354                 return UFFD_FEATURE_MINOR_SHMEM;
355         else
356                 return 0;
357 }
358
359 static uint64_t get_expected_ioctls(uint64_t mode)
360 {
361         uint64_t ioctls = UFFD_API_RANGE_IOCTLS;
362
363         if (test_type == TEST_HUGETLB)
364                 ioctls &= ~(1 << _UFFDIO_ZEROPAGE);
365
366         if (!((mode & UFFDIO_REGISTER_MODE_WP) && test_uffdio_wp))
367                 ioctls &= ~(1 << _UFFDIO_WRITEPROTECT);
368
369         if (!((mode & UFFDIO_REGISTER_MODE_MINOR) && test_uffdio_minor))
370                 ioctls &= ~(1 << _UFFDIO_CONTINUE);
371
372         return ioctls;
373 }
374
/*
 * Exit with an error unless every ioctl bit expected for 'mode' (see
 * get_expected_ioctls()) is set in 'ioctls'.  Extra bits are ignored.
 */
static void assert_expected_ioctls_present(uint64_t mode, uint64_t ioctls)
{
	const uint64_t expected = get_expected_ioctls(mode);
	const uint64_t actual = ioctls & expected;

	if (actual == expected)
		return;

	err("missing ioctl(s): expected %"PRIx64" actual: %"PRIx64,
	    expected, actual);
}
385
386 static void userfaultfd_open(uint64_t *features)
387 {
388         struct uffdio_api uffdio_api;
389
390         uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY);
391         if (uffd < 0)
392                 err("userfaultfd syscall not available in this kernel");
393         uffd_flags = fcntl(uffd, F_GETFD, NULL);
394
395         uffdio_api.api = UFFD_API;
396         uffdio_api.features = *features;
397         if (ioctl(uffd, UFFDIO_API, &uffdio_api))
398                 err("UFFDIO_API failed.\nPlease make sure to "
399                     "run with either root or ptrace capability.");
400         if (uffdio_api.api != UFFD_API)
401                 err("UFFDIO_API error: %" PRIu64, (uint64_t)uffdio_api.api);
402
403         *features = uffdio_api.features;
404 }
405
406 static inline void munmap_area(void **area)
407 {
408         if (*area)
409                 if (munmap(*area, nr_pages * page_size))
410                         err("munmap");
411
412         *area = NULL;
413 }
414
415 static void uffd_test_ctx_clear(void)
416 {
417         size_t i;
418
419         if (pipefd) {
420                 for (i = 0; i < nr_cpus * 2; ++i) {
421                         if (close(pipefd[i]))
422                                 err("close pipefd");
423                 }
424                 free(pipefd);
425                 pipefd = NULL;
426         }
427
428         if (count_verify) {
429                 free(count_verify);
430                 count_verify = NULL;
431         }
432
433         if (uffd != -1) {
434                 if (close(uffd))
435                         err("close uffd");
436                 uffd = -1;
437         }
438
439         munmap_area((void **)&area_src);
440         munmap_area((void **)&area_src_alias);
441         munmap_area((void **)&area_dst);
442         munmap_area((void **)&area_dst_alias);
443 }
444
445 static void uffd_test_ctx_init(uint64_t features)
446 {
447         unsigned long nr, cpu;
448
449         uffd_test_ctx_clear();
450
451         uffd_test_ops->allocate_area((void **)&area_src);
452         uffd_test_ops->allocate_area((void **)&area_dst);
453
454         userfaultfd_open(&features);
455
456         count_verify = malloc(nr_pages * sizeof(unsigned long long));
457         if (!count_verify)
458                 err("count_verify");
459
460         for (nr = 0; nr < nr_pages; nr++) {
461                 *area_mutex(area_src, nr) =
462                         (pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER;
463                 count_verify[nr] = *area_count(area_src, nr) = 1;
464                 /*
465                  * In the transition between 255 to 256, powerpc will
466                  * read out of order in my_bcmp and see both bytes as
467                  * zero, so leave a placeholder below always non-zero
468                  * after the count, to avoid my_bcmp to trigger false
469                  * positives.
470                  */
471                 *(area_count(area_src, nr) + 1) = 1;
472         }
473
474         /*
475          * After initialization of area_src, we must explicitly release pages
476          * for area_dst to make sure it's fully empty.  Otherwise we could have
477          * some area_dst pages be errornously initialized with zero pages,
478          * hence we could hit memory corruption later in the test.
479          *
480          * One example is when THP is globally enabled, above allocate_area()
481          * calls could have the two areas merged into a single VMA (as they
482          * will have the same VMA flags so they're mergeable).  When we
483          * initialize the area_src above, it's possible that some part of
484          * area_dst could have been faulted in via one huge THP that will be
485          * shared between area_src and area_dst.  It could cause some of the
486          * area_dst won't be trapped by missing userfaults.
487          *
488          * This release_pages() will guarantee even if that happened, we'll
489          * proactively split the thp and drop any accidentally initialized
490          * pages within area_dst.
491          */
492         uffd_test_ops->release_pages(area_dst);
493
494         pipefd = malloc(sizeof(int) * nr_cpus * 2);
495         if (!pipefd)
496                 err("pipefd");
497         for (cpu = 0; cpu < nr_cpus; cpu++)
498                 if (pipe2(&pipefd[cpu * 2], O_CLOEXEC | O_NONBLOCK))
499                         err("pipe");
500 }
501
/*
 * Compare the first n bytes of str1 and str2 one byte at a time, in
 * ascending address order.
 *
 * Returns 0 if all n bytes match (including n == 0), 1 at the first
 * mismatch.
 */
static int my_bcmp(char *str1, char *str2, size_t n)
{
	const char *lhs = str1;
	const char *rhs = str2;
	const char *const lhs_end = str1 + n;

	while (lhs != lhs_end) {
		if (*lhs++ != *rhs++)
			return 1;
	}
	return 0;
}
510
/*
 * Set or clear userfaultfd write-protection on [start, start + len).
 *
 * @ufd:   userfaultfd descriptor.
 * @start: first address of the range.
 * @len:   length of the range in bytes.
 * @wp:    true to write-protect the range, false to remove the
 *         protection (mode 0).
 *
 * Exits on ioctl failure; the message names the operation that was
 * actually attempted.
 */
static void wp_range(int ufd, __u64 start, __u64 len, bool wp)
{
	struct uffdio_writeprotect prms;

	prms.range.start = start;
	prms.range.len = len;
	prms.mode = wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0;

	if (ioctl(ufd, UFFDIO_WRITEPROTECT, &prms))
		err("%s WP failed: address=0x%"PRIx64,
		    wp ? "set" : "clear", (uint64_t)start);
}
524
/*
 * Resolve a minor fault on [start, start + len) with UFFDIO_CONTINUE,
 * then issue the same request a second time and require it to fail with
 * -EEXIST reported in req.mapped.
 *
 * Exits if the first ioctl fails or if the second one does not fail as
 * expected.
 */
static void continue_range(int ufd, __u64 start, __u64 len)
{
	struct uffdio_continue req;
	/*
	 * Not named 'ret': the _err() macro declares its own local 'ret',
	 * which would shadow ours inside the err() call below and make the
	 * message print errno instead of the ioctl return value.
	 */
	int ioctl_rc;

	req.range.start = start;
	req.range.len = len;
	req.mode = 0;

	if (ioctl(ufd, UFFDIO_CONTINUE, &req))
		err("UFFDIO_CONTINUE failed for address 0x%" PRIx64,
		    (uint64_t)start);

	/*
	 * Error handling within the kernel for continue is subtly different
	 * from copy or zeropage, so it may be a source of bugs. Trigger an
	 * error (-EEXIST) on purpose, to verify doing so doesn't cause a BUG.
	 */
	req.mapped = 0;
	ioctl_rc = ioctl(ufd, UFFDIO_CONTINUE, &req);
	if (ioctl_rc >= 0 || req.mapped != -EEXIST)
		err("failed to exercise UFFDIO_CONTINUE error handling, ret=%d, mapped=%" PRId64,
		    ioctl_rc, (int64_t) req.mapped);
}
549
550 static void *locking_thread(void *arg)
551 {
552         unsigned long cpu = (unsigned long) arg;
553         unsigned long page_nr;
554         unsigned long long count;
555
556         if (!(bounces & BOUNCE_RANDOM)) {
557                 page_nr = -bounces;
558                 if (!(bounces & BOUNCE_RACINGFAULTS))
559                         page_nr += cpu * nr_pages_per_cpu;
560         }
561
562         while (!finished) {
563                 if (bounces & BOUNCE_RANDOM) {
564                         if (getrandom(&page_nr, sizeof(page_nr), 0) != sizeof(page_nr))
565                                 err("getrandom failed");
566                 } else
567                         page_nr += 1;
568                 page_nr %= nr_pages;
569                 pthread_mutex_lock(area_mutex(area_dst, page_nr));
570                 count = *area_count(area_dst, page_nr);
571                 if (count != count_verify[page_nr])
572                         err("page_nr %lu memory corruption %llu %llu",
573                             page_nr, count, count_verify[page_nr]);
574                 count++;
575                 *area_count(area_dst, page_nr) = count_verify[page_nr] = count;
576                 pthread_mutex_unlock(area_mutex(area_dst, page_nr));
577         }
578
579         return NULL;
580 }
581
582 static void retry_copy_page(int ufd, struct uffdio_copy *uffdio_copy,
583                             unsigned long offset)
584 {
585         uffd_test_ops->alias_mapping(&uffdio_copy->dst,
586                                      uffdio_copy->len,
587                                      offset);
588         if (ioctl(ufd, UFFDIO_COPY, uffdio_copy)) {
589                 /* real retval in ufdio_copy.copy */
590                 if (uffdio_copy->copy != -EEXIST)
591                         err("UFFDIO_COPY retry error: %"PRId64,
592                             (int64_t)uffdio_copy->copy);
593         } else {
594                 err("UFFDIO_COPY retry unexpected: %"PRId64,
595                     (int64_t)uffdio_copy->copy);
596         }
597 }
598
/*
 * Wake any thread blocked on a userfault inside [addr, addr + len) via
 * UFFDIO_WAKE.  Exits on ioctl failure, reporting through err() like the
 * other ioctl wrappers in this file so errno and the line number are
 * included.
 */
static void wake_range(int ufd, unsigned long addr, unsigned long len)
{
	struct uffdio_range uffdio_wake;

	uffdio_wake.start = addr;
	uffdio_wake.len = len;

	if (ioctl(ufd, UFFDIO_WAKE, &uffdio_wake))
		err("error waking %lu", addr);
}
610
611 static int __copy_page(int ufd, unsigned long offset, bool retry)
612 {
613         struct uffdio_copy uffdio_copy;
614
615         if (offset >= nr_pages * page_size)
616                 err("unexpected offset %lu\n", offset);
617         uffdio_copy.dst = (unsigned long) area_dst + offset;
618         uffdio_copy.src = (unsigned long) area_src + offset;
619         uffdio_copy.len = page_size;
620         if (test_uffdio_wp)
621                 uffdio_copy.mode = UFFDIO_COPY_MODE_WP;
622         else
623                 uffdio_copy.mode = 0;
624         uffdio_copy.copy = 0;
625         if (ioctl(ufd, UFFDIO_COPY, &uffdio_copy)) {
626                 /* real retval in ufdio_copy.copy */
627                 if (uffdio_copy.copy != -EEXIST)
628                         err("UFFDIO_COPY error: %"PRId64,
629                             (int64_t)uffdio_copy.copy);
630                 wake_range(ufd, uffdio_copy.dst, page_size);
631         } else if (uffdio_copy.copy != page_size) {
632                 err("UFFDIO_COPY error: %"PRId64, (int64_t)uffdio_copy.copy);
633         } else {
634                 if (test_uffdio_copy_eexist && retry) {
635                         test_uffdio_copy_eexist = false;
636                         retry_copy_page(ufd, &uffdio_copy, offset);
637                 }
638                 return 1;
639         }
640         return 0;
641 }
642
/* __copy_page() with the one-shot -EEXIST retry exercise enabled. */
static int copy_page_retry(int ufd, unsigned long offset)
{
	const bool exercise_retry = true;

	return __copy_page(ufd, offset, exercise_retry);
}
647
/* __copy_page() without the -EEXIST retry exercise. */
static int copy_page(int ufd, unsigned long offset)
{
	const bool exercise_retry = false;

	return __copy_page(ufd, offset, exercise_retry);
}
652
/*
 * Read exactly one struct uffd_msg from the descriptor passed in.
 *
 * @ufd: userfaultfd descriptor to read from.  The descriptor argument is
 *       honoured (instead of silently reading the global 'uffd').
 * @msg: destination for the message.
 *
 * Returns 0 when a full message was read, 1 when the read should simply
 * be retried (EAGAIN or EINTR).  A short read or any other error
 * terminates the test.
 */
static int uffd_read_msg(int ufd, struct uffd_msg *msg)
{
	/* read() returns ssize_t; keep the full width for the comparison. */
	ssize_t nread = read(ufd, msg, sizeof(*msg));

	if (nread != (ssize_t)sizeof(*msg)) {
		if (nread < 0) {
			if (errno == EAGAIN || errno == EINTR)
				return 1;
			err("blocking read error");
		} else {
			err("short read");
		}
	}

	return 0;
}
669
670 static void uffd_handle_page_fault(struct uffd_msg *msg,
671                                    struct uffd_stats *stats)
672 {
673         unsigned long offset;
674
675         if (msg->event != UFFD_EVENT_PAGEFAULT)
676                 err("unexpected msg event %u", msg->event);
677
678         if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) {
679                 /* Write protect page faults */
680                 wp_range(uffd, msg->arg.pagefault.address, page_size, false);
681                 stats->wp_faults++;
682         } else if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR) {
683                 uint8_t *area;
684                 int b;
685
686                 /*
687                  * Minor page faults
688                  *
689                  * To prove we can modify the original range for testing
690                  * purposes, we're going to bit flip this range before
691                  * continuing.
692                  *
693                  * Note that this requires all minor page fault tests operate on
694                  * area_dst (non-UFFD-registered) and area_dst_alias
695                  * (UFFD-registered).
696                  */
697
698                 area = (uint8_t *)(area_dst +
699                                    ((char *)msg->arg.pagefault.address -
700                                     area_dst_alias));
701                 for (b = 0; b < page_size; ++b)
702                         area[b] = ~area[b];
703                 continue_range(uffd, msg->arg.pagefault.address, page_size);
704                 stats->minor_faults++;
705         } else {
706                 /* Missing page faults */
707                 if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
708                         err("unexpected write fault");
709
710                 offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst;
711                 offset &= ~(page_size-1);
712
713                 if (copy_page(uffd, offset))
714                         stats->missing_faults++;
715         }
716 }
717
718 static void *uffd_poll_thread(void *arg)
719 {
720         struct uffd_stats *stats = (struct uffd_stats *)arg;
721         unsigned long cpu = stats->cpu;
722         struct pollfd pollfd[2];
723         struct uffd_msg msg;
724         struct uffdio_register uffd_reg;
725         int ret;
726         char tmp_chr;
727
728         pollfd[0].fd = uffd;
729         pollfd[0].events = POLLIN;
730         pollfd[1].fd = pipefd[cpu*2];
731         pollfd[1].events = POLLIN;
732
733         for (;;) {
734                 ret = poll(pollfd, 2, -1);
735                 if (ret <= 0) {
736                         if (errno == EINTR || errno == EAGAIN)
737                                 continue;
738                         err("poll error: %d", ret);
739                 }
740                 if (pollfd[1].revents & POLLIN) {
741                         if (read(pollfd[1].fd, &tmp_chr, 1) != 1)
742                                 err("read pipefd error");
743                         break;
744                 }
745                 if (!(pollfd[0].revents & POLLIN))
746                         err("pollfd[0].revents %d", pollfd[0].revents);
747                 if (uffd_read_msg(uffd, &msg))
748                         continue;
749                 switch (msg.event) {
750                 default:
751                         err("unexpected msg event %u\n", msg.event);
752                         break;
753                 case UFFD_EVENT_PAGEFAULT:
754                         uffd_handle_page_fault(&msg, stats);
755                         break;
756                 case UFFD_EVENT_FORK:
757                         close(uffd);
758                         uffd = msg.arg.fork.ufd;
759                         pollfd[0].fd = uffd;
760                         break;
761                 case UFFD_EVENT_REMOVE:
762                         uffd_reg.range.start = msg.arg.remove.start;
763                         uffd_reg.range.len = msg.arg.remove.end -
764                                 msg.arg.remove.start;
765                         if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_reg.range))
766                                 err("remove failure");
767                         break;
768                 case UFFD_EVENT_REMAP:
769                         area_dst = (char *)(unsigned long)msg.arg.remap.to;
770                         break;
771                 }
772         }
773
774         return NULL;
775 }
776
777 pthread_mutex_t uffd_read_mutex = PTHREAD_MUTEX_INITIALIZER;
778
779 static void *uffd_read_thread(void *arg)
780 {
781         struct uffd_stats *stats = (struct uffd_stats *)arg;
782         struct uffd_msg msg;
783
784         pthread_mutex_unlock(&uffd_read_mutex);
785         /* from here cancellation is ok */
786
787         for (;;) {
788                 if (uffd_read_msg(uffd, &msg))
789                         continue;
790                 uffd_handle_page_fault(&msg, stats);
791         }
792
793         return NULL;
794 }
795
796 static void *background_thread(void *arg)
797 {
798         unsigned long cpu = (unsigned long) arg;
799         unsigned long page_nr, start_nr, mid_nr, end_nr;
800
801         start_nr = cpu * nr_pages_per_cpu;
802         end_nr = (cpu+1) * nr_pages_per_cpu;
803         mid_nr = (start_nr + end_nr) / 2;
804
805         /* Copy the first half of the pages */
806         for (page_nr = start_nr; page_nr < mid_nr; page_nr++)
807                 copy_page_retry(uffd, page_nr * page_size);
808
809         /*
810          * If we need to test uffd-wp, set it up now.  Then we'll have
811          * at least the first half of the pages mapped already which
812          * can be write-protected for testing
813          */
814         if (test_uffdio_wp)
815                 wp_range(uffd, (unsigned long)area_dst + start_nr * page_size,
816                         nr_pages_per_cpu * page_size, true);
817
818         /*
819          * Continue the 2nd half of the page copying, handling write
820          * protection faults if any
821          */
822         for (page_nr = mid_nr; page_nr < end_nr; page_nr++)
823                 copy_page_retry(uffd, page_nr * page_size);
824
825         return NULL;
826 }
827
828 static int stress(struct uffd_stats *uffd_stats)
829 {
830         unsigned long cpu;
831         pthread_t locking_threads[nr_cpus];
832         pthread_t uffd_threads[nr_cpus];
833         pthread_t background_threads[nr_cpus];
834
835         finished = 0;
836         for (cpu = 0; cpu < nr_cpus; cpu++) {
837                 if (pthread_create(&locking_threads[cpu], &attr,
838                                    locking_thread, (void *)cpu))
839                         return 1;
840                 if (bounces & BOUNCE_POLL) {
841                         if (pthread_create(&uffd_threads[cpu], &attr,
842                                            uffd_poll_thread,
843                                            (void *)&uffd_stats[cpu]))
844                                 return 1;
845                 } else {
846                         if (pthread_create(&uffd_threads[cpu], &attr,
847                                            uffd_read_thread,
848                                            (void *)&uffd_stats[cpu]))
849                                 return 1;
850                         pthread_mutex_lock(&uffd_read_mutex);
851                 }
852                 if (pthread_create(&background_threads[cpu], &attr,
853                                    background_thread, (void *)cpu))
854                         return 1;
855         }
856         for (cpu = 0; cpu < nr_cpus; cpu++)
857                 if (pthread_join(background_threads[cpu], NULL))
858                         return 1;
859
860         /*
861          * Be strict and immediately zap area_src, the whole area has
862          * been transferred already by the background treads. The
863          * area_src could then be faulted in in a racy way by still
864          * running uffdio_threads reading zeropages after we zapped
865          * area_src (but they're guaranteed to get -EEXIST from
866          * UFFDIO_COPY without writing zero pages into area_dst
867          * because the background threads already completed).
868          */
869         uffd_test_ops->release_pages(area_src);
870
871         finished = 1;
872         for (cpu = 0; cpu < nr_cpus; cpu++)
873                 if (pthread_join(locking_threads[cpu], NULL))
874                         return 1;
875
876         for (cpu = 0; cpu < nr_cpus; cpu++) {
877                 char c;
878                 if (bounces & BOUNCE_POLL) {
879                         if (write(pipefd[cpu*2+1], &c, 1) != 1)
880                                 err("pipefd write error");
881                         if (pthread_join(uffd_threads[cpu],
882                                          (void *)&uffd_stats[cpu]))
883                                 return 1;
884                 } else {
885                         if (pthread_cancel(uffd_threads[cpu]))
886                                 return 1;
887                         if (pthread_join(uffd_threads[cpu], NULL))
888                                 return 1;
889                 }
890         }
891
892         return 0;
893 }
894
sigjmp_buf jbuf, *sigbuf;

/*
 * SIGBUS handler used by the signal tests.
 *
 * Any signal other than SIGBUS is ignored.  On SIGBUS control is
 * transferred back to the sigsetjmp() point recorded in *sigbuf; if no
 * jump buffer has been published the process aborts.
 */
static void sighndl(int sig, siginfo_t *siginfo, void *ptr)
{
	if (sig != SIGBUS)
		return;

	if (!sigbuf)
		abort();

	siglongjmp(*sigbuf, 1);
}
905
906 /*
907  * For non-cooperative userfaultfd test we fork() a process that will
908  * generate pagefaults, will mremap the area monitored by the
909  * userfaultfd and at last this process will release the monitored
910  * area.
911  * For the anonymous and shared memory the area is divided into two
912  * parts, the first part is accessed before mremap, and the second
913  * part is accessed after mremap. Since hugetlbfs does not support
914  * mremap, the entire monitored area is accessed in a single pass for
915  * HUGETLB_TEST.
916  * The release of the pages currently generates event for shmem and
917  * anonymous memory (UFFD_EVENT_REMOVE), hence it is not checked
918  * for hugetlb.
919  * For signal test(UFFD_FEATURE_SIGBUS), signal_test = 1, we register
920  * monitored area, generate pagefaults and test that signal is delivered.
921  * Use UFFDIO_COPY to allocate missing page and retry. For signal_test = 2
922  * test robustness use case - we release monitored area, fork a process
923  * that will generate pagefaults and verify signal is generated.
924  * This also tests UFFD_FEATURE_EVENT_FORK event along with the signal
925  * feature. Using monitor thread, verify no userfault events are generated.
926  */
927 static int faulting_process(int signal_test)
928 {
929         unsigned long nr;
930         unsigned long long count;
931         unsigned long split_nr_pages;
932         unsigned long lastnr;
933         struct sigaction act;
934         unsigned long signalled = 0;
935
936         split_nr_pages = (nr_pages + 1) / 2;
937
938         if (signal_test) {
939                 sigbuf = &jbuf;
940                 memset(&act, 0, sizeof(act));
941                 act.sa_sigaction = sighndl;
942                 act.sa_flags = SA_SIGINFO;
943                 if (sigaction(SIGBUS, &act, 0))
944                         err("sigaction");
945                 lastnr = (unsigned long)-1;
946         }
947
948         for (nr = 0; nr < split_nr_pages; nr++) {
949                 int steps = 1;
950                 unsigned long offset = nr * page_size;
951
952                 if (signal_test) {
953                         if (sigsetjmp(*sigbuf, 1) != 0) {
954                                 if (steps == 1 && nr == lastnr)
955                                         err("Signal repeated");
956
957                                 lastnr = nr;
958                                 if (signal_test == 1) {
959                                         if (steps == 1) {
960                                                 /* This is a MISSING request */
961                                                 steps++;
962                                                 if (copy_page(uffd, offset))
963                                                         signalled++;
964                                         } else {
965                                                 /* This is a WP request */
966                                                 assert(steps == 2);
967                                                 wp_range(uffd,
968                                                          (__u64)area_dst +
969                                                          offset,
970                                                          page_size, false);
971                                         }
972                                 } else {
973                                         signalled++;
974                                         continue;
975                                 }
976                         }
977                 }
978
979                 count = *area_count(area_dst, nr);
980                 if (count != count_verify[nr])
981                         err("nr %lu memory corruption %llu %llu\n",
982                             nr, count, count_verify[nr]);
983                 /*
984                  * Trigger write protection if there is by writing
985                  * the same value back.
986                  */
987                 *area_count(area_dst, nr) = count;
988         }
989
990         if (signal_test)
991                 return signalled != split_nr_pages;
992
993         area_dst = mremap(area_dst, nr_pages * page_size,  nr_pages * page_size,
994                           MREMAP_MAYMOVE | MREMAP_FIXED, area_src);
995         if (area_dst == MAP_FAILED)
996                 err("mremap");
997         /* Reset area_src since we just clobbered it */
998         area_src = NULL;
999
1000         for (; nr < nr_pages; nr++) {
1001                 count = *area_count(area_dst, nr);
1002                 if (count != count_verify[nr]) {
1003                         err("nr %lu memory corruption %llu %llu\n",
1004                             nr, count, count_verify[nr]);
1005                 }
1006                 /*
1007                  * Trigger write protection if there is by writing
1008                  * the same value back.
1009                  */
1010                 *area_count(area_dst, nr) = count;
1011         }
1012
1013         uffd_test_ops->release_pages(area_dst);
1014
1015         for (nr = 0; nr < nr_pages; nr++)
1016                 if (my_bcmp(area_dst + nr * page_size, zeropage, page_size))
1017                         err("nr %lu is not zero", nr);
1018
1019         return 0;
1020 }
1021
1022 static void retry_uffdio_zeropage(int ufd,
1023                                   struct uffdio_zeropage *uffdio_zeropage,
1024                                   unsigned long offset)
1025 {
1026         uffd_test_ops->alias_mapping(&uffdio_zeropage->range.start,
1027                                      uffdio_zeropage->range.len,
1028                                      offset);
1029         if (ioctl(ufd, UFFDIO_ZEROPAGE, uffdio_zeropage)) {
1030                 if (uffdio_zeropage->zeropage != -EEXIST)
1031                         err("UFFDIO_ZEROPAGE error: %"PRId64,
1032                             (int64_t)uffdio_zeropage->zeropage);
1033         } else {
1034                 err("UFFDIO_ZEROPAGE error: %"PRId64,
1035                     (int64_t)uffdio_zeropage->zeropage);
1036         }
1037 }
1038
1039 static int __uffdio_zeropage(int ufd, unsigned long offset, bool retry)
1040 {
1041         struct uffdio_zeropage uffdio_zeropage;
1042         int ret;
1043         bool has_zeropage = get_expected_ioctls(0) & (1 << _UFFDIO_ZEROPAGE);
1044         __s64 res;
1045
1046         if (offset >= nr_pages * page_size)
1047                 err("unexpected offset %lu", offset);
1048         uffdio_zeropage.range.start = (unsigned long) area_dst + offset;
1049         uffdio_zeropage.range.len = page_size;
1050         uffdio_zeropage.mode = 0;
1051         ret = ioctl(ufd, UFFDIO_ZEROPAGE, &uffdio_zeropage);
1052         res = uffdio_zeropage.zeropage;
1053         if (ret) {
1054                 /* real retval in ufdio_zeropage.zeropage */
1055                 if (has_zeropage)
1056                         err("UFFDIO_ZEROPAGE error: %"PRId64, (int64_t)res);
1057                 else if (res != -EINVAL)
1058                         err("UFFDIO_ZEROPAGE not -EINVAL");
1059         } else if (has_zeropage) {
1060                 if (res != page_size) {
1061                         err("UFFDIO_ZEROPAGE unexpected size");
1062                 } else {
1063                         if (test_uffdio_zeropage_eexist && retry) {
1064                                 test_uffdio_zeropage_eexist = false;
1065                                 retry_uffdio_zeropage(ufd, &uffdio_zeropage,
1066                                                       offset);
1067                         }
1068                         return 1;
1069                 }
1070         } else
1071                 err("UFFDIO_ZEROPAGE succeeded");
1072
1073         return 0;
1074 }
1075
/*
 * Convenience wrapper: UFFDIO_ZEROPAGE on @offset with the -EEXIST
 * retry step disabled.  Returns whatever __uffdio_zeropage() returns.
 */
static int uffdio_zeropage(int ufd, unsigned long offset)
{
	const bool retry = false;

	return __uffdio_zeropage(ufd, offset, retry);
}
1080
1081 /* exercise UFFDIO_ZEROPAGE */
1082 static int userfaultfd_zeropage_test(void)
1083 {
1084         struct uffdio_register uffdio_register;
1085
1086         printf("testing UFFDIO_ZEROPAGE: ");
1087         fflush(stdout);
1088
1089         uffd_test_ctx_init(0);
1090
1091         uffdio_register.range.start = (unsigned long) area_dst;
1092         uffdio_register.range.len = nr_pages * page_size;
1093         uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
1094         if (test_uffdio_wp)
1095                 uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
1096         if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
1097                 err("register failure");
1098
1099         assert_expected_ioctls_present(
1100                 uffdio_register.mode, uffdio_register.ioctls);
1101
1102         if (uffdio_zeropage(uffd, 0))
1103                 if (my_bcmp(area_dst, zeropage, page_size))
1104                         err("zeropage is not zero");
1105
1106         printf("done.\n");
1107         return 0;
1108 }
1109
1110 static int userfaultfd_events_test(void)
1111 {
1112         struct uffdio_register uffdio_register;
1113         pthread_t uffd_mon;
1114         int err, features;
1115         pid_t pid;
1116         char c;
1117         struct uffd_stats stats = { 0 };
1118
1119         printf("testing events (fork, remap, remove): ");
1120         fflush(stdout);
1121
1122         features = UFFD_FEATURE_EVENT_FORK | UFFD_FEATURE_EVENT_REMAP |
1123                 UFFD_FEATURE_EVENT_REMOVE;
1124         uffd_test_ctx_init(features);
1125
1126         fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
1127
1128         uffdio_register.range.start = (unsigned long) area_dst;
1129         uffdio_register.range.len = nr_pages * page_size;
1130         uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
1131         if (test_uffdio_wp)
1132                 uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
1133         if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
1134                 err("register failure");
1135
1136         assert_expected_ioctls_present(
1137                 uffdio_register.mode, uffdio_register.ioctls);
1138
1139         if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats))
1140                 err("uffd_poll_thread create");
1141
1142         pid = fork();
1143         if (pid < 0)
1144                 err("fork");
1145
1146         if (!pid)
1147                 exit(faulting_process(0));
1148
1149         waitpid(pid, &err, 0);
1150         if (err)
1151                 err("faulting process failed");
1152         if (write(pipefd[1], &c, sizeof(c)) != sizeof(c))
1153                 err("pipe write");
1154         if (pthread_join(uffd_mon, NULL))
1155                 return 1;
1156
1157         uffd_stats_report(&stats, 1);
1158
1159         return stats.missing_faults != nr_pages;
1160 }
1161
1162 static int userfaultfd_sig_test(void)
1163 {
1164         struct uffdio_register uffdio_register;
1165         unsigned long userfaults;
1166         pthread_t uffd_mon;
1167         int err, features;
1168         pid_t pid;
1169         char c;
1170         struct uffd_stats stats = { 0 };
1171
1172         printf("testing signal delivery: ");
1173         fflush(stdout);
1174
1175         features = UFFD_FEATURE_EVENT_FORK|UFFD_FEATURE_SIGBUS;
1176         uffd_test_ctx_init(features);
1177
1178         fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
1179
1180         uffdio_register.range.start = (unsigned long) area_dst;
1181         uffdio_register.range.len = nr_pages * page_size;
1182         uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
1183         if (test_uffdio_wp)
1184                 uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
1185         if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
1186                 err("register failure");
1187
1188         assert_expected_ioctls_present(
1189                 uffdio_register.mode, uffdio_register.ioctls);
1190
1191         if (faulting_process(1))
1192                 err("faulting process failed");
1193
1194         uffd_test_ops->release_pages(area_dst);
1195
1196         if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats))
1197                 err("uffd_poll_thread create");
1198
1199         pid = fork();
1200         if (pid < 0)
1201                 err("fork");
1202
1203         if (!pid)
1204                 exit(faulting_process(2));
1205
1206         waitpid(pid, &err, 0);
1207         if (err)
1208                 err("faulting process failed");
1209         if (write(pipefd[1], &c, sizeof(c)) != sizeof(c))
1210                 err("pipe write");
1211         if (pthread_join(uffd_mon, (void **)&userfaults))
1212                 return 1;
1213
1214         printf("done.\n");
1215         if (userfaults)
1216                 err("Signal test failed, userfaults: %ld", userfaults);
1217
1218         return userfaults != 0;
1219 }
1220
1221 static int userfaultfd_minor_test(void)
1222 {
1223         struct uffdio_register uffdio_register;
1224         unsigned long p;
1225         pthread_t uffd_mon;
1226         uint8_t expected_byte;
1227         void *expected_page;
1228         char c;
1229         struct uffd_stats stats = { 0 };
1230
1231         if (!test_uffdio_minor)
1232                 return 0;
1233
1234         printf("testing minor faults: ");
1235         fflush(stdout);
1236
1237         uffd_test_ctx_init(uffd_minor_feature());
1238
1239         uffdio_register.range.start = (unsigned long)area_dst_alias;
1240         uffdio_register.range.len = nr_pages * page_size;
1241         uffdio_register.mode = UFFDIO_REGISTER_MODE_MINOR;
1242         if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
1243                 err("register failure");
1244
1245         assert_expected_ioctls_present(
1246                 uffdio_register.mode, uffdio_register.ioctls);
1247
1248         /*
1249          * After registering with UFFD, populate the non-UFFD-registered side of
1250          * the shared mapping. This should *not* trigger any UFFD minor faults.
1251          */
1252         for (p = 0; p < nr_pages; ++p) {
1253                 memset(area_dst + (p * page_size), p % ((uint8_t)-1),
1254                        page_size);
1255         }
1256
1257         if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats))
1258                 err("uffd_poll_thread create");
1259
1260         /*
1261          * Read each of the pages back using the UFFD-registered mapping. We
1262          * expect that the first time we touch a page, it will result in a minor
1263          * fault. uffd_poll_thread will resolve the fault by bit-flipping the
1264          * page's contents, and then issuing a CONTINUE ioctl.
1265          */
1266
1267         if (posix_memalign(&expected_page, page_size, page_size))
1268                 err("out of memory");
1269
1270         for (p = 0; p < nr_pages; ++p) {
1271                 expected_byte = ~((uint8_t)(p % ((uint8_t)-1)));
1272                 memset(expected_page, expected_byte, page_size);
1273                 if (my_bcmp(expected_page, area_dst_alias + (p * page_size),
1274                             page_size))
1275                         err("unexpected page contents after minor fault");
1276         }
1277
1278         if (write(pipefd[1], &c, sizeof(c)) != sizeof(c))
1279                 err("pipe write");
1280         if (pthread_join(uffd_mon, NULL))
1281                 return 1;
1282
1283         uffd_stats_report(&stats, 1);
1284
1285         return stats.missing_faults != 0 || stats.minor_faults != nr_pages;
1286 }
1287
/*
 * Single-bit masks applied to the 64-bit entries read from
 * /proc/self/pagemap.  BIT_ULL(nr) yields an unsigned long long with
 * only bit @nr set.  The visible pagemap code tests PM_UFFD_WP and
 * PM_MMAP_EXCLUSIVE only.
 * NOTE(review): the per-bit meanings are presumably those listed in the
 * kernel's pagemap documentation -- confirm there before relying on the
 * remaining masks.
 */
#define BIT_ULL(nr)                   (1ULL << (nr))
#define PM_SOFT_DIRTY                 BIT_ULL(55)
#define PM_MMAP_EXCLUSIVE             BIT_ULL(56)
#define PM_UFFD_WP                    BIT_ULL(57)
#define PM_FILE                       BIT_ULL(61)
#define PM_SWAP                       BIT_ULL(62)
#define PM_PRESENT                    BIT_ULL(63)
1295
/*
 * Open this process' pagemap read-only.
 *
 * Returns the file descriptor; an open failure is reported through
 * err().
 */
static int pagemap_open(void)
{
	int pagemap_fd;

	pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
	if (pagemap_fd >= 0)
		return pagemap_fd;

	err("open pagemap");

	return pagemap_fd;
}
1305
/*
 * Read the pagemap entry that describes virtual address @vaddr.
 *
 * @fd:    descriptor of an open pagemap file (see pagemap_open()).
 * @vaddr: address whose entry is wanted.
 *
 * The pagemap file holds one 64-bit entry per base page, so the entry
 * index is the address divided by the system page size.  That size is
 * queried at run time rather than hard-coding a 4 KiB page shift.
 *
 * Returns the raw 64-bit entry; a short or failed read is fatal.
 */
static uint64_t pagemap_read_vaddr(int fd, void *vaddr)
{
	uint64_t value;
	ssize_t nread;
	long base_page_size = sysconf(_SC_PAGESIZE);

	if (base_page_size <= 0)
		err("sysconf(_SC_PAGESIZE) failed");

	nread = pread(fd, &value, sizeof(uint64_t),
		      ((uint64_t)(uintptr_t)vaddr / (uint64_t)base_page_size) *
		      sizeof(uint64_t));
	if (nread != sizeof(uint64_t))
		err("pread() on pagemap failed");

	return value;
}
1318
/*
 * Check that the uffd-wp bit of pagemap entry @value matches the
 * expected state @wp, and bail out through err() if it does not.
 *
 * This is a macro rather than a function so that __LINE__ in err()
 * refers to the caller.  Arguments are parenthesized so that arbitrary
 * expressions can be passed safely.
 */
#define  pagemap_check_wp(value, wp) do {				\
		if (!!((value) & PM_UFFD_WP) != (wp))			\
			err("pagemap uffd-wp bit error: 0x%"PRIx64, (value)); \
	} while (0)
1324
1325 static int pagemap_test_fork(bool present)
1326 {
1327         pid_t child = fork();
1328         uint64_t value;
1329         int fd, result;
1330
1331         if (!child) {
1332                 /* Open the pagemap fd of the child itself */
1333                 fd = pagemap_open();
1334                 value = pagemap_read_vaddr(fd, area_dst);
1335                 /*
1336                  * After fork() uffd-wp bit should be gone as long as we're
1337                  * without UFFD_FEATURE_EVENT_FORK
1338                  */
1339                 pagemap_check_wp(value, false);
1340                 /* Succeed */
1341                 exit(0);
1342         }
1343         waitpid(child, &result, 0);
1344         return result;
1345 }
1346
1347 static void userfaultfd_pagemap_test(unsigned int test_pgsize)
1348 {
1349         struct uffdio_register uffdio_register;
1350         int pagemap_fd;
1351         uint64_t value;
1352
1353         /* Pagemap tests uffd-wp only */
1354         if (!test_uffdio_wp)
1355                 return;
1356
1357         /* Not enough memory to test this page size */
1358         if (test_pgsize > nr_pages * page_size)
1359                 return;
1360
1361         printf("testing uffd-wp with pagemap (pgsize=%u): ", test_pgsize);
1362         /* Flush so it doesn't flush twice in parent/child later */
1363         fflush(stdout);
1364
1365         uffd_test_ctx_init(0);
1366
1367         if (test_pgsize > page_size) {
1368                 /* This is a thp test */
1369                 if (madvise(area_dst, nr_pages * page_size, MADV_HUGEPAGE))
1370                         err("madvise(MADV_HUGEPAGE) failed");
1371         } else if (test_pgsize == page_size) {
1372                 /* This is normal page test; force no thp */
1373                 if (madvise(area_dst, nr_pages * page_size, MADV_NOHUGEPAGE))
1374                         err("madvise(MADV_NOHUGEPAGE) failed");
1375         }
1376
1377         uffdio_register.range.start = (unsigned long) area_dst;
1378         uffdio_register.range.len = nr_pages * page_size;
1379         uffdio_register.mode = UFFDIO_REGISTER_MODE_WP;
1380         if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
1381                 err("register failed");
1382
1383         pagemap_fd = pagemap_open();
1384
1385         /* Touch the page */
1386         *area_dst = 1;
1387         wp_range(uffd, (uint64_t)area_dst, test_pgsize, true);
1388         value = pagemap_read_vaddr(pagemap_fd, area_dst);
1389         pagemap_check_wp(value, true);
1390         /* Make sure uffd-wp bit dropped when fork */
1391         if (pagemap_test_fork(true))
1392                 err("Detected stall uffd-wp bit in child");
1393
1394         /* Exclusive required or PAGEOUT won't work */
1395         if (!(value & PM_MMAP_EXCLUSIVE))
1396                 err("multiple mapping detected: 0x%"PRIx64, value);
1397
1398         if (madvise(area_dst, test_pgsize, MADV_PAGEOUT))
1399                 err("madvise(MADV_PAGEOUT) failed");
1400
1401         /* Uffd-wp should persist even swapped out */
1402         value = pagemap_read_vaddr(pagemap_fd, area_dst);
1403         pagemap_check_wp(value, true);
1404         /* Make sure uffd-wp bit dropped when fork */
1405         if (pagemap_test_fork(false))
1406                 err("Detected stall uffd-wp bit in child");
1407
1408         /* Unprotect; this tests swap pte modifications */
1409         wp_range(uffd, (uint64_t)area_dst, page_size, false);
1410         value = pagemap_read_vaddr(pagemap_fd, area_dst);
1411         pagemap_check_wp(value, false);
1412
1413         /* Fault in the page from disk */
1414         *area_dst = 2;
1415         value = pagemap_read_vaddr(pagemap_fd, area_dst);
1416         pagemap_check_wp(value, false);
1417
1418         close(pagemap_fd);
1419         printf("done\n");
1420 }
1421
1422 static int userfaultfd_stress(void)
1423 {
1424         void *area;
1425         char *tmp_area;
1426         unsigned long nr;
1427         struct uffdio_register uffdio_register;
1428         struct uffd_stats uffd_stats[nr_cpus];
1429
1430         uffd_test_ctx_init(0);
1431
1432         if (posix_memalign(&area, page_size, page_size))
1433                 err("out of memory");
1434         zeropage = area;
1435         bzero(zeropage, page_size);
1436
1437         pthread_mutex_lock(&uffd_read_mutex);
1438
1439         pthread_attr_init(&attr);
1440         pthread_attr_setstacksize(&attr, 16*1024*1024);
1441
1442         while (bounces--) {
1443                 printf("bounces: %d, mode:", bounces);
1444                 if (bounces & BOUNCE_RANDOM)
1445                         printf(" rnd");
1446                 if (bounces & BOUNCE_RACINGFAULTS)
1447                         printf(" racing");
1448                 if (bounces & BOUNCE_VERIFY)
1449                         printf(" ver");
1450                 if (bounces & BOUNCE_POLL)
1451                         printf(" poll");
1452                 else
1453                         printf(" read");
1454                 printf(", ");
1455                 fflush(stdout);
1456
1457                 if (bounces & BOUNCE_POLL)
1458                         fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
1459                 else
1460                         fcntl(uffd, F_SETFL, uffd_flags & ~O_NONBLOCK);
1461
1462                 /* register */
1463                 uffdio_register.range.start = (unsigned long) area_dst;
1464                 uffdio_register.range.len = nr_pages * page_size;
1465                 uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
1466                 if (test_uffdio_wp)
1467                         uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
1468                 if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
1469                         err("register failure");
1470                 assert_expected_ioctls_present(
1471                         uffdio_register.mode, uffdio_register.ioctls);
1472
1473                 if (area_dst_alias) {
1474                         uffdio_register.range.start = (unsigned long)
1475                                 area_dst_alias;
1476                         if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
1477                                 err("register failure alias");
1478                 }
1479
1480                 /*
1481                  * The madvise done previously isn't enough: some
1482                  * uffd_thread could have read userfaults (one of
1483                  * those already resolved by the background thread)
1484                  * and it may be in the process of calling
1485                  * UFFDIO_COPY. UFFDIO_COPY will read the zapped
1486                  * area_src and it would map a zero page in it (of
1487                  * course such a UFFDIO_COPY is perfectly safe as it'd
1488                  * return -EEXIST). The problem comes at the next
1489                  * bounce though: that racing UFFDIO_COPY would
1490                  * generate zeropages in the area_src, so invalidating
1491                  * the previous MADV_DONTNEED. Without this additional
1492                  * MADV_DONTNEED those zeropages leftovers in the
1493                  * area_src would lead to -EEXIST failure during the
1494                  * next bounce, effectively leaving a zeropage in the
1495                  * area_dst.
1496                  *
1497                  * Try to comment this out madvise to see the memory
1498                  * corruption being caught pretty quick.
1499                  *
1500                  * khugepaged is also inhibited to collapse THP after
1501                  * MADV_DONTNEED only after the UFFDIO_REGISTER, so it's
1502                  * required to MADV_DONTNEED here.
1503                  */
1504                 uffd_test_ops->release_pages(area_dst);
1505
1506                 uffd_stats_reset(uffd_stats, nr_cpus);
1507
1508                 /* bounce pass */
1509                 if (stress(uffd_stats))
1510                         return 1;
1511
1512                 /* Clear all the write protections if there is any */
1513                 if (test_uffdio_wp)
1514                         wp_range(uffd, (unsigned long)area_dst,
1515                                  nr_pages * page_size, false);
1516
1517                 /* unregister */
1518                 if (ioctl(uffd, UFFDIO_UNREGISTER, &uffdio_register.range))
1519                         err("unregister failure");
1520                 if (area_dst_alias) {
1521                         uffdio_register.range.start = (unsigned long) area_dst;
1522                         if (ioctl(uffd, UFFDIO_UNREGISTER,
1523                                   &uffdio_register.range))
1524                                 err("unregister failure alias");
1525                 }
1526
1527                 /* verification */
1528                 if (bounces & BOUNCE_VERIFY)
1529                         for (nr = 0; nr < nr_pages; nr++)
1530                                 if (*area_count(area_dst, nr) != count_verify[nr])
1531                                         err("error area_count %llu %llu %lu\n",
1532                                             *area_count(area_src, nr),
1533                                             count_verify[nr], nr);
1534
1535                 /* prepare next bounce */
1536                 tmp_area = area_src;
1537                 area_src = area_dst;
1538                 area_dst = tmp_area;
1539
1540                 tmp_area = area_src_alias;
1541                 area_src_alias = area_dst_alias;
1542                 area_dst_alias = tmp_area;
1543
1544                 uffd_stats_report(uffd_stats, nr_cpus);
1545         }
1546
1547         if (test_type == TEST_ANON) {
1548                 /*
1549                  * shmem/hugetlb won't be able to run since they have different
1550                  * behavior on fork() (file-backed memory normally drops ptes
1551                  * directly when fork), meanwhile the pagemap test will verify
1552                  * pgtable entry of fork()ed child.
1553                  */
1554                 userfaultfd_pagemap_test(page_size);
1555                 /*
1556                  * Hard-code for x86_64 for now for 2M THP, as x86_64 is
1557                  * currently the only one that supports uffd-wp
1558                  */
1559                 userfaultfd_pagemap_test(page_size * 512);
1560         }
1561
1562         return userfaultfd_zeropage_test() || userfaultfd_sig_test()
1563                 || userfaultfd_events_test() || userfaultfd_minor_test();
1564 }
1565
/*
 * Copied from mlock2-tests.c
 *
 * Scan /proc/meminfo for the "Hugepagesize:" line and return the value
 * converted from kB to bytes.  Returns 0 if the file cannot be opened
 * or no such line is found.
 */
unsigned long default_huge_page_size(void)
{
	FILE *meminfo;
	char *buf = NULL;
	size_t cap = 0;
	unsigned long kb = 0;
	unsigned long bytes = 0;

	meminfo = fopen("/proc/meminfo", "r");
	if (meminfo == NULL)
		return 0;

	for (;;) {
		if (getline(&buf, &cap, meminfo) <= 0)
			break;
		if (sscanf(buf, "Hugepagesize:       %lu kB", &kb) != 1)
			continue;
		/* kB -> bytes; the first matching line wins */
		bytes = kb << 10;
		break;
	}

	free(buf);
	fclose(meminfo);

	return bytes;
}
1589
1590 static void set_test_type(const char *type)
1591 {
1592         uint64_t features = UFFD_API_FEATURES;
1593
1594         if (!strcmp(type, "anon")) {
1595                 test_type = TEST_ANON;
1596                 uffd_test_ops = &anon_uffd_test_ops;
1597                 /* Only enable write-protect test for anonymous test */
1598                 test_uffdio_wp = true;
1599         } else if (!strcmp(type, "hugetlb")) {
1600                 test_type = TEST_HUGETLB;
1601                 uffd_test_ops = &hugetlb_uffd_test_ops;
1602         } else if (!strcmp(type, "hugetlb_shared")) {
1603                 map_shared = true;
1604                 test_type = TEST_HUGETLB;
1605                 uffd_test_ops = &hugetlb_uffd_test_ops;
1606                 /* Minor faults require shared hugetlb; only enable here. */
1607                 test_uffdio_minor = true;
1608         } else if (!strcmp(type, "shmem")) {
1609                 map_shared = true;
1610                 test_type = TEST_SHMEM;
1611                 uffd_test_ops = &shmem_uffd_test_ops;
1612                 test_uffdio_minor = true;
1613         } else {
1614                 err("Unknown test type: %s", type);
1615         }
1616
1617         if (test_type == TEST_HUGETLB)
1618                 page_size = default_huge_page_size();
1619         else
1620                 page_size = sysconf(_SC_PAGE_SIZE);
1621
1622         if (!page_size)
1623                 err("Unable to determine page size");
1624         if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) * 2
1625             > page_size)
1626                 err("Impossible to run this test");
1627
1628         /*
1629          * Whether we can test certain features depends not just on test type,
1630          * but also on whether or not this particular kernel supports the
1631          * feature.
1632          */
1633
1634         userfaultfd_open(&features);
1635
1636         test_uffdio_wp = test_uffdio_wp &&
1637                 (features & UFFD_FEATURE_PAGEFAULT_FLAG_WP);
1638         test_uffdio_minor = test_uffdio_minor &&
1639                 (features & uffd_minor_feature());
1640
1641         close(uffd);
1642         uffd = -1;
1643 }
1644
1645 static void sigalrm(int sig)
1646 {
1647         if (sig != SIGALRM)
1648                 abort();
1649         test_uffdio_copy_eexist = true;
1650         test_uffdio_zeropage_eexist = true;
1651         alarm(ALARM_INTERVAL_SECS);
1652 }
1653
1654 int main(int argc, char **argv)
1655 {
1656         if (argc < 4)
1657                 usage();
1658
1659         if (signal(SIGALRM, sigalrm) == SIG_ERR)
1660                 err("failed to arm SIGALRM");
1661         alarm(ALARM_INTERVAL_SECS);
1662
1663         set_test_type(argv[1]);
1664
1665         nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
1666         nr_pages_per_cpu = atol(argv[2]) * 1024*1024 / page_size /
1667                 nr_cpus;
1668         if (!nr_pages_per_cpu) {
1669                 _err("invalid MiB");
1670                 usage();
1671         }
1672
1673         bounces = atoi(argv[3]);
1674         if (bounces <= 0) {
1675                 _err("invalid bounces");
1676                 usage();
1677         }
1678         nr_pages = nr_pages_per_cpu * nr_cpus;
1679
1680         if (test_type == TEST_HUGETLB && map_shared) {
1681                 if (argc < 5)
1682                         usage();
1683                 huge_fd = open(argv[4], O_CREAT | O_RDWR, 0755);
1684                 if (huge_fd < 0)
1685                         err("Open of %s failed", argv[4]);
1686                 if (ftruncate(huge_fd, 0))
1687                         err("ftruncate %s to size 0 failed", argv[4]);
1688         } else if (test_type == TEST_SHMEM) {
1689                 shm_fd = memfd_create(argv[0], 0);
1690                 if (shm_fd < 0)
1691                         err("memfd_create");
1692                 if (ftruncate(shm_fd, nr_pages * page_size * 2))
1693                         err("ftruncate");
1694                 if (fallocate(shm_fd,
1695                               FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0,
1696                               nr_pages * page_size * 2))
1697                         err("fallocate");
1698         }
1699         printf("nr_pages: %lu, nr_pages_per_cpu: %lu\n",
1700                nr_pages, nr_pages_per_cpu);
1701         return userfaultfd_stress();
1702 }
1703
1704 #else /* __NR_userfaultfd */
1705
1706 #warning "missing __NR_userfaultfd definition"
1707
1708 int main(void)
1709 {
1710         printf("skip: Skipping userfaultfd test (missing __NR_userfaultfd)\n");
1711         return KSFT_SKIP;
1712 }
1713
1714 #endif /* __NR_userfaultfd */