~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

TOMOYO Linux Cross Reference
Linux/tools/testing/selftests/vm/userfaultfd.c

Version: ~ [ linux-5.19-rc8 ] ~ [ linux-5.18.14 ] ~ [ linux-5.17.15 ] ~ [ linux-5.16.20 ] ~ [ linux-5.15.57 ] ~ [ linux-5.14.21 ] ~ [ linux-5.13.19 ] ~ [ linux-5.12.19 ] ~ [ linux-5.11.22 ] ~ [ linux-5.10.133 ] ~ [ linux-5.9.16 ] ~ [ linux-5.8.18 ] ~ [ linux-5.7.19 ] ~ [ linux-5.6.19 ] ~ [ linux-5.5.19 ] ~ [ linux-5.4.207 ] ~ [ linux-5.3.18 ] ~ [ linux-5.2.21 ] ~ [ linux-5.1.21 ] ~ [ linux-5.0.21 ] ~ [ linux-4.20.17 ] ~ [ linux-4.19.253 ] ~ [ linux-4.18.20 ] ~ [ linux-4.17.19 ] ~ [ linux-4.16.18 ] ~ [ linux-4.15.18 ] ~ [ linux-4.14.289 ] ~ [ linux-4.13.16 ] ~ [ linux-4.12.14 ] ~ [ linux-4.11.12 ] ~ [ linux-4.10.17 ] ~ [ linux-4.9.324 ] ~ [ linux-4.8.17 ] ~ [ linux-4.7.10 ] ~ [ linux-4.6.7 ] ~ [ linux-4.5.7 ] ~ [ linux-4.4.302 ] ~ [ linux-4.3.6 ] ~ [ linux-4.2.8 ] ~ [ linux-4.1.52 ] ~ [ linux-4.0.9 ] ~ [ linux-3.10.108 ] ~ [ linux-2.6.32.71 ] ~ [ linux-2.6.0 ] ~ [ linux-2.4.37.11 ] ~ [ unix-v6-master ] ~ [ ccs-tools-1.8.9 ] ~ [ policy-sample ] ~
Architecture: ~ [ i386 ] ~ [ alpha ] ~ [ m68k ] ~ [ mips ] ~ [ ppc ] ~ [ sparc ] ~ [ sparc64 ] ~

  1 // SPDX-License-Identifier: GPL-2.0-only
  2 /*
  3  * Stress userfaultfd syscall.
  4  *
  5  *  Copyright (C) 2015  Red Hat, Inc.
  6  *
  7  * This test allocates two virtual areas and bounces the physical
  8  * memory across the two virtual areas (from area_src to area_dst)
  9  * using userfaultfd.
 10  *
 11  * There are three threads running per CPU:
 12  *
 13  * 1) one per-CPU thread takes a per-page pthread_mutex in a random
 14  *    page of the area_dst (while the physical page may still be in
 15  *    area_src), and increments a per-page counter in the same page,
 16  *    and checks its value against a verification region.
 17  *
 18  * 2) another per-CPU thread handles the userfaults generated by
 19  *    thread 1 above. userfaultfd blocking reads or poll() modes are
 20  *    exercised interleaved.
 21  *
 22  * 3) one last per-CPU thread transfers the memory in the background
 23  *    at maximum bandwidth (if not already transferred by thread
 24  *    2). Each cpu thread takes care of transferring a portion of the
 25  *    area.
 26  *
 27  * When all threads of type 3 completed the transfer, one bounce is
 28  * complete. area_src and area_dst are then swapped. All threads are
 29  * respawned and so the bounce is immediately restarted in the
 30  * opposite direction.
 31  *
 32  * per-CPU threads 1 by triggering userfaults inside
 33  * pthread_mutex_lock will also verify the atomicity of the memory
 34  * transfer (UFFDIO_COPY).
 35  */
 36 
 37 #define _GNU_SOURCE
 38 #include <stdio.h>
 39 #include <errno.h>
 40 #include <unistd.h>
 41 #include <stdlib.h>
 42 #include <sys/types.h>
 43 #include <sys/stat.h>
 44 #include <fcntl.h>
 45 #include <time.h>
 46 #include <signal.h>
 47 #include <poll.h>
 48 #include <string.h>
 49 #include <sys/mman.h>
 50 #include <sys/syscall.h>
 51 #include <sys/ioctl.h>
 52 #include <sys/wait.h>
 53 #include <pthread.h>
 54 #include <linux/userfaultfd.h>
 55 #include <setjmp.h>
 56 #include <stdbool.h>
 57 #include <assert.h>
 58 #include <inttypes.h>
 59 #include <stdint.h>
 60 
 61 #include "../kselftest.h"
 62 
 63 #ifdef __NR_userfaultfd
 64 
 65 static unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size;
 66 
    /* Bit flags tested against 'bounces' to select how one bounce behaves */
 67 #define BOUNCE_RANDOM           (1<<0)  /* locking threads pick pages with random_r() */
 68 #define BOUNCE_RACINGFAULTS     (1<<1)  /* when clear, each locking thread adds a per-cpu seed/start offset */
 69 #define BOUNCE_VERIFY           (1<<2)  /* locking threads pre-check count and non-zero page contents */
 70 #define BOUNCE_POLL             (1<<3)  /* stress() spawns uffd_poll_thread instead of uffd_read_thread */
 71 static int bounces;
 72 
    /* Values for test_type; NOTE(review): where test_type is set/read is outside this view */
 73 #define TEST_ANON       1
 74 #define TEST_HUGETLB    2
 75 #define TEST_SHMEM      3
 76 static int test_type;
 77 
 78 /* exercise the test_uffdio_*_eexist every ALARM_INTERVAL_SECS */
 79 #define ALARM_INTERVAL_SECS 10
 80 static volatile bool test_uffdio_copy_eexist = true;    /* cleared by __copy_page() before its one retry */
 81 static volatile bool test_uffdio_zeropage_eexist = true;
 82 /* Whether to test uffd write-protection */
 83 static bool test_uffdio_wp = false;
 84 /* Whether to test uffd minor faults */
 85 static bool test_uffdio_minor = false;
 86 
 87 static bool map_shared;         /* hugetlb: MAP_SHARED (plus alias mappings) instead of MAP_PRIVATE */
 88 static int huge_fd;             /* hugetlbfs file used by the hugetlb_* helpers */
 89 static char *huge_fd_off0;      /* address of the hugetlb mapping that starts at file offset 0 */
 90 static unsigned long long *count_verify;        /* per-page expected counter, indexed by page number */
 91 static int uffd, uffd_flags, finished, *pipefd; /* pipefd[cpu*2] is polled, pipefd[cpu*2+1] is written by stress() */
 92 static char *area_src, *area_src_alias, *area_dst, *area_dst_alias;
 93 static char *zeropage;          /* compared against area_dst pages by locking_thread() */
 94 pthread_attr_t attr;            /* passed to every pthread_create() in stress() */
 95 
 96 /* Userfaultfd test statistics: one entry per fault-handling thread */
 97 struct uffd_stats {
 98         int cpu;                        /* array index, assigned by uffd_stats_reset() */
 99         unsigned long missing_faults;   /* incremented when copy_page() resolves a missing fault */
100         unsigned long wp_faults;        /* incremented per write-protect fault handled */
101         unsigned long minor_faults;     /* incremented per minor fault handled */
102 };
103 
104 /* The per-page pthread_mutex_t sits at offset 0 of page ___nr inside ___area */
105 #define area_mutex(___area, ___nr)                                      \
106         ((pthread_mutex_t *) ((___area) + (___nr)*page_size))
107 /*
108  * The per-page count is placed in the same page right after the pthread_mutex_t,
109  * rounded up to sizeof(unsigned long long) alignment to avoid unaligned-access faults on non-x86 archs.
110  */
111 #define area_count(___area, ___nr)                                      \
112         ((volatile unsigned long long *) ((unsigned long)               \
113                                  ((___area) + (___nr)*page_size +       \
114                                   sizeof(pthread_mutex_t) +             \
115                                   sizeof(unsigned long long) - 1) &     \
116                                  ~(unsigned long)(sizeof(unsigned long long) \
117                                                   -  1)))
118 
119 const char *examples =  /* sample command lines, printed verbatim by usage() */
120     "# Run anonymous memory test on 100MiB region with 99999 bounces:\n"
121     "./userfaultfd anon 100 99999\n\n"
122     "# Run share memory test on 1GiB region with 99 bounces:\n"
123     "./userfaultfd shmem 1000 99\n\n"
124     "# Run hugetlb memory test on 256MiB region with 50 bounces (using /dev/hugepages/hugefile):\n"
125     "./userfaultfd hugetlb 256 50 /dev/hugepages/hugefile\n\n"
126     "# Run the same hugetlb test but using shmem:\n"
127     "./userfaultfd hugetlb_shared 256 50 /dev/hugepages/hugefile\n\n"
128     "# 10MiB-~6GiB 999 bounces anonymous test, "
129     "continue forever unless an error triggers\n"
130     "while ./userfaultfd anon $[RANDOM % 6000 + 10] 999; do true; done\n\n";
131 
132 static void usage(void)
133 {
134         fprintf(stderr, "\nUsage: ./userfaultfd <test type> <MiB> <bounces> "
135                 "[hugetlbfs_file]\n\n");
136         fprintf(stderr, "Supported <test type>: anon, hugetlb, "
137                 "hugetlb_shared, shmem\n\n");
138         fprintf(stderr, "Examples:\n\n");
139         fprintf(stderr, "%s", examples);
140         exit(1);
141 }
142 
    /*
     * uffd_error(code, fmt, ...): print the printf-style message to stderr,
     * append ": <code>" (code cast to int64_t, printed in decimal) and a
     * newline, then exit(1).  Wrapped in do { } while (0) so it can be used
     * as a single statement.
     */
143 #define uffd_error(code, fmt, ...)                                             \
144         do {                                                                   \
145                 fprintf(stderr, fmt, ##__VA_ARGS__);                           \
146                 fprintf(stderr, ": %" PRId64 "\n", (int64_t)(code));           \
147                 exit(1);                                                       \
148         } while (0)
149 
150 static void uffd_stats_reset(struct uffd_stats *uffd_stats,
151                              unsigned long n_cpus)
152 {
153         int i;
154 
155         for (i = 0; i < n_cpus; i++) {
156                 uffd_stats[i].cpu = i;
157                 uffd_stats[i].missing_faults = 0;
158                 uffd_stats[i].wp_faults = 0;
159                 uffd_stats[i].minor_faults = 0;
160         }
161 }
162 
163 static void uffd_stats_report(struct uffd_stats *stats, int n_cpus)
164 {
165         int i;
166         unsigned long long miss_total = 0, wp_total = 0, minor_total = 0;
167 
168         for (i = 0; i < n_cpus; i++) {
169                 miss_total += stats[i].missing_faults;
170                 wp_total += stats[i].wp_faults;
171                 minor_total += stats[i].minor_faults;
172         }
173 
174         printf("userfaults: %llu missing (", miss_total);
175         for (i = 0; i < n_cpus; i++)
176                 printf("%lu+", stats[i].missing_faults);
177         printf("\b), %llu wp (", wp_total);
178         for (i = 0; i < n_cpus; i++)
179                 printf("%lu+", stats[i].wp_faults);
180         printf("\b), %llu minor (", minor_total);
181         for (i = 0; i < n_cpus; i++)
182                 printf("%lu+", stats[i].minor_faults);
183         printf("\b)\n");
184 }
185 
186 static int anon_release_pages(char *rel_area)
187 {
188         int ret = 0;
189 
190         if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED)) {
191                 perror("madvise");
192                 ret = 1;
193         }
194 
195         return ret;
196 }
197 
198 static void anon_allocate_area(void **alloc_area)
199 {
200         *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
201                            MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
202         if (*alloc_area == MAP_FAILED) {
203                 fprintf(stderr, "mmap of anonymous memory failed");
204                 *alloc_area = NULL;
205         }
206 }
207 
208 static void noop_alias_mapping(__u64 *start, size_t len, unsigned long offset)
209 {
            /*
             * Intentionally empty: this is the alias_mapping hook for the anon
             * and shmem ops tables, which leaves *start unchanged.
             */
210 }
211 
212 /* HugeTLB memory */
213 static int hugetlb_release_pages(char *rel_area)
214 {
215         int ret = 0;
216 
217         if (fallocate(huge_fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
218                                 rel_area == huge_fd_off0 ? 0 :
219                                 nr_pages * page_size,
220                                 nr_pages * page_size)) {
221                 perror("fallocate");
222                 ret = 1;
223         }
224 
225         return ret;
226 }
227 
    /*
     * Map nr_pages * page_size bytes of the hugetlbfs file huge_fd into
     * *alloc_area (MAP_SHARED when map_shared is set, MAP_PRIVATE otherwise).
     * The file offset is 0 when *alloc_area compares equal to area_src and
     * nr_pages * page_size otherwise.  With map_shared, a second MAP_SHARED
     * mapping of the same range is created and stored in area_src_alias or
     * area_dst_alias.  On any mmap failure *alloc_area is set to NULL.
     *
     * NOTE(review): the "*alloc_area == area_src" tests compare pointer
     * values, and the first one is evaluated before *alloc_area is
     * overwritten; presumably callers pass &area_src or &area_dst so that the
     * test tells the two areas apart - confirm against the callers.
     */
228 static void hugetlb_allocate_area(void **alloc_area)
229 {
230         void *area_alias = NULL;
231         char **alloc_area_alias;
232 
233         *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
234                            (map_shared ? MAP_SHARED : MAP_PRIVATE) |
235                            MAP_HUGETLB,
236                            huge_fd, *alloc_area == area_src ? 0 :
237                            nr_pages * page_size);
238         if (*alloc_area == MAP_FAILED) {
239                 perror("mmap of hugetlbfs file failed");
240                 goto fail;
241         }
242 
243         if (map_shared) {
                    /* Second view of the same file range, kept as the alias area */
244                 area_alias = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
245                                   MAP_SHARED | MAP_HUGETLB,
246                                   huge_fd, *alloc_area == area_src ? 0 :
247                                   nr_pages * page_size);
248                 if (area_alias == MAP_FAILED) {
249                         perror("mmap of hugetlb file alias failed");
250                         goto fail_munmap;
251                 }
252         }
253 
254         if (*alloc_area == area_src) {
                    /* Remember which mapping starts at file offset 0 (read by hugetlb_release_pages()) */
255                 huge_fd_off0 = *alloc_area;
256                 alloc_area_alias = &area_src_alias;
257         } else {
258                 alloc_area_alias = &area_dst_alias;
259         }
260         if (area_alias)
261                 *alloc_area_alias = area_alias;
262 
263         return;
264 
265 fail_munmap:
            /* Alias mapping failed: undo the primary mapping; a munmap failure is fatal */
266         if (munmap(*alloc_area, nr_pages * page_size) < 0) {
267                 perror("hugetlb munmap");
268                 exit(1);
269         }
270 fail:
271         *alloc_area = NULL;
272 }
273 
274 static void hugetlb_alias_mapping(__u64 *start, size_t len, unsigned long offset)
275 {
276         if (!map_shared)
277                 return;
278         /*
279          * We can't zap just the pagetable with hugetlbfs because
280          * MADV_DONTEED won't work. So exercise -EEXIST on a alias
281          * mapping where the pagetables are not established initially,
282          * this way we'll exercise the -EEXEC at the fs level.
283          */
284         *start = (unsigned long) area_dst_alias + offset;
285 }
286 
287 /* Shared memory */
288 static int shmem_release_pages(char *rel_area)
289 {
290         int ret = 0;
291 
292         if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE)) {
293                 perror("madvise");
294                 ret = 1;
295         }
296 
297         return ret;
298 }
299 
300 static void shmem_allocate_area(void **alloc_area)
301 {
302         *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
303                            MAP_ANONYMOUS | MAP_SHARED, -1, 0);
304         if (*alloc_area == MAP_FAILED) {
305                 fprintf(stderr, "shared memory mmap failed\n");
306                 *alloc_area = NULL;
307         }
308 }
309 
310 struct uffd_test_ops {  /* per-memory-type hooks, selected through the uffd_test_ops pointer */
311         unsigned long expected_ioctls;  /* mask built from (1 << _UFFDIO_*) bits */
312         void (*allocate_area)(void **alloc_area);       /* map an area; stores NULL on failure */
313         int (*release_pages)(char *rel_area);           /* free an area's pages; returns 0 on success, 1 on error */
314         void (*alias_mapping)(__u64 *start, size_t len, unsigned long offset);  /* may redirect *start before the UFFDIO_COPY retry */
315 };
316 
    /*
     * Masks of (1 << _UFFDIO_*) bits stored in uffd_test_ops.expected_ioctls.
     * The anon mask is the shmem mask plus _UFFDIO_WRITEPROTECT.
     * NOTE(review): the code that checks these masks is outside this view.
     */
317 #define SHMEM_EXPECTED_IOCTLS           ((1 << _UFFDIO_WAKE) | \
318                                          (1 << _UFFDIO_COPY) | \
319                                          (1 << _UFFDIO_ZEROPAGE))
320 
321 #define ANON_EXPECTED_IOCTLS            ((1 << _UFFDIO_WAKE) | \
322                                          (1 << _UFFDIO_COPY) | \
323                                          (1 << _UFFDIO_ZEROPAGE) | \
324                                          (1 << _UFFDIO_WRITEPROTECT))
325 
326 static struct uffd_test_ops anon_uffd_test_ops = {      /* private anonymous memory */
327         .expected_ioctls = ANON_EXPECTED_IOCTLS,
328         .allocate_area  = anon_allocate_area,
329         .release_pages  = anon_release_pages,
330         .alias_mapping = noop_alias_mapping,
331 };
332 
333 static struct uffd_test_ops shmem_uffd_test_ops = {     /* shared anonymous memory */
334         .expected_ioctls = SHMEM_EXPECTED_IOCTLS,
335         .allocate_area  = shmem_allocate_area,
336         .release_pages  = shmem_release_pages,
337         .alias_mapping = noop_alias_mapping,
338 };
339 
340 static struct uffd_test_ops hugetlb_uffd_test_ops = {   /* hugetlbfs-backed memory */
341         .expected_ioctls = UFFD_API_RANGE_IOCTLS_BASIC & ~(1 << _UFFDIO_CONTINUE),      /* basic set with the CONTINUE bit masked out */
342         .allocate_area  = hugetlb_allocate_area,
343         .release_pages  = hugetlb_release_pages,
344         .alias_mapping = hugetlb_alias_mapping,
345 };
346 
347 static struct uffd_test_ops *uffd_test_ops;     /* ops table in use; dereferenced by retry_copy_page() and stress() */
348 
/*
 * Byte-by-byte comparison of two buffers.
 *
 * @str1, @str2: buffers to compare
 * @n:           number of bytes to compare
 *
 * Returns 0 when the first @n bytes are equal (always 0 for n == 0) and
 * 1 at the first differing byte.  The bytes are read one at a time, in
 * increasing address order.
 */
static int my_bcmp(char *str1, char *str2, size_t n)
{
        const char *lhs = str1;
        const char *rhs = str2;
        size_t remaining;

        for (remaining = n; remaining > 0; remaining--, lhs++, rhs++) {
                if (*lhs != *rhs)
                        return 1;
        }

        return 0;
}
357 
358 static void wp_range(int ufd, __u64 start, __u64 len, bool wp)
359 {
360         struct uffdio_writeprotect prms;
361 
362         /* Write protection page faults */
363         prms.range.start = start;
364         prms.range.len = len;
365         /* Undo write-protect, do wakeup after that */
366         prms.mode = wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0;
367 
368         if (ioctl(ufd, UFFDIO_WRITEPROTECT, &prms)) {
369                 fprintf(stderr, "clear WP failed for address 0x%" PRIx64 "\n",
370                         (uint64_t)start);
371                 exit(1);
372         }
373 }
374 
375 static void continue_range(int ufd, __u64 start, __u64 len)
376 {
377         struct uffdio_continue req;
378 
379         req.range.start = start;
380         req.range.len = len;
381         req.mode = 0;
382 
383         if (ioctl(ufd, UFFDIO_CONTINUE, &req)) {
384                 fprintf(stderr,
385                         "UFFDIO_CONTINUE failed for address 0x%" PRIx64 "\n",
386                         (uint64_t)start);
387                 exit(1);
388         }
389 }
390 
391 static void *locking_thread(void *arg)
392 {
393         unsigned long cpu = (unsigned long) arg;
394         struct random_data rand;
395         unsigned long page_nr = *(&(page_nr)); /* uninitialized warning */
396         int32_t rand_nr;
397         unsigned long long count;
398         char randstate[64];
399         unsigned int seed;
400         time_t start;
401 
402         if (bounces & BOUNCE_RANDOM) {
403                 seed = (unsigned int) time(NULL) - bounces;
404                 if (!(bounces & BOUNCE_RACINGFAULTS))
405                         seed += cpu;
406                 bzero(&rand, sizeof(rand));
407                 bzero(&randstate, sizeof(randstate));
408                 if (initstate_r(seed, randstate, sizeof(randstate), &rand)) {
409                         fprintf(stderr, "srandom_r error\n");
410                         exit(1);
411                 }
412         } else {
413                 page_nr = -bounces;
414                 if (!(bounces & BOUNCE_RACINGFAULTS))
415                         page_nr += cpu * nr_pages_per_cpu;
416         }
417 
418         while (!finished) {
419                 if (bounces & BOUNCE_RANDOM) {
420                         if (random_r(&rand, &rand_nr)) {
421                                 fprintf(stderr, "random_r 1 error\n");
422                                 exit(1);
423                         }
424                         page_nr = rand_nr;
425                         if (sizeof(page_nr) > sizeof(rand_nr)) {
426                                 if (random_r(&rand, &rand_nr)) {
427                                         fprintf(stderr, "random_r 2 error\n");
428                                         exit(1);
429                                 }
430                                 page_nr |= (((unsigned long) rand_nr) << 16) <<
431                                            16;
432                         }
433                 } else
434                         page_nr += 1;
435                 page_nr %= nr_pages;
436 
437                 start = time(NULL);
438                 if (bounces & BOUNCE_VERIFY) {
439                         count = *area_count(area_dst, page_nr);
440                         if (!count) {
441                                 fprintf(stderr,
442                                         "page_nr %lu wrong count %Lu %Lu\n",
443                                         page_nr, count,
444                                         count_verify[page_nr]);
445                                 exit(1);
446                         }
447 
448 
449                         /*
450                          * We can't use bcmp (or memcmp) because that
451                          * returns 0 erroneously if the memory is
452                          * changing under it (even if the end of the
453                          * page is never changing and always
454                          * different).
455                          */
456 #if 1
457                         if (!my_bcmp(area_dst + page_nr * page_size, zeropage,
458                                      page_size)) {
459                                 fprintf(stderr,
460                                         "my_bcmp page_nr %lu wrong count %Lu %Lu\n",
461                                         page_nr, count, count_verify[page_nr]);
462                                 exit(1);
463                         }
464 #else
465                         unsigned long loops;
466 
467                         loops = 0;
468                         /* uncomment the below line to test with mutex */
469                         /* pthread_mutex_lock(area_mutex(area_dst, page_nr)); */
470                         while (!bcmp(area_dst + page_nr * page_size, zeropage,
471                                      page_size)) {
472                                 loops += 1;
473                                 if (loops > 10)
474                                         break;
475                         }
476                         /* uncomment below line to test with mutex */
477                         /* pthread_mutex_unlock(area_mutex(area_dst, page_nr)); */
478                         if (loops) {
479                                 fprintf(stderr,
480                                         "page_nr %lu all zero thread %lu %p %lu\n",
481                                         page_nr, cpu, area_dst + page_nr * page_size,
482                                         loops);
483                                 if (loops > 10)
484                                         exit(1);
485                         }
486 #endif
487                 }
488 
489                 pthread_mutex_lock(area_mutex(area_dst, page_nr));
490                 count = *area_count(area_dst, page_nr);
491                 if (count != count_verify[page_nr]) {
492                         fprintf(stderr,
493                                 "page_nr %lu memory corruption %Lu %Lu\n",
494                                 page_nr, count,
495                                 count_verify[page_nr]); exit(1);
496                 }
497                 count++;
498                 *area_count(area_dst, page_nr) = count_verify[page_nr] = count;
499                 pthread_mutex_unlock(area_mutex(area_dst, page_nr));
500 
501                 if (time(NULL) - start > 1)
502                         fprintf(stderr,
503                                 "userfault too slow %ld "
504                                 "possible false positive with overcommit\n",
505                                 time(NULL) - start);
506         }
507 
508         return NULL;
509 }
510 
511 static void retry_copy_page(int ufd, struct uffdio_copy *uffdio_copy,
512                             unsigned long offset)
513 {
514         uffd_test_ops->alias_mapping(&uffdio_copy->dst,
515                                      uffdio_copy->len,
516                                      offset);
517         if (ioctl(ufd, UFFDIO_COPY, uffdio_copy)) {
518                 /* real retval in ufdio_copy.copy */
519                 if (uffdio_copy->copy != -EEXIST) {
520                         uffd_error(uffdio_copy->copy,
521                                    "UFFDIO_COPY retry error");
522                 }
523         } else
524                 uffd_error(uffdio_copy->copy, "UFFDIO_COPY retry unexpected");
525 }
526 
527 static int __copy_page(int ufd, unsigned long offset, bool retry)
528 {
529         struct uffdio_copy uffdio_copy;
530 
531         if (offset >= nr_pages * page_size) {
532                 fprintf(stderr, "unexpected offset %lu\n", offset);
533                 exit(1);
534         }
535         uffdio_copy.dst = (unsigned long) area_dst + offset;
536         uffdio_copy.src = (unsigned long) area_src + offset;
537         uffdio_copy.len = page_size;
538         if (test_uffdio_wp)
539                 uffdio_copy.mode = UFFDIO_COPY_MODE_WP;
540         else
541                 uffdio_copy.mode = 0;
542         uffdio_copy.copy = 0;
543         if (ioctl(ufd, UFFDIO_COPY, &uffdio_copy)) {
544                 /* real retval in ufdio_copy.copy */
545                 if (uffdio_copy.copy != -EEXIST)
546                         uffd_error(uffdio_copy.copy, "UFFDIO_COPY error");
547         } else if (uffdio_copy.copy != page_size) {
548                 uffd_error(uffdio_copy.copy, "UFFDIO_COPY unexpected copy");
549         } else {
550                 if (test_uffdio_copy_eexist && retry) {
551                         test_uffdio_copy_eexist = false;
552                         retry_copy_page(ufd, &uffdio_copy, offset);
553                 }
554                 return 1;
555         }
556         return 0;
557 }
558 
/*
 * Copy the page at @offset with the retry flag set, so __copy_page() may
 * run its one-shot -EEXIST retry check.  Returns what __copy_page()
 * returns.
 */
static int copy_page_retry(int ufd, unsigned long offset)
{
        const bool with_retry = true;

        return __copy_page(ufd, offset, with_retry);
}
563 
/*
 * Copy the page at @offset with the retry flag clear, so the -EEXIST
 * retry check is never run.  Returns what __copy_page() returns.
 */
static int copy_page(int ufd, unsigned long offset)
{
        const bool with_retry = false;

        return __copy_page(ufd, offset, with_retry);
}
568 
/*
 * Read one userfaultfd message.
 *
 * @ufd: file descriptor to read from
 * @msg: buffer that receives the message
 *
 * Returns 0 when a complete message was read and 1 when read() failed
 * with EAGAIN (nothing to read right now).  Any other read error or a
 * short read prints a diagnostic and exits with status 1.
 */
static int uffd_read_msg(int ufd, struct uffd_msg *msg)
{
        /* Read from the descriptor we were given, not the global uffd. */
        ssize_t ret = read(ufd, msg, sizeof(*msg));

        if (ret == (ssize_t) sizeof(*msg))
                return 0;

        if (ret < 0) {
                if (errno == EAGAIN)
                        return 1;
                perror("blocking read error");
        } else {
                fprintf(stderr, "short read\n");
        }
        exit(1);
}
586 
587 static void uffd_handle_page_fault(struct uffd_msg *msg,
588                                    struct uffd_stats *stats)
589 {
590         unsigned long offset;
591 
592         if (msg->event != UFFD_EVENT_PAGEFAULT) {
593                 fprintf(stderr, "unexpected msg event %u\n", msg->event);
594                 exit(1);
595         }
596 
597         if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) {
598                 /* Write protect page faults */
599                 wp_range(uffd, msg->arg.pagefault.address, page_size, false);
600                 stats->wp_faults++;
601         } else if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR) {
602                 uint8_t *area;
603                 int b;
604 
605                 /*
606                  * Minor page faults
607                  *
608                  * To prove we can modify the original range for testing
609                  * purposes, we're going to bit flip this range before
610                  * continuing.
611                  *
612                  * Note that this requires all minor page fault tests operate on
613                  * area_dst (non-UFFD-registered) and area_dst_alias
614                  * (UFFD-registered).
615                  */
616 
617                 area = (uint8_t *)(area_dst +
618                                    ((char *)msg->arg.pagefault.address -
619                                     area_dst_alias));
620                 for (b = 0; b < page_size; ++b)
621                         area[b] = ~area[b];
622                 continue_range(uffd, msg->arg.pagefault.address, page_size);
623                 stats->minor_faults++;
624         } else {
625                 /* Missing page faults */
626                 if (bounces & BOUNCE_VERIFY &&
627                     msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE) {
628                         fprintf(stderr, "unexpected write fault\n");
629                         exit(1);
630                 }
631 
632                 offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst;
633                 offset &= ~(page_size-1);
634 
635                 if (copy_page(uffd, offset))
636                         stats->missing_faults++;
637         }
638 }
639 
640 static void *uffd_poll_thread(void *arg)
641 {
642         struct uffd_stats *stats = (struct uffd_stats *)arg;
643         unsigned long cpu = stats->cpu;
644         struct pollfd pollfd[2];
645         struct uffd_msg msg;
646         struct uffdio_register uffd_reg;
647         int ret;
648         char tmp_chr;
649 
650         pollfd[0].fd = uffd;
651         pollfd[0].events = POLLIN;
652         pollfd[1].fd = pipefd[cpu*2];
653         pollfd[1].events = POLLIN;
654 
655         for (;;) {
656                 ret = poll(pollfd, 2, -1);
657                 if (!ret) {
658                         fprintf(stderr, "poll error %d\n", ret);
659                         exit(1);
660                 }
661                 if (ret < 0) {
662                         perror("poll");
663                         exit(1);
664                 }
665                 if (pollfd[1].revents & POLLIN) {
666                         if (read(pollfd[1].fd, &tmp_chr, 1) != 1) {
667                                 fprintf(stderr, "read pipefd error\n");
668                                 exit(1);
669                         }
670                         break;
671                 }
672                 if (!(pollfd[0].revents & POLLIN)) {
673                         fprintf(stderr, "pollfd[0].revents %d\n",
674                                 pollfd[0].revents);
675                         exit(1);
676                 }
677                 if (uffd_read_msg(uffd, &msg))
678                         continue;
679                 switch (msg.event) {
680                 default:
681                         fprintf(stderr, "unexpected msg event %u\n",
682                                 msg.event); exit(1);
683                         break;
684                 case UFFD_EVENT_PAGEFAULT:
685                         uffd_handle_page_fault(&msg, stats);
686                         break;
687                 case UFFD_EVENT_FORK:
688                         close(uffd);
689                         uffd = msg.arg.fork.ufd;
690                         pollfd[0].fd = uffd;
691                         break;
692                 case UFFD_EVENT_REMOVE:
693                         uffd_reg.range.start = msg.arg.remove.start;
694                         uffd_reg.range.len = msg.arg.remove.end -
695                                 msg.arg.remove.start;
696                         if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_reg.range)) {
697                                 fprintf(stderr, "remove failure\n");
698                                 exit(1);
699                         }
700                         break;
701                 case UFFD_EVENT_REMAP:
702                         area_dst = (char *)(unsigned long)msg.arg.remap.to;
703                         break;
704                 }
705         }
706 
707         return NULL;
708 }
709 
710 pthread_mutex_t uffd_read_mutex = PTHREAD_MUTEX_INITIALIZER;
711 
712 static void *uffd_read_thread(void *arg)
713 {
714         struct uffd_stats *stats = (struct uffd_stats *)arg;
715         struct uffd_msg msg;
716 
717         pthread_mutex_unlock(&uffd_read_mutex);
718         /* from here cancellation is ok */
719 
720         for (;;) {
721                 if (uffd_read_msg(uffd, &msg))
722                         continue;
723                 uffd_handle_page_fault(&msg, stats);
724         }
725 
726         return NULL;
727 }
728 
729 static void *background_thread(void *arg)
730 {
731         unsigned long cpu = (unsigned long) arg;
732         unsigned long page_nr, start_nr, mid_nr, end_nr;
733 
734         start_nr = cpu * nr_pages_per_cpu;
735         end_nr = (cpu+1) * nr_pages_per_cpu;
736         mid_nr = (start_nr + end_nr) / 2;
737 
738         /* Copy the first half of the pages */
739         for (page_nr = start_nr; page_nr < mid_nr; page_nr++)
740                 copy_page_retry(uffd, page_nr * page_size);
741 
742         /*
743          * If we need to test uffd-wp, set it up now.  Then we'll have
744          * at least the first half of the pages mapped already which
745          * can be write-protected for testing
746          */
747         if (test_uffdio_wp)
748                 wp_range(uffd, (unsigned long)area_dst + start_nr * page_size,
749                         nr_pages_per_cpu * page_size, true);
750 
751         /*
752          * Continue the 2nd half of the page copying, handling write
753          * protection faults if any
754          */
755         for (page_nr = mid_nr; page_nr < end_nr; page_nr++)
756                 copy_page_retry(uffd, page_nr * page_size);
757 
758         return NULL;
759 }
760 
761 static int stress(struct uffd_stats *uffd_stats)
762 {
763         unsigned long cpu;
764         pthread_t locking_threads[nr_cpus];
765         pthread_t uffd_threads[nr_cpus];
766         pthread_t background_threads[nr_cpus];
767 
768         finished = 0;
769         for (cpu = 0; cpu < nr_cpus; cpu++) {
770                 if (pthread_create(&locking_threads[cpu], &attr,
771                                    locking_thread, (void *)cpu))
772                         return 1;
773                 if (bounces & BOUNCE_POLL) {
774                         if (pthread_create(&uffd_threads[cpu], &attr,
775                                            uffd_poll_thread,
776                                            (void *)&uffd_stats[cpu]))
777                                 return 1;
778                 } else {
779                         if (pthread_create(&uffd_threads[cpu], &attr,
780                                            uffd_read_thread,
781                                            (void *)&uffd_stats[cpu]))
782                                 return 1;
783                         pthread_mutex_lock(&uffd_read_mutex);
784                 }
785                 if (pthread_create(&background_threads[cpu], &attr,
786                                    background_thread, (void *)cpu))
787                         return 1;
788         }
789         for (cpu = 0; cpu < nr_cpus; cpu++)
790                 if (pthread_join(background_threads[cpu], NULL))
791                         return 1;
792 
793         /*
794          * Be strict and immediately zap area_src, the whole area has
795          * been transferred already by the background treads. The
796          * area_src could then be faulted in in a racy way by still
797          * running uffdio_threads reading zeropages after we zapped
798          * area_src (but they're guaranteed to get -EEXIST from
799          * UFFDIO_COPY without writing zero pages into area_dst
800          * because the background threads already completed).
801          */
802         if (uffd_test_ops->release_pages(area_src))
803                 return 1;
804 
805 
806         finished = 1;
807         for (cpu = 0; cpu < nr_cpus; cpu++)
808                 if (pthread_join(locking_threads[cpu], NULL))
809                         return 1;
810 
811         for (cpu = 0; cpu < nr_cpus; cpu++) {
812                 char c;
813                 if (bounces & BOUNCE_POLL) {
814                         if (write(pipefd[cpu*2+1], &c, 1) != 1) {
815                                 fprintf(stderr, "pipefd write error\n");
816                                 return 1;
817                         }
818                         if (pthread_join(uffd_threads[cpu],
819                                          (void *)&uffd_stats[cpu]))
820                                 return 1;
821                 } else {
822                         if (pthread_cancel(uffd_threads[cpu]))
823                                 return 1;
824                         if (pthread_join(uffd_threads[cpu], NULL))
825                                 return 1;
826                 }
827         }
828 
829         return 0;
830 }
831 
832 static int userfaultfd_open_ext(uint64_t *features)
833 {
834         struct uffdio_api uffdio_api;
835 
836         uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
837         if (uffd < 0) {
838                 fprintf(stderr,
839                         "userfaultfd syscall not available in this kernel\n");
840                 return 1;
841         }
842         uffd_flags = fcntl(uffd, F_GETFD, NULL);
843 
844         uffdio_api.api = UFFD_API;
845         uffdio_api.features = *features;
846         if (ioctl(uffd, UFFDIO_API, &uffdio_api)) {
847                 fprintf(stderr, "UFFDIO_API failed.\nPlease make sure to "
848                         "run with either root or ptrace capability.\n");
849                 return 1;
850         }
851         if (uffdio_api.api != UFFD_API) {
852                 fprintf(stderr, "UFFDIO_API error: %" PRIu64 "\n",
853                         (uint64_t)uffdio_api.api);
854                 return 1;
855         }
856 
857         *features = uffdio_api.features;
858         return 0;
859 }
860 
/*
 * Wrapper around userfaultfd_open_ext() for callers that only want to
 * request features: 'features' is taken by value, so whatever the
 * helper stores through the pointer is not seen by the caller.
 *
 * Returns the result of userfaultfd_open_ext().
 */
static int userfaultfd_open(uint64_t features)
{
        uint64_t requested = features;

        return userfaultfd_open_ext(&requested);
}
865 
/* Jump buffer storage, and the pointer the handler jumps through */
sigjmp_buf jbuf;
sigjmp_buf *sigbuf;

/*
 * SIGBUS handler: jump back to the context stored in *sigbuf with
 * value 1, or abort() when no jump buffer has been published.  Any
 * other signal number is ignored.
 */
static void sighndl(int sig, siginfo_t *siginfo, void *ptr)
{
        if (sig != SIGBUS)
                return;

        if (!sigbuf)
                abort();

        siglongjmp(*sigbuf, 1);
}
876 
877 /*
878  * For non-cooperative userfaultfd test we fork() a process that will
879  * generate pagefaults, will mremap the area monitored by the
880  * userfaultfd and at last this process will release the monitored
881  * area.
882  * For the anonymous and shared memory the area is divided into two
883  * parts, the first part is accessed before mremap, and the second
884  * part is accessed after mremap. Since hugetlbfs does not support
885  * mremap, the entire monitored area is accessed in a single pass for
886  * HUGETLB_TEST.
887  * The release of the pages currently generates event for shmem and
888  * anonymous memory (UFFD_EVENT_REMOVE), hence it is not checked
889  * for hugetlb.
890  * For signal test(UFFD_FEATURE_SIGBUS), signal_test = 1, we register
891  * monitored area, generate pagefaults and test that signal is delivered.
892  * Use UFFDIO_COPY to allocate missing page and retry. For signal_test = 2
893  * test robustness use case - we release monitored area, fork a process
894  * that will generate pagefaults and verify signal is generated.
895  * This also tests UFFD_FEATURE_EVENT_FORK event along with the signal
896  * feature. Using monitor thread, verify no userfault events are generated.
897  */
898 static int faulting_process(int signal_test)
899 {
900         unsigned long nr;
901         unsigned long long count;
902         unsigned long split_nr_pages;
903         unsigned long lastnr;
904         struct sigaction act;
905         unsigned long signalled = 0;
906 
907         if (test_type != TEST_HUGETLB)
908                 split_nr_pages = (nr_pages + 1) / 2;
909         else
910                 split_nr_pages = nr_pages;
911 
912         if (signal_test) {
913                 sigbuf = &jbuf;
914                 memset(&act, 0, sizeof(act));
915                 act.sa_sigaction = sighndl;
916                 act.sa_flags = SA_SIGINFO;
917                 if (sigaction(SIGBUS, &act, 0)) {
918                         perror("sigaction");
919                         return 1;
920                 }
921                 lastnr = (unsigned long)-1;
922         }
923 
924         for (nr = 0; nr < split_nr_pages; nr++) {
925                 int steps = 1;
926                 unsigned long offset = nr * page_size;
927 
928                 if (signal_test) {
929                         if (sigsetjmp(*sigbuf, 1) != 0) {
930                                 if (steps == 1 && nr == lastnr) {
931                                         fprintf(stderr, "Signal repeated\n");
932                                         return 1;
933                                 }
934 
935                                 lastnr = nr;
936                                 if (signal_test == 1) {
937                                         if (steps == 1) {
938                                                 /* This is a MISSING request */
939                                                 steps++;
940                                                 if (copy_page(uffd, offset))
941                                                         signalled++;
942                                         } else {
943                                                 /* This is a WP request */
944                                                 assert(steps == 2);
945                                                 wp_range(uffd,
946                                                          (__u64)area_dst +
947                                                          offset,
948                                                          page_size, false);
949                                         }
950                                 } else {
951                                         signalled++;
952                                         continue;
953                                 }
954                         }
955                 }
956 
957                 count = *area_count(area_dst, nr);
958                 if (count != count_verify[nr]) {
959                         fprintf(stderr,
960                                 "nr %lu memory corruption %Lu %Lu\n",
961                                 nr, count,
962                                 count_verify[nr]);
963                 }
964                 /*
965                  * Trigger write protection if there is by writing
966                  * the same value back.
967                  */
968                 *area_count(area_dst, nr) = count;
969         }
970 
971         if (signal_test)
972                 return signalled != split_nr_pages;
973 
974         if (test_type == TEST_HUGETLB)
975                 return 0;
976 
977         area_dst = mremap(area_dst, nr_pages * page_size,  nr_pages * page_size,
978                           MREMAP_MAYMOVE | MREMAP_FIXED, area_src);
979         if (area_dst == MAP_FAILED) {
980                 perror("mremap");
981                 exit(1);
982         }
983 
984         for (; nr < nr_pages; nr++) {
985                 count = *area_count(area_dst, nr);
986                 if (count != count_verify[nr]) {
987                         fprintf(stderr,
988                                 "nr %lu memory corruption %Lu %Lu\n",
989                                 nr, count,
990                                 count_verify[nr]); exit(1);
991                 }
992                 /*
993                  * Trigger write protection if there is by writing
994                  * the same value back.
995                  */
996                 *area_count(area_dst, nr) = count;
997         }
998 
999         if (uffd_test_ops->release_pages(area_dst))
1000                 return 1;
1001 
1002         for (nr = 0; nr < nr_pages; nr++) {
1003                 if (my_bcmp(area_dst + nr * page_size, zeropage, page_size)) {
1004                         fprintf(stderr, "nr %lu is not zero\n", nr);
1005                         exit(1);
1006                 }
1007         }
1008 
1009         return 0;
1010 }
1011 
1012 static void retry_uffdio_zeropage(int ufd,
1013                                   struct uffdio_zeropage *uffdio_zeropage,
1014                                   unsigned long offset)
1015 {
1016         uffd_test_ops->alias_mapping(&uffdio_zeropage->range.start,
1017                                      uffdio_zeropage->range.len,
1018                                      offset);
1019         if (ioctl(ufd, UFFDIO_ZEROPAGE, uffdio_zeropage)) {
1020                 if (uffdio_zeropage->zeropage != -EEXIST) {
1021                         uffd_error(uffdio_zeropage->zeropage,
1022                                    "UFFDIO_ZEROPAGE retry error");
1023                 }
1024         } else {
1025                 uffd_error(uffdio_zeropage->zeropage,
1026                            "UFFDIO_ZEROPAGE retry unexpected");
1027         }
1028 }
1029 
1030 static int __uffdio_zeropage(int ufd, unsigned long offset, bool retry)
1031 {
1032         struct uffdio_zeropage uffdio_zeropage;
1033         int ret;
1034         unsigned long has_zeropage;
1035         __s64 res;
1036 
1037         has_zeropage = uffd_test_ops->expected_ioctls & (1 << _UFFDIO_ZEROPAGE);
1038 
1039         if (offset >= nr_pages * page_size) {
1040                 fprintf(stderr, "unexpected offset %lu\n", offset);
1041                 exit(1);
1042         }
1043         uffdio_zeropage.range.start = (unsigned long) area_dst + offset;
1044         uffdio_zeropage.range.len = page_size;
1045         uffdio_zeropage.mode = 0;
1046         ret = ioctl(ufd, UFFDIO_ZEROPAGE, &uffdio_zeropage);
1047         res = uffdio_zeropage.zeropage;
1048         if (ret) {
1049                 /* real retval in ufdio_zeropage.zeropage */
1050                 if (has_zeropage) {
1051                         uffd_error(res, "UFFDIO_ZEROPAGE %s",
1052                                    res == -EEXIST ? "-EEXIST" : "error");
1053                 } else if (res != -EINVAL)
1054                         uffd_error(res, "UFFDIO_ZEROPAGE not -EINVAL");
1055         } else if (has_zeropage) {
1056                 if (res != page_size) {
1057                         uffd_error(res, "UFFDIO_ZEROPAGE unexpected");
1058                 } else {
1059                         if (test_uffdio_zeropage_eexist && retry) {
1060                                 test_uffdio_zeropage_eexist = false;
1061                                 retry_uffdio_zeropage(ufd, &uffdio_zeropage,
1062                                                       offset);
1063                         }
1064                         return 1;
1065                 }
1066         } else
1067                 uffd_error(res, "UFFDIO_ZEROPAGE succeeded");
1068 
1069         return 0;
1070 }
1071 
/*
 * UFFDIO_ZEROPAGE on one page of area_dst without the -EEXIST retry
 * check.  Returns whatever __uffdio_zeropage() returns.
 */
static int uffdio_zeropage(int ufd, unsigned long offset)
{
        const bool retry = false;

        return __uffdio_zeropage(ufd, offset, retry);
}
1076 
1077 /* exercise UFFDIO_ZEROPAGE */
1078 static int userfaultfd_zeropage_test(void)
1079 {
1080         struct uffdio_register uffdio_register;
1081         unsigned long expected_ioctls;
1082 
1083         printf("testing UFFDIO_ZEROPAGE: ");
1084         fflush(stdout);
1085 
1086         if (uffd_test_ops->release_pages(area_dst))
1087                 return 1;
1088 
1089         if (userfaultfd_open(0))
1090                 return 1;
1091         uffdio_register.range.start = (unsigned long) area_dst;
1092         uffdio_register.range.len = nr_pages * page_size;
1093         uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
1094         if (test_uffdio_wp)
1095                 uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
1096         if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) {
1097                 fprintf(stderr, "register failure\n");
1098                 exit(1);
1099         }
1100 
1101         expected_ioctls = uffd_test_ops->expected_ioctls;
1102         if ((uffdio_register.ioctls & expected_ioctls) !=
1103             expected_ioctls) {
1104                 fprintf(stderr,
1105                         "unexpected missing ioctl for anon memory\n");
1106                 exit(1);
1107         }
1108 
1109         if (uffdio_zeropage(uffd, 0)) {
1110                 if (my_bcmp(area_dst, zeropage, page_size)) {
1111                         fprintf(stderr, "zeropage is not zero\n");
1112                         exit(1);
1113                 }
1114         }
1115 
1116         close(uffd);
1117         printf("done.\n");
1118         return 0;
1119 }
1120 
1121 static int userfaultfd_events_test(void)
1122 {
1123         struct uffdio_register uffdio_register;
1124         unsigned long expected_ioctls;
1125         pthread_t uffd_mon;
1126         int err, features;
1127         pid_t pid;
1128         char c;
1129         struct uffd_stats stats = { 0 };
1130 
1131         printf("testing events (fork, remap, remove): ");
1132         fflush(stdout);
1133 
1134         if (uffd_test_ops->release_pages(area_dst))
1135                 return 1;
1136 
1137         features = UFFD_FEATURE_EVENT_FORK | UFFD_FEATURE_EVENT_REMAP |
1138                 UFFD_FEATURE_EVENT_REMOVE;
1139         if (userfaultfd_open(features))
1140                 return 1;
1141         fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
1142 
1143         uffdio_register.range.start = (unsigned long) area_dst;
1144         uffdio_register.range.len = nr_pages * page_size;
1145         uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
1146         if (test_uffdio_wp)
1147                 uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
1148         if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) {
1149                 fprintf(stderr, "register failure\n");
1150                 exit(1);
1151         }
1152 
1153         expected_ioctls = uffd_test_ops->expected_ioctls;
1154         if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls) {
1155                 fprintf(stderr, "unexpected missing ioctl for anon memory\n");
1156                 exit(1);
1157         }
1158 
1159         if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats)) {
1160                 perror("uffd_poll_thread create");
1161                 exit(1);
1162         }
1163 
1164         pid = fork();
1165         if (pid < 0) {
1166                 perror("fork");
1167                 exit(1);
1168         }
1169 
1170         if (!pid)
1171                 exit(faulting_process(0));
1172 
1173         waitpid(pid, &err, 0);
1174         if (err) {
1175                 fprintf(stderr, "faulting process failed\n");
1176                 exit(1);
1177         }
1178 
1179         if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) {
1180                 perror("pipe write");
1181                 exit(1);
1182         }
1183         if (pthread_join(uffd_mon, NULL))
1184                 return 1;
1185 
1186         close(uffd);
1187 
1188         uffd_stats_report(&stats, 1);
1189 
1190         return stats.missing_faults != nr_pages;
1191 }
1192 
1193 static int userfaultfd_sig_test(void)
1194 {
1195         struct uffdio_register uffdio_register;
1196         unsigned long expected_ioctls;
1197         unsigned long userfaults;
1198         pthread_t uffd_mon;
1199         int err, features;
1200         pid_t pid;
1201         char c;
1202         struct uffd_stats stats = { 0 };
1203 
1204         printf("testing signal delivery: ");
1205         fflush(stdout);
1206 
1207         if (uffd_test_ops->release_pages(area_dst))
1208                 return 1;
1209 
1210         features = UFFD_FEATURE_EVENT_FORK|UFFD_FEATURE_SIGBUS;
1211         if (userfaultfd_open(features))
1212                 return 1;
1213         fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
1214 
1215         uffdio_register.range.start = (unsigned long) area_dst;
1216         uffdio_register.range.len = nr_pages * page_size;
1217         uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
1218         if (test_uffdio_wp)
1219                 uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
1220         if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) {
1221                 fprintf(stderr, "register failure\n");
1222                 exit(1);
1223         }
1224 
1225         expected_ioctls = uffd_test_ops->expected_ioctls;
1226         if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls) {
1227                 fprintf(stderr, "unexpected missing ioctl for anon memory\n");
1228                 exit(1);
1229         }
1230 
1231         if (faulting_process(1)) {
1232                 fprintf(stderr, "faulting process failed\n");
1233                 exit(1);
1234         }
1235 
1236         if (uffd_test_ops->release_pages(area_dst))
1237                 return 1;
1238 
1239         if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats)) {
1240                 perror("uffd_poll_thread create");
1241                 exit(1);
1242         }
1243 
1244         pid = fork();
1245         if (pid < 0) {
1246                 perror("fork");
1247                 exit(1);
1248         }
1249 
1250         if (!pid)
1251                 exit(faulting_process(2));
1252 
1253         waitpid(pid, &err, 0);
1254         if (err) {
1255                 fprintf(stderr, "faulting process failed\n");
1256                 exit(1);
1257         }
1258 
1259         if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) {
1260                 perror("pipe write");
1261                 exit(1);
1262         }
1263         if (pthread_join(uffd_mon, (void **)&userfaults))
1264                 return 1;
1265 
1266         printf("done.\n");
1267         if (userfaults)
1268                 fprintf(stderr, "Signal test failed, userfaults: %ld\n",
1269                         userfaults);
1270         close(uffd);
1271         return userfaults != 0;
1272 }
1273 
1274 static int userfaultfd_minor_test(void)
1275 {
1276         struct uffdio_register uffdio_register;
1277         unsigned long expected_ioctls;
1278         unsigned long p;
1279         pthread_t uffd_mon;
1280         uint8_t expected_byte;
1281         void *expected_page;
1282         char c;
1283         struct uffd_stats stats = { 0 };
1284         uint64_t features = UFFD_FEATURE_MINOR_HUGETLBFS;
1285 
1286         if (!test_uffdio_minor)
1287                 return 0;
1288 
1289         printf("testing minor faults: ");
1290         fflush(stdout);
1291 
1292         if (uffd_test_ops->release_pages(area_dst))
1293                 return 1;
1294 
1295         if (userfaultfd_open_ext(&features))
1296                 return 1;
1297         /* If kernel reports the feature isn't supported, skip the test. */
1298         if (!(features & UFFD_FEATURE_MINOR_HUGETLBFS)) {
1299                 printf("skipping test due to lack of feature support\n");
1300                 fflush(stdout);
1301                 return 0;
1302         }
1303 
1304         uffdio_register.range.start = (unsigned long)area_dst_alias;
1305         uffdio_register.range.len = nr_pages * page_size;
1306         uffdio_register.mode = UFFDIO_REGISTER_MODE_MINOR;
1307         if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) {
1308                 fprintf(stderr, "register failure\n");
1309                 exit(1);
1310         }
1311 
1312         expected_ioctls = uffd_test_ops->expected_ioctls;
1313         expected_ioctls |= 1 << _UFFDIO_CONTINUE;
1314         if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls) {
1315                 fprintf(stderr, "unexpected missing ioctl(s)\n");
1316                 exit(1);
1317         }
1318 
1319         /*
1320          * After registering with UFFD, populate the non-UFFD-registered side of
1321          * the shared mapping. This should *not* trigger any UFFD minor faults.
1322          */
1323         for (p = 0; p < nr_pages; ++p) {
1324                 memset(area_dst + (p * page_size), p % ((uint8_t)-1),
1325                        page_size);
1326         }
1327 
1328         if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats)) {
1329                 perror("uffd_poll_thread create");
1330                 exit(1);
1331         }
1332 
1333         /*
1334          * Read each of the pages back using the UFFD-registered mapping. We
1335          * expect that the first time we touch a page, it will result in a minor
1336          * fault. uffd_poll_thread will resolve the fault by bit-flipping the
1337          * page's contents, and then issuing a CONTINUE ioctl.
1338          */
1339 
1340         if (posix_memalign(&expected_page, page_size, page_size)) {
1341                 fprintf(stderr, "out of memory\n");
1342                 return 1;
1343         }
1344 
1345         for (p = 0; p < nr_pages; ++p) {
1346                 expected_byte = ~((uint8_t)(p % ((uint8_t)-1)));
1347                 memset(expected_page, expected_byte, page_size);
1348                 if (my_bcmp(expected_page, area_dst_alias + (p * page_size),
1349                             page_size)) {
1350                         fprintf(stderr,
1351                                 "unexpected page contents after minor fault\n");
1352                         exit(1);
1353                 }
1354         }
1355 
1356         if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) {
1357                 perror("pipe write");
1358                 exit(1);
1359         }
1360         if (pthread_join(uffd_mon, NULL))
1361                 return 1;
1362 
1363         close(uffd);
1364 
1365         uffd_stats_report(&stats, 1);
1366 
1367         return stats.missing_faults != 0 || stats.minor_faults != nr_pages;
1368 }
1369 
1370 static int userfaultfd_stress(void)
1371 {
1372         void *area;
1373         char *tmp_area;
1374         unsigned long nr;
1375         struct uffdio_register uffdio_register;
1376         unsigned long cpu;
1377         int err;
1378         struct uffd_stats uffd_stats[nr_cpus];
1379 
1380         uffd_test_ops->allocate_area((void **)&area_src);
1381         if (!area_src)
1382                 return 1;
1383         uffd_test_ops->allocate_area((void **)&area_dst);
1384         if (!area_dst)
1385                 return 1;
1386 
1387         if (userfaultfd_open(0))
1388                 return 1;
1389 
1390         count_verify = malloc(nr_pages * sizeof(unsigned long long));
1391         if (!count_verify) {
1392                 perror("count_verify");
1393                 return 1;
1394         }
1395 
1396         for (nr = 0; nr < nr_pages; nr++) {
1397                 *area_mutex(area_src, nr) = (pthread_mutex_t)
1398                         PTHREAD_MUTEX_INITIALIZER;
1399                 count_verify[nr] = *area_count(area_src, nr) = 1;
1400                 /*
1401                  * In the transition between 255 to 256, powerpc will
1402                  * read out of order in my_bcmp and see both bytes as
1403                  * zero, so leave a placeholder below always non-zero
1404                  * after the count, to avoid my_bcmp to trigger false
1405                  * positives.
1406                  */
1407                 *(area_count(area_src, nr) + 1) = 1;
1408         }
1409 
1410         pipefd = malloc(sizeof(int) * nr_cpus * 2);
1411         if (!pipefd) {
1412                 perror("pipefd");
1413                 return 1;
1414         }
1415         for (cpu = 0; cpu < nr_cpus; cpu++) {
1416                 if (pipe2(&pipefd[cpu*2], O_CLOEXEC | O_NONBLOCK)) {
1417                         perror("pipe");
1418                         return 1;
1419                 }
1420         }
1421 
1422         if (posix_memalign(&area, page_size, page_size)) {
1423                 fprintf(stderr, "out of memory\n");
1424                 return 1;
1425         }
1426         zeropage = area;
1427         bzero(zeropage, page_size);
1428 
1429         pthread_mutex_lock(&uffd_read_mutex);
1430 
1431         pthread_attr_init(&attr);
1432         pthread_attr_setstacksize(&attr, 16*1024*1024);
1433 
1434         err = 0;
1435         while (bounces--) {
1436                 unsigned long expected_ioctls;
1437 
1438                 printf("bounces: %d, mode:", bounces);
1439                 if (bounces & BOUNCE_RANDOM)
1440                         printf(" rnd");
1441                 if (bounces & BOUNCE_RACINGFAULTS)
1442                         printf(" racing");
1443                 if (bounces & BOUNCE_VERIFY)
1444                         printf(" ver");
1445                 if (bounces & BOUNCE_POLL)
1446                         printf(" poll");
1447                 else
1448                         printf(" read");
1449                 printf(", ");
1450                 fflush(stdout);
1451 
1452                 if (bounces & BOUNCE_POLL)
1453                         fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
1454                 else
1455                         fcntl(uffd, F_SETFL, uffd_flags & ~O_NONBLOCK);
1456 
1457                 /* register */
1458                 uffdio_register.range.start = (unsigned long) area_dst;
1459                 uffdio_register.range.len = nr_pages * page_size;
1460                 uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
1461                 if (test_uffdio_wp)
1462                         uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
1463                 if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) {
1464                         fprintf(stderr, "register failure\n");
1465                         return 1;
1466                 }
1467                 expected_ioctls = uffd_test_ops->expected_ioctls;
1468                 if ((uffdio_register.ioctls & expected_ioctls) !=
1469                     expected_ioctls) {
1470                         fprintf(stderr,
1471                                 "unexpected missing ioctl for anon memory\n");
1472                         return 1;
1473                 }
1474 
1475                 if (area_dst_alias) {
1476                         uffdio_register.range.start = (unsigned long)
1477                                 area_dst_alias;
1478                         if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) {
1479                                 fprintf(stderr, "register failure alias\n");
1480                                 return 1;
1481                         }
1482                 }
1483 
1484                 /*
1485                  * The madvise done previously isn't enough: some
1486                  * uffd_thread could have read userfaults (one of
1487                  * those already resolved by the background thread)
1488                  * and it may be in the process of calling
1489                  * UFFDIO_COPY. UFFDIO_COPY will read the zapped
1490                  * area_src and it would map a zero page in it (of
1491                  * course such a UFFDIO_COPY is perfectly safe as it'd
1492                  * return -EEXIST). The problem comes at the next
1493                  * bounce though: that racing UFFDIO_COPY would
1494                  * generate zeropages in the area_src, so invalidating
1495                  * the previous MADV_DONTNEED. Without this additional
1496                  * MADV_DONTNEED those zeropages leftovers in the
1497                  * area_src would lead to -EEXIST failure during the
1498                  * next bounce, effectively leaving a zeropage in the
1499                  * area_dst.
1500                  *
1501                  * Try to comment this out madvise to see the memory
1502                  * corruption being caught pretty quick.
1503                  *
1504                  * khugepaged is also inhibited to collapse THP after
1505                  * MADV_DONTNEED only after the UFFDIO_REGISTER, so it's
1506                  * required to MADV_DONTNEED here.
1507                  */
1508                 if (uffd_test_ops->release_pages(area_dst))
1509                         return 1;
1510 
1511                 uffd_stats_reset(uffd_stats, nr_cpus);
1512 
1513                 /* bounce pass */
1514                 if (stress(uffd_stats))
1515                         return 1;
1516 
1517                 /* Clear all the write protections if there is any */
1518                 if (test_uffdio_wp)
1519                         wp_range(uffd, (unsigned long)area_dst,
1520                                  nr_pages * page_size, false);
1521 
1522                 /* unregister */
1523                 if (ioctl(uffd, UFFDIO_UNREGISTER, &uffdio_register.range)) {
1524                         fprintf(stderr, "unregister failure\n");
1525                         return 1;
1526                 }
1527                 if (area_dst_alias) {
1528                         uffdio_register.range.start = (unsigned long) area_dst;
1529                         if (ioctl(uffd, UFFDIO_UNREGISTER,
1530                                   &uffdio_register.range)) {
1531                                 fprintf(stderr, "unregister failure alias\n");
1532                                 return 1;
1533                         }
1534                 }
1535 
1536                 /* verification */
1537                 if (bounces & BOUNCE_VERIFY) {
1538                         for (nr = 0; nr < nr_pages; nr++) {
1539                                 if (*area_count(area_dst, nr) != count_verify[nr]) {
1540                                         fprintf(stderr,
1541                                                 "error area_count %Lu %Lu %lu\n",
1542                                                 *area_count(area_src, nr),
1543                                                 count_verify[nr],
1544                                                 nr);
1545                                         err = 1;
1546                                         bounces = 0;
1547                                 }
1548                         }
1549                 }
1550 
1551                 /* prepare next bounce */
1552                 tmp_area = area_src;
1553                 area_src = area_dst;
1554                 area_dst = tmp_area;
1555 
1556                 tmp_area = area_src_alias;
1557                 area_src_alias = area_dst_alias;
1558                 area_dst_alias = tmp_area;
1559 
1560                 uffd_stats_report(uffd_stats, nr_cpus);
1561         }
1562 
1563         if (err)
1564                 return err;
1565 
1566         close(uffd);
1567         return userfaultfd_zeropage_test() || userfaultfd_sig_test()
1568                 || userfaultfd_events_test() || userfaultfd_minor_test();
1569 }
1570 
1571 /*
1572  * Copied from mlock2-tests.c
1573  */
/*
 * Return the default huge page size in bytes, as advertised by the
 * "Hugepagesize:" line of /proc/meminfo (which is expressed in kB).
 *
 * Returns 0 when /proc/meminfo cannot be opened or when no matching
 * line is found.
 */
unsigned long default_huge_page_size(void)
{
        FILE *meminfo = fopen("/proc/meminfo", "r");
        unsigned long size_kb = 0;
        unsigned long size_bytes = 0;
        char *buf = NULL;
        size_t buf_cap = 0;

        if (meminfo == NULL)
                return 0;

        for (;;) {
                /* getline() grows buf as needed; stop on EOF or error. */
                if (getline(&buf, &buf_cap, meminfo) <= 0)
                        break;
                if (sscanf(buf, "Hugepagesize:       %lu kB", &size_kb) != 1)
                        continue;
                /* kB -> bytes */
                size_bytes = size_kb << 10;
                break;
        }

        free(buf);
        fclose(meminfo);
        return size_bytes;
}
1594 
1595 static void set_test_type(const char *type)
1596 {
1597         if (!strcmp(type, "anon")) {
1598                 test_type = TEST_ANON;
1599                 uffd_test_ops = &anon_uffd_test_ops;
1600                 /* Only enable write-protect test for anonymous test */
1601                 test_uffdio_wp = true;
1602         } else if (!strcmp(type, "hugetlb")) {
1603                 test_type = TEST_HUGETLB;
1604                 uffd_test_ops = &hugetlb_uffd_test_ops;
1605         } else if (!strcmp(type, "hugetlb_shared")) {
1606                 map_shared = true;
1607                 test_type = TEST_HUGETLB;
1608                 uffd_test_ops = &hugetlb_uffd_test_ops;
1609                 /* Minor faults require shared hugetlb; only enable here. */
1610                 test_uffdio_minor = true;
1611         } else if (!strcmp(type, "shmem")) {
1612                 map_shared = true;
1613                 test_type = TEST_SHMEM;
1614                 uffd_test_ops = &shmem_uffd_test_ops;
1615         } else {
1616                 fprintf(stderr, "Unknown test type: %s\n", type); exit(1);
1617         }
1618 
1619         if (test_type == TEST_HUGETLB)
1620                 page_size = default_huge_page_size();
1621         else
1622                 page_size = sysconf(_SC_PAGE_SIZE);
1623 
1624         if (!page_size) {
1625                 fprintf(stderr, "Unable to determine page size\n");
1626                 exit(2);
1627         }
1628         if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) * 2
1629             > page_size) {
1630                 fprintf(stderr, "Impossible to run this test\n");
1631                 exit(2);
1632         }
1633 }
1634 
1635 static void sigalrm(int sig)
1636 {
1637         if (sig != SIGALRM)
1638                 abort();
1639         test_uffdio_copy_eexist = true;
1640         test_uffdio_zeropage_eexist = true;
1641         alarm(ALARM_INTERVAL_SECS);
1642 }
1643 
1644 int main(int argc, char **argv)
1645 {
1646         if (argc < 4)
1647                 usage();
1648 
1649         if (signal(SIGALRM, sigalrm) == SIG_ERR) {
1650                 fprintf(stderr, "failed to arm SIGALRM");
1651                 exit(1);
1652         }
1653         alarm(ALARM_INTERVAL_SECS);
1654 
1655         set_test_type(argv[1]);
1656 
1657         nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
1658         nr_pages_per_cpu = atol(argv[2]) * 1024*1024 / page_size /
1659                 nr_cpus;
1660         if (!nr_pages_per_cpu) {
1661                 fprintf(stderr, "invalid MiB\n");
1662                 usage();
1663         }
1664 
1665         bounces = atoi(argv[3]);
1666         if (bounces <= 0) {
1667                 fprintf(stderr, "invalid bounces\n");
1668                 usage();
1669         }
1670         nr_pages = nr_pages_per_cpu * nr_cpus;
1671 
1672         if (test_type == TEST_HUGETLB) {
1673                 if (argc < 5)
1674                         usage();
1675                 huge_fd = open(argv[4], O_CREAT | O_RDWR, 0755);
1676                 if (huge_fd < 0) {
1677                         fprintf(stderr, "Open of %s failed", argv[3]);
1678                         perror("open");
1679                         exit(1);
1680                 }
1681                 if (ftruncate(huge_fd, 0)) {
1682                         fprintf(stderr, "ftruncate %s to size 0 failed", argv[3]);
1683                         perror("ftruncate");
1684                         exit(1);
1685                 }
1686         }
1687         printf("nr_pages: %lu, nr_pages_per_cpu: %lu\n",
1688                nr_pages, nr_pages_per_cpu);
1689         return userfaultfd_stress();
1690 }
1691 
1692 #else /* __NR_userfaultfd */
1693 
1694 #warning "missing __NR_userfaultfd definition"
1695 
1696 int main(void)
1697 {
1698         printf("skip: Skipping userfaultfd test (missing __NR_userfaultfd)\n");
1699         return KSFT_SKIP;
1700 }
1701 
1702 #endif /* __NR_userfaultfd */
1703 

~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

kernel.org | git.kernel.org | LWN.net | Project Home | Wiki (Japanese) | Wiki (English) | SVN repository | Mail admin

Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.

osdn.jp