Merge branch 'for-next/esr-elx-64-bit' into for-next/core
[linux-2.6-microblaze.git] / tools / testing / selftests / seccomp / seccomp_bpf.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
4  *
5  * Test code for seccomp bpf.
6  */
7
8 #define _GNU_SOURCE
9 #include <sys/types.h>
10
11 /*
12  * glibc 2.26 and later have SIGSYS in siginfo_t. Before that,
13  * we need to use the kernel's siginfo.h file and trick glibc
14  * into accepting it.
15  */
16 #if !__GLIBC_PREREQ(2, 26)
17 # include <asm/siginfo.h>
18 # define __have_siginfo_t 1
19 # define __have_sigval_t 1
20 # define __have_sigevent_t 1
21 #endif
22
23 #include <errno.h>
24 #include <linux/filter.h>
25 #include <sys/prctl.h>
26 #include <sys/ptrace.h>
27 #include <sys/user.h>
28 #include <linux/prctl.h>
29 #include <linux/ptrace.h>
30 #include <linux/seccomp.h>
31 #include <pthread.h>
32 #include <semaphore.h>
33 #include <signal.h>
34 #include <stddef.h>
35 #include <stdbool.h>
36 #include <string.h>
37 #include <time.h>
38 #include <limits.h>
39 #include <linux/elf.h>
40 #include <sys/uio.h>
41 #include <sys/utsname.h>
42 #include <sys/fcntl.h>
43 #include <sys/mman.h>
44 #include <sys/times.h>
45 #include <sys/socket.h>
46 #include <sys/ioctl.h>
47 #include <linux/kcmp.h>
48 #include <sys/resource.h>
49
50 #include <unistd.h>
51 #include <sys/syscall.h>
52 #include <poll.h>
53
54 #include "../kselftest_harness.h"
55 #include "../clone3/clone3_selftests.h"
56
/* Attempt to de-conflict with the selftests tree. */
/* When the harness provides no SKIP(), fall back to XFAIL(). */
#ifndef SKIP
#define SKIP(s, ...)    XFAIL(s, ##__VA_ARGS__)
#endif

/*
 * Fallback values for prctl()/seccomp constants. Every guard below only
 * takes effect when the headers included above did not define the name.
 */
#ifndef PR_SET_PTRACER
# define PR_SET_PTRACER 0x59616d61
#endif

/* PR_GET_NO_NEW_PRIVS is supplied together with PR_SET_NO_NEW_PRIVS. */
#ifndef PR_SET_NO_NEW_PRIVS
#define PR_SET_NO_NEW_PRIVS 38
#define PR_GET_NO_NEW_PRIVS 39
#endif

#ifndef PR_SECCOMP_EXT
#define PR_SECCOMP_EXT 43
#endif

#ifndef SECCOMP_EXT_ACT
#define SECCOMP_EXT_ACT 1
#endif

#ifndef SECCOMP_EXT_ACT_TSYNC
#define SECCOMP_EXT_ACT_TSYNC 1
#endif

/* Seccomp modes passed as the second argument of prctl(PR_SET_SECCOMP). */
#ifndef SECCOMP_MODE_STRICT
#define SECCOMP_MODE_STRICT 1
#endif

#ifndef SECCOMP_MODE_FILTER
#define SECCOMP_MODE_FILTER 2
#endif
90
/*
 * Local definition of struct seccomp_data, compiled only when
 * SECCOMP_RET_ALLOW is undefined.
 * NOTE(review): the guard presumably stands in for "the seccomp UAPI header
 * did not provide this struct" -- confirm.
 *
 * The BPF filters in this file load fields of this layout by offset
 * (offsetof(struct seccomp_data, nr) and the syscall_arg() macro).
 */
#ifndef SECCOMP_RET_ALLOW
struct seccomp_data {
        int nr;
        __u32 arch;
        __u64 instruction_pointer;
        __u64 args[6];
};
#endif
99
/*
 * Fallback filter return values, each group defined only when its guard
 * name is missing from the included headers.
 */
#ifndef SECCOMP_RET_KILL_PROCESS
#define SECCOMP_RET_KILL_PROCESS 0x80000000U /* kill the process */
#define SECCOMP_RET_KILL_THREAD  0x00000000U /* kill the thread */
#endif
/* SECCOMP_RET_KILL is an alias for the thread-killing action. */
#ifndef SECCOMP_RET_KILL
#define SECCOMP_RET_KILL         SECCOMP_RET_KILL_THREAD
#define SECCOMP_RET_TRAP         0x00030000U /* disallow and force a SIGSYS */
#define SECCOMP_RET_ERRNO        0x00050000U /* returns an errno */
#define SECCOMP_RET_TRACE        0x7ff00000U /* pass to a tracer or disallow */
#define SECCOMP_RET_ALLOW        0x7fff0000U /* allow */
#endif
#ifndef SECCOMP_RET_LOG
#define SECCOMP_RET_LOG          0x7ffc0000U /* allow after logging */
#endif
114
/*
 * Syscall number of seccomp() per architecture, used only when the headers
 * did not define __NR_seccomp. An unrecognized architecture produces a build
 * warning and the placeholder value 0xffff.
 */
#ifndef __NR_seccomp
# if defined(__i386__)
#  define __NR_seccomp 354
# elif defined(__x86_64__)
#  define __NR_seccomp 317
# elif defined(__arm__)
#  define __NR_seccomp 383
# elif defined(__aarch64__)
#  define __NR_seccomp 277
# elif defined(__riscv)
#  define __NR_seccomp 277
# elif defined(__csky__)
#  define __NR_seccomp 277
# elif defined(__hppa__)
#  define __NR_seccomp 338
# elif defined(__powerpc__)
#  define __NR_seccomp 358
# elif defined(__s390__)
#  define __NR_seccomp 348
# elif defined(__xtensa__)
#  define __NR_seccomp 337
# elif defined(__sh__)
#  define __NR_seccomp 372
# else
#  warning "seccomp syscall number unknown for this architecture"
#  define __NR_seccomp 0xffff
# endif
#endif
143
/*
 * Fallback operation codes for the first argument of seccomp(); each is
 * defined only if the headers lack it.
 */
#ifndef SECCOMP_SET_MODE_STRICT
#define SECCOMP_SET_MODE_STRICT 0
#endif

#ifndef SECCOMP_SET_MODE_FILTER
#define SECCOMP_SET_MODE_FILTER 1
#endif

#ifndef SECCOMP_GET_ACTION_AVAIL
#define SECCOMP_GET_ACTION_AVAIL 2
#endif

#ifndef SECCOMP_GET_NOTIF_SIZES
#define SECCOMP_GET_NOTIF_SIZES 3
#endif

/* Fallback SECCOMP_FILTER_FLAG_* bits (bit 0, 1 and 2 respectively). */
#ifndef SECCOMP_FILTER_FLAG_TSYNC
#define SECCOMP_FILTER_FLAG_TSYNC (1UL << 0)
#endif

#ifndef SECCOMP_FILTER_FLAG_LOG
#define SECCOMP_FILTER_FLAG_LOG (1UL << 1)
#endif

#ifndef SECCOMP_FILTER_FLAG_SPEC_ALLOW
#define SECCOMP_FILTER_FLAG_SPEC_ALLOW (1UL << 2)
#endif
171
/*
 * Fallback for the PTRACE_SECCOMP_GET_METADATA request and its argument
 * struct, provided together when the request constant is undefined.
 */
#ifndef PTRACE_SECCOMP_GET_METADATA
#define PTRACE_SECCOMP_GET_METADATA     0x420d

struct seccomp_metadata {
        __u64 filter_off;       /* Input: which filter */
        __u64 flags;             /* Output: filter's flags */
};
#endif
180
/* Fallback for the "new listener" filter flag (bit 3). */
#ifndef SECCOMP_FILTER_FLAG_NEW_LISTENER
#define SECCOMP_FILTER_FLAG_NEW_LISTENER        (1UL << 3)
#endif

/*
 * Fallback for the user-notification interface. Everything from the return
 * value through struct seccomp_notif_sizes is provided as one unit when
 * SECCOMP_RET_USER_NOTIF is undefined.
 */
#ifndef SECCOMP_RET_USER_NOTIF
#define SECCOMP_RET_USER_NOTIF 0x7fc00000U

/* ioctl number builders; all use '!' as the ioctl magic. */
#define SECCOMP_IOC_MAGIC               '!'
#define SECCOMP_IO(nr)                  _IO(SECCOMP_IOC_MAGIC, nr)
#define SECCOMP_IOR(nr, type)           _IOR(SECCOMP_IOC_MAGIC, nr, type)
#define SECCOMP_IOW(nr, type)           _IOW(SECCOMP_IOC_MAGIC, nr, type)
#define SECCOMP_IOWR(nr, type)          _IOWR(SECCOMP_IOC_MAGIC, nr, type)

/* Flags for seccomp notification fd ioctl. */
#define SECCOMP_IOCTL_NOTIF_RECV        SECCOMP_IOWR(0, struct seccomp_notif)
#define SECCOMP_IOCTL_NOTIF_SEND        SECCOMP_IOWR(1, \
                                                struct seccomp_notif_resp)
#define SECCOMP_IOCTL_NOTIF_ID_VALID    SECCOMP_IOW(2, __u64)

/* Argument struct of SECCOMP_IOCTL_NOTIF_RECV. */
struct seccomp_notif {
        __u64 id;
        __u32 pid;
        __u32 flags;
        struct seccomp_data data;
};

/* Argument struct of SECCOMP_IOCTL_NOTIF_SEND. */
struct seccomp_notif_resp {
        __u64 id;
        __s64 val;
        __s32 error;
        __u32 flags;
};

/*
 * One 16-bit field per notification-related struct.
 * NOTE(review): presumably the result of SECCOMP_GET_NOTIF_SIZES -- confirm.
 */
struct seccomp_notif_sizes {
        __u16 seccomp_notif;
        __u16 seccomp_notif_resp;
        __u16 seccomp_data;
};
#endif
220
/*
 * Fallback for the ADDFD notification ioctl (number 3), its flag and its
 * argument struct, provided together when the ioctl constant is undefined.
 */
#ifndef SECCOMP_IOCTL_NOTIF_ADDFD
/* On success, the return value is the remote process's added fd number */
#define SECCOMP_IOCTL_NOTIF_ADDFD       SECCOMP_IOW(3,  \
                                                struct seccomp_notif_addfd)

/* valid flags for seccomp_notif_addfd */
#define SECCOMP_ADDFD_FLAG_SETFD        (1UL << 0) /* Specify remote fd */

struct seccomp_notif_addfd {
        __u64 id;
        __u32 flags;
        __u32 srcfd;
        __u32 newfd;
        __u32 newfd_flags;
};
#endif

/* Guarded separately from the block above, so it can be added on its own. */
#ifndef SECCOMP_ADDFD_FLAG_SEND
#define SECCOMP_ADDFD_FLAG_SEND (1UL << 1) /* Addfd and return it, atomically */
#endif
241
/*
 * Differently sized stand-ins for struct seccomp_notif_addfd, each paired
 * with an ioctl request that reuses ioctl number 3.
 * NOTE(review): presumably used to probe how the kernel handles ADDFD
 * arguments of unexpected size; the users are outside this chunk -- confirm.
 */

/* Smaller variant: only the id plus four bytes of payload. */
struct seccomp_notif_addfd_small {
        __u64 id;
        char weird[4];
};
#define SECCOMP_IOCTL_NOTIF_ADDFD_SMALL \
        SECCOMP_IOW(3, struct seccomp_notif_addfd_small)

/* Larger variant: the real struct overlaid on a buffer 8 bytes bigger. */
struct seccomp_notif_addfd_big {
        union {
                struct seccomp_notif_addfd addfd;
                char buf[sizeof(struct seccomp_notif_addfd) + 8];
        };
};
/* Built with SECCOMP_IOWR, unlike the SECCOMP_IOW used for the other two. */
#define SECCOMP_IOCTL_NOTIF_ADDFD_BIG   \
        SECCOMP_IOWR(3, struct seccomp_notif_addfd_big)
257
/* Fallback ptrace event-message values for syscall entry and exit stops. */
#ifndef PTRACE_EVENTMSG_SYSCALL_ENTRY
#define PTRACE_EVENTMSG_SYSCALL_ENTRY   1
#define PTRACE_EVENTMSG_SYSCALL_EXIT    2
#endif

/* Fallback for the user-notification response "continue" flag. */
#ifndef SECCOMP_USER_NOTIF_FLAG_CONTINUE
#define SECCOMP_USER_NOTIF_FLAG_CONTINUE 0x00000001
#endif

/* Fallback for the TSYNC_ESRCH filter flag (bit 4). */
#ifndef SECCOMP_FILTER_FLAG_TSYNC_ESRCH
#define SECCOMP_FILTER_FLAG_TSYNC_ESRCH (1UL << 4)
#endif
270
#ifndef seccomp
/*
 * Invoke the seccomp syscall directly through syscall().
 *
 * errno is reset to 0 before the call. The raw syscall() result is returned
 * to the caller, narrowed to int.
 */
int seccomp(unsigned int op, unsigned int flags, void *args)
{
        long rc;

        errno = 0;
        rc = syscall(__NR_seccomp, op, flags, args);

        return rc;
}
#endif
278
/*
 * syscall_arg(_n): byte offset, inside struct seccomp_data, of a 32-bit word
 * of syscall argument _n. On little-endian builds this is the start of
 * args[_n]; on big-endian builds it is sizeof(__u32) bytes further in. In
 * both layouts that selects the least-significant 32 bits of the 64-bit
 * argument. Any other byte order is a build error.
 */
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
#define syscall_arg(_n) (offsetof(struct seccomp_data, args[_n]))
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
#define syscall_arg(_n) (offsetof(struct seccomp_data, args[_n]) + sizeof(__u32))
#else
#error "wut? Unknown __BYTE_ORDER__?!"
#endif

/*
 * Marker values returned by helper threads as their exit status.
 * kill_thread() returns SIBLING_EXIT_UNKILLED when it was not asked to die
 * and SIBLING_EXIT_FAILURE when it was asked to die but kept running.
 * NOTE(review): SIBLING_EXIT_NEWPRIVS is not used in this part of the file.
 */
#define SIBLING_EXIT_UNKILLED   0xbadbeef
#define SIBLING_EXIT_FAILURE    0xbadface
#define SIBLING_EXIT_NEWPRIVS   0xbadfeed
290
291 static int __filecmp(pid_t pid1, pid_t pid2, int fd1, int fd2)
292 {
293 #ifdef __NR_kcmp
294         errno = 0;
295         return syscall(__NR_kcmp, pid1, pid2, KCMP_FILE, fd1, fd2);
296 #else
297         errno = ENOSYS;
298         return -1;
299 #endif
300 }
301
/*
 * filecmp() wraps __filecmp() in a macro so that TH_LOG reports the location
 * of the caller. A missing kcmp() syscall (negative result with ENOSYS) is
 * logged and turned into 0; any other result is passed through unchanged.
 */
#define filecmp(pid1, pid2, fd1, fd2)   ({              \
        int _ret = __filecmp(pid1, pid2, fd1, fd2);     \
                                                        \
        if (_ret < 0 && errno == ENOSYS) {              \
                TH_LOG("kcmp() syscall missing (test is less accurate)");\
                _ret = 0;                               \
        }                                               \
        _ret; })
314
315 TEST(kcmp)
316 {
317         int ret;
318
319         ret = __filecmp(getpid(), getpid(), 1, 1);
320         EXPECT_EQ(ret, 0);
321         if (ret != 0 && errno == ENOSYS)
322                 SKIP(return, "Kernel does not support kcmp() (missing CONFIG_KCMP?)");
323 }
324
325 TEST(mode_strict_support)
326 {
327         long ret;
328
329         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, NULL, NULL, NULL);
330         ASSERT_EQ(0, ret) {
331                 TH_LOG("Kernel does not support CONFIG_SECCOMP");
332         }
333         syscall(__NR_exit, 0);
334 }
335
336 TEST_SIGNAL(mode_strict_cannot_call_prctl, SIGKILL)
337 {
338         long ret;
339
340         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, NULL, NULL, NULL);
341         ASSERT_EQ(0, ret) {
342                 TH_LOG("Kernel does not support CONFIG_SECCOMP");
343         }
344         syscall(__NR_prctl, PR_SET_SECCOMP, SECCOMP_MODE_FILTER,
345                 NULL, NULL, NULL);
346         EXPECT_FALSE(true) {
347                 TH_LOG("Unreachable!");
348         }
349 }
350
351 /* Note! This doesn't test no new privs behavior */
352 TEST(no_new_privs_support)
353 {
354         long ret;
355
356         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
357         EXPECT_EQ(0, ret) {
358                 TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
359         }
360 }
361
362 /* Tests kernel support by checking for a copy_from_user() fault on NULL. */
363 TEST(mode_filter_support)
364 {
365         long ret;
366
367         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, NULL, 0, 0);
368         ASSERT_EQ(0, ret) {
369                 TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
370         }
371         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, NULL, NULL);
372         EXPECT_EQ(-1, ret);
373         EXPECT_EQ(EFAULT, errno) {
374                 TH_LOG("Kernel does not support CONFIG_SECCOMP_FILTER!");
375         }
376 }
377
378 TEST(mode_filter_without_nnp)
379 {
380         struct sock_filter filter[] = {
381                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
382         };
383         struct sock_fprog prog = {
384                 .len = (unsigned short)ARRAY_SIZE(filter),
385                 .filter = filter,
386         };
387         long ret;
388
389         ret = prctl(PR_GET_NO_NEW_PRIVS, 0, NULL, 0, 0);
390         ASSERT_LE(0, ret) {
391                 TH_LOG("Expected 0 or unsupported for NO_NEW_PRIVS");
392         }
393         errno = 0;
394         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
395         /* Succeeds with CAP_SYS_ADMIN, fails without */
396         /* TODO(wad) check caps not euid */
397         if (geteuid()) {
398                 EXPECT_EQ(-1, ret);
399                 EXPECT_EQ(EACCES, errno);
400         } else {
401                 EXPECT_EQ(0, ret);
402         }
403 }
404
405 #define MAX_INSNS_PER_PATH 32768
406
407 TEST(filter_size_limits)
408 {
409         int i;
410         int count = BPF_MAXINSNS + 1;
411         struct sock_filter allow[] = {
412                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
413         };
414         struct sock_filter *filter;
415         struct sock_fprog prog = { };
416         long ret;
417
418         filter = calloc(count, sizeof(*filter));
419         ASSERT_NE(NULL, filter);
420
421         for (i = 0; i < count; i++)
422                 filter[i] = allow[0];
423
424         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
425         ASSERT_EQ(0, ret);
426
427         prog.filter = filter;
428         prog.len = count;
429
430         /* Too many filter instructions in a single filter. */
431         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
432         ASSERT_NE(0, ret) {
433                 TH_LOG("Installing %d insn filter was allowed", prog.len);
434         }
435
436         /* One less is okay, though. */
437         prog.len -= 1;
438         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
439         ASSERT_EQ(0, ret) {
440                 TH_LOG("Installing %d insn filter wasn't allowed", prog.len);
441         }
442 }
443
444 TEST(filter_chain_limits)
445 {
446         int i;
447         int count = BPF_MAXINSNS;
448         struct sock_filter allow[] = {
449                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
450         };
451         struct sock_filter *filter;
452         struct sock_fprog prog = { };
453         long ret;
454
455         filter = calloc(count, sizeof(*filter));
456         ASSERT_NE(NULL, filter);
457
458         for (i = 0; i < count; i++)
459                 filter[i] = allow[0];
460
461         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
462         ASSERT_EQ(0, ret);
463
464         prog.filter = filter;
465         prog.len = 1;
466
467         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
468         ASSERT_EQ(0, ret);
469
470         prog.len = count;
471
472         /* Too many total filter instructions. */
473         for (i = 0; i < MAX_INSNS_PER_PATH; i++) {
474                 ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
475                 if (ret != 0)
476                         break;
477         }
478         ASSERT_NE(0, ret) {
479                 TH_LOG("Allowed %d %d-insn filters (total with penalties:%d)",
480                        i, count, i * (count + 4));
481         }
482 }
483
484 TEST(mode_filter_cannot_move_to_strict)
485 {
486         struct sock_filter filter[] = {
487                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
488         };
489         struct sock_fprog prog = {
490                 .len = (unsigned short)ARRAY_SIZE(filter),
491                 .filter = filter,
492         };
493         long ret;
494
495         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
496         ASSERT_EQ(0, ret);
497
498         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
499         ASSERT_EQ(0, ret);
500
501         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, NULL, 0, 0);
502         EXPECT_EQ(-1, ret);
503         EXPECT_EQ(EINVAL, errno);
504 }
505
506
507 TEST(mode_filter_get_seccomp)
508 {
509         struct sock_filter filter[] = {
510                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
511         };
512         struct sock_fprog prog = {
513                 .len = (unsigned short)ARRAY_SIZE(filter),
514                 .filter = filter,
515         };
516         long ret;
517
518         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
519         ASSERT_EQ(0, ret);
520
521         ret = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);
522         EXPECT_EQ(0, ret);
523
524         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
525         ASSERT_EQ(0, ret);
526
527         ret = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);
528         EXPECT_EQ(2, ret);
529 }
530
531
532 TEST(ALLOW_all)
533 {
534         struct sock_filter filter[] = {
535                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
536         };
537         struct sock_fprog prog = {
538                 .len = (unsigned short)ARRAY_SIZE(filter),
539                 .filter = filter,
540         };
541         long ret;
542
543         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
544         ASSERT_EQ(0, ret);
545
546         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
547         ASSERT_EQ(0, ret);
548 }
549
550 TEST(empty_prog)
551 {
552         struct sock_filter filter[] = {
553         };
554         struct sock_fprog prog = {
555                 .len = (unsigned short)ARRAY_SIZE(filter),
556                 .filter = filter,
557         };
558         long ret;
559
560         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
561         ASSERT_EQ(0, ret);
562
563         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
564         EXPECT_EQ(-1, ret);
565         EXPECT_EQ(EINVAL, errno);
566 }
567
568 TEST(log_all)
569 {
570         struct sock_filter filter[] = {
571                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_LOG),
572         };
573         struct sock_fprog prog = {
574                 .len = (unsigned short)ARRAY_SIZE(filter),
575                 .filter = filter,
576         };
577         long ret;
578         pid_t parent = getppid();
579
580         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
581         ASSERT_EQ(0, ret);
582
583         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
584         ASSERT_EQ(0, ret);
585
586         /* getppid() should succeed and be logged (no check for logging) */
587         EXPECT_EQ(parent, syscall(__NR_getppid));
588 }
589
590 TEST_SIGNAL(unknown_ret_is_kill_inside, SIGSYS)
591 {
592         struct sock_filter filter[] = {
593                 BPF_STMT(BPF_RET|BPF_K, 0x10000000U),
594         };
595         struct sock_fprog prog = {
596                 .len = (unsigned short)ARRAY_SIZE(filter),
597                 .filter = filter,
598         };
599         long ret;
600
601         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
602         ASSERT_EQ(0, ret);
603
604         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
605         ASSERT_EQ(0, ret);
606         EXPECT_EQ(0, syscall(__NR_getpid)) {
607                 TH_LOG("getpid() shouldn't ever return");
608         }
609 }
610
611 /* return code >= 0x80000000 is unused. */
612 TEST_SIGNAL(unknown_ret_is_kill_above_allow, SIGSYS)
613 {
614         struct sock_filter filter[] = {
615                 BPF_STMT(BPF_RET|BPF_K, 0x90000000U),
616         };
617         struct sock_fprog prog = {
618                 .len = (unsigned short)ARRAY_SIZE(filter),
619                 .filter = filter,
620         };
621         long ret;
622
623         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
624         ASSERT_EQ(0, ret);
625
626         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
627         ASSERT_EQ(0, ret);
628         EXPECT_EQ(0, syscall(__NR_getpid)) {
629                 TH_LOG("getpid() shouldn't ever return");
630         }
631 }
632
633 TEST_SIGNAL(KILL_all, SIGSYS)
634 {
635         struct sock_filter filter[] = {
636                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
637         };
638         struct sock_fprog prog = {
639                 .len = (unsigned short)ARRAY_SIZE(filter),
640                 .filter = filter,
641         };
642         long ret;
643
644         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
645         ASSERT_EQ(0, ret);
646
647         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
648         ASSERT_EQ(0, ret);
649 }
650
651 TEST_SIGNAL(KILL_one, SIGSYS)
652 {
653         struct sock_filter filter[] = {
654                 BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
655                         offsetof(struct seccomp_data, nr)),
656                 BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 0, 1),
657                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
658                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
659         };
660         struct sock_fprog prog = {
661                 .len = (unsigned short)ARRAY_SIZE(filter),
662                 .filter = filter,
663         };
664         long ret;
665         pid_t parent = getppid();
666
667         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
668         ASSERT_EQ(0, ret);
669
670         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
671         ASSERT_EQ(0, ret);
672
673         EXPECT_EQ(parent, syscall(__NR_getppid));
674         /* getpid() should never return. */
675         EXPECT_EQ(0, syscall(__NR_getpid));
676 }
677
678 TEST_SIGNAL(KILL_one_arg_one, SIGSYS)
679 {
680         void *fatal_address;
681         struct sock_filter filter[] = {
682                 BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
683                         offsetof(struct seccomp_data, nr)),
684                 BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_times, 1, 0),
685                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
686                 /* Only both with lower 32-bit for now. */
687                 BPF_STMT(BPF_LD|BPF_W|BPF_ABS, syscall_arg(0)),
688                 BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K,
689                         (unsigned long)&fatal_address, 0, 1),
690                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
691                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
692         };
693         struct sock_fprog prog = {
694                 .len = (unsigned short)ARRAY_SIZE(filter),
695                 .filter = filter,
696         };
697         long ret;
698         pid_t parent = getppid();
699         struct tms timebuf;
700         clock_t clock = times(&timebuf);
701
702         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
703         ASSERT_EQ(0, ret);
704
705         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
706         ASSERT_EQ(0, ret);
707
708         EXPECT_EQ(parent, syscall(__NR_getppid));
709         EXPECT_LE(clock, syscall(__NR_times, &timebuf));
710         /* times() should never return. */
711         EXPECT_EQ(0, syscall(__NR_times, &fatal_address));
712 }
713
714 TEST_SIGNAL(KILL_one_arg_six, SIGSYS)
715 {
716 #ifndef __NR_mmap2
717         int sysno = __NR_mmap;
718 #else
719         int sysno = __NR_mmap2;
720 #endif
721         struct sock_filter filter[] = {
722                 BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
723                         offsetof(struct seccomp_data, nr)),
724                 BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, sysno, 1, 0),
725                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
726                 /* Only both with lower 32-bit for now. */
727                 BPF_STMT(BPF_LD|BPF_W|BPF_ABS, syscall_arg(5)),
728                 BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, 0x0C0FFEE, 0, 1),
729                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
730                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
731         };
732         struct sock_fprog prog = {
733                 .len = (unsigned short)ARRAY_SIZE(filter),
734                 .filter = filter,
735         };
736         long ret;
737         pid_t parent = getppid();
738         int fd;
739         void *map1, *map2;
740         int page_size = sysconf(_SC_PAGESIZE);
741
742         ASSERT_LT(0, page_size);
743
744         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
745         ASSERT_EQ(0, ret);
746
747         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
748         ASSERT_EQ(0, ret);
749
750         fd = open("/dev/zero", O_RDONLY);
751         ASSERT_NE(-1, fd);
752
753         EXPECT_EQ(parent, syscall(__NR_getppid));
754         map1 = (void *)syscall(sysno,
755                 NULL, page_size, PROT_READ, MAP_PRIVATE, fd, page_size);
756         EXPECT_NE(MAP_FAILED, map1);
757         /* mmap2() should never return. */
758         map2 = (void *)syscall(sysno,
759                  NULL, page_size, PROT_READ, MAP_PRIVATE, fd, 0x0C0FFEE);
760         EXPECT_EQ(MAP_FAILED, map2);
761
762         /* The test failed, so clean up the resources. */
763         munmap(map1, page_size);
764         munmap(map2, page_size);
765         close(fd);
766 }
767
768 /* This is a thread task to die via seccomp filter violation. */
769 void *kill_thread(void *data)
770 {
771         bool die = (bool)data;
772
773         if (die) {
774                 prctl(PR_GET_SECCOMP, 0, 0, 0, 0);
775                 return (void *)SIBLING_EXIT_FAILURE;
776         }
777
778         return (void *)SIBLING_EXIT_UNKILLED;
779 }
780
/* Selects which filter kill_thread_or_group() installs. */
enum kill_t {
        /* Filter returning SECCOMP_RET_KILL_THREAD on prctl(). */
        KILL_THREAD,
        /* Filter returning SECCOMP_RET_KILL_PROCESS on prctl(). */
        KILL_PROCESS,
        /* Filter returning a value that is not a defined action. */
        RET_UNKNOWN
};
786
787 /* Prepare a thread that will kill itself or both of us. */
788 void kill_thread_or_group(struct __test_metadata *_metadata,
789                           enum kill_t kill_how)
790 {
791         pthread_t thread;
792         void *status;
793         /* Kill only when calling __NR_prctl. */
794         struct sock_filter filter_thread[] = {
795                 BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
796                         offsetof(struct seccomp_data, nr)),
797                 BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_prctl, 0, 1),
798                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL_THREAD),
799                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
800         };
801         struct sock_fprog prog_thread = {
802                 .len = (unsigned short)ARRAY_SIZE(filter_thread),
803                 .filter = filter_thread,
804         };
805         int kill = kill_how == KILL_PROCESS ? SECCOMP_RET_KILL_PROCESS : 0xAAAAAAAAA;
806         struct sock_filter filter_process[] = {
807                 BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
808                         offsetof(struct seccomp_data, nr)),
809                 BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_prctl, 0, 1),
810                 BPF_STMT(BPF_RET|BPF_K, kill),
811                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
812         };
813         struct sock_fprog prog_process = {
814                 .len = (unsigned short)ARRAY_SIZE(filter_process),
815                 .filter = filter_process,
816         };
817
818         ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
819                 TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
820         }
821
822         ASSERT_EQ(0, seccomp(SECCOMP_SET_MODE_FILTER, 0,
823                              kill_how == KILL_THREAD ? &prog_thread
824                                                      : &prog_process));
825
826         /*
827          * Add the KILL_THREAD rule again to make sure that the KILL_PROCESS
828          * flag cannot be downgraded by a new filter.
829          */
830         if (kill_how == KILL_PROCESS)
831                 ASSERT_EQ(0, seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog_thread));
832
833         /* Start a thread that will exit immediately. */
834         ASSERT_EQ(0, pthread_create(&thread, NULL, kill_thread, (void *)false));
835         ASSERT_EQ(0, pthread_join(thread, &status));
836         ASSERT_EQ(SIBLING_EXIT_UNKILLED, (unsigned long)status);
837
838         /* Start a thread that will die immediately. */
839         ASSERT_EQ(0, pthread_create(&thread, NULL, kill_thread, (void *)true));
840         ASSERT_EQ(0, pthread_join(thread, &status));
841         ASSERT_NE(SIBLING_EXIT_FAILURE, (unsigned long)status);
842
843         /*
844          * If we get here, only the spawned thread died. Let the parent know
845          * the whole process didn't die (i.e. this thread, the spawner,
846          * stayed running).
847          */
848         exit(42);
849 }
850
851 TEST(KILL_thread)
852 {
853         int status;
854         pid_t child_pid;
855
856         child_pid = fork();
857         ASSERT_LE(0, child_pid);
858         if (child_pid == 0) {
859                 kill_thread_or_group(_metadata, KILL_THREAD);
860                 _exit(38);
861         }
862
863         ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
864
865         /* If only the thread was killed, we'll see exit 42. */
866         ASSERT_TRUE(WIFEXITED(status));
867         ASSERT_EQ(42, WEXITSTATUS(status));
868 }
869
870 TEST(KILL_process)
871 {
872         int status;
873         pid_t child_pid;
874
875         child_pid = fork();
876         ASSERT_LE(0, child_pid);
877         if (child_pid == 0) {
878                 kill_thread_or_group(_metadata, KILL_PROCESS);
879                 _exit(38);
880         }
881
882         ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
883
884         /* If the entire process was killed, we'll see SIGSYS. */
885         ASSERT_TRUE(WIFSIGNALED(status));
886         ASSERT_EQ(SIGSYS, WTERMSIG(status));
887 }
888
889 TEST(KILL_unknown)
890 {
891         int status;
892         pid_t child_pid;
893
894         child_pid = fork();
895         ASSERT_LE(0, child_pid);
896         if (child_pid == 0) {
897                 kill_thread_or_group(_metadata, RET_UNKNOWN);
898                 _exit(38);
899         }
900
901         ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
902
903         /* If the entire process was killed, we'll see SIGSYS. */
904         EXPECT_TRUE(WIFSIGNALED(status)) {
905                 TH_LOG("Unknown SECCOMP_RET is only killing the thread?");
906         }
907         ASSERT_EQ(SIGSYS, WTERMSIG(status));
908 }
909
910 /* TODO(wad) add 64-bit versus 32-bit arg tests. */
911 TEST(arg_out_of_range)
912 {
913         struct sock_filter filter[] = {
914                 BPF_STMT(BPF_LD|BPF_W|BPF_ABS, syscall_arg(6)),
915                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
916         };
917         struct sock_fprog prog = {
918                 .len = (unsigned short)ARRAY_SIZE(filter),
919                 .filter = filter,
920         };
921         long ret;
922
923         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
924         ASSERT_EQ(0, ret);
925
926         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
927         EXPECT_EQ(-1, ret);
928         EXPECT_EQ(EINVAL, errno);
929 }
930
/*
 * ERRNO_FILTER(name, errno_val) declares two locals:
 *   _read_filter_<name>: a BPF program returning
 *                        SECCOMP_RET_ERRNO | errno_val for read() and
 *                        SECCOMP_RET_ALLOW for every other syscall;
 *   prog_<name>:         the struct sock_fprog describing that program.
 *
 * The value parameter is not called "errno" so that it cannot be confused
 * with the <errno.h> macro, and it is parenthesized so that any expression
 * can be passed safely.
 */
#define ERRNO_FILTER(name, errno_val)                                   \
        struct sock_filter _read_filter_##name[] = {                    \
                BPF_STMT(BPF_LD|BPF_W|BPF_ABS,                          \
                        offsetof(struct seccomp_data, nr)),             \
                BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 0, 1),       \
                BPF_STMT(BPF_RET|BPF_K,                                 \
                        SECCOMP_RET_ERRNO | (errno_val)),               \
                BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),             \
        };                                                              \
        struct sock_fprog prog_##name = {                               \
                .len = (unsigned short)ARRAY_SIZE(_read_filter_##name), \
                .filter = _read_filter_##name,                          \
        }
943
944 /* Make sure basic errno values are correctly passed through a filter. */
945 TEST(ERRNO_valid)
946 {
947         ERRNO_FILTER(valid, E2BIG);
948         long ret;
949         pid_t parent = getppid();
950
951         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
952         ASSERT_EQ(0, ret);
953
954         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_valid);
955         ASSERT_EQ(0, ret);
956
957         EXPECT_EQ(parent, syscall(__NR_getppid));
958         EXPECT_EQ(-1, read(0, NULL, 0));
959         EXPECT_EQ(E2BIG, errno);
960 }
961
962 /* Make sure an errno of zero is correctly handled by the arch code. */
963 TEST(ERRNO_zero)
964 {
965         ERRNO_FILTER(zero, 0);
966         long ret;
967         pid_t parent = getppid();
968
969         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
970         ASSERT_EQ(0, ret);
971
972         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_zero);
973         ASSERT_EQ(0, ret);
974
975         EXPECT_EQ(parent, syscall(__NR_getppid));
976         /* "errno" of 0 is ok. */
977         EXPECT_EQ(0, read(0, NULL, 0));
978 }
979
980 /*
981  * The SECCOMP_RET_DATA mask is 16 bits wide, but errno is smaller.
982  * This tests that the errno value gets capped correctly, fixed by
983  * 580c57f10768 ("seccomp: cap SECCOMP_RET_ERRNO data to MAX_ERRNO").
984  */
985 TEST(ERRNO_capped)
986 {
987         ERRNO_FILTER(capped, 4096);
988         long ret;
989         pid_t parent = getppid();
990
991         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
992         ASSERT_EQ(0, ret);
993
994         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_capped);
995         ASSERT_EQ(0, ret);
996
997         EXPECT_EQ(parent, syscall(__NR_getppid));
998         EXPECT_EQ(-1, read(0, NULL, 0));
999         EXPECT_EQ(4095, errno);
1000 }
1001
1002 /*
1003  * Filters are processed in reverse order: last applied is executed first.
1004  * Since only the SECCOMP_RET_ACTION mask is tested for return values, the
1005  * SECCOMP_RET_DATA mask results will follow the most recently applied
1006  * matching filter return (and not the lowest or highest value).
1007  */
1008 TEST(ERRNO_order)
1009 {
1010         ERRNO_FILTER(first,  11);
1011         ERRNO_FILTER(second, 13);
1012         ERRNO_FILTER(third,  12);
1013         long ret;
1014         pid_t parent = getppid();
1015
1016         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1017         ASSERT_EQ(0, ret);
1018
1019         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_first);
1020         ASSERT_EQ(0, ret);
1021
1022         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_second);
1023         ASSERT_EQ(0, ret);
1024
1025         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_third);
1026         ASSERT_EQ(0, ret);
1027
1028         EXPECT_EQ(parent, syscall(__NR_getppid));
1029         EXPECT_EQ(-1, read(0, NULL, 0));
1030         EXPECT_EQ(12, errno);
1031 }
1032
1033 FIXTURE(TRAP) {
1034         struct sock_fprog prog;
1035 };
1036
1037 FIXTURE_SETUP(TRAP)
1038 {
1039         struct sock_filter filter[] = {
1040                 BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
1041                         offsetof(struct seccomp_data, nr)),
1042                 BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 0, 1),
1043                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRAP),
1044                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1045         };
1046
1047         memset(&self->prog, 0, sizeof(self->prog));
1048         self->prog.filter = malloc(sizeof(filter));
1049         ASSERT_NE(NULL, self->prog.filter);
1050         memcpy(self->prog.filter, filter, sizeof(filter));
1051         self->prog.len = (unsigned short)ARRAY_SIZE(filter);
1052 }
1053
1054 FIXTURE_TEARDOWN(TRAP)
1055 {
1056         if (self->prog.filter)
1057                 free(self->prog.filter);
1058 }
1059
1060 TEST_F_SIGNAL(TRAP, dfl, SIGSYS)
1061 {
1062         long ret;
1063
1064         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1065         ASSERT_EQ(0, ret);
1066
1067         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog);
1068         ASSERT_EQ(0, ret);
1069         syscall(__NR_getpid);
1070 }
1071
1072 /* Ensure that SIGSYS overrides SIG_IGN */
1073 TEST_F_SIGNAL(TRAP, ign, SIGSYS)
1074 {
1075         long ret;
1076
1077         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1078         ASSERT_EQ(0, ret);
1079
1080         signal(SIGSYS, SIG_IGN);
1081
1082         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog);
1083         ASSERT_EQ(0, ret);
1084         syscall(__NR_getpid);
1085 }
1086
1087 static siginfo_t TRAP_info;
1088 static volatile int TRAP_nr;
1089 static void TRAP_action(int nr, siginfo_t *info, void *void_context)
1090 {
1091         memcpy(&TRAP_info, info, sizeof(TRAP_info));
1092         TRAP_nr = nr;
1093 }
1094
1095 TEST_F(TRAP, handler)
1096 {
1097         int ret, test;
1098         struct sigaction act;
1099         sigset_t mask;
1100
1101         memset(&act, 0, sizeof(act));
1102         sigemptyset(&mask);
1103         sigaddset(&mask, SIGSYS);
1104
1105         act.sa_sigaction = &TRAP_action;
1106         act.sa_flags = SA_SIGINFO;
1107         ret = sigaction(SIGSYS, &act, NULL);
1108         ASSERT_EQ(0, ret) {
1109                 TH_LOG("sigaction failed");
1110         }
1111         ret = sigprocmask(SIG_UNBLOCK, &mask, NULL);
1112         ASSERT_EQ(0, ret) {
1113                 TH_LOG("sigprocmask failed");
1114         }
1115
1116         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1117         ASSERT_EQ(0, ret);
1118         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog);
1119         ASSERT_EQ(0, ret);
1120         TRAP_nr = 0;
1121         memset(&TRAP_info, 0, sizeof(TRAP_info));
1122         /* Expect the registers to be rolled back. (nr = error) may vary
1123          * based on arch. */
1124         ret = syscall(__NR_getpid);
1125         /* Silence gcc warning about volatile. */
1126         test = TRAP_nr;
1127         EXPECT_EQ(SIGSYS, test);
1128         struct local_sigsys {
1129                 void *_call_addr;       /* calling user insn */
1130                 int _syscall;           /* triggering system call number */
1131                 unsigned int _arch;     /* AUDIT_ARCH_* of syscall */
1132         } *sigsys = (struct local_sigsys *)
1133 #ifdef si_syscall
1134                 &(TRAP_info.si_call_addr);
1135 #else
1136                 &TRAP_info.si_pid;
1137 #endif
1138         EXPECT_EQ(__NR_getpid, sigsys->_syscall);
1139         /* Make sure arch is non-zero. */
1140         EXPECT_NE(0, sigsys->_arch);
1141         EXPECT_NE(0, (unsigned long)sigsys->_call_addr);
1142 }
1143
1144 FIXTURE(precedence) {
1145         struct sock_fprog allow;
1146         struct sock_fprog log;
1147         struct sock_fprog trace;
1148         struct sock_fprog error;
1149         struct sock_fprog trap;
1150         struct sock_fprog kill;
1151 };
1152
1153 FIXTURE_SETUP(precedence)
1154 {
1155         struct sock_filter allow_insns[] = {
1156                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1157         };
1158         struct sock_filter log_insns[] = {
1159                 BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
1160                         offsetof(struct seccomp_data, nr)),
1161                 BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 1, 0),
1162                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1163                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_LOG),
1164         };
1165         struct sock_filter trace_insns[] = {
1166                 BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
1167                         offsetof(struct seccomp_data, nr)),
1168                 BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 1, 0),
1169                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1170                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE),
1171         };
1172         struct sock_filter error_insns[] = {
1173                 BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
1174                         offsetof(struct seccomp_data, nr)),
1175                 BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 1, 0),
1176                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1177                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO),
1178         };
1179         struct sock_filter trap_insns[] = {
1180                 BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
1181                         offsetof(struct seccomp_data, nr)),
1182                 BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 1, 0),
1183                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1184                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRAP),
1185         };
1186         struct sock_filter kill_insns[] = {
1187                 BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
1188                         offsetof(struct seccomp_data, nr)),
1189                 BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 1, 0),
1190                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1191                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
1192         };
1193
1194         memset(self, 0, sizeof(*self));
1195 #define FILTER_ALLOC(_x) \
1196         self->_x.filter = malloc(sizeof(_x##_insns)); \
1197         ASSERT_NE(NULL, self->_x.filter); \
1198         memcpy(self->_x.filter, &_x##_insns, sizeof(_x##_insns)); \
1199         self->_x.len = (unsigned short)ARRAY_SIZE(_x##_insns)
1200         FILTER_ALLOC(allow);
1201         FILTER_ALLOC(log);
1202         FILTER_ALLOC(trace);
1203         FILTER_ALLOC(error);
1204         FILTER_ALLOC(trap);
1205         FILTER_ALLOC(kill);
1206 }
1207
1208 FIXTURE_TEARDOWN(precedence)
1209 {
1210 #define FILTER_FREE(_x) if (self->_x.filter) free(self->_x.filter)
1211         FILTER_FREE(allow);
1212         FILTER_FREE(log);
1213         FILTER_FREE(trace);
1214         FILTER_FREE(error);
1215         FILTER_FREE(trap);
1216         FILTER_FREE(kill);
1217 }
1218
1219 TEST_F(precedence, allow_ok)
1220 {
1221         pid_t parent, res = 0;
1222         long ret;
1223
1224         parent = getppid();
1225         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1226         ASSERT_EQ(0, ret);
1227
1228         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1229         ASSERT_EQ(0, ret);
1230         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1231         ASSERT_EQ(0, ret);
1232         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1233         ASSERT_EQ(0, ret);
1234         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
1235         ASSERT_EQ(0, ret);
1236         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trap);
1237         ASSERT_EQ(0, ret);
1238         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->kill);
1239         ASSERT_EQ(0, ret);
1240         /* Should work just fine. */
1241         res = syscall(__NR_getppid);
1242         EXPECT_EQ(parent, res);
1243 }
1244
1245 TEST_F_SIGNAL(precedence, kill_is_highest, SIGSYS)
1246 {
1247         pid_t parent, res = 0;
1248         long ret;
1249
1250         parent = getppid();
1251         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1252         ASSERT_EQ(0, ret);
1253
1254         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1255         ASSERT_EQ(0, ret);
1256         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1257         ASSERT_EQ(0, ret);
1258         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1259         ASSERT_EQ(0, ret);
1260         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
1261         ASSERT_EQ(0, ret);
1262         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trap);
1263         ASSERT_EQ(0, ret);
1264         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->kill);
1265         ASSERT_EQ(0, ret);
1266         /* Should work just fine. */
1267         res = syscall(__NR_getppid);
1268         EXPECT_EQ(parent, res);
1269         /* getpid() should never return. */
1270         res = syscall(__NR_getpid);
1271         EXPECT_EQ(0, res);
1272 }
1273
1274 TEST_F_SIGNAL(precedence, kill_is_highest_in_any_order, SIGSYS)
1275 {
1276         pid_t parent;
1277         long ret;
1278
1279         parent = getppid();
1280         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1281         ASSERT_EQ(0, ret);
1282
1283         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1284         ASSERT_EQ(0, ret);
1285         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->kill);
1286         ASSERT_EQ(0, ret);
1287         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
1288         ASSERT_EQ(0, ret);
1289         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1290         ASSERT_EQ(0, ret);
1291         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1292         ASSERT_EQ(0, ret);
1293         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trap);
1294         ASSERT_EQ(0, ret);
1295         /* Should work just fine. */
1296         EXPECT_EQ(parent, syscall(__NR_getppid));
1297         /* getpid() should never return. */
1298         EXPECT_EQ(0, syscall(__NR_getpid));
1299 }
1300
1301 TEST_F_SIGNAL(precedence, trap_is_second, SIGSYS)
1302 {
1303         pid_t parent;
1304         long ret;
1305
1306         parent = getppid();
1307         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1308         ASSERT_EQ(0, ret);
1309
1310         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1311         ASSERT_EQ(0, ret);
1312         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1313         ASSERT_EQ(0, ret);
1314         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1315         ASSERT_EQ(0, ret);
1316         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
1317         ASSERT_EQ(0, ret);
1318         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trap);
1319         ASSERT_EQ(0, ret);
1320         /* Should work just fine. */
1321         EXPECT_EQ(parent, syscall(__NR_getppid));
1322         /* getpid() should never return. */
1323         EXPECT_EQ(0, syscall(__NR_getpid));
1324 }
1325
1326 TEST_F_SIGNAL(precedence, trap_is_second_in_any_order, SIGSYS)
1327 {
1328         pid_t parent;
1329         long ret;
1330
1331         parent = getppid();
1332         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1333         ASSERT_EQ(0, ret);
1334
1335         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1336         ASSERT_EQ(0, ret);
1337         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trap);
1338         ASSERT_EQ(0, ret);
1339         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1340         ASSERT_EQ(0, ret);
1341         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1342         ASSERT_EQ(0, ret);
1343         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
1344         ASSERT_EQ(0, ret);
1345         /* Should work just fine. */
1346         EXPECT_EQ(parent, syscall(__NR_getppid));
1347         /* getpid() should never return. */
1348         EXPECT_EQ(0, syscall(__NR_getpid));
1349 }
1350
1351 TEST_F(precedence, errno_is_third)
1352 {
1353         pid_t parent;
1354         long ret;
1355
1356         parent = getppid();
1357         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1358         ASSERT_EQ(0, ret);
1359
1360         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1361         ASSERT_EQ(0, ret);
1362         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1363         ASSERT_EQ(0, ret);
1364         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1365         ASSERT_EQ(0, ret);
1366         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
1367         ASSERT_EQ(0, ret);
1368         /* Should work just fine. */
1369         EXPECT_EQ(parent, syscall(__NR_getppid));
1370         EXPECT_EQ(0, syscall(__NR_getpid));
1371 }
1372
1373 TEST_F(precedence, errno_is_third_in_any_order)
1374 {
1375         pid_t parent;
1376         long ret;
1377
1378         parent = getppid();
1379         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1380         ASSERT_EQ(0, ret);
1381
1382         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1383         ASSERT_EQ(0, ret);
1384         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
1385         ASSERT_EQ(0, ret);
1386         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1387         ASSERT_EQ(0, ret);
1388         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1389         ASSERT_EQ(0, ret);
1390         /* Should work just fine. */
1391         EXPECT_EQ(parent, syscall(__NR_getppid));
1392         EXPECT_EQ(0, syscall(__NR_getpid));
1393 }
1394
1395 TEST_F(precedence, trace_is_fourth)
1396 {
1397         pid_t parent;
1398         long ret;
1399
1400         parent = getppid();
1401         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1402         ASSERT_EQ(0, ret);
1403
1404         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1405         ASSERT_EQ(0, ret);
1406         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1407         ASSERT_EQ(0, ret);
1408         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1409         ASSERT_EQ(0, ret);
1410         /* Should work just fine. */
1411         EXPECT_EQ(parent, syscall(__NR_getppid));
1412         /* No ptracer */
1413         EXPECT_EQ(-1, syscall(__NR_getpid));
1414 }
1415
1416 TEST_F(precedence, trace_is_fourth_in_any_order)
1417 {
1418         pid_t parent;
1419         long ret;
1420
1421         parent = getppid();
1422         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1423         ASSERT_EQ(0, ret);
1424
1425         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1426         ASSERT_EQ(0, ret);
1427         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1428         ASSERT_EQ(0, ret);
1429         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1430         ASSERT_EQ(0, ret);
1431         /* Should work just fine. */
1432         EXPECT_EQ(parent, syscall(__NR_getppid));
1433         /* No ptracer */
1434         EXPECT_EQ(-1, syscall(__NR_getpid));
1435 }
1436
1437 TEST_F(precedence, log_is_fifth)
1438 {
1439         pid_t mypid, parent;
1440         long ret;
1441
1442         mypid = getpid();
1443         parent = getppid();
1444         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1445         ASSERT_EQ(0, ret);
1446
1447         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1448         ASSERT_EQ(0, ret);
1449         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1450         ASSERT_EQ(0, ret);
1451         /* Should work just fine. */
1452         EXPECT_EQ(parent, syscall(__NR_getppid));
1453         /* Should also work just fine */
1454         EXPECT_EQ(mypid, syscall(__NR_getpid));
1455 }
1456
1457 TEST_F(precedence, log_is_fifth_in_any_order)
1458 {
1459         pid_t mypid, parent;
1460         long ret;
1461
1462         mypid = getpid();
1463         parent = getppid();
1464         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1465         ASSERT_EQ(0, ret);
1466
1467         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1468         ASSERT_EQ(0, ret);
1469         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1470         ASSERT_EQ(0, ret);
1471         /* Should work just fine. */
1472         EXPECT_EQ(parent, syscall(__NR_getppid));
1473         /* Should also work just fine */
1474         EXPECT_EQ(mypid, syscall(__NR_getpid));
1475 }
1476
/* Fallback for headers that predate PTRACE_O_TRACESECCOMP. */
#ifndef PTRACE_O_TRACESECCOMP
#define PTRACE_O_TRACESECCOMP	0x00000080
#endif

/* Catch the Ubuntu 12.04 value error. */
#if PTRACE_EVENT_SECCOMP != 7
#undef PTRACE_EVENT_SECCOMP
#endif

#ifndef PTRACE_EVENT_SECCOMP
#define PTRACE_EVENT_SECCOMP 7
#endif

/* Bits above the low 16 of a wait status (holds the ptrace event code). */
#define PTRACE_EVENT_MASK(status) ((status) >> 16)

/*
 * Run flag for the tracer loop in start_tracer(). It is cleared from the
 * SIGUSR1 handler below and polled by the loop, so it must be a
 * volatile sig_atomic_t: that is the only object type C guarantees can be
 * written from an asynchronous signal handler and observed by the
 * interrupted code.
 */
volatile sig_atomic_t tracer_running;

/* SIGUSR1 handler: ask the tracer loop to stop. @sig is unused. */
void tracer_stop(int sig)
{
	tracer_running = false;
}

/*
 * Callback invoked by start_tracer() for each tracee stop it handles.
 * @status is the raw wait status; @args is the opaque pointer that was
 * handed to start_tracer().
 */
typedef void tracer_func_t(struct __test_metadata *_metadata,
			   pid_t tracee, int status, void *args);
1499
1500 void start_tracer(struct __test_metadata *_metadata, int fd, pid_t tracee,
1501             tracer_func_t tracer_func, void *args, bool ptrace_syscall)
1502 {
1503         int ret = -1;
1504         struct sigaction action = {
1505                 .sa_handler = tracer_stop,
1506         };
1507
1508         /* Allow external shutdown. */
1509         tracer_running = true;
1510         ASSERT_EQ(0, sigaction(SIGUSR1, &action, NULL));
1511
1512         errno = 0;
1513         while (ret == -1 && errno != EINVAL)
1514                 ret = ptrace(PTRACE_ATTACH, tracee, NULL, 0);
1515         ASSERT_EQ(0, ret) {
1516                 kill(tracee, SIGKILL);
1517         }
1518         /* Wait for attach stop */
1519         wait(NULL);
1520
1521         ret = ptrace(PTRACE_SETOPTIONS, tracee, NULL, ptrace_syscall ?
1522                                                       PTRACE_O_TRACESYSGOOD :
1523                                                       PTRACE_O_TRACESECCOMP);
1524         ASSERT_EQ(0, ret) {
1525                 TH_LOG("Failed to set PTRACE_O_TRACESECCOMP");
1526                 kill(tracee, SIGKILL);
1527         }
1528         ret = ptrace(ptrace_syscall ? PTRACE_SYSCALL : PTRACE_CONT,
1529                      tracee, NULL, 0);
1530         ASSERT_EQ(0, ret);
1531
1532         /* Unblock the tracee */
1533         ASSERT_EQ(1, write(fd, "A", 1));
1534         ASSERT_EQ(0, close(fd));
1535
1536         /* Run until we're shut down. Must assert to stop execution. */
1537         while (tracer_running) {
1538                 int status;
1539
1540                 if (wait(&status) != tracee)
1541                         continue;
1542
1543                 if (WIFSIGNALED(status)) {
1544                         /* Child caught a fatal signal. */
1545                         return;
1546                 }
1547                 if (WIFEXITED(status)) {
1548                         /* Child exited with code. */
1549                         return;
1550                 }
1551
1552                 /* Check if we got an expected event. */
1553                 ASSERT_EQ(WIFCONTINUED(status), false);
1554                 ASSERT_EQ(WIFSTOPPED(status), true);
1555                 ASSERT_EQ(WSTOPSIG(status) & SIGTRAP, SIGTRAP) {
1556                         TH_LOG("Unexpected WSTOPSIG: %d", WSTOPSIG(status));
1557                 }
1558
1559                 tracer_func(_metadata, tracee, status, args);
1560
1561                 ret = ptrace(ptrace_syscall ? PTRACE_SYSCALL : PTRACE_CONT,
1562                              tracee, NULL, 0);
1563                 ASSERT_EQ(0, ret);
1564         }
1565         /* Directly report the status of our test harness results. */
1566         syscall(__NR_exit, _metadata->passed ? EXIT_SUCCESS : EXIT_FAILURE);
1567 }
1568
1569 /* Common tracer setup/teardown functions. */
1570 void cont_handler(int num)
1571 { }
1572 pid_t setup_trace_fixture(struct __test_metadata *_metadata,
1573                           tracer_func_t func, void *args, bool ptrace_syscall)
1574 {
1575         char sync;
1576         int pipefd[2];
1577         pid_t tracer_pid;
1578         pid_t tracee = getpid();
1579
1580         /* Setup a pipe for clean synchronization. */
1581         ASSERT_EQ(0, pipe(pipefd));
1582
1583         /* Fork a child which we'll promote to tracer */
1584         tracer_pid = fork();
1585         ASSERT_LE(0, tracer_pid);
1586         signal(SIGALRM, cont_handler);
1587         if (tracer_pid == 0) {
1588                 close(pipefd[0]);
1589                 start_tracer(_metadata, pipefd[1], tracee, func, args,
1590                              ptrace_syscall);
1591                 syscall(__NR_exit, 0);
1592         }
1593         close(pipefd[1]);
1594         prctl(PR_SET_PTRACER, tracer_pid, 0, 0, 0);
1595         read(pipefd[0], &sync, 1);
1596         close(pipefd[0]);
1597
1598         return tracer_pid;
1599 }
1600
1601 void teardown_trace_fixture(struct __test_metadata *_metadata,
1602                             pid_t tracer)
1603 {
1604         if (tracer) {
1605                 int status;
1606                 /*
1607                  * Extract the exit code from the other process and
1608                  * adopt it for ourselves in case its asserts failed.
1609                  */
1610                 ASSERT_EQ(0, kill(tracer, SIGUSR1));
1611                 ASSERT_EQ(tracer, waitpid(tracer, &status, 0));
1612                 if (WEXITSTATUS(status))
1613                         _metadata->passed = 0;
1614         }
1615 }
1616
1617 /* "poke" tracer arguments and function. */
1618 struct tracer_args_poke_t {
1619         unsigned long poke_addr;
1620 };
1621
1622 void tracer_poke(struct __test_metadata *_metadata, pid_t tracee, int status,
1623                  void *args)
1624 {
1625         int ret;
1626         unsigned long msg;
1627         struct tracer_args_poke_t *info = (struct tracer_args_poke_t *)args;
1628
1629         ret = ptrace(PTRACE_GETEVENTMSG, tracee, NULL, &msg);
1630         EXPECT_EQ(0, ret);
1631         /* If this fails, don't try to recover. */
1632         ASSERT_EQ(0x1001, msg) {
1633                 kill(tracee, SIGKILL);
1634         }
1635         /*
1636          * Poke in the message.
1637          * Registers are not touched to try to keep this relatively arch
1638          * agnostic.
1639          */
1640         ret = ptrace(PTRACE_POKEDATA, tracee, info->poke_addr, 0x1001);
1641         EXPECT_EQ(0, ret);
1642 }
1643
1644 FIXTURE(TRACE_poke) {
1645         struct sock_fprog prog;
1646         pid_t tracer;
1647         long poked;
1648         struct tracer_args_poke_t tracer_args;
1649 };
1650
1651 FIXTURE_SETUP(TRACE_poke)
1652 {
1653         struct sock_filter filter[] = {
1654                 BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
1655                         offsetof(struct seccomp_data, nr)),
1656                 BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 0, 1),
1657                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE | 0x1001),
1658                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1659         };
1660
1661         self->poked = 0;
1662         memset(&self->prog, 0, sizeof(self->prog));
1663         self->prog.filter = malloc(sizeof(filter));
1664         ASSERT_NE(NULL, self->prog.filter);
1665         memcpy(self->prog.filter, filter, sizeof(filter));
1666         self->prog.len = (unsigned short)ARRAY_SIZE(filter);
1667
1668         /* Set up tracer args. */
1669         self->tracer_args.poke_addr = (unsigned long)&self->poked;
1670
1671         /* Launch tracer. */
1672         self->tracer = setup_trace_fixture(_metadata, tracer_poke,
1673                                            &self->tracer_args, false);
1674 }
1675
1676 FIXTURE_TEARDOWN(TRACE_poke)
1677 {
1678         teardown_trace_fixture(_metadata, self->tracer);
1679         if (self->prog.filter)
1680                 free(self->prog.filter);
1681 }
1682
1683 TEST_F(TRACE_poke, read_has_side_effects)
1684 {
1685         ssize_t ret;
1686
1687         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1688         ASSERT_EQ(0, ret);
1689
1690         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog, 0, 0);
1691         ASSERT_EQ(0, ret);
1692
1693         EXPECT_EQ(0, self->poked);
1694         ret = read(-1, NULL, 0);
1695         EXPECT_EQ(-1, ret);
1696         EXPECT_EQ(0x1001, self->poked);
1697 }
1698
1699 TEST_F(TRACE_poke, getpid_runs_normally)
1700 {
1701         long ret;
1702
1703         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1704         ASSERT_EQ(0, ret);
1705
1706         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog, 0, 0);
1707         ASSERT_EQ(0, ret);
1708
1709         EXPECT_EQ(0, self->poked);
1710         EXPECT_NE(0, syscall(__NR_getpid));
1711         EXPECT_EQ(0, self->poked);
1712 }
1713
1714 #if defined(__x86_64__)
1715 # define ARCH_REGS              struct user_regs_struct
1716 # define SYSCALL_NUM(_regs)     (_regs).orig_rax
1717 # define SYSCALL_RET(_regs)     (_regs).rax
1718 #elif defined(__i386__)
1719 # define ARCH_REGS              struct user_regs_struct
1720 # define SYSCALL_NUM(_regs)     (_regs).orig_eax
1721 # define SYSCALL_RET(_regs)     (_regs).eax
1722 #elif defined(__arm__)
1723 # define ARCH_REGS              struct pt_regs
1724 # define SYSCALL_NUM(_regs)     (_regs).ARM_r7
1725 # ifndef PTRACE_SET_SYSCALL
1726 #  define PTRACE_SET_SYSCALL   23
1727 # endif
1728 # define SYSCALL_NUM_SET(_regs, _nr)    \
1729                 EXPECT_EQ(0, ptrace(PTRACE_SET_SYSCALL, tracee, NULL, _nr))
1730 # define SYSCALL_RET(_regs)     (_regs).ARM_r0
1731 #elif defined(__aarch64__)
1732 # define ARCH_REGS              struct user_pt_regs
1733 # define SYSCALL_NUM(_regs)     (_regs).regs[8]
1734 # ifndef NT_ARM_SYSTEM_CALL
1735 #  define NT_ARM_SYSTEM_CALL 0x404
1736 # endif
1737 # define SYSCALL_NUM_SET(_regs, _nr)                            \
1738         do {                                                    \
1739                 struct iovec __v;                               \
1740                 typeof(_nr) __nr = (_nr);                       \
1741                 __v.iov_base = &__nr;                           \
1742                 __v.iov_len = sizeof(__nr);                     \
1743                 EXPECT_EQ(0, ptrace(PTRACE_SETREGSET, tracee,   \
1744                                     NT_ARM_SYSTEM_CALL, &__v)); \
1745         } while (0)
1746 # define SYSCALL_RET(_regs)     (_regs).regs[0]
1747 #elif defined(__riscv) && __riscv_xlen == 64
1748 # define ARCH_REGS              struct user_regs_struct
1749 # define SYSCALL_NUM(_regs)     (_regs).a7
1750 # define SYSCALL_RET(_regs)     (_regs).a0
1751 #elif defined(__csky__)
1752 # define ARCH_REGS              struct pt_regs
1753 #  if defined(__CSKYABIV2__)
1754 #   define SYSCALL_NUM(_regs)   (_regs).regs[3]
1755 #  else
1756 #   define SYSCALL_NUM(_regs)   (_regs).regs[9]
1757 #  endif
1758 # define SYSCALL_RET(_regs)     (_regs).a0
1759 #elif defined(__hppa__)
1760 # define ARCH_REGS              struct user_regs_struct
1761 # define SYSCALL_NUM(_regs)     (_regs).gr[20]
1762 # define SYSCALL_RET(_regs)     (_regs).gr[28]
1763 #elif defined(__powerpc__)
1764 # define ARCH_REGS              struct pt_regs
1765 # define SYSCALL_NUM(_regs)     (_regs).gpr[0]
1766 # define SYSCALL_RET(_regs)     (_regs).gpr[3]
1767 # define SYSCALL_RET_SET(_regs, _val)                           \
1768         do {                                                    \
1769                 typeof(_val) _result = (_val);                  \
1770                 if ((_regs.trap & 0xfff0) == 0x3000) {          \
1771                         /*                                      \
1772                          * scv 0 system call uses -ve result    \
1773                          * for error, so no need to adjust.     \
1774                          */                                     \
1775                         SYSCALL_RET(_regs) = _result;           \
1776                 } else {                                        \
1777                         /*                                      \
1778                          * A syscall error is signaled by the   \
1779                          * CR0 SO bit and the code is stored as \
1780                          * a positive value.                    \
1781                          */                                     \
1782                         if (_result < 0) {                      \
1783                                 SYSCALL_RET(_regs) = -_result;  \
1784                                 (_regs).ccr |= 0x10000000;      \
1785                         } else {                                \
1786                                 SYSCALL_RET(_regs) = _result;   \
1787                                 (_regs).ccr &= ~0x10000000;     \
1788                         }                                       \
1789                 }                                               \
1790         } while (0)
1791 # define SYSCALL_RET_SET_ON_PTRACE_EXIT
1792 #elif defined(__s390__)
1793 # define ARCH_REGS              s390_regs
1794 # define SYSCALL_NUM(_regs)     (_regs).gprs[2]
1795 # define SYSCALL_RET_SET(_regs, _val)                   \
1796                 TH_LOG("Can't modify syscall return on this architecture")
1797 #elif defined(__mips__)
1798 # include <asm/unistd_nr_n32.h>
1799 # include <asm/unistd_nr_n64.h>
1800 # include <asm/unistd_nr_o32.h>
1801 # define ARCH_REGS              struct pt_regs
1802 # define SYSCALL_NUM(_regs)                             \
1803         ({                                              \
1804                 typeof((_regs).regs[2]) _nr;            \
1805                 if ((_regs).regs[2] == __NR_O32_Linux)  \
1806                         _nr = (_regs).regs[4];          \
1807                 else                                    \
1808                         _nr = (_regs).regs[2];          \
1809                 _nr;                                    \
1810         })
1811 # define SYSCALL_NUM_SET(_regs, _nr)                    \
1812         do {                                            \
1813                 if ((_regs).regs[2] == __NR_O32_Linux)  \
1814                         (_regs).regs[4] = _nr;          \
1815                 else                                    \
1816                         (_regs).regs[2] = _nr;          \
1817         } while (0)
1818 # define SYSCALL_RET_SET(_regs, _val)                   \
1819                 TH_LOG("Can't modify syscall return on this architecture")
1820 #elif defined(__xtensa__)
1821 # define ARCH_REGS              struct user_pt_regs
1822 # define SYSCALL_NUM(_regs)     (_regs).syscall
1823 /*
1824  * On xtensa syscall return value is in the register
1825  * a2 of the current window which is not fixed.
1826  */
1827 #define SYSCALL_RET(_regs)      (_regs).a[(_regs).windowbase * 4 + 2]
1828 #elif defined(__sh__)
1829 # define ARCH_REGS              struct pt_regs
1830 # define SYSCALL_NUM(_regs)     (_regs).regs[3]
1831 # define SYSCALL_RET(_regs)     (_regs).regs[0]
1832 #else
1833 # error "Do not know how to find your architecture's registers and syscalls"
1834 #endif
1835
/*
 * Fallback syscall-number setter. Unless an architecture supplied its
 * own SYSCALL_NUM_SET() above, rewriting the lvalue produced by
 * SYSCALL_NUM() is all that is required.
 */
#if !defined(SYSCALL_NUM_SET)
# define SYSCALL_NUM_SET(_regs, _nr)	\
	do {				\
		SYSCALL_NUM(_regs) = (_nr);	\
	} while (0)
#endif /* !SYSCALL_NUM_SET */
1846 /*
1847  * Most architectures can change the syscall return value by just
1848  * writing to the SYSCALL_RET register. This is the default if not
1849  * defined above. If an architecture cannot set the return value
1850  * (for example when the syscall and return value register is
1851  * shared), report it with TH_LOG() in an arch-specific definition
1852  * of SYSCALL_RET_SET() above, and leave SYSCALL_RET undefined.
1853  */
1854 #if !defined(SYSCALL_RET) && !defined(SYSCALL_RET_SET)
1855 # error "One of SYSCALL_RET or SYSCALL_RET_SET is needed for this arch"
1856 #endif
1857 #ifndef SYSCALL_RET_SET
1858 # define SYSCALL_RET_SET(_regs, _val)           \
1859         do {                                    \
1860                 SYSCALL_RET(_regs) = (_val);    \
1861         } while (0)
1862 #endif
1863
/*
 * EXPECT_SYSCALL_RETURN(val, action) - check the result of a syscall whose
 * return value the tracer was asked to rewrite to @val.
 *
 * When SYSCALL_RET is not defined the return value cannot be changed, so
 * the check is stubbed out to only expect @action to yield -1.
 *
 * Otherwise errno is cleared first; a negative @val means @action must
 * return -1 with errno == -@val, and a non-negative @val must be returned
 * as-is. @val is parenthesized where it is used as an operand so that
 * compound expressions are evaluated as a unit.
 */
#ifndef SYSCALL_RET
# define EXPECT_SYSCALL_RETURN(val, action)	EXPECT_EQ(-1, action)
#else
# define EXPECT_SYSCALL_RETURN(val, action)		\
	do {						\
		errno = 0;				\
		if ((val) < 0) {			\
			EXPECT_EQ(-1, action);		\
			EXPECT_EQ(-(val), errno);	\
		} else {				\
			EXPECT_EQ(val, action);		\
		}					\
	} while (0)
#endif
1879
/*
 * Select the PTRACE_SYSCALL stop (true = entry, false = exit) at which
 * each rewrite is applied. The syscall number is always rewritten on
 * entry. The return value is rewritten on entry too, unless the
 * architecture (e.g. powerpc) defined SYSCALL_RET_SET_ON_PTRACE_EXIT,
 * in which case it can only be set on syscall exit.
 */
const bool ptrace_entry_set_syscall_nr = true;
#ifdef SYSCALL_RET_SET_ON_PTRACE_EXIT
const bool ptrace_entry_set_syscall_ret = false;
#else
const bool ptrace_entry_set_syscall_ret = true;
#endif
1891
/*
 * Register access helpers. Both macros expect a variable named "tracee"
 * to be in scope at the point of use and evaluate to the ptrace() return
 * value.
 *
 * PTRACE_GETREGS/PTRACE_SETREGS are used where available, which helps
 * architectures without HAVE_ARCH_TRACEHOOK (e.g. User-mode Linux).
 * Everything else goes through the NT_PRSTATUS regset with an iovec
 * describing the caller's register buffer.
 */
#if defined(__x86_64__) || defined(__i386__) || defined(__mips__)
# define ARCH_GETREGS(_regs)	ptrace(PTRACE_GETREGS, tracee, 0, &(_regs))
# define ARCH_SETREGS(_regs)	ptrace(PTRACE_SETREGS, tracee, 0, &(_regs))
#else
# define ARCH_GETREGS(_regs)	({					\
		struct iovec __iov = {					\
			.iov_base = &(_regs),				\
			.iov_len = sizeof(_regs),			\
		};							\
		ptrace(PTRACE_GETREGSET, tracee, NT_PRSTATUS, &__iov);	\
	})
# define ARCH_SETREGS(_regs)	({					\
		struct iovec __iov = {					\
			.iov_base = &(_regs),				\
			.iov_len = sizeof(_regs),			\
		};							\
		ptrace(PTRACE_SETREGSET, tracee, NT_PRSTATUS, &__iov);	\
	})
#endif
1913
1914 /* Architecture-specific syscall fetching routine. */
1915 int get_syscall(struct __test_metadata *_metadata, pid_t tracee)
1916 {
1917         ARCH_REGS regs;
1918
1919         EXPECT_EQ(0, ARCH_GETREGS(regs)) {
1920                 return -1;
1921         }
1922
1923         return SYSCALL_NUM(regs);
1924 }
1925
1926 /* Architecture-specific syscall changing routine. */
1927 void __change_syscall(struct __test_metadata *_metadata,
1928                     pid_t tracee, long *syscall, long *ret)
1929 {
1930         ARCH_REGS orig, regs;
1931
1932         /* Do not get/set registers if we have nothing to do. */
1933         if (!syscall && !ret)
1934                 return;
1935
1936         EXPECT_EQ(0, ARCH_GETREGS(regs)) {
1937                 return;
1938         }
1939         orig = regs;
1940
1941         if (syscall)
1942                 SYSCALL_NUM_SET(regs, *syscall);
1943
1944         if (ret)
1945                 SYSCALL_RET_SET(regs, *ret);
1946
1947         /* Flush any register changes made. */
1948         if (memcmp(&orig, &regs, sizeof(orig)) != 0)
1949                 EXPECT_EQ(0, ARCH_SETREGS(regs));
1950 }
1951
1952 /* Change only syscall number. */
1953 void change_syscall_nr(struct __test_metadata *_metadata,
1954                        pid_t tracee, long syscall)
1955 {
1956         __change_syscall(_metadata, tracee, &syscall, NULL);
1957 }
1958
/*
 * Force the syscall return value to @ret. The syscall number is set to
 * -1 in the same register update.
 */
void change_syscall_ret(struct __test_metadata *_metadata,
			pid_t tracee, long ret)
{
	long skipped_nr = -1;

	__change_syscall(_metadata, tracee, &skipped_nr, &ret);
}
1967
1968 void tracer_seccomp(struct __test_metadata *_metadata, pid_t tracee,
1969                     int status, void *args)
1970 {
1971         int ret;
1972         unsigned long msg;
1973
1974         EXPECT_EQ(PTRACE_EVENT_MASK(status), PTRACE_EVENT_SECCOMP) {
1975                 TH_LOG("Unexpected ptrace event: %d", PTRACE_EVENT_MASK(status));
1976                 return;
1977         }
1978
1979         /* Make sure we got the right message. */
1980         ret = ptrace(PTRACE_GETEVENTMSG, tracee, NULL, &msg);
1981         EXPECT_EQ(0, ret);
1982
1983         /* Validate and take action on expected syscalls. */
1984         switch (msg) {
1985         case 0x1002:
1986                 /* change getpid to getppid. */
1987                 EXPECT_EQ(__NR_getpid, get_syscall(_metadata, tracee));
1988                 change_syscall_nr(_metadata, tracee, __NR_getppid);
1989                 break;
1990         case 0x1003:
1991                 /* skip gettid with valid return code. */
1992                 EXPECT_EQ(__NR_gettid, get_syscall(_metadata, tracee));
1993                 change_syscall_ret(_metadata, tracee, 45000);
1994                 break;
1995         case 0x1004:
1996                 /* skip openat with error. */
1997                 EXPECT_EQ(__NR_openat, get_syscall(_metadata, tracee));
1998                 change_syscall_ret(_metadata, tracee, -ESRCH);
1999                 break;
2000         case 0x1005:
2001                 /* do nothing (allow getppid) */
2002                 EXPECT_EQ(__NR_getppid, get_syscall(_metadata, tracee));
2003                 break;
2004         default:
2005                 EXPECT_EQ(0, msg) {
2006                         TH_LOG("Unknown PTRACE_GETEVENTMSG: 0x%lx", msg);
2007                         kill(tracee, SIGKILL);
2008                 }
2009         }
2010
2011 }
2012
2013 FIXTURE(TRACE_syscall) {
2014         struct sock_fprog prog;
2015         pid_t tracer, mytid, mypid, parent;
2016         long syscall_nr;
2017 };
2018
2019 void tracer_ptrace(struct __test_metadata *_metadata, pid_t tracee,
2020                    int status, void *args)
2021 {
2022         int ret;
2023         unsigned long msg;
2024         static bool entry;
2025         long syscall_nr_val, syscall_ret_val;
2026         long *syscall_nr = NULL, *syscall_ret = NULL;
2027         FIXTURE_DATA(TRACE_syscall) *self = args;
2028
2029         EXPECT_EQ(WSTOPSIG(status) & 0x80, 0x80) {
2030                 TH_LOG("Unexpected WSTOPSIG: %d", WSTOPSIG(status));
2031                 return;
2032         }
2033
2034         /*
2035          * The traditional way to tell PTRACE_SYSCALL entry/exit
2036          * is by counting.
2037          */
2038         entry = !entry;
2039
2040         /* Make sure we got an appropriate message. */
2041         ret = ptrace(PTRACE_GETEVENTMSG, tracee, NULL, &msg);
2042         EXPECT_EQ(0, ret);
2043         EXPECT_EQ(entry ? PTRACE_EVENTMSG_SYSCALL_ENTRY
2044                         : PTRACE_EVENTMSG_SYSCALL_EXIT, msg);
2045
2046         /*
2047          * Some architectures only support setting return values during
2048          * syscall exit under ptrace, and on exit the syscall number may
2049          * no longer be available. Therefore, save the initial sycall
2050          * number here, so it can be examined during both entry and exit
2051          * phases.
2052          */
2053         if (entry)
2054                 self->syscall_nr = get_syscall(_metadata, tracee);
2055
2056         /*
2057          * Depending on the architecture's syscall setting abilities, we
2058          * pick which things to set during this phase (entry or exit).
2059          */
2060         if (entry == ptrace_entry_set_syscall_nr)
2061                 syscall_nr = &syscall_nr_val;
2062         if (entry == ptrace_entry_set_syscall_ret)
2063                 syscall_ret = &syscall_ret_val;
2064
2065         /* Now handle the actual rewriting cases. */
2066         switch (self->syscall_nr) {
2067         case __NR_getpid:
2068                 syscall_nr_val = __NR_getppid;
2069                 /* Never change syscall return for this case. */
2070                 syscall_ret = NULL;
2071                 break;
2072         case __NR_gettid:
2073                 syscall_nr_val = -1;
2074                 syscall_ret_val = 45000;
2075                 break;
2076         case __NR_openat:
2077                 syscall_nr_val = -1;
2078                 syscall_ret_val = -ESRCH;
2079                 break;
2080         default:
2081                 /* Unhandled, do nothing. */
2082                 return;
2083         }
2084
2085         __change_syscall(_metadata, tracee, syscall_nr, syscall_ret);
2086 }
2087
2088 FIXTURE_VARIANT(TRACE_syscall) {
2089         /*
2090          * All of the SECCOMP_RET_TRACE behaviors can be tested with either
2091          * SECCOMP_RET_TRACE+PTRACE_CONT or plain ptrace()+PTRACE_SYSCALL.
2092          * This indicates if we should use SECCOMP_RET_TRACE (false), or
2093          * ptrace (true).
2094          */
2095         bool use_ptrace;
2096 };
2097
2098 FIXTURE_VARIANT_ADD(TRACE_syscall, ptrace) {
2099         .use_ptrace = true,
2100 };
2101
2102 FIXTURE_VARIANT_ADD(TRACE_syscall, seccomp) {
2103         .use_ptrace = false,
2104 };
2105
2106 FIXTURE_SETUP(TRACE_syscall)
2107 {
2108         struct sock_filter filter[] = {
2109                 BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
2110                         offsetof(struct seccomp_data, nr)),
2111                 BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 0, 1),
2112                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE | 0x1002),
2113                 BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_gettid, 0, 1),
2114                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE | 0x1003),
2115                 BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_openat, 0, 1),
2116                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE | 0x1004),
2117                 BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getppid, 0, 1),
2118                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE | 0x1005),
2119                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2120         };
2121         struct sock_fprog prog = {
2122                 .len = (unsigned short)ARRAY_SIZE(filter),
2123                 .filter = filter,
2124         };
2125         long ret;
2126
2127         /* Prepare some testable syscall results. */
2128         self->mytid = syscall(__NR_gettid);
2129         ASSERT_GT(self->mytid, 0);
2130         ASSERT_NE(self->mytid, 1) {
2131                 TH_LOG("Running this test as init is not supported. :)");
2132         }
2133
2134         self->mypid = getpid();
2135         ASSERT_GT(self->mypid, 0);
2136         ASSERT_EQ(self->mytid, self->mypid);
2137
2138         self->parent = getppid();
2139         ASSERT_GT(self->parent, 0);
2140         ASSERT_NE(self->parent, self->mypid);
2141
2142         /* Launch tracer. */
2143         self->tracer = setup_trace_fixture(_metadata,
2144                                            variant->use_ptrace ? tracer_ptrace
2145                                                                : tracer_seccomp,
2146                                            self, variant->use_ptrace);
2147
2148         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
2149         ASSERT_EQ(0, ret);
2150
2151         /* Do not install seccomp rewrite filters, as we'll use ptrace instead. */
2152         if (variant->use_ptrace)
2153                 return;
2154
2155         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
2156         ASSERT_EQ(0, ret);
2157 }
2158
2159 FIXTURE_TEARDOWN(TRACE_syscall)
2160 {
2161         teardown_trace_fixture(_metadata, self->tracer);
2162 }
2163
/* Negative syscall numbers must fail with ENOSYS. */
TEST(negative_ENOSYS)
{
	/*
	 * Userspace asking for syscall "-1" must look the same as an
	 * "internal" skip: -1 with errno set to ENOSYS.
	 */
	errno = 0;
	EXPECT_EQ(-1, syscall(-1));
	EXPECT_EQ(errno, ENOSYS);

	/* Same result for a number that is invalid but not -1. */
	errno = 0;
	EXPECT_EQ(-1, syscall(-101));
	EXPECT_EQ(errno, ENOSYS);
}
2178
2179 TEST_F(TRACE_syscall, negative_ENOSYS)
2180 {
2181         negative_ENOSYS(_metadata);
2182 }
2183
2184 TEST_F(TRACE_syscall, syscall_allowed)
2185 {
2186         /* getppid works as expected (no changes). */
2187         EXPECT_EQ(self->parent, syscall(__NR_getppid));
2188         EXPECT_NE(self->mypid, syscall(__NR_getppid));
2189 }
2190
2191 TEST_F(TRACE_syscall, syscall_redirected)
2192 {
2193         /* getpid has been redirected to getppid as expected. */
2194         EXPECT_EQ(self->parent, syscall(__NR_getpid));
2195         EXPECT_NE(self->mypid, syscall(__NR_getpid));
2196 }
2197
/* The tracer skips openat and makes it fail with ESRCH. */
TEST_F(TRACE_syscall, syscall_errno)
{
	EXPECT_SYSCALL_RETURN(-ESRCH, syscall(__NR_openat));
}
2203
/* The tracer skips gettid and substitutes the return value 45000. */
TEST_F(TRACE_syscall, syscall_faked)
{
	EXPECT_SYSCALL_RETURN(45000, syscall(__NR_gettid));
}
2209
2210 TEST_F_SIGNAL(TRACE_syscall, kill_immediate, SIGSYS)
2211 {
2212         struct sock_filter filter[] = {
2213                 BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
2214                         offsetof(struct seccomp_data, nr)),
2215                 BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_mknodat, 0, 1),
2216                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL_THREAD),
2217                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2218         };
2219         struct sock_fprog prog = {
2220                 .len = (unsigned short)ARRAY_SIZE(filter),
2221                 .filter = filter,
2222         };
2223         long ret;
2224
2225         /* Install "kill on mknodat" filter. */
2226         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
2227         ASSERT_EQ(0, ret);
2228
2229         /* This should immediately die with SIGSYS, regardless of tracer. */
2230         EXPECT_EQ(-1, syscall(__NR_mknodat, -1, NULL, 0, 0));
2231 }
2232
2233 TEST_F(TRACE_syscall, skip_after)
2234 {
2235         struct sock_filter filter[] = {
2236                 BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
2237                         offsetof(struct seccomp_data, nr)),
2238                 BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getppid, 0, 1),
2239                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO | EPERM),
2240                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2241         };
2242         struct sock_fprog prog = {
2243                 .len = (unsigned short)ARRAY_SIZE(filter),
2244                 .filter = filter,
2245         };
2246         long ret;
2247
2248         /* Install additional "errno on getppid" filter. */
2249         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
2250         ASSERT_EQ(0, ret);
2251
2252         /* Tracer will redirect getpid to getppid, and we should see EPERM. */
2253         errno = 0;
2254         EXPECT_EQ(-1, syscall(__NR_getpid));
2255         EXPECT_EQ(EPERM, errno);
2256 }
2257
2258 TEST_F_SIGNAL(TRACE_syscall, kill_after, SIGSYS)
2259 {
2260         struct sock_filter filter[] = {
2261                 BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
2262                         offsetof(struct seccomp_data, nr)),
2263                 BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getppid, 0, 1),
2264                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
2265                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2266         };
2267         struct sock_fprog prog = {
2268                 .len = (unsigned short)ARRAY_SIZE(filter),
2269                 .filter = filter,
2270         };
2271         long ret;
2272
2273         /* Install additional "death on getppid" filter. */
2274         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
2275         ASSERT_EQ(0, ret);
2276
2277         /* Tracer will redirect getpid to getppid, and we should die. */
2278         EXPECT_NE(self->mypid, syscall(__NR_getpid));
2279 }
2280
2281 TEST(seccomp_syscall)
2282 {
2283         struct sock_filter filter[] = {
2284                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2285         };
2286         struct sock_fprog prog = {
2287                 .len = (unsigned short)ARRAY_SIZE(filter),
2288                 .filter = filter,
2289         };
2290         long ret;
2291
2292         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
2293         ASSERT_EQ(0, ret) {
2294                 TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2295         }
2296
2297         /* Reject insane operation. */
2298         ret = seccomp(-1, 0, &prog);
2299         ASSERT_NE(ENOSYS, errno) {
2300                 TH_LOG("Kernel does not support seccomp syscall!");
2301         }
2302         EXPECT_EQ(EINVAL, errno) {
2303                 TH_LOG("Did not reject crazy op value!");
2304         }
2305
2306         /* Reject strict with flags or pointer. */
2307         ret = seccomp(SECCOMP_SET_MODE_STRICT, -1, NULL);
2308         EXPECT_EQ(EINVAL, errno) {
2309                 TH_LOG("Did not reject mode strict with flags!");
2310         }
2311         ret = seccomp(SECCOMP_SET_MODE_STRICT, 0, &prog);
2312         EXPECT_EQ(EINVAL, errno) {
2313                 TH_LOG("Did not reject mode strict with uargs!");
2314         }
2315
2316         /* Reject insane args for filter. */
2317         ret = seccomp(SECCOMP_SET_MODE_FILTER, -1, &prog);
2318         EXPECT_EQ(EINVAL, errno) {
2319                 TH_LOG("Did not reject crazy filter flags!");
2320         }
2321         ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, NULL);
2322         EXPECT_EQ(EFAULT, errno) {
2323                 TH_LOG("Did not reject NULL filter!");
2324         }
2325
2326         ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog);
2327         EXPECT_EQ(0, errno) {
2328                 TH_LOG("Kernel does not support SECCOMP_SET_MODE_FILTER: %s",
2329                         strerror(errno));
2330         }
2331 }
2332
2333 TEST(seccomp_syscall_mode_lock)
2334 {
2335         struct sock_filter filter[] = {
2336                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2337         };
2338         struct sock_fprog prog = {
2339                 .len = (unsigned short)ARRAY_SIZE(filter),
2340                 .filter = filter,
2341         };
2342         long ret;
2343
2344         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, NULL, 0, 0);
2345         ASSERT_EQ(0, ret) {
2346                 TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2347         }
2348
2349         ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog);
2350         ASSERT_NE(ENOSYS, errno) {
2351                 TH_LOG("Kernel does not support seccomp syscall!");
2352         }
2353         EXPECT_EQ(0, ret) {
2354                 TH_LOG("Could not install filter!");
2355         }
2356
2357         /* Make sure neither entry point will switch to strict. */
2358         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, 0, 0, 0);
2359         EXPECT_EQ(EINVAL, errno) {
2360                 TH_LOG("Switched to mode strict!");
2361         }
2362
2363         ret = seccomp(SECCOMP_SET_MODE_STRICT, 0, NULL);
2364         EXPECT_EQ(EINVAL, errno) {
2365                 TH_LOG("Switched to mode strict!");
2366         }
2367 }
2368
2369 /*
2370  * Test detection of known and unknown filter flags. Userspace needs to be able
2371  * to check if a filter flag is supported by the current kernel and a good way
2372  * of doing that is by attempting to enter filter mode, with the flag bit in
2373  * question set, and a NULL pointer for the _args_ parameter. EFAULT indicates
2374  * that the flag is valid and EINVAL indicates that the flag is invalid.
2375  */
2376 TEST(detect_seccomp_filter_flags)
2377 {
2378         unsigned int flags[] = { SECCOMP_FILTER_FLAG_TSYNC,
2379                                  SECCOMP_FILTER_FLAG_LOG,
2380                                  SECCOMP_FILTER_FLAG_SPEC_ALLOW,
2381                                  SECCOMP_FILTER_FLAG_NEW_LISTENER,
2382                                  SECCOMP_FILTER_FLAG_TSYNC_ESRCH };
2383         unsigned int exclusive[] = {
2384                                 SECCOMP_FILTER_FLAG_TSYNC,
2385                                 SECCOMP_FILTER_FLAG_NEW_LISTENER };
2386         unsigned int flag, all_flags, exclusive_mask;
2387         int i;
2388         long ret;
2389
2390         /* Test detection of individual known-good filter flags */
2391         for (i = 0, all_flags = 0; i < ARRAY_SIZE(flags); i++) {
2392                 int bits = 0;
2393
2394                 flag = flags[i];
2395                 /* Make sure the flag is a single bit! */
2396                 while (flag) {
2397                         if (flag & 0x1)
2398                                 bits ++;
2399                         flag >>= 1;
2400                 }
2401                 ASSERT_EQ(1, bits);
2402                 flag = flags[i];
2403
2404                 ret = seccomp(SECCOMP_SET_MODE_FILTER, flag, NULL);
2405                 ASSERT_NE(ENOSYS, errno) {
2406                         TH_LOG("Kernel does not support seccomp syscall!");
2407                 }
2408                 EXPECT_EQ(-1, ret);
2409                 EXPECT_EQ(EFAULT, errno) {
2410                         TH_LOG("Failed to detect that a known-good filter flag (0x%X) is supported!",
2411                                flag);
2412                 }
2413
2414                 all_flags |= flag;
2415         }
2416
2417         /*
2418          * Test detection of all known-good filter flags combined. But
2419          * for the exclusive flags we need to mask them out and try them
2420          * individually for the "all flags" testing.
2421          */
2422         exclusive_mask = 0;
2423         for (i = 0; i < ARRAY_SIZE(exclusive); i++)
2424                 exclusive_mask |= exclusive[i];
2425         for (i = 0; i < ARRAY_SIZE(exclusive); i++) {
2426                 flag = all_flags & ~exclusive_mask;
2427                 flag |= exclusive[i];
2428
2429                 ret = seccomp(SECCOMP_SET_MODE_FILTER, flag, NULL);
2430                 EXPECT_EQ(-1, ret);
2431                 EXPECT_EQ(EFAULT, errno) {
2432                         TH_LOG("Failed to detect that all known-good filter flags (0x%X) are supported!",
2433                                flag);
2434                 }
2435         }
2436
2437         /* Test detection of an unknown filter flags, without exclusives. */
2438         flag = -1;
2439         flag &= ~exclusive_mask;
2440         ret = seccomp(SECCOMP_SET_MODE_FILTER, flag, NULL);
2441         EXPECT_EQ(-1, ret);
2442         EXPECT_EQ(EINVAL, errno) {
2443                 TH_LOG("Failed to detect that an unknown filter flag (0x%X) is unsupported!",
2444                        flag);
2445         }
2446
2447         /*
2448          * Test detection of an unknown filter flag that may simply need to be
2449          * added to this test
2450          */
2451         flag = flags[ARRAY_SIZE(flags) - 1] << 1;
2452         ret = seccomp(SECCOMP_SET_MODE_FILTER, flag, NULL);
2453         EXPECT_EQ(-1, ret);
2454         EXPECT_EQ(EINVAL, errno) {
2455                 TH_LOG("Failed to detect that an unknown filter flag (0x%X) is unsupported! Does a new flag need to be added to this test?",
2456                        flag);
2457         }
2458 }
2459
2460 TEST(TSYNC_first)
2461 {
2462         struct sock_filter filter[] = {
2463                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2464         };
2465         struct sock_fprog prog = {
2466                 .len = (unsigned short)ARRAY_SIZE(filter),
2467                 .filter = filter,
2468         };
2469         long ret;
2470
2471         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, NULL, 0, 0);
2472         ASSERT_EQ(0, ret) {
2473                 TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2474         }
2475
2476         ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
2477                       &prog);
2478         ASSERT_NE(ENOSYS, errno) {
2479                 TH_LOG("Kernel does not support seccomp syscall!");
2480         }
2481         EXPECT_EQ(0, ret) {
2482                 TH_LOG("Could not install initial filter with TSYNC!");
2483         }
2484 }
2485
/* Number of sibling threads managed by the TSYNC fixture. */
#define TSYNC_SIBLINGS 2

/* Bookkeeping for one sibling thread of the TSYNC tests. */
struct tsync_sibling {
	pthread_t tid;			/* pthread handle; 0 means no thread to join/kill */
	pid_t system_tid;		/* gettid() value stored by the thread itself */
	sem_t *started;			/* posted by the thread once it is up */
	pthread_cond_t *cond;		/* condition the thread waits on */
	pthread_mutex_t *mutex;		/* mutex paired with cond */
	int diverge;			/* non-zero: thread installs prog itself */
	int num_waits;			/* condition wakeups to consume before continuing */
	struct sock_fprog *prog;	/* filter applied when diverge is set */
	struct __test_metadata *metadata;
};
2498
/*
 * PTHREAD_JOIN(tid, status) - join a thread and forget its handle.
 *
 * To avoid joining joined threads (which is not allowed by Bionic),
 * make sure we both successfully join and clear the tid to skip a
 * later join attempt during fixture teardown. Any remaining threads
 * will be directly killed during teardown.
 *
 * @tid must be a modifiable lvalue: it is zeroed on success and left
 * untouched (with the error logged) on failure. The handle is printed
 * as unsigned long so it is not truncated, and the message carries no
 * trailing newline, matching every other TH_LOG() call in this file.
 */
#define PTHREAD_JOIN(tid, status)					\
	do {								\
		int _rc = pthread_join((tid), (status));		\
		if (_rc) {						\
			TH_LOG("pthread_join of tid %lu failed: %d",	\
				(unsigned long)(tid), _rc);		\
		} else {						\
			(tid) = 0;					\
		}							\
	} while (0)
2515
2516 FIXTURE(TSYNC) {
2517         struct sock_fprog root_prog, apply_prog;
2518         struct tsync_sibling sibling[TSYNC_SIBLINGS];
2519         sem_t started;
2520         pthread_cond_t cond;
2521         pthread_mutex_t mutex;
2522         int sibling_count;
2523 };
2524
2525 FIXTURE_SETUP(TSYNC)
2526 {
2527         struct sock_filter root_filter[] = {
2528                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2529         };
2530         struct sock_filter apply_filter[] = {
2531                 BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
2532                         offsetof(struct seccomp_data, nr)),
2533                 BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 0, 1),
2534                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
2535                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2536         };
2537
2538         memset(&self->root_prog, 0, sizeof(self->root_prog));
2539         memset(&self->apply_prog, 0, sizeof(self->apply_prog));
2540         memset(&self->sibling, 0, sizeof(self->sibling));
2541         self->root_prog.filter = malloc(sizeof(root_filter));
2542         ASSERT_NE(NULL, self->root_prog.filter);
2543         memcpy(self->root_prog.filter, &root_filter, sizeof(root_filter));
2544         self->root_prog.len = (unsigned short)ARRAY_SIZE(root_filter);
2545
2546         self->apply_prog.filter = malloc(sizeof(apply_filter));
2547         ASSERT_NE(NULL, self->apply_prog.filter);
2548         memcpy(self->apply_prog.filter, &apply_filter, sizeof(apply_filter));
2549         self->apply_prog.len = (unsigned short)ARRAY_SIZE(apply_filter);
2550
2551         self->sibling_count = 0;
2552         pthread_mutex_init(&self->mutex, NULL);
2553         pthread_cond_init(&self->cond, NULL);
2554         sem_init(&self->started, 0, 0);
2555         self->sibling[0].tid = 0;
2556         self->sibling[0].cond = &self->cond;
2557         self->sibling[0].started = &self->started;
2558         self->sibling[0].mutex = &self->mutex;
2559         self->sibling[0].diverge = 0;
2560         self->sibling[0].num_waits = 1;
2561         self->sibling[0].prog = &self->root_prog;
2562         self->sibling[0].metadata = _metadata;
2563         self->sibling[1].tid = 0;
2564         self->sibling[1].cond = &self->cond;
2565         self->sibling[1].started = &self->started;
2566         self->sibling[1].mutex = &self->mutex;
2567         self->sibling[1].diverge = 0;
2568         self->sibling[1].prog = &self->root_prog;
2569         self->sibling[1].num_waits = 1;
2570         self->sibling[1].metadata = _metadata;
2571 }
2572
2573 FIXTURE_TEARDOWN(TSYNC)
2574 {
2575         int sib = 0;
2576
2577         if (self->root_prog.filter)
2578                 free(self->root_prog.filter);
2579         if (self->apply_prog.filter)
2580                 free(self->apply_prog.filter);
2581
2582         for ( ; sib < self->sibling_count; ++sib) {
2583                 struct tsync_sibling *s = &self->sibling[sib];
2584
2585                 if (!s->tid)
2586                         continue;
2587                 /*
2588                  * If a thread is still running, it may be stuck, so hit
2589                  * it over the head really hard.
2590                  */
2591                 pthread_kill(s->tid, 9);
2592         }
2593         pthread_mutex_destroy(&self->mutex);
2594         pthread_cond_destroy(&self->cond);
2595         sem_destroy(&self->started);
2596 }
2597
2598 void *tsync_sibling(void *data)
2599 {
2600         long ret = 0;
2601         struct tsync_sibling *me = data;
2602
2603         me->system_tid = syscall(__NR_gettid);
2604
2605         pthread_mutex_lock(me->mutex);
2606         if (me->diverge) {
2607                 /* Just re-apply the root prog to fork the tree */
2608                 ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER,
2609                                 me->prog, 0, 0);
2610         }
2611         sem_post(me->started);
2612         /* Return outside of started so parent notices failures. */
2613         if (ret) {
2614                 pthread_mutex_unlock(me->mutex);
2615                 return (void *)SIBLING_EXIT_FAILURE;
2616         }
2617         do {
2618                 pthread_cond_wait(me->cond, me->mutex);
2619                 me->num_waits = me->num_waits - 1;
2620         } while (me->num_waits);
2621         pthread_mutex_unlock(me->mutex);
2622
2623         ret = prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0);
2624         if (!ret)
2625                 return (void *)SIBLING_EXIT_NEWPRIVS;
2626         read(0, NULL, 0);
2627         return (void *)SIBLING_EXIT_UNKILLED;
2628 }
2629
2630 void tsync_start_sibling(struct tsync_sibling *sibling)
2631 {
2632         pthread_create(&sibling->tid, NULL, tsync_sibling, (void *)sibling);
2633 }
2634
2635 TEST_F(TSYNC, siblings_fail_prctl)
2636 {
2637         long ret;
2638         void *status;
2639         struct sock_filter filter[] = {
2640                 BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
2641                         offsetof(struct seccomp_data, nr)),
2642                 BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_prctl, 0, 1),
2643                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO | EINVAL),
2644                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2645         };
2646         struct sock_fprog prog = {
2647                 .len = (unsigned short)ARRAY_SIZE(filter),
2648                 .filter = filter,
2649         };
2650
2651         ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
2652                 TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2653         }
2654
2655         /* Check prctl failure detection by requesting sib 0 diverge. */
2656         ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog);
2657         ASSERT_NE(ENOSYS, errno) {
2658                 TH_LOG("Kernel does not support seccomp syscall!");
2659         }
2660         ASSERT_EQ(0, ret) {
2661                 TH_LOG("setting filter failed");
2662         }
2663
2664         self->sibling[0].diverge = 1;
2665         tsync_start_sibling(&self->sibling[0]);
2666         tsync_start_sibling(&self->sibling[1]);
2667
2668         while (self->sibling_count < TSYNC_SIBLINGS) {
2669                 sem_wait(&self->started);
2670                 self->sibling_count++;
2671         }
2672
2673         /* Signal the threads to clean up*/
2674         pthread_mutex_lock(&self->mutex);
2675         ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
2676                 TH_LOG("cond broadcast non-zero");
2677         }
2678         pthread_mutex_unlock(&self->mutex);
2679
2680         /* Ensure diverging sibling failed to call prctl. */
2681         PTHREAD_JOIN(self->sibling[0].tid, &status);
2682         EXPECT_EQ(SIBLING_EXIT_FAILURE, (long)status);
2683         PTHREAD_JOIN(self->sibling[1].tid, &status);
2684         EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
2685 }
2686
2687 TEST_F(TSYNC, two_siblings_with_ancestor)
2688 {
2689         long ret;
2690         void *status;
2691
2692         ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
2693                 TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2694         }
2695
2696         ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &self->root_prog);
2697         ASSERT_NE(ENOSYS, errno) {
2698                 TH_LOG("Kernel does not support seccomp syscall!");
2699         }
2700         ASSERT_EQ(0, ret) {
2701                 TH_LOG("Kernel does not support SECCOMP_SET_MODE_FILTER!");
2702         }
2703         tsync_start_sibling(&self->sibling[0]);
2704         tsync_start_sibling(&self->sibling[1]);
2705
2706         while (self->sibling_count < TSYNC_SIBLINGS) {
2707                 sem_wait(&self->started);
2708                 self->sibling_count++;
2709         }
2710
2711         ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
2712                       &self->apply_prog);
2713         ASSERT_EQ(0, ret) {
2714                 TH_LOG("Could install filter on all threads!");
2715         }
2716         /* Tell the siblings to test the policy */
2717         pthread_mutex_lock(&self->mutex);
2718         ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
2719                 TH_LOG("cond broadcast non-zero");
2720         }
2721         pthread_mutex_unlock(&self->mutex);
2722         /* Ensure they are both killed and don't exit cleanly. */
2723         PTHREAD_JOIN(self->sibling[0].tid, &status);
2724         EXPECT_EQ(0x0, (long)status);
2725         PTHREAD_JOIN(self->sibling[1].tid, &status);
2726         EXPECT_EQ(0x0, (long)status);
2727 }
2728
2729 TEST_F(TSYNC, two_sibling_want_nnp)
2730 {
2731         void *status;
2732
2733         /* start siblings before any prctl() operations */
2734         tsync_start_sibling(&self->sibling[0]);
2735         tsync_start_sibling(&self->sibling[1]);
2736         while (self->sibling_count < TSYNC_SIBLINGS) {
2737                 sem_wait(&self->started);
2738                 self->sibling_count++;
2739         }
2740
2741         /* Tell the siblings to test no policy */
2742         pthread_mutex_lock(&self->mutex);
2743         ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
2744                 TH_LOG("cond broadcast non-zero");
2745         }
2746         pthread_mutex_unlock(&self->mutex);
2747
2748         /* Ensure they are both upset about lacking nnp. */
2749         PTHREAD_JOIN(self->sibling[0].tid, &status);
2750         EXPECT_EQ(SIBLING_EXIT_NEWPRIVS, (long)status);
2751         PTHREAD_JOIN(self->sibling[1].tid, &status);
2752         EXPECT_EQ(SIBLING_EXIT_NEWPRIVS, (long)status);
2753 }
2754
2755 TEST_F(TSYNC, two_siblings_with_no_filter)
2756 {
2757         long ret;
2758         void *status;
2759
2760         /* start siblings before any prctl() operations */
2761         tsync_start_sibling(&self->sibling[0]);
2762         tsync_start_sibling(&self->sibling[1]);
2763         while (self->sibling_count < TSYNC_SIBLINGS) {
2764                 sem_wait(&self->started);
2765                 self->sibling_count++;
2766         }
2767
2768         ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
2769                 TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2770         }
2771
2772         ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
2773                       &self->apply_prog);
2774         ASSERT_NE(ENOSYS, errno) {
2775                 TH_LOG("Kernel does not support seccomp syscall!");
2776         }
2777         ASSERT_EQ(0, ret) {
2778                 TH_LOG("Could install filter on all threads!");
2779         }
2780
2781         /* Tell the siblings to test the policy */
2782         pthread_mutex_lock(&self->mutex);
2783         ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
2784                 TH_LOG("cond broadcast non-zero");
2785         }
2786         pthread_mutex_unlock(&self->mutex);
2787
2788         /* Ensure they are both killed and don't exit cleanly. */
2789         PTHREAD_JOIN(self->sibling[0].tid, &status);
2790         EXPECT_EQ(0x0, (long)status);
2791         PTHREAD_JOIN(self->sibling[1].tid, &status);
2792         EXPECT_EQ(0x0, (long)status);
2793 }
2794
2795 TEST_F(TSYNC, two_siblings_with_one_divergence)
2796 {
2797         long ret;
2798         void *status;
2799
2800         ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
2801                 TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2802         }
2803
2804         ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &self->root_prog);
2805         ASSERT_NE(ENOSYS, errno) {
2806                 TH_LOG("Kernel does not support seccomp syscall!");
2807         }
2808         ASSERT_EQ(0, ret) {
2809                 TH_LOG("Kernel does not support SECCOMP_SET_MODE_FILTER!");
2810         }
2811         self->sibling[0].diverge = 1;
2812         tsync_start_sibling(&self->sibling[0]);
2813         tsync_start_sibling(&self->sibling[1]);
2814
2815         while (self->sibling_count < TSYNC_SIBLINGS) {
2816                 sem_wait(&self->started);
2817                 self->sibling_count++;
2818         }
2819
2820         ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
2821                       &self->apply_prog);
2822         ASSERT_EQ(self->sibling[0].system_tid, ret) {
2823                 TH_LOG("Did not fail on diverged sibling.");
2824         }
2825
2826         /* Wake the threads */
2827         pthread_mutex_lock(&self->mutex);
2828         ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
2829                 TH_LOG("cond broadcast non-zero");
2830         }
2831         pthread_mutex_unlock(&self->mutex);
2832
2833         /* Ensure they are both unkilled. */
2834         PTHREAD_JOIN(self->sibling[0].tid, &status);
2835         EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
2836         PTHREAD_JOIN(self->sibling[1].tid, &status);
2837         EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
2838 }
2839
2840 TEST_F(TSYNC, two_siblings_with_one_divergence_no_tid_in_err)
2841 {
2842         long ret, flags;
2843         void *status;
2844
2845         ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
2846                 TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2847         }
2848
2849         ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &self->root_prog);
2850         ASSERT_NE(ENOSYS, errno) {
2851                 TH_LOG("Kernel does not support seccomp syscall!");
2852         }
2853         ASSERT_EQ(0, ret) {
2854                 TH_LOG("Kernel does not support SECCOMP_SET_MODE_FILTER!");
2855         }
2856         self->sibling[0].diverge = 1;
2857         tsync_start_sibling(&self->sibling[0]);
2858         tsync_start_sibling(&self->sibling[1]);
2859
2860         while (self->sibling_count < TSYNC_SIBLINGS) {
2861                 sem_wait(&self->started);
2862                 self->sibling_count++;
2863         }
2864
2865         flags = SECCOMP_FILTER_FLAG_TSYNC | \
2866                 SECCOMP_FILTER_FLAG_TSYNC_ESRCH;
2867         ret = seccomp(SECCOMP_SET_MODE_FILTER, flags, &self->apply_prog);
2868         ASSERT_EQ(ESRCH, errno) {
2869                 TH_LOG("Did not return ESRCH for diverged sibling.");
2870         }
2871         ASSERT_EQ(-1, ret) {
2872                 TH_LOG("Did not fail on diverged sibling.");
2873         }
2874
2875         /* Wake the threads */
2876         pthread_mutex_lock(&self->mutex);
2877         ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
2878                 TH_LOG("cond broadcast non-zero");
2879         }
2880         pthread_mutex_unlock(&self->mutex);
2881
2882         /* Ensure they are both unkilled. */
2883         PTHREAD_JOIN(self->sibling[0].tid, &status);
2884         EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
2885         PTHREAD_JOIN(self->sibling[1].tid, &status);
2886         EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
2887 }
2888
2889 TEST_F(TSYNC, two_siblings_not_under_filter)
2890 {
2891         long ret, sib;
2892         void *status;
2893         struct timespec delay = { .tv_nsec = 100000000 };
2894
2895         ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
2896                 TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2897         }
2898
2899         /*
2900          * Sibling 0 will have its own seccomp policy
2901          * and Sibling 1 will not be under seccomp at
2902          * all. Sibling 1 will enter seccomp and 0
2903          * will cause failure.
2904          */
2905         self->sibling[0].diverge = 1;
2906         tsync_start_sibling(&self->sibling[0]);
2907         tsync_start_sibling(&self->sibling[1]);
2908
2909         while (self->sibling_count < TSYNC_SIBLINGS) {
2910                 sem_wait(&self->started);
2911                 self->sibling_count++;
2912         }
2913
2914         ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &self->root_prog);
2915         ASSERT_NE(ENOSYS, errno) {
2916                 TH_LOG("Kernel does not support seccomp syscall!");
2917         }
2918         ASSERT_EQ(0, ret) {
2919                 TH_LOG("Kernel does not support SECCOMP_SET_MODE_FILTER!");
2920         }
2921
2922         ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
2923                       &self->apply_prog);
2924         ASSERT_EQ(ret, self->sibling[0].system_tid) {
2925                 TH_LOG("Did not fail on diverged sibling.");
2926         }
2927         sib = 1;
2928         if (ret == self->sibling[0].system_tid)
2929                 sib = 0;
2930
2931         pthread_mutex_lock(&self->mutex);
2932
2933         /* Increment the other siblings num_waits so we can clean up
2934          * the one we just saw.
2935          */
2936         self->sibling[!sib].num_waits += 1;
2937
2938         /* Signal the thread to clean up*/
2939         ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
2940                 TH_LOG("cond broadcast non-zero");
2941         }
2942         pthread_mutex_unlock(&self->mutex);
2943         PTHREAD_JOIN(self->sibling[sib].tid, &status);
2944         EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
2945         /* Poll for actual task death. pthread_join doesn't guarantee it. */
2946         while (!kill(self->sibling[sib].system_tid, 0))
2947                 nanosleep(&delay, NULL);
2948         /* Switch to the remaining sibling */
2949         sib = !sib;
2950
2951         ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
2952                       &self->apply_prog);
2953         ASSERT_EQ(0, ret) {
2954                 TH_LOG("Expected the remaining sibling to sync");
2955         };
2956
2957         pthread_mutex_lock(&self->mutex);
2958
2959         /* If remaining sibling didn't have a chance to wake up during
2960          * the first broadcast, manually reduce the num_waits now.
2961          */
2962         if (self->sibling[sib].num_waits > 1)
2963                 self->sibling[sib].num_waits = 1;
2964         ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
2965                 TH_LOG("cond broadcast non-zero");
2966         }
2967         pthread_mutex_unlock(&self->mutex);
2968         PTHREAD_JOIN(self->sibling[sib].tid, &status);
2969         EXPECT_EQ(0, (long)status);
2970         /* Poll for actual task death. pthread_join doesn't guarantee it. */
2971         while (!kill(self->sibling[sib].system_tid, 0))
2972                 nanosleep(&delay, NULL);
2973
2974         ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
2975                       &self->apply_prog);
2976         ASSERT_EQ(0, ret);  /* just us chickens */
2977 }
2978
2979 /* Make sure restarted syscalls are seen directly as "restart_syscall". */
2980 TEST(syscall_restart)
2981 {
2982         long ret;
2983         unsigned long msg;
2984         pid_t child_pid;
2985         int pipefd[2];
2986         int status;
2987         siginfo_t info = { };
2988         struct sock_filter filter[] = {
2989                 BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
2990                          offsetof(struct seccomp_data, nr)),
2991
2992 #ifdef __NR_sigreturn
2993                 BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_sigreturn, 7, 0),
2994 #endif
2995                 BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 6, 0),
2996                 BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_exit, 5, 0),
2997                 BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_rt_sigreturn, 4, 0),
2998                 BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_nanosleep, 5, 0),
2999                 BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_clock_nanosleep, 4, 0),
3000                 BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_restart_syscall, 4, 0),
3001
3002                 /* Allow __NR_write for easy logging. */
3003                 BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_write, 0, 1),
3004                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
3005                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
3006                 /* The nanosleep jump target. */
3007                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE|0x100),
3008                 /* The restart_syscall jump target. */
3009                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE|0x200),
3010         };
3011         struct sock_fprog prog = {
3012                 .len = (unsigned short)ARRAY_SIZE(filter),
3013                 .filter = filter,
3014         };
3015 #if defined(__arm__)
3016         struct utsname utsbuf;
3017 #endif
3018
3019         ASSERT_EQ(0, pipe(pipefd));
3020
3021         child_pid = fork();
3022         ASSERT_LE(0, child_pid);
3023         if (child_pid == 0) {
3024                 /* Child uses EXPECT not ASSERT to deliver status correctly. */
3025                 char buf = ' ';
3026                 struct timespec timeout = { };
3027
3028                 /* Attach parent as tracer and stop. */
3029                 EXPECT_EQ(0, ptrace(PTRACE_TRACEME));
3030                 EXPECT_EQ(0, raise(SIGSTOP));
3031
3032                 EXPECT_EQ(0, close(pipefd[1]));
3033
3034                 EXPECT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
3035                         TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3036                 }
3037
3038                 ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
3039                 EXPECT_EQ(0, ret) {
3040                         TH_LOG("Failed to install filter!");
3041                 }
3042
3043                 EXPECT_EQ(1, read(pipefd[0], &buf, 1)) {
3044                         TH_LOG("Failed to read() sync from parent");
3045                 }
3046                 EXPECT_EQ('.', buf) {
3047                         TH_LOG("Failed to get sync data from read()");
3048                 }
3049
3050                 /* Start nanosleep to be interrupted. */
3051                 timeout.tv_sec = 1;
3052                 errno = 0;
3053                 EXPECT_EQ(0, nanosleep(&timeout, NULL)) {
3054                         TH_LOG("Call to nanosleep() failed (errno %d)", errno);
3055                 }
3056
3057                 /* Read final sync from parent. */
3058                 EXPECT_EQ(1, read(pipefd[0], &buf, 1)) {
3059                         TH_LOG("Failed final read() from parent");
3060                 }
3061                 EXPECT_EQ('!', buf) {
3062                         TH_LOG("Failed to get final data from read()");
3063                 }
3064
3065                 /* Directly report the status of our test harness results. */
3066                 syscall(__NR_exit, _metadata->passed ? EXIT_SUCCESS
3067                                                      : EXIT_FAILURE);
3068         }
3069         EXPECT_EQ(0, close(pipefd[0]));
3070
3071         /* Attach to child, setup options, and release. */
3072         ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
3073         ASSERT_EQ(true, WIFSTOPPED(status));
3074         ASSERT_EQ(0, ptrace(PTRACE_SETOPTIONS, child_pid, NULL,
3075                             PTRACE_O_TRACESECCOMP));
3076         ASSERT_EQ(0, ptrace(PTRACE_CONT, child_pid, NULL, 0));
3077         ASSERT_EQ(1, write(pipefd[1], ".", 1));
3078
3079         /* Wait for nanosleep() to start. */
3080         ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
3081         ASSERT_EQ(true, WIFSTOPPED(status));
3082         ASSERT_EQ(SIGTRAP, WSTOPSIG(status));
3083         ASSERT_EQ(PTRACE_EVENT_SECCOMP, (status >> 16));
3084         ASSERT_EQ(0, ptrace(PTRACE_GETEVENTMSG, child_pid, NULL, &msg));
3085         ASSERT_EQ(0x100, msg);
3086         ret = get_syscall(_metadata, child_pid);
3087         EXPECT_TRUE(ret == __NR_nanosleep || ret == __NR_clock_nanosleep);
3088
3089         /* Might as well check siginfo for sanity while we're here. */
3090         ASSERT_EQ(0, ptrace(PTRACE_GETSIGINFO, child_pid, NULL, &info));
3091         ASSERT_EQ(SIGTRAP, info.si_signo);
3092         ASSERT_EQ(SIGTRAP | (PTRACE_EVENT_SECCOMP << 8), info.si_code);
3093         EXPECT_EQ(0, info.si_errno);
3094         EXPECT_EQ(getuid(), info.si_uid);
3095         /* Verify signal delivery came from child (seccomp-triggered). */
3096         EXPECT_EQ(child_pid, info.si_pid);
3097
3098         /* Interrupt nanosleep with SIGSTOP (which we'll need to handle). */
3099         ASSERT_EQ(0, kill(child_pid, SIGSTOP));
3100         ASSERT_EQ(0, ptrace(PTRACE_CONT, child_pid, NULL, 0));
3101         ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
3102         ASSERT_EQ(true, WIFSTOPPED(status));
3103         ASSERT_EQ(SIGSTOP, WSTOPSIG(status));
3104         ASSERT_EQ(0, ptrace(PTRACE_GETSIGINFO, child_pid, NULL, &info));
3105         /*
3106          * There is no siginfo on SIGSTOP any more, so we can't verify
3107          * signal delivery came from parent now (getpid() == info.si_pid).
3108          * https://lkml.kernel.org/r/CAGXu5jJaZAOzP1qFz66tYrtbuywqb+UN2SOA1VLHpCCOiYvYeg@mail.gmail.com
3109          * At least verify the SIGSTOP via PTRACE_GETSIGINFO.
3110          */
3111         EXPECT_EQ(SIGSTOP, info.si_signo);
3112
3113         /* Restart nanosleep with SIGCONT, which triggers restart_syscall. */
3114         ASSERT_EQ(0, kill(child_pid, SIGCONT));
3115         ASSERT_EQ(0, ptrace(PTRACE_CONT, child_pid, NULL, 0));
3116         ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
3117         ASSERT_EQ(true, WIFSTOPPED(status));
3118         ASSERT_EQ(SIGCONT, WSTOPSIG(status));
3119         ASSERT_EQ(0, ptrace(PTRACE_CONT, child_pid, NULL, 0));
3120
3121         /* Wait for restart_syscall() to start. */
3122         ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
3123         ASSERT_EQ(true, WIFSTOPPED(status));
3124         ASSERT_EQ(SIGTRAP, WSTOPSIG(status));
3125         ASSERT_EQ(PTRACE_EVENT_SECCOMP, (status >> 16));
3126         ASSERT_EQ(0, ptrace(PTRACE_GETEVENTMSG, child_pid, NULL, &msg));
3127
3128         ASSERT_EQ(0x200, msg);
3129         ret = get_syscall(_metadata, child_pid);
3130 #if defined(__arm__)
3131         /*
3132          * FIXME:
3133          * - native ARM registers do NOT expose true syscall.
3134          * - compat ARM registers on ARM64 DO expose true syscall.
3135          */
3136         ASSERT_EQ(0, uname(&utsbuf));
3137         if (strncmp(utsbuf.machine, "arm", 3) == 0) {
3138                 EXPECT_EQ(__NR_nanosleep, ret);
3139         } else
3140 #endif
3141         {
3142                 EXPECT_EQ(__NR_restart_syscall, ret);
3143         }
3144
3145         /* Write again to end test. */
3146         ASSERT_EQ(0, ptrace(PTRACE_CONT, child_pid, NULL, 0));
3147         ASSERT_EQ(1, write(pipefd[1], "!", 1));
3148         EXPECT_EQ(0, close(pipefd[1]));
3149
3150         ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
3151         if (WIFSIGNALED(status) || WEXITSTATUS(status))
3152                 _metadata->passed = 0;
3153 }
3154
3155 TEST_SIGNAL(filter_flag_log, SIGSYS)
3156 {
3157         struct sock_filter allow_filter[] = {
3158                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
3159         };
3160         struct sock_filter kill_filter[] = {
3161                 BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
3162                         offsetof(struct seccomp_data, nr)),
3163                 BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 0, 1),
3164                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
3165                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
3166         };
3167         struct sock_fprog allow_prog = {
3168                 .len = (unsigned short)ARRAY_SIZE(allow_filter),
3169                 .filter = allow_filter,
3170         };
3171         struct sock_fprog kill_prog = {
3172                 .len = (unsigned short)ARRAY_SIZE(kill_filter),
3173                 .filter = kill_filter,
3174         };
3175         long ret;
3176         pid_t parent = getppid();
3177
3178         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3179         ASSERT_EQ(0, ret);
3180
3181         /* Verify that the FILTER_FLAG_LOG flag isn't accepted in strict mode */
3182         ret = seccomp(SECCOMP_SET_MODE_STRICT, SECCOMP_FILTER_FLAG_LOG,
3183                       &allow_prog);
3184         ASSERT_NE(ENOSYS, errno) {
3185                 TH_LOG("Kernel does not support seccomp syscall!");
3186         }
3187         EXPECT_NE(0, ret) {
3188                 TH_LOG("Kernel accepted FILTER_FLAG_LOG flag in strict mode!");
3189         }
3190         EXPECT_EQ(EINVAL, errno) {
3191                 TH_LOG("Kernel returned unexpected errno for FILTER_FLAG_LOG flag in strict mode!");
3192         }
3193
3194         /* Verify that a simple, permissive filter can be added with no flags */
3195         ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &allow_prog);
3196         EXPECT_EQ(0, ret);
3197
3198         /* See if the same filter can be added with the FILTER_FLAG_LOG flag */
3199         ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_LOG,
3200                       &allow_prog);
3201         ASSERT_NE(EINVAL, errno) {
3202                 TH_LOG("Kernel does not support the FILTER_FLAG_LOG flag!");
3203         }
3204         EXPECT_EQ(0, ret);
3205
3206         /* Ensure that the kill filter works with the FILTER_FLAG_LOG flag */
3207         ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_LOG,
3208                       &kill_prog);
3209         EXPECT_EQ(0, ret);
3210
3211         EXPECT_EQ(parent, syscall(__NR_getppid));
3212         /* getpid() should never return. */
3213         EXPECT_EQ(0, syscall(__NR_getpid));
3214 }
3215
3216 TEST(get_action_avail)
3217 {
3218         __u32 actions[] = { SECCOMP_RET_KILL_THREAD, SECCOMP_RET_TRAP,
3219                             SECCOMP_RET_ERRNO, SECCOMP_RET_TRACE,
3220                             SECCOMP_RET_LOG,   SECCOMP_RET_ALLOW };
3221         __u32 unknown_action = 0x10000000U;
3222         int i;
3223         long ret;
3224
3225         ret = seccomp(SECCOMP_GET_ACTION_AVAIL, 0, &actions[0]);
3226         ASSERT_NE(ENOSYS, errno) {
3227                 TH_LOG("Kernel does not support seccomp syscall!");
3228         }
3229         ASSERT_NE(EINVAL, errno) {
3230                 TH_LOG("Kernel does not support SECCOMP_GET_ACTION_AVAIL operation!");
3231         }
3232         EXPECT_EQ(ret, 0);
3233
3234         for (i = 0; i < ARRAY_SIZE(actions); i++) {
3235                 ret = seccomp(SECCOMP_GET_ACTION_AVAIL, 0, &actions[i]);
3236                 EXPECT_EQ(ret, 0) {
3237                         TH_LOG("Expected action (0x%X) not available!",
3238                                actions[i]);
3239                 }
3240         }
3241
3242         /* Check that an unknown action is handled properly (EOPNOTSUPP) */
3243         ret = seccomp(SECCOMP_GET_ACTION_AVAIL, 0, &unknown_action);
3244         EXPECT_EQ(ret, -1);
3245         EXPECT_EQ(errno, EOPNOTSUPP);
3246 }
3247
3248 TEST(get_metadata)
3249 {
3250         pid_t pid;
3251         int pipefd[2];
3252         char buf;
3253         struct seccomp_metadata md;
3254         long ret;
3255
3256         /* Only real root can get metadata. */
3257         if (geteuid()) {
3258                 SKIP(return, "get_metadata requires real root");
3259                 return;
3260         }
3261
3262         ASSERT_EQ(0, pipe(pipefd));
3263
3264         pid = fork();
3265         ASSERT_GE(pid, 0);
3266         if (pid == 0) {
3267                 struct sock_filter filter[] = {
3268                         BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
3269                 };
3270                 struct sock_fprog prog = {
3271                         .len = (unsigned short)ARRAY_SIZE(filter),
3272                         .filter = filter,
3273                 };
3274
3275                 /* one with log, one without */
3276                 EXPECT_EQ(0, seccomp(SECCOMP_SET_MODE_FILTER,
3277                                      SECCOMP_FILTER_FLAG_LOG, &prog));
3278                 EXPECT_EQ(0, seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog));
3279
3280                 EXPECT_EQ(0, close(pipefd[0]));
3281                 ASSERT_EQ(1, write(pipefd[1], "1", 1));
3282                 ASSERT_EQ(0, close(pipefd[1]));
3283
3284                 while (1)
3285                         sleep(100);
3286         }
3287
3288         ASSERT_EQ(0, close(pipefd[1]));
3289         ASSERT_EQ(1, read(pipefd[0], &buf, 1));
3290
3291         ASSERT_EQ(0, ptrace(PTRACE_ATTACH, pid));
3292         ASSERT_EQ(pid, waitpid(pid, NULL, 0));
3293
3294         /* Past here must not use ASSERT or child process is never killed. */
3295
3296         md.filter_off = 0;
3297         errno = 0;
3298         ret = ptrace(PTRACE_SECCOMP_GET_METADATA, pid, sizeof(md), &md);
3299         EXPECT_EQ(sizeof(md), ret) {
3300                 if (errno == EINVAL)
3301                         SKIP(goto skip, "Kernel does not support PTRACE_SECCOMP_GET_METADATA (missing CONFIG_CHECKPOINT_RESTORE?)");
3302         }
3303
3304         EXPECT_EQ(md.flags, SECCOMP_FILTER_FLAG_LOG);
3305         EXPECT_EQ(md.filter_off, 0);
3306
3307         md.filter_off = 1;
3308         ret = ptrace(PTRACE_SECCOMP_GET_METADATA, pid, sizeof(md), &md);
3309         EXPECT_EQ(sizeof(md), ret);
3310         EXPECT_EQ(md.flags, 0);
3311         EXPECT_EQ(md.filter_off, 1);
3312
3313 skip:
3314         ASSERT_EQ(0, kill(pid, SIGKILL));
3315 }
3316
3317 static int user_notif_syscall(int nr, unsigned int flags)
3318 {
3319         struct sock_filter filter[] = {
3320                 BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
3321                         offsetof(struct seccomp_data, nr)),
3322                 BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, nr, 0, 1),
3323                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_USER_NOTIF),
3324                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
3325         };
3326
3327         struct sock_fprog prog = {
3328                 .len = (unsigned short)ARRAY_SIZE(filter),
3329                 .filter = filter,
3330         };
3331
3332         return seccomp(SECCOMP_SET_MODE_FILTER, flags, &prog);
3333 }
3334
/* Return value the test child expects from its intercepted getppid(). */
#define USER_NOTIF_MAGIC INT_MAX
3336 TEST(user_notification_basic)
3337 {
3338         pid_t pid;
3339         long ret;
3340         int status, listener;
3341         struct seccomp_notif req = {};
3342         struct seccomp_notif_resp resp = {};
3343         struct pollfd pollfd;
3344
3345         struct sock_filter filter[] = {
3346                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
3347         };
3348         struct sock_fprog prog = {
3349                 .len = (unsigned short)ARRAY_SIZE(filter),
3350                 .filter = filter,
3351         };
3352
3353         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3354         ASSERT_EQ(0, ret) {
3355                 TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3356         }
3357
3358         pid = fork();
3359         ASSERT_GE(pid, 0);
3360
3361         /* Check that we get -ENOSYS with no listener attached */
3362         if (pid == 0) {
3363                 if (user_notif_syscall(__NR_getppid, 0) < 0)
3364                         exit(1);
3365                 ret = syscall(__NR_getppid);
3366                 exit(ret >= 0 || errno != ENOSYS);
3367         }
3368
3369         EXPECT_EQ(waitpid(pid, &status, 0), pid);
3370         EXPECT_EQ(true, WIFEXITED(status));
3371         EXPECT_EQ(0, WEXITSTATUS(status));
3372
3373         /* Add some no-op filters for grins. */
3374         EXPECT_EQ(seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog), 0);
3375         EXPECT_EQ(seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog), 0);
3376         EXPECT_EQ(seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog), 0);
3377         EXPECT_EQ(seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog), 0);
3378
3379         /* Check that the basic notification machinery works */
3380         listener = user_notif_syscall(__NR_getppid,
3381                                       SECCOMP_FILTER_FLAG_NEW_LISTENER);
3382         ASSERT_GE(listener, 0);
3383
3384         /* Installing a second listener in the chain should EBUSY */
3385         EXPECT_EQ(user_notif_syscall(__NR_getppid,
3386                                      SECCOMP_FILTER_FLAG_NEW_LISTENER),
3387                   -1);
3388         EXPECT_EQ(errno, EBUSY);
3389
3390         pid = fork();
3391         ASSERT_GE(pid, 0);
3392
3393         if (pid == 0) {
3394                 ret = syscall(__NR_getppid);
3395                 exit(ret != USER_NOTIF_MAGIC);
3396         }
3397
3398         pollfd.fd = listener;
3399         pollfd.events = POLLIN | POLLOUT;
3400
3401         EXPECT_GT(poll(&pollfd, 1, -1), 0);
3402         EXPECT_EQ(pollfd.revents, POLLIN);
3403
3404         /* Test that we can't pass garbage to the kernel. */
3405         memset(&req, 0, sizeof(req));
3406         req.pid = -1;
3407         errno = 0;
3408         ret = ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req);
3409         EXPECT_EQ(-1, ret);
3410         EXPECT_EQ(EINVAL, errno);
3411
3412         if (ret) {
3413                 req.pid = 0;
3414                 EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
3415         }
3416
3417         pollfd.fd = listener;
3418         pollfd.events = POLLIN | POLLOUT;
3419
3420         EXPECT_GT(poll(&pollfd, 1, -1), 0);
3421         EXPECT_EQ(pollfd.revents, POLLOUT);
3422
3423         EXPECT_EQ(req.data.nr,  __NR_getppid);
3424
3425         resp.id = req.id;
3426         resp.error = 0;
3427         resp.val = USER_NOTIF_MAGIC;
3428
3429         /* check that we make sure flags == 0 */
3430         resp.flags = 1;
3431         EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), -1);
3432         EXPECT_EQ(errno, EINVAL);
3433
3434         resp.flags = 0;
3435         EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
3436
3437         EXPECT_EQ(waitpid(pid, &status, 0), pid);
3438         EXPECT_EQ(true, WIFEXITED(status));
3439         EXPECT_EQ(0, WEXITSTATUS(status));
3440 }
3441
3442 TEST(user_notification_with_tsync)
3443 {
3444         int ret;
3445         unsigned int flags;
3446
3447         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3448         ASSERT_EQ(0, ret) {
3449                 TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3450         }
3451
3452         /* these were exclusive */
3453         flags = SECCOMP_FILTER_FLAG_NEW_LISTENER |
3454                 SECCOMP_FILTER_FLAG_TSYNC;
3455         ASSERT_EQ(-1, user_notif_syscall(__NR_getppid, flags));
3456         ASSERT_EQ(EINVAL, errno);
3457
3458         /* but now they're not */
3459         flags |= SECCOMP_FILTER_FLAG_TSYNC_ESRCH;
3460         ret = user_notif_syscall(__NR_getppid, flags);
3461         close(ret);
3462         ASSERT_LE(0, ret);
3463 }
3464
3465 TEST(user_notification_kill_in_middle)
3466 {
3467         pid_t pid;
3468         long ret;
3469         int listener;
3470         struct seccomp_notif req = {};
3471         struct seccomp_notif_resp resp = {};
3472
3473         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3474         ASSERT_EQ(0, ret) {
3475                 TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3476         }
3477
3478         listener = user_notif_syscall(__NR_getppid,
3479                                       SECCOMP_FILTER_FLAG_NEW_LISTENER);
3480         ASSERT_GE(listener, 0);
3481
3482         /*
3483          * Check that nothing bad happens when we kill the task in the middle
3484          * of a syscall.
3485          */
3486         pid = fork();
3487         ASSERT_GE(pid, 0);
3488
3489         if (pid == 0) {
3490                 ret = syscall(__NR_getppid);
3491                 exit(ret != USER_NOTIF_MAGIC);
3492         }
3493
3494         EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
3495         EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ID_VALID, &req.id), 0);
3496
3497         EXPECT_EQ(kill(pid, SIGKILL), 0);
3498         EXPECT_EQ(waitpid(pid, NULL, 0), pid);
3499
3500         EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ID_VALID, &req.id), -1);
3501
3502         resp.id = req.id;
3503         ret = ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp);
3504         EXPECT_EQ(ret, -1);
3505         EXPECT_EQ(errno, ENOENT);
3506 }
3507
/* Descriptor the signal handler reports to; -1 until a test assigns one. */
static int handled = -1;

/*
 * Signal handler: write a single byte to the 'handled' descriptor so that
 * whoever reads the other end can tell the handler ran.
 */
static void signal_handler(int signal)
{
        if (write(handled, "c", 1) == 1)
                return;

        perror("write from signal");
}
3515
3516 TEST(user_notification_signal)
3517 {
3518         pid_t pid;
3519         long ret;
3520         int status, listener, sk_pair[2];
3521         struct seccomp_notif req = {};
3522         struct seccomp_notif_resp resp = {};
3523         char c;
3524
3525         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3526         ASSERT_EQ(0, ret) {
3527                 TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3528         }
3529
3530         ASSERT_EQ(socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair), 0);
3531
3532         listener = user_notif_syscall(__NR_gettid,
3533                                       SECCOMP_FILTER_FLAG_NEW_LISTENER);
3534         ASSERT_GE(listener, 0);
3535
3536         pid = fork();
3537         ASSERT_GE(pid, 0);
3538
3539         if (pid == 0) {
3540                 close(sk_pair[0]);
3541                 handled = sk_pair[1];
3542                 if (signal(SIGUSR1, signal_handler) == SIG_ERR) {
3543                         perror("signal");
3544                         exit(1);
3545                 }
3546                 /*
3547                  * ERESTARTSYS behavior is a bit hard to test, because we need
3548                  * to rely on a signal that has not yet been handled. Let's at
3549                  * least check that the error code gets propagated through, and
3550                  * hope that it doesn't break when there is actually a signal :)
3551                  */
3552                 ret = syscall(__NR_gettid);
3553                 exit(!(ret == -1 && errno == 512));
3554         }
3555
3556         close(sk_pair[1]);
3557
3558         memset(&req, 0, sizeof(req));
3559         EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
3560
3561         EXPECT_EQ(kill(pid, SIGUSR1), 0);
3562
3563         /*
3564          * Make sure the signal really is delivered, which means we're not
3565          * stuck in the user notification code any more and the notification
3566          * should be dead.
3567          */
3568         EXPECT_EQ(read(sk_pair[0], &c, 1), 1);
3569
3570         resp.id = req.id;
3571         resp.error = -EPERM;
3572         resp.val = 0;
3573
3574         EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), -1);
3575         EXPECT_EQ(errno, ENOENT);
3576
3577         memset(&req, 0, sizeof(req));
3578         EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
3579
3580         resp.id = req.id;
3581         resp.error = -512; /* -ERESTARTSYS */
3582         resp.val = 0;
3583
3584         EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
3585
3586         EXPECT_EQ(waitpid(pid, &status, 0), pid);
3587         EXPECT_EQ(true, WIFEXITED(status));
3588         EXPECT_EQ(0, WEXITSTATUS(status));
3589 }
3590
3591 TEST(user_notification_closed_listener)
3592 {
3593         pid_t pid;
3594         long ret;
3595         int status, listener;
3596
3597         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3598         ASSERT_EQ(0, ret) {
3599                 TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3600         }
3601
3602         listener = user_notif_syscall(__NR_getppid,
3603                                       SECCOMP_FILTER_FLAG_NEW_LISTENER);
3604         ASSERT_GE(listener, 0);
3605
3606         /*
3607          * Check that we get an ENOSYS when the listener is closed.
3608          */
3609         pid = fork();
3610         ASSERT_GE(pid, 0);
3611         if (pid == 0) {
3612                 close(listener);
3613                 ret = syscall(__NR_getppid);
3614                 exit(ret != -1 && errno != ENOSYS);
3615         }
3616
3617         close(listener);
3618
3619         EXPECT_EQ(waitpid(pid, &status, 0), pid);
3620         EXPECT_EQ(true, WIFEXITED(status));
3621         EXPECT_EQ(0, WEXITSTATUS(status));
3622 }
3623
3624 /*
3625  * Check that a pid in a child namespace still shows up as valid in ours.
3626  */
3627 TEST(user_notification_child_pid_ns)
3628 {
3629         pid_t pid;
3630         int status, listener;
3631         struct seccomp_notif req = {};
3632         struct seccomp_notif_resp resp = {};
3633
3634         ASSERT_EQ(unshare(CLONE_NEWUSER | CLONE_NEWPID), 0) {
3635                 if (errno == EINVAL)
3636                         SKIP(return, "kernel missing CLONE_NEWUSER support");
3637         };
3638
3639         listener = user_notif_syscall(__NR_getppid,
3640                                       SECCOMP_FILTER_FLAG_NEW_LISTENER);
3641         ASSERT_GE(listener, 0);
3642
3643         pid = fork();
3644         ASSERT_GE(pid, 0);
3645
3646         if (pid == 0)
3647                 exit(syscall(__NR_getppid) != USER_NOTIF_MAGIC);
3648
3649         EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
3650         EXPECT_EQ(req.pid, pid);
3651
3652         resp.id = req.id;
3653         resp.error = 0;
3654         resp.val = USER_NOTIF_MAGIC;
3655
3656         EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
3657
3658         EXPECT_EQ(waitpid(pid, &status, 0), pid);
3659         EXPECT_EQ(true, WIFEXITED(status));
3660         EXPECT_EQ(0, WEXITSTATUS(status));
3661         close(listener);
3662 }
3663
3664 /*
3665  * Check that a pid in a sibling (i.e. unrelated) namespace shows up as 0, i.e.
3666  * invalid.
3667  */
3668 TEST(user_notification_sibling_pid_ns)
3669 {
3670         pid_t pid, pid2;
3671         int status, listener;
3672         struct seccomp_notif req = {};
3673         struct seccomp_notif_resp resp = {};
3674
3675         ASSERT_EQ(prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0), 0) {
3676                 TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3677         }
3678
3679         listener = user_notif_syscall(__NR_getppid,
3680                                       SECCOMP_FILTER_FLAG_NEW_LISTENER);
3681         ASSERT_GE(listener, 0);
3682
3683         pid = fork();
3684         ASSERT_GE(pid, 0);
3685
3686         if (pid == 0) {
3687                 ASSERT_EQ(unshare(CLONE_NEWPID), 0);
3688
3689                 pid2 = fork();
3690                 ASSERT_GE(pid2, 0);
3691
3692                 if (pid2 == 0)
3693                         exit(syscall(__NR_getppid) != USER_NOTIF_MAGIC);
3694
3695                 EXPECT_EQ(waitpid(pid2, &status, 0), pid2);
3696                 EXPECT_EQ(true, WIFEXITED(status));
3697                 EXPECT_EQ(0, WEXITSTATUS(status));
3698                 exit(WEXITSTATUS(status));
3699         }
3700
3701         /* Create the sibling ns, and sibling in it. */
3702         ASSERT_EQ(unshare(CLONE_NEWPID), 0) {
3703                 if (errno == EPERM)
3704                         SKIP(return, "CLONE_NEWPID requires CAP_SYS_ADMIN");
3705         }
3706         ASSERT_EQ(errno, 0);
3707
3708         pid2 = fork();
3709         ASSERT_GE(pid2, 0);
3710
3711         if (pid2 == 0) {
3712                 ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
3713                 /*
3714                  * The pid should be 0, i.e. the task is in some namespace that
3715                  * we can't "see".
3716                  */
3717                 EXPECT_EQ(req.pid, 0);
3718
3719                 resp.id = req.id;
3720                 resp.error = 0;
3721                 resp.val = USER_NOTIF_MAGIC;
3722
3723                 ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
3724                 exit(0);
3725         }
3726
3727         close(listener);
3728
3729         EXPECT_EQ(waitpid(pid, &status, 0), pid);
3730         EXPECT_EQ(true, WIFEXITED(status));
3731         EXPECT_EQ(0, WEXITSTATUS(status));
3732
3733         EXPECT_EQ(waitpid(pid2, &status, 0), pid2);
3734         EXPECT_EQ(true, WIFEXITED(status));
3735         EXPECT_EQ(0, WEXITSTATUS(status));
3736 }
3737
3738 TEST(user_notification_fault_recv)
3739 {
3740         pid_t pid;
3741         int status, listener;
3742         struct seccomp_notif req = {};
3743         struct seccomp_notif_resp resp = {};
3744
3745         ASSERT_EQ(unshare(CLONE_NEWUSER), 0);
3746
3747         listener = user_notif_syscall(__NR_getppid,
3748                                       SECCOMP_FILTER_FLAG_NEW_LISTENER);
3749         ASSERT_GE(listener, 0);
3750
3751         pid = fork();
3752         ASSERT_GE(pid, 0);
3753
3754         if (pid == 0)
3755                 exit(syscall(__NR_getppid) != USER_NOTIF_MAGIC);
3756
3757         /* Do a bad recv() */
3758         EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, NULL), -1);
3759         EXPECT_EQ(errno, EFAULT);
3760
3761         /* We should still be able to receive this notification, though. */
3762         EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
3763         EXPECT_EQ(req.pid, pid);
3764
3765         resp.id = req.id;
3766         resp.error = 0;
3767         resp.val = USER_NOTIF_MAGIC;
3768
3769         EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
3770
3771         EXPECT_EQ(waitpid(pid, &status, 0), pid);
3772         EXPECT_EQ(true, WIFEXITED(status));
3773         EXPECT_EQ(0, WEXITSTATUS(status));
3774 }
3775
3776 TEST(seccomp_get_notif_sizes)
3777 {
3778         struct seccomp_notif_sizes sizes;
3779
3780         ASSERT_EQ(seccomp(SECCOMP_GET_NOTIF_SIZES, 0, &sizes), 0);
3781         EXPECT_EQ(sizes.seccomp_notif, sizeof(struct seccomp_notif));
3782         EXPECT_EQ(sizes.seccomp_notif_resp, sizeof(struct seccomp_notif_resp));
3783 }
3784
3785 TEST(user_notification_continue)
3786 {
3787         pid_t pid;
3788         long ret;
3789         int status, listener;
3790         struct seccomp_notif req = {};
3791         struct seccomp_notif_resp resp = {};
3792         struct pollfd pollfd;
3793
3794         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3795         ASSERT_EQ(0, ret) {
3796                 TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3797         }
3798
3799         listener = user_notif_syscall(__NR_dup, SECCOMP_FILTER_FLAG_NEW_LISTENER);
3800         ASSERT_GE(listener, 0);
3801
3802         pid = fork();
3803         ASSERT_GE(pid, 0);
3804
3805         if (pid == 0) {
3806                 int dup_fd, pipe_fds[2];
3807                 pid_t self;
3808
3809                 ASSERT_GE(pipe(pipe_fds), 0);
3810
3811                 dup_fd = dup(pipe_fds[0]);
3812                 ASSERT_GE(dup_fd, 0);
3813                 EXPECT_NE(pipe_fds[0], dup_fd);
3814
3815                 self = getpid();
3816                 ASSERT_EQ(filecmp(self, self, pipe_fds[0], dup_fd), 0);
3817                 exit(0);
3818         }
3819
3820         pollfd.fd = listener;
3821         pollfd.events = POLLIN | POLLOUT;
3822
3823         EXPECT_GT(poll(&pollfd, 1, -1), 0);
3824         EXPECT_EQ(pollfd.revents, POLLIN);
3825
3826         EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
3827
3828         pollfd.fd = listener;
3829         pollfd.events = POLLIN | POLLOUT;
3830
3831         EXPECT_GT(poll(&pollfd, 1, -1), 0);
3832         EXPECT_EQ(pollfd.revents, POLLOUT);
3833
3834         EXPECT_EQ(req.data.nr, __NR_dup);
3835
3836         resp.id = req.id;
3837         resp.flags = SECCOMP_USER_NOTIF_FLAG_CONTINUE;
3838
3839         /*
3840          * Verify that setting SECCOMP_USER_NOTIF_FLAG_CONTINUE enforces other
3841          * args be set to 0.
3842          */
3843         resp.error = 0;
3844         resp.val = USER_NOTIF_MAGIC;
3845         EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), -1);
3846         EXPECT_EQ(errno, EINVAL);
3847
3848         resp.error = USER_NOTIF_MAGIC;
3849         resp.val = 0;
3850         EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), -1);
3851         EXPECT_EQ(errno, EINVAL);
3852
3853         resp.error = 0;
3854         resp.val = 0;
3855         EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0) {
3856                 if (errno == EINVAL)
3857                         SKIP(goto skip, "Kernel does not support SECCOMP_USER_NOTIF_FLAG_CONTINUE");
3858         }
3859
3860 skip:
3861         EXPECT_EQ(waitpid(pid, &status, 0), pid);
3862         EXPECT_EQ(true, WIFEXITED(status));
3863         EXPECT_EQ(0, WEXITSTATUS(status)) {
3864                 if (WEXITSTATUS(status) == 2) {
3865                         SKIP(return, "Kernel does not support kcmp() syscall");
3866                         return;
3867                 }
3868         }
3869 }
3870
3871 TEST(user_notification_filter_empty)
3872 {
3873         pid_t pid;
3874         long ret;
3875         int status;
3876         struct pollfd pollfd;
3877         struct __clone_args args = {
3878                 .flags = CLONE_FILES,
3879                 .exit_signal = SIGCHLD,
3880         };
3881
3882         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3883         ASSERT_EQ(0, ret) {
3884                 TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3885         }
3886
3887         pid = sys_clone3(&args, sizeof(args));
3888         ASSERT_GE(pid, 0);
3889
3890         if (pid == 0) {
3891                 int listener;
3892
3893                 listener = user_notif_syscall(__NR_mknodat, SECCOMP_FILTER_FLAG_NEW_LISTENER);
3894                 if (listener < 0)
3895                         _exit(EXIT_FAILURE);
3896
3897                 if (dup2(listener, 200) != 200)
3898                         _exit(EXIT_FAILURE);
3899
3900                 close(listener);
3901
3902                 _exit(EXIT_SUCCESS);
3903         }
3904
3905         EXPECT_EQ(waitpid(pid, &status, 0), pid);
3906         EXPECT_EQ(true, WIFEXITED(status));
3907         EXPECT_EQ(0, WEXITSTATUS(status));
3908
3909         /*
3910          * The seccomp filter has become unused so we should be notified once
3911          * the kernel gets around to cleaning up task struct.
3912          */
3913         pollfd.fd = 200;
3914         pollfd.events = POLLHUP;
3915
3916         EXPECT_GT(poll(&pollfd, 1, 2000), 0);
3917         EXPECT_GT((pollfd.revents & POLLHUP) ?: 0, 0);
3918 }
3919
/* Thread body that does no work: ignores its argument and returns NULL. */
static void *do_thread(void *data)
{
        void *result = NULL;

        return result;
}
3924
3925 TEST(user_notification_filter_empty_threaded)
3926 {
3927         pid_t pid;
3928         long ret;
3929         int status;
3930         struct pollfd pollfd;
3931         struct __clone_args args = {
3932                 .flags = CLONE_FILES,
3933                 .exit_signal = SIGCHLD,
3934         };
3935
3936         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3937         ASSERT_EQ(0, ret) {
3938                 TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3939         }
3940
3941         pid = sys_clone3(&args, sizeof(args));
3942         ASSERT_GE(pid, 0);
3943
3944         if (pid == 0) {
3945                 pid_t pid1, pid2;
3946                 int listener, status;
3947                 pthread_t thread;
3948
3949                 listener = user_notif_syscall(__NR_dup, SECCOMP_FILTER_FLAG_NEW_LISTENER);
3950                 if (listener < 0)
3951                         _exit(EXIT_FAILURE);
3952
3953                 if (dup2(listener, 200) != 200)
3954                         _exit(EXIT_FAILURE);
3955
3956                 close(listener);
3957
3958                 pid1 = fork();
3959                 if (pid1 < 0)
3960                         _exit(EXIT_FAILURE);
3961
3962                 if (pid1 == 0)
3963                         _exit(EXIT_SUCCESS);
3964
3965                 pid2 = fork();
3966                 if (pid2 < 0)
3967                         _exit(EXIT_FAILURE);
3968
3969                 if (pid2 == 0)
3970                         _exit(EXIT_SUCCESS);
3971
3972                 if (pthread_create(&thread, NULL, do_thread, NULL) ||
3973                     pthread_join(thread, NULL))
3974                         _exit(EXIT_FAILURE);
3975
3976                 if (pthread_create(&thread, NULL, do_thread, NULL) ||
3977                     pthread_join(thread, NULL))
3978                         _exit(EXIT_FAILURE);
3979
3980                 if (waitpid(pid1, &status, 0) != pid1 || !WIFEXITED(status) ||
3981                     WEXITSTATUS(status))
3982                         _exit(EXIT_FAILURE);
3983
3984                 if (waitpid(pid2, &status, 0) != pid2 || !WIFEXITED(status) ||
3985                     WEXITSTATUS(status))
3986                         _exit(EXIT_FAILURE);
3987
3988                 exit(EXIT_SUCCESS);
3989         }
3990
3991         EXPECT_EQ(waitpid(pid, &status, 0), pid);
3992         EXPECT_EQ(true, WIFEXITED(status));
3993         EXPECT_EQ(0, WEXITSTATUS(status));
3994
3995         /*
3996          * The seccomp filter has become unused so we should be notified once
3997          * the kernel gets around to cleaning up task struct.
3998          */
3999         pollfd.fd = 200;
4000         pollfd.events = POLLHUP;
4001
4002         EXPECT_GT(poll(&pollfd, 1, 2000), 0);
4003         EXPECT_GT((pollfd.revents & POLLHUP) ?: 0, 0);
4004 }
4005
4006 TEST(user_notification_addfd)
4007 {
4008         pid_t pid;
4009         long ret;
4010         int status, listener, memfd, fd, nextfd;
4011         struct seccomp_notif_addfd addfd = {};
4012         struct seccomp_notif_addfd_small small = {};
4013         struct seccomp_notif_addfd_big big = {};
4014         struct seccomp_notif req = {};
4015         struct seccomp_notif_resp resp = {};
4016         /* 100 ms */
4017         struct timespec delay = { .tv_nsec = 100000000 };
4018
4019         /* There may be arbitrary already-open fds at test start. */
4020         memfd = memfd_create("test", 0);
4021         ASSERT_GE(memfd, 0);
4022         nextfd = memfd + 1;
4023
4024         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
4025         ASSERT_EQ(0, ret) {
4026                 TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
4027         }
4028
4029         /* fd: 4 */
4030         /* Check that the basic notification machinery works */
4031         listener = user_notif_syscall(__NR_getppid,
4032                                       SECCOMP_FILTER_FLAG_NEW_LISTENER);
4033         ASSERT_EQ(listener, nextfd++);
4034
4035         pid = fork();
4036         ASSERT_GE(pid, 0);
4037
4038         if (pid == 0) {
4039                 /* fds will be added and this value is expected */
4040                 if (syscall(__NR_getppid) != USER_NOTIF_MAGIC)
4041                         exit(1);
4042
4043                 /* Atomic addfd+send is received here. Check it is a valid fd */
4044                 if (fcntl(syscall(__NR_getppid), F_GETFD) == -1)
4045                         exit(1);
4046
4047                 exit(syscall(__NR_getppid) != USER_NOTIF_MAGIC);
4048         }
4049
4050         ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
4051
4052         addfd.srcfd = memfd;
4053         addfd.newfd = 0;
4054         addfd.id = req.id;
4055         addfd.flags = 0x0;
4056
4057         /* Verify bad newfd_flags cannot be set */
4058         addfd.newfd_flags = ~O_CLOEXEC;
4059         EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1);
4060         EXPECT_EQ(errno, EINVAL);
4061         addfd.newfd_flags = O_CLOEXEC;
4062
4063         /* Verify bad flags cannot be set */
4064         addfd.flags = 0xff;
4065         EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1);
4066         EXPECT_EQ(errno, EINVAL);
4067         addfd.flags = 0;
4068
4069         /* Verify that remote_fd cannot be set without setting flags */
4070         addfd.newfd = 1;
4071         EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1);
4072         EXPECT_EQ(errno, EINVAL);
4073         addfd.newfd = 0;
4074
4075         /* Verify small size cannot be set */
4076         EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD_SMALL, &small), -1);
4077         EXPECT_EQ(errno, EINVAL);
4078
4079         /* Verify we can't send bits filled in unknown buffer area */
4080         memset(&big, 0xAA, sizeof(big));
4081         big.addfd = addfd;
4082         EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD_BIG, &big), -1);
4083         EXPECT_EQ(errno, E2BIG);
4084
4085
4086         /* Verify we can set an arbitrary remote fd */
4087         fd = ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd);
4088         EXPECT_EQ(fd, nextfd++);
4089         EXPECT_EQ(filecmp(getpid(), pid, memfd, fd), 0);
4090
4091         /* Verify we can set an arbitrary remote fd with large size */
4092         memset(&big, 0x0, sizeof(big));
4093         big.addfd = addfd;
4094         fd = ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD_BIG, &big);
4095         EXPECT_EQ(fd, nextfd++);
4096
4097         /* Verify we can set a specific remote fd */
4098         addfd.newfd = 42;
4099         addfd.flags = SECCOMP_ADDFD_FLAG_SETFD;
4100         fd = ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd);
4101         EXPECT_EQ(fd, 42);
4102         EXPECT_EQ(filecmp(getpid(), pid, memfd, fd), 0);
4103
4104         /* Resume syscall */
4105         resp.id = req.id;
4106         resp.error = 0;
4107         resp.val = USER_NOTIF_MAGIC;
4108         EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
4109
4110         /*
4111          * This sets the ID of the ADD FD to the last request plus 1. The
4112          * notification ID increments 1 per notification.
4113          */
4114         addfd.id = req.id + 1;
4115
4116         /* This spins until the underlying notification is generated */
4117         while (ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd) != -1 &&
4118                errno != -EINPROGRESS)
4119                 nanosleep(&delay, NULL);
4120
4121         memset(&req, 0, sizeof(req));
4122         ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
4123         ASSERT_EQ(addfd.id, req.id);
4124
4125         /* Verify we can do an atomic addfd and send */
4126         addfd.newfd = 0;
4127         addfd.flags = SECCOMP_ADDFD_FLAG_SEND;
4128         fd = ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd);
4129         /*
4130          * Child has earlier "low" fds and now 42, so we expect the next
4131          * lowest available fd to be assigned here.
4132          */
4133         EXPECT_EQ(fd, nextfd++);
4134         ASSERT_EQ(filecmp(getpid(), pid, memfd, fd), 0);
4135
4136         /*
4137          * This sets the ID of the ADD FD to the last request plus 1. The
4138          * notification ID increments 1 per notification.
4139          */
4140         addfd.id = req.id + 1;
4141
4142         /* This spins until the underlying notification is generated */
4143         while (ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd) != -1 &&
4144                errno != -EINPROGRESS)
4145                 nanosleep(&delay, NULL);
4146
4147         memset(&req, 0, sizeof(req));
4148         ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
4149         ASSERT_EQ(addfd.id, req.id);
4150
4151         resp.id = req.id;
4152         resp.error = 0;
4153         resp.val = USER_NOTIF_MAGIC;
4154         EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
4155
4156         /* Wait for child to finish. */
4157         EXPECT_EQ(waitpid(pid, &status, 0), pid);
4158         EXPECT_EQ(true, WIFEXITED(status));
4159         EXPECT_EQ(0, WEXITSTATUS(status));
4160
4161         close(memfd);
4162 }
4163
4164 TEST(user_notification_addfd_rlimit)
4165 {
4166         pid_t pid;
4167         long ret;
4168         int status, listener, memfd;
4169         struct seccomp_notif_addfd addfd = {};
4170         struct seccomp_notif req = {};
4171         struct seccomp_notif_resp resp = {};
4172         const struct rlimit lim = {
4173                 .rlim_cur       = 0,
4174                 .rlim_max       = 0,
4175         };
4176
4177         memfd = memfd_create("test", 0);
4178         ASSERT_GE(memfd, 0);
4179
4180         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
4181         ASSERT_EQ(0, ret) {
4182                 TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
4183         }
4184
4185         /* Check that the basic notification machinery works */
4186         listener = user_notif_syscall(__NR_getppid,
4187                                       SECCOMP_FILTER_FLAG_NEW_LISTENER);
4188         ASSERT_GE(listener, 0);
4189
4190         pid = fork();
4191         ASSERT_GE(pid, 0);
4192
4193         if (pid == 0)
4194                 exit(syscall(__NR_getppid) != USER_NOTIF_MAGIC);
4195
4196
4197         ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
4198
4199         ASSERT_EQ(prlimit(pid, RLIMIT_NOFILE, &lim, NULL), 0);
4200
4201         addfd.srcfd = memfd;
4202         addfd.newfd_flags = O_CLOEXEC;
4203         addfd.newfd = 0;
4204         addfd.id = req.id;
4205         addfd.flags = 0;
4206
4207         /* Should probably spot check /proc/sys/fs/file-nr */
4208         EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1);
4209         EXPECT_EQ(errno, EMFILE);
4210
4211         addfd.flags = SECCOMP_ADDFD_FLAG_SEND;
4212         EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1);
4213         EXPECT_EQ(errno, EMFILE);
4214
4215         addfd.newfd = 100;
4216         addfd.flags = SECCOMP_ADDFD_FLAG_SETFD;
4217         EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1);
4218         EXPECT_EQ(errno, EBADF);
4219
4220         resp.id = req.id;
4221         resp.error = 0;
4222         resp.val = USER_NOTIF_MAGIC;
4223
4224         EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
4225
4226         /* Wait for child to finish. */
4227         EXPECT_EQ(waitpid(pid, &status, 0), pid);
4228         EXPECT_EQ(true, WIFEXITED(status));
4229         EXPECT_EQ(0, WEXITSTATUS(status));
4230
4231         close(memfd);
4232 }
4233
4234 /*
4235  * TODO:
4236  * - expand NNP testing
4237  * - better arch-specific TRACE and TRAP handlers.
4238  * - endianness checking when appropriate
4239  * - 64-bit arg prodding
4240  * - arch value testing (x86 modes especially)
4241  * - verify that FILTER_FLAG_LOG filters generate log messages
4242  * - verify that RET_LOG generates log messages
4243  */
4244
4245 TEST_HARNESS_MAIN