Bitcoin Core  22.99.0
P2P Digital Currency
syscall_sandbox.cpp
Go to the documentation of this file.
1 // Copyright (c) 2020-2021 The Bitcoin Core developers
2 // Distributed under the MIT software license, see the accompanying
3 // file COPYING or http://www.opensource.org/licenses/mit-license.php.
4 
5 #if defined(HAVE_CONFIG_H)
7 #endif // defined(HAVE_CONFIG_H)
8 
9 #include <util/syscall_sandbox.h>
10 
11 #if defined(USE_SYSCALL_SANDBOX)
12 #include <array>
13 #include <cassert>
14 #include <cstdint>
15 #include <exception>
16 #include <map>
17 #include <new>
18 #include <set>
19 #include <string>
20 #include <vector>
21 
22 #include <logging.h>
23 #include <tinyformat.h>
24 #include <util/threadnames.h>
25 
26 #include <linux/audit.h>
27 #include <linux/filter.h>
28 #include <linux/seccomp.h>
29 #include <linux/unistd.h>
30 #include <signal.h>
31 #include <sys/prctl.h>
32 #include <sys/types.h>
33 #include <unistd.h>
34 
35 namespace {
// True once SetupSyscallSandbox(...) has been called; that function asserts it is still false on entry.
bool g_syscall_sandbox_enabled = false;
// Copy of the flag passed to SetupSyscallSandbox(...): whether a violation should be logged
// before the process is terminated.
bool g_syscall_sandbox_log_violation_before_terminating = false;
38 
39 #if !defined(__x86_64__)
40 #error Syscall sandbox is an experimental feature currently available only under Linux x86-64.
41 #endif // !defined(__x86_64__)
42 
43 #ifndef SECCOMP_RET_KILL_PROCESS
44 #define SECCOMP_RET_KILL_PROCESS 0x80000000U
45 #endif
46 
47 // Define system call numbers for x86_64 that are referenced in the system call profile
48 // but not provided by the kernel headers used in the GUIX build.
49 // Usually, they can be found via "grep name /usr/include/x86_64-linux-gnu/asm/unistd_64.h"
50 
51 #ifndef __NR_clone3
52 #define __NR_clone3 435
53 #endif
54 
55 #ifndef __NR_statx
56 #define __NR_statx 332
57 #endif
58 
59 #ifndef __NR_getrandom
60 #define __NR_getrandom 318
61 #endif
62 
63 #ifndef __NR_membarrier
64 #define __NR_membarrier 324
65 #endif
66 
67 #ifndef __NR_copy_file_range
68 #define __NR_copy_file_range 326
69 #endif
70 
71 // This list of syscalls in LINUX_SYSCALLS is only used to map syscall numbers to syscall names in
72 // order to be able to print user friendly error messages which include the syscall name in addition
73 // to the syscall number.
74 //
75 // Example output in case of a syscall violation where the syscall is present in LINUX_SYSCALLS:
76 //
77 // ```
78 // 2021-06-09T12:34:56Z ERROR: The syscall "execve" (syscall number 59) is not allowed by the syscall sandbox in thread "msghand". Please report.
79 // ```
80 //
81 // Example output in case of a syscall violation where the syscall is not present in LINUX_SYSCALLS:
82 //
83 // ```
84 // 2021-06-09T12:34:56Z ERROR: The syscall "*unknown*" (syscall number 314) is not allowed by the syscall sandbox in thread "msghand". Please report.
85 // ```
86 //
87 // LINUX_SYSCALLS contains two types of syscalls:
88 // 1.) Syscalls that are present under all architectures or relevant Linux kernel versions for which
89 // we support the syscall sandbox feature (currently only Linux x86-64). Examples include read,
90 // write, open, close, etc.
91 // 2.) Syscalls that are present under a subset of architectures or relevant Linux kernel versions
92 // for which we support the syscall sandbox feature. This type of syscalls should be added to
93 // LINUX_SYSCALLS conditional on availability like in the following example:
94 // ...
95 // #if defined(__NR_arch_dependent_syscall)
96 // {__NR_arch_dependent_syscall, "arch_dependent_syscall"},
97 // #endif // defined(__NR_arch_dependent_syscall)
98 // ...
// Lookup table from syscall number to syscall name. It is consulted only to produce readable
// violation messages; it does not influence which syscalls are allowed.
// Entries guarded by #ifdef are syscalls that some supported kernel headers do not define.
const std::map<uint32_t, std::string> LINUX_SYSCALLS{
    {__NR_accept, "accept"},
    {__NR_accept4, "accept4"},
    {__NR_access, "access"},
    {__NR_acct, "acct"},
    {__NR_add_key, "add_key"},
    {__NR_adjtimex, "adjtimex"},
    {__NR_afs_syscall, "afs_syscall"},
    {__NR_alarm, "alarm"},
    {__NR_arch_prctl, "arch_prctl"},
    {__NR_bind, "bind"},
    {__NR_bpf, "bpf"},
    {__NR_brk, "brk"},
    {__NR_capget, "capget"},
    {__NR_capset, "capset"},
    {__NR_chdir, "chdir"},
    {__NR_chmod, "chmod"},
    {__NR_chown, "chown"},
    {__NR_chroot, "chroot"},
    {__NR_clock_adjtime, "clock_adjtime"},
    {__NR_clock_getres, "clock_getres"},
    {__NR_clock_gettime, "clock_gettime"},
    {__NR_clock_nanosleep, "clock_nanosleep"},
    {__NR_clock_settime, "clock_settime"},
    {__NR_clone, "clone"},
    {__NR_clone3, "clone3"},
    {__NR_close, "close"},
    {__NR_connect, "connect"},
    {__NR_copy_file_range, "copy_file_range"},
    {__NR_creat, "creat"},
    {__NR_create_module, "create_module"},
    {__NR_delete_module, "delete_module"},
    {__NR_dup, "dup"},
    {__NR_dup2, "dup2"},
    {__NR_dup3, "dup3"},
    {__NR_epoll_create, "epoll_create"},
    {__NR_epoll_create1, "epoll_create1"},
    {__NR_epoll_ctl, "epoll_ctl"},
    {__NR_epoll_ctl_old, "epoll_ctl_old"},
    {__NR_epoll_pwait, "epoll_pwait"},
    {__NR_epoll_wait, "epoll_wait"},
    {__NR_epoll_wait_old, "epoll_wait_old"},
    {__NR_eventfd, "eventfd"},
    {__NR_eventfd2, "eventfd2"},
    {__NR_execve, "execve"},
    {__NR_execveat, "execveat"},
    {__NR_exit, "exit"},
    {__NR_exit_group, "exit_group"},
    {__NR_faccessat, "faccessat"},
    {__NR_fadvise64, "fadvise64"},
    {__NR_fallocate, "fallocate"},
    {__NR_fanotify_init, "fanotify_init"},
    {__NR_fanotify_mark, "fanotify_mark"},
    {__NR_fchdir, "fchdir"},
    {__NR_fchmod, "fchmod"},
    {__NR_fchmodat, "fchmodat"},
    {__NR_fchown, "fchown"},
    {__NR_fchownat, "fchownat"},
    {__NR_fcntl, "fcntl"},
    {__NR_fdatasync, "fdatasync"},
    {__NR_fgetxattr, "fgetxattr"},
    {__NR_finit_module, "finit_module"},
    {__NR_flistxattr, "flistxattr"},
    {__NR_flock, "flock"},
    {__NR_fork, "fork"},
    {__NR_fremovexattr, "fremovexattr"},
    {__NR_fsetxattr, "fsetxattr"},
    {__NR_fstat, "fstat"},
    {__NR_fstatfs, "fstatfs"},
    {__NR_fsync, "fsync"},
    {__NR_ftruncate, "ftruncate"},
    {__NR_futex, "futex"},
    {__NR_futimesat, "futimesat"},
    {__NR_get_kernel_syms, "get_kernel_syms"},
    {__NR_get_mempolicy, "get_mempolicy"},
    {__NR_get_robust_list, "get_robust_list"},
    {__NR_get_thread_area, "get_thread_area"},
    {__NR_getcpu, "getcpu"},
    {__NR_getcwd, "getcwd"},
    {__NR_getdents, "getdents"},
    {__NR_getdents64, "getdents64"},
    {__NR_getegid, "getegid"},
    {__NR_geteuid, "geteuid"},
    {__NR_getgid, "getgid"},
    {__NR_getgroups, "getgroups"},
    {__NR_getitimer, "getitimer"},
    {__NR_getpeername, "getpeername"},
    {__NR_getpgid, "getpgid"},
    {__NR_getpgrp, "getpgrp"},
    {__NR_getpid, "getpid"},
    {__NR_getpmsg, "getpmsg"},
    {__NR_getppid, "getppid"},
    {__NR_getpriority, "getpriority"},
    {__NR_getrandom, "getrandom"},
    {__NR_getresgid, "getresgid"},
    {__NR_getresuid, "getresuid"},
    {__NR_getrlimit, "getrlimit"},
    {__NR_getrusage, "getrusage"},
    {__NR_getsid, "getsid"},
    {__NR_getsockname, "getsockname"},
    {__NR_getsockopt, "getsockopt"},
    {__NR_gettid, "gettid"},
    {__NR_gettimeofday, "gettimeofday"},
    {__NR_getuid, "getuid"},
    {__NR_getxattr, "getxattr"},
    {__NR_init_module, "init_module"},
    {__NR_inotify_add_watch, "inotify_add_watch"},
    {__NR_inotify_init, "inotify_init"},
    {__NR_inotify_init1, "inotify_init1"},
    {__NR_inotify_rm_watch, "inotify_rm_watch"},
    {__NR_io_cancel, "io_cancel"},
    {__NR_io_destroy, "io_destroy"},
    {__NR_io_getevents, "io_getevents"},
    {__NR_io_setup, "io_setup"},
    {__NR_io_submit, "io_submit"},
    {__NR_ioctl, "ioctl"},
    {__NR_ioperm, "ioperm"},
    {__NR_iopl, "iopl"},
    {__NR_ioprio_get, "ioprio_get"},
    {__NR_ioprio_set, "ioprio_set"},
    {__NR_kcmp, "kcmp"},
    {__NR_kexec_file_load, "kexec_file_load"},
    {__NR_kexec_load, "kexec_load"},
    {__NR_keyctl, "keyctl"},
    {__NR_kill, "kill"},
    {__NR_lchown, "lchown"},
    {__NR_lgetxattr, "lgetxattr"},
    {__NR_link, "link"},
    {__NR_linkat, "linkat"},
    {__NR_listen, "listen"},
    {__NR_listxattr, "listxattr"},
    {__NR_llistxattr, "llistxattr"},
    {__NR_lookup_dcookie, "lookup_dcookie"},
    {__NR_lremovexattr, "lremovexattr"},
    {__NR_lseek, "lseek"},
    {__NR_lsetxattr, "lsetxattr"},
    {__NR_lstat, "lstat"},
    {__NR_madvise, "madvise"},
    {__NR_mbind, "mbind"},
    {__NR_membarrier, "membarrier"},
    {__NR_memfd_create, "memfd_create"},
    {__NR_migrate_pages, "migrate_pages"},
    {__NR_mincore, "mincore"},
    {__NR_mkdir, "mkdir"},
    {__NR_mkdirat, "mkdirat"},
    {__NR_mknod, "mknod"},
    {__NR_mknodat, "mknodat"},
    {__NR_mlock, "mlock"},
    {__NR_mlock2, "mlock2"},
    {__NR_mlockall, "mlockall"},
    {__NR_mmap, "mmap"},
    {__NR_modify_ldt, "modify_ldt"},
    {__NR_mount, "mount"},
    {__NR_move_pages, "move_pages"},
    {__NR_mprotect, "mprotect"},
    {__NR_mq_getsetattr, "mq_getsetattr"},
    {__NR_mq_notify, "mq_notify"},
    {__NR_mq_open, "mq_open"},
    {__NR_mq_timedreceive, "mq_timedreceive"},
    {__NR_mq_timedsend, "mq_timedsend"},
    {__NR_mq_unlink, "mq_unlink"},
    {__NR_mremap, "mremap"},
    {__NR_msgctl, "msgctl"},
    {__NR_msgget, "msgget"},
    {__NR_msgrcv, "msgrcv"},
    {__NR_msgsnd, "msgsnd"},
    {__NR_msync, "msync"},
    {__NR_munlock, "munlock"},
    {__NR_munlockall, "munlockall"},
    {__NR_munmap, "munmap"},
    {__NR_name_to_handle_at, "name_to_handle_at"},
    {__NR_nanosleep, "nanosleep"},
    {__NR_newfstatat, "newfstatat"},
    {__NR_nfsservctl, "nfsservctl"},
    {__NR_open, "open"},
    {__NR_open_by_handle_at, "open_by_handle_at"},
    {__NR_openat, "openat"},
    {__NR_pause, "pause"},
    {__NR_perf_event_open, "perf_event_open"},
    {__NR_personality, "personality"},
    {__NR_pipe, "pipe"},
    {__NR_pipe2, "pipe2"},
    {__NR_pivot_root, "pivot_root"},
#ifdef __NR_pkey_alloc
    {__NR_pkey_alloc, "pkey_alloc"},
#endif
#ifdef __NR_pkey_free
    {__NR_pkey_free, "pkey_free"},
#endif
#ifdef __NR_pkey_mprotect
    {__NR_pkey_mprotect, "pkey_mprotect"},
#endif
    {__NR_poll, "poll"},
    {__NR_ppoll, "ppoll"},
    {__NR_prctl, "prctl"},
    {__NR_pread64, "pread64"},
    {__NR_preadv, "preadv"},
#ifdef __NR_preadv2
    {__NR_preadv2, "preadv2"},
#endif
    {__NR_prlimit64, "prlimit64"},
    {__NR_process_vm_readv, "process_vm_readv"},
    {__NR_process_vm_writev, "process_vm_writev"},
    {__NR_pselect6, "pselect6"},
    {__NR_ptrace, "ptrace"},
    {__NR_putpmsg, "putpmsg"},
    {__NR_pwrite64, "pwrite64"},
    {__NR_pwritev, "pwritev"},
#ifdef __NR_pwritev2
    {__NR_pwritev2, "pwritev2"},
#endif
    {__NR__sysctl, "_sysctl"},
    {__NR_query_module, "query_module"},
    {__NR_quotactl, "quotactl"},
    {__NR_read, "read"},
    {__NR_readahead, "readahead"},
    {__NR_readlink, "readlink"},
    {__NR_readlinkat, "readlinkat"},
    {__NR_readv, "readv"},
    {__NR_reboot, "reboot"},
    {__NR_recvfrom, "recvfrom"},
    {__NR_recvmmsg, "recvmmsg"},
    {__NR_recvmsg, "recvmsg"},
    {__NR_remap_file_pages, "remap_file_pages"},
    {__NR_removexattr, "removexattr"},
    {__NR_rename, "rename"},
    {__NR_renameat, "renameat"},
    {__NR_renameat2, "renameat2"},
    {__NR_request_key, "request_key"},
    {__NR_restart_syscall, "restart_syscall"},
    {__NR_rmdir, "rmdir"},
    {__NR_rt_sigaction, "rt_sigaction"},
    {__NR_rt_sigpending, "rt_sigpending"},
    {__NR_rt_sigprocmask, "rt_sigprocmask"},
    {__NR_rt_sigqueueinfo, "rt_sigqueueinfo"},
    {__NR_rt_sigreturn, "rt_sigreturn"},
    {__NR_rt_sigsuspend, "rt_sigsuspend"},
    {__NR_rt_sigtimedwait, "rt_sigtimedwait"},
    {__NR_rt_tgsigqueueinfo, "rt_tgsigqueueinfo"},
    {__NR_sched_get_priority_max, "sched_get_priority_max"},
    {__NR_sched_get_priority_min, "sched_get_priority_min"},
    {__NR_sched_getaffinity, "sched_getaffinity"},
    {__NR_sched_getattr, "sched_getattr"},
    {__NR_sched_getparam, "sched_getparam"},
    {__NR_sched_getscheduler, "sched_getscheduler"},
    {__NR_sched_rr_get_interval, "sched_rr_get_interval"},
    {__NR_sched_setaffinity, "sched_setaffinity"},
    {__NR_sched_setattr, "sched_setattr"},
    {__NR_sched_setparam, "sched_setparam"},
    {__NR_sched_setscheduler, "sched_setscheduler"},
    {__NR_sched_yield, "sched_yield"},
    {__NR_seccomp, "seccomp"},
    {__NR_security, "security"},
    {__NR_select, "select"},
    {__NR_semctl, "semctl"},
    {__NR_semget, "semget"},
    {__NR_semop, "semop"},
    {__NR_semtimedop, "semtimedop"},
    {__NR_sendfile, "sendfile"},
    {__NR_sendmmsg, "sendmmsg"},
    {__NR_sendmsg, "sendmsg"},
    {__NR_sendto, "sendto"},
    {__NR_set_mempolicy, "set_mempolicy"},
    {__NR_set_robust_list, "set_robust_list"},
    {__NR_set_thread_area, "set_thread_area"},
    {__NR_set_tid_address, "set_tid_address"},
    {__NR_setdomainname, "setdomainname"},
    {__NR_setfsgid, "setfsgid"},
    {__NR_setfsuid, "setfsuid"},
    {__NR_setgid, "setgid"},
    {__NR_setgroups, "setgroups"},
    {__NR_sethostname, "sethostname"},
    {__NR_setitimer, "setitimer"},
    {__NR_setns, "setns"},
    {__NR_setpgid, "setpgid"},
    {__NR_setpriority, "setpriority"},
    {__NR_setregid, "setregid"},
    {__NR_setresgid, "setresgid"},
    {__NR_setresuid, "setresuid"},
    {__NR_setreuid, "setreuid"},
    {__NR_setrlimit, "setrlimit"},
    {__NR_setsid, "setsid"},
    {__NR_setsockopt, "setsockopt"},
    {__NR_settimeofday, "settimeofday"},
    {__NR_setuid, "setuid"},
    {__NR_setxattr, "setxattr"},
    {__NR_shmat, "shmat"},
    {__NR_shmctl, "shmctl"},
    {__NR_shmdt, "shmdt"},
    {__NR_shmget, "shmget"},
    {__NR_shutdown, "shutdown"},
    {__NR_sigaltstack, "sigaltstack"},
    {__NR_signalfd, "signalfd"},
    {__NR_signalfd4, "signalfd4"},
    {__NR_socket, "socket"},
    {__NR_socketpair, "socketpair"},
    {__NR_splice, "splice"},
    {__NR_stat, "stat"},
    {__NR_statfs, "statfs"},
    {__NR_statx, "statx"},
    {__NR_swapoff, "swapoff"},
    {__NR_swapon, "swapon"},
    {__NR_symlink, "symlink"},
    {__NR_symlinkat, "symlinkat"},
    {__NR_sync, "sync"},
    {__NR_sync_file_range, "sync_file_range"},
    {__NR_syncfs, "syncfs"},
    {__NR_sysfs, "sysfs"},
    {__NR_sysinfo, "sysinfo"},
    {__NR_syslog, "syslog"},
    {__NR_tee, "tee"},
    {__NR_tgkill, "tgkill"},
    {__NR_time, "time"},
    {__NR_timer_create, "timer_create"},
    {__NR_timer_delete, "timer_delete"},
    {__NR_timer_getoverrun, "timer_getoverrun"},
    {__NR_timer_gettime, "timer_gettime"},
    {__NR_timer_settime, "timer_settime"},
    {__NR_timerfd_create, "timerfd_create"},
    {__NR_timerfd_gettime, "timerfd_gettime"},
    {__NR_timerfd_settime, "timerfd_settime"},
    {__NR_times, "times"},
    {__NR_tkill, "tkill"},
    {__NR_truncate, "truncate"},
    {__NR_tuxcall, "tuxcall"},
    {__NR_umask, "umask"},
    {__NR_umount2, "umount2"},
    {__NR_uname, "uname"},
    {__NR_unlink, "unlink"},
    {__NR_unlinkat, "unlinkat"},
    {__NR_unshare, "unshare"},
    {__NR_uselib, "uselib"},
    {__NR_userfaultfd, "userfaultfd"},
    {__NR_ustat, "ustat"},
    {__NR_utime, "utime"},
    {__NR_utimensat, "utimensat"},
    {__NR_utimes, "utimes"},
    {__NR_vfork, "vfork"},
    {__NR_vhangup, "vhangup"},
    {__NR_vmsplice, "vmsplice"},
    {__NR_vserver, "vserver"},
    {__NR_wait4, "wait4"},
    {__NR_waitid, "waitid"},
    {__NR_write, "write"},
    {__NR_writev, "writev"},
};
445 
446 std::string GetLinuxSyscallName(uint32_t syscall_number)
447 {
448  const auto element = LINUX_SYSCALLS.find(syscall_number);
449  if (element != LINUX_SYSCALLS.end()) {
450  return element->second;
451  }
452  return "*unknown*";
453 }
454 
455 // See Linux kernel developer Kees Cook's seccomp guide at <https://outflux.net/teach-seccomp/> for
456 // an accessible introduction to using seccomp.
457 //
458 // This function largely follows <https://outflux.net/teach-seccomp/step-3/syscall-reporter.c> and
459 // <https://outflux.net/teach-seccomp/step-3/seccomp-bpf.h>.
460 //
461 // Seccomp BPF resources:
462 // * Seccomp BPF documentation: <https://www.kernel.org/doc/html/latest/userspace-api/seccomp_filter.html>
463 // * seccomp(2) manual page: <https://www.kernel.org/doc/man-pages/online/pages/man2/seccomp.2.html>
464 // * Seccomp BPF demo code samples: <https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/samples/seccomp>
465 void SyscallSandboxDebugSignalHandler(int, siginfo_t* signal_info, void* void_signal_context)
466 {
467  // The si_code field inside the siginfo_t argument that is passed to a SA_SIGINFO signal handler
468  // is a value indicating why the signal was sent.
469  //
470  // The following value can be placed in si_code for a SIGSYS signal:
471  // * SYS_SECCOMP (since Linux 3.5): Triggered by a seccomp(2) filter rule.
472  constexpr int32_t SYS_SECCOMP_SI_CODE{1};
473  assert(signal_info->si_code == SYS_SECCOMP_SI_CODE);
474 
475  // The ucontext_t structure contains signal context information that was saved on the user-space
476  // stack by the kernel.
477  const ucontext_t* signal_context = static_cast<ucontext_t*>(void_signal_context);
478  assert(signal_context != nullptr);
479 
480  std::set_new_handler(std::terminate);
481  // Portability note: REG_RAX is Linux x86_64 specific.
482  const uint32_t syscall_number = static_cast<uint32_t>(signal_context->uc_mcontext.gregs[REG_RAX]);
483  const std::string syscall_name = GetLinuxSyscallName(syscall_number);
484  const std::string thread_name = !util::ThreadGetInternalName().empty() ? util::ThreadGetInternalName() : "*unnamed*";
485  const std::string error_message = strprintf("ERROR: The syscall \"%s\" (syscall number %d) is not allowed by the syscall sandbox in thread \"%s\". Please report.", syscall_name, syscall_number, thread_name);
486  tfm::format(std::cerr, "%s\n", error_message);
487  LogPrintf("%s\n", error_message);
488  std::terminate();
489 }
490 
491 // This function largely follows install_syscall_reporter from Kees Cook's seccomp guide:
492 // <https://outflux.net/teach-seccomp/step-3/syscall-reporter.c>
493 bool SetupSyscallSandboxDebugHandler()
494 {
495  struct sigaction action = {};
496  sigset_t mask;
497  sigemptyset(&mask);
498  sigaddset(&mask, SIGSYS);
499  action.sa_sigaction = &SyscallSandboxDebugSignalHandler;
500  action.sa_flags = SA_SIGINFO;
501  if (sigaction(SIGSYS, &action, nullptr) < 0) {
502  return false;
503  }
504  if (sigprocmask(SIG_UNBLOCK, &mask, nullptr)) {
505  return false;
506  }
507  return true;
508 }
509 
// What the generated seccomp filter does with a syscall that is not on the allow list.
enum class SyscallSandboxAction {
    KILL_PROCESS,          // filter returns SECCOMP_RET_KILL_PROCESS
    INVOKE_SIGNAL_HANDLER, // filter returns SECCOMP_RET_TRAP
};

// Accumulates a set of allowed syscall numbers and converts it into a seccomp BPF program.
//
// The constructor enables a baseline of syscall groups; callers can widen the policy further by
// calling the remaining Allow*() methods (AllowFileSystem(), AllowNetwork()) before BuildFilter().
// Every Allow*() method only inserts into a std::set, so calling one more than once is harmless.
class SeccompPolicyBuilder
{
    std::set<uint32_t> allowed_syscalls;

public:
    SeccompPolicyBuilder()
    {
        // Allowed by default.
        AllowAddressSpaceAccess();
        AllowEpoll();
        AllowEventFd();
        AllowFutex();
        AllowGeneralIo();
        AllowGetRandom();
        AllowGetSimpleId();
        AllowGetTime();
        AllowGlobalProcessEnvironment();
        AllowGlobalSystemStatus();
        AllowKernelInternalApi();
        AllowNetworkSocketInformation();
        AllowOperationOnExistingFileDescriptor();
        AllowPipe();
        AllowPrctl();
        AllowProcessStartOrDeath();
        AllowScheduling();
        AllowSignalHandling();
        AllowSleep();
        AllowUmask();
    }

    void AllowAddressSpaceAccess()
    {
        allowed_syscalls.insert(__NR_brk);        // change data segment size
        allowed_syscalls.insert(__NR_madvise);    // give advice about use of memory
        allowed_syscalls.insert(__NR_membarrier); // issue memory barriers on a set of threads
        allowed_syscalls.insert(__NR_mincore);    // check if virtual memory is in RAM
        allowed_syscalls.insert(__NR_mlock);      // lock memory
        allowed_syscalls.insert(__NR_mmap);       // map files or devices into memory
        allowed_syscalls.insert(__NR_mprotect);   // set protection on a region of memory
        allowed_syscalls.insert(__NR_mremap);     // remap a file in memory
        allowed_syscalls.insert(__NR_munlock);    // unlock memory
        allowed_syscalls.insert(__NR_munmap);     // unmap files or devices from memory
    }

    void AllowEpoll()
    {
        allowed_syscalls.insert(__NR_epoll_create1); // open an epoll file descriptor
        allowed_syscalls.insert(__NR_epoll_ctl);     // control interface for an epoll file descriptor
        allowed_syscalls.insert(__NR_epoll_pwait);   // wait for an I/O event on an epoll file descriptor
        allowed_syscalls.insert(__NR_epoll_wait);    // wait for an I/O event on an epoll file descriptor
    }

    void AllowEventFd()
    {
        allowed_syscalls.insert(__NR_eventfd2); // create a file descriptor for event notification
    }

    // Not part of the default set; must be requested explicitly.
    void AllowFileSystem()
    {
        allowed_syscalls.insert(__NR_access);          // check user's permissions for a file
        allowed_syscalls.insert(__NR_chdir);           // change working directory
        allowed_syscalls.insert(__NR_chmod);           // change permissions of a file
        allowed_syscalls.insert(__NR_copy_file_range); // copy a range of data from one file to another
        allowed_syscalls.insert(__NR_fallocate);       // manipulate file space
        allowed_syscalls.insert(__NR_fchmod);          // change permissions of a file
        allowed_syscalls.insert(__NR_fchown);          // change ownership of a file
        allowed_syscalls.insert(__NR_fdatasync);       // synchronize a file's in-core state with storage device
        allowed_syscalls.insert(__NR_flock);           // apply or remove an advisory lock on an open file
        allowed_syscalls.insert(__NR_fstat);           // get file status
        allowed_syscalls.insert(__NR_fstatfs);         // get file system status
        allowed_syscalls.insert(__NR_fsync);           // synchronize a file's in-core state with storage device
        allowed_syscalls.insert(__NR_ftruncate);       // truncate a file to a specified length
        allowed_syscalls.insert(__NR_getcwd);          // get current working directory
        allowed_syscalls.insert(__NR_getdents);        // get directory entries
        allowed_syscalls.insert(__NR_getdents64);      // get directory entries
        allowed_syscalls.insert(__NR_lstat);           // get file status
        allowed_syscalls.insert(__NR_mkdir);           // create a directory
        allowed_syscalls.insert(__NR_newfstatat);      // get file status
        allowed_syscalls.insert(__NR_open);            // open and possibly create a file
        allowed_syscalls.insert(__NR_openat);          // open and possibly create a file
        allowed_syscalls.insert(__NR_readlink);        // read value of a symbolic link
        allowed_syscalls.insert(__NR_rename);          // change the name or location of a file
        allowed_syscalls.insert(__NR_rmdir);           // delete a directory
        allowed_syscalls.insert(__NR_stat);            // get file status
        allowed_syscalls.insert(__NR_statfs);          // get filesystem statistics
        allowed_syscalls.insert(__NR_statx);           // get file status (extended)
        allowed_syscalls.insert(__NR_unlink);          // delete a name and possibly the file it refers to
    }

    void AllowFutex()
    {
        allowed_syscalls.insert(__NR_futex);           // fast user-space locking
        allowed_syscalls.insert(__NR_set_robust_list); // set list of robust futexes
    }

    void AllowGeneralIo()
    {
        allowed_syscalls.insert(__NR_ioctl);    // control device
        allowed_syscalls.insert(__NR_lseek);    // reposition read/write file offset
        allowed_syscalls.insert(__NR_poll);     // wait for some event on a file descriptor
        allowed_syscalls.insert(__NR_ppoll);    // wait for some event on a file descriptor
        allowed_syscalls.insert(__NR_pread64);  // read from a file descriptor at a given offset
        allowed_syscalls.insert(__NR_pwrite64); // write to a file descriptor at a given offset
        allowed_syscalls.insert(__NR_read);     // read from a file descriptor
        allowed_syscalls.insert(__NR_readv);    // read data into multiple buffers
        allowed_syscalls.insert(__NR_recvfrom); // receive a message from a socket
        allowed_syscalls.insert(__NR_recvmsg);  // receive a message from a socket
        allowed_syscalls.insert(__NR_select);   // synchronous I/O multiplexing
        allowed_syscalls.insert(__NR_sendmmsg); // send multiple messages on a socket
        allowed_syscalls.insert(__NR_sendmsg);  // send a message on a socket
        allowed_syscalls.insert(__NR_sendto);   // send a message on a socket
        allowed_syscalls.insert(__NR_write);    // write to a file descriptor
        allowed_syscalls.insert(__NR_writev);   // write data into multiple buffers
    }

    void AllowGetRandom()
    {
        allowed_syscalls.insert(__NR_getrandom); // obtain a series of random bytes
    }

    void AllowGetSimpleId()
    {
        allowed_syscalls.insert(__NR_getegid);   // get group identity
        allowed_syscalls.insert(__NR_geteuid);   // get user identity
        allowed_syscalls.insert(__NR_getgid);    // get group identity
        allowed_syscalls.insert(__NR_getpgid);   // get process group
        allowed_syscalls.insert(__NR_getpid);    // get process identification
        allowed_syscalls.insert(__NR_getppid);   // get process identification
        allowed_syscalls.insert(__NR_getresgid); // get real, effective and saved group IDs
        allowed_syscalls.insert(__NR_getresuid); // get real, effective and saved user IDs
        allowed_syscalls.insert(__NR_getsid);    // get session ID
        allowed_syscalls.insert(__NR_gettid);    // get thread identification
        allowed_syscalls.insert(__NR_getuid);    // get user identity
    }

    void AllowGetTime()
    {
        allowed_syscalls.insert(__NR_clock_getres);  // find the resolution (precision) of the specified clock
        allowed_syscalls.insert(__NR_clock_gettime); // retrieve the time of the specified clock
        allowed_syscalls.insert(__NR_gettimeofday);  // get timeval
    }

    void AllowGlobalProcessEnvironment()
    {
        allowed_syscalls.insert(__NR_getrlimit); // get resource limits
        allowed_syscalls.insert(__NR_getrusage); // get resource usage
        allowed_syscalls.insert(__NR_prlimit64); // get/set resource limits
    }

    void AllowGlobalSystemStatus()
    {
        allowed_syscalls.insert(__NR_sysinfo); // return system information
        allowed_syscalls.insert(__NR_uname);   // get name and information about current kernel
    }

    void AllowKernelInternalApi()
    {
        allowed_syscalls.insert(__NR_restart_syscall); // restart a system call after interruption by a stop signal
    }

    // Not part of the default set; must be requested explicitly.
    void AllowNetwork()
    {
        allowed_syscalls.insert(__NR_accept);     // accept a connection on a socket
        allowed_syscalls.insert(__NR_accept4);    // accept a connection on a socket
        allowed_syscalls.insert(__NR_bind);       // bind a name to a socket
        allowed_syscalls.insert(__NR_connect);    // initiate a connection on a socket
        allowed_syscalls.insert(__NR_listen);     // listen for connections on a socket
        allowed_syscalls.insert(__NR_setsockopt); // set options on sockets
        allowed_syscalls.insert(__NR_socket);     // create an endpoint for communication
        allowed_syscalls.insert(__NR_socketpair); // create a pair of connected sockets
    }

    void AllowNetworkSocketInformation()
    {
        allowed_syscalls.insert(__NR_getpeername); // get name of connected peer socket
        allowed_syscalls.insert(__NR_getsockname); // get socket name
        allowed_syscalls.insert(__NR_getsockopt);  // get options on sockets
    }

    void AllowOperationOnExistingFileDescriptor()
    {
        allowed_syscalls.insert(__NR_close);    // close a file descriptor
        allowed_syscalls.insert(__NR_dup);      // duplicate a file descriptor
        allowed_syscalls.insert(__NR_dup2);     // duplicate a file descriptor
        allowed_syscalls.insert(__NR_fcntl);    // manipulate file descriptor
        allowed_syscalls.insert(__NR_shutdown); // shut down part of a full-duplex connection
    }

    void AllowPipe()
    {
        allowed_syscalls.insert(__NR_pipe);  // create pipe
        allowed_syscalls.insert(__NR_pipe2); // create pipe
    }

    void AllowPrctl()
    {
        allowed_syscalls.insert(__NR_arch_prctl); // set architecture-specific thread state
        allowed_syscalls.insert(__NR_prctl);      // operations on a process
    }

    void AllowProcessStartOrDeath()
    {
        allowed_syscalls.insert(__NR_clone);      // create a child process
        allowed_syscalls.insert(__NR_clone3);     // create a child process
        allowed_syscalls.insert(__NR_exit);       // terminate the calling process
        allowed_syscalls.insert(__NR_exit_group); // exit all threads in a process
        allowed_syscalls.insert(__NR_fork);       // create a child process
        allowed_syscalls.insert(__NR_tgkill);     // send a signal to a thread
        allowed_syscalls.insert(__NR_wait4);      // wait for process to change state, BSD style
    }

    void AllowScheduling()
    {
        allowed_syscalls.insert(__NR_sched_getaffinity);  // get a thread's CPU affinity mask
        allowed_syscalls.insert(__NR_sched_getparam);     // get scheduling parameters
        allowed_syscalls.insert(__NR_sched_getscheduler); // get scheduling policy/parameters
        allowed_syscalls.insert(__NR_sched_setscheduler); // set scheduling policy/parameters
        allowed_syscalls.insert(__NR_sched_yield);        // yield the processor
    }

    void AllowSignalHandling()
    {
        allowed_syscalls.insert(__NR_rt_sigaction);   // examine and change a signal action
        allowed_syscalls.insert(__NR_rt_sigprocmask); // examine and change blocked signals
        allowed_syscalls.insert(__NR_rt_sigreturn);   // return from signal handler and cleanup stack frame
        allowed_syscalls.insert(__NR_sigaltstack);    // set and/or get signal stack context
    }

    void AllowSleep()
    {
        allowed_syscalls.insert(__NR_clock_nanosleep); // high-resolution sleep with specifiable clock
        allowed_syscalls.insert(__NR_nanosleep);       // high-resolution sleep
    }

    void AllowUmask()
    {
        allowed_syscalls.insert(__NR_umask); // set file mode creation mask
    }

    // Build the seccomp BPF program for the syscalls collected so far.
    //
    // Layout of the returned program:
    //   1. load seccomp_data.arch; kill the process unless it is AUDIT_ARCH_X86_64
    //   2. load seccomp_data.nr (the syscall number)
    //   3. for every allowed syscall: compare and, on a match, return SECCOMP_RET_ALLOW
    //   4. a final return implementing default_action for everything else
    //
    // The builder itself is not modified, so this may be called repeatedly.
    //
    // See Linux kernel developer Kees Cook's seccomp guide at <https://outflux.net/teach-seccomp/>
    // for an accessible introduction to using seccomp.
    //
    // This function largely follows <https://outflux.net/teach-seccomp/step-3/seccomp-bpf.h>.
    std::vector<sock_filter> BuildFilter(SyscallSandboxAction default_action) const
    {
        std::vector<sock_filter> bpf_policy;
        // The final size is known up front: 3 instructions for the architecture check, 1 to load
        // the syscall number, 2 per allowed syscall and 1 for the default action.
        bpf_policy.reserve(3 + 1 + 2 * allowed_syscalls.size() + 1);

        // See VALIDATE_ARCHITECTURE in seccomp-bpf.h referenced above.
        bpf_policy.push_back(BPF_STMT(BPF_LD + BPF_W + BPF_ABS, offsetof(struct seccomp_data, arch)));
        // Portability note: AUDIT_ARCH_X86_64 is Linux x86_64 specific.
        // On a match, jump over the kill instruction that follows; otherwise fall into it.
        bpf_policy.push_back(BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, AUDIT_ARCH_X86_64, 1, 0));
        bpf_policy.push_back(BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_KILL_PROCESS));
        // See EXAMINE_SYSCALL in seccomp-bpf.h referenced above.
        bpf_policy.push_back(BPF_STMT(BPF_LD + BPF_W + BPF_ABS, offsetof(struct seccomp_data, nr)));
        for (const uint32_t allowed_syscall : allowed_syscalls) {
            // See ALLOW_SYSCALL in seccomp-bpf.h referenced above.
            // On a match, fall through to the ALLOW return; otherwise skip it and try the next entry.
            bpf_policy.push_back(BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, allowed_syscall, 0, 1));
            bpf_policy.push_back(BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_ALLOW));
        }
        switch (default_action) {
        case SyscallSandboxAction::KILL_PROCESS:
            // Disallow syscall and kill the process.
            //
            // See KILL_PROCESS in seccomp-bpf.h referenced above.
            //
            // Note that we're using SECCOMP_RET_KILL_PROCESS (kill the process) instead
            // of SECCOMP_RET_KILL_THREAD (kill the thread). The SECCOMP_RET_KILL_PROCESS
            // action was introduced in Linux 4.14.
            //
            // SECCOMP_RET_KILL_PROCESS: Results in the entire process exiting immediately without
            // executing the system call.
            //
            // SECCOMP_RET_KILL_PROCESS documentation:
            // <https://www.kernel.org/doc/html/latest/userspace-api/seccomp_filter.html>
            bpf_policy.push_back(BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_KILL_PROCESS));
            break;
        case SyscallSandboxAction::INVOKE_SIGNAL_HANDLER:
            // Disallow syscall and force a SIGSYS to trigger syscall debug reporter.
            //
            // SECCOMP_RET_TRAP: Results in the kernel sending a SIGSYS signal to the triggering
            // task without executing the system call.
            //
            // SECCOMP_RET_TRAP documentation:
            // <https://www.kernel.org/doc/html/latest/userspace-api/seccomp_filter.html>
            bpf_policy.push_back(BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_TRAP));
            break;
        }
        return bpf_policy;
    }
};
804 } // namespace
805 
806 bool SetupSyscallSandbox(bool log_syscall_violation_before_terminating)
807 {
808  assert(!g_syscall_sandbox_enabled && "SetupSyscallSandbox(...) should only be called once.");
809  g_syscall_sandbox_enabled = true;
810  g_syscall_sandbox_log_violation_before_terminating = log_syscall_violation_before_terminating;
811  if (log_syscall_violation_before_terminating) {
812  if (!SetupSyscallSandboxDebugHandler()) {
813  return false;
814  }
815  }
817  return true;
818 }
819 
// Issue a getgroups() syscall, which the sandbox policy is assumed NOT to allow.
// The result of the call is irrelevant and deliberately discarded.
void TestDisallowedSandboxCall()
{
    std::array<gid_t, 1> gid_buffer;
    const int32_t unused_result = getgroups(static_cast<int>(gid_buffer.size()), gid_buffer.data());
    static_cast<void>(unused_result);
}
826 #endif // defined(USE_SYSCALL_SANDBOX)
827 
829 {
830 #if defined(USE_SYSCALL_SANDBOX)
831  if (!g_syscall_sandbox_enabled) {
832  return;
833  }
834  SeccompPolicyBuilder seccomp_policy_builder;
835  switch (syscall_policy) {
836  case SyscallSandboxPolicy::INITIALIZATION: // Thread: main thread (state: init)
837  // SyscallSandboxPolicy::INITIALIZATION is the first policy loaded.
838  //
839  // Subsequently loaded policies can reduce the abilities further, but
840  // abilities can never be regained.
841  //
842  // SyscallSandboxPolicy::INITIALIZATION must thus be a superset of all
843  // other policies.
844  seccomp_policy_builder.AllowFileSystem();
845  seccomp_policy_builder.AllowNetwork();
846  break;
847  case SyscallSandboxPolicy::INITIALIZATION_DNS_SEED: // Thread: dnsseed
848  seccomp_policy_builder.AllowFileSystem();
849  seccomp_policy_builder.AllowNetwork();
850  break;
851  case SyscallSandboxPolicy::INITIALIZATION_LOAD_BLOCKS: // Thread: loadblk
852  seccomp_policy_builder.AllowFileSystem();
853  break;
854  case SyscallSandboxPolicy::INITIALIZATION_MAP_PORT: // Thread: mapport
855  seccomp_policy_builder.AllowFileSystem();
856  seccomp_policy_builder.AllowNetwork();
857  break;
858  case SyscallSandboxPolicy::MESSAGE_HANDLER: // Thread: msghand
859  seccomp_policy_builder.AllowFileSystem();
860  break;
861  case SyscallSandboxPolicy::NET: // Thread: net
862  seccomp_policy_builder.AllowFileSystem();
863  seccomp_policy_builder.AllowNetwork();
864  break;
865  case SyscallSandboxPolicy::NET_ADD_CONNECTION: // Thread: addcon
866  seccomp_policy_builder.AllowFileSystem();
867  seccomp_policy_builder.AllowNetwork();
868  break;
869  case SyscallSandboxPolicy::NET_HTTP_SERVER: // Thread: http
870  seccomp_policy_builder.AllowFileSystem();
871  seccomp_policy_builder.AllowNetwork();
872  break;
873  case SyscallSandboxPolicy::NET_HTTP_SERVER_WORKER: // Thread: httpworker.<N>
874  seccomp_policy_builder.AllowFileSystem();
875  seccomp_policy_builder.AllowNetwork();
876  break;
877  case SyscallSandboxPolicy::NET_OPEN_CONNECTION: // Thread: opencon
878  seccomp_policy_builder.AllowFileSystem();
879  seccomp_policy_builder.AllowNetwork();
880  break;
881  case SyscallSandboxPolicy::SCHEDULER: // Thread: scheduler
882  seccomp_policy_builder.AllowFileSystem();
883  break;
884  case SyscallSandboxPolicy::TOR_CONTROL: // Thread: torcontrol
885  seccomp_policy_builder.AllowFileSystem();
886  seccomp_policy_builder.AllowNetwork();
887  break;
888  case SyscallSandboxPolicy::TX_INDEX: // Thread: txindex
889  seccomp_policy_builder.AllowFileSystem();
890  break;
891  case SyscallSandboxPolicy::VALIDATION_SCRIPT_CHECK: // Thread: scriptch.<N>
892  break;
893  case SyscallSandboxPolicy::SHUTOFF: // Thread: main thread (state: shutoff)
894  seccomp_policy_builder.AllowFileSystem();
895  break;
896  }
897 
898  const SyscallSandboxAction default_action = g_syscall_sandbox_log_violation_before_terminating ? SyscallSandboxAction::INVOKE_SIGNAL_HANDLER : SyscallSandboxAction::KILL_PROCESS;
899  std::vector<sock_filter> filter = seccomp_policy_builder.BuildFilter(default_action);
900  const sock_fprog prog = {
901  .len = static_cast<uint16_t>(filter.size()),
902  .filter = filter.data(),
903  };
904  // Do not allow abilities to be regained after being dropped.
905  //
906  // PR_SET_NO_NEW_PRIVS documentation: <https://www.kernel.org/doc/html/latest/userspace-api/no_new_privs.html>
907  if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) != 0) {
908  throw std::runtime_error("Syscall sandbox enforcement failed: prctl(PR_SET_NO_NEW_PRIVS)");
909  }
910  // Install seccomp-bpf syscall filter.
911  //
912  // PR_SET_SECCOMP documentation: <https://www.kernel.org/doc/html/latest/userspace-api/seccomp_filter.html>
913  if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog) != 0) {
914  throw std::runtime_error("Syscall sandbox enforcement failed: prctl(PR_SET_SECCOMP)");
915  }
916 
917  const std::string thread_name = !util::ThreadGetInternalName().empty() ? util::ThreadGetInternalName() : "*unnamed*";
918  LogPrint(BCLog::UTIL, "Syscall filter installed for thread \"%s\"\n", thread_name);
919 #endif // defined(USE_SYSCALL_SANDBOX)
920 }
SyscallSandboxPolicy::INITIALIZATION_DNS_SEED
@ INITIALIZATION_DNS_SEED
assert
assert(!tx.IsCoinBase())
tinyformat::format
void format(std::ostream &out, const char *fmt, const Args &... args)
Format list of arguments to the stream according to given format string.
Definition: tinyformat.h:1062
SyscallSandboxPolicy::NET_OPEN_CONNECTION
@ NET_OPEN_CONNECTION
SyscallSandboxPolicy::TOR_CONTROL
@ TOR_CONTROL
SyscallSandboxPolicy::NET_ADD_CONNECTION
@ NET_ADD_CONNECTION
bitcoin-config.h
SyscallSandboxPolicy::VALIDATION_SCRIPT_CHECK
@ VALIDATION_SCRIPT_CHECK
SyscallSandboxPolicy::NET_HTTP_SERVER
@ NET_HTTP_SERVER
SyscallSandboxPolicy::INITIALIZATION
@ INITIALIZATION
tinyformat.h
SyscallSandboxPolicy::INITIALIZATION_MAP_PORT
@ INITIALIZATION_MAP_PORT
syscall_sandbox.h
util::ThreadGetInternalName
const std::string & ThreadGetInternalName()
Get the thread's internal (in-memory) name; used e.g.
Definition: threadnames.cpp:53
LogPrintf
#define LogPrintf(...)
Definition: logging.h:187
SetSyscallSandboxPolicy
void SetSyscallSandboxPolicy(SyscallSandboxPolicy syscall_policy)
Force the current thread (and threads created from the current thread) into a restricted-service oper...
Definition: syscall_sandbox.cpp:828
BCLog::UTIL
@ UTIL
Definition: logging.h:63
SyscallSandboxPolicy::SHUTOFF
@ SHUTOFF
LogPrint
#define LogPrint(category,...)
Definition: logging.h:191
SyscallSandboxPolicy
SyscallSandboxPolicy
Definition: syscall_sandbox.h:8
strprintf
#define strprintf
Format arguments and return the string or write to given std::ostream (see tinyformat::format doc for...
Definition: tinyformat.h:1164
SyscallSandboxPolicy::MESSAGE_HANDLER
@ MESSAGE_HANDLER
SyscallSandboxPolicy::NET_HTTP_SERVER_WORKER
@ NET_HTTP_SERVER_WORKER
SyscallSandboxPolicy::INITIALIZATION_LOAD_BLOCKS
@ INITIALIZATION_LOAD_BLOCKS
SyscallSandboxPolicy::NET
@ NET
SyscallSandboxPolicy::SCHEDULER
@ SCHEDULER
SyscallSandboxPolicy::TX_INDEX
@ TX_INDEX
threadnames.h