diff --git a/TODO b/TODO
index 48f320093e7..05e51a2896f 100644
--- a/TODO
+++ b/TODO
@@ -24,6 +24,8 @@ Janitorial Clean-ups:
Features:
+* set SystemCallArchitectures=native on all our services
+
* maybe add call sd_journal_set_block_timeout() or so to set SO_SNDTIMEO for
the sd-journal logging socket, and, if the timeout is set to 0, sets
O_NONBLOCK on it. That way people can control if and when to block for
diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml
index fd47b0a20a0..e7e5d6b0c74 100644
--- a/man/systemd.exec.xml
+++ b/man/systemd.exec.xml
@@ -1554,11 +1554,10 @@
setns2 system calls, taking
the specified flags parameters into account. Note that — if this option is used — in addition to restricting
creation and switching of the specified types of namespaces (or all of them, if true) access to the
- setns() system call with a zero flags parameter is prohibited.
- If running in user mode, or in system mode, but without the CAP_SYS_ADMIN
- capability (e.g. setting User=), NoNewPrivileges=yes
- is implied.
-
+ setns() system call with a zero flags parameter is prohibited. This setting is only
+ supported on x86, x86-64, s390 and s390x, and enforces no restrictions on other architectures. If running in user
+ mode, or in system mode, but without the CAP_SYS_ADMIN capability (e.g. setting
+ User=), NoNewPrivileges=yes is implied.
diff --git a/src/basic/raw-clone.h b/src/basic/raw-clone.h
index d4738289999..c6e531ada4d 100644
--- a/src/basic/raw-clone.h
+++ b/src/basic/raw-clone.h
@@ -47,8 +47,8 @@
static inline int raw_clone(unsigned long flags) {
assert((flags & (CLONE_VM|CLONE_PARENT_SETTID|CLONE_CHILD_SETTID|
CLONE_CHILD_CLEARTID|CLONE_SETTLS)) == 0);
-#if defined(__s390__) || defined(__CRIS__)
- /* On s390 and cris the order of the first and second arguments
+#if defined(__s390x__) || defined(__s390__) || defined(__CRIS__)
+ /* On s390/s390x and cris the order of the first and second arguments
* of the raw clone() system call is reversed. */
return (int) syscall(__NR_clone, NULL, flags);
#elif defined(__sparc__) && defined(__arch64__)
diff --git a/src/shared/seccomp-util.c b/src/shared/seccomp-util.c
index 44706669b4f..e35f18471ca 100644
--- a/src/shared/seccomp-util.c
+++ b/src/shared/seccomp-util.c
@@ -750,10 +750,35 @@ int seccomp_restrict_namespaces(unsigned long retain) {
SECCOMP_FOREACH_LOCAL_ARCH(arch) {
_cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
+ int clone_reversed_order = -1;
unsigned i;
log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
+ switch (arch) {
+
+ case SCMP_ARCH_X86_64:
+ case SCMP_ARCH_X86:
+ case SCMP_ARCH_X32:
+ clone_reversed_order = 0;
+ break;
+
+ case SCMP_ARCH_S390:
+ case SCMP_ARCH_S390X:
+ /* On s390/s390x the first two parameters to clone are switched */
+ clone_reversed_order = 1;
+ break;
+
+ /* Please add more definitions here, if you port systemd to other architectures! */
+
+#if !defined(__i386__) && !defined(__x86_64__) && !defined(__s390__) && !defined(__s390x__)
+#warning "Consider adding the right clone() syscall definitions here!"
+#endif
+ }
+
+ if (clone_reversed_order < 0) /* we don't know the right order, let's ignore this arch... */
+ continue;
+
r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
if (r < 0)
return r;
@@ -802,12 +827,20 @@ int seccomp_restrict_namespaces(unsigned long retain) {
break;
}
- r = seccomp_rule_add_exact(
- seccomp,
- SCMP_ACT_ERRNO(EPERM),
- SCMP_SYS(clone),
- 1,
- SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
+ if (clone_reversed_order == 0)
+ r = seccomp_rule_add_exact(
+ seccomp,
+ SCMP_ACT_ERRNO(EPERM),
+ SCMP_SYS(clone),
+ 1,
+ SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
+ else
+ r = seccomp_rule_add_exact(
+ seccomp,
+ SCMP_ACT_ERRNO(EPERM),
+ SCMP_SYS(clone),
+ 1,
+ SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
if (r < 0) {
log_debug_errno(r, "Failed to add clone() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
break;
diff --git a/src/shared/seccomp-util.h b/src/shared/seccomp-util.h
index bfbfb5ab3d7..61f94de638d 100644
--- a/src/shared/seccomp-util.h
+++ b/src/shared/seccomp-util.h
@@ -91,6 +91,13 @@ int seccomp_memory_deny_write_execute(void);
#define SECCOMP_MEMORY_DENY_WRITE_EXECUTE_BROKEN 1
#endif
+/* we don't know the right order of the clone() parameters except for these archs, for now */
+#if defined(__x86_64__) || defined(__i386__) || defined(__s390x__) || defined(__s390__)
+#define SECCOMP_RESTRICT_NAMESPACES_BROKEN 0
+#else
+#define SECCOMP_RESTRICT_NAMESPACES_BROKEN 1
+#endif
+
extern const uint32_t seccomp_local_archs[];
#define SECCOMP_FOREACH_LOCAL_ARCH(arch) \
diff --git a/src/test/test-seccomp.c b/src/test/test-seccomp.c
index 36592388105..34a1275162d 100644
--- a/src/test/test-seccomp.c
+++ b/src/test/test-seccomp.c
@@ -158,6 +158,8 @@ static void test_restrict_namespace(void) {
assert_se(streq(s, "cgroup ipc net mnt pid user uts"));
assert_se(namespace_flag_from_string_many(s, &ul) == 0 && ul == NAMESPACE_FLAGS_ALL);
+#if SECCOMP_RESTRICT_NAMESPACES_BROKEN == 0
+
if (!is_seccomp_available())
return;
if (geteuid() != 0)
@@ -216,6 +218,7 @@ static void test_restrict_namespace(void) {
}
assert_se(wait_for_terminate_and_warn("nsseccomp", pid, true) == EXIT_SUCCESS);
+#endif
}
static void test_protect_sysctl(void) {