diff --git a/src/apparmor.rs b/src/apparmor.rs index b548424..bd82263 100644 --- a/src/apparmor.rs +++ b/src/apparmor.rs @@ -4,7 +4,7 @@ use std::ffi::{OsStr, OsString}; use std::io::{self, Write}; use std::os::unix::ffi::{OsStrExt, OsStringExt}; -use crate::pidfd::PidFd; +use crate::process::PidFd; pub fn get_label(pidfd: &PidFd) -> io::Result> { let mut out = match pidfd.read_file(c_str!("attr/current")) { diff --git a/src/capability.rs b/src/capability.rs index 6b0c416..f76ba31 100644 --- a/src/capability.rs +++ b/src/capability.rs @@ -1,5 +1,5 @@ use std::io; -use std::os::raw::c_ulong; +use std::os::raw::{c_int, c_ulong}; bitflags::bitflags! { pub struct SecureBits: c_ulong { @@ -29,3 +29,60 @@ impl SecureBits { .ok_or_else(|| io_format_err!("prctl() returned unknown securebits")) } } + +#[derive(Clone, Default)] +pub struct Capabilities { + pub inheritable: u64, + pub permitted: u64, + pub effective: u64, + //bounding: u64, // we don't care currently +} + +// Too lazy to bindgen libcap stuff... +const CAPABILITY_VERSION_3: u32 = 0x2008_0522; + +/// Represents process capabilities. +/// +/// This can be used to change the process' capability sets (if permitted by the kernel). +impl Capabilities { + // We currently don't implement capget as it takes a pid which is racy on kernels without pidfd + // support. Later on we might support a `capget(&PidFd)` method? + + /// Change our process capabilities. This does not include the bounding set. 
+ pub fn capset(&self) -> io::Result<()> { + #![allow(dead_code)] + #[repr(C)] // kernel abi: must match the C layout of `struct __user_cap_header_struct` + struct Header { + version: u32, + pid: c_int, + } + #[repr(C)] // kernel abi: must match the C layout of `struct __user_cap_data_struct` + struct Data { + effective: u32, + permitted: u32, + inheritable: u32, + } + + let header = Header { + version: CAPABILITY_VERSION_3, + pid: 0, // equivalent to gettid(), + }; + + let data = [ + Data { + effective: self.effective as u32, + permitted: self.permitted as u32, + inheritable: self.inheritable as u32, + }, + Data { + effective: (self.effective >> 32) as u32, + permitted: (self.permitted >> 32) as u32, + inheritable: (self.inheritable >> 32) as u32, + }, + ]; + + c_try!(unsafe { libc::syscall(libc::SYS_capset, &header, &data) }); + + Ok(()) + } +} diff --git a/src/lxcseccomp.rs b/src/lxcseccomp.rs index 32b0998..271a427 100644 --- a/src/lxcseccomp.rs +++ b/src/lxcseccomp.rs @@ -13,7 +13,7 @@ use lazy_static::lazy_static; use libc::pid_t; use nix::errno::Errno; -use crate::pidfd::PidFd; +use crate::process::PidFd; use crate::seccomp::{SeccompNotif, SeccompNotifResp, SeccompNotifSizes}; use crate::tools::{Fd, FromFd, IoVec, IoVecMut}; diff --git a/src/main.rs b/src/main.rs index 5e26a74..fce36c0 100644 --- a/src/main.rs +++ b/src/main.rs @@ -13,7 +13,7 @@ pub mod client; pub mod fork; pub mod lxcseccomp; pub mod nsfd; -pub mod pidfd; +pub mod process; pub mod seccomp; pub mod sys_mknod; pub mod sys_quotactl; diff --git a/src/process/cgroups.rs b/src/process/cgroups.rs new file mode 100644 index 0000000..7d12067 --- /dev/null +++ b/src/process/cgroups.rs @@ -0,0 +1,24 @@ +use std::collections::HashMap; +use std::ffi::{OsStr, OsString}; + +pub struct CGroups { + pub v1: HashMap, + pub v2: Option, +} + +impl CGroups { + pub fn new() -> Self { + Self { + v1: HashMap::new(), + v2: None, + } + } + + pub fn get(&self, name: &str) -> Option<&OsStr> { + self.v1.get(name).map(|s| s.as_os_str()) + } + + pub fn v2(&self) -> Option<&OsStr> { + self.v2.as_ref().map(|s| s.as_os_str()) + } +} diff --git a/src/pidfd.rs b/src/process/fd.rs 
similarity index 52% rename from src/pidfd.rs rename to src/process/fd.rs index 72a5325..d6780de 100644 --- a/src/pidfd.rs +++ b/src/process/fd.rs @@ -1,78 +1,23 @@ //! pidfd helper functionality -use std::collections::HashMap; -use std::ffi::{CStr, CString, OsStr, OsString}; +use std::ffi::{CStr, CString, OsString}; use std::io::{self, BufRead, BufReader}; use std::os::raw::c_int; -use std::os::unix::ffi::{OsStrExt, OsStringExt}; +use std::os::unix::ffi::OsStringExt; use std::os::unix::io::{AsRawFd, FromRawFd, IntoRawFd, RawFd}; use failure::{bail, Error}; use libc::pid_t; +use crate::capability::Capabilities; use crate::nsfd::{ns_type, NsFd}; use crate::tools::Fd; +use super::{CGroups, IdMap, IdMapEntry, ProcStatus, Uids, UserCaps}; + pub struct PidFd(RawFd, pid_t); file_descriptor_impl!(PidFd); -#[derive(Default)] -pub struct Uids { - pub ruid: libc::uid_t, - pub euid: libc::uid_t, - pub suid: libc::uid_t, - pub fsuid: libc::uid_t, - pub rgid: libc::gid_t, - pub egid: libc::gid_t, - pub sgid: libc::gid_t, - pub fsgid: libc::gid_t, -} - -#[derive(Clone, Default)] -pub struct Capabilities { - inheritable: u64, - permitted: u64, - effective: u64, - //bounding: u64, // we don't care currently -} - -#[derive(Default)] -pub struct ProcStatus { - uids: Uids, - capabilities: Capabilities, - umask: libc::mode_t, -} - -pub struct IdMapEntry { - ns: u64, - host: u64, - range: u64, -} - -pub struct IdMap(Vec); - -impl IdMap { - pub fn map_into(&self, id: u64) -> Option { - for entry in self.0.iter() { - if entry.host <= id && entry.host + entry.range > id { - return Some(entry.ns + id - entry.host); - } - } - - None - } - - pub fn map_from(&self, id: u64) -> Option { - for entry in self.0.iter() { - if entry.ns <= id && entry.ns + entry.range > id { - return Some(id + entry.host); - } - } - - None - } -} - impl PidFd { pub fn current() -> io::Result { let fd = c_try!(unsafe { @@ -299,7 +244,7 @@ impl PidFd { entries.push(IdMapEntry { ns, host, range }); } - 
Ok(IdMap(entries)) + Ok(IdMap::new(entries)) } pub fn get_uid_map(&self) -> Result { @@ -323,198 +268,3 @@ impl PidFd { UserCaps::new(self) } } - -pub struct CGroups { - v1: HashMap, - v2: Option, -} - -impl CGroups { - fn new() -> Self { - Self { - v1: HashMap::new(), - v2: None, - } - } - - pub fn get(&self, name: &str) -> Option<&OsStr> { - self.v1.get(name).map(|s| s.as_os_str()) - } - - pub fn v2(&self) -> Option<&OsStr> { - self.v2.as_ref().map(|s| s.as_os_str()) - } -} - -// Too lazy to bindgen libcap stuff... -const CAPABILITY_VERSION_3: u32 = 0x2008_0522; - -/// Represents process capabilities. -/// -/// This can be used to change the process' capability sets (if permitted by the kernel). -impl Capabilities { - // We currently don't implement capget as it takes a pid which is racy on kernels without pidfd - // support. Later on we might support a `capget(&PidFd)` method? - - /// Change our process capabilities. This does not include the bounding set. - pub fn capset(&self) -> io::Result<()> { - #![allow(dead_code)] - // kernel abi: - struct Header { - version: u32, - pid: c_int, - } - - struct Data { - effective: u32, - permitted: u32, - inheritable: u32, - } - - let header = Header { - version: CAPABILITY_VERSION_3, - pid: 0, // equivalent to gettid(), - }; - - let data = [ - Data { - effective: self.effective as u32, - permitted: self.permitted as u32, - inheritable: self.inheritable as u32, - }, - Data { - effective: (self.effective >> 32) as u32, - permitted: (self.permitted >> 32) as u32, - inheritable: (self.inheritable >> 32) as u32, - }, - ]; - - c_try!(unsafe { libc::syscall(libc::SYS_capset, &header, &data) }); - - Ok(()) - } -} - -/// Helper to enter a process' permission-check environment. -/// -/// When we execute a syscall on behalf of another process, we should try to trigger as many -/// permission checks as we can. 
It is impractical to implement them all manually, so the best -/// thing to do is cause as many of them to happen on the kernel-side as we can. -/// -/// We start by entering the process' devices and v2 cgroup. As calls like `mknod()` may be -/// affected, and access to devices as well. -/// -/// Then we must enter the mount namespace, chroot and current working directory, in order to get -/// the correct view of paths. -/// -/// Next we copy the caller's `umask`. -/// -/// Then switch over our effective and file system uid and gid. This has 2 reasons: First, it means -/// we do not need to run `chown()` on files we create, secondly, the user may have dropped -/// `CAP_DAC_OVERRIDE` / `CAP_DAC_READ_SEARCH` which may have prevented the creation of the file in -/// the first place (for example, the container program may be a non-root executable with -/// `cap_mknod=ep` as file-capabilities, in which case we do not want a user to be allowed to run -/// `mknod()` on a path owned by different user (and checking file system permissions would -/// require us to handle ACLs, quotas, which are all file system tyep dependent as well, so better -/// leave all that up to the kernel, too!)). -/// -/// Next we clone the process' capability set. This is because the process may have dropped -/// capabilties which under normal conditions would prevent them from executing the syscall. For -/// example a process may be executing `mknod()` after having dropped `CAP_MKNOD`. 
-#[derive(Clone)] -#[must_use = "not using UserCaps may be a security issue"] -pub struct UserCaps<'a> { - pidfd: &'a PidFd, - apply_uids: bool, - euid: libc::uid_t, - egid: libc::gid_t, - fsuid: libc::uid_t, - fsgid: libc::gid_t, - capabilities: Capabilities, - umask: libc::mode_t, - cgroup_v1_devices: Option, - cgroup_v2: Option, - apparmor_profile: Option, -} - -impl UserCaps<'_> { - pub fn new(pidfd: &PidFd) -> Result { - let status = pidfd.get_status()?; - let cgroups = pidfd.get_cgroups()?; - let apparmor_profile = crate::apparmor::get_label(pidfd)?; - - Ok(UserCaps { - pidfd, - apply_uids: true, - euid: status.uids.euid, - egid: status.uids.egid, - fsuid: status.uids.fsuid, - fsgid: status.uids.fsgid, - capabilities: status.capabilities, - umask: status.umask, - cgroup_v1_devices: cgroups.get("devices").map(|s| s.to_owned()), - cgroup_v2: cgroups.v2().map(|s| s.to_owned()), - apparmor_profile, - }) - } - - fn apply_cgroups(&self) -> io::Result<()> { - fn enter_cgroup(kind: &str, name: &OsStr) -> io::Result<()> { - let mut path = OsString::with_capacity(15 + kind.len() + name.len() + 13 + 1); - path.push(OsStr::from_bytes(b"/sys/fs/cgroup/")); - path.push(kind); - path.push(name); - path.push(OsStr::from_bytes(b"/cgroup.procs")); - std::fs::write(path, b"0") - } - - if let Some(ref cg) = self.cgroup_v1_devices { - enter_cgroup("devices/", cg)?; - } - - if let Some(ref cg) = self.cgroup_v2 { - enter_cgroup("unified/", cg)?; - } - - Ok(()) - } - - fn apply_user_caps(&self) -> io::Result<()> { - use crate::capability::SecureBits; - if self.apply_uids { - unsafe { - libc::umask(self.umask); - } - let mut secbits = SecureBits::get_current()?; - secbits |= SecureBits::KEEP_CAPS | SecureBits::NO_SETUID_FIXUP; - secbits.apply()?; - c_try!(unsafe { libc::setegid(self.egid) }); - c_try!(unsafe { libc::setfsgid(self.fsgid) }); - c_try!(unsafe { libc::seteuid(self.euid) }); - c_try!(unsafe { libc::setfsuid(self.fsuid) }); - } - self.capabilities.capset()?; - Ok(()) - } - 
- pub fn disable_uid_change(&mut self) { - self.apply_uids = false; - } - - pub fn disable_cgroup_change(&mut self) { - self.cgroup_v1_devices = None; - self.cgroup_v2 = None; - } - - pub fn apply(self, own_pidfd: &PidFd) -> io::Result<()> { - self.apply_cgroups()?; - self.pidfd.mount_namespace()?.setns()?; - self.pidfd.enter_chroot()?; - self.pidfd.enter_cwd()?; - if let Some(ref label) = self.apparmor_profile { - crate::apparmor::set_label(own_pidfd, label)?; - } - self.apply_user_caps()?; - Ok(()) - } -} diff --git a/src/process/id_map.rs b/src/process/id_map.rs new file mode 100644 index 0000000..d61d489 --- /dev/null +++ b/src/process/id_map.rs @@ -0,0 +1,35 @@ +//! uid/gid mapping helpers + +pub struct IdMap(Vec); + +pub struct IdMapEntry { + pub ns: u64, + pub host: u64, + pub range: u64, +} + +impl IdMap { + pub fn new(entries: Vec) -> Self { + Self(entries) + } + + pub fn map_into(&self, id: u64) -> Option { + for entry in self.0.iter() { + if entry.host <= id && entry.host + entry.range > id { + return Some(entry.ns + id - entry.host); + } + } + + None + } + + pub fn map_from(&self, id: u64) -> Option { + for entry in self.0.iter() { + if entry.ns <= id && entry.ns + entry.range > id { + return Some(entry.host + (id - entry.ns)); // rebase the offset inside the ns range onto the host start (mirror of map_into) + } + } + + None + } +} diff --git a/src/process/mod.rs b/src/process/mod.rs new file mode 100644 index 0000000..678e7b4 --- /dev/null +++ b/src/process/mod.rs @@ -0,0 +1,37 @@ +use crate::capability::Capabilities; + +pub mod cgroups; +pub mod fd; +pub mod id_map; +pub mod user_caps; + +#[doc(inline)] +pub use cgroups::CGroups; + +#[doc(inline)] +pub use fd::PidFd; + +#[doc(inline)] +pub use id_map::{IdMap, IdMapEntry}; + +#[doc(inline)] +pub use user_caps::UserCaps; + +#[derive(Default)] +pub struct Uids { + pub ruid: libc::uid_t, + pub euid: libc::uid_t, + pub suid: libc::uid_t, + pub fsuid: libc::uid_t, + pub rgid: libc::gid_t, + pub egid: libc::gid_t, + pub sgid: libc::gid_t, + pub fsgid: libc::gid_t, +} + +#[derive(Default)] +pub struct 
ProcStatus { + uids: Uids, + capabilities: Capabilities, + umask: libc::mode_t, +} diff --git a/src/process/user_caps.rs b/src/process/user_caps.rs new file mode 100644 index 0000000..91e9276 --- /dev/null +++ b/src/process/user_caps.rs @@ -0,0 +1,134 @@ +//! User and capability management. + +use std::ffi::{OsStr, OsString}; +use std::io; +use std::os::unix::ffi::OsStrExt; + +use failure::Error; + +use super::PidFd; +use crate::capability::Capabilities; + +/// Helper to enter a process' permission-check environment. +/// +/// When we execute a syscall on behalf of another process, we should try to trigger as many +/// permission checks as we can. It is impractical to implement them all manually, so the best +/// thing to do is cause as many of them to happen on the kernel-side as we can. +/// +/// We start by entering the process' devices and v2 cgroup. As calls like `mknod()` may be +/// affected, and access to devices as well. +/// +/// Then we must enter the mount namespace, chroot and current working directory, in order to get +/// the correct view of paths. +/// +/// Next we copy the caller's `umask`. +/// +/// Then switch over our effective and file system uid and gid. This has 2 reasons: First, it means +/// we do not need to run `chown()` on files we create, secondly, the user may have dropped +/// `CAP_DAC_OVERRIDE` / `CAP_DAC_READ_SEARCH` which may have prevented the creation of the file in +/// the first place (for example, the container program may be a non-root executable with +/// `cap_mknod=ep` as file-capabilities, in which case we do not want a user to be allowed to run +/// `mknod()` on a path owned by a different user (and checking file system permissions would +/// require us to handle ACLs, quotas, which are all file system type dependent as well, so better +/// leave all that up to the kernel, too!)). +/// +/// Next we clone the process' capability set. 
This is because the process may have dropped +/// capabilities which under normal conditions would prevent them from executing the syscall. For +/// example a process may be executing `mknod()` after having dropped `CAP_MKNOD`. +#[derive(Clone)] +#[must_use = "not using UserCaps may be a security issue"] +pub struct UserCaps<'a> { + pidfd: &'a PidFd, + apply_uids: bool, + euid: libc::uid_t, + egid: libc::gid_t, + fsuid: libc::uid_t, + fsgid: libc::gid_t, + capabilities: Capabilities, + umask: libc::mode_t, + cgroup_v1_devices: Option, + cgroup_v2: Option, + apparmor_profile: Option, +} + +impl UserCaps<'_> { + pub fn new(pidfd: &PidFd) -> Result { + let status = pidfd.get_status()?; + let cgroups = pidfd.get_cgroups()?; + let apparmor_profile = crate::apparmor::get_label(pidfd)?; + + Ok(UserCaps { + pidfd, + apply_uids: true, + euid: status.uids.euid, + egid: status.uids.egid, + fsuid: status.uids.fsuid, + fsgid: status.uids.fsgid, + capabilities: status.capabilities, + umask: status.umask, + cgroup_v1_devices: cgroups.get("devices").map(|s| s.to_owned()), + cgroup_v2: cgroups.v2().map(|s| s.to_owned()), + apparmor_profile, + }) + } + + fn apply_cgroups(&self) -> io::Result<()> { + fn enter_cgroup(kind: &str, name: &OsStr) -> io::Result<()> { + let mut path = OsString::with_capacity(15 + kind.len() + name.len() + 13 + 1); + path.push(OsStr::from_bytes(b"/sys/fs/cgroup/")); + path.push(kind); + path.push(name); + path.push(OsStr::from_bytes(b"/cgroup.procs")); + std::fs::write(path, b"0") + } + + if let Some(ref cg) = self.cgroup_v1_devices { + enter_cgroup("devices/", cg)?; + } + + if let Some(ref cg) = self.cgroup_v2 { + enter_cgroup("unified/", cg)?; + } + + Ok(()) + } + + fn apply_user_caps(&self) -> io::Result<()> { + use crate::capability::SecureBits; + if self.apply_uids { + unsafe { + libc::umask(self.umask); + } + let mut secbits = SecureBits::get_current()?; + secbits |= SecureBits::KEEP_CAPS | SecureBits::NO_SETUID_FIXUP; + secbits.apply()?; + 
c_try!(unsafe { libc::setegid(self.egid) }); + c_try!(unsafe { libc::setfsgid(self.fsgid) }); + c_try!(unsafe { libc::seteuid(self.euid) }); + c_try!(unsafe { libc::setfsuid(self.fsuid) }); + } + self.capabilities.capset()?; + Ok(()) + } + + pub fn disable_uid_change(&mut self) { + self.apply_uids = false; + } + + pub fn disable_cgroup_change(&mut self) { + self.cgroup_v1_devices = None; + self.cgroup_v2 = None; + } + + pub fn apply(self, own_pidfd: &PidFd) -> io::Result<()> { + self.apply_cgroups()?; + self.pidfd.mount_namespace()?.setns()?; + self.pidfd.enter_chroot()?; + self.pidfd.enter_cwd()?; + if let Some(ref label) = self.apparmor_profile { + crate::apparmor::set_label(own_pidfd, label)?; + } + self.apply_user_caps()?; + Ok(()) + } +} diff --git a/src/sys_mknod.rs b/src/sys_mknod.rs index 0dc90bf..35b3a11 100644 --- a/src/sys_mknod.rs +++ b/src/sys_mknod.rs @@ -7,7 +7,7 @@ use nix::sys::stat; use crate::fork::forking_syscall; use crate::lxcseccomp::ProxyMessageBuffer; -use crate::pidfd::PidFd; +use crate::process::PidFd; use crate::sc_libc_try; use crate::syscall::SyscallStatus; use crate::tools::Fd; diff --git a/src/sys_quotactl.rs b/src/sys_quotactl.rs index 0dd64b6..c94edd3 100644 --- a/src/sys_quotactl.rs +++ b/src/sys_quotactl.rs @@ -8,7 +8,7 @@ use nix::errno::Errno; use crate::fork::forking_syscall; use crate::lxcseccomp::ProxyMessageBuffer; -use crate::pidfd::{IdMap, PidFd}; +use crate::process::{IdMap, PidFd}; use crate::sc_libc_try; use crate::syscall::SyscallStatus;