diff --git a/src/lxcseccomp.rs b/src/lxcseccomp.rs index f1c0b73..9d7b346 100644 --- a/src/lxcseccomp.rs +++ b/src/lxcseccomp.rs @@ -4,10 +4,10 @@ use std::convert::TryFrom; use std::ffi::CString; use std::os::raw::c_int; use std::os::unix::fs::FileExt; -use std::os::unix::io::{FromRawFd, IntoRawFd, RawFd}; +use std::os::unix::io::RawFd; use std::{io, mem}; -use failure::{bail, Error}; +use failure::{bail, format_err, Error}; use lazy_static::lazy_static; use libc::pid_t; use nix::errno::Errno; @@ -15,7 +15,7 @@ use nix::errno::Errno; use crate::pidfd::PidFd; use crate::seccomp::{SeccompNotif, SeccompNotifResp, SeccompNotifSizes}; use crate::socket::AsyncSeqPacketSocket; -use crate::tools::{Fd, IoVec, IoVecMut}; +use crate::tools::{Fd, FromFd, IoVec, IoVecMut}; /// Seccomp notification proxy message sent by the lxc monitor. /// @@ -134,16 +134,18 @@ impl ProxyMessageBuffer { self.set_len(size)?; let mut fds = fds.into_iter(); - self.pid_fd = fds + let pid_fd = unsafe { + PidFd::try_from_fd( + fds.next() + .ok_or_else(|| format_err!("lxc seccomp message without pidfd"))?, + )? + }; + let mem_fd = fds .next() - .map(|fd| unsafe { PidFd::from_raw_fd(fd.into_raw_fd()) }); - self.mem_fd = fds - .next() - .map(|fd| unsafe { std::fs::File::from_raw_fd(fd.into_raw_fd()) }); - if self.mem_fd.is_none() { - self.drop_fds(); - bail!("missing file descriptors with proxied seccomp message"); - } + .ok_or_else(|| format_err!("lxc seccomp message without memfd"))?; + + self.pid_fd = Some(pid_fd); + self.mem_fd = Some(std::fs::File::from_fd(mem_fd)); Ok(true) } diff --git a/src/pidfd.rs b/src/pidfd.rs index fa301f6..1fe121f 100644 --- a/src/pidfd.rs +++ b/src/pidfd.rs @@ -1,24 +1,64 @@ //! 
pidfd helper functionality -use std::ffi::{CStr, CString}; -use std::io; +use std::collections::HashMap; +use std::ffi::{CStr, CString, OsStr, OsString}; +use std::io::{self, BufRead, BufReader}; use std::os::raw::c_int; +use std::os::unix::ffi::{OsStrExt, OsStringExt}; use std::os::unix::io::{AsRawFd, FromRawFd, IntoRawFd, RawFd}; +use failure::{bail, Error}; +use libc::pid_t; + +use crate::libc_try; use crate::nsfd::{ns_type, NsFd}; use crate::tools::Fd; -use crate::{file_descriptor_type, libc_try}; -file_descriptor_type!(PidFd); +pub struct PidFd(RawFd, pid_t); +crate::file_descriptor_impl!(PidFd); + +#[derive(Default)] +pub struct Uids { + pub ruid: libc::uid_t, + pub euid: libc::uid_t, + pub suid: libc::uid_t, + pub fsuid: libc::uid_t, + pub rgid: libc::gid_t, + pub egid: libc::gid_t, + pub sgid: libc::gid_t, + pub fsgid: libc::gid_t, +} + +#[derive(Clone, Default)] +pub struct Capabilities { + inheritable: u64, + permitted: u64, + effective: u64, + //bounding: u64, // we don't care currently +} + +#[derive(Default)] +pub struct ProcStatus { + uids: Uids, + capabilities: Capabilities, + umask: libc::mode_t, +} impl PidFd { - pub fn open(pid: libc::pid_t) -> io::Result { + pub fn open(pid: pid_t) -> io::Result { let path = CString::new(format!("/proc/{}", pid)).unwrap(); let fd = libc_try!(unsafe { libc::open(path.as_ptr(), libc::O_DIRECTORY | libc::O_CLOEXEC) }); - Ok(Self(fd)) + Ok(Self(fd, pid)) + } + + pub unsafe fn try_from_fd(fd: Fd) -> io::Result { + let mut this = Self(fd.into_raw_fd(), -1 as pid_t); + let pid = this.read_pid()?; + this.1 = pid; + Ok(this) } pub fn mount_namespace(&self) -> io::Result> { @@ -51,74 +91,334 @@ impl PidFd { } pub fn fd_cwd(&self) -> io::Result { - self.fd(unsafe { CStr::from_bytes_with_nul_unchecked(b"cwd\0") }, libc::O_DIRECTORY, 0) + self.fd( + unsafe { CStr::from_bytes_with_nul_unchecked(b"cwd\0") }, + libc::O_DIRECTORY, + 0, + ) } pub fn fd_num(&self, num: RawFd, flags: c_int) -> io::Result { let path = 
format!("fd/{}\0", num); - self.fd(unsafe { CStr::from_bytes_with_nul_unchecked(path.as_bytes()) }, flags, 0) + self.fd( + unsafe { CStr::from_bytes_with_nul_unchecked(path.as_bytes()) }, + flags, + 0, + ) } - //pub fn dup(&self) -> io::Result { - // Ok(Self(libc_try!(unsafe { - // libc::fcntl(self.as_raw_fd(), libc::F_DUPFD_CLOEXEC, 0) - // }))) - //} - + pub fn chroot(&self) -> io::Result<()> { + libc_try!(unsafe { libc::fchdir(self.as_raw_fd()) }); + libc_try!(unsafe { libc::chroot(b"root\0".as_ptr() as *const _) }); + libc_try!(unsafe { libc::chdir(b"/\0".as_ptr() as *const _) }); + Ok(()) + } // procfs files cannot be async, we cannot add them to epoll... pub fn open_file(&self, path: &CStr, flags: c_int, mode: c_int) -> io::Result { Ok(unsafe { std::fs::File::from_raw_fd(self.fd(path, flags, mode)?.into_raw_fd()) }) } - pub fn get_euid_egid(&self) -> io::Result<(libc::uid_t, libc::gid_t)> { - use io::BufRead; - - let reader = io::BufReader::new(self.open_file( - unsafe { CStr::from_bytes_with_nul_unchecked(b"status\0") }, + #[inline] + fn open_buffered(&self, path: &CStr) -> io::Result { + Ok(BufReader::new(self.open_file( + path, libc::O_RDONLY | libc::O_CLOEXEC, 0, - )?); + )?)) + } - let mut uid = None; - let mut gid = None; + #[inline] + pub fn get_pid(&self) -> pid_t { + self.1 + } + + fn read_pid(&self) -> io::Result { + let reader = + self.open_buffered(unsafe { CStr::from_bytes_with_nul_unchecked(b"status\0") })?; + + for line in reader.lines() { + let line = line?; + let mut parts = line.split_ascii_whitespace(); + if parts.next() == Some("Pid:") { + let pid = parts + .next() + .ok_or_else(|| io::Error::new(io::ErrorKind::Other, "bad 'Pid:' line in proc"))? 
+ .parse::() + .map_err(|_| { + io::Error::new(io::ErrorKind::Other, "failed to parse pid from proc") + })?; + return Ok(pid); + } + } + + Err(io::ErrorKind::NotFound.into()) + } + + pub fn get_status(&self) -> io::Result { + let reader = + self.open_buffered(unsafe { CStr::from_bytes_with_nul_unchecked(b"status\0") })?; + + #[inline] + fn check_uid_gid(value: Option<&str>) -> io::Result { + value + .ok_or_else(|| io::Error::new(io::ErrorKind::Other, "bad 'Uid/Gid:' line in proc"))? + .parse::() + .map_err(|_| io::Error::new(io::ErrorKind::Other, "failed to parse uid from proc")) + } + + #[inline] + fn check_u64_hex(value: Option<&str>) -> io::Result { + Ok(u64::from_str_radix( + value.ok_or_else(|| { + io::Error::new(io::ErrorKind::Other, "bad numeric property line in proc") + })?, + 16, + ) + .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?) + } + + #[inline] + fn check_u32_oct(value: Option<&str>) -> io::Result { + Ok(u32::from_str_radix( + value.ok_or_else(|| { + io::Error::new(io::ErrorKind::Other, "bad numeric property line in proc") + })?, + 8, + ) + .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?) + } + + let mut ids = Uids::default(); + let mut caps = Capabilities::default(); + let mut umask = 0o022; for line in reader.lines() { let line = line?; let mut parts = line.split_ascii_whitespace(); match parts.next() { Some("Uid:") => { - uid = Some(parts - .skip(1) - .next() - .ok_or_else(|| { - io::Error::new(io::ErrorKind::Other, "bad 'Uid:' line in proc") - })? - .parse::() - .map_err(|_| { - io::Error::new(io::ErrorKind::Other, "failed to parse uid from proc") - })? - ); + ids.ruid = check_uid_gid(parts.next())?; + ids.euid = check_uid_gid(parts.next())?; + ids.suid = check_uid_gid(parts.next())?; + ids.fsuid = check_uid_gid(parts.next())?; } Some("Gid:") => { - gid = Some(parts - .skip(1) - .next() - .ok_or_else(|| { - io::Error::new(io::ErrorKind::Other, "bad 'Uid:' line in proc") - })? 
- .parse::() - .map_err(|_| { - io::Error::new(io::ErrorKind::Other, "failed to parse gid from proc") - })? - ); + ids.rgid = check_uid_gid(parts.next())?; + ids.egid = check_uid_gid(parts.next())?; + ids.sgid = check_uid_gid(parts.next())?; + ids.fsgid = check_uid_gid(parts.next())?; } + Some("CapInh:") => caps.inheritable = check_u64_hex(parts.next())?, + Some("CapPrm:") => caps.permitted = check_u64_hex(parts.next())?, + Some("CapEff:") => caps.effective = check_u64_hex(parts.next())?, + //Some("CapBnd:") => caps.bounding = check_u64_hex(parts.next())?, + Some("Umask:") => umask = check_u32_oct(parts.next())?, _ => continue, } - if let (Some(u), Some(g)) = (uid, gid) { - return Ok((u, g)); - } } - Err(io::ErrorKind::InvalidData.into()) + Ok(ProcStatus { + uids: ids, + capabilities: caps, + umask, + }) + } + + pub fn get_cgroups(&self) -> Result { + let reader = + self.open_buffered(unsafe { CStr::from_bytes_with_nul_unchecked(b"cgroup\0") })?; + + let mut cgroups = CGroups::new(); + + for line in reader.split(b'\n') { + let line = line?; + let mut parts = line.splitn(3, |b| *b == b':'); + let num = parts.next(); + let name = parts.next(); + let path = parts.next(); + if !num.is_some() || !name.is_some() || !path.is_some() || parts.next().is_some() { + bail!("failed to parse cgroup line: {:?}", line); + } + + let name = String::from_utf8(name.unwrap().to_vec())?; + let path = OsString::from_vec(path.unwrap().to_vec()); + + if name.len() == 0 { + cgroups.v2 = Some(path); + } else { + for entry in name.split(',') { + cgroups.v1.insert(entry.to_string(), path.clone()); + } + } + } + + Ok(cgroups) + } + + pub fn user_caps(&self) -> Result { + UserCaps::new(self) + } +} + +pub struct CGroups { + v1: HashMap, + v2: Option, +} + +impl CGroups { + fn new() -> Self { + Self { + v1: HashMap::new(), + v2: None, + } + } + + pub fn get(&self, name: &str) -> Option<&OsStr> { + self.v1.get(name).map(|s| s.as_os_str()) + } + + pub fn v2(&self) -> Option<&OsStr> { + 
self.v2.as_ref().map(|s| s.as_os_str()) + } +} + +// Too lazy to bindgen libcap stuff... +const CAPABILITY_VERSION_3: u32 = 0x20080522; + +/// Represents process capabilities. +/// +/// This can be used to change the process' capability sets (if permitted by the kernel). +impl Capabilities { + // We currently don't implement capget as it takes a pid which is racy on kernels without pidfd + // support. Later on we might support a `capget(&PidFd)` method? + + /// Change our process capabilities. This does not include the bounding set. + pub fn capset(&self) -> io::Result<()> { + #![allow(dead_code)] + // kernel abi: + struct Header { + version: u32, + pid: c_int, + } + + struct Data { + effective: u32, + permitted: u32, + inheritable: u32, + } + + let header = Header { + version: CAPABILITY_VERSION_3, + pid: 0, // equivalent to gettid(), + }; + + let data = [ + Data { + effective: self.effective as u32, + permitted: self.permitted as u32, + inheritable: self.inheritable as u32, + }, + Data { + effective: (self.effective >> 32) as u32, + permitted: (self.permitted >> 32) as u32, + inheritable: (self.inheritable >> 32) as u32, + }, + ]; + + libc_try!(unsafe { libc::syscall(libc::SYS_capset, &header, &data) }); + + Ok(()) + } + + /// Change the thread's keep-capabilities flag. + pub fn set_keep_caps(on: bool) -> io::Result<()> { + libc_try!(unsafe { libc::prctl(libc::PR_SET_KEEPCAPS, c_int::from(on)) }); + Ok(()) + } +} + +/// Helper to enter a process' permission-check environment. +/// +/// When we execute a syscall on behalf of another process, we should try to trigger as many +/// permission checks as we can. It is impractical to implement them all manually, so the best +/// thing to do is cause as many of them to happen on the kernel-side as we can. +/// +/// We start by cloning the process' capability set. This is because the process may have dropped +/// capabilities which under normal conditions would prevent them from executing the syscall.
+/// For example a process may be executing `mknod()` after having dropped `CAP_MKNOD`. +/// +/// We then switch over our effective and file system uid and gid. This has 2 reasons: First, it +/// means we do not need to run `chown()` on files we create, secondly, the user may have dropped +/// `CAP_DAC_OVERRIDE` / `CAP_DAC_READ_SEARCH` which may have prevented the creation of the file in +/// the first place (for example, the container program may be a non-root executable with +/// `cap_mknod=ep` as file-capabilities, in which case we do not want a user to be allowed to run +/// `mknod()` on a path owned by a different user). (And checking file system permissions would +/// require us to handle ACLs, quotas, which are all file system type dependent as well, so better +/// leave all that up to the kernel, too!) +/// +/// Finally, we also need to make sure we are in the same `devices` cgroup, because it too should +/// affect calls such as `mknod()`. +#[derive(Clone)] +#[must_use = "not using UserCaps may be a security issue"] +pub struct UserCaps { + euid: libc::uid_t, + egid: libc::gid_t, + fsuid: libc::uid_t, + fsgid: libc::gid_t, + capabilities: Capabilities, + umask: libc::mode_t, + cgroup_v1_devices: Option, + cgroup_v2: Option, +} + +impl UserCaps { + pub fn new(pidfd: &PidFd) -> Result { + let status = pidfd.get_status()?; + let cgroups = pidfd.get_cgroups()?; + + Ok(UserCaps { + euid: status.uids.euid, + egid: status.uids.egid, + fsuid: status.uids.fsuid, + fsgid: status.uids.fsgid, + capabilities: status.capabilities, + umask: status.umask, + cgroup_v1_devices: cgroups.get("devices").map(|s| s.to_owned()), + cgroup_v2: cgroups.v2().map(|s| s.to_owned()), + }) + } + + pub fn apply_cgroups(&self) -> io::Result<()> { + fn enter_cgroup(kind: &str, name: &OsStr) -> io::Result<()> { + let mut path = OsString::with_capacity(15 + kind.len() + name.len() + 13 + 1); + path.push(OsStr::from_bytes(b"/sys/fs/cgroup/")); + path.push(kind); + path.push(name); +
path.push(OsStr::from_bytes(b"/cgroup.procs")); + std::fs::write(path, b"0") + } + + if let Some(ref cg) = self.cgroup_v1_devices { + enter_cgroup("devices/", cg)?; + } + + if let Some(ref cg) = self.cgroup_v2 { + enter_cgroup("unified/", cg)?; + } + + Ok(()) + } + + pub fn apply_user_caps(self) -> io::Result<()> { + unsafe { + libc::umask(self.umask); + } + Capabilities::set_keep_caps(true)?; + libc_try!(unsafe { libc::setegid(self.egid) }); + libc_try!(unsafe { libc::setfsgid(self.fsgid) }); + libc_try!(unsafe { libc::seteuid(self.euid) }); + libc_try!(unsafe { libc::setfsuid(self.fsuid) }); + self.capabilities.capset()?; + Ok(()) } } diff --git a/src/sys_mknod.rs b/src/sys_mknod.rs index 445cd9b..7e4fa0b 100644 --- a/src/sys_mknod.rs +++ b/src/sys_mknod.rs @@ -1,15 +1,15 @@ use std::ffi::CString; -use std::os::unix::io::{AsRawFd, FromRawFd}; +use std::os::unix::io::AsRawFd; use failure::Error; use nix::sys::stat; use crate::fork::forking_syscall; -use crate::{libc_try, sc_libc_try}; use crate::lxcseccomp::ProxyMessageBuffer; use crate::pidfd::PidFd; use crate::syscall::SyscallStatus; use crate::tools::Fd; +use crate::{libc_try, sc_libc_try}; pub async fn mknod(msg: &ProxyMessageBuffer) -> Result { let mode = msg.arg_mode_t(1)?; @@ -21,8 +21,7 @@ pub async fn mknod(msg: &ProxyMessageBuffer) -> Result { let pathname = msg.arg_c_string(0)?; let cwd = msg.pid_fd().fd_cwd()?; - let pidfd = unsafe { PidFd::from_raw_fd(msg.pid_fd().as_raw_fd()) }; - do_mknodat(pidfd, cwd, pathname, mode, dev).await + do_mknodat(msg.pid_fd(), cwd, pathname, mode, dev).await } pub async fn mknodat(msg: &ProxyMessageBuffer) -> Result { @@ -35,8 +34,7 @@ pub async fn mknodat(msg: &ProxyMessageBuffer) -> Result { let dirfd = msg.arg_fd(0, libc::O_DIRECTORY)?; let pathname = msg.arg_c_string(1)?; - let pidfd = unsafe { PidFd::from_raw_fd(msg.pid_fd().as_raw_fd()) }; - do_mknodat(pidfd, dirfd, pathname, mode, dev).await + do_mknodat(msg.pid_fd(), dirfd, pathname, mode, dev).await } fn 
check_mknod_dev(mode: stat::mode_t, dev: stat::dev_t) -> bool { @@ -51,29 +49,24 @@ fn check_mknod_dev(mode: stat::mode_t, dev: stat::dev_t) -> bool { } async fn do_mknodat( - pidfd: PidFd, + pidfd: &PidFd, dirfd: Fd, pathname: CString, mode: stat::mode_t, dev: stat::dev_t, ) -> Result { - let (uid, gid) = pidfd.get_euid_egid()?; + let caps = pidfd.user_caps()?; // FIXME: !!! ALSO COPY THE PROCESS' CAPABILITY SET AND USE KEEP_CAPS! Ok(forking_syscall(move || { + caps.apply_cgroups()?; pidfd.mount_namespace()?.setns()?; + pidfd.chroot()?; libc_try!(unsafe { libc::fchdir(dirfd.as_raw_fd()) }); - libc_try!(unsafe { libc::setegid(gid) }); - libc_try!(unsafe { libc::seteuid(uid) }); - let out = sc_libc_try!(unsafe { - libc::mknodat( - dirfd.as_raw_fd(), - pathname.as_ptr(), - mode, - dev, - ) - }); + caps.apply_user_caps()?; + let out = + sc_libc_try!(unsafe { libc::mknodat(dirfd.as_raw_fd(), pathname.as_ptr(), mode, dev) }); Ok(SyscallStatus::Ok(out.into())) }) .await?) diff --git a/src/syscall.rs b/src/syscall.rs index c7753eb..efd5b21 100644 --- a/src/syscall.rs +++ b/src/syscall.rs @@ -32,7 +32,9 @@ macro_rules! sc_libc_try { ($expr:expr) => {{ let res = $expr; if res == -1 { - return Ok($crate::syscall::SyscallStatus::Err(::nix::errno::errno() as _)) + return Ok($crate::syscall::SyscallStatus::Err( + ::nix::errno::errno() as _ + )); } else { res } diff --git a/src/tools.rs b/src/tools.rs index 6347cc1..efdfc3d 100644 --- a/src/tools.rs +++ b/src/tools.rs @@ -21,6 +21,12 @@ macro_rules! file_descriptor_type { pub struct $type(RawFd); crate::file_descriptor_impl!($type); + + impl FromRawFd for $type { + unsafe fn from_raw_fd(fd: RawFd) -> Self { + Self(fd) + } + } }; } @@ -50,12 +56,6 @@ macro_rules! 
file_descriptor_impl { fd } } - - impl FromRawFd for $type { - unsafe fn from_raw_fd(fd: RawFd) -> Self { - Self(fd) - } - } }; } @@ -67,6 +67,12 @@ pub struct Fd(pub RawFd); file_descriptor_impl!(Fd); +impl FromRawFd for Fd { + unsafe fn from_raw_fd(fd: RawFd) -> Self { + Self(fd) + } +} + impl mio::Evented for Fd { fn register( &self, @@ -111,7 +117,7 @@ impl AsyncFd { let registration = tokio::reactor::Registration::new(); if !registration.register(&fd)? { return Err(io::Error::new( - io::ErrorKind::Other, + io::ErrorKind::Other, "duplicate file descriptor registration?", )); } @@ -301,3 +307,13 @@ macro_rules! libc_try { } }}; } + +pub trait FromFd { + fn from_fd(fd: Fd) -> Self; +} + +impl FromFd for T { + fn from_fd(fd: Fd) -> Self { + unsafe { Self::from_raw_fd(fd.into_raw_fd()) } + } +}