refactor 'pidfd.rs' into a process module

Signed-off-by: Wolfgang Bumiller <w.bumiller@proxmox.com>
This commit is contained in:
Wolfgang Bumiller 2019-10-21 14:04:50 +02:00
parent 1e80bab0d3
commit 3bbd1db091
11 changed files with 299 additions and 262 deletions

View File

@ -4,7 +4,7 @@ use std::ffi::{OsStr, OsString};
use std::io::{self, Write};
use std::os::unix::ffi::{OsStrExt, OsStringExt};
use crate::pidfd::PidFd;
use crate::process::PidFd;
pub fn get_label(pidfd: &PidFd) -> io::Result<Option<OsString>> {
let mut out = match pidfd.read_file(c_str!("attr/current")) {

View File

@ -1,5 +1,5 @@
use std::io;
use std::os::raw::c_ulong;
use std::os::raw::{c_int, c_ulong};
bitflags::bitflags! {
pub struct SecureBits: c_ulong {
@ -29,3 +29,60 @@ impl SecureBits {
.ok_or_else(|| io_format_err!("prctl() returned unknown securebits"))
}
}
/// Represents process capabilities.
///
/// Every capability set is stored as one 64 bit mask. `capset()` hands the lower and the upper
/// 32 bits of each mask to the kernel as two separate data elements.
#[derive(Clone, Default)]
pub struct Capabilities {
    pub inheritable: u64,
    pub permitted: u64,
    pub effective: u64,
    //bounding: u64, // we don't care currently
}

// Too lazy to bindgen libcap stuff...
const CAPABILITY_VERSION_3: u32 = 0x2008_0522;

/// This can be used to change the process' capability sets (if permitted by the kernel).
impl Capabilities {
    // We currently don't implement capget as it takes a pid which is racy on kernels without pidfd
    // support. Later on we might support a `capget(&PidFd)` method?

    /// Change our process capabilities. This does not include the bounding set.
    ///
    /// # Errors
    ///
    /// Fails when the `capset` system call fails; the error value is whatever `c_try!` produces
    /// for a failed call.
    pub fn capset(&self) -> io::Result<()> {
        // The fields of the structs below are only ever read by the kernel, never by rust code.
        #![allow(dead_code)]

        // kernel abi:
        // `repr(C)` is mandatory here: without it the compiler is free to reorder the fields and
        // the kernel would read the wrong values.
        #[repr(C)]
        struct Header {
            version: u32,
            pid: c_int,
        }

        #[repr(C)]
        struct Data {
            effective: u32,
            permitted: u32,
            inheritable: u32,
        }

        let header = Header {
            version: CAPABILITY_VERSION_3,
            pid: 0, // equivalent to gettid(),
        };

        // The truncating casts are intentional: element 0 carries the lower 32 bits of each set,
        // element 1 the upper 32 bits.
        let data = [
            Data {
                effective: self.effective as u32,
                permitted: self.permitted as u32,
                inheritable: self.inheritable as u32,
            },
            Data {
                effective: (self.effective >> 32) as u32,
                permitted: (self.permitted >> 32) as u32,
                inheritable: (self.inheritable >> 32) as u32,
            },
        ];

        // SAFETY: both pointers refer to live, fully initialized `repr(C)` values on our stack
        // which outlive the call; they are passed as `*const` pointers.
        c_try!(unsafe {
            libc::syscall(
                libc::SYS_capset,
                &header as *const Header,
                data.as_ptr(),
            )
        });

        Ok(())
    }
}

View File

@ -13,7 +13,7 @@ use lazy_static::lazy_static;
use libc::pid_t;
use nix::errno::Errno;
use crate::pidfd::PidFd;
use crate::process::PidFd;
use crate::seccomp::{SeccompNotif, SeccompNotifResp, SeccompNotifSizes};
use crate::tools::{Fd, FromFd, IoVec, IoVecMut};

View File

@ -13,7 +13,7 @@ pub mod client;
pub mod fork;
pub mod lxcseccomp;
pub mod nsfd;
pub mod pidfd;
pub mod process;
pub mod seccomp;
pub mod sys_mknod;
pub mod sys_quotactl;

24
src/process/cgroups.rs Normal file
View File

@ -0,0 +1,24 @@
use std::collections::HashMap;
use std::ffi::{OsStr, OsString};
/// The cgroups a process belongs to.
///
/// `v1` maps a cgroup v1 hierarchy name (for example `"devices"`) to the cgroup path within that
/// hierarchy, `v2` holds the path within the cgroup v2 hierarchy, if any.
#[derive(Debug, Default)]
pub struct CGroups {
    pub v1: HashMap<String, OsString>,
    pub v2: Option<OsString>,
}

impl CGroups {
    /// Create an empty set: no v1 entries and no v2 path.
    pub fn new() -> Self {
        Self::default()
    }

    /// Look up the cgroup path recorded for the v1 hierarchy called `name`.
    ///
    /// Returns `None` if no entry with that name exists.
    pub fn get(&self, name: &str) -> Option<&OsStr> {
        self.v1.get(name).map(OsString::as_os_str)
    }

    /// The cgroup v2 path, or `None` if none was recorded.
    pub fn v2(&self) -> Option<&OsStr> {
        self.v2.as_ref().map(OsString::as_os_str)
    }
}

View File

@ -1,78 +1,23 @@
//! pidfd helper functionality
use std::collections::HashMap;
use std::ffi::{CStr, CString, OsStr, OsString};
use std::ffi::{CStr, CString, OsString};
use std::io::{self, BufRead, BufReader};
use std::os::raw::c_int;
use std::os::unix::ffi::{OsStrExt, OsStringExt};
use std::os::unix::ffi::OsStringExt;
use std::os::unix::io::{AsRawFd, FromRawFd, IntoRawFd, RawFd};
use failure::{bail, Error};
use libc::pid_t;
use crate::capability::Capabilities;
use crate::nsfd::{ns_type, NsFd};
use crate::tools::Fd;
use super::{CGroups, IdMap, IdMapEntry, ProcStatus, Uids, UserCaps};
/// Handle to a process: a raw file descriptor paired with a `pid_t`.
///
/// NOTE(review): presumably the descriptor refers to the process (files such as `attr/current`
/// are read relative to it) and the `pid_t` is the pid of that process -- the constructors are
/// not fully visible here, confirm.
pub struct PidFd(RawFd, pid_t);
// NOTE(review): project macro; judging by its name it supplies the raw file descriptor trait
// implementations for `PidFd` -- confirm against its definition.
file_descriptor_impl!(PidFd);
/// The user and group ids of a process.
///
/// The `e*` and `fs*` values are the ones `UserCaps` hands to `seteuid()`/`setegid()` and
/// `setfsuid()`/`setfsgid()`. The `r*` and `s*` fields presumably hold the real and saved ids
/// (TODO confirm against the code filling this struct, which is not visible here).
#[derive(Default)]
pub struct Uids {
pub ruid: libc::uid_t,
pub euid: libc::uid_t,
pub suid: libc::uid_t,
pub fsuid: libc::uid_t,
pub rgid: libc::gid_t,
pub egid: libc::gid_t,
pub sgid: libc::gid_t,
pub fsgid: libc::gid_t,
}
/// The capability sets of a process, each stored as one 64 bit mask.
///
/// `capset()` passes the lower and the upper 32 bits of every mask to the kernel separately.
#[derive(Clone, Default)]
pub struct Capabilities {
inheritable: u64,
permitted: u64,
effective: u64,
//bounding: u64, // we don't care currently
}
/// Status information about a process: its user/group ids, capability sets and umask.
///
/// `UserCaps::new()` obtains a value of this type from `PidFd::get_status()` and copies the
/// effective and file system ids, the capabilities and the umask out of it.
#[derive(Default)]
pub struct ProcStatus {
uids: Uids,
capabilities: Capabilities,
umask: libc::mode_t,
}
/// One contiguous mapping range: the `range` ids starting at `ns` inside the namespace correspond
/// to the `range` ids starting at `host` on the host side.
pub struct IdMapEntry {
    ns: u64,
    host: u64,
    range: u64,
}

/// A list of id mapping ranges used to translate ids between the host and a namespace.
pub struct IdMap(Vec<IdMapEntry>);

impl IdMap {
    /// Translate a host `id` into the namespace.
    ///
    /// Returns `None` if no entry covers `id` on the host side.
    pub fn map_into(&self, id: u64) -> Option<u64> {
        self.0.iter().find_map(|entry| {
            // `checked_sub` rejects ids below the start of the range; comparing the offset
            // against `range` avoids computing `host + range`, which could overflow.
            let offset = id.checked_sub(entry.host)?;
            if offset < entry.range {
                entry.ns.checked_add(offset)
            } else {
                None
            }
        })
    }

    /// Translate a namespace `id` back to the host.
    ///
    /// Returns `None` if no entry covers `id` on the namespace side.
    pub fn map_from(&self, id: u64) -> Option<u64> {
        self.0.iter().find_map(|entry| {
            let offset = id.checked_sub(entry.ns)?;
            if offset < entry.range {
                // The host id is the start of the host range plus the offset *within* the
                // namespace range, not plus the raw namespace id.
                entry.host.checked_add(offset)
            } else {
                None
            }
        })
    }
}
impl PidFd {
pub fn current() -> io::Result<Self> {
let fd = c_try!(unsafe {
@ -299,7 +244,7 @@ impl PidFd {
entries.push(IdMapEntry { ns, host, range });
}
Ok(IdMap(entries))
Ok(IdMap::new(entries))
}
pub fn get_uid_map(&self) -> Result<IdMap, Error> {
@ -323,198 +268,3 @@ impl PidFd {
UserCaps::new(self)
}
}
/// The cgroups a process belongs to.
///
/// `v1` is keyed by the cgroup v1 hierarchy name (for example `"devices"`), `v2` holds the cgroup
/// v2 path, if any.
pub struct CGroups {
    v1: HashMap<String, OsString>,
    v2: Option<OsString>,
}

impl CGroups {
    /// Create an empty set without any v1 entries or v2 path.
    fn new() -> Self {
        let v1 = HashMap::new();
        CGroups { v1, v2: None }
    }

    /// Look up the path recorded for the v1 hierarchy called `name`.
    pub fn get(&self, name: &str) -> Option<&OsStr> {
        match self.v1.get(name) {
            Some(path) => Some(path.as_os_str()),
            None => None,
        }
    }

    /// The cgroup v2 path, or `None` if there is none.
    pub fn v2(&self) -> Option<&OsStr> {
        let path = self.v2.as_ref()?;
        Some(path.as_os_str())
    }
}
// Too lazy to bindgen libcap stuff...
const CAPABILITY_VERSION_3: u32 = 0x2008_0522;

/// Represents process capabilities.
///
/// This can be used to change the process' capability sets (if permitted by the kernel).
impl Capabilities {
    // We currently don't implement capget as it takes a pid which is racy on kernels without pidfd
    // support. Later on we might support a `capget(&PidFd)` method?

    /// Change our process capabilities. This does not include the bounding set.
    ///
    /// # Errors
    ///
    /// Fails when the `capset` system call fails; the error value is whatever `c_try!` produces
    /// for a failed call.
    pub fn capset(&self) -> io::Result<()> {
        // The fields of the structs below are only ever read by the kernel, never by rust code.
        #![allow(dead_code)]

        // kernel abi:
        // `repr(C)` is mandatory here: without it the compiler is free to reorder the fields and
        // the kernel would read the wrong values.
        #[repr(C)]
        struct Header {
            version: u32,
            pid: c_int,
        }

        #[repr(C)]
        struct Data {
            effective: u32,
            permitted: u32,
            inheritable: u32,
        }

        let header = Header {
            version: CAPABILITY_VERSION_3,
            pid: 0, // equivalent to gettid(),
        };

        // The truncating casts are intentional: element 0 carries the lower 32 bits of each set,
        // element 1 the upper 32 bits.
        let data = [
            Data {
                effective: self.effective as u32,
                permitted: self.permitted as u32,
                inheritable: self.inheritable as u32,
            },
            Data {
                effective: (self.effective >> 32) as u32,
                permitted: (self.permitted >> 32) as u32,
                inheritable: (self.inheritable >> 32) as u32,
            },
        ];

        // SAFETY: both pointers refer to live, fully initialized `repr(C)` values on our stack
        // which outlive the call; they are passed as `*const` pointers.
        c_try!(unsafe {
            libc::syscall(
                libc::SYS_capset,
                &header as *const Header,
                data.as_ptr(),
            )
        });

        Ok(())
    }
}
/// Helper to enter a process' permission-check environment.
///
/// When we execute a syscall on behalf of another process, we should try to trigger as many
/// permission checks as we can. It is impractical to implement them all manually, so the best
/// thing to do is cause as many of them to happen on the kernel-side as we can.
///
/// We start by entering the process' devices and v2 cgroup, as calls like `mknod()` may be
/// affected by them, and access to devices as well.
///
/// Then we must enter the mount namespace, chroot and current working directory, in order to get
/// the correct view of paths.
///
/// Next we copy the caller's `umask`.
///
/// Then switch over our effective and file system uid and gid. This has 2 reasons: First, it means
/// we do not need to run `chown()` on files we create, secondly, the user may have dropped
/// `CAP_DAC_OVERRIDE` / `CAP_DAC_READ_SEARCH` which may have prevented the creation of the file in
/// the first place (for example, the container program may be a non-root executable with
/// `cap_mknod=ep` as file-capabilities, in which case we do not want a user to be allowed to run
/// `mknod()` on a path owned by a different user (and checking file system permissions would
/// require us to handle ACLs, quotas, which are all file system type dependent as well, so better
/// leave all that up to the kernel, too!)).
///
/// Next we clone the process' capability set. This is because the process may have dropped
/// capabilities which under normal conditions would prevent them from executing the syscall. For
/// example a process may be executing `mknod()` after having dropped `CAP_MKNOD`.
#[derive(Clone)]
#[must_use = "not using UserCaps may be a security issue"]
pub struct UserCaps<'a> {
// The process whose environment `apply()` enters.
pidfd: &'a PidFd,
// Whether `apply()` copies the umask and switches the effective / file system uid and gid.
apply_uids: bool,
euid: libc::uid_t,
egid: libc::gid_t,
fsuid: libc::uid_t,
fsgid: libc::gid_t,
capabilities: Capabilities,
umask: libc::mode_t,
// Cgroups to enter; `None` means the respective cgroup is left alone.
cgroup_v1_devices: Option<OsString>,
cgroup_v2: Option<OsString>,
apparmor_profile: Option<OsString>,
}
impl UserCaps<'_> {
/// Collect the permission relevant state of the process behind `pidfd`.
///
/// Reads the process' status (ids, capabilities, umask), its cgroups (the v1 `devices` entry and
/// the v2 path) and its AppArmor label. Uid changes start out enabled.
///
/// Fails if any of these three queries fails.
pub fn new(pidfd: &PidFd) -> Result<UserCaps, Error> {
let status = pidfd.get_status()?;
let cgroups = pidfd.get_cgroups()?;
let apparmor_profile = crate::apparmor::get_label(pidfd)?;
Ok(UserCaps {
pidfd,
apply_uids: true,
euid: status.uids.euid,
egid: status.uids.egid,
fsuid: status.uids.fsuid,
fsgid: status.uids.fsgid,
capabilities: status.capabilities,
umask: status.umask,
cgroup_v1_devices: cgroups.get("devices").map(|s| s.to_owned()),
cgroup_v2: cgroups.v2().map(|s| s.to_owned()),
apparmor_profile,
})
}
/// Enter the recorded v1 `devices` cgroup and v2 cgroup, skipping those which are `None`.
fn apply_cgroups(&self) -> io::Result<()> {
// Writes `0` to `/sys/fs/cgroup/<kind><name>/cgroup.procs`; `kind` must end with a slash.
fn enter_cgroup(kind: &str, name: &OsStr) -> io::Result<()> {
// 15 and 13 are the lengths of the fixed prefix and suffix pushed below.
let mut path = OsString::with_capacity(15 + kind.len() + name.len() + 13 + 1);
path.push(OsStr::from_bytes(b"/sys/fs/cgroup/"));
path.push(kind);
path.push(name);
path.push(OsStr::from_bytes(b"/cgroup.procs"));
std::fs::write(path, b"0")
}
if let Some(ref cg) = self.cgroup_v1_devices {
enter_cgroup("devices/", cg)?;
}
if let Some(ref cg) = self.cgroup_v2 {
enter_cgroup("unified/", cg)?;
}
Ok(())
}
/// Copy the umask and switch ids (only if uid changes are enabled), then apply the recorded
/// capability sets (always).
fn apply_user_caps(&self) -> io::Result<()> {
use crate::capability::SecureBits;
if self.apply_uids {
unsafe {
libc::umask(self.umask);
}
// NOTE(review): presumably these securebits (see capabilities(7)) keep the id switches
// below from altering our capability sets before `capset()` replaces them explicitly.
let mut secbits = SecureBits::get_current()?;
secbits |= SecureBits::KEEP_CAPS | SecureBits::NO_SETUID_FIXUP;
secbits.apply()?;
// Group ids are switched before user ids.
c_try!(unsafe { libc::setegid(self.egid) });
c_try!(unsafe { libc::setfsgid(self.fsgid) });
c_try!(unsafe { libc::seteuid(self.euid) });
c_try!(unsafe { libc::setfsuid(self.fsuid) });
}
self.capabilities.capset()?;
Ok(())
}
/// Make `apply()` skip the umask, securebits and uid/gid changes. Capabilities are still applied.
pub fn disable_uid_change(&mut self) {
self.apply_uids = false;
}
/// Forget the recorded cgroups so that `apply()` does not enter any cgroup.
pub fn disable_cgroup_change(&mut self) {
self.cgroup_v1_devices = None;
self.cgroup_v2 = None;
}
/// Enter the recorded environment, consuming `self`.
///
/// The steps run in this order and stop at the first failure: cgroups, mount namespace, chroot,
/// current working directory, AppArmor label (set via `own_pidfd`, only if one was recorded),
/// and finally ids and capabilities.
pub fn apply(self, own_pidfd: &PidFd) -> io::Result<()> {
self.apply_cgroups()?;
self.pidfd.mount_namespace()?.setns()?;
self.pidfd.enter_chroot()?;
self.pidfd.enter_cwd()?;
if let Some(ref label) = self.apparmor_profile {
crate::apparmor::set_label(own_pidfd, label)?;
}
self.apply_user_caps()?;
Ok(())
}
}

35
src/process/id_map.rs Normal file
View File

@ -0,0 +1,35 @@
//! uid/gid mapping helpers
/// A list of id mapping ranges used to translate ids between the host and a namespace.
pub struct IdMap(Vec<IdMapEntry>);

/// One contiguous mapping range: the `range` ids starting at `ns` inside the namespace correspond
/// to the `range` ids starting at `host` on the host side.
pub struct IdMapEntry {
    pub ns: u64,
    pub host: u64,
    pub range: u64,
}

impl IdMap {
    /// Create a map from a list of entries. Lookups use the first entry covering an id.
    pub fn new(entries: Vec<IdMapEntry>) -> Self {
        Self(entries)
    }

    /// Translate a host `id` into the namespace.
    ///
    /// Returns `None` if no entry covers `id` on the host side.
    pub fn map_into(&self, id: u64) -> Option<u64> {
        self.0.iter().find_map(|entry| {
            // `checked_sub` rejects ids below the start of the range; comparing the offset
            // against `range` avoids computing `host + range`, which could overflow.
            let offset = id.checked_sub(entry.host)?;
            if offset < entry.range {
                entry.ns.checked_add(offset)
            } else {
                None
            }
        })
    }

    /// Translate a namespace `id` back to the host.
    ///
    /// Returns `None` if no entry covers `id` on the namespace side.
    pub fn map_from(&self, id: u64) -> Option<u64> {
        self.0.iter().find_map(|entry| {
            let offset = id.checked_sub(entry.ns)?;
            if offset < entry.range {
                // The host id is the start of the host range plus the offset *within* the
                // namespace range, not plus the raw namespace id.
                entry.host.checked_add(offset)
            } else {
                None
            }
        })
    }
}

37
src/process/mod.rs Normal file
View File

@ -0,0 +1,37 @@
use crate::capability::Capabilities;
pub mod cgroups;
pub mod fd;
pub mod id_map;
pub mod user_caps;
#[doc(inline)]
pub use cgroups::CGroups;
#[doc(inline)]
pub use fd::PidFd;
#[doc(inline)]
pub use id_map::{IdMap, IdMapEntry};
#[doc(inline)]
pub use user_caps::UserCaps;
/// The user and group ids of a process.
///
/// The `e*` and `fs*` values are the ones `UserCaps` hands to `seteuid()`/`setegid()` and
/// `setfsuid()`/`setfsgid()`. The `r*` and `s*` fields presumably hold the real and saved ids
/// (TODO confirm against the code filling this struct, which is not visible here).
#[derive(Default)]
pub struct Uids {
pub ruid: libc::uid_t,
pub euid: libc::uid_t,
pub suid: libc::uid_t,
pub fsuid: libc::uid_t,
pub rgid: libc::gid_t,
pub egid: libc::gid_t,
pub sgid: libc::gid_t,
pub fsgid: libc::gid_t,
}
/// Status information about a process: its user/group ids, capability sets and umask.
///
/// `UserCaps::new()` obtains a value of this type from `PidFd::get_status()` and copies the
/// effective and file system ids, the capabilities and the umask out of it.
#[derive(Default)]
pub struct ProcStatus {
uids: Uids,
capabilities: Capabilities,
umask: libc::mode_t,
}

134
src/process/user_caps.rs Normal file
View File

@ -0,0 +1,134 @@
//! User and capability management.
use std::ffi::{OsStr, OsString};
use std::io;
use std::os::unix::ffi::OsStrExt;
use failure::Error;
use super::PidFd;
use crate::capability::Capabilities;
/// Helper to enter a process' permission-check environment.
///
/// When we execute a syscall on behalf of another process, we should try to trigger as many
/// permission checks as we can. It is impractical to implement them all manually, so the best
/// thing to do is cause as many of them to happen on the kernel-side as we can.
///
/// We start by entering the process' devices and v2 cgroup, as calls like `mknod()` may be
/// affected by them, and access to devices as well.
///
/// Then we must enter the mount namespace, chroot and current working directory, in order to get
/// the correct view of paths.
///
/// Next we copy the caller's `umask`.
///
/// Then switch over our effective and file system uid and gid. This has 2 reasons: First, it means
/// we do not need to run `chown()` on files we create, secondly, the user may have dropped
/// `CAP_DAC_OVERRIDE` / `CAP_DAC_READ_SEARCH` which may have prevented the creation of the file in
/// the first place (for example, the container program may be a non-root executable with
/// `cap_mknod=ep` as file-capabilities, in which case we do not want a user to be allowed to run
/// `mknod()` on a path owned by a different user (and checking file system permissions would
/// require us to handle ACLs, quotas, which are all file system type dependent as well, so better
/// leave all that up to the kernel, too!)).
///
/// Next we clone the process' capability set. This is because the process may have dropped
/// capabilities which under normal conditions would prevent them from executing the syscall. For
/// example a process may be executing `mknod()` after having dropped `CAP_MKNOD`.
#[derive(Clone)]
#[must_use = "not using UserCaps may be a security issue"]
pub struct UserCaps<'a> {
// The process whose environment `apply()` enters.
pidfd: &'a PidFd,
// Whether `apply()` copies the umask and switches the effective / file system uid and gid.
apply_uids: bool,
euid: libc::uid_t,
egid: libc::gid_t,
fsuid: libc::uid_t,
fsgid: libc::gid_t,
capabilities: Capabilities,
umask: libc::mode_t,
// Cgroups to enter; `None` means the respective cgroup is left alone.
cgroup_v1_devices: Option<OsString>,
cgroup_v2: Option<OsString>,
apparmor_profile: Option<OsString>,
}
impl UserCaps<'_> {
/// Collect the permission relevant state of the process behind `pidfd`.
///
/// Reads the process' status (ids, capabilities, umask), its cgroups (the v1 `devices` entry and
/// the v2 path) and its AppArmor label. Uid changes start out enabled.
///
/// Fails if any of these three queries fails.
pub fn new(pidfd: &PidFd) -> Result<UserCaps, Error> {
let status = pidfd.get_status()?;
let cgroups = pidfd.get_cgroups()?;
let apparmor_profile = crate::apparmor::get_label(pidfd)?;
Ok(UserCaps {
pidfd,
apply_uids: true,
euid: status.uids.euid,
egid: status.uids.egid,
fsuid: status.uids.fsuid,
fsgid: status.uids.fsgid,
capabilities: status.capabilities,
umask: status.umask,
cgroup_v1_devices: cgroups.get("devices").map(|s| s.to_owned()),
cgroup_v2: cgroups.v2().map(|s| s.to_owned()),
apparmor_profile,
})
}
/// Enter the recorded v1 `devices` cgroup and v2 cgroup, skipping those which are `None`.
fn apply_cgroups(&self) -> io::Result<()> {
// Writes `0` to `/sys/fs/cgroup/<kind><name>/cgroup.procs`; `kind` must end with a slash.
fn enter_cgroup(kind: &str, name: &OsStr) -> io::Result<()> {
// 15 and 13 are the lengths of the fixed prefix and suffix pushed below.
let mut path = OsString::with_capacity(15 + kind.len() + name.len() + 13 + 1);
path.push(OsStr::from_bytes(b"/sys/fs/cgroup/"));
path.push(kind);
path.push(name);
path.push(OsStr::from_bytes(b"/cgroup.procs"));
std::fs::write(path, b"0")
}
if let Some(ref cg) = self.cgroup_v1_devices {
enter_cgroup("devices/", cg)?;
}
if let Some(ref cg) = self.cgroup_v2 {
enter_cgroup("unified/", cg)?;
}
Ok(())
}
/// Copy the umask and switch ids (only if uid changes are enabled), then apply the recorded
/// capability sets (always).
fn apply_user_caps(&self) -> io::Result<()> {
use crate::capability::SecureBits;
if self.apply_uids {
unsafe {
libc::umask(self.umask);
}
// NOTE(review): presumably these securebits (see capabilities(7)) keep the id switches
// below from altering our capability sets before `capset()` replaces them explicitly.
let mut secbits = SecureBits::get_current()?;
secbits |= SecureBits::KEEP_CAPS | SecureBits::NO_SETUID_FIXUP;
secbits.apply()?;
// Group ids are switched before user ids.
c_try!(unsafe { libc::setegid(self.egid) });
c_try!(unsafe { libc::setfsgid(self.fsgid) });
c_try!(unsafe { libc::seteuid(self.euid) });
c_try!(unsafe { libc::setfsuid(self.fsuid) });
}
self.capabilities.capset()?;
Ok(())
}
/// Make `apply()` skip the umask, securebits and uid/gid changes. Capabilities are still applied.
pub fn disable_uid_change(&mut self) {
self.apply_uids = false;
}
/// Forget the recorded cgroups so that `apply()` does not enter any cgroup.
pub fn disable_cgroup_change(&mut self) {
self.cgroup_v1_devices = None;
self.cgroup_v2 = None;
}
/// Enter the recorded environment, consuming `self`.
///
/// The steps run in this order and stop at the first failure: cgroups, mount namespace, chroot,
/// current working directory, AppArmor label (set via `own_pidfd`, only if one was recorded),
/// and finally ids and capabilities.
pub fn apply(self, own_pidfd: &PidFd) -> io::Result<()> {
self.apply_cgroups()?;
self.pidfd.mount_namespace()?.setns()?;
self.pidfd.enter_chroot()?;
self.pidfd.enter_cwd()?;
if let Some(ref label) = self.apparmor_profile {
crate::apparmor::set_label(own_pidfd, label)?;
}
self.apply_user_caps()?;
Ok(())
}
}

View File

@ -7,7 +7,7 @@ use nix::sys::stat;
use crate::fork::forking_syscall;
use crate::lxcseccomp::ProxyMessageBuffer;
use crate::pidfd::PidFd;
use crate::process::PidFd;
use crate::sc_libc_try;
use crate::syscall::SyscallStatus;
use crate::tools::Fd;

View File

@ -8,7 +8,7 @@ use nix::errno::Errno;
use crate::fork::forking_syscall;
use crate::lxcseccomp::ProxyMessageBuffer;
use crate::pidfd::{IdMap, PidFd};
use crate::process::{IdMap, PidFd};
use crate::sc_libc_try;
use crate::syscall::SyscallStatus;