diff --git a/po/POTFILES.in b/po/POTFILES.in index 984ec36c0f..f93fb9694d 100644 --- a/po/POTFILES.in +++ b/po/POTFILES.in @@ -230,6 +230,7 @@ @SRCDIR@/src/util/vircgroupbackend.h @SRCDIR@/src/util/vircgroupv1.c @SRCDIR@/src/util/vircgroupv2.c +@SRCDIR@/src/util/vircgroupv2devices.c @SRCDIR@/src/util/virclosecallbacks.c @SRCDIR@/src/util/vircommand.c @SRCDIR@/src/util/virconf.c diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index c933277918..975733f71e 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -1717,6 +1717,7 @@ virCgroupV1Register; virCgroupV2Register; # util/vircgroupv2devices.h +virCgroupV2DevicesAttachProg; virCgroupV2DevicesAvailable; # util/virclosecallbacks.h diff --git a/src/util/vircgrouppriv.h b/src/util/vircgrouppriv.h index 334095719e..f2a80aeb82 100644 --- a/src/util/vircgrouppriv.h +++ b/src/util/vircgrouppriv.h @@ -41,10 +41,20 @@ struct _virCgroupV1Controller { typedef struct _virCgroupV1Controller virCgroupV1Controller; typedef virCgroupV1Controller *virCgroupV1ControllerPtr; +struct _virCgroupV2Devices { /* BPF devices-controller state, embedded in _virCgroupV2Controller as 'devices' */ + int mapfd; /* fd of the BPF map consulted by the attached program; stored by virCgroupV2DevicesAttachProg() */ + int progfd; /* fd of the loaded BPF program; stored by virCgroupV2DevicesAttachProg(), which treats a value > 0 as "a program is already attached" */ + ssize_t count; /* not written anywhere in this patch; presumably the number of entries currently in the map - TODO confirm */ + ssize_t max; /* copy of the 'max' argument of virCgroupV2DevicesAttachProg(); presumably the map capacity - TODO confirm */ +}; +typedef struct _virCgroupV2Devices virCgroupV2Devices; +typedef virCgroupV2Devices *virCgroupV2DevicesPtr; + struct _virCgroupV2Controller { int controllers; char *mountPoint; char *placement; + virCgroupV2Devices devices; /* accessed as group->unified.devices by the v2 devices code */ }; typedef struct _virCgroupV2Controller virCgroupV2Controller; typedef virCgroupV2Controller *virCgroupV2ControllerPtr; diff --git a/src/util/vircgroupv2devices.c b/src/util/vircgroupv2devices.c index 8641645810..c30a23f165 100644 --- a/src/util/vircgroupv2devices.c +++ b/src/util/vircgroupv2devices.c @@ -30,6 +30,7 @@ #define LIBVIRT_VIRCGROUPPRIV_H_ALLOW #include "vircgrouppriv.h" +#include "viralloc.h" #include "virbpf.h" #include "vircgroup.h" #include "vircgroupv2devices.h" @@ -60,10 +61,283 @@ virCgroupV2DevicesAvailable(virCgroupPtr group) return true; } + + +/* Steps to get assembly version of devices BPF 
program: + * + * Save the following program into bpfprog.c, compile it using clang: + * + * clang -O2 -Wall -target bpf -c bpfprog.c -o bpfprog.o + * + * Now you can use llvm-objdump to get the list of instructions: + * + * llvm-objdump -S -no-show-raw-insn bpfprog.o + * + * which can be converted into a program using VIR_BPF_* macros. + * + * ---------------------------------------------------------------------------- + * #include <linux/bpf.h> + * #include <linux/version.h> + * + * #define SEC(NAME) __attribute__((section(NAME), used)) + * + * struct bpf_map_def { + * unsigned int type; + * unsigned int key_size; + * unsigned int value_size; + * unsigned int max_entries; + * unsigned int map_flags; + * unsigned int inner_map_idx; + * unsigned int numa_node; + * }; + * + * static void *(*bpf_map_lookup_elem)(void *map, void *key) = + * (void *) BPF_FUNC_map_lookup_elem; + * + * struct bpf_map_def SEC("maps") devices = { + * .type = BPF_MAP_TYPE_HASH, + * .key_size = sizeof(__u64), + * .value_size = sizeof(__u32), + * .max_entries = 65, + * }; + * + * SEC("cgroup/dev") int + * bpf_libvirt_cgroup_device(struct bpf_cgroup_dev_ctx *ctx) + * { + * __u64 key = ((__u64)ctx->major << 32) | ctx->minor; + * __u32 *val = 0; + * + * val = bpf_map_lookup_elem(&devices, &key); + * if (val && (ctx->access_type & *val) == ctx->access_type) + * return 1; + * + * key = ((__u64)ctx->major << 32) | 0xffffffff; + * val = bpf_map_lookup_elem(&devices, &key); + * if (val && (ctx->access_type & *val) == ctx->access_type) + * return 1; + * + * key = 0xffffffff00000000 | ctx->minor; + * val = bpf_map_lookup_elem(&devices, &key); + * if (val && (ctx->access_type & *val) == ctx->access_type) + * return 1; + * + * key = 0xffffffffffffffff; + * val = bpf_map_lookup_elem(&devices, &key); + * if (val && (ctx->access_type & *val) == ctx->access_type) + * return 1; + * + * return 0; + * } + * + * char _license[] SEC("license") = "GPL"; + * __u32 _version SEC("version") = LINUX_VERSION_CODE; + * 
----------------------------------------------------------------------------
+ * */
+/**
+ * virCgroupV2DevicesLoadProg:
+ * @mapfd: file descriptor of the BPF map the program will query
+ *
+ * Builds the cgroup device filter described in the comment above as an
+ * array of BPF instructions, with @mapfd embedded in each of its four
+ * map lookups, and hands it to virBPFLoadProg() as a
+ * BPF_PROG_TYPE_CGROUP_DEVICE program.
+ *
+ * The program tries four keys in order: (major, minor), (major, any),
+ * (any, minor) and (any, any).  It returns 1 as soon as a lookup finds
+ * a value containing every bit of the requested access type, and 0 if
+ * none of them does.
+ *
+ * The "N:" prefixes in the per-instruction comments are the instruction
+ * numbers from the llvm-objdump listing.  The numbering skips one after
+ * each VIR_BPF_LD_MAP_FD / VIR_BPF_LD_IMM64 because those occupy two
+ * slots in the listing, and the jump offsets count slots, so they must
+ * be recomputed whenever an instruction is added or removed.
+ *
+ * Returns the value of virBPFLoadProg(); the caller in this file treats
+ * a negative value as failure and reports errno.
+ */
+static int
+virCgroupV2DevicesLoadProg(int mapfd)
+{
+    struct bpf_insn prog[] = {
+        /* --- lookup 1: key = (major << 32) | minor --- */
+        /* 0: r6 = r1 */
+        VIR_BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+        /* 1: r1 = *(u32 *)(r6 + 8) */
+        VIR_BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6, 8),
+        /* 2: r2 = *(u32 *)(r6 + 4) */
+        VIR_BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_6, 4),
+        /* 3: r2 <<= 32 */
+        VIR_BPF_ALU64_IMM(BPF_LSH, BPF_REG_2, 32),
+        /* 4: r2 |= r1 */
+        VIR_BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_1),
+        /* 5: *(u64 *)(r10 - 8) = r2 */
+        VIR_BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -8),
+        /* 6: r2 = r10 */
+        VIR_BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+        /* 7: r2 += -8 */
+        VIR_BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+        /* 8: r1 = 0 ll */
+        VIR_BPF_LD_MAP_FD(BPF_REG_1, mapfd),
+        /* 10: call 1 */
+        VIR_BPF_CALL_INSN(BPF_FUNC_map_lookup_elem),
+        /* 11: r1 = r0 */
+        VIR_BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+        /* 12: if r1 == 0 goto +5 */
+        VIR_BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 5),
+        /* 13: r0 = 1 */
+        VIR_BPF_MOV64_IMM(BPF_REG_0, 1),
+        /* 14: r2 = *(u32 *)(r6 + 0) */
+        VIR_BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_6, 0),
+        /* 15: r1 = *(u32 *)(r1 + 0) */
+        VIR_BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_1, 0),
+        /* 16: r1 &= r2 */
+        VIR_BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2),
+        /* 17: if r1 == r2 goto +50 */
+        VIR_BPF_JMP_REG(BPF_JEQ, BPF_REG_1, BPF_REG_2, 50),
+        /* --- lookup 2: key = (major << 32) | 0xffffffff --- */
+        /* LBB0_2: */
+        /* 18: r1 = *(u32 *)(r6 + 4) */
+        VIR_BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6, 4),
+        /* 19: r1 <<= 32 */
+        VIR_BPF_ALU64_IMM(BPF_LSH, BPF_REG_1, 32),
+        /* 20: r2 = 4294967295 ll */
+        VIR_BPF_LD_IMM64(BPF_REG_2, 0xffffffff),
+        /* 22: r1 |= r2 */
+        VIR_BPF_ALU64_REG(BPF_OR, BPF_REG_1, BPF_REG_2),
+        /* 23: *(u64 *)(r10 - 8) = r1 */
+        VIR_BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8),
+        /* 24: r2 = r10 */
+        VIR_BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+        /* 25: r2 += -8 */
+        VIR_BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+        /* 26: r1 = 0 ll */
+        VIR_BPF_LD_MAP_FD(BPF_REG_1, mapfd),
+        /* 28: call 1 */
+        VIR_BPF_CALL_INSN(BPF_FUNC_map_lookup_elem),
+        /* 29: r1 = r0 */
+        VIR_BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+        /* 30: if r1 == 0 goto +5 */
+        VIR_BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 5),
+        /* 31: r0 = 1 */
+        VIR_BPF_MOV64_IMM(BPF_REG_0, 1),
+        /* 32: r2 = *(u32 *)(r6 + 0) */
+        VIR_BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_6, 0),
+        /* 33: r1 = *(u32 *)(r1 + 0) */
+        VIR_BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_1, 0),
+        /* 34: r1 &= r2 */
+        VIR_BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2),
+        /* 35: if r1 == r2 goto +32 */
+        VIR_BPF_JMP_REG(BPF_JEQ, BPF_REG_1, BPF_REG_2, 32),
+        /* --- lookup 3: key = 0xffffffff00000000 | minor --- */
+        /* LBB0_4: */
+        /* 36: r1 = *(u32 *)(r6 + 8) */
+        VIR_BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6, 8),
+        /* 37: r2 = -4294967296 ll */
+        VIR_BPF_LD_IMM64(BPF_REG_2, 0xffffffff00000000),
+        /* 39: r1 |= r2 */
+        VIR_BPF_ALU64_REG(BPF_OR, BPF_REG_1, BPF_REG_2),
+        /* 40: *(u64 *)(r10 - 8) = r1 */
+        VIR_BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8),
+        /* 41: r2 = r10 */
+        VIR_BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+        /* 42: r2 += -8 */
+        VIR_BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+        /* 43: r1 = 0 ll */
+        VIR_BPF_LD_MAP_FD(BPF_REG_1, mapfd),
+        /* 45: call 1 */
+        VIR_BPF_CALL_INSN(BPF_FUNC_map_lookup_elem),
+        /* 46: r1 = r0 */
+        VIR_BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+        /* 47: if r1 == 0 goto +5 */
+        VIR_BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 5),
+        /* 48: r0 = 1 */
+        VIR_BPF_MOV64_IMM(BPF_REG_0, 1),
+        /* 49: r2 = *(u32 *)(r6 + 0) */
+        VIR_BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_6, 0),
+        /* 50: r1 = *(u32 *)(r1 + 0) */
+        VIR_BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_1, 0),
+        /* 51: r1 &= r2 */
+        VIR_BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2),
+        /* 52: if r1 == r2 goto +15 */
+        VIR_BPF_JMP_REG(BPF_JEQ, BPF_REG_1, BPF_REG_2, 15),
+        /* --- lookup 4: key = 0xffffffffffffffff --- */
+        /* LBB0_6: */
+        /* 53: r1 = -1 */
+        VIR_BPF_MOV64_IMM(BPF_REG_1, -1),
+        /* 54: *(u64 *)(r10 - 8) = r1 */
+        VIR_BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8),
+        /* 55: r2 = r10 */
+        VIR_BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+        /* 56: r2 += -8 */
+        VIR_BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+        /* 57: r1 = 0 ll */
+        VIR_BPF_LD_MAP_FD(BPF_REG_1, mapfd),
+        /* 59: call 1 */
+        VIR_BPF_CALL_INSN(BPF_FUNC_map_lookup_elem),
+        /* 60: r1 = r0 */
+        VIR_BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+        /* 61: if r1 == 0 goto +5 */
+        VIR_BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 5),
+        /* 62: r0 = 1 */
+        VIR_BPF_MOV64_IMM(BPF_REG_0, 1),
+        /* 63: r2 = *(u32 *)(r6 + 0) */
+        VIR_BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_6, 0),
+        /* 64: r1 = *(u32 *)(r1 + 0) */
+        VIR_BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_1, 0),
+        /* 65: r1 &= r2 */
+        VIR_BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2),
+        /* 66: if r1 == r2 goto +1 */
+        VIR_BPF_JMP_REG(BPF_JEQ, BPF_REG_1, BPF_REG_2, 1),
+        /* LBB0_8: */
+        /* 67: r0 = 0 */
+        VIR_BPF_MOV64_IMM(BPF_REG_0, 0),
+        /* LBB0_9: */
+        /* 68: exit */
+        VIR_BPF_EXIT_INSN(),
+    };
+
+    return virBPFLoadProg(prog, BPF_PROG_TYPE_CGROUP_DEVICE, G_N_ELEMENTS(prog));
+}
+
+
+/**
+ * virCgroupV2DevicesAttachProg:
+ * @group: cgroup whose devices controller path the program is attached to
+ * @mapfd: file descriptor of the BPF map the program should consult
+ * @max: value recorded in group->unified.devices.max
+ *
+ * Loads the device-filter BPF program bound to @mapfd, opens the cgroup
+ * directory reported by virCgroupPathOfController() and attaches the
+ * program to it with BPF_CGROUP_DEVICE.  On success the new program fd,
+ * @mapfd and @max are stored in group->unified.devices.
+ *
+ * This function takes ownership of @mapfd in every case: it is kept in
+ * @group on success and closed on failure, so the caller must not close
+ * it afterwards.
+ *
+ * If a program was already recorded in @group, its fd is closed once the
+ * new program is attached.  The previously recorded map fd is closed as
+ * well, unless the caller passed the same fd again; otherwise it would
+ * be overwritten and leaked.
+ *
+ * Returns 0 on success, -1 on failure.  An error is reported here when
+ * loading, opening or attaching fails.
+ */
+int
+virCgroupV2DevicesAttachProg(virCgroupPtr group,
+                             int mapfd,
+                             size_t max)
+{
+    int ret = -1;
+    VIR_AUTOCLOSE progfd = -1;
+    VIR_AUTOCLOSE cgroupfd = -1;
+    g_autofree char *path = NULL;
+
+    if (virCgroupPathOfController(group, VIR_CGROUP_CONTROLLER_DEVICES,
+                                  NULL, &path) < 0) {
+        goto cleanup;
+    }
+
+    progfd = virCgroupV2DevicesLoadProg(mapfd);
+    if (progfd < 0) {
+        virReportSystemError(errno, "%s", _("failed to load cgroup BPF prog"));
+        goto cleanup;
+    }
+
+    cgroupfd = open(path, O_RDONLY);
+    if (cgroupfd < 0) {
+        virReportSystemError(errno, _("unable to open '%s'"), path);
+        goto cleanup;
+    }
+
+    if (virBPFAttachProg(progfd, cgroupfd, BPF_CGROUP_DEVICE) < 0) {
+        virReportSystemError(errno, "%s", _("failed to attach cgroup BPF prog"));
+        goto cleanup;
+    }
+
+    /* The "> 0" tests treat 0 as "nothing stored yet"; this presumes the
+     * controller struct starts out zero-filled - TODO confirm. */
+    if (group->unified.devices.progfd > 0) {
+        VIR_DEBUG("Closing existing program that was replaced by new one.");
+        VIR_FORCE_CLOSE(group->unified.devices.progfd);
+    }
+
+    /* Release the old map too, but never the fd we are about to store. */
+    if (group->unified.devices.mapfd > 0 &&
+        group->unified.devices.mapfd != mapfd) {
+        VIR_DEBUG("Closing existing map that was replaced by new one.");
+        VIR_FORCE_CLOSE(group->unified.devices.mapfd);
+    }
+
+    group->unified.devices.progfd = progfd;
+    group->unified.devices.mapfd = mapfd;
+    group->unified.devices.max = max;
+    /* Ownership moved into @group: keep VIR_AUTOCLOSE and the cleanup
+     * path below from closing the fds that were just stored. */
+    progfd = -1;
+    mapfd = -1;
+
+    ret = 0;
+ cleanup:
+    VIR_FORCE_CLOSE(mapfd);
+    return ret;
+}
 #else /* !HAVE_DECL_BPF_CGROUP_DEVICE */
 bool
 virCgroupV2DevicesAvailable(virCgroupPtr group G_GNUC_UNUSED)
 {
     return false;
 }
+
+
+/* Stub used when BPF_CGROUP_DEVICE is not declared at build time:
+ * reports ENOSYS and fails without touching any of its arguments. */
+int
+virCgroupV2DevicesAttachProg(virCgroupPtr group G_GNUC_UNUSED,
+                             int mapfd G_GNUC_UNUSED,
+                             size_t max G_GNUC_UNUSED)
+{
+    virReportSystemError(ENOSYS, "%s",
+                         _("cgroups v2 BPF devices not supported "
+                           "with this kernel"));
+    return -1;
+}
 #endif /* !HAVE_DECL_BPF_CGROUP_DEVICE */
diff --git a/src/util/vircgroupv2devices.h b/src/util/vircgroupv2devices.h
index 2448a8890f..57454e80af 100644
--- a/src/util/vircgroupv2devices.h
+++ b/src/util/vircgroupv2devices.h
@@ -22,3 +22,8 @@
 
 bool
 virCgroupV2DevicesAvailable(virCgroupPtr group);
+
+int
+virCgroupV2DevicesAttachProg(virCgroupPtr group,
+                             int mapfd,
+                             size_t max);