From 7f719598bdf727d25490d5dac2915b0396b51309 Mon Sep 17 00:00:00 2001 From: Alejandro Huertas Herrero Date: Thu, 17 Feb 2022 15:51:27 +0100 Subject: [PATCH] F #5351: add support for NVIDIA vGPU (#1779) --- install.sh | 3 +- share/pkgs/sudoers/centos/opennebula | 1 + share/pkgs/sudoers/debian/opennebula | 1 + share/pkgs/sudoers/opennebula-node-kvm | 2 +- share/sudoers/sudoers.rb | 5 +- src/host/HostSharePCI.cc | 9 +- src/im_mad/remotes/node-probes.d/pci.conf | 5 + src/im_mad/remotes/node-probes.d/pci.rb | 31 +++++- src/vmm/LibVirtDriverKVM.cc | 56 ++++++---- src/vmm_mad/remotes/kvm/cancel | 6 + src/vmm_mad/remotes/kvm/deploy | 3 + src/vmm_mad/remotes/kvm/shutdown | 6 + src/vmm_mad/remotes/kvm/vgpu | 130 ++++++++++++++++++++++ 13 files changed, 229 insertions(+), 29 deletions(-) create mode 100755 src/vmm_mad/remotes/kvm/vgpu diff --git a/install.sh b/install.sh index 9479a13870..57d3dc1c34 100755 --- a/install.sh +++ b/install.sh @@ -1043,7 +1043,8 @@ RUBY_AUTH_LIB_FILES="src/authm_mad/remotes/ssh/ssh_auth.rb \ MAD_SH_LIB_FILES="src/mad/sh/scripts_common.sh \ src/mad/sh/create_container_image.sh \ - src/mad/sh/create_docker_image.sh" + src/mad/sh/create_docker_image.sh \ + src/vmm_mad/remotes/kvm/vgpu" MAD_RUBY_LIB_FILES="src/mad/ruby/scripts_common.rb" diff --git a/share/pkgs/sudoers/centos/opennebula b/share/pkgs/sudoers/centos/opennebula index e553ce68d6..0bcf51416f 100644 --- a/share/pkgs/sudoers/centos/opennebula +++ b/share/pkgs/sudoers/centos/opennebula @@ -10,6 +10,7 @@ Cmnd_Alias ONE_MARKET = /usr/lib/one/sh/create_container_image.sh, /usr/lib/one/ Cmnd_Alias ONE_NET = /usr/sbin/ebtables, /usr/sbin/iptables, /usr/sbin/ip6tables, /usr/sbin/ipset, /usr/sbin/ip link *, /usr/sbin/ip tuntap *, /usr/sbin/ip route *, /usr/sbin/ip neighbour * Cmnd_Alias ONE_OVS = /usr/bin/ovs-ofctl, /usr/bin/ovs-vsctl Cmnd_Alias ONE_MEM = /usr/sbin/sysctl vm.drop_caches=3 vm.compact_memory=1 +Cmnd_Alias ONE_VGPU = /usr/lib/one/sh/vgpu ## Command aliases are enabled individually in dedicated ## sudoers files by each OpenNebula component (server, node). diff --git a/share/pkgs/sudoers/debian/opennebula b/share/pkgs/sudoers/debian/opennebula index afc634c6c9..4071c57433 100644 --- a/share/pkgs/sudoers/debian/opennebula +++ b/share/pkgs/sudoers/debian/opennebula @@ -11,6 +11,7 @@ Cmnd_Alias ONE_MARKET = /usr/lib/one/sh/create_container_image.sh, /usr/lib/one/ Cmnd_Alias ONE_NET = /sbin/ebtables, /sbin/iptables, /sbin/ip6tables, /sbin/ipset, /sbin/ip link *, /sbin/ip tuntap *, /sbin/ip route *, /sbin/ip neighbour * Cmnd_Alias ONE_OVS = /usr/bin/ovs-ofctl, /usr/bin/ovs-vsctl Cmnd_Alias ONE_MEM = /sbin/sysctl vm.drop_caches=3 vm.compact_memory=1 +Cmnd_Alias ONE_VGPU = /usr/lib/one/sh/vgpu ## Command aliases are enabled individually in dedicated ## sudoers files by each OpenNebula component (server, node). diff --git a/share/pkgs/sudoers/opennebula-node-kvm b/share/pkgs/sudoers/opennebula-node-kvm index 01bed72a8d..86f9b016ce 100644 --- a/share/pkgs/sudoers/opennebula-node-kvm +++ b/share/pkgs/sudoers/opennebula-node-kvm @@ -1 +1 @@ -oneadmin ALL=(ALL:ALL) NOPASSWD: ONE_CEPH, ONE_NET, ONE_OVS, ONE_LVM, ONE_MEM +oneadmin ALL=(ALL:ALL) NOPASSWD: ONE_CEPH, ONE_NET, ONE_OVS, ONE_LVM, ONE_MEM, ONE_VGPU diff --git a/share/sudoers/sudoers.rb b/share/sudoers/sudoers.rb index 8a81495a67..a45a95c6bb 100644 --- a/share/sudoers/sudoers.rb +++ b/share/sudoers/sudoers.rb @@ -17,7 +17,7 @@ # Holds configuration about sudoers requirements for OpeNebula class Sudoers - NODECMDS = [:NET, :OVS, :LVM, :LXD, :MEM] + NODECMDS = [:NET, :OVS, :LVM, :LXD, :MEM, :VGPU] attr_accessor :cmds @@ -72,7 +72,8 @@ class Sudoers lxc-create lxc-destroy lxc-info lxc-ls lxc-start lxc-stop lxc-console e2fsck resize2fs xfs_growfs rbd-nbd ], - :MEM => ['sysctl vm.drop_caches=3 vm.compact_memory=1'] + :MEM => ['sysctl vm.drop_caches=3 vm.compact_memory=1'], + :VGPU => %w[/usr/lib/one/sh/vgpu] } end diff --git a/src/host/HostSharePCI.cc b/src/host/HostSharePCI.cc index a2be1cf053..1e57621c76 100644 --- a/src/host/HostSharePCI.cc +++ b/src/host/HostSharePCI.cc @@ -130,7 +130,7 @@ bool HostSharePCI::test(const vector &devs) const void HostSharePCI::add(vector &devs, int vmid) { unsigned int vendor_id, device_id, class_id; - string address; + string address, uuid; int vendor_rc, device_rc, class_rc, addr_rc; for (auto device : devs) @@ -172,6 +172,13 @@ void HostSharePCI::add(vector &devs, int vmid) device->replace("NUMA_NODE", node); } + uuid = dev->attrs->vector_value("UUID"); + + if ( !uuid.empty() ) + { + device->replace("UUID", uuid); + } + break; } } diff --git a/src/im_mad/remotes/node-probes.d/pci.conf b/src/im_mad/remotes/node-probes.d/pci.conf index 63f0a65ac6..3272045b50 100644 --- a/src/im_mad/remotes/node-probes.d/pci.conf +++ b/src/im_mad/remotes/node-probes.d/pci.conf @@ -64,3 +64,8 @@ # - '^MegaRAID' # :device_name: [] + +# List of NVIDIA vendor IDs, these are used to recognize PCI devices from +# NVIDIA and use vGPU feature +:nvidia_vendors: + - '10de' diff --git a/src/im_mad/remotes/node-probes.d/pci.rb b/src/im_mad/remotes/node-probes.d/pci.rb index 983759578a..46ddde3858 100755 --- a/src/im_mad/remotes/node-probes.d/pci.rb +++ b/src/im_mad/remotes/node-probes.d/pci.rb @@ -25,9 +25,10 @@ begin NAME = File.join(__dir__, "../../../../etc/im/#{ETC_NAME}/pci.conf") CONF = { - :filter => '0:0', - :short_address => [], - :device_name => [] + :filter => '0:0', + :short_address => [], + :device_name => [], + :nvidia_vendors => ['10de'] }.merge(YAML.load_file(NAME)) rescue StandardError STDERR.puts "Invalid configuration #{NAME}" @@ -110,13 +111,18 @@ devices.each do |dev| next if matched != true end + # The main device cannot be used, skip it + if CONF[:nvidia_vendors].include?(dev[:vendor]) && + `ls /sys/class/mdev_bus | grep #{dev[:short_address]}`.empty? + next + end + puts 'PCI = [' values = [ pval('TYPE', dev[:type]), pval('VENDOR', dev[:vendor]), pval('VENDOR_NAME', dev[:vendor_name]), pval('DEVICE', dev[:device]), - pval('DEVICE_NAME', dev[:device_name]), pval('CLASS', dev[:class]), pval('CLASS_NAME', dev[:class_name]), pval('ADDRESS', dev[:address]), @@ -128,6 +134,23 @@ devices.each do |dev| pval('NUMA_NODE', dev[:numa_node]) ] + # NVIDIA device + # + # The uuid is based on the address to get always the same + if CONF[:nvidia_vendors].include?(dev[:vendor]) + values << pval( + 'UUID', + `uuidgen --name '#{dev[:address]}' --namespace '@x500' --sha1`.strip + ) + + # When having vGPU the name is always Device, so we merge it with vendor + # name, in this way Sunstone shows a better name + values << pval('DEVICE_NAME', + "#{dev[:vendor_name]} #{dev[:device_name]}") + else + values << pval('DEVICE_NAME', dev[:device_name]) + end + puts values.join(",\n") puts ']' end diff --git a/src/vmm/LibVirtDriverKVM.cc b/src/vmm/LibVirtDriverKVM.cc index ee6cb0dcc5..1e8b9d427f 100644 --- a/src/vmm/LibVirtDriverKVM.cc +++ b/src/vmm/LibVirtDriverKVM.cc @@ -602,6 +602,8 @@ int LibVirtDriver::deployment_description_kvm( string vm_slot = ""; string vm_func = ""; + string uuid = ""; + bool pae = false; bool acpi = false; bool apic = false; @@ -796,7 +798,7 @@ int LibVirtDriver::deployment_description_kvm( bool boot_secure = false; string firmware; - + get_attribute(vm, nullptr, nullptr, "OS", "FIRMWARE", firmware); bool is_uefi = !firmware.empty() && !one_util::icasecmp(firmware, "BIOS"); @@ -1788,6 +1790,8 @@ int LibVirtDriver::deployment_description_kvm( vm_slot = pci[i]->vector_value("VM_SLOT"); vm_func = pci[i]->vector_value("VM_FUNCTION"); + uuid = pci[i]->vector_value("UUID"); + if ( domain.empty() || bus.empty() || slot.empty() || func.empty() ) { vm->log("VMM", Log::WARNING, @@ -1796,26 +1800,38 @@ int LibVirtDriver::deployment_description_kvm( continue; } - file << "\t\t\n"; - - file << "\t\t\t\n"; - file << "\t\t\t\t
\n"; - file << "\t\t\t\n"; - - if ( !vm_domain.empty() && !vm_bus.empty() && !vm_slot.empty() && - !vm_func.empty() ) + if ( !uuid.empty() ) { - file << "\t\t\t\t
\n"; + file << "\t\t\n"; + file << "\t\t\t\n"; + file << "\t\t\t\t
\n"; + file << "\t\t\t\n"; + } + else + { + file << "\t\t\n"; + + file << "\t\t\t\n"; + file << "\t\t\t\t
\n"; + file << "\t\t\t\n"; + + if ( !vm_domain.empty() && !vm_bus.empty() && !vm_slot.empty() && + !vm_func.empty() ) + { + file << "\t\t\t\t
\n"; + } } file << "\t\t" << endl; diff --git a/src/vmm_mad/remotes/kvm/cancel b/src/vmm_mad/remotes/kvm/cancel index e9711d0830..65841fd1e1 100755 --- a/src/vmm_mad/remotes/kvm/cancel +++ b/src/vmm_mad/remotes/kvm/cancel @@ -25,10 +25,16 @@ TIMEOUT=60 function destroy_and_monitor { + # Get datastore path to get vm.xml + DATASTORE="$(/usr/lib/one/sh/vgpu "DATASTORE" "$deploy_id" "$(dirname "$0")")" + virsh --connect $LIBVIRT_URI --readonly dominfo $deploy_id > /dev/null 2>&1 || return 0 virsh --connect $LIBVIRT_URI destroy $deploy_id + # Destroy vGPU + sudo /usr/lib/one/sh/vgpu "DELETE" "$DATASTORE/vm.xml" "$(dirname "$0")" + virsh --connect $LIBVIRT_URI --readonly dominfo $deploy_id > /dev/null 2>&1 [ "x$?" != "x0" ] } diff --git a/src/vmm_mad/remotes/kvm/deploy b/src/vmm_mad/remotes/kvm/deploy index d4ccee16f9..ebf815be3f 100755 --- a/src/vmm_mad/remotes/kvm/deploy +++ b/src/vmm_mad/remotes/kvm/deploy @@ -36,6 +36,9 @@ if [ -n "${nvram}" ]; then cp -n "${OVMF_NVRAM}" "${nvram}" fi +# Create vGPU following NVIDIA official guide: https://docs.nvidia.com/grid/latest/pdf/grid-vgpu-user-guide.pdf +sudo /usr/lib/one/sh/vgpu "CREATE" "$DEP_FILE_LOCATION/vm.xml" "$(dirname "$0")" + DATA=`virsh --connect $LIBVIRT_URI create $DEP_FILE` if [ "x$?" = "x0" ]; then diff --git a/src/vmm_mad/remotes/kvm/shutdown b/src/vmm_mad/remotes/kvm/shutdown index af163cea02..29a5b54b61 100755 --- a/src/vmm_mad/remotes/kvm/shutdown +++ b/src/vmm_mad/remotes/kvm/shutdown @@ -23,6 +23,9 @@ count=0 deploy_id=$1 +# Get datastore path to get vm.xml +DATASTORE="$(/usr/lib/one/sh/vgpu "DATASTORE" "$deploy_id" "$(dirname "$0")")" + shutdown_command="virsh --connect $LIBVIRT_URI shutdown $deploy_id" # Check if the domain is already shutdown @@ -70,6 +73,9 @@ retry $TIMEOUT monitor force_shutdown "$deploy_id" \ "virsh --connect $LIBVIRT_URI destroy $deploy_id" +# Destroy vGPU +sudo /usr/lib/one/sh/vgpu "DELETE" "$DATASTORE/vm.xml" "$(dirname "$0")" + # Compact memory if [ "x$CLEANUP_MEMORY_ON_STOP" = "xyes" ]; then sudo -n sysctl vm.drop_caches=3 vm.compact_memory=1 &>/dev/null & diff --git a/src/vmm_mad/remotes/kvm/vgpu b/src/vmm_mad/remotes/kvm/vgpu new file mode 100755 index 0000000000..d4325b736f --- /dev/null +++ b/src/vmm_mad/remotes/kvm/vgpu @@ -0,0 +1,130 @@ +#!/bin/bash + +# -------------------------------------------------------------------------- # +# Copyright 2002-2022, OpenNebula Project, OpenNebula Systems # +# # +# Licensed under the Apache License, Version 2.0 (the "License"); you may # +# not use this file except in compliance with the License. You may obtain # +# a copy of the License at # +# # +# http://www.apache.org/licenses/LICENSE-2.0 # +# # +# Unless required by applicable law or agreed to in writing, software # +# distributed under the License is distributed on an "AS IS" BASIS, # +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # +# See the License for the specific language governing permissions and # +# limitations under the License. # +#--------------------------------------------------------------------------- # + +# ------------------------------------------------------------------------------ +# HELPERS +# ------------------------------------------------------------------------------ + +# Get PCI devices UUID +function get_uuids() { + uuids="$(xmllint --format --xpath '/VM/TEMPLATE/PCI/UUID/text()' "$1" 2>/dev/null)" + echo "$uuids" | sed -e 's///g' +} + +# Get value from XML and remove CDATA part +function get_xpath_val() { + echo "$1" | xmllint --format --xpath "$2/text()" - | sed -e 's///g' +} + +# Get mdev path used to (de)activate mediated device +function get_mdev_path() { + pci="$(xmllint --format --xpath "/VM/TEMPLATE/PCI[UUID='$1']" "$2" 2>/dev/null)" + + # Get specific information about the PCI + domain=$(get_xpath_val "$pci" "/PCI/DOMAIN") + bus=$(get_xpath_val "$pci" "/PCI/BUS") + slot=$(get_xpath_val "$pci" "/PCI/SLOT") + func=$(get_xpath_val "$pci" "/PCI/FUNCTION") + + # Generate mdev path + mdev="/sys/class/mdev_bus/$domain:$bus:$slot.$func" + + if [[ ! -d $mdev ]] + then + error_message "Directory '$mdev' does not exist" + exit 1 + fi + + # TODO: give the user the ability to choose this + device="$(ls "$mdev/mdev_supported_types" | head -n1)" + + echo "$mdev/mdev_supported_types/$device/" +} + +# ------------------------------------------------------------------------------ +# ------------------------------------------------------------------------------ + +ACTION=${1,,} + +# create -> vm.xml path +# delete -> vm.xml path +# datastore -> vm deploy ID +VM="$2" + +# Variables from driver +DRIVER_PATH="$3" + +XPATH="${DRIVER_PATH}/../../datastore/xpath.rb --stdin" + +source "$DRIVER_PATH/../../etc/vmm/kvm/kvmrc" +source "$DRIVER_PATH/../../scripts_common.sh" + +case "$ACTION" in + "create") + uuids="$(get_uuids "$VM")" + + if [ -n "$uuids" ]; then + for uuid in $uuids; do + mdev="$(get_mdev_path "$uuid" "$VM")" + + if ! echo "$uuid" > "$mdev/create"; then + error_message "Error creating mediated device" + exit 1 + fi + done + fi + ;; + "delete") + uuids="$(get_uuids "$VM")" + + if [ -n "$uuids" ]; then + for uuid in $uuids; do + mdev="$(get_mdev_path "$uuid" "$VM")" + + if ! echo "1" > "$mdev/devices/$uuid/remove"; then + error_message "Error removing mediated device" + # Not exit with error, just log the error + # exit -1 + fi + done + fi + ;; + "datastore") + METADATA_XML="$(virsh --connect "$LIBVIRT_URI" metadata "$VM" "$LIBVIRT_MD_URI" "$LIBVIRT_MD_KEY")" + + unset i XPATH_ELEMENTS + + while IFS= read -r -d '' element; do + XPATH_ELEMENTS[i++]="$element" + done < <(echo "$METADATA_XML" | $XPATH /vm/system_datastore/) + + unset i + + DATASTORE_PATH="${XPATH_ELEMENTS[i++]}" + + if [ -z "$DATASTORE_PATH" ]; then + error_message "Datastore path not found" + exit 1 + fi + + echo "$DATASTORE_PATH" + ;; + *) + error_message "Unsupported action '$ACTION'" + exit 1 +esac