Fixed Proxmox concurrency problem on vmid assignment

This commit is contained in:
Adolfo Gómez García 2021-07-06 12:39:22 +02:00
parent 31b513a7ef
commit 548b6e813d
5 changed files with 190 additions and 42 deletions

View File

@ -180,7 +180,7 @@ class ServiceProvider(Module):
val = self.maxPreparingServices = GlobalConfig.MAX_PREPARING_SERVICES.getInt(force=True) # Recover global an cache till restart val = self.maxPreparingServices = GlobalConfig.MAX_PREPARING_SERVICES.getInt(force=True) # Recover global an cache till restart
if isinstance(val, gui.InputField): if isinstance(val, gui.InputField):
retVal = val.value retVal = int(val.value)
else: else:
retVal = val retVal = val
return retVal if retVal > 0 else 1 return retVal if retVal > 0 else 1
@ -191,7 +191,7 @@ class ServiceProvider(Module):
val = self.maxRemovingServices = GlobalConfig.MAX_REMOVING_SERVICES.getInt(force=True) # Recover global an cache till restart val = self.maxRemovingServices = GlobalConfig.MAX_REMOVING_SERVICES.getInt(force=True) # Recover global an cache till restart
if isinstance(val, gui.InputField): if isinstance(val, gui.InputField):
retVal = val.value retVal = int(val.value)
else: else:
retVal = val retVal = val
return retVal if retVal > 0 else 1 return retVal if retVal > 0 else 1

View File

@ -28,8 +28,8 @@
from uds.core import managers from uds.core import managers
from .provider import ProxmoxProvider from .provider import ProxmoxProvider
from .jobs import ProxmoxDeferredRemoval from .jobs import ProxmoxDeferredRemoval, ProxmoxVmidReleaser
# Scheduled task to do clean processes # Scheduled task to do clean processes
for cls in (ProxmoxDeferredRemoval, ): for cls in (ProxmoxDeferredRemoval, ProxmoxVmidReleaser):
managers.taskManager().registerJob(cls) managers.taskManager().registerJob(cls)

View File

@ -256,6 +256,14 @@ class ProxmoxClient:
def getNextVMId(self) -> int: def getNextVMId(self) -> int:
return int(self._get('cluster/nextid')['data']) return int(self._get('cluster/nextid')['data'])
@ensureConected
def isVMIdAvailable(self, vmId: int) -> bool:
    """
    Tells whether the given vmid can be used for a new machine.

    Issues a GET on 'cluster/nextid' passing the candidate id as the
    'vmid' query parameter. The response payload is ignored; only the
    success or failure of the request matters.

    Args:
        vmId: Candidate machine id to check.

    Returns:
        True if the request completed without raising, False otherwise.

    Note:
        Every exception is mapped to False, so a failure unrelated to the
        id itself (for example a connection problem) is also reported as
        "not available".
        NOTE(review): presumably Proxmox rejects the request when the id is
        already taken -- confirm against the Proxmox API documentation.
    """
    try:
        self._get(f'cluster/nextid?vmid={vmId}')
    except Exception:  # Any failure means "not available"
        available = False
    else:
        available = True
    return available
@ensureConected @ensureConected
@allowCache( @allowCache(
'nodeNets', 'nodeNets',
@ -289,6 +297,7 @@ class ProxmoxClient:
def cloneVm( def cloneVm(
self, self,
vmId: int, vmId: int,
newVmId: int,
name: str, name: str,
description: typing.Optional[str], description: typing.Optional[str],
linkedClone: bool, linkedClone: bool,
@ -296,7 +305,6 @@ class ProxmoxClient:
toStorage: typing.Optional[str] = None, toStorage: typing.Optional[str] = None,
toPool: typing.Optional[str] = None, toPool: typing.Optional[str] = None,
) -> types.VmCreationResult: ) -> types.VmCreationResult:
newVmId = self.getNextVMId()
vmInfo = self.getVmInfo(vmId) vmInfo = self.getVmInfo(vmId)
fromNode = vmInfo.node fromNode = vmInfo.node

View File

@ -33,15 +33,17 @@ import typing
from uds.core import jobs from uds.core import jobs
from uds.models import Provider from uds.models import Provider, getSqlDatetimeAsUnix
from uds.core.util.unique_id_generator import UniqueIDGenerator
from . import provider from . import provider
from . import client from . import client
# Not imported at runtime, just for type checking MAX_VMID_LIFE_SECS = 365 * 24 * 60 * 60 * 3 # 3 years for "reseting"
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
class ProxmoxDeferredRemoval(jobs.Job): class ProxmoxDeferredRemoval(jobs.Job):
frecuency = 60 * 5 # Once every NN minutes frecuency = 60 * 5 # Once every NN minutes
friendly_name = 'Proxmox removal' friendly_name = 'Proxmox removal'
@ -49,55 +51,81 @@ class ProxmoxDeferredRemoval(jobs.Job):
@staticmethod @staticmethod
def remove(providerInstance: 'provider.ProxmoxProvider', vmId: int) -> None: def remove(providerInstance: 'provider.ProxmoxProvider', vmId: int) -> None:
logger.debug('Adding %s from %s to defeffed removal process', vmId, providerInstance) logger.debug(
'Adding %s from %s to defeffed removal process', vmId, providerInstance
)
ProxmoxDeferredRemoval.counter += 1 ProxmoxDeferredRemoval.counter += 1
try: try:
# First check state & stop machine if needed # First check state & stop machine if needed
vmInfo = providerInstance.getMachineInfo(vmId) vmInfo = providerInstance.getMachineInfo(vmId)
if vmInfo.status == 'running': if vmInfo.status == 'running':
# If running vm, simply stops it and wait for next # If running vm, simply stops it and wait for next
ProxmoxDeferredRemoval.waitForTaskFinish(providerInstance, providerInstance.stopMachine(vmId)) ProxmoxDeferredRemoval.waitForTaskFinish(
providerInstance, providerInstance.stopMachine(vmId)
)
ProxmoxDeferredRemoval.waitForTaskFinish(providerInstance, providerInstance.removeMachine(vmId)) ProxmoxDeferredRemoval.waitForTaskFinish(
providerInstance, providerInstance.removeMachine(vmId)
)
except client.ProxmoxNotFound: except client.ProxmoxNotFound:
return # Machine does not exists return # Machine does not exists
except Exception as e: except Exception as e:
providerInstance.storage.saveData('tr' + str(vmId), str(vmId), attr1='tRm') providerInstance.storage.saveData('tr' + str(vmId), str(vmId), attr1='tRm')
logger.info('Machine %s could not be removed right now, queued for later: %s', vmId, e) logger.info(
'Machine %s could not be removed right now, queued for later: %s',
vmId,
e,
)
@staticmethod @staticmethod
def waitForTaskFinish(providerInstance: 'provider.ProxmoxProvider', upid: 'client.types.UPID', maxWait: int = 30) -> bool: def waitForTaskFinish(
providerInstance: 'provider.ProxmoxProvider',
upid: 'client.types.UPID',
maxWait: int = 30,
) -> bool:
counter = 0 counter = 0
while providerInstance.getTaskInfo(upid.node, upid.upid).isRunning() and counter < maxWait: while (
providerInstance.getTaskInfo(upid.node, upid.upid).isRunning()
and counter < maxWait
):
time.sleep(0.3) time.sleep(0.3)
counter += 1 counter += 1
return counter < maxWait return counter < maxWait
def run(self) -> None: def run(self) -> None:
dbProvider: Provider dbProvider: Provider
# Look for Providers of type proxmox # Look for Providers of type proxmox
for dbProvider in Provider.objects.filter(maintenance_mode=False, data_type=provider.ProxmoxProvider.typeType): for dbProvider in Provider.objects.filter(
maintenance_mode=False, data_type=provider.ProxmoxProvider.typeType
):
logger.debug('Provider %s if os type proxmox', dbProvider) logger.debug('Provider %s if os type proxmox', dbProvider)
storage = dbProvider.getEnvironment().storage storage = dbProvider.getEnvironment().storage
instance: provider.ProxmoxProvider = typing.cast(provider.ProxmoxProvider, dbProvider.getInstance()) instance: provider.ProxmoxProvider = typing.cast(
provider.ProxmoxProvider, dbProvider.getInstance()
)
for i in storage.filter('tRm'): for i in storage.filter('tRm'):
vmId = int(i[1].decode()) vmId = int(i[1].decode())
try: try:
vmInfo = instance.getMachineInfo(vmId) vmInfo = instance.getMachineInfo(vmId)
logger.debug('Found %s for removal %s', vmId, i) logger.debug('Found %s for removal %s', vmId, i)
# If machine is powered on, tries to stop it # If machine is powered on, tries to stop it
# tries to remove in sync mode # tries to remove in sync mode
if vmInfo.status == 'running': if vmInfo.status == 'running':
ProxmoxDeferredRemoval.waitForTaskFinish(instance, instance.stopMachine(vmId)) ProxmoxDeferredRemoval.waitForTaskFinish(
instance, instance.stopMachine(vmId)
)
return return
if vmInfo.status == 'stopped': # Machine exists, try to remove it now if (
ProxmoxDeferredRemoval.waitForTaskFinish(instance, instance.removeMachine(vmId)) vmInfo.status == 'stopped'
): # Machine exists, try to remove it now
ProxmoxDeferredRemoval.waitForTaskFinish(
instance, instance.removeMachine(vmId)
)
# It this is reached, remove check # It this is reached, remove check
storage.remove('tr' + str(vmId)) storage.remove('tr' + str(vmId))
@ -108,3 +136,13 @@ class ProxmoxDeferredRemoval(jobs.Job):
logger.error('Delayed removal of %s failed: %s', i, e) logger.error('Delayed removal of %s failed: %s', i, e)
logger.debug('Deferred removal for proxmox finished') logger.debug('Deferred removal for proxmox finished')
class ProxmoxVmidReleaser(jobs.Job):
    """
    Scheduled job that releases old vmids held by the shared Proxmox
    unique-id generator.

    Ids older than MAX_VMID_LIFE_SECS (measured against the value returned
    by getSqlDatetimeAsUnix) are handed to the generator's releaseOlderThan.
    """

    frecuency = 60 * 60 * 24 * 30  # Once a month
    friendly_name = 'Proxmox maintenance'

    def run(self) -> None:
        """
        Releases every vmid whose stamp is older than the allowed lifetime.
        """
        logger.debug('Proxmox Vmid releader running')
        # Same generator identity ('vmid', 'proxmox', 'proxmox') that the
        # provider uses when assigning ids
        vmidGenerator = UniqueIDGenerator('vmid', 'proxmox', 'proxmox')
        oldestAllowedStamp = getSqlDatetimeAsUnix() - MAX_VMID_LIFE_SECS
        vmidGenerator.releaseOlderThan(oldestAllowedStamp)

View File

@ -32,9 +32,11 @@ import typing
from django.utils.translation import ugettext_noop as _ from django.utils.translation import ugettext_noop as _
from uds.models import getSqlDatetimeAsUnix
from uds.core import services from uds.core import services
from uds.core.ui import gui from uds.core.ui import gui
from uds.core.util import validators from uds.core.util import validators
from uds.core.util.unique_id_generator import UniqueIDGenerator
from .service import ProxmoxLinkedService from .service import ProxmoxLinkedService
@ -48,35 +50,114 @@ if typing.TYPE_CHECKING:
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
CACHE_TIME_FOR_SERVER = 1800 CACHE_TIME_FOR_SERVER = 1800
MAX_VM_ID = 999999999
class ProxmoxProvider(
class ProxmoxProvider(services.ServiceProvider): # pylint: disable=too-many-public-methods services.ServiceProvider
): # pylint: disable=too-many-public-methods
offers = [ProxmoxLinkedService] offers = [ProxmoxLinkedService]
typeName = _('Proxmox Platform Provider') typeName = _('Proxmox Platform Provider')
typeType = 'ProxmoxPlatform' typeType = 'ProxmoxPlatform'
typeDescription = _('Proxmox platform service provider') typeDescription = _('Proxmox platform service provider')
iconFile = 'provider.png' iconFile = 'provider.png'
host = gui.TextField(length=64, label=_('Host'), order=1, tooltip=_('Proxmox Server IP or Hostname'), required=True) host = gui.TextField(
port = gui.NumericField(lengh=5, label=_('Port'), order=2, tooltip=_('Proxmox API port (default is 8006)'), required=True, defvalue='8006') length=64,
label=_('Host'),
order=1,
tooltip=_('Proxmox Server IP or Hostname'),
required=True,
)
port = gui.NumericField(
lengh=5,
label=_('Port'),
order=2,
tooltip=_('Proxmox API port (default is 8006)'),
required=True,
defvalue='8006',
)
username = gui.TextField(length=32, label=_('Username'), order=3, tooltip=_('User with valid privileges on Proxmox, (use "user@authenticator" form)'), required=True, defvalue='root@pam') username = gui.TextField(
password = gui.PasswordField(lenth=32, label=_('Password'), order=4, tooltip=_('Password of the user of Proxmox'), required=True) length=32,
label=_('Username'),
order=3,
tooltip=_(
'User with valid privileges on Proxmox, (use "user@authenticator" form)'
),
required=True,
defvalue='root@pam',
)
password = gui.PasswordField(
lenth=32,
label=_('Password'),
order=4,
tooltip=_('Password of the user of Proxmox'),
required=True,
)
maxPreparingServices = gui.NumericField(length=3, label=_('Creation concurrency'), defvalue='10', minValue=1, maxValue=65536, order=50, tooltip=_('Maximum number of concurrently creating VMs'), required=True, tab=gui.ADVANCED_TAB) maxPreparingServices = gui.NumericField(
maxRemovingServices = gui.NumericField(length=3, label=_('Removal concurrency'), defvalue='5', minValue=1, maxValue=65536, order=51, tooltip=_('Maximum number of concurrently removing VMs'), required=True, tab=gui.ADVANCED_TAB) length=3,
label=_('Creation concurrency'),
defvalue='10',
minValue=1,
maxValue=65536,
order=50,
tooltip=_('Maximum number of concurrently creating VMs'),
required=True,
tab=gui.ADVANCED_TAB,
)
maxRemovingServices = gui.NumericField(
length=3,
label=_('Removal concurrency'),
defvalue='5',
minValue=1,
maxValue=65536,
order=51,
tooltip=_('Maximum number of concurrently removing VMs'),
required=True,
tab=gui.ADVANCED_TAB,
)
timeout = gui.NumericField(length=3, label=_('Timeout'), defvalue='20', order=90, tooltip=_('Timeout in seconds of connection to Proxmox'), required=True, tab=gui.ADVANCED_TAB) timeout = gui.NumericField(
length=3,
label=_('Timeout'),
defvalue='20',
order=90,
tooltip=_('Timeout in seconds of connection to Proxmox'),
required=True,
tab=gui.ADVANCED_TAB,
)
startVmId = gui.NumericField(
length=3,
label=_('Starting VmId'),
defvalue='10000',
minValue=10000,
maxValue=100000,
order=91,
tooltip=_('Starting machine id on proxmox'),
required=True,
tab=gui.ADVANCED_TAB,
)
# Own variables # Own variables
_api: typing.Optional[client.ProxmoxClient] = None _api: typing.Optional[client.ProxmoxClient] = None
_vmid_generator: UniqueIDGenerator
def __getApi(self) -> client.ProxmoxClient: def __getApi(self) -> client.ProxmoxClient:
""" """
Returns the connection API object Returns the connection API object
""" """
if self._api is None: if self._api is None:
self._api = client.ProxmoxClient(self.host.value, self.port.num(), self.username.value, self.password.value, self.timeout.num(), False, self.cache) self._api = client.ProxmoxClient(
self.host.value,
self.port.num(),
self.username.value,
self.password.value,
self.timeout.num(),
False,
self.cache,
)
return self._api return self._api
@ -88,6 +169,8 @@ class ProxmoxProvider(services.ServiceProvider): # pylint: disable=too-many-pub
# Just reset _api connection variable # Just reset _api connection variable
self._api = None self._api = None
# All proxmox use same UniqueId generator
self._vmid_generator = UniqueIDGenerator('vmid', 'proxmox', 'proxmox')
if values is not None: if values is not None:
self.timeout.value = validators.validateTimeout(self.timeout.value) self.timeout.value = validators.validateTimeout(self.timeout.value)
@ -107,16 +190,20 @@ class ProxmoxProvider(services.ServiceProvider): # pylint: disable=too-many-pub
def listMachines(self) -> typing.List[client.types.VMInfo]: def listMachines(self) -> typing.List[client.types.VMInfo]:
return self.__getApi().listVms() return self.__getApi().listVms()
def getMachineInfo(self, vmId: int, poolId: typing.Optional[str] = None) -> client.types.VMInfo: def getMachineInfo(
self, vmId: int, poolId: typing.Optional[str] = None
) -> client.types.VMInfo:
return self.__getApi().getVMPoolInfo(vmId, poolId, force=True) return self.__getApi().getVMPoolInfo(vmId, poolId, force=True)
def getMachineConfiguration(self, vmId: int) -> client.types.VMConfiguration: def getMachineConfiguration(self, vmId: int) -> client.types.VMConfiguration:
return self.__getApi().getVmConfiguration(vmId, force=True) return self.__getApi().getVmConfiguration(vmId, force=True)
def getStorageInfo(self, storageId: str, node: str) -> client.types.StorageInfo: def getStorageInfo(self, storageId: str, node: str) -> client.types.StorageInfo:
return self.__getApi().getStorage(storageId, node) return self.__getApi().getStorage(storageId, node)
def listStorages(self, node: typing.Optional[str]) -> typing.List[client.types.StorageInfo]: def listStorages(
self, node: typing.Optional[str]
) -> typing.List[client.types.StorageInfo]:
return self.__getApi().listStorages(node=node, content='images') return self.__getApi().listStorages(node=node, content='images')
def listPools(self) -> typing.List[client.types.PoolInfo]: def listPools(self) -> typing.List[client.types.PoolInfo]:
@ -133,11 +220,13 @@ class ProxmoxProvider(services.ServiceProvider): # pylint: disable=too-many-pub
linkedClone: bool, linkedClone: bool,
toNode: typing.Optional[str] = None, toNode: typing.Optional[str] = None,
toStorage: typing.Optional[str] = None, toStorage: typing.Optional[str] = None,
toPool: typing.Optional[str] = None toPool: typing.Optional[str] = None,
) -> client.types.VmCreationResult: ) -> client.types.VmCreationResult:
return self.__getApi().cloneVm(vmId, name, description, linkedClone, toNode, toStorage, toPool) return self.__getApi().cloneVm(
vmId, self.getNewVmId(), name, description, linkedClone, toNode, toStorage, toPool
)
def startMachine(self,vmId: int) -> client.types.UPID: def startMachine(self, vmId: int) -> client.types.UPID:
return self.__getApi().startVm(vmId) return self.__getApi().startVm(vmId)
def stopMachine(self, vmId: int) -> client.types.UPID: def stopMachine(self, vmId: int) -> client.types.UPID:
@ -158,22 +247,35 @@ class ProxmoxProvider(services.ServiceProvider): # pylint: disable=too-many-pub
def getTaskInfo(self, node: str, upid: str) -> client.types.TaskStatus: def getTaskInfo(self, node: str, upid: str) -> client.types.TaskStatus:
return self.__getApi().getTask(node, upid) return self.__getApi().getTask(node, upid)
def enableHA(self, vmId: int, started: bool = False, group: typing.Optional[str] = None) -> None: def enableHA(
self, vmId: int, started: bool = False, group: typing.Optional[str] = None
) -> None:
self.__getApi().enableVmHA(vmId, started, group) self.__getApi().enableVmHA(vmId, started, group)
def disableHA(self, vmId: int) -> None: def disableHA(self, vmId: int) -> None:
self.__getApi().disableVmHA(vmId) self.__getApi().disableVmHA(vmId)
def setProtection(self, vmId: int, node: typing.Optional[str] = None, protection: bool = False) -> None: def setProtection(
self.__getApi().setProtection(vmId, node, protection) self, vmId: int, node: typing.Optional[str] = None, protection: bool = False
) -> None:
self.__getApi().setProtection(vmId, node, protection)
def listHaGroups(self) -> typing.List[str]: def listHaGroups(self) -> typing.List[str]:
return self.__getApi().listHAGroups() return self.__getApi().listHAGroups()
def getConsoleConnection(self, machineId: str) -> typing.Optional[typing.MutableMapping[str, typing.Any]]: def getConsoleConnection(
self, machineId: str
) -> typing.Optional[typing.MutableMapping[str, typing.Any]]:
# TODO: maybe proxmox also supports "spice"? for future release... # TODO: maybe proxmox also supports "spice"? for future release...
return None return None
def getNewVmId(self) -> int:
    """
    Obtains a vmid that is reserved in UDS and also free on Proxmox.

    Candidates are taken from the shared unique-id generator, in the range
    that starts at the configured 'startVmId' and ends at MAX_VM_ID. Each
    candidate is checked against Proxmox, and the first one reported as
    available is returned.

    Returns:
        The first generated vmid that Proxmox reports as available.

    Note:
        Rejected candidates are not given back to the generator here. They
        stay unusable for UDS until they are released by age (see the vmid
        releaser job and its MAX_VMID_LIFE_SECS, 3 years in the jobs module).
        The loop has no upper bound: it keeps requesting ids until one is
        accepted.
    """
    candidate = self._vmid_generator.get(self.startVmId.num(), MAX_VM_ID)
    while not self.__getApi().isVMIdAvailable(candidate):
        # Candidate is not usable on Proxmox, ask the generator for another one
        candidate = self._vmid_generator.get(self.startVmId.num(), MAX_VM_ID)
    return candidate
@staticmethod @staticmethod
def test(env: 'Environment', data: 'Module.ValuesType') -> typing.List[typing.Any]: def test(env: 'Environment', data: 'Module.ValuesType') -> typing.List[typing.Any]:
""" """