1
0
mirror of https://github.com/dkmstr/openuds.git synced 2025-01-18 06:03:54 +03:00

Refactoring Proxmox

This commit is contained in:
Adolfo Gómez García 2024-07-03 19:15:59 +02:00
parent 65d7e81263
commit e62e9875da
No known key found for this signature in database
GPG Key ID: DD1ABF20724CDA23
13 changed files with 77 additions and 58 deletions

View File

@ -139,10 +139,10 @@ class TestProxmoxProvider(UDSTransactionTestCase):
self.assertEqual(provider.list_machines(), fixtures.VMS_INFO)
api.list_machines.assert_called_once_with(force=False)
self.assertEqual(provider.get_machine_info(1), fixtures.VMS_INFO[0])
self.assertEqual(provider.get_vm_info(1), fixtures.VMS_INFO[0])
api.get_machine_pool_info.assert_called_once_with(1, None, force=True)
self.assertEqual(provider.get_machine_configuration(1), fixtures.VMS_CONFIGURATION[0])
self.assertEqual(provider.get_vm_config(1), fixtures.VMS_CONFIGURATION[0])
api.get_machine_configuration.assert_called_once_with(1, force=True)
self.assertEqual(
@ -209,7 +209,7 @@ class TestProxmoxProvider(UDSTransactionTestCase):
api.convert_to_template.assert_called_once_with(1)
self.assertEqual(
provider.clone_machine(1, 'name', 'description', True, 'node', 'storage', 'pool', True),
provider.clone_vm(1, 'name', 'description', True, 'node', 'storage', 'pool', True),
fixtures.VM_CREATION_RESULT,
)
api.clone_machine.assert_called_once_with(
@ -300,7 +300,7 @@ class TestProxmoxProvider(UDSTransactionTestCase):
# Patch get_provider to return the ProxmoxProvider instance (provider)
with mock.patch('uds.services.Proxmox.helpers.get_provider', return_value=provider):
# Test get_storage
vm_info = provider.get_machine_info(1)
vm_info = provider.get_vm_info(1)
h_storage = get_storage({'prov_uuid': 'test', 'machine': '1'})
self.assertEqual(
list(

View File

@ -66,7 +66,7 @@ class TestProxmoxFixedService(UDSTransactionTestCase):
with fixtures.patched_provider() as provider:
service = fixtures.create_service_fixed(provider=provider)
self.assertEqual(service.get_machine_info(2).name, fixtures.VMS_INFO[1].name)
self.assertEqual(service.get_vm_info(2).name, fixtures.VMS_INFO[1].name)
# is_available is already tested, so we will skip it

View File

@ -109,7 +109,7 @@ class TestProxmovLinkedService(UDSTestCase):
)
# Get machine info
self.assertEqual(service.get_machine_info(1), fixtures.VMS_INFO[0])
self.assertEqual(service.get_vm_info(1), fixtures.VMS_INFO[0])
api.get_machine_pool_info.assert_called_with(1, service.pool.value, force=True)
# Get nic mac
@ -119,7 +119,7 @@ class TestProxmovLinkedService(UDSTestCase):
self.assertEqual(service.provider().remove_machine(1), fixtures.UPID)
# Enable HA
service.enable_machine_ha(1, True)
service.enable_vm_ha(1, True)
api.enable_machine_ha.assert_called_with(1, True, service.ha.value)
def test_service_methods_2(self) -> None:
@ -128,7 +128,7 @@ class TestProxmovLinkedService(UDSTestCase):
service = fixtures.create_service_linked(provider=provider)
# Disable HA
service.disable_machine_ha(1)
service.disable_vm_ha(1)
api.disable_machine_ha.assert_called_with(1)
# Get basename

View File

@ -1,5 +1,5 @@
#
# Copyright (c) 2023 Virtual Cable S.L.U.
# Copyright (c) 2023-2024 Virtual Cable S.L.U.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification,
@ -102,7 +102,7 @@ class ServerManager(metaclass=singleton.Singleton):
counters[uuid] = counters.get(uuid, 0) + 1
def get_server_stats(
self, serversFltr: 'QuerySet[models.Server]'
self, severs_filter: 'QuerySet[models.Server]'
) -> list[tuple[typing.Optional['types.servers.ServerStats'], 'models.Server']]:
"""
Returns a list of stats for a list of servers
@ -120,7 +120,7 @@ class ServerManager(metaclass=singleton.Singleton):
# Retrieve, in parallel, stats for all servers (not restrained)
with ThreadPoolExecutor(max_workers=10) as executor:
for server in serversFltr.select_for_update():
for server in severs_filter.select_for_update():
if server.is_restrained():
continue # Skip restrained servers
executor.submit(_retrieve_stats, server)
@ -134,6 +134,7 @@ class ServerManager(metaclass=singleton.Singleton):
now: datetime.datetime,
min_memory_mb: int = 0,
excluded_servers_uuids: typing.Optional[typing.Set[str]] = None,
weight_threshold: int = 0, # If not 0, server with weight below and nearer to this value will be selected
) -> tuple['models.Server', 'types.servers.ServerStats']:
"""
Finds the best server for a service
@ -145,19 +146,31 @@ class ServerManager(metaclass=singleton.Singleton):
if excluded_servers_uuids:
fltrs = fltrs.exclude(uuid__in=excluded_servers_uuids)
serversStats = self.get_server_stats(fltrs)
stats_and_servers = self.get_server_stats(fltrs)
def _weight_threshold(stats: 'types.servers.ServerStats') -> float:
if weight_threshold == 0:
return stats.weight()
# Values under the threshold are better; weight is between 0 and 1, lower is better
# For values over the threshold, we add 1, so they are always worse than any value under the threshold
return stats.weight() if stats.weight() < weight_threshold else 1 + stats.weight()
# Now, cachedStats has a list of tuples (stats, server), use it to find the best server
for stats, server in serversStats:
for stats, server in stats_and_servers:
if stats is None:
unmanaged_list.append(server)
continue
if min_memory_mb and stats.memused // (1024 * 1024) < min_memory_mb: # Stats has minMemory in bytes
continue
if best is None or stats.weight() < best[1].weight():
if best is None:
best = (server, stats)
if _weight_threshold(stats) < _weight_threshold(best[1]):
best = (server, stats)
# stats.weight() < best[1].weight()
# Cannot be assigned to any server!!
# If no best, select one from unmanaged
if best is None:
@ -226,7 +239,9 @@ class ServerManager(metaclass=singleton.Singleton):
# If server is forced, and server is part of the group, use it
if server:
if (
server.groups.filter(uuid=server_group.uuid).exclude(uuid__in=excluded_servers_uuids).count()
server.groups.filter(uuid=server_group.uuid)
.exclude(uuid__in=excluded_servers_uuids)
.count()
== 0
):
raise exceptions.UDSException(_('Server is not part of the group'))
@ -319,9 +334,9 @@ class ServerManager(metaclass=singleton.Singleton):
resetCounter = False
# ServerCounterType
serverCounter: typing.Optional[
types.servers.ServerCounter
] = types.servers.ServerCounter.from_iterable(props.get(prop_name))
serverCounter: typing.Optional[types.servers.ServerCounter] = (
types.servers.ServerCounter.from_iterable(props.get(prop_name))
)
# If no cached value, get server assignation
if serverCounter is None:
return types.servers.ServerCounter.null()

View File

@ -41,6 +41,7 @@ from uds.core.util import ensure, singleton
IP_SUBTYPE: typing.Final[str] = 'ip'
class ServerType(enum.IntEnum):
TUNNEL = 1
ACTOR = 2
@ -148,10 +149,12 @@ class ServerStats:
@property
def cpufree_ratio(self) -> float:
# Returns a value between 0 and 1, with 1 being the best value (no cpu used per user) and 0 the worst
return (1 - self.cpuused) / (self.current_users + 1)
@property
def memfree_ratio(self) -> float:
# Returns a value between 0 and 1, with 1 being the best value (no memory used per user) and 0 the worst
return (self.memtotal - self.memused) / (self.memtotal or 1) / (self.current_users + 1)
@property
@ -169,17 +172,23 @@ class ServerStats:
return self.stamp > sql_stamp() - consts.cache.DEFAULT_CACHE_TIMEOUT
def weight(self, minMemory: int = 0) -> float:
def weight(self, min_memory: int = 0) -> float:
# Weights are calculated as:
# 0.5 * cpu_usage + 0.5 * (1 - mem_free / mem_total) / (current_users + 1)
# +1 is because this weights the connection of current users + new user
# Dividing by number of users + 1 gives us a "ratio" of available resources per user when a new user is added
# Also note that +512 forces that if mem_free is less than 512 MB, this server will be put at the end of the list
if self.memtotal - self.memused < minMemory:
if self.memtotal - self.memused < min_memory:
return 1000000000 # At the end of the list
# Lower is better
return 1 / ((self.cpufree_ratio * 1.3 + self.memfree_ratio) or 1)
# value can be between:
# (1 / (1 * 1.3 + 1) - 0.434) * 1.76 ~= 0.0 (worst case, no memory, all cpu)
# and
# (1 / ((0 * 1.3 + 0) or 1) - 0.434) * 1.76 = 0.566 * 1.76 ~= 1.0 (best case, all memory, no cpu)
w = (1 / ((self.cpufree_ratio * 1.3 + self.memfree_ratio) or 1) - 0.434) * 1.76
return min(max(0.0, w), 1.0)
def adjust(self, users_increment: int) -> 'ServerStats':
"""
@ -250,6 +259,7 @@ class ServerStats:
# Human readable
return f'memory: {self.memused//(1024*1024)}/{self.memtotal//(1024*1024)} cpu: {self.cpuused*100} users: {self.current_users}, weight: {self.weight()}, valid: {self.is_valid}'
# ServerCounter must be serializable by json, so
# we keep it as a NamedTuple instead of a dataclass
class ServerCounter(typing.NamedTuple):
@ -257,10 +267,12 @@ class ServerCounter(typing.NamedTuple):
counter: int
@staticmethod
def from_iterable(data: typing.Optional[collections.abc.Iterable[typing.Any]]) -> typing.Optional['ServerCounter']:
def from_iterable(
data: typing.Optional[collections.abc.Iterable[typing.Any]],
) -> typing.Optional['ServerCounter']:
if data is None:
return None
return ServerCounter(*ensure.as_iterable(data))
@staticmethod

View File

@ -77,7 +77,7 @@ class ProxmoxUserServiceFixed(FixedUserService, autoserializable.AutoSerializabl
return types.states.TaskState.FINISHED
try:
vminfo = self.service().get_machine_info(int(self._vmid))
vminfo = self.service().get_vm_info(int(self._vmid))
except prox_exceptions.ProxmoxConnectionError:
raise # If connection fails, let it fail on parent
except Exception as e:
@ -104,7 +104,7 @@ class ProxmoxUserServiceFixed(FixedUserService, autoserializable.AutoSerializabl
def op_start(self) -> None:
try:
vminfo = self.service().get_machine_info(int(self._vmid))
vminfo = self.service().get_vm_info(int(self._vmid))
except prox_exceptions.ProxmoxConnectionError:
self.retry_later()
return

View File

@ -201,7 +201,7 @@ class ProxmoxUserserviceLinked(DynamicUserService):
# Set mac
try:
# Note: service will only enable ha if it is configured to do so
self.service().enable_machine_ha(int(self._vmid), True) # Enable HA before continuing here
self.service().enable_vm_ha(int(self._vmid), True) # Enable HA before continuing here
# Set vm mac address now on first interface
self.service().provider().set_machine_mac(int(self._vmid), self.get_unique_id())

View File

@ -52,7 +52,7 @@ def get_storage(parameters: typing.Any) -> types.ui.CallbackResultType:
# Obtains datacenter from cluster
try:
vm_info = provider.get_machine_info(int(parameters['machine']))
vm_info = provider.get_vm_info(int(parameters['machine']))
except Exception:
return []

View File

@ -134,7 +134,7 @@ class ProxmoxDeferredRemoval(jobs.Job):
# The soft shutdown has already been initiated by the remove method
try:
vmInfo = instance.get_machine_info(vmid)
vmInfo = instance.get_vm_info(vmid)
logger.debug('Found %s for removal %s', vmid, data)
# If machine is powered on, tries to stop it
# tries to remove in sync mode

View File

@ -169,10 +169,10 @@ class ProxmoxProvider(services.ServiceProvider):
def list_machines(self, force: bool = False) -> list[prox_types.VMInfo]:
return self._api().list_machines(force=force)
def get_machine_info(self, vmid: int, poolid: typing.Optional[str] = None) -> prox_types.VMInfo:
def get_vm_info(self, vmid: int, poolid: typing.Optional[str] = None) -> prox_types.VMInfo:
return self._api().get_machine_pool_info(vmid, poolid, force=True)
def get_machine_configuration(self, vmid: int) -> prox_types.VMConfiguration:
def get_vm_config(self, vmid: int) -> prox_types.VMConfiguration:
return self._api().get_machine_configuration(vmid, force=True)
def get_storage_info(self, storageid: str, node: str, force: bool = False) -> prox_types.StorageInfo:
@ -194,7 +194,7 @@ class ProxmoxProvider(services.ServiceProvider):
def create_template(self, vmid: int) -> None:
self._api().convert_to_template(vmid)
def clone_machine(
def clone_vm(
self,
vmid: int,
name: str,

View File

@ -118,7 +118,7 @@ class ProxmoxPublication(DynamicPublication, autoserializable.AutoSerializable):
self.service().provider().set_protection(int(self._vmid), protection=False)
time.sleep(0.5) # Give some time to Proxmox. We have observed some concurrency issues
# And add it to HA if needed (decided by service configuration)
self.service().enable_machine_ha(int(self._vmid))
self.service().enable_vm_ha(int(self._vmid))
# Wait a bit, if too fast, proxmox fails.. (Have not tested on 8.x, but previous versions failed if too fast..)
time.sleep(0.5)
# Mark vm as template

View File

@ -113,8 +113,8 @@ class ProxmoxServiceFixed(FixedService): # pylint: disable=too-many-public-meth
def provider(self) -> 'ProxmoxProvider':
return typing.cast('ProxmoxProvider', super().provider())
def get_machine_info(self, vmId: int) -> 'prox_types.VMInfo':
return self.provider().get_machine_info(vmId, self.pool.value.strip())
def get_vm_info(self, vmId: int) -> 'prox_types.VMInfo':
return self.provider().get_vm_info(vmId, self.pool.value.strip())
def is_avaliable(self) -> bool:
return self.provider().is_available()
@ -185,7 +185,7 @@ class ProxmoxServiceFixed(FixedService): # pylint: disable=too-many-public-meth
if checking_vmid not in assigned_vms: # Not already assigned
try:
# Invoke to check it exists, do not need to store the result
self.provider().get_machine_info(int(checking_vmid), self.pool.value.strip())
self.provider().get_vm_info(int(checking_vmid), self.pool.value.strip())
found_vmid = checking_vmid
break
except Exception: # Notifies on log, but skip it
@ -209,11 +209,11 @@ class ProxmoxServiceFixed(FixedService): # pylint: disable=too-many-public-meth
return str(found_vmid)
def get_mac(self, vmid: str) -> str:
config = self.provider().get_machine_configuration(int(vmid))
config = self.provider().get_vm_config(int(vmid))
return config.networks[0].mac.lower()
def get_ip(self, vmid: str) -> str:
return self.provider().get_guest_ip_address(int(vmid))
def get_name(self, vmid: str) -> str:
return self.provider().get_machine_info(int(vmid)).name or ''
return self.provider().get_vm_info(int(vmid)).name or ''

View File

@ -39,7 +39,7 @@ from uds.core.services.generics.dynamic.publication import DynamicPublication
from uds.core.services.generics.dynamic.service import DynamicService
from uds.core.services.generics.dynamic.userservice import DynamicUserService
from uds.core.ui import gui
from uds.core.util import validators, fields
from uds.core.util import validators
from . import helpers
from .deployment_linked import ProxmoxUserserviceLinked
@ -123,7 +123,7 @@ class ProxmoxServiceLinked(DynamicService):
readonly=True,
)
try_soft_shutdown = fields.soft_shutdown_field()
try_soft_shutdown = DynamicService.try_soft_shutdown
machine = gui.ChoiceField(
label=_("Base Machine"),
@ -211,7 +211,7 @@ class ProxmoxServiceLinked(DynamicService):
name = self.sanitized_name(name)
pool = self.pool.value or None
if vmid == -1: # vmId == -1 if cloning for template
return self.provider().clone_machine(
return self.provider().clone_vm(
self.machine.as_int(),
name,
description,
@ -220,7 +220,7 @@ class ProxmoxServiceLinked(DynamicService):
target_pool=pool,
)
return self.provider().clone_machine(
return self.provider().clone_vm(
vmid,
name,
description,
@ -230,17 +230,18 @@ class ProxmoxServiceLinked(DynamicService):
must_have_vgpus={'1': True, '2': False}.get(self.gpu.value, None),
)
def get_machine_info(self, vmid: int) -> 'prox_types.VMInfo':
return self.provider().get_machine_info(vmid, self.pool.value.strip())
def get_vm_info(self, vmid: int) -> 'prox_types.VMInfo':
return self.provider().get_vm_info(vmid, self.pool.value.strip())
def get_nic_mac(self, vmid: int) -> str:
config = self.provider().get_machine_configuration(vmid)
config = self.provider().get_vm_config(vmid)
return config.networks[0].mac.lower()
def xremove_machine(self, vmid: int) -> 'prox_types.UPID':
# TODO: Remove this method, kept for reference of old code
def _xremove_machine(self, vmid: int) -> 'prox_types.UPID':
# First, remove from HA if needed
try:
self.disable_machine_ha(vmid)
self.disable_vm_ha(vmid)
except Exception as e:
logger.warning('Exception disabling HA for vm %s: %s', vmid, e)
self.do_log(level=types.log.LogLevel.WARNING, message=f'Exception disabling HA for vm {vmid}: {e}')
@ -248,22 +249,16 @@ class ProxmoxServiceLinked(DynamicService):
# And remove it
return self.provider().remove_machine(vmid)
def enable_machine_ha(self, vmid: int, started: bool = False) -> None:
def enable_vm_ha(self, vmid: int, started: bool = False) -> None:
if self.ha.value == '__':
return
self.provider().enable_machine_ha(vmid, started, self.ha.value or None)
def disable_machine_ha(self, vmid: int) -> None:
def disable_vm_ha(self, vmid: int) -> None:
if self.ha.value == '__':
return
self.provider().disable_machine_ha(vmid)
def get_basename(self) -> str:
return self.basename.value
def get_lenname(self) -> int:
return int(self.lenname.value)
def get_macs_range(self) -> str:
"""
Returns the selected MAC range
@ -273,9 +268,6 @@ class ProxmoxServiceLinked(DynamicService):
def is_ha_enabled(self) -> bool:
return self.ha.value != '__'
def should_try_soft_shutdown(self) -> bool:
return self.try_soft_shutdown.as_bool()
def get_console_connection(self, vmid: str) -> typing.Optional[types.services.ConsoleConnectionInfo]:
return self.provider().get_console_connection(vmid)
@ -320,7 +312,7 @@ class ProxmoxServiceLinked(DynamicService):
def is_running(self, caller_instance: typing.Optional['DynamicUserService | DynamicPublication'], vmid: str) -> bool:
# Raise an exception if fails to get machine info
vminfo = self.get_machine_info(int(vmid))
vminfo = self.get_vm_info(int(vmid))
return vminfo.status != 'stopped'