lvmdbusd: Re-work error handling

Introduce an exception which is used for known existing issues with lvm. This is used to distinguish between errors between lvm itself and lvmdbusd. In the case of lvm bugs, when we simply retry the operation we will log very little. Otherwise, we will dump a full traceback for investigation when we do the retry.
2025-03-28 02:50:41 +03:00 · 2022-08-31 11:20:49 -05:00 · 2022-08-31 11:20:49 -05:00 · d42bdb07de
commit d42bdb07de
parent cb32b0a87f
6 changed files with 140 additions and 100 deletions
--- a/daemons/lvmdbusd/cmdhandler.py
+++ b/daemons/lvmdbusd/cmdhandler.py
@ -17,7 +17,7 @@ import os

 from lvmdbusd import cfg
 from lvmdbusd.utils import pv_dest_ranges, log_debug, log_error, add_no_notify,\
-			make_non_block, read_decoded, extract_stack_trace
+			make_non_block, read_decoded, extract_stack_trace, LvmBug
 from lvmdbusd.lvm_shell_proxy import LVMShellProxy

 try:
@ -628,7 +628,7 @@ def lvm_full_report_json():
 							(str(joe), out))
 				raise joe

-	return None
+	raise LvmBug("'fullreport' exited with code '%d'" % rc)


 def pv_resize(device, size_bytes, create_options):
--- a/daemons/lvmdbusd/fetch.py
+++ b/daemons/lvmdbusd/fetch.py
@ -11,11 +11,10 @@ from .pv import load_pvs
 from .vg import load_vgs
 from .lv import load_lvs
 from . import cfg
-from .utils import MThreadRunner, log_debug, log_error
+from .utils import MThreadRunner, log_debug, log_error, LvmBug, extract_stack_trace
 import threading
 import queue
 import time
-import traceback


 def _main_thread_load(refresh=True, emit_signal=True):
@ -160,6 +159,20 @@ class StateUpdate(object):
 			except queue.Empty:
 				pass

+		def _handle_error():
+			nonlocal exception_count
+			exception_count += 1
+
+			if exception_count >= 5:
+				log_error("Too many errors in update_thread, exiting daemon")
+				cfg.debug.dump()
+				cfg.flightrecorder.dump()
+				bailing(e)
+				cfg.exit_daemon()
+			else:
+				# Slow things down when encountering errors
+				time.sleep(1)
+
 		while cfg.run.value != 0:
 			# noinspection PyBroadException
 			try:
@ -191,18 +204,12 @@ class StateUpdate(object):
 				pass
 			except SystemExit:
 				break
+			except LvmBug as bug:
+				log_error(str(bug))
+				_handle_error()
 			except Exception as e:
-				exception_count += 1
-				if exception_count >= 5:
-					st = traceback.format_exc()
-					log_error("Too many errors in update_thread, exiting daemon (last exception reported): \n %s" % st)
-					cfg.debug.dump()
-					cfg.flightrecorder.dump()
-					bailing(e)
-					cfg.exit_daemon()
-				else:
-					# Slow things down when encountering errors
-					time.sleep(1)
+				log_error("update_thread: \n%s" % extract_stack_trace(e))
+				_handle_error()

 		# Make sure to unblock any that may be waiting before we exit this thread
 		# otherwise they hang forever ...
--- a/daemons/lvmdbusd/lv.py
+++ b/daemons/lvmdbusd/lv.py
@ -10,7 +10,7 @@
 from .automatedproperties import AutomatedProperties

 from . import utils
-from .utils import vg_obj_path_generate, log_error, _handle_execute
+from .utils import vg_obj_path_generate, log_error, _handle_execute, LvmBug
 import dbus
 from . import cmdhandler
 from . import cfg
@ -71,67 +71,76 @@ def lvs_state_retrieve(selection, cache_refresh=True):
 	# don't have information available yet.
 	lvs = sorted(cfg.db.fetch_lvs(selection), key=get_key)

-	for l in lvs:
-		if cfg.vdo_support:
-			rc.append(LvStateVdo(
-				l['lv_uuid'], l['lv_name'],
-				l['lv_path'], n(l['lv_size']),
-				l['vg_name'],
-				l['vg_uuid'], l['pool_lv_uuid'],
-				l['pool_lv'], l['origin_uuid'], l['origin'],
-				n32(l['data_percent']), l['lv_attr'],
-				l['lv_tags'], l['lv_active'], l['data_lv'],
-				l['metadata_lv'], l['segtype'], l['lv_role'],
-				l['lv_layout'],
-				n32(l['snap_percent']),
-				n32(l['metadata_percent']),
-				n32(l['copy_percent']),
-				n32(l['sync_percent']),
-				n(l['lv_metadata_size']),
-				l['move_pv'],
-				l['move_pv_uuid'],
-				l['vdo_operating_mode'],
-				l['vdo_compression_state'],
-				l['vdo_index_state'],
-				n(l['vdo_used_size']),
-				d(l['vdo_saving_percent']),
-				l['vdo_compression'],
-				l['vdo_deduplication'],
-				l['vdo_use_metadata_hints'],
-				n32(l['vdo_minimum_io_size']),
-				n(l['vdo_block_map_cache_size']),
-				n32(l['vdo_block_map_era_length']),
-				l['vdo_use_sparse_index'],
-				n(l['vdo_index_memory_size']),
-				n(l['vdo_slab_size']),
-				n32(l['vdo_ack_threads']),
-				n32(l['vdo_bio_threads']),
-				n32(l['vdo_bio_rotation']),
-				n32(l['vdo_cpu_threads']),
-				n32(l['vdo_hash_zone_threads']),
-				n32(l['vdo_logical_threads']),
-				n32(l['vdo_physical_threads']),
-				n32(l['vdo_max_discard']),
-				l['vdo_write_policy'],
-				n32(l['vdo_header_size'])))
-		else:
-			rc.append(LvState(
-				l['lv_uuid'], l['lv_name'],
-				l['lv_path'], n(l['lv_size']),
-				l['vg_name'],
-				l['vg_uuid'], l['pool_lv_uuid'],
-				l['pool_lv'], l['origin_uuid'], l['origin'],
-				n32(l['data_percent']), l['lv_attr'],
-				l['lv_tags'], l['lv_active'], l['data_lv'],
-				l['metadata_lv'], l['segtype'], l['lv_role'],
-				l['lv_layout'],
-				n32(l['snap_percent']),
-				n32(l['metadata_percent']),
-				n32(l['copy_percent']),
-				n32(l['sync_percent']),
-				n(l['lv_metadata_size']),
-				l['move_pv'],
-				l['move_pv_uuid']))
+	try:
+		for l in lvs:
+			if cfg.vdo_support:
+				rc.append(LvStateVdo(
+					l['lv_uuid'], l['lv_name'],
+					l['lv_path'], n(l['lv_size']),
+					l['vg_name'],
+					l['vg_uuid'], l['pool_lv_uuid'],
+					l['pool_lv'], l['origin_uuid'], l['origin'],
+					n32(l['data_percent']), l['lv_attr'],
+					l['lv_tags'], l['lv_active'], l['data_lv'],
+					l['metadata_lv'], l['segtype'], l['lv_role'],
+					l['lv_layout'],
+					n32(l['snap_percent']),
+					n32(l['metadata_percent']),
+					n32(l['copy_percent']),
+					n32(l['sync_percent']),
+					n(l['lv_metadata_size']),
+					l['move_pv'],
+					l['move_pv_uuid'],
+					l['vdo_operating_mode'],
+					l['vdo_compression_state'],
+					l['vdo_index_state'],
+					n(l['vdo_used_size']),
+					d(l['vdo_saving_percent']),
+					l['vdo_compression'],
+					l['vdo_deduplication'],
+					l['vdo_use_metadata_hints'],
+					n32(l['vdo_minimum_io_size']),
+					n(l['vdo_block_map_cache_size']),
+					n32(l['vdo_block_map_era_length']),
+					l['vdo_use_sparse_index'],
+					n(l['vdo_index_memory_size']),
+					n(l['vdo_slab_size']),
+					n32(l['vdo_ack_threads']),
+					n32(l['vdo_bio_threads']),
+					n32(l['vdo_bio_rotation']),
+					n32(l['vdo_cpu_threads']),
+					n32(l['vdo_hash_zone_threads']),
+					n32(l['vdo_logical_threads']),
+					n32(l['vdo_physical_threads']),
+					n32(l['vdo_max_discard']),
+					l['vdo_write_policy'],
+					n32(l['vdo_header_size'])))
+			else:
+				rc.append(LvState(
+					l['lv_uuid'], l['lv_name'],
+					l['lv_path'], n(l['lv_size']),
+					l['vg_name'],
+					l['vg_uuid'], l['pool_lv_uuid'],
+					l['pool_lv'], l['origin_uuid'], l['origin'],
+					n32(l['data_percent']), l['lv_attr'],
+					l['lv_tags'], l['lv_active'], l['data_lv'],
+					l['metadata_lv'], l['segtype'], l['lv_role'],
+					l['lv_layout'],
+					n32(l['snap_percent']),
+					n32(l['metadata_percent']),
+					n32(l['copy_percent']),
+					n32(l['sync_percent']),
+					n(l['lv_metadata_size']),
+					l['move_pv'],
+					l['move_pv_uuid']))
+	except KeyError as ke:
+		# Sometimes lvm omits returning one of the keys we requested.
+		key = ke.args[0]
+		if key.startswith("lv_") or key.startswith("vg_") or key.startswith("pool_") or \
+			key.endswith("_percent") or key.startswith("move_") or key.startswith("vdo_") or \
+			key in ["origin_uuid", "segtype", "origin", "data_lv", "metadata_lv"]:
+			raise LvmBug("missing JSON key: '%s'" % key)
+		raise ke
 	return rc


--- a/daemons/lvmdbusd/pv.py
+++ b/daemons/lvmdbusd/pv.py
@ -18,7 +18,7 @@ from .utils import vg_obj_path_generate, n, pv_obj_path_generate, \
 from .loader import common
 from .request import RequestEntry
 from .state import State
-from .utils import round_size
+from .utils import round_size, LvmBug


 # noinspection PyUnusedLocal
@ -28,16 +28,23 @@ def pvs_state_retrieve(selection, cache_refresh=True):
 	if cache_refresh:
 		cfg.db.refresh()

-	for p in cfg.db.fetch_pvs(selection):
-		rc.append(
-			PvState(
-				p["pv_name"], p["pv_uuid"], p["pv_name"],
-				p["pv_fmt"], n(p["pv_size"]), n(p["pv_free"]),
-				n(p["pv_used"]), n(p["dev_size"]), n(p["pv_mda_size"]),
-				n(p["pv_mda_free"]), int(p["pv_ba_start"]),
-				n(p["pv_ba_size"]), n(p["pe_start"]),
-				int(p["pv_pe_count"]), int(p["pv_pe_alloc_count"]),
-				p["pv_attr"], p["pv_tags"], p["vg_name"], p["vg_uuid"]))
+	try:
+		for p in cfg.db.fetch_pvs(selection):
+			rc.append(
+				PvState(
+					p["pv_name"], p["pv_uuid"], p["pv_name"],
+					p["pv_fmt"], n(p["pv_size"]), n(p["pv_free"]),
+					n(p["pv_used"]), n(p["dev_size"]), n(p["pv_mda_size"]),
+					n(p["pv_mda_free"]), int(p["pv_ba_start"]),
+					n(p["pv_ba_size"]), n(p["pe_start"]),
+					int(p["pv_pe_count"]), int(p["pv_pe_alloc_count"]),
+					p["pv_attr"], p["pv_tags"], p["vg_name"], p["vg_uuid"]))
+	except KeyError as ke:
+		# Sometimes lvm omits returning one of the keys we requested.
+		key = ke.args[0]
+		if key.startswith("pv") or key.startswith("vg") or (key in ['dev_size', 'pe_start']):
+			raise LvmBug("missing JSON key: '%s'" % key)
+		raise ke
 	return rc


--- a/daemons/lvmdbusd/utils.py
+++ b/daemons/lvmdbusd/utils.py
@ -768,3 +768,12 @@ def extract_stack_trace(exception):
 	return ''.join(traceback.format_exception(None, exception, exception.__traceback__))


+class LvmBug(RuntimeError):
+	"""
+	Things that are clearly a bug with lvm itself.
+	"""
+	def __init__(self, msg):
+		super().__init__(msg)
+
+	def __str__(self):
+		return "lvm bug encountered: %s" % ' '.join(self.args)
--- a/daemons/lvmdbusd/vg.py
+++ b/daemons/lvmdbusd/vg.py
@ -20,7 +20,7 @@ from .request import RequestEntry
 from .loader import common
 from .state import State
 from . import background
-from .utils import round_size, mt_remove_dbus_objects
+from .utils import round_size, mt_remove_dbus_objects, LvmBug
 from .job import JobState


@ -31,17 +31,25 @@ def vgs_state_retrieve(selection, cache_refresh=True):
 	if cache_refresh:
 		cfg.db.refresh()

-	for v in cfg.db.fetch_vgs(selection):
-		rc.append(
-			VgState(
-				v['vg_uuid'], v['vg_name'], v['vg_fmt'], n(v['vg_size']),
-				n(v['vg_free']), v['vg_sysid'], n(v['vg_extent_size']),
-				n(v['vg_extent_count']), n(v['vg_free_count']),
-				v['vg_profile'], n(v['max_lv']), n(v['max_pv']),
-				n(v['pv_count']), n(v['lv_count']), n(v['snap_count']),
-				n(v['vg_seqno']), n(v['vg_mda_count']),
-				n(v['vg_mda_free']), n(v['vg_mda_size']),
-				n(v['vg_mda_used_count']), v['vg_attr'], v['vg_tags']))
+	try:
+		for v in cfg.db.fetch_vgs(selection):
+			rc.append(
+				VgState(
+					v['vg_uuid'], v['vg_name'], v['vg_fmt'], n(v['vg_size']),
+					n(v['vg_free']), v['vg_sysid'], n(v['vg_extent_size']),
+					n(v['vg_extent_count']), n(v['vg_free_count']),
+					v['vg_profile'], n(v['max_lv']), n(v['max_pv']),
+					n(v['pv_count']), n(v['lv_count']), n(v['snap_count']),
+					n(v['vg_seqno']), n(v['vg_mda_count']),
+					n(v['vg_mda_free']), n(v['vg_mda_size']),
+					n(v['vg_mda_used_count']), v['vg_attr'], v['vg_tags']))
+	except KeyError as ke:
+		# Sometimes lvm omits returning one of the keys we requested.
+		key = ke.args[0]
+		if key.startswith("vg_") or key.startswith("lv_") or key.startswith("pv_") or \
+			key in ["max_lv", "max_pv", "snap_count"]:
+			raise LvmBug("missing JSON key: '%s'" % key)
+		raise ke
 	return rc