From edac6ca75621dc48d4804dab34ca71f4cbe34a69 Mon Sep 17 00:00:00 2001 From: Peter Krempa Date: Tue, 14 Feb 2023 14:38:40 +0100 Subject: [PATCH] scripts: check-html-references: Add checking for image file usage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Check both that a file is referenced from our pages and also that pages reference existing images. The mode for dumping external references now also dumps images. '--ignore-images' can be used repeatedly to suppress errors for specific images. Signed-off-by: Peter Krempa Reviewed-by: Daniel P. Berrangé --- scripts/check-html-references.py | 101 ++++++++++++++++++++++++++----- 1 file changed, 87 insertions(+), 14 deletions(-) diff --git a/scripts/check-html-references.py b/scripts/check-html-references.py index 4f08feab59..788622a2d0 100755 --- a/scripts/check-html-references.py +++ b/scripts/check-html-references.py @@ -24,25 +24,32 @@ import xml.etree.ElementTree as ET ns = {'html': 'http://www.w3.org/1999/xhtml'} externallinks = [] +externalimages = [] def get_file_list(prefix): filelist = [] + imagelist = [] + imageformats = ['.jpg', '.svg', '.png'] for root, dir, files in os.walk(prefix): for file in files: - if not re.search('\\.html$', file): - continue + ext = os.path.splitext(file)[1] - # the 404 page doesn't play well - if '404.html' in file: - continue + if ext == '.html': + # the 404 page doesn't play well + if '404.html' in file: + continue - filelist.append(os.path.join(root, file)) + filelist.append(os.path.join(root, file)) + + elif ext in imageformats: + imagelist.append(os.path.join(root, file)) filelist.sort() + imagelist.sort() - return filelist + return filelist, imagelist # loads an XHTML and extracts all anchors, local and remote links for the one file @@ -50,12 +57,14 @@ def process_file(filename): tree = ET.parse(filename) root = tree.getroot() docname = root.get('data-sourcedoc') + dirname = os.path.dirname(filename) if not docname: docname = 
filename anchors = [filename] targets = [] + images = [] for elem in root.findall('.//html:a', ns): target = elem.get('href') @@ -68,7 +77,6 @@ def process_file(filename): if re.search('://', target): externallinks.append(target) elif target[0] != '#' and 'mailto:' not in target: - dirname = os.path.dirname(filename) targetfull = os.path.normpath(os.path.join(dirname, target)) targets.append((filename, docname, targetfull, target)) @@ -87,20 +95,33 @@ def process_file(filename): if an: anchors.append(filename + '#' + an) - return (anchors, targets) + # find local images + for elem in root.findall('.//html:img', ns): + src = elem.get('src') + + if src: + if re.search('://', src): + externalimages.append(src) + else: + imagefull = os.path.normpath(os.path.join(dirname, src)) + images.append((imagefull, docname)) + + return (anchors, targets, images) def process_all(filelist): anchors = [] targets = [] + images = [] for file in filelist: - anchor, target = process_file(file) + anchor, target, image = process_file(file) targets = targets + target anchors = anchors + anchor + images = images + image - return (targets, anchors) + return (targets, anchors, images) def check_targets(targets, anchors): @@ -163,6 +184,46 @@ def check_usage(targets, files, entrypoint): return fail +# checks that images present in the directory are being used and also that +# pages link to existing images. For favicons, which are not referenced from +# the '.html' files there's a builtin set of exceptions. 
+def check_images(usedimages, imagefiles, ignoreimages): + favicons = [ + 'android-chrome-192x192.png', + 'android-chrome-256x256.png', + 'apple-touch-icon.png', + 'favicon-16x16.png', + 'favicon-32x32.png', + 'mstile-150x150.png', + ] + fail = False + + if ignoreimages: + favicons = favicons + ignoreimages + + for usedimage, docname in usedimages: + if usedimage not in imagefiles: + print(f'ERROR: \'{docname}\' references image \'{usedimage}\' not among images') + fail = True + + for imagefile in imagefiles: + used = False + + if imagefile in (usedimage[0] for usedimage in usedimages): + used = True + else: + for favicon in favicons: + if favicon in imagefile: + used = True + break + + if not used: + print(f'ERROR: Image \'{imagefile}\' is not used by any page') + fail = True + + return fail + + parser = argparse.ArgumentParser(description='HTML reference checker') parser.add_argument('--webroot', required=True, help='path to the web root') @@ -170,14 +231,16 @@ parser.add_argument('--entrypoint', default="index.html", help='file name of web entry point relative to --webroot') parser.add_argument('--external', action="store_true", help='print external references instead') +parser.add_argument('--ignore-images', action='append', + help='paths to images that should be considered as used') args = parser.parse_args() -files = get_file_list(os.path.abspath(args.webroot)) +files, imagefiles = get_file_list(os.path.abspath(args.webroot)) entrypoint = os.path.join(os.path.abspath(args.webroot), args.entrypoint) -targets, anchors = process_all(files) +targets, anchors, usedimages = process_all(files) fail = False @@ -186,7 +249,14 @@ if args.external: externallinks.sort() for ext in externallinks: if ext != prev: - print(ext) + print(f'link: {ext}') + + prev = ext + + externalimages.sort() + for ext in externalimages: + if ext != prev: + print(f'image: {ext}') prev = ext else: @@ -196,6 +266,9 @@ else: if check_usage(targets, files, entrypoint): fail = True + if 
check_images(usedimages, imagefiles, args.ignore_images): + fail = True + if fail: sys.exit(1)