scripts: check-html-refernces: Add checking for image file usage

Check both that a file is referenced from our pages and also that pages reference existing images. The mode for dumping external references now also dumps images. '--ignore-image' can be used repeatedly to suppress errors for specific images. Signed-off-by: Peter Krempa <pkrempa@redhat.com> Reviewed-by: Daniel P. Berrangé <berrange@redhat.com>
2024-12-22 17:34:18 +03:00 · 2023-02-14 14:38:40 +01:00 · 2023-02-14 14:38:40 +01:00 · edac6ca756
commit edac6ca756
parent 0e3970bf9b
1 changed files with 87 additions and 14 deletions
--- a/scripts/check-html-references.py
+++ b/scripts/check-html-references.py
@ -24,25 +24,32 @@ import xml.etree.ElementTree as ET
 ns = {'html': 'http://www.w3.org/1999/xhtml'}
 externallinks = []
 externalimages = []
 def get_file_list(prefix):
    filelist = []
    imagelist = []
    imageformats = ['.jpg', '.svg', '.png']
    for root, dir, files in os.walk(prefix):
        for file in files:
-            if not re.search('\\.html$', file):
+            ext = os.path.splitext(file)[1]
                continue
            if ext == '.html':
                # the 404 page doesn't play well
                if '404.html' in file:
                    continue
                filelist.append(os.path.join(root, file))
-    filelist.sort()
+            elif ext in imageformats:
                imagelist.append(os.path.join(root, file))
-    return filelist
+    filelist.sort()
    imagelist.sort()
    return filelist, imagelist
 # loads an XHTML and extracts all anchors, local and remote links for the one file
@ -50,12 +57,14 @@ def process_file(filename):
    tree = ET.parse(filename)
    root = tree.getroot()
    docname = root.get('data-sourcedoc')
    dirname = os.path.dirname(filename)
    if not docname:
        docname = filename
    anchors = [filename]
    targets = []
    images = []
    for elem in root.findall('.//html:a', ns):
        target = elem.get('href')
@ -68,7 +77,6 @@ def process_file(filename):
            if re.search('://', target):
                externallinks.append(target)
            elif target[0] != '#' and 'mailto:' not in target:
                dirname = os.path.dirname(filename)
                targetfull = os.path.normpath(os.path.join(dirname, target))
                targets.append((filename, docname, targetfull, target))
@ -87,20 +95,33 @@ def process_file(filename):
        if an:
            anchors.append(filename + '#' + an)
-    return (anchors, targets)
+    # find local images
    for elem in root.findall('.//html:img', ns):
        src = elem.get('src')
        if src:
            if re.search('://', src):
                externalimages.append(src)
            else:
                imagefull = os.path.normpath(os.path.join(dirname, src))
                images.append((imagefull, docname))
    return (anchors, targets, images)
 def process_all(filelist):
    anchors = []
    targets = []
    images = []
    for file in filelist:
-        anchor, target = process_file(file)
+        anchor, target, image = process_file(file)
        targets = targets + target
        anchors = anchors + anchor
        images = images + image
-    return (targets, anchors)
+    return (targets, anchors, images)
 def check_targets(targets, anchors):
@ -163,6 +184,46 @@ def check_usage(targets, files, entrypoint):
    return fail
 # checks that images present in the directory are being used and also that
 # pages link to existing images. For favicons, which are not referenced from
 # the '.html' files there's a builtin set of exceptions.
 def check_images(usedimages, imagefiles, ignoreimages):
    favicons = [
        'android-chrome-192x192.png',
        'android-chrome-256x256.png',
        'apple-touch-icon.png',
        'favicon-16x16.png',
        'favicon-32x32.png',
        'mstile-150x150.png',
    ]
    fail = False
    if ignoreimages:
        favicons = favicons + ignoreimages
    for usedimage, docname in usedimages:
        if usedimage not in imagefiles:
            print(f'ERROR: \'{docname}\' references image \'{usedimage}\' not among images')
            fail = True
    for imagefile in imagefiles:
        used = False
        if imagefile in (usedimage[0] for usedimage in usedimages):
            used = True
        else:
            for favicon in favicons:
                if favicon in imagefile:
                    used = True
                    break
        if not used:
            print(f'ERROR: Image \'{imagefile}\' is not used by any page')
            fail = True
    return fail
 parser = argparse.ArgumentParser(description='HTML reference checker')
 parser.add_argument('--webroot', required=True,
                    help='path to the web root')
@ -170,14 +231,16 @@ parser.add_argument('--entrypoint', default="index.html",
                    help='file name of web entry point relative to --webroot')
 parser.add_argument('--external', action="store_true",
                    help='print external references instead')
 parser.add_argument('--ignore-images', action='append',
                    help='paths to images that should be considered as used')
 args = parser.parse_args()
-files = get_file_list(os.path.abspath(args.webroot))
+files, imagefiles = get_file_list(os.path.abspath(args.webroot))
 entrypoint = os.path.join(os.path.abspath(args.webroot), args.entrypoint)
-targets, anchors = process_all(files)
+targets, anchors, usedimages = process_all(files)
 fail = False
@ -186,7 +249,14 @@ if args.external:
    externallinks.sort()
    for ext in externallinks:
        if ext != prev:
-            print(ext)
+            print(f'link: {ext}')
        prev = ext
    externalimages.sort()
    for ext in externalimages:
        if ext != prev:
            print(f'image: {ext}')
        prev = ext
 else:
@ -196,6 +266,9 @@ else:
    if check_usage(targets, files, entrypoint):
        fail = True
    if check_images(usedimages, imagefiles, args.ignore_images):
        fail = True
    if fail:
        sys.exit(1)