#!/usr/bin/env python3
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library. If not, see
# <http://www.gnu.org/licenses/>.
#
# Check that external references between documentation HTML files are not broken.
import argparse
2023-06-05 10:40:13 +03:00
import os
2022-05-31 16:15:57 +03:00
import re
2023-06-05 10:40:13 +03:00
import sys
2022-05-31 16:15:57 +03:00
import xml . etree . ElementTree as ET
# XML namespace mapping used for element lookups in the parsed XHTML trees
ns = {'html': 'http://www.w3.org/1999/xhtml'}

# accumulators filled by process_file() with every absolute ('://') link and
# image URI seen across all processed files
externallinks = []
externalimages = []
def get_file_list(prefix):
    """Walk the tree under 'prefix' and collect HTML pages and images.

    Returns a tuple (filelist, imagelist) of full paths, both sorted.
    The '404.html' page is skipped on purpose since it doesn't play
    well with the reference checks.
    """
    pages = []
    images = []
    image_suffixes = ('.jpg', '.svg', '.png')

    for root, _dirs, names in os.walk(prefix):
        for name in names:
            suffix = os.path.splitext(name)[1]

            if suffix == '.html':
                # the 404 page doesn't play well
                if '404.html' in name:
                    continue

                pages.append(os.path.join(root, name))
            elif suffix in image_suffixes:
                images.append(os.path.join(root, name))

    pages.sort()
    images.sort()

    return pages, images
# loads an XHTML and extracts all anchors, local and remote links for the one file
def process_file(filename, project_uri):
    """Parse one XHTML file and extract its link information.

    Returns a tuple (anchors, targets, images, projectlinks):
      - anchors: 'filename' itself plus 'filename#id' for every <a>,
        old-style <div class='section'>, and <section> carrying an 'id'
      - targets: (filename, docname, resolved-target, original-target)
        tuples for every relative hyperlink
      - images: (resolved-path, docname) tuples for relative <img> sources
      - projectlinks: (uri, docname) tuples for absolute links starting
        with 'project_uri' (when that argument is not None)

    As a side effect every absolute ('://') link/image URI is appended to
    the module-level 'externallinks'/'externalimages' lists.
    """
    root = ET.parse(filename).getroot()
    dirname = os.path.dirname(filename)

    # prefer the original source document name for error reporting
    docname = root.get('data-sourcedoc')
    if not docname:
        docname = filename

    anchors = [filename]
    targets = []
    images = []
    projectlinks = []

    for link in root.findall('.//html:a', ns):
        href = link.get('href')
        ident = link.get('id')

        if ident:
            anchors.append(filename + '#' + ident)

        if href:
            if re.search('://', href):
                externallinks.append(href)

                if project_uri is not None and href.startswith(project_uri):
                    projectlinks.append((href, docname))
            elif href[0] != '#' and 'mailto:' not in href:
                resolved = os.path.normpath(os.path.join(dirname, href))
                targets.append((filename, docname, resolved, href))

    # older docutils generate "<div class='section'"
    for section in root.findall(".//html:div/[@class='section']", ns):
        ident = section.get('id')
        if ident:
            anchors.append(filename + '#' + ident)

    # modern docutils generate a <section element
    for section in root.findall('.//html:section', ns):
        ident = section.get('id')
        if ident:
            anchors.append(filename + '#' + ident)

    # find local images
    for image in root.findall('.//html:img', ns):
        src = image.get('src')

        if src:
            if re.search('://', src):
                externalimages.append(src)
            else:
                resolved = os.path.normpath(os.path.join(dirname, src))
                images.append((resolved, docname))

    return (anchors, targets, images, projectlinks)
def process_all(filelist, project_uri):
    """Run process_file() over every file and merge the per-file results.

    Returns a tuple (targets, anchors, images, projectlinks) combining
    the corresponding lists from each processed file.
    """
    anchors = []
    targets = []
    images = []
    projectlinks = []

    for filename in filelist:
        anchor, target, image, projectlink = process_file(filename, project_uri)

        targets.extend(target)
        anchors.extend(anchor)
        images.extend(image)
        projectlinks.extend(projectlink)

    return (targets, anchors, images, projectlinks)
def check_targets(targets, anchors):
    """Verify that every local link resolves to a known anchor.

    'targets' holds (filename, docname, resolved-target, original-target)
    tuples and 'anchors' the list of valid destinations.  Prints one line
    per broken link and returns True when any broken link was found.
    """
    broken = []

    for _, docname, resolved, original in targets:
        if resolved not in anchors:
            broken.append((docname, original))

    if not broken:
        return False

    broken.sort()
    for docname, original in broken:
        print(f'ERROR: \'{docname}\': broken link to: \'{original}\'')

    return True
def check_usage_crawl(page, targets, visited):
    """Depth-first crawl helper for check_usage().

    Marks 'page' as visited and recurses into every page it links to
    (per 'targets') that was not seen yet.  'visited' is mutated in place
    and doubles as the recursion guard.
    """
    visited.append(page)

    pending = []
    for filename, _docname, target, _ in targets:
        if filename != page:
            continue

        # strip any '#fragment' to get the bare page the link points at
        targetpage = target.split("#", 1)[0]
        if targetpage not in visited and targetpage not in pending:
            pending.append(targetpage)

    for nextpage in pending:
        check_usage_crawl(nextpage, targets, visited)
# crawls the document references starting from entrypoint and tries to find
# unreachable pages
def check_usage(targets, files, entrypoint):
    """Report pages in 'files' unreachable from 'entrypoint'.

    Crawls the link graph described by 'targets' starting at 'entrypoint';
    any file that is never visited is reported, using its source-document
    name when one is known.  Returns True when at least one unreachable
    page was found.
    """
    visited = []
    check_usage_crawl(entrypoint, targets, visited)

    fail = False
    for page in files:
        if page in visited:
            continue

        # prefer the source document name for the error message if known
        brokendoc = page
        for filename, docname, _, _ in targets:
            if filename == page and docname:
                brokendoc = docname
                break

        print(f'ERROR: \'{brokendoc}\': is not referenced from anywhere')
        fail = True

    return fail
# checks that images present in the directory are being used and also that
# pages link to existing images. For favicons, which are not referenced from
# the '.html' files there's a builtin set of exceptions.
def check_images(usedimages, imagefiles, ignoreimages):
    """Cross-check image usage against the images present on disk.

    'usedimages' holds (path, docname) tuples referenced from pages,
    'imagefiles' the image paths found on disk, and 'ignoreimages' an
    optional list of extra names treated as used (in addition to the
    builtin favicon exceptions).  Prints an error for every mismatch in
    either direction and returns True when any was found.
    """
    favicons = [
        'android-chrome-192x192.png',
        'android-chrome-256x256.png',
        'apple-touch-icon.png',
        'favicon-16x16.png',
        'favicon-32x32.png',
        'mstile-150x150.png',
    ]

    if ignoreimages:
        favicons = favicons + ignoreimages

    fail = False

    # every referenced image must exist on disk
    for usedimage, docname in usedimages:
        if usedimage not in imagefiles:
            print(f'ERROR: \'{docname}\' references image \'{usedimage}\' not among images')
            fail = True

    # every image on disk must be referenced somewhere (or excepted)
    for imagefile in imagefiles:
        if imagefile in (used[0] for used in usedimages):
            continue

        if any(favicon in imagefile for favicon in favicons):
            continue

        print(f'ERROR: Image \'{imagefile}\' is not used by any page')
        fail = True

    return fail
# checks that all links are accessed via https
def check_https(links):
    """Report any URI in 'links' using the plain 'http://' scheme.

    Returns True when at least one insecure link was found.
    """
    insecure = [link for link in links if link.startswith('http://')]

    for link in insecure:
        print(f'ERROR: URI \'{link}\' uses insecure "http" protocol')

    return bool(insecure)
# checks prohibited external links to local files
def check_projectlinks(projectlinks, exceptions):
    """Report absolute URIs that point back into the project itself.

    'projectlinks' holds (uri, docname) tuples; 'exceptions' is an
    optional list of path substrings for which such links are allowed.
    Returns True when at least one prohibited link was found.
    """
    fail = False

    for link, filename in projectlinks:
        if exceptions is not None and any(exc in filename for exc in exceptions):
            continue

        print(f'ERROR: prohibited external URI \'{link}\' to local project in \'{filename}\'')
        fail = True

    return fail
parser = argparse.ArgumentParser(description='HTML reference checker')
parser.add_argument('--webroot', required=True,
                    help='path to the web root')
parser.add_argument('--entrypoint', default="index.html",
                    help='file name of web entry point relative to --webroot')
parser.add_argument('--external', action="store_true",
                    help='print external references instead')
parser.add_argument('--ignore-images', action='append',
                    help='paths to images that should be considered as used')
parser.add_argument('--require-https', action="store_true",
                    help='require secure https for external links')
parser.add_argument('--project-uri',
                    help='external prefix of the local project (e.g. https://libvirt.org; external links with that prefix are prohibited')
parser.add_argument('--project-uri-exceptions', action='append',
                    help='list of path prefixes excluded from the "--project-uri" checks')

args = parser.parse_args()

webroot = os.path.abspath(args.webroot)
entrypoint = os.path.join(webroot, args.entrypoint)

files, imagefiles = get_file_list(webroot)
targets, anchors, usedimages, projectlinks = process_all(files, args.project_uri)

fail = False

if args.external:
    # dump the deduplicated (sorted) external references instead of checking
    previous = None

    externallinks.sort()
    for link in externallinks:
        if link != previous:
            print(f'link: {link}')
        previous = link

    externalimages.sort()
    for image in externalimages:
        if image != previous:
            print(f'image: {image}')
        previous = image
else:
    if check_targets(targets, anchors):
        fail = True

    if check_usage(targets, files, entrypoint):
        fail = True

    if check_images(usedimages, imagefiles, args.ignore_images):
        fail = True

    if check_projectlinks(projectlinks, args.project_uri_exceptions):
        fail = True

    if args.require_https:
        if check_https(externallinks):
            fail = True

        if check_https(externalimages):
            fail = True

if fail:
    sys.exit(1)

sys.exit(0)