#!/usr/bin/python3
# Copyright 2013-2024 The Khronos Group Inc.
# SPDX-License-Identifier: Apache-2.0

# linkcheck - check internal links of the specified HTML file against
# internal anchors and report inconsistencies.
#
# Usage: linkcheck [-anchors] file.html ...

import argparse

from lxml import etree as et


def printSet(s):
    """Print every member of `s` on its own line, in sorted order."""
    for key in sorted(s):
        print(' {}'.format(key))


def checkLinks(file, args):
    """Report internal links of an HTML file that have no matching anchor.

    The document is parsed with lxml's HTML parser. Each `href` value that
    begins with '#' is treated as an internal link and compared against the
    set of `id` attribute values found in the document. The results are
    printed to stdout; nothing is returned.

    - file - the HTML source, passed unchanged to `et.parse`
    - args - parsed command-line options. Only `args.anchors` is read: when
      it is true, anchors that no internal link refers to are listed too.
    """
    tree = et.parse(file, et.HTMLParser())

    # Remove all <svg> elements, which just add noise to the cross-referencing.
    # The matches are collected before the loop body starts removing them.
    for svg in tree.findall('//svg'):
        svg.getparent().remove(svg)

    # Extract href= values, ignoring '<link>' tags from HTML headers
    xrefs = [e.get('href') for e in tree.findall('//*[@href]')
             if e.tag != 'link']

    # Extract id= values, ignoring SVG '<g>' tags
    anchors = {e.get('id') for e in tree.findall('//*[@id]')
               if e.tag != 'g'}

    # Split the links into internal anchors ('#name') and everything else
    internals = set()
    externals = set()

    for xref in xrefs:
        if xref.startswith('#'):
            # Internal anchor - store the name without the leading '#'
            internals.add(xref[1:])
        else:
            externals.add(xref)

    # Debugging aids, normally disabled:
    # print('External xrefs:', len(externals))
    # printSet(externals)
    #
    # print('Internal xrefs:', len(internals))
    # print('Anchors: ', len(anchors))

    # Links whose target id does not exist in the document
    xrefsOnly = internals - anchors

    print('Internal xrefs not in anchors:', len(xrefsOnly))
    printSet(xrefsOnly)

    if args.anchors:
        # Ids that no internal link points at
        anchorsOnly = anchors - internals

        print('Internal anchors not in xrefs:', len(anchorsOnly))
        printSet(anchorsOnly)
def main(argv=None):
    """Parse the command line and run checkLinks() on each named file.

    - argv - list of argument strings to parse. None (the default) makes
      argparse read the arguments from sys.argv.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument('files', metavar='filename', nargs='*',
                        help='HTML file whose internal links are checked')
    parser.add_argument('-anchors', action='store_true',
                        help='Report orphaned anchors')

    args = parser.parse_args(argv)

    # The whole options object is handed on; checkLinks reads args.anchors
    for file in args.files:
        checkLinks(file, args)


if __name__ == '__main__':
    main()