#!/usr/bin/python3
#
# Copyright 2020-2024 The Khronos Group Inc.
#
# SPDX-License-Identifier: Apache-2.0

# check_html_xrefs - simple-minded check for internal xrefs in spec HTML
# that do not exist.

# Usage: check_html_xrefs file [file ...]
# Reports each bad xref together with the id and title of the section
# heading that encloses it. Exits with status 1 if any bad xref was found.

import argparse
import re
import sys
from lxml import etree

# Matches the CSS class asciidoctor puts on section <div>s ('sect1', ...)
SECTNAME = re.compile(r'sect(?P<level>\d+)')


def find_parent_ids(elem, href):
    """Locate the section heading enclosing an element.

    Walks up the ancestors of elem looking for a '<div class="sectM">'
    that has a direct '<hN>' child with N = M + 1, and reports that
    heading. This layout may be specific to the Vulkan spec - the
    hierarchy could be different in other asciidoctor documents.

    elem - this node
    href - href link text of elem (currently unused)

    Returns a list of [ anchor, title ], where anchor is the 'id'
    attribute of the heading (None if the heading has no id) and title is
    its concatenated text. If no enclosing section heading is found,
    returns [ '** NO PARENT NODE IDENTIFIED **', '' ]."""

    parent = elem.getparent()
    while parent is not None:
        if parent.tag == 'div':
            # A <div> with no class attribute would yield None from get();
            # default to '' so the regex match cannot raise TypeError.
            matches = SECTNAME.match(parent.get('class', ''))
            if matches is not None:
                level = int(matches.group('level'))
                # Look for corresponding header tag in this div
                helem = parent.find('./h{}'.format(level + 1))
                if helem is not None:
                    return [ helem.get('id'), ''.join(helem.itertext()) ]
        parent = parent.getparent()
    return [ '** NO PARENT NODE IDENTIFIED **', '' ]


def _collect_ids(tree):
    """Return the set of every 'id' attribute value in the parsed tree.

    Duplicate ids are silently collapsed."""

    return {elem.get('id') for elem in tree.findall('.//*[@id]')}


def _find_bad_refs(tree, ids):
    """Return a list of (element, target) for each internal link in tree
    whose target is not in ids.

    Only '<a href="#...">' links are considered; target is the href with
    the leading '#' removed. Every occurrence is returned, so a bad target
    that is linked several times appears several times."""

    bad_refs = []
    for elem in tree.findall('.//a[@href]'):
        href = elem.get('href')
        # startswith() instead of href[0]: an empty href="" must be
        # skipped, not raise IndexError.
        if not href.startswith('#'):
            # Not a local href
            continue
        target = href[1:]
        if target not in ids:
            bad_refs.append((elem, target))
    return bad_refs


def main():
    """Check each HTML file named on the command line for internal links
    to ids that do not exist, print them, and exit with status 1 if any
    were found in any file."""

    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('files', metavar='filename', nargs='*',
                            help='Path to spec HTML file to check')
    args = arg_parser.parse_args()

    found_bad_links = False
    for filename in args.files:
        tree = etree.parse(filename, etree.HTMLParser())

        refs = _find_bad_refs(tree, _collect_ids(tree))
        if refs:
            found_bad_links = True
            print('Found bad links in {}:'.format(filename))
            for (elem, href) in refs:
                anchor, title = find_parent_ids(elem, href)
                # anchor is None when the heading has no id; None cannot
                # be formatted with a width spec, so substitute ''.
                print('{:<40} in {:<28} ({})'.format(href, anchor or '', title))

    # Report on every file before failing, so one run shows all bad links
    if found_bad_links:
        sys.exit(1)


if __name__ == '__main__':
    main()