1e5c31af7Sopenharmony_ci#!/usr/bin/python3
2e5c31af7Sopenharmony_ci#
3e5c31af7Sopenharmony_ci# Copyright 2020-2024 The Khronos Group Inc.
4e5c31af7Sopenharmony_ci#
5e5c31af7Sopenharmony_ci# SPDX-License-Identifier: Apache-2.0
6e5c31af7Sopenharmony_ci
# check_html_xrefs - simple-minded check for internal xrefs in spec HTML
# that do not exist.

# Usage: check_html_xrefs file [file ...]
# Reports each bad xref together with the anchor and title of the section
# that contains it, but not the exact location within that section.
12e5c31af7Sopenharmony_ci
13e5c31af7Sopenharmony_ciimport argparse
14e5c31af7Sopenharmony_ciimport re
15e5c31af7Sopenharmony_ciimport sys
16e5c31af7Sopenharmony_cifrom lxml import etree
17e5c31af7Sopenharmony_ci
# Matches the CSS class of a section <div>, e.g. 'sect2', capturing the
# numeric nesting level. match() anchors at the start of the class string.
SECTNAME = re.compile(r'sect(?P<level>\d+)')

def find_parent_ids(elem, href):
    """Find the title of the nearest enclosing section of an element.

    Section titles are the '<hN' children of '<div class="sectM"' tags,
    where N = M + 1. This may be specific to the Vulkan spec, though -
    hierarchy could be different in other asciidoctor documents.

    The ancestors of elem are walked from the innermost outwards, and the
    first section <div> that has a matching header child is used.

    elem - this node
    href - href link text of elem (not used by the lookup itself)

    Returns a two-element list [ anchor, title ], where anchor is the 'id'
    attribute of the header (or '' if the header has none) and title is the
    concatenated text of the header. If no enclosing section with a header
    is found, returns [ '** NO PARENT NODE IDENTIFIED **', '' ]."""

    parent = elem.getparent()
    while parent is not None:
        if parent.tag == 'div':
            # A <div> without a class attribute yields None, which
            # re.match() rejects with a TypeError - substitute ''.
            cssclass = parent.get('class') or ''
            matches = SECTNAME.match(cssclass)
            if matches is not None:
                level = int(matches.group('level'))
                # Look for corresponding header tag directly in this div
                helem = parent.find('./h{}'.format(level + 1))
                if helem is not None:
                    # Always return strings, so that callers can apply
                    # width/alignment format specs to the anchor.
                    anchor = helem.get('id') or ''
                    return [ anchor, ''.join(helem.itertext()) ]
        parent = parent.getparent()
    return [ '** NO PARENT NODE IDENTIFIED **', '' ]
43e5c31af7Sopenharmony_ci
def find_bad_xrefs(tree):
    """Collect internal links in a parsed document that have no target.

    tree - parsed document (or element) supporting findall() with
           attribute predicates

    Every element with an 'id' attribute is a valid target. Every <a>
    element whose 'href' starts with '#' is an internal link; it is bad if
    the text after the '#' is not one of the ids. Other hrefs are treated
    as external and skipped.

    Returns a list of (element, href) tuples in document order, where href
    has the leading '#' removed. A target referenced by several links
    appears once per link."""

    # Duplicate ids are not reported; the set simply collapses them
    ids = {elem.get('id') for elem in tree.findall('.//*[@id]')}

    bad_refs = []
    for elem in tree.findall('.//a[@href]'):
        href = elem.get('href')
        # startswith() rather than href[0], which raises IndexError
        # for an empty href attribute
        if not href.startswith('#'):
            continue
        target = href[1:]
        if target not in ids:
            bad_refs.append((elem, target))
    return bad_refs


if __name__ == '__main__':
    arg_parser = argparse.ArgumentParser()

    arg_parser.add_argument('files', metavar='filename', nargs='*',
                            help='Path to spec HTML file to check')
    args = arg_parser.parse_args()

    for filename in args.files:
        tree = etree.parse(filename, etree.HTMLParser())

        # Report hrefs not found in ids, along with the enclosing section
        refs = find_bad_xrefs(tree)
        if refs:
            print('Found bad links in {}:'.format(filename))
            for (elem, href) in refs:
                parents = find_parent_ids(elem, href)
                print('{:<40} in {:<28} ({})'.format(href, parents[0], parents[1]))
            # Stop at the first file with bad links; any remaining
            # files are not checked.
            sys.exit(1)
97