1e5c31af7Sopenharmony_ci#!/usr/bin/python3
2e5c31af7Sopenharmony_ci# Copyright 2013-2024 The Khronos Group Inc.
3e5c31af7Sopenharmony_ci# SPDX-License-Identifier: Apache-2.0
4e5c31af7Sopenharmony_ci
5e5c31af7Sopenharmony_ci# linkcheck - check internal links of the specified HTML file against
6e5c31af7Sopenharmony_ci# internal anchors and report inconsistencies.
7e5c31af7Sopenharmony_ci#
# Usage: linkcheck [-anchors] file.html [file.html ...]
9e5c31af7Sopenharmony_ci
10e5c31af7Sopenharmony_ciimport argparse
11e5c31af7Sopenharmony_cifrom lxml import etree as et
12e5c31af7Sopenharmony_ci
def printSet(s):
    """Print the members of `s` in sorted order, one per line.

    Each member is written to stdout prefixed with four spaces. Nothing
    is printed for an empty collection.
    """
    indent = '    '
    ordered = sorted(s)
    for entry in ordered:
        print(indent + str(entry))
16e5c31af7Sopenharmony_ci
def checkLinks(file, args):
    """Cross-check the internal links of one HTML file against its anchors.

    file - the HTML document to parse (handed to lxml's `parse`).
    args - parsed command-line arguments; only `args.anchors` is read.

    Prints the internal '#name' link targets that have no element with a
    matching id= attribute. When `args.anchors` is true, also prints the
    ids that no internal link refers to.
    """
    tree = et.parse(file, et.HTMLParser())

    # Drop every <svg> subtree first; they only add noise to the
    # cross-referencing. findall() returns a list, so removing elements
    # while looping over it is safe.
    for svgElem in tree.findall('//svg'):
        svgElem.getparent().remove(svgElem)

    # href= values of all elements, skipping the '<link>' tags that come
    # from HTML headers.
    targets = [elem.get('href')
               for elem in tree.findall('//*[@href]')
               if elem.tag != 'link']

    # id= values of all elements, skipping SVG '<g>' tags.
    anchors = {elem.get('id')
               for elem in tree.findall('//*[@id]')
               if elem.tag != 'g'}

    # A target starting with '#' is an internal anchor reference (stored
    # without the '#'); anything else is treated as an external link.
    internals = {target[1:] for target in targets if target.startswith('#')}
    externals = {target for target in targets if not target.startswith('#')}

    # Set differences in each direction expose the inconsistencies
    xrefsOnly = internals - anchors
    anchorsOnly = anchors - internals

    # Optional debugging output:
    # print('External xrefs:', len(externals))
    # printSet(externals)
    #
    # print('Internal xrefs:', len(internals))
    # print('Anchors:       ', len(anchors))

    print('Internal xrefs not in anchors:', len(xrefsOnly))
    printSet(xrefsOnly)

    if args.anchors:
        print('Internal anchors not in xrefs:', len(anchorsOnly))
        printSet(anchorsOnly)
68e5c31af7Sopenharmony_ci
# Command-line entry point: parse the arguments, then check each file
# named on the command line.
71e5c31af7Sopenharmony_ci
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Check internal links of HTML files against their '
                    'internal anchors and report inconsistencies.')

    # Zero or more HTML files; each one is checked independently.
    parser.add_argument('files', metavar='filename', nargs='*',
                        help='an HTML file whose internal links are checked')
    # Single-dash long option, kept as-is for existing callers.
    parser.add_argument('-anchors', action='store_true',
                        help='Report orphaned anchors')

    args = parser.parse_args()

    for file in args.files:
        checkLinks(file, args)
85