"""
Try to detect suspicious constructs, resembling markup
that has leaked into the final output.

Suspicious lines are reported in a comma-separated-file,
``suspicious.csv``, located in the output directory.

The file is utf-8 encoded, and each line contains four fields:

 * document name (normalized)
 * line number in the source document
 * problematic text
 * complete line showing the problematic text in context

It is common to find many false positives. To avoid reporting them
again and again, they may be added to the ``susp-ignored.csv`` file
(located in the parent of the directory containing this module). The
file has the same format as ``suspicious.csv`` with a few differences:

  - each line defines a rule; if the rule matches, the issue
    is ignored.
  - line number may be empty (that is, nothing between the
    commas: ",,"). In this case, line numbers are ignored (the
    rule matches anywhere in the file).
  - the last field does not have to be a complete line; some
    surrounding text (never more than a line) is enough for
    context.

Rules are processed sequentially. A rule matches when:

 * document names are the same
 * problematic texts are the same
 * line numbers are close to each other (5 lines up or down)
 * the rule text is completely contained in the source line

The simplest way to create the susp-ignored.csv file is by copying
undesired entries from suspicious.csv (possibly trimming the last
field.)

Copyright 2009 Gabriel A. Genellina

"""

import os
import re
import csv

from docutils import nodes
from sphinx.builders import Builder
import sphinx.util

# Iterator factory: detect_all(text) yields a match object for every
# suspicious fragment found in ``text``.
detect_all = re.compile(r'''
    ::(?=[^=])|            # two :: (but NOT ::=)
    :[a-zA-Z][a-zA-Z0-9]+| # :foo
    `|                     # ` (seldom used by itself)
    (?<!\.)\.\.[ \t]*\w+:  # .. foo: (but NOT ... else:)
    ''', re.VERBOSE).finditer


class Rule:
    """A rule for ignoring issues (one row of the ignore file)."""

    def __init__(self, docname, lineno, issue, line):
        """Store the four fields of a rule and mark it as not yet used."""
        self.docname = docname  # document to which this rule applies
        self.lineno = lineno    # line number in the original source;
                                # this rule matches only near that.
                                # None -> don't care
        self.issue = issue      # the markup fragment that triggered this rule
        self.line = line        # text of the container element (single line only)
        self.used = False       # flipped to True once the rule matches an issue

    def __repr__(self):
        # Rendered with an empty line-number field (",,").
        return '{0.docname},,{0.issue},{0.line}'.format(self)


class dialect(csv.excel):
    """Our dialect: uses only linefeed as newline."""
    lineterminator = '\n'


class CheckSuspiciousMarkupBuilder(Builder):
    """
    Checks for possibly invalid markup that may leak into the output.
    """
    name = 'suspicious'
    logger = sphinx.util.logging.getLogger("CheckSuspiciousMarkupBuilder")

    def init(self):
        """Create an empty report file and load the ignore rules."""
        # create (or truncate) the output file
        self.log_file_name = os.path.join(self.outdir, 'suspicious.csv')
        with open(self.log_file_name, 'w'):
            pass
        # load database of previously ignored issues
        self.load_rules(os.path.join(os.path.dirname(__file__), '..',
                                     'susp-ignored.csv'))

    def get_outdated_docs(self):
        """Return every found document, so that all of them are checked."""
        return self.env.found_docs

    def get_target_uri(self, docname, typ=None):
        """Return an empty URI; this builder does not use target URIs."""
        return ''

    def prepare_writing(self, docnames):
        """Nothing to prepare."""
        pass

    def write_doc(self, docname, doctree):
        """Walk ``doctree`` and check every text container in it."""
        # set when any issue is encountered in this document
        self.any_issue = False
        self.docname = docname
        visitor = SuspiciousVisitor(doctree, self)
        doctree.walk(visitor)

    def finish(self):
        """Warn about ignore rules that never matched any issue."""
        unused_rules = [rule for rule in self.rules if not rule.used]
        if unused_rules:
            self.logger.warning(
                'Found %s/%s unused rules: %s' % (
                    len(unused_rules), len(self.rules),
                    '\n'.join(repr(rule) for rule in unused_rules),
                )
            )

    def check_issue(self, line, lineno, issue):
        """Report ``issue`` unless an ignore rule matches it."""
        if not self.is_ignored(line, lineno, issue):
            self.report_issue(line, lineno, issue)

    def is_ignored(self, line, lineno, issue):
        """Determine whether this issue should be ignored.

        The first matching rule is marked as used and True is returned;
        False is returned when no rule matches.
        """
        docname = self.docname
        for rule in self.rules:
            if rule.docname != docname:
                continue
            if rule.issue != issue:
                continue
            # Requiring both lines to match *exactly* proved too strict,
            # and fuzzy matching (e.g. levenshtein distance) would mean
            # bringing other libraries; so just check if the rule fragment
            # is contained in the document line.
            if rule.line not in line:
                continue
            # Check both line numbers. If they're "near"
            # this rule matches. (lineno=None means "don't care")
            if rule.lineno is not None and abs(rule.lineno - lineno) > 5:
                continue
            # if it came this far, the rule matched
            rule.used = True
            return True
        return False

    def report_issue(self, text, lineno, issue):
        """Record an issue in the report file, log it and flag failure."""
        self.any_issue = True
        self.write_log_entry(lineno, issue, text)
        # the context text is truncated to 120 characters in the warning
        self.logger.warning('[%s:%d] "%s" found in "%-.120s"' %
                            (self.docname, lineno, issue, text))
        self.app.statuscode = 1

    def write_log_entry(self, lineno, issue, text):
        """Append one row (docname, lineno, issue, text) to the report."""
        # utf-8 as documented in the module docstring; newline='' so that
        # the dialect's linefeed-only terminator is written untranslated.
        with open(self.log_file_name, 'a', encoding='utf-8',
                  newline='') as f:
            writer = csv.writer(f, dialect)
            writer.writerow([self.docname, lineno, issue, text.strip()])

    def load_rules(self, filename):
        """Load database of previously ignored issues.

        A csv file, with exactly the same format as suspicious.csv
        Fields: document name (normalized), line number, issue, surrounding text

        A file that cannot be opened leaves the rule list empty.
        Raises ValueError when a row does not have exactly four fields.
        """
        self.logger.info("loading ignore rules... ", nonl=1)
        self.rules = rules = []
        try:
            f = open(filename, 'r', encoding='utf-8', newline='')
        except IOError:
            # terminate the progress line started above
            self.logger.info('no rules file found')
            return
        # the context manager closes the file even if a row is malformed
        with f:
            for i, row in enumerate(csv.reader(f)):
                if len(row) != 4:
                    raise ValueError(
                        "wrong format in %s, line %d: %s" % (filename, i+1, row))
                docname, lineno, issue, text = row
                # an empty line-number field means "match anywhere"
                lineno = int(lineno) if lineno else None
                rules.append(Rule(docname, lineno, issue, text))
        self.logger.info('done, %d rules loaded' % len(self.rules))


def get_lineno(node):
    """Obtain line number information for a node.

    Walks up through the ancestors of ``node`` (the node's own ``line``
    is not consulted) and returns the first ``line`` that is not None,
    or None if the top of the tree is reached without finding one.
    """
    lineno = None
    while lineno is None and node is not None:
        node = node.parent
        if node is None:
            # ran past the root without finding any line information
            break
        lineno = node.line
    return lineno


def extract_line(text, index):
    r"""text may be a multiline string; extract
    only the line containing the given character index.

    >>> extract_line("abc\ndefgh\ni", 6)
    'defgh'
    >>> for i in (0, 2, 3, 4, 10):
    ...     print(extract_line("abc\ndefgh\ni", i))
    abc
    abc
    abc
    defgh
    i
    """
    # start just after the previous newline (0 when there is none)
    p = text.rfind('\n', 0, index) + 1
    # end at the next newline, or at the end of the text
    q = text.find('\n', index)
    if q < 0:
        q = len(text)
    return text[p:q]


class SuspiciousVisitor(nodes.GenericNodeVisitor):
    """Node visitor that feeds suspicious fragments to the builder."""

    lastlineno = 0

    def __init__(self, document, builder):
        nodes.GenericNodeVisitor.__init__(self, document)
        self.builder = builder

    def default_visit(self, node):
        """Scan text containers for suspicious markup; skip other nodes."""
        if not isinstance(node, (nodes.Text, nodes.image)):
            return
        # direct text containers
        text = node.astext()
        # lineno seems to go backwards sometimes (?)
        self.lastlineno = lineno = max(get_lineno(node) or 0, self.lastlineno)
        seen = set()  # don't report the same issue more than once per line
        for match in detect_all(text):
            issue = match.group()
            line = extract_line(text, match.start())
            if (issue, line) not in seen:
                self.builder.check_issue(line, lineno, issue)
                seen.add((issue, line))

    unknown_visit = default_visit

    def visit_document(self, node):
        """Reset the running line number at the start of a document."""
        self.lastlineno = 0

    def visit_comment(self, node):
        # ignore comments -- too much false positives.
        # (although doing this could miss some errors;
        # there were two sections "commented-out" by mistake
        # in the Python docs that would not be caught)
        raise nodes.SkipNode