"""A parser for HTML and XHTML."""

# This file is based on sgmllib.py, but the API is slightly different.

# XXX There should be a way to distinguish between PCDATA (parsed
# character data -- the normal case), RCDATA (replaceable character
# data -- only char and entity references and end tags are special)
# and CDATA (character data -- only end tags are special).


import re
import _markupbase

from html import unescape


__all__ = ['HTMLParser']

# Regular expressions used for parsing

interesting_normal = re.compile('[&<]')
incomplete = re.compile('&[a-zA-Z#]')

entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

starttagopen = re.compile('<[a-zA-Z]')
piclose = re.compile('>')
commentclose = re.compile(r'--\s*>')
# Note:
#  1) if you change tagfind/attrfind remember to update locatestarttagend too;
#  2) if you change tagfind/attrfind and/or locatestarttagend the parser will
#     explode, so don't do it.
# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
tagfind_tolerant = re.compile(r'([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*')
attrfind_tolerant = re.compile(
    r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
    r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')
locatestarttagend_tolerant = re.compile(r"""
  <[a-zA-Z][^\t\n\r\f />\x00]*       # tag name
  (?:[\s/]*                          # optional whitespace before attribute name
    (?:(?<=['"\s/])[^\s/>][^\s/=>]*  # attribute name
      (?:\s*=+\s*                    # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |"[^"]*"                   # LIT-enclosed value
          |(?!['"])[^>\s]*           # bare value
         )
        \s*                          # possibly followed by a space
       )?(?:\s|/(?!>))*
     )*
   )?
  \s*                                # trailing whitespace
""", re.VERBOSE)
endendtag = re.compile('>')
# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
# </ and the tag name, so maybe this should be fixed
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')

# A character reference that is still being received ends at whitespace
# or ';'.  Compiled once here instead of on every pass through goahead().
_charref_terminator = re.compile(r'[\s;]')



class HTMLParser(_markupbase.ParserBase):
    """Find tags and other markup and call handler functions.

    Usage:
        p = HTMLParser()
        p.feed(data)
        ...
        p.close()

    Start tags are handled by calling self.handle_starttag() or
    self.handle_startendtag(); end tags by self.handle_endtag().  The
    data between tags is passed from the parser to the derived class
    by calling self.handle_data() with the data as argument (the data
    may be split up in arbitrary chunks).  If convert_charrefs is
    True the character references are converted automatically to the
    corresponding Unicode character (and self.handle_data() is no
    longer split in chunks), otherwise they are passed by calling
    self.handle_entityref() or self.handle_charref() with the string
    containing respectively the named or numeric reference as the
    argument.
    """

    # Elements whose content is passed through verbatim until the
    # matching end tag is seen (see set_cdata_mode()).
    CDATA_CONTENT_ELEMENTS = ("script", "style")

    def __init__(self, *, convert_charrefs=True):
        """Initialize and reset this instance.

        If convert_charrefs is True (the default), all character references
        are automatically converted to the corresponding Unicode characters.
        """
        self.convert_charrefs = convert_charrefs
        self.reset()

    def reset(self):
        """Reset this instance.  Loses all unprocessed data."""
        self.rawdata = ''
        self.lasttag = '???'
        self.interesting = interesting_normal
        self.cdata_elem = None
        _markupbase.ParserBase.reset(self)

    def feed(self, data):
        r"""Feed data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include '\n').
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle any buffered data."""
        self.goahead(1)

    __starttag_text = None

    def get_starttag_text(self):
        """Return full source of start tag: '<...>'."""
        return self.__starttag_text

    def set_cdata_mode(self, elem):
        """Enter CDATA mode for element *elem*.

        While in this mode the only markup goahead() looks for is the
        end tag of *elem* (matched case-insensitively, with optional
        whitespace around the name).
        """
        self.cdata_elem = elem.lower()
        # The element name is literal text, not a pattern: escape it so
        # that names containing regex metacharacters (e.g. '.') neither
        # over-match nor fail to compile.
        self.interesting = re.compile(
            r'</\s*%s\s*>' % re.escape(self.cdata_elem), re.I)

    def clear_cdata_mode(self):
        """Leave CDATA mode and go back to scanning for '&' and '<'."""
        self.interesting = interesting_normal
        self.cdata_elem = None

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        """Process as much of self.rawdata as possible.

        Dispatches to the parse_*() methods and handle_*() callbacks and
        stores whatever could not be processed yet back in self.rawdata.
        If *end* is true, remaining text is flushed as data (except while
        in CDATA mode, where an unterminated element's content is kept).
        """
        rawdata = self.rawdata
        startswith = rawdata.startswith
        i = 0
        n = len(rawdata)
        while i < n:
            # Step 1: find j, the end of the plain-text run starting at i.
            if self.convert_charrefs and not self.cdata_elem:
                j = rawdata.find('<', i)
                if j < 0:
                    # if we can't find the next <, either we are at the end
                    # or there's more text incoming.  If the latter is True,
                    # we can't pass the text to handle_data in case we have
                    # a charref cut in half at end.  Try to determine if
                    # this is the case before proceeding by looking for an
                    # & near the end and see if it's followed by a space or ;.
                    amppos = rawdata.rfind('&', max(i, n-34))
                    if (amppos >= 0 and
                            not _charref_terminator.search(rawdata, amppos)):
                        break  # wait till we get all the text
                    j = n
            else:
                match = self.interesting.search(rawdata, i)  # < or &
                if match:
                    j = match.start()
                else:
                    if self.cdata_elem:
                        break
                    j = n
            if i < j:
                if self.convert_charrefs and not self.cdata_elem:
                    self.handle_data(unescape(rawdata[i:j]))
                else:
                    self.handle_data(rawdata[i:j])
            i = self.updatepos(i, j)
            if i == n:
                break
            # Step 2: rawdata[i] starts some markup; dispatch on its kind.
            if startswith('<', i):
                if starttagopen.match(rawdata, i):  # < + letter
                    k = self.parse_starttag(i)
                elif startswith("</", i):
                    k = self.parse_endtag(i)
                elif startswith("<!--", i):
                    k = self.parse_comment(i)
                elif startswith("<?", i):
                    k = self.parse_pi(i)
                elif startswith("<!", i):
                    k = self.parse_html_declaration(i)
                elif (i + 1) < n:
                    self.handle_data("<")
                    k = i + 1
                else:
                    break
                if k < 0:
                    # The construct is not terminated yet.
                    if not end:
                        break
                    # At EOF: give up and report it as text, up to and
                    # including the next '>' or up to the next '<'.
                    k = rawdata.find('>', i + 1)
                    if k < 0:
                        k = rawdata.find('<', i + 1)
                        if k < 0:
                            k = i + 1
                    else:
                        k += 1
                    if self.convert_charrefs and not self.cdata_elem:
                        self.handle_data(unescape(rawdata[i:k]))
                    else:
                        self.handle_data(rawdata[i:k])
                i = self.updatepos(i, k)
            elif startswith("&#", i):
                match = charref.match(rawdata, i)
                if match:
                    name = match.group()[2:-1]
                    self.handle_charref(name)
                    k = match.end()
                    # The pattern consumes one terminating character;
                    # only a ';' belongs to the reference itself.
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                else:
                    if rawdata.find(';', i) >= 0:  # bail by consuming &#
                        self.handle_data(rawdata[i:i+2])
                        i = self.updatepos(i, i+2)
                    break
            elif startswith('&', i):
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    k = match.end()
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                match = incomplete.match(rawdata, i)
                if match:
                    # Incomplete reference.  Before EOF, wait for more
                    # input.  At EOF, leave i on the '&' so the flush
                    # after the loop reports the whole tail as data
                    # (skipping the '&' here would silently drop it).
                    break
                elif (i + 1) < n:
                    # not the end of the buffer, and can't be confused
                    # with some other construct
                    self.handle_data("&")
                    i = self.updatepos(i, i + 1)
                else:
                    break
            else:
                # Explicit raise rather than 'assert': this must not be
                # stripped under -O, or the loop would never advance.
                raise AssertionError("interesting.search() lied")
        # end while
        if end and i < n and not self.cdata_elem:
            if self.convert_charrefs:
                self.handle_data(unescape(rawdata[i:n]))
            else:
                self.handle_data(rawdata[i:n])
            i = self.updatepos(i, n)
        self.rawdata = rawdata[i:]

    # Internal -- parse html declarations, return length or -1 if not terminated
    # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state
    # See also parse_declaration in _markupbase
    def parse_html_declaration(self, i):
        """Parse the '<!...' construct at index *i*.

        Dispatches to comment, marked-section, doctype or bogus-comment
        handling.  Returns the index just past the construct, or -1 if
        it is not terminated yet.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<!', ('unexpected call to '
                                        'parse_html_declaration()')
        if rawdata[i:i+4] == '<!--':
            # this case is actually already handled in goahead()
            return self.parse_comment(i)
        elif rawdata[i:i+3] == '<![':
            return self.parse_marked_section(i)
        elif rawdata[i:i+9].lower() == '<!doctype':
            # find the closing >
            gtpos = rawdata.find('>', i+9)
            if gtpos == -1:
                return -1
            self.handle_decl(rawdata[i+2:gtpos])
            return gtpos+1
        else:
            return self.parse_bogus_comment(i)

    # Internal -- parse bogus comment, return length or -1 if not terminated
    # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
    def parse_bogus_comment(self, i, report=1):
        """Parse the bogus comment ('<!...>' or '</...>') at index *i*.

        If *report* is true the text between the two-character opener
        and the closing '>' is passed to handle_comment().  Returns the
        index just past the '>', or -1 if there is no '>' yet.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+2] in ('<!', '</'), ('unexpected call to '
                                                'parse_bogus_comment()')
        pos = rawdata.find('>', i+2)
        if pos == -1:
            return -1
        if report:
            self.handle_comment(rawdata[i+2:pos])
        return pos + 1

    # Internal -- parse processing instr, return end or -1 if not terminated
    def parse_pi(self, i):
        """Parse the processing instruction '<?...>' at index *i*.

        Passes the text between '<?' and the first '>' to handle_pi().
        Returns the index just past the '>', or -1 if not terminated.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
        match = piclose.search(rawdata, i+2)  # >
        if not match:
            return -1
        j = match.start()
        self.handle_pi(rawdata[i+2: j])
        j = match.end()
        return j

    # Internal -- handle starttag, return end or -1 if not terminated
    def parse_starttag(self, i):
        """Parse the start tag at index *i* and report it.

        Calls handle_starttag() or handle_startendtag() with the
        lower-cased tag name and a list of (name, value) attribute
        pairs; value is None for attributes without '=', and character
        references in values are unescaped.  A tag with junk before its
        closing '>' is reported through handle_data() instead.  Returns
        the index just past the tag, or a negative number if the tag is
        not complete yet.
        """
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and endpos into a tag and attrs
        attrs = []
        match = tagfind_tolerant.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = match.group(1).lower()
        while k < endpos:
            m = attrfind_tolerant.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                # strip the surrounding quotes
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # Something other than the tag terminator is left over:
            # treat the whole thing as text.
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
        return endpos

    # Internal -- check to see if we have a complete starttag; return end
    # or -1 if incomplete.
    def check_for_whole_start_tag(self, i):
        """Return the index just past the start tag at *i*, or -1.

        -1 means the buffer ends (or may end) in the middle of the tag
        and more input is needed.  For malformed input the index where
        tag-like text stops is returned.
        """
        rawdata = self.rawdata
        m = locatestarttagend_tolerant.match(rawdata, i)
        if m:
            j = m.end()
            next_char = rawdata[j:j+1]
            if next_char == ">":
                return j + 1
            if next_char == "/":
                if rawdata.startswith("/>", j):
                    return j + 2
                # a '/' that is not (yet) followed by '>': buffer boundary
                return -1
            if next_char == "":
                # end of input
                return -1
            if next_char in ("abcdefghijklmnopqrstuvwxyz=/"
                             "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
                # end of input in or before attribute value, or we have the
                # '/' from a '/>' ending
                return -1
            # bogus input: stop where the match stopped, but always
            # make progress
            if j > i:
                return j
            else:
                return i + 1
        raise AssertionError("we should not get here!")

    # Internal -- parse endtag, return end or -1 if incomplete
    def parse_endtag(self, i):
        """Parse the end tag at index *i* and report it.

        Calls handle_endtag() with the lower-cased tag name and leaves
        CDATA mode when the tag closes the current CDATA element; other
        end tags seen in CDATA mode are reported via handle_data().
        Returns the index just past the tag, or -1 if there is no '>'
        yet.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
        match = endendtag.search(rawdata, i+1)  # >
        if not match:
            return -1
        gtpos = match.end()
        match = endtagfind.match(rawdata, i)  # </ + tag + >
        if not match:
            if self.cdata_elem is not None:
                self.handle_data(rawdata[i:gtpos])
                return gtpos
            # find the name: w3.org/TR/html5/tokenization.html#tag-name-state
            namematch = tagfind_tolerant.match(rawdata, i+2)
            if not namematch:
                # w3.org/TR/html5/tokenization.html#end-tag-open-state
                if rawdata[i:i+3] == '</>':
                    return i+3
                else:
                    return self.parse_bogus_comment(i)
            tagname = namematch.group(1).lower()
            # consume and ignore other stuff between the name and the >
            # Note: this is not 100% correct, since we might have things like
            # </tag attr=">">, but looking for > after the name should cover
            # most of the cases and is much simpler
            gtpos = rawdata.find('>', namematch.end())
            self.handle_endtag(tagname)
            return gtpos+1

        elem = match.group(1).lower()  # script or style
        if self.cdata_elem is not None:
            if elem != self.cdata_elem:
                self.handle_data(rawdata[i:gtpos])
                return gtpos

        self.handle_endtag(elem)
        self.clear_cdata_mode()
        return gtpos

    # Overridable -- finish processing of start+end tag: <tag.../>
    def handle_startendtag(self, tag, attrs):
        """Handle '<tag .../>'; by default report a start and an end tag."""
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)

    # Overridable -- handle start tag
    def handle_starttag(self, tag, attrs):
        """Handle a start tag; *attrs* is a list of (name, value) pairs."""
        pass

    # Overridable -- handle end tag
    def handle_endtag(self, tag):
        """Handle an end tag; *tag* is the lower-cased name."""
        pass

    # Overridable -- handle character reference
    def handle_charref(self, name):
        """Handle '&#NNN;' / '&#xNNN;'; *name* is the text after '&#'."""
        pass

    # Overridable -- handle entity reference
    def handle_entityref(self, name):
        """Handle '&name;'; *name* is the entity name without '&' and ';'."""
        pass

    # Overridable -- handle data
    def handle_data(self, data):
        """Handle a run of text."""
        pass

    # Overridable -- handle comment
    def handle_comment(self, data):
        """Handle a comment; *data* is the text inside the delimiters."""
        pass

    # Overridable -- handle declaration
    def handle_decl(self, decl):
        """Handle a doctype declaration; *decl* is the text inside '<!' '>'."""
        pass

    # Overridable -- handle processing instruction
    def handle_pi(self, data):
        """Handle a processing instruction; *data* is the text after '<?'."""
        pass

    def unknown_decl(self, data):
        """Hook for declarations that are not recognised.

        Not called from this module; presumably invoked by the inherited
        _markupbase marked-section parsing -- verify against the base class.
        """
        pass