o 0¦iÌRã@sÞdZddlZddlZddlmZdgZe d¡Ze d¡Ze d¡Z e d¡Z e d ¡Z e d ¡Z e d ¡Z e d ¡Ze d ¡Ze d¡Ze dej¡Ze dej¡Ze dej¡Ze d ¡Ze d¡ZGdd„dejƒZdS)zA parser for HTML and XHTML.éN)ÚunescapeÚ HTMLParserz[&<]z &[a-zA-Z#]z%&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]z)&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]z <[a-zA-Z]z z--!?>z-?>z0([a-zA-Z][^\t\n\r\f />]*)(?:[\t\n\r\f ]|/(?!>))*a{ ( (?<=['"\t\n\r\f /])[^\t\n\r\f />][^\t\n\r\f /=>]* # attribute name ) ([\t\n\r\f ]*=[\t\n\r\f ]* # value indicator ('[^']*' # LITA-enclosed value |"[^"]*" # LIT-enclosed value |(?!['"])[^>\t\n\r\f ]* # bare value ) )? (?:[\t\n\r\f ]|/(?!>))* # possibly followed by a space a [a-zA-Z][^\t\n\r\f />]* # tag name [\t\n\r\f /]* # optional whitespace before attribute name (?:(?<=['"\t\n\r\f /])[^\t\n\r\f />][^\t\n\r\f /=>]* # attribute name (?:[\t\n\r\f ]*=[\t\n\r\f ]* # value indicator (?:'[^']*' # LITA-enclosed value |"[^"]*" # LIT-enclosed value |(?!['"])[^>\t\n\r\f ]* # bare value ) )? [\t\n\r\f /]* # possibly followed by a space )* >? aF <[a-zA-Z][^\t\n\r\f />\x00]* # tag name (?:[\s/]* # optional whitespace before attribute name (?:(?<=['"\s/])[^\s/>][^\s/=>]* # attribute name (?:\s*=+\s* # value indicator (?:'[^']*' # LITA-enclosed value |"[^"]*" # LIT-enclosed value |(?!['"])[^>\s]* # bare value ) \s* # possibly followed by a space )?(?:\s|/(?!>))* )* )? \s* # trailing whitespace z#c@seZdZdZdZdZdddœdd„Zd d „Zd d „Zd d„Z dZ dd„Z ddœdd„Z dd„Z d>dd„Zdd„Zdd„Zd>dd„Zd?d d!„Zd"d#„Zd$d%„Zd&d'„Zd(d)„Zd*d+„Zd,d-„Zd.d/„Zd0d1„Zd2d3„Zd4d5„Zd6d7„Zd8d9„Zd:d;„Zd'.)Ú_HTMLParser__starttag_textrr r r Úget_starttag_textszHTMLParser.get_starttag_text©Ú escapablecCst| ¡|_||_|jdkrt d¡|_dS|r*|js*t d|jtjtjB¡|_dSt d|jtjtjB¡|_dS)NÚ plaintextz\Zz&|])z])) ÚlowerrrÚreÚcompilerrÚ IGNORECASEÚASCII)r Úelemr!r r r Úset_cdata_mode¡s      ÿ   ÿzHTMLParser.set_cdata_modecCst|_d|_d|_dS)NT)rrrrrr r r Úclear_cdata_mode­s zHTMLParser.clear_cdata_modecCs ||_dS)aEnable or disable support of the CDATA sections. If enabled, "<[CDATA[" starts a CDATA section which ends with "]]>". If disabled, "<[CDATA[" starts a bogus comments which ends with ">". This method is not called by default. Its purpose is to be called in custom handle_starttag() and handle_endtag() methods, with value that depends on the adjusted current node. See https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state for details. N)r)r Úflagr r r Ú_set_support_cdata²s zHTMLParser._set_support_cdatac Cs|j}d}t|ƒ}||krY|jr;|js;| d|¡}|dkr:| dt||dƒ¡}|dkr8t d¡  ||¡s8n!|}n|j   ||¡}|rI|  ¡}n|jrNn |}||kro|jrf|j rf|  t|||…ƒ¡n |  |||…¡| ||¡}||kr{nÞ|j}|d|ƒrt ||¡r| |¡} n@|d|ƒr›| |¡} n5|d|ƒr¦| |¡} n*|d|ƒr±| |¡} n|d |ƒr¼| |¡} n|d |ksÄ|rÎ|  d¡|d } nn‰| dkr‰|sÙn€t ||¡ràn§|d|ƒr|d |krò|  d¡n•t ||¡rùnŽ| ||d d…¡n‚|d|ƒr0|}d D]} | | |d ¡r"|t| ƒ8}nq| ||d |…¡nW|d|ƒrF|jrF| ||dd…¡nA|||d… ¡dkr_| ||d d…¡n(|d |ƒrq| ||d d…¡n|d|ƒrƒ| ||d d…¡nt dƒ‚|} | || ¡}nÅ|d|ƒràt! ||¡}|rÃ| "¡d d…} | #| ¡| $¡} |d| d ƒs¼| d } | || ¡}q d||d…vrß|  |||d …¡| ||d ¡}ny|d|ƒrQt% ||¡}|r| "d ¡} | &| ¡| $¡} |d| d ƒs | d } | || ¡}q t' ||¡}|r;|r:| "¡||d…kr:| $¡} | |kr2|} | ||d ¡}n|d |krP|  d¡| ||d ¡}nnJdƒ‚||ks|r„||kr„|jru|j ru|  t|||…ƒ¡n |  |||…¡| ||¡}||d…|_dS)Nrú<ú&é"z [\t\n\r\f ;]úrr;r8r:rrzrOr#rPrMÚparse_bogus_comment)r rZrr\Zgtposr r r rKNs4 ÿ z!HTMLParser.parse_html_declarationcCsp|j}| d|¡s Jdƒ‚t ||d¡}|s#t ||d¡}|s#dS|r4| ¡}| ||d|…¡| ¡S)Nr1ú"unexpected call to parse_comment()r6r;) rrDÚ commentcloserAÚcommentabruptcloserFrBrMrV)r rZÚreportrrFr\r r r rIpszHTMLParser.parse_commentrcCs`|j}|||d…dvsJdƒ‚| d|d¡}|dkrdS|r,| ||d|…¡|dS)Nr4)r3r0rbrr;r)rr>rM)r rZrerÚposr r r raszHTMLParser.parse_bogus_commentcCsd|j}|||d…dksJdƒ‚t ||d¡}|sdS| ¡}| ||d|…¡| ¡}|S)Nr4r2zunexpected call to parse_pi()r;)rÚpicloserArBrQrV©r rZrrFr\r r r rJ‹szHTMLParser.parse_picCsd|_| |¡}|dkr|S|j}|||…|_g}t ||d¡}|s(Jdƒ‚| ¡}| d¡ ¡|_}||kr–t  ||¡}|sCnS| ddd¡\} } } | sRd} n-| dd…dkrd| dd…ksyn| dd…dkrw| dd…krnn| dd…} | r…t | ƒ} |  |  ¡| f¡| ¡}||ks:|||…  ¡} | d vrÓ|  ¡\} }d |jvrÁ| |j d ¡} t|jƒ|j d ¡}n|t|jƒ}| |||…¡|S|  d ¡rà| ||¡|S| ||¡||jvsö|jrò|d ksö|d krÿ|j|dd|S||jvr |j|dd|S)Nrrz#unexpected call to parse_starttag()r4r8ú'r;ú")rú/>Ú rkZnoscriptr"Fr T)rÚcheck_for_whole_start_tagrÚtagfind_tolerantrFrVrTr#rÚattrfind_tolerantrÚappendÚstripZgetposÚcountr=r?rCrNÚhandle_startendtagÚhandle_starttagÚCDATA_CONTENT_ELEMENTSrr)ÚRCDATA_CONTENT_ELEMENTS)r rZÚendposrÚattrsrFr]ÚtagÚmÚattrnameÚrestZ attrvaluerVÚlinenoÚoffsetr r r rG—sf   &( ó   ÿ   ù ÿ þzHTMLParser.parse_starttagcCs>|j}t ||d¡}|sJ‚| ¡}||ddkrdS|S)Nrrr;)rÚ locatetagendrFrVrhr r r rmÐsz$HTMLParser.check_for_whole_start_tagcCsà|j}|||d…dksJdƒ‚| d|d¡dkrdSt ||¡s8||d|d…dkr3|dS| |¡St ||d¡}|sDJ‚| ¡}||ddkrRdSt ||d¡}|s^J‚| d¡  ¡}|  |¡|  ¡|S) Nr4r0zunexpected call to parse_endtagrrr;r8r) rr>rLrFrarrVrnrTr#Ú handle_endtagr*)r rZrrFr\ryr r r rHÚs&   zHTMLParser.parse_endtagcCs| ||¡| |¡dS©N)rtr€©r ryrxr r r rsøs zHTMLParser.handle_startendtagcCódSrr r‚r r r rtýózHTMLParser.handle_starttagcCrƒrr )r ryr r r r€r„zHTMLParser.handle_endtagcCrƒrr ©r r_r r r rUr„zHTMLParser.handle_charrefcCrƒrr r…r r r rX r„zHTMLParser.handle_entityrefcCrƒrr rr r r rC r„zHTMLParser.handle_datacCrƒrr rr r r rMr„zHTMLParser.handle_commentcCrƒrr )r Zdeclr r r rPr„zHTMLParser.handle_declcCrƒrr rr r r rQr„zHTMLParser.handle_picCrƒrr rr r r rOr„zHTMLParser.unknown_decl)T)r)!Ú__name__Ú __module__Ú __qualname__Ú__doc__rurvr rrrrrr)r*r,rrKrIrarJrGrmrHrsrtr€rUrXrCrMrPrQrOr r r r rZs@    "   9  )r‰r$rZhtmlrÚ__all__r%rrYrWrSrErLrgrcrdrnÚVERBOSErorZlocatestarttagend_tolerantZ endendtagZ endtagfindrrr r r r Ús6             õ  óò