o
    Xh0h                     @   s   d dl mZ d dlmZ d dlmZ ddlmZm	Z	m
Z
mZmZmZmZmZmZ ddlmZ eeeZdd Zd	d
 ZG dd dZdS )    )bisect_left)deque)html5   )	EOFTokenascii_lettersascii_upper_to_lowerdigits	hexdigitsreplacement_charactersspace_characterstag_token_types)HTMLInputStreamc                 C   s4   | t v rdS tt|  }tt krdS t| | S )NTF)entitiesr   entity_keyslen
startswithprefixi r   I/var/www/html/rh/venv/lib/python3.10/site-packages/tinyhtml5/tokenizer.pyhas_keys_with_prefix   s
   r   c                 C   sP   | t v r| S tdt| d D ]}| d |  t v r#| d |    S qt| )Nr   )r   ranger   KeyErrorr   r   r   r   longest_prefix   s   r   c                   @   sl  e Zd ZdZdddZdd Zdd Zd	d
 Zdd ZdddZ	dd Z
dd Zdd Zdd Zdd Zdd Zdd Zdd Zd d! Zd"d# Zd$d% Zd&d' Zd(d) Zd*d+ Zd,d- Zd.d/ Zd0d1 Zd2d3 Zd4d5 Zd6d7 Zd8d9 Zd:d; Zd<d= Z d>d? Z!d@dA Z"dBdC Z#dDdE Z$dFdG Z%dHdI Z&dJdK Z'dLdM Z(dNdO Z)dPdQ Z*dRdS Z+dTdU Z,dVdW Z-dXdY Z.dZd[ Z/d\d] Z0d^d_ Z1d`da Z2dbdc Z3ddde Z4dfdg Z5dhdi Z6djdk Z7dldm Z8dndo Z9dpdq Z:drds Z;dtdu Z<dvdw Z=dxdy Z>dzd{ Z?d|d} Z@d~d ZAdd ZBdd ZCdd ZDdd ZEdd ZFdd ZGdd ZHdd ZIdd ZJdd ZKdd ZLdd ZMdd ZNdS )HTMLTokenizerzHTML tokenizer.Nc                 K   s*   t |fi || _|| _| j| _d | _d S N)r   streamparser
data_statestatecurrent_token)selfr   r    kwargsr   r   r   __init__)   s   
zHTMLTokenizer.__init__c                 c   sd    t g | _|  r0| jjrtj| jjddV  | jjs| jr*| j V  | js!|  s
dS dS )zThis is where the magic happens.

        We do our usually processing through the states and when we have a token
        to return we yield the token which pauses processing until the next token
        is requested.

        r   typedataN)	r   token_queuer"   r   errorsr   PARSE_ERRORpoppopleftr$   r   r   r   __iter__1   s   
zHTMLTokenizer.__iter__c                 K   s(   t j|d}|r||d< | j| dS )z%Add a parse error to the token queue.r'   datavarsN)r   r,   r*   append)r$   _datar1   tokenr   r   r   parse_errorE   s   zHTMLTokenizer.parse_errorc                 C   s   | j tj|d dS )z+Add a characters string to the token queue.r'   N)r*   r2   r   
CHARACTERS)r$   r3   r   r   r   
charactersL   s   zHTMLTokenizer.charactersc                 C   sL  |rt nt}|r
dnd}g }| j }||v r%|| | j }||v std||}|tv r=t| }| jd|d nXd|  krGdksLn |dkrVd	}| jd|d n?d
|  kr`dksn d|  krkdksn d|  krvdksn d|  krdksn |t	g dv r| jd|d t
|}|dkr| d | j| |S )zReturn either U+FFFD or the character based on the representation.

        It also discards ";" if present. If not present self.parse_error is
        invoked.

           
    z$illegal-codepoint-for-numeric-entity)integeri   i      �r                  i  i  )#   i  i  i i i i i i i i i i i i i i i i i	 i	 i
 i
 i i i i i i i i i i i r<   ;z numeric-entity-without-semicolon)r   r
   r   	characterr2   intjoinr   r5   	frozensetchrunget)r$   is_hexallowedradixstackrE   r;   replacementr   r   r   consume_number_entityP   s8   



z#HTMLTokenizer.consume_number_entityFc                 C   st  d}| j  g}|d tddgtR v p|d uo||d k}|r)| j |d  n|d dkrwd}|| j   |d dv rId}|| j   |d |rPtntv ra| j |d  | |}n| 	d	 | j |
  dd
| }n|d turtd
|sn|| j   |d tus}ztd
|d d }W n ty   | 	d | j |
  dd
| }Y nTw |d dkr| 	d t|}|| tv p|| tv p|| dk}	|d dkr|r|	r| j |
  dd
| }n| j |
  t|  d
||d   }|r$| jd d d  |7  < d S |tv r+dnd}
| jt|
 |d d S )N&r   <#F)xXTzexpected-numeric-entityr:   zexpected-named-entityrD   znamed-entity-without-semicolon=r)   r   SPACE_CHARACTERSr6   r'   )r   rE   r   r   rJ   r2   r   r
   rP   r5   r-   rG   r   r   r   r   r   r   r#   r*   r   )r$   rL   from_attributeoutputrN   rJ   hexentity_nameentity_lengthallowed_characterr(   r   r   r   consume_entity   s`   
	



zHTMLTokenizer.consume_entityc                 C   s   | j |dd dS )z5Replace the need for entity_in_attribute_value_state.T)rL   rY   N)r_   )r$   rL   r   r   r   process_entity_in_attribute   s   z)HTMLTokenizer.process_entity_in_attributec                 C   s   | j }|d tv rP|d t|d< |d tjkr7|d }t|}t|t|kr3||ddd  ||d< |d tj	krP|d rG| 
d |d rP| 
d | j| | j| _dS )	zThis method is a generic handler for emitting the tags.

        It also sets the state to "data" because that's what's needed after a
        token has been emitted.

        r(   namer)   NrT   zattributes-in-end-tagselfClosingzself-closing-flag-on-end-tag)r#   r   	translater	   r   	START_TAGdictr   updateEND_TAGr5   r*   r2   r!   r"   )r$   r4   rawr)   r   r   r   emit_current_token   s    

z HTMLTokenizer.emit_current_tokenc                 C   s   | j  }|dkr| j| _dS |dkr| j| _dS |dkr)| d | d dS |tu r/dS |tv rF| j	
tj|| j td d dS | j d}| ||  dS )	NrQ   rR    invalid-codepointFTr'   rQ   rR   rj   )r   rE   entity_data_stater"   tag_open_stater5   r7   r   r   r*   r2   r   rX   chars_untilr$   r)   r7   r   r   r   r!      s,   



zHTMLTokenizer.data_statec                 C      |    | j| _dS NT)r_   r!   r"   r/   r   r   r   rm        zHTMLTokenizer.entity_data_statec                 C   s   | j  }|dkr| j| _dS |dkr| j| _dS |tu rdS |dkr/| d | d dS |tv rF| j	
tj|| j td d dS | j d	}| ||  dS )
NrQ   rR   Frj   rk   r=   Tr'   rl   )r   rE   $character_reference_in_rc_data_stater"   rcdata_less_than_sign_stater   r5   r7   r   r*   r2   r   rX   ro   )r$   r)   charsr   r   r   rcdata_state  s,   



zHTMLTokenizer.rcdata_statec                 C   rq   rr   )r_   rw   r"   r/   r   r   r   rt   (  rs   z2HTMLTokenizer.character_reference_in_rc_data_statec                 C   h   | j  }|dkr| j| _dS |dkr| d | d dS |tu r%dS | j d}| ||  dS NrR   rj   rk   r=   F)rR   rj   T)r   rE   rawtext_less_than_sign_stater"   r5   r7   r   ro   rp   r   r   r   rawtext_state-     
	

zHTMLTokenizer.rawtext_statec                 C   rx   ry   )r   rE    script_data_less_than_sign_stater"   r5   r7   r   ro   rp   r   r   r   script_data_state;  r|   zHTMLTokenizer.script_data_statec                 C   sP   | j  }|tu rdS |dkr| d | d dS | || j d  dS )NFrj   rk   r=   T)r   rE   r   r5   r7   ro   r$   r)   r   r   r   plaintext_stateI  s   


zHTMLTokenizer.plaintext_statec                 C   s   | j  }|dkr| j| _dS |dkr| j| _dS |tv r-tj|g ddd| _| j	| _dS |dkrA| 
d | d | j| _dS |dkrV| 
d	 | j | | j| _dS | 
d
 | d | j | | j| _dS )N!/F)r(   ra   r)   rb   selfClosingAcknowledged>z'expected-tag-name-but-got-right-bracketz<>?z'expected-tag-name-but-got-question-markzexpected-tag-namerR   T)r   rE   markup_declaration_open_stater"   close_tag_open_stater   r   rd   r#   tag_name_stater5   r7   r!   rJ   bogus_comment_stater   r   r   r   rn   T  s>   





zHTMLTokenizer.tag_open_statec                 C   s   | j  }|tv rtj|g dd| _| j| _d	S |dkr'| d | j	| _d	S |t
u r;| d | d | j	| _d	S | jd|d | j | | j| _d	S )
NFr(   ra   r)   rb   r   z*expected-closing-tag-but-got-right-bracketz expected-closing-tag-but-got-eof</z!expected-closing-tag-but-got-charr)   T)r   rE   r   r   rg   r#   r   r"   r5   r!   r   r7   rJ   r   r   r   r   r   r   w  s,   




z"HTMLTokenizer.close_tag_open_statec                 C   s   | j  }|tv r| j| _dS |dkr|   dS |tu r(| d | j| _dS |dkr2| j	| _dS |dkrF| d | j
d  d7  < dS | j
d  |7  < dS )	Nr   zeof-in-tag-namer   rj   rk   ra   r=   T)r   rE   r   before_attribute_name_stater"   ri   r   r5   r!   self_closing_start_tag_stater#   r   r   r   r   r     s(   



zHTMLTokenizer.tag_name_statec                 C   F   | j  }|dkrd| _| j| _dS | d | j | | j| _dS Nr   r:   rR   T)r   rE   temporary_bufferrcdata_end_tag_open_stater"   r7   rJ   rw   r   r   r   r   ru        

z)HTMLTokenizer.rcdata_less_than_sign_statec                 C   N   | j  }|tv r|  j|7  _| j| _dS | d | j | | j| _dS Nr   T)	r   rE   r   r   rcdata_end_tag_name_stater"   r7   rJ   rw   r   r   r   r   r        

z'HTMLTokenizer.rcdata_end_tag_open_statec                 C      | j o| j d  | j k}| j }|tv r*|r*tj| jg dd| _ | j| _	dS |dkr@|r@tj| jg dd| _ | j
| _	dS |dkrZ|rZtj| jg dd| _ |   | j| _	dS |tv rg|  j|7  _dS | d| j  | j| | j| _	dS Nra   Fr   r   r   r   T)r#   lowerr   r   rE   r   r   rg   r   r"   r   ri   r!   r   r7   rJ   rw   r$   appropriater)   r   r   r   r     H   
z'HTMLTokenizer.rcdata_end_tag_name_statec                 C   r   r   )r   rE   r   rawtext_end_tag_open_stater"   r7   rJ   r{   r   r   r   r   rz     r   z*HTMLTokenizer.rawtext_less_than_sign_statec                 C   r   r   )	r   rE   r   r   rawtext_end_tag_name_stater"   r7   rJ   r{   r   r   r   r   r     r   z(HTMLTokenizer.rawtext_end_tag_open_statec                 C   r   r   )r#   r   r   r   rE   r   r   rg   r   r"   r   ri   r!   r   r7   rJ   r{   r   r   r   r   r     r   z(HTMLTokenizer.rawtext_end_tag_name_statec                 C   sd   | j  }|dkrd| _| j| _dS |dkr!| d | j| _dS | d | j | | j| _dS )Nr   r:   r   z<!rR   T)	r   rE   r   script_data_end_tag_open_stater"   r7   script_data_escape_start_staterJ   r~   r   r   r   r   r}     s   


z.HTMLTokenizer.script_data_less_than_sign_statec                 C   r   r   )	r   rE   r   r   script_data_end_tag_name_stater"   r7   rJ   r~   r   r   r   r   r   )  r   z,HTMLTokenizer.script_data_end_tag_open_statec                 C   r   r   )r#   r   r   r   rE   r   r   rg   r   r"   r   ri   r!   r   r7   rJ   r~   r   r   r   r   r   4  r   z,HTMLTokenizer.script_data_end_tag_name_statec                 C   @   | j  }|dkr| d | j| _dS | j | | j| _dS N-T)r   rE   r7   #script_data_escape_start_dash_stater"   rJ   r~   r   r   r   r   r   Z     

z,HTMLTokenizer.script_data_escape_start_statec                 C   r   r   )r   rE   r7   #script_data_escaped_dash_dash_stater"   rJ   r~   r   r   r   r   r   d  r   z1HTMLTokenizer.script_data_escape_start_dash_statec                 C   s   | j  }|dkr| d | j| _dS |dkr| j| _dS |dkr.| d | d dS |tu r8| j| _dS | || j 	d  dS )Nr   rR   rj   rk   r=   )rR   r   rj   T)
r   rE   r7   script_data_escaped_dash_stater"   (script_data_escaped_less_than_sign_stater5   r   r!   ro   r   r   r   r   script_data_escaped_staten  s"   




z'HTMLTokenizer.script_data_escaped_statec                 C   s   | j  }|dkr| d | j| _dS |dkr| j| _dS |dkr2| d | d | j| _dS |tu r<| j	| _dS | | | j| _dS )Nr   rR   rj   rk   r=   T)
r   rE   r7   r   r"   r   r5   r   r   r!   r   r   r   r   r   ~  s&   





z,HTMLTokenizer.script_data_escaped_dash_statec                 C   s   | j  }|dkr| d dS |dkr| j| _dS |dkr)| d | j| _dS |dkr=| d | d | j| _dS |tu rG| j	| _dS | | | j| _dS )Nr   rR   r   rj   rk   r=   T)
r   rE   r7   r   r"   r~   r5   r   r   r!   r   r   r   r   r     s,   






z1HTMLTokenizer.script_data_escaped_dash_dash_statec                 C   sp   | j  }|dkrd| _| j| _dS |tv r'| d|  || _| j| _dS | d | j | | j	| _dS r   )
r   rE   r   &script_data_escaped_end_tag_open_stater"   r   r7   %script_data_double_escape_start_staterJ   r   r   r   r   r   r     s   
	
z6HTMLTokenizer.script_data_escaped_less_than_sign_statec                 C   sF   | j  }|tv r|| _| j| _dS | d | j | | j| _dS r   )	r   rE   r   r   &script_data_escaped_end_tag_name_stater"   r7   rJ   r   r   r   r   r   r     r   z4HTMLTokenizer.script_data_escaped_end_tag_open_statec                 C   r   r   )r#   r   r   r   rE   r   r   rg   r   r"   r   ri   r!   r   r7   rJ   r   r   r   r   r   r     r   z4HTMLTokenizer.script_data_escaped_end_tag_name_statec                 C      | j  }|ttdB v r%| | | j dkr| j| _dS | j	| _dS |t
v r7| | |  j|7  _dS | j | | j	| _dS N)r   r   scriptT)r   rE   r   rH   r7   r   r    script_data_double_escaped_stater"   r   r   rJ   r   r   r   r   r        

	
z3HTMLTokenizer.script_data_double_escape_start_statec                 C   s   | j  }|dkr| d | j| _dS |dkr#| d | j| _dS |dkr3| d | d dS |tu rB| d | j| _dS | | dS Nr   rR   rj   rk   r=   eof-in-script-in-scriptT)	r   rE   r7   %script_data_double_escaped_dash_stater"   /script_data_double_escaped_less_than_sign_stater5   r   r!   r   r   r   r   r     s&   


	



z.HTMLTokenizer.script_data_double_escaped_statec                 C   s   | j  }|dkr| d | j| _dS |dkr#| d | j| _dS |dkr7| d | d | j| _dS |tu rF| d | j	| _dS | | | j| _dS r   )
r   rE   r7   *script_data_double_escaped_dash_dash_stater"   r   r5   r   r   r!   r   r   r   r   r     s*   






z3HTMLTokenizer.script_data_double_escaped_dash_statec                 C   s   | j  }|dkr| d dS |dkr| d | j| _dS |dkr.| d | j| _dS |dkrB| d | d | j| _dS |tu rQ| d | j	| _dS | | | j| _dS )	Nr   rR   r   rj   rk   r=   r   T)
r   rE   r7   r   r"   r~   r5   r   r   r!   r   r   r   r   r     s0   







z8HTMLTokenizer.script_data_double_escaped_dash_dash_statec                 C   sF   | j  }|dkr| d d| _| j| _dS | j | | j| _dS )Nr   r:   T)r   rE   r7   r   #script_data_double_escape_end_stater"   rJ   r   r   r   r   r   r   0  s   

z=HTMLTokenizer.script_data_double_escaped_less_than_sign_statec                 C   r   r   )r   rE   r   rH   r7   r   r   r   r"   r   r   rJ   r   r   r   r   r   ;  r   z1HTMLTokenizer.script_data_double_escape_end_statec                 C   s  | j  }|tv r| j td dS |tv r&| jd |dg | j| _dS |dkr0| 	  dS |dkr:| j
| _dS |dv rS| d | jd |dg | j| _dS |dkrl| d	 | jd d
dg | j| _dS |tu r{| d | j| _dS | jd |dg | j| _dS )NTr)   r:   r   r   )'"rW   rR   #invalid-character-in-attribute-namerj   rk   r=   z#expected-attribute-name-but-got-eof)r   rE   r   ro   r   r#   r2   attribute_name_stater"   ri   r   r5   r   r!   r   r   r   r   r   K  s>   



z)HTMLTokenizer.before_attribute_name_statec                 C   s  | j  }d}d}|dkr| j| _n|tv r-| jd d d  || j td 7  < d}nh|dkr4d}na|tv r=| j| _nX|dkrF| j	| _nO|d	kr_| 
d
 | jd d d  d7  < d}n6|dv rx| 
d | jd d d  |7  < d}n|tu r| 
d | j| _n| jd d d  |7  < d}|r| jd d d t| jd d d< | jd d d D ]\}}| jd d d |kr| 
d  nq|r|   dS )NTFrW   r)   rT   r   r   r   rj   rk   r=   r   r   rR   r   zeof-in-attribute-namezduplicate-attribute)r   rE   before_attribute_value_stater"   r   r#   ro   r   after_attribute_name_stater   r5   r   r!   rc   r	   ri   )r$   r)   leaving_this_state
emit_tokenra   _r   r   r   r   f  sP   








z"HTMLTokenizer.attribute_name_statec                 C   s*  | j  }|tv r| j td dS |dkr| j| _dS |dkr&|   dS |tv r:| jd 	|dg | j
| _dS |dkrD| j| _dS |dkr]| d | jd 	d	dg | j
| _dS |d
v rv| d | jd 	|dg | j
| _dS |tu r| d | j| _dS | jd 	|dg | j
| _dS )NTrW   r   r)   r:   r   rj   rk   r=   r   z&invalid-character-after-attribute-namezexpected-end-of-tag-but-got-eof)r   rE   r   ro   r   r"   ri   r   r#   r2   r   r   r5   r   r!   r   r   r   r   r     sD   



z(HTMLTokenizer.after_attribute_name_statec                 C   s>  | j  }|tv r| j td dS |dkr| j| _dS |dkr,| j| _| j | dS |dkr6| j| _dS |dkrE| 	d | 
  dS |dkra| 	d | jd	 d
 d  d7  < | j| _dS |dv r}| 	d | jd	 d
 d  |7  < | j| _dS |tu r| 	d | j| _dS | jd	 d
 d  |7  < | j| _dS )NTr   rQ   r   r   z.expected-attribute-value-but-got-right-bracketrj   rk   r)   rT   r   r=   )rW   rR   `z"equals-in-unquoted-attribute-valuez$expected-attribute-value-but-got-eof)r   rE   r   ro   #attribute_value_double_quoted_stater"   attribute_value_unquoted_staterJ   #attribute_value_single_quoted_stater5   ri   r#   r   r!   r   r   r   r   r     sF   




z*HTMLTokenizer.before_attribute_value_statec                 C      | j  }|dkr| j| _dS |dkr| d dS |dkr2| d | jd d d  d7  < dS |tu rA| d	 | j| _dS | jd d d  || j 	d
 7  < dS )Nr   rQ   rj   rk   r)   rT   r   r=   z#eof-in-attribute-value-double-quote)r   rQ   rj   T
r   rE   after_attribute_value_stater"   r`   r5   r#   r   r!   ro   r   r   r   r   r     &   




z1HTMLTokenizer.attribute_value_double_quoted_statec                 C   r   )Nr   rQ   rj   rk   r)   rT   r   r=   z#eof-in-attribute-value-single-quote)r   rQ   rj   Tr   r   r   r   r   r     r   z1HTMLTokenizer.attribute_value_single_quoted_statec                 C   s   | j  }|tv r| j| _dS |dkr| d dS |dkr$|   dS |dv r<| d | jd d d  |7  < dS |dkrT| d	 | jd d d  d
7  < dS |t	u rc| d | j
| _dS | jd d d  || j tdtB  7  < dS )NrQ   r   )r   r   rW   rR   r   z0unexpected-character-in-unquoted-attribute-valuer)   rT   r   rj   rk   r=   z eof-in-attribute-value-no-quotes)rQ   r   r   r   rW   rR   r   rj   T)r   rE   r   r   r"   r`   ri   r5   r#   r   r!   ro   rH   r   r   r   r   r     s<   



	
z,HTMLTokenizer.attribute_value_unquoted_statec                 C   s   | j  }|tv r| j| _dS |dkr|   dS |dkr#| j| _dS |tu r8| d | j 	| | j
| _dS | d | j 	| | j| _dS )Nr   r   z$unexpected-eof-after-attribute-valuez*unexpected-character-after-attribute-valueT)r   rE   r   r   r"   ri   r   r   r5   rJ   r!   r   r   r   r   r     s&   
	

z)HTMLTokenizer.after_attribute_value_statec                 C   st   | j  }|dkrd| jd< |   dS |tu r)| d | j | | j| _dS | d | j | | j	| _dS )Nr   Trb   z#unexpected-eof-after-solidus-in-tagz)unexpected-character-after-solidus-in-tag)
r   rE   r#   ri   r   r5   rJ   r!   r"   r   r   r   r   r   r     s   

	

z*HTMLTokenizer.self_closing_start_tag_statec                 C   sB   | j d}|dd}| jtj|d | j   | j| _	dS )Nr   rj   r=   r'   T)
r   ro   replacer*   r2   r   COMMENTrE   r!   r"   r   r   r   r   r   -  s   
z!HTMLTokenizer.bogus_comment_statec                 C   sn  | j  g}|d dkr(|| j   |d dkr'tjdd| _| j| _dS nx|d rb|d dv rbd}dD ]}|| j   |d rJ|d |vrNd} nq6|ratjdd d dd	| _| j	| _dS n>|d d
kr| j
d ur| j
jjr| j
jjd j| j
jjkrd}dD ]}|| j   |d |krd} nq|r| j| _dS | d |r| j |  |s| j| _dS )NrT   r   r:   r'   TdD)oOcCtTyYpPeEF)r(   ra   publicIdsystemIdcorrect[zCDATA[zexpected-dashes-or-doctype)r   rE   r2   r   r   r#   comment_start_stater"   DOCTYPEdoctype_stater    treeopen_elements	namespacedefault_namespacecdata_section_stater5   rJ   r-   r   )r$   rN   matchedexpectedr   r   r   r   ;  sb   


z+HTMLTokenizer.markup_declaration_open_statec                 C   s   | j  }|dkr| j| _d	S |dkr#| d | jd  d7  < d	S |dkr9| d | j| j | j| _d	S |t	u rO| d | j| j | j| _d	S | jd  |7  < | j
| _d	S )
Nr   rj   rk   r)   r=   r   incorrect-commenteof-in-commentT)r   rE   comment_start_dash_stater"   r5   r#   r*   r2   r!   r   comment_stater   r   r   r   r   j  s*   



z!HTMLTokenizer.comment_start_statec                 C   s   | j  }|dkr| j| _d	S |dkr#| d | jd  d7  < d	S |dkr9| d | j| j | j| _d	S |t	u rO| d | j| j | j| _d	S | jd  d| 7  < | j
| _d	S )
Nr   rj   rk   r)      -�r   r   r   T)r   rE   comment_end_stater"   r5   r#   r*   r2   r!   r   r   r   r   r   r   r   ~  s*   



z&HTMLTokenizer.comment_start_dash_statec                 C   s   | j  }|dkr| j| _dS |dkr#| d | jd  d7  < dS |tu r9| d | j| j | j	| _dS | jd  || j 
d 7  < dS )	Nr   rj   rk   r)   r=   r   )r   rj   T)r   rE   comment_end_dash_stater"   r5   r#   r   r*   r2   r!   ro   r   r   r   r   r     s"   



zHTMLTokenizer.comment_statec                 C   s   | j  }|dkr| j| _dS |dkr'| d | jd  d7  < | j| _dS |tu r=| d | j	| j | j
| _dS | jd  d| 7  < | j| _dS )Nr   rj   rk   r)   r   zeof-in-comment-end-dashT)r   rE   r   r"   r5   r#   r   r   r*   r2   r!   r   r   r   r   r     s"   


z$HTMLTokenizer.comment_end_dash_statec                 C   s   | j  }|dkr| j| j | j| _dS |dkr.| d | jd  d7  < | j| _dS |dkr=| d | j	| _dS |dkrQ| d	 | jd  |7  < dS |t
u rg| d
 | j| j | j| _dS | d | jd  d| 7  < | j| _dS )Nr   rj   rk   r)   u   --�r   z,unexpected-bang-after-double-dash-in-commentr   z,unexpected-dash-after-double-dash-in-commentzeof-in-comment-double-dashzunexpected-char-in-commentz--T)r   rE   r*   r2   r#   r!   r"   r5   r   comment_end_bang_stater   r   r   r   r   r     s6   






zHTMLTokenizer.comment_end_statec                 C   s   | j  }|dkr| j| j | j| _d	S |dkr)| jd  d7  < | j| _d	S |dkrA| d | jd  d7  < | j	| _d	S |t
u rW| d | j| j | j| _d	S | jd  d| 7  < | j	| _d	S )
Nr   r   r)   z--!rj   rk   u   --!�zeof-in-comment-end-bang-stateT)r   rE   r*   r2   r#   r!   r"   r   r5   r   r   r   r   r   r   r     s,   


z$HTMLTokenizer.comment_end_bang_statec                 C   sv   | j  }|tv r| j| _dS |tu r*| d d| jd< | j	| j | j
| _dS | d | j | | j| _dS )N!expected-doctype-name-but-got-eofFr   zneed-space-after-doctypeT)r   rE   r   before_doctype_name_stater"   r   r5   r#   r*   r2   r!   rJ   r   r   r   r   r     s   




zHTMLTokenizer.doctype_statec                 C   s   | j  }|tv r	 d
S |dkr'| d d| jd< | j| j | j| _d
S |dkr;| d d| jd< | j	| _d
S |t
u rV| d	 d| jd< | j| j | j| _d
S || jd< | j	| _d
S )Nr   z+expected-doctype-name-but-got-right-bracketFr   rj   rk   r=   ra   r   T)r   rE   r   r5   r#   r*   r2   r!   r"   doctype_name_stater   r   r   r   r   r     s0   




	


z'HTMLTokenizer.before_doctype_name_statec                 C   s   | j  }|tv r| jd t| jd< | j| _d	S |dkr6| jd t| jd< | j	| j | j
| _d	S |dkrN| d | jd  d7  < | j| _d	S |tu rt| d d| jd< | jd t| jd< | j	| j | j
| _d	S | jd  |7  < d	S )
Nra   r   rj   rk   r=   zeof-in-doctype-nameFr   T)r   rE   r   r#   rc   r	   after_doctype_name_stater"   r*   r2   r!   r5   r   r   r   r   r   r   r     s6   




z HTMLTokenizer.doctype_name_statec                 C   s>  | j  }|tv r	 dS |dkr| j| j | j| _dS |tu r>d| jd< | j 	| | 
d | j| j | j| _dS |rc|dv rcd}dD ]}| j  }|rU||vrYd} nqH|rb| j| _dS n$|r|dv rd}d	D ]}| j  }|rz||vr~d} nqm|r| j| _dS | j 	| | j
d
|d d| jd< | j| _dS )Nr   Fr   eof-in-doctyper   T)uUbBlLiIr   sS)r   r   r   r   mMz*expected-space-or-right-bracket-in-doctyper   )r   rE   r   r*   r2   r#   r!   r"   r   rJ   r5   "after_doctype_public_keyword_state"after_doctype_system_keyword_statebogus_doctype_state)r$   r)   r   r   r   r   r   r   "  sV   
(%




z&HTMLTokenizer.after_doctype_name_statec                 C      | j  }|tv r| j| _dS |dv r$| d | j | | j| _dS |tu r?| d d| jd< | j	
| j | j| _dS | j | | j| _dS N)r   r   unexpected-char-in-doctyper   Fr   T)r   rE   r   &before_doctype_public_identifier_stater"   r5   rJ   r   r#   r*   r2   r!   r   r   r   r   r   O  $   

	

z0HTMLTokenizer.after_doctype_public_keyword_statec                 C   s   | j  }|tv r	 dS |dkrd| jd< | j| _dS |dkr*d| jd< | j| _dS |dkrE| d d| jd< | j	| j | j
| _dS |tu r`| d	 d| jd< | j	| j | j
| _dS | d
 d| jd< | j| _dS )Nr   r:   r   r   r   unexpected-end-of-doctypeFr   r   r   T)r   rE   r   r#   -doctype_public_identifier_double_quoted_stater"   -doctype_public_identifier_single_quoted_stater5   r*   r2   r!   r   r   r   r   r   r   r   a  8   









z4HTMLTokenizer.before_doctype_public_identifier_statec                 C      | j  }|dkr| j| _dS |dkr#| d | jd  d7  < dS |dkr>| d d| jd	< | j| j | j| _dS |t	u rY| d
 d| jd	< | j| j | j| _dS | jd  |7  < dS )Nr   rj   rk   r   r=   r   r   Fr   r   T
r   rE   %after_doctype_public_identifier_stater"   r5   r#   r*   r2   r!   r   r   r   r   r   r   {  ,   





z;HTMLTokenizer.doctype_public_identifier_double_quoted_statec                 C   r  )Nr   rj   rk   r   r=   r   r   Fr   r   Tr  r   r   r   r   r     r  z;HTMLTokenizer.doctype_public_identifier_single_quoted_statec                 C   s   | j  }|tv r| j| _d
S |dkr | j| j | j| _d
S |dkr4| 	d d| jd< | j
| _d
S |dkrH| 	d d| jd< | j| _d
S |tu rc| 	d d| jd	< | j| j | j| _d
S | 	d d| jd	< | j| _d
S )Nr   r   r   r:   r   r   r   Fr   T)r   rE   r   3between_doctype_public_and_system_identifiers_stater"   r*   r2   r#   r!   r5   -doctype_system_identifier_double_quoted_state-doctype_system_identifier_single_quoted_stater   r   r   r   r   r   r    s8   









z3HTMLTokenizer.after_doctype_public_identifier_statec                 C   s   | j  }|tv r	 d
S |dkr| j| j | j| _d
S |dkr,d| jd< | j| _d
S |dkr;d| jd< | j	| _d
S |t
u rV| d d| jd< | j| j | j| _d
S | d	 d| jd< | j| _d
S )Nr   r   r:   r   r   r   Fr   r   T)r   rE   r   r*   r2   r#   r!   r"   r  r  r   r5   r   r   r   r   r   r    s4   







zAHTMLTokenizer.between_doctype_public_and_system_identifiers_statec                 C   r   r   )r   rE   r   &before_doctype_system_identifier_stater"   r5   rJ   r   r#   r*   r2   r!   r   r   r   r   r     r   z0HTMLTokenizer.after_doctype_system_keyword_statec                 C   s   | j  }|tv r	 d
S |dkrd| jd< | j| _d
S |dkr*d| jd< | j| _d
S |dkrE| d d| jd< | j	| j | j
| _d
S |tu r`| d	 d| jd< | j	| j | j
| _d
S | d d| jd< | j| _d
S )Nr   r:   r   r   r   r   Fr   r   T)r   rE   r   r#   r  r"   r  r5   r*   r2   r!   r   r   r   r   r   r   r	    r  z4HTMLTokenizer.before_doctype_system_identifier_statec                 C   r  )Nr   rj   rk   r   r=   r   r   Fr   r   T
r   rE   %after_doctype_system_identifier_stater"   r5   r#   r*   r2   r!   r   r   r   r   r   r    r  z;HTMLTokenizer.doctype_system_identifier_double_quoted_statec                 C   r  )Nr   rj   rk   r   r=   r   r   Fr   r   Tr
  r   r   r   r   r    r  z;HTMLTokenizer.doctype_system_identifier_single_quoted_statec                 C   s   | j  }|tv r	 dS |dkr| j| j | j| _dS |tu r8| 	d d| jd< | j| j | j| _dS | 	d | j
| _dS )Nr   r   Fr   r   T)r   rE   r   r*   r2   r#   r!   r"   r   r5   r   r   r   r   r   r  -  s"   
	


z3HTMLTokenizer.after_doctype_system_identifier_statec                 C   s`   | j  }|dkr| j| j | j| _dS |tu r-| j | | j| j | j| _dS 	 dS )Nr   T)	r   rE   r*   r2   r#   r!   r"   r   rJ   r   r   r   r   r   >  s   
z!HTMLTokenizer.bogus_doctype_statec                 C   s   g }	 | | jd | | jd | j }|tu rn!|dks%J |d dd  dkr:|d d d |d< n| | qd|}|d }d	kr`t|D ]}| d
 qR|	dd}|rg| 
| | j| _dS )NT]r   rT   z]]r:   rj   r   rk   r=   )r2   r   ro   rE   r   rG   countr   r5   r   r7   r!   r"   )r$   r)   char
null_countr   r   r   r   r   L  s,   



z!HTMLTokenizer.cdata_section_stater   )NF)O__name__
__module____qualname____doc__r&   r0   r5   r7   rP   r_   r`   ri   r!   rm   rw   rt   r{   r~   r   rn   r   r   ru   r   r   rz   r   r   r}   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r   r	  r  r  r  r   r   r   r   r   r   r   &   s    

5G#&&&

&1/-r   N)bisectr   collectionsr   html.entitiesr   r   	constantsr   r   r   r	   r
   r   r   r   r   inputstreamr   tuplesortedr   r   r   r   r   r   r   r   <module>   s    ,	