o
    Xh0hLb                     @   s  d dl Z d dlZd dlmZmZ d dlmZ d dlmZm	Z	 d dl
Z
ddlmZmZmZ edd eD Zed	d eD Zed
d e	D ZeeddgB ZedZh dZedZi Zdd ZG dd dZG dd deZG dd deZG dd dZG dd dZdd Z dS )    N)BytesIOStringIO)Path)ascii_lettersascii_uppercase   )EOFReparseErrorspace_charactersc                 c       | ]}|  V  qd S Nencode.0item r   K/var/www/html/rh/venv/lib/python3.10/site-packages/tinyhtml5/inputstream.py	<genexpr>       r   c                 c   r   r   r   r   r   r   r   r      r   c                 c   r   r   r   r   r   r   r   r      r      >   <u   [---﷐-﷯￾￿🿾🿿𯿾𯿿𿿾𿿿񏿾񏿿񟿾񟿿񯿾񯿿񿿾񿿿򏿾򏿿򟿾򟿿򯿾򯿿򿿾򿿿󏿾󏿿󟿾󟿿󯿾󯿿󿿾󿿿􏿾􏿿-]>            	 
               	 
       z[	- -/:-@\[-`{-~]c                 K   s   t | trt| dk rt|  rtt|  fi |S t | tr,t|  fi |S t t| dr7| dn| trCt| fi |S t	| fi |S )N   readr   )

isinstancestrlenr   is_fileHTMLUnicodeInputStream	read_texthasattrr9   HTMLBinaryInputStream)sourcekwargsr   r   r   HTMLInputStream&   s   "
rD   c                   @   sZ   e Zd ZdZdd Zdd Zdd Zdd	 Zd
d Zdd Z	dd Z
dddZdd ZdS )r>   zProvides a Unicode stream of characters to the HTMLTokenizer.

    This class takes care of character encoding and removing or replacing
    incorrect byte-sequences and also provides column and line tracking.

    c                 C   s.   dg| _ tddf| _| || _|   dS )a  Initialise the HTMLInputStream.

        Create a normalized stream from source for use by tinyhtml5.

        source can be either a file-object, local filename or a string.

        The optional encoding parameter must be a string that indicates
        the encoding.  If specified, that encoding will be used,
        regardless of any BOM or later declaration (such as in a meta
        element).

        r   utf-8certainN)	new_lineslookup_encodingencodingopen_streamstreamresetselfrB   r   r   r   __init__9   s   zHTMLUnicodeInputStream.__init__c                 C   s.   d| _ d| _d| _g | _d| _d| _d | _d S )N r   )chunk
chunk_sizechunk_offseterrorsprevious_number_linesprevious_number_columns_buffered_characterrN   r   r   r   rL   N   s   
zHTMLUnicodeInputStream.resetc                 C   s   t |dr|S t|S )zuProduce a file object from source.

        source can be either a file object, local filename or a string.

        r9   )r@   r   rM   r   r   r   rJ   \   s   z"HTMLUnicodeInputStream.open_streamc                 C   sZ   | j }|dd|}| j| }|dd|}|dkr#| j| }||fS ||d  }||fS )N
r   r   )rQ   countrU   rfindrV   )rN   offsetrQ   number_linesposition_linelast_line_positionposition_columnr   r   r   	_positiond   s   

z HTMLUnicodeInputStream._positionc                 C   s   |  | j\}}|d |fS )z9Return (line, col) of the current position in the stream.r   )rb   rS   )rN   linecolumnr   r   r   positiono   s   zHTMLUnicodeInputStream.positionc                 C   s6   | j | jkr|  stS | j }| j| }|d | _ |S )zlRead one character from the stream or queue if available.

        Return EOF when EOF is reached.

        r   )rS   rR   
read_chunkr   rQ   )rN   rS   	characterr   r   r   rg   t   s   

z HTMLUnicodeInputStream.characterc                 C   s   |  | j\| _| _d| _d| _d| _| jd}| jr%| j| }d | _n|s)dS t	|dkrPt
|d }|dksEd|  krCd	krPn n|d | _|d d }tt	t|D ]}| jd
 qY|dd}|dd}|| _t	|| _dS )NrP   r   i (  Fr   rZ      i   i  zinvalid-codepointz
rY   T)rb   rR   rU   rV   rQ   rS   rK   r9   rW   r<   ordrangeinvalid_unicode_refindallrT   appendreplace)rN   datalast_r   r   r   rf      s0   
 

z!HTMLUnicodeInputStream.read_chunkFc                 C   s   zt ||f }W n+ ty3   ddd |D }|s d| }td| d}| }t ||f< Y nw g }	 || j| j}|du rK| j| jkrJn*n|	 }|| jkrc|
| j| j|  || _n|
| j| jd  |  ssnq7d|S )	a   Return a string of characters from the stream.

        String goes up to but does not include any character in 'characters' or
        EOF. 'characters' must be a container that supports the 'in' method and
        iteration over its characters.

        rP   c                 S   s   g | ]
}d t |dqS )z\x02x)rj   )r   rg   r   r   r   
<listcomp>   s    z6HTMLUnicodeInputStream.chars_until.<locals>.<listcomp>^[z]+TN)characters_until_regexKeyErrorjoinrecompilematchrQ   rS   rR   endrn   rf   )rN   
charactersoppositeregexresultr|   r}   r   r   r   chars_until   s4   



z"HTMLUnicodeInputStream.chars_untilc                 C   sZ   |t ur)| jdkr|| j | _|  jd7  _d S |  jd8  _| j| j |ks+J d S d S )Nr   r   )r   rS   rQ   rR   )rN   charr   r   r   unget   s   

zHTMLUnicodeInputStream.ungetN)F)__name__
__module____qualname____doc__rO   rL   rJ   rb   re   rg   rf   r   r   r   r   r   r   r>   1   s    
%.r>   c                       sX   e Zd ZdZ			dddZ fddZdd	 Zd
d Zdd Zdd Z	dd Z
  ZS )rA   zProvide a binary stream of characters to the HTMLTokenizer.

    This class takes care of character encoding and removing or replacing
    incorrect byte-sequences and also provides column and line tracking.

    Nwindows-1252c                 C   sX   |  || _d| _|| _|| _|| _|| _|| _|  | _	| j	d d us&J | 
  d S )Ni   r   )rJ   
raw_streamnumber_bytes_metaoverride_encodingtransport_encodingsame_origin_parent_encodinglikely_encodingdefault_encodingdetermine_encodingrI   rL   )rN   rB   r   r   r   r   r   r   r   r   rO      s   
zHTMLBinaryInputStream.__init__c                    s*   | j d jj}|| jd| _t   d S )Nr   ro   )rI   
codec_infostreamreaderr   rK   superrL   )rN   r   	__class__r   r   rL   	  s   zHTMLBinaryInputStream.resetc                 C   s0   t |drt |dr| r|S | }t|S )Nr9   seekable)r@   r   r9   r   rM   r   r   r   rJ     s
   
z!HTMLBinaryInputStream.open_streamc                 C   s   |   df}|d d ur|S t| jdf}|d d ur|S t| jdf}|d d ur,|S |  df}|d d ur:|S t| jdf}|d d urQ|d jdsQ|S t| jdf}|d d ur`|S t| j	df}|d d uro|S tddfS )NrF   r   	tentativezutf-16r   )

detect_bomrH   r   r   detect_encoding_metar   name
startswithr   r   )rN   rI   r   r   r   r     s,   z(HTMLBinaryInputStream.determine_encodingc                 C   s   | j d dks	J t| }d u rd S |jdv r$td}|d us"J d S || j d kr5| j d df| _ d S | jd |df| _ |   td| j d  d| )Nr   rF   utf-16beutf-16lerE   r   zEncoding changed from z to )rI   rH   r   r   seekrL   r	   )rN   new_encodingr   r   r   change_encoding=  s   

z%HTMLBinaryInputStream.change_encodingc              
   C   s   t jdt jdt jdt jdt jdi}| jd}t|t	sJ dD ]}|
|d|  }r9| j| t|  S q | jd	 dS )
zAttempt to detect at BOM at the start of the stream.

        If an encoding can be determined from the BOM return the name of the
        encoding otherwise return None.

        rE   r   r   zutf-32lezutf-32be   )   r      Nr   )codecsBOM_UTF8BOM_UTF16_LEBOM_UTF16_BEBOM_UTF32_LEBOM_UTF32_BEr   r9   r:   bytesgetr   rH   )rN   bomsstringr   rI   r   r   r   r   M  s   	z HTMLBinaryInputStream.detect_bomc                 C   sV   | j | j}t|tsJ t|}| j d | }|dur)|jdv r)t	d}|S )z1Report the encoding declared by the meta element.r   Nr   rE   )
r   r9   r   r:   r   EncodingParserr   get_encodingr   rH   )rN   bufferparserrI   r   r   r   r   j  s   z*HTMLBinaryInputStream.detect_encoding_meta)NNNNr   )r   r   r   r   rO   rL   rJ   r   r   r   r   __classcell__r   r   r   r   rA      s    
(rA   c                   @   sz   e Zd ZdZdd Zdd Zdd Zdd	 Zd
d Zdd Z	e
e	eZe
dd ZefddZdd Zdd Zdd ZdS )EncodingByteszBytes-like object with an associated position and various extra methods.

    If the position is ever greater than the string length then an exception is
    raised.

    c                 C   s   t |tsJ t| | S r   )r:   r   __new__lower)clsvaluer   r   r   r     s   zEncodingBytes.__new__c                 C   s
   d| _ d S )NrZ   rb   )rN   r   r   r   r   rO     s   
zEncodingBytes.__init__c                 C   s0   | j d  }| _ |t| krt| ||d  S Nr   rb   r<   StopIterationrN   re   r   r   r   __next__  s   zEncodingBytes.__next__c                 C   s    | j d  | _ }| ||d  S r   r   r   r   r   r   previous  s   zEncodingBytes.previousc                 C   s"   | j t| kr	ttd|| _ d S Nr   )rb   r<   r   maxr   r   r   r   set_position  s   zEncodingBytes.set_positionc                 C   s&   | j t| kr	t| j dkr| j S d S r   r   rX   r   r   r   get_position  s
   
zEncodingBytes.get_positionc                 C   s   | | j | j d  S r   )re   rX   r   r   r   current_byte  s   zEncodingBytes.current_bytec                 C   sR   | j }|t| k r$| ||d  }||vr|| _|S |d7 }|t| k s	|| _dS )zSkip past a list of characters.r   Nre   r<   rb   rN   r~   re   rg   r   r   r   skip  s   zEncodingBytes.skipc                 C   sR   | j }|t| k r$| ||d  }||v r|| _|S |d7 }|t| k s	|| _d S r   r   r   r   r   r   
skip_until  s   zEncodingBytes.skip_untilc                 C   s(   |  || j }r|  jt|7  _|S )zLook for a sequence of bytes at the start of a string.

        If the bytes are found return True and advance the position to the byte
        after the match. Otherwise return False and leave the position alone.

        )r   re   r<   )rN   r   r   r   r   r   match_bytes  s   zEncodingBytes.match_bytesc                 C   s6   z|  || jt| d | _W dS  ty   tw )zLook for the next sequence of bytes matching a given sequence.

        If a match is found advance the position to the last byte of the match.

        r   T)indexre   r<   rb   
ValueErrorr   )rN   r   r   r   r   jump_to  s   zEncodingBytes.jump_toN)r   r   r   r   r   rO   r   r   r   r   propertyre   r   space_characters_bytesr   r   r   r   r   r   r   r   r   x  s    

r   c                   @   sX   e Zd ZdZdd Zdd Zdd Zdd	 Zd
d Zdd Z	dd Z
dd Zdd ZdS )r   z@Mini parser for detecting character encoding from meta elements.c                 C   s   t || _d | _d S r   )r   rp   rI   rN   rp   r   r   r   rO     s   

zEncodingParser.__init__c              
   C   s   d| j vrd S | j| j| j| j| j| jd}| j D ]A}d}z| j d W n ty1   Y  | j
S w | D ]\}}| j 	|rSz| }W  n tyR   d}Y  nw q6|sZ | j
S q| j
S )N   <meta)s   <!--r   s   </s   <!s   <?r   Tr   F)rp   handle_commenthandle_metahandle_possible_end_taghandle_otherhandle_possible_start_tagr   r   itemsr   rI   )rN   method_dispatchrr   keep_parsingkeymethodr   r   r   r     s@   

zEncodingParser.get_encodingc                 C      | j dS )zSkip over comments.s   -->rp   r   rX   r   r   r   r     s   zEncodingParser.handle_commentc                 C   s   | j jtvrdS d}d }	 |   }d u rdS |d dkr/|d dk}|r.|d ur.|| _dS n?|d dkrG|d }t|}|d urF|| _dS n'|d dkrntt|d }|  }d urnt|}|d urn|rl|| _dS |}q)	NTFr   s
   http-equivr   s   content-type   charsets   content)	rp   r   r   get_attributerI   rH   ContentAttributeParserr   parse)rN   
has_pragmapending_encoding	attributetentative_encodingcodeccontent_parserr   r   r   r     s<   zEncodingParser.handle_metac                 C   s   | j ddS )NFend_tag)handle_possible_tagrX   r   r   r   r        z(EncodingParser.handle_possible_start_tagc                 C   s   t | j | jddS )NTr   )nextrp   r   rX   r   r   r   r     s   
z&EncodingParser.handle_possible_end_tagc                 C   s\   | j }|jtvr|r|  |   dS |t}|dkr#|  dS 	 |  d u r-	 dS q$)NTr   )rp   r   ascii_letters_bytesr   r   r   spaces_angle_bracketsr   )rN   r   rp   rg   r   r   r   r   #  s   

z"EncodingParser.handle_possible_tagc                 C   r   )Nr   r   rX   r   r   r   r   :  r   zEncodingParser.handle_otherc                 C   s  | j }|ttdgB }|du st|dksJ |dv rdS g }g }	 |dkr+|r+n0|tv r4| }n'|dv r?d|dfS |tv rK||  n|du rQdS || t	|}q$|dkrj|
  d|dfS t	| | }| }d	v r	 t	|}||krt	| d|d|fS |tv r||  n|| qy|d
krd|dfS |tv r||  n|du rdS || 	 t	|}|tv rd|d|fS |tv r||  n|du rdS || q)z{Return a (name, value) pair for the next attribute in the stream.

        If no attribute is found, return None.

           /Nr   )r   NT   =)r   r       )   '   "r   )rp   r   r   	frozensetr<   ry   ascii_uppercase_bytesrn   r   r   r   r   )rN   rp   rg   attribute_nameattribute_valuequoter   r   r   r   =  sl   



zEncodingParser.get_attributeN)r   r   r   r   rO   r   r   r   r   r   r   r   r   r   r   r   r   r     s    !r   c                   @   s   e Zd Zdd Zdd ZdS )r   c                 C   s   t |tsJ || _d S r   )r:   r   rp   r   r   r   r   rO     s   
zContentAttributeParser.__init__c                 C   s  zy| j d | j  jd7  _| j   | j jdksW d S | j  jd7  _| j   | j jdv rS| j j}| j  jd7  _| j j}| j |rP| j || j j W S W d S | j j}z| j t | j || j j W W S  tyy   | j |d   Y W S w  ty   Y d S w )Nr   r   r   )r   r   )rp   r   re   r   r   r   r   r   )rN   r   old_positionr   r   r   r     s2   

zContentAttributeParser.parseN)r   r   r   rO   r   r   r   r   r   r     s    r   c                 C   s\   t | trz| d} W n
 ty   Y dS w | dur,zt| W S  ty+   Y dS w dS )zReturn the Python codec name corresponding to an encoding.

    Return None if the string doesn't correspond to a valid encoding.

    asciiN)r:   r   decodeUnicodeDecodeErrorwebencodingslookupAttributeError)rI   r   r   r   rH     s   
rH   )!r   rz   ior   r   pathlibr   r   r   r   r   	constantsr   r	   r
   r   r   r   r   r   r{   rl   non_bmp_invalid_codepointsascii_punctuation_rerw   rD   r>   rA   r   r   r   r   rH   r   r   r   r   <module>   s:     8 Y <&