3
(h~              )   @   s  d dl mZmZmZ d dlmZmZ d dlmZm	Z	 d dl
Z
d dlZd dlZddlmZmZmZmZ ddlmZ ddlmZ d d	lmZ yd d
lmZ W n ek
r   eZY nX edd eD Zedd eD Zedd eD ZeeddgB ZdZejrFedE dkr"ej ddks&t!ej"eddF e#d d Z$n
ej"eZ$e%ddddddddddddd d!d"d#d$d%d&d'd(d)d*d+d,d-d.d/d0d1d2d3g Z&ej"d4Z'i Z(G d5d6 d6e)Z*d7d8 Z+G d9d: d:e)Z,G d;d< d<e,Z-G d=d> d>e.Z/G d?d@ d@e)Z0G dAdB dBe)Z1dCdD Z2dS )G    )absolute_importdivisionunicode_literals)	text_typebinary_type)http_clienturllibN   )EOFspaceCharactersasciiLettersasciiUppercase)ReparseException)_utils)StringIO)BytesIOc             C   s   g | ]}|j d qS )ascii)encode).0item r   ;/tmp/pip-install-q3hcpn_q/html5lib/html5lib/_inputstream.py
<listcomp>   s    r   c             C   s   g | ]}|j d qS )r   )r   )r   r   r   r   r   r      s    c             C   s   g | ]}|j d qS )r   )r   )r   r   r   r   r   r      s       >   <u   [---﷐-﷯￾￿🿾🿿𯿾𯿿𿿾𿿿񏿾񏿿񟿾񟿿񯿾񯿿񿿾񿿿򏿾򏿿򟿾򟿿򯿾򯿿򿿾򿿿󏿾󏿿󟿾󟿿󯿾󯿿󿿾󿿿􏿾􏿿]]z"\uD800-\uDFFF"i i i i i i i i i i i i i i i i i	 i	 i
 i
 i i i i i i i i i i i i z[	- -/:-@[-`{-~]c               @   sH   e Zd ZdZdd Zdd Zdd Zdd	 Zd
d Zdd Z	dd Z
dS )BufferedStreamzBuffering for streams that do not have buffering of their own

    The buffer is implemented as a list of chunks on the assumption that
    joining many strings will be slow since it is O(n**2)
    c             C   s   || _ g | _ddg| _d S )Nr	   r   )streambufferposition)selfr   r   r   r   __init__@   s    zBufferedStream.__init__c             C   s@   d}x(| j d | jd  D ]}|t|7 }qW || jd 7 }|S )Nr   r	   )r   r    len)r!   poschunkr   r   r   tellE   s
    zBufferedStream.tellc             C   sX   || j  kst|}d}x0t| j| |k rH|t| j| 8 }|d7 }qW ||g| _d S )Nr   r	   )_bufferedBytesAssertionErrorr#   r   r    )r!   r$   offsetir   r   r   seekL   s    zBufferedStream.seekc             C   sT   | j s| j|S | jd t| j krF| jd t| j d krF| j|S | j|S d S )Nr   r	   r   )r   _readStreamr    r#   _readFromBuffer)r!   bytesr   r   r   readU   s    

zBufferedStream.readc             C   s   t dd | jD S )Nc             S   s   g | ]}t |qS r   )r#   )r   r   r   r   r   r   _   s    z1BufferedStream._bufferedBytes.<locals>.<listcomp>)sumr   )r!   r   r   r   r'   ^   s    zBufferedStream._bufferedBytesc             C   s<   | j j|}| jj| | jd  d7  < t|| jd< |S )Nr   r	   )r   r/   r   appendr    r#   )r!   r.   datar   r   r   r,   a   s
    zBufferedStream._readStreamc             C   s   |}g }| j d }| j d }x|t| jk r|dkr|dks@t| j| }|t|| krn|}||| g| _ n"t|| }|t|g| _ |d7 }|j||||   ||8 }d}qW |r|j| j| dj|S )Nr   r	       )r    r#   r   r(   r1   r,   join)r!   r.   remainingBytesrvbufferIndexbufferOffsetbufferedDatabytesToReadr   r   r   r-   h   s&    


zBufferedStream._readFromBufferN)__name__
__module____qualname____doc__r"   r&   r+   r/   r'   r,   r-   r   r   r   r   r   9   s   		r   c             K   s   t | tjs(t | tjjr.t | jtjr.d}n&t| drJt | jdt	}n
t | t	}|rdd |D }|rvt
d| t| f|S t| f|S d S )NFr/   r   c             S   s   g | ]}|j d r|qS )	_encoding)endswith)r   xr   r   r   r      s    z#HTMLInputStream.<locals>.<listcomp>z3Cannot set an encoding with a unicode input, set %r)
isinstancer   HTTPResponser   responseaddbasefphasattrr/   r   	TypeErrorHTMLUnicodeInputStreamHTMLBinaryInputStream)sourcekwargs	isUnicode	encodingsr   r   r   HTMLInputStream   s    

rO   c               @   sp   e Zd ZdZdZdd Zdd Zdd Zd	d
 Zdd Z	dd Z
dddZdd Zdd ZdddZdd ZdS )rI   zProvides a unicode stream of characters to the HTMLTokenizer.

    This class takes care of character encoding and removing or replacing
    incorrect byte-sequences and also provides column and line tracking.

    i (  c             C   sZ   t jsd| _ntddkr$| j| _n| j| _dg| _tddf| _| j	|| _
| j  dS )a  Initialises the HTMLInputStream.

        HTMLInputStream(source, [encoding]) -> Normalized stream from source
        for use by html5lib.

        source can be either a file-object, local filename or a string.

        The optional encoding parameter must be a string that indicates
        the encoding.  If specified, that encoding will be used,
        regardless of any BOM or later declaration (such as in a meta
        element)

        Nu   􏿿r	   r   zutf-8certain)r   supports_lone_surrogatesreportCharacterErrorsr#   characterErrorsUCS4characterErrorsUCS2newLineslookupEncodingcharEncoding
openStream
dataStreamreset)r!   rK   r   r   r   r"      s    
zHTMLUnicodeInputStream.__init__c             C   s.   d| _ d| _d| _g | _d| _d| _d | _d S )N r   )r%   	chunkSizechunkOffseterrorsprevNumLinesprevNumCols_bufferedCharacter)r!   r   r   r   rZ      s    zHTMLUnicodeInputStream.resetc             C   s   t |dr|}nt|}|S )zvProduces a file object from source.

        source can be either a file object, local filename or a string.

        r/   )rG   r   )r!   rK   r   r   r   r   rX      s    
z!HTMLUnicodeInputStream.openStreamc             C   sT   | j }|jdd|}| j| }|jdd|}|dkr@| j| }n||d  }||fS )N
r   r	   r   )r%   countr_   rfindr`   )r!   r)   r%   nLinespositionLinelastLinePospositionColumnr   r   r   	_position   s    
z HTMLUnicodeInputStream._positionc             C   s   | j | j\}}|d |fS )z:Returns (line, col) of the current position in the stream.r	   )ri   r]   )r!   linecolr   r   r   r       s    zHTMLUnicodeInputStream.positionc             C   s6   | j | jkr| j stS | j }| j| }|d | _ |S )zo Read one character from the stream or queue if available. Return
            EOF when EOF is reached.
        r	   )r]   r\   	readChunkr
   r%   )r!   r]   charr   r   r   rm      s    

zHTMLUnicodeInputStream.charNc             C   s   |d kr| j }| j| j\| _| _d| _d| _d| _| jj|}| j	rX| j	| }d | _	n|s`dS t
|dkrt|d }|dksd|  kodkn  r|d | _	|d d }| jr| j| |jdd	}|jd
d	}|| _t
|| _dS )Nr[   r   Fr	      i   i  z
rb   Tr   r   r   )_defaultChunkSizeri   r\   r_   r`   r%   r]   rY   r/   ra   r#   ordrR   replace)r!   r\   r2   lastvr   r   r   rl      s0    
 


z HTMLUnicodeInputStream.readChunkc             C   s,   x&t ttj|D ]}| jjd qW d S )Nzinvalid-codepoint)ranger#   invalid_unicode_refindallr^   r1   )r!   r2   _r   r   r   rS   %  s    z*HTMLUnicodeInputStream.characterErrorsUCS4c             C   s   d}xt j|D ]}|rqt|j }|j }tj|||d  rttj|||d  }|tkrn| j	j
d d}q|dkr|dkr|t|d kr| j	j
d qd}| j	j
d qW d S )NF   zinvalid-codepointTi   i  r	   )ru   finditerrq   groupstartr   isSurrogatePairsurrogatePairToCodepointnon_bmp_invalid_codepointsr^   r1   r#   )r!   r2   skipmatch	codepointr$   char_valr   r   r   rT   )  s     z*HTMLUnicodeInputStream.characterErrorsUCS2Fc       
      C   s  yt ||f }W nl tk
r|   x|D ]}t|dk s&tq&W djdd |D }|s^d| }tjd|  }t ||f< Y nX g }x||j| j| j	}|dkr| j	| j
krP n0|j }|| j
kr|j| j| j	|  || _	P |j| j| j	d  | j sP qW dj|}	|	S )z Returns a string of characters from the stream up to but not
        including any character in 'characters' or EOF. 'characters' must be
        a container that supports the 'in' method and iteration over its
        characters.
           r[   c             S   s   g | ]}d t | qS )z\x%02x)rq   )r   cr   r   r   r   N  s    z5HTMLUnicodeInputStream.charsUntil.<locals>.<listcomp>z^%sz[%s]+N)charsUntilRegExKeyErrorrq   r(   r4   recompiler   r%   r]   r\   endr1   rl   )
r!   
charactersoppositecharsr   regexr6   mr   rr   r   r   
charsUntil@  s2    
 

z!HTMLUnicodeInputStream.charsUntilc             C   sT   |d k	rP| j dkr.|| j | _|  jd7  _n"|  j d8  _ | j| j  |ksPtd S )Nr   r	   )r]   r%   r\   r(   )r!   rm   r   r   r   ungeto  s    
zHTMLUnicodeInputStream.unget)N)F)r;   r<   r=   r>   rp   r"   rZ   rX   ri   r    rm   rl   rS   rT   r   r   r   r   r   r   rI      s    
&
/rI   c               @   sL   e Zd ZdZdddZdd Zd	d
 ZdddZdd Zdd Z	dd Z
dS )rJ   zProvides a unicode stream of characters to the HTMLTokenizer.

    This class takes care of character encoding and removing or replacing
    incorrect byte-sequences and also provides column and line tracking.

    Nwindows-1252Tc             C   sn   | j || _tj| | j d| _d| _|| _|| _|| _|| _	|| _
| j|| _| jd dk	sbt| j  dS )a  Initialises the HTMLInputStream.

        HTMLInputStream(source, [encoding]) -> Normalized stream from source
        for use by html5lib.

        source can be either a file-object, local filename or a string.

        The optional encoding parameter must be a string that indicates
        the encoding.  If specified, that encoding will be used,
        regardless of any BOM or later declaration (such as in a meta
        element)

        i   d   r   N)rX   	rawStreamrI   r"   numBytesMetanumBytesChardetoverride_encodingtransport_encodingsame_origin_parent_encodinglikely_encodingdefault_encodingdetermineEncodingrW   r(   rZ   )r!   rK   r   r   r   r   r   
useChardetr   r   r   r"     s    zHTMLBinaryInputStream.__init__c             C   s&   | j d jj| jd| _tj|  d S )Nr   rr   )rW   
codec_infostreamreaderr   rY   rI   rZ   )r!   r   r   r   rZ     s    zHTMLBinaryInputStream.resetc          	   C   sD   t |dr|}nt|}y|j|j  W n   t|}Y nX |S )zvProduces a file object from source.

        source can be either a file object, local filename or a string.

        r/   )rG   r   r+   r&   r   )r!   rK   r   r   r   r   rX     s    
z HTMLBinaryInputStream.openStreamc             C   s  | j  df}|d d k	r|S t| jdf}|d d k	r:|S t| jdf}|d d k	rX|S | j df}|d d k	rt|S t| jdf}|d d k	r|d jjd r|S t| jdf}|d d k	r|S |rtyddl	m
} W n tk
r   Y nX g }| }xF|js>| jj| j}t|ts t|s(P |j| |j| qW |j  t|jd }| jjd |d k	rt|dfS t| jdf}|d d k	r|S tddfS )NrP   r   	tentativezutf-16)UniversalDetectorencodingzwindows-1252)	detectBOMrV   r   r   detectEncodingMetar   name
startswithr   Zchardet.universaldetectorr   ImportErrordoner   r/   r   rB   r.   r(   r1   feedcloseresultr+   r   )r!   chardetrW   r   buffersdetectorr   r   r   r   r   r     sR    


z'HTMLBinaryInputStream.determineEncodingc             C   s   | j d dkstt|}|d kr&d S |jdkrFtd}|d k	stnT|| j d krf| j d df| _ n4| jjd |df| _ | j  td| j d |f d S )	Nr	   rP   utf-16beutf-16lezutf-8r   zEncoding changed from %s to %s)r   r   )rW   r(   rV   r   r   r+   rZ   r   )r!   newEncodingr   r   r   changeEncoding  s    

z$HTMLBinaryInputStream.changeEncodingc          
   C   s   t jdt jdt jdt jdt jdi}| jjd}t|t	s<t
|j|dd }d}|s~|j|}d}|s~|j|dd	 }d	}|r| jj| t|S | jjd
 dS dS )zAttempts to detect at BOM at the start of the stream. If
        an encoding can be determined from the BOM return the name of the
        encoding otherwise return Nonezutf-8zutf-16lezutf-16bezutf-32lezutf-32be   N   rx   r   )codecsBOM_UTF8BOM_UTF16_LEBOM_UTF16_BEBOM_UTF32_LEBOM_UTF32_BEr   r/   rB   r.   r(   getr+   rV   )r!   bomDictstringr   r+   r   r   r   r     s$    
zHTMLBinaryInputStream.detectBOMc             C   sV   | j j| j}t|tstt|}| j jd |j }|dk	rR|j	dkrRt
d}|S )z9Report the encoding declared by the meta element
        r   Nutf-16beutf-16lezutf-8)r   r   )r   r/   r   rB   r.   r(   EncodingParserr+   getEncodingr   rV   )r!   r   parserr   r   r   r   r   9  s    z(HTMLBinaryInputStream.detectEncodingMeta)NNNNr   T)T)r;   r<   r=   r>   r"   rZ   rX   r   r   r   r   r   r   r   r   rJ     s     
(
>"rJ   c               @   s   e Zd ZdZdd Zdd Zdd Zdd	 Zd
d Zdd Z	dd Z
dd Zeee
Zdd ZeeZefddZdd Zdd Zdd ZdS )EncodingByteszString-like object with an associated position and various extra methods
    If the position is ever greater than the string length then an exception is
    raisedc             C   s   t |tsttj| |j S )N)rB   r.   r(   __new__lower)r!   valuer   r   r   r   L  s    zEncodingBytes.__new__c             C   s
   d| _ d S )Nr	   r   )ri   )r!   r   r   r   r   r"   P  s    zEncodingBytes.__init__c             C   s   | S )Nr   )r!   r   r   r   __iter__T  s    zEncodingBytes.__iter__c             C   s>   | j d  }| _ |t| kr"tn|dk r.t| ||d  S )Nr	   r   )ri   r#   StopIterationrH   )r!   pr   r   r   __next__W  s    zEncodingBytes.__next__c             C   s   | j  S )N)r   )r!   r   r   r   next_  s    zEncodingBytes.nextc             C   sB   | j }|t| krtn|dk r$t|d  | _ }| ||d  S )Nr   r	   )ri   r#   r   rH   )r!   r   r   r   r   previousc  s    zEncodingBytes.previousc             C   s   | j t| krt|| _ d S )N)ri   r#   r   )r!   r    r   r   r   setPositionl  s    zEncodingBytes.setPositionc             C   s*   | j t| krt| j dkr"| j S d S d S )Nr   )ri   r#   r   )r!   r   r   r   getPositionq  s
    
zEncodingBytes.getPositionc             C   s   | | j | j d  S )Nr	   )r    )r!   r   r   r   getCurrentByte{  s    zEncodingBytes.getCurrentBytec             C   sL   | j }x:|t| k r@| ||d  }||kr6|| _|S |d7 }qW || _dS )zSkip past a list of charactersr	   N)r    r#   ri   )r!   r   r   r   r   r   r   r     s    zEncodingBytes.skipc             C   sL   | j }x:|t| k r@| ||d  }||kr6|| _|S |d7 }qW || _d S )Nr	   )r    r#   ri   )r!   r   r   r   r   r   r   	skipUntil  s    zEncodingBytes.skipUntilc             C   s>   | j }| ||t|  }|j|}|r:|  j t|7  _ |S )zLook for a sequence of bytes at the start of a string. If the bytes
        are found return True and advance the position to the byte after the
        match. Otherwise return False and leave the position alone)r    r#   r   )r!   r.   r   r2   r6   r   r   r   
matchBytes  s    
zEncodingBytes.matchBytesc             C   sR   | | j d j|}|dkrJ| jdkr,d| _|  j|t| d 7  _dS tdS )zLook for the next sequence of bytes matching a given sequence. If
        a match is found advance the position to the last byte of the matchNr	   r   Tr   r   )r    findri   r#   r   )r!   r.   newPositionr   r   r   jumpTo  s    
zEncodingBytes.jumpToN)r;   r<   r=   r>   r   r"   r   r   r   r   r   r   propertyr    r   currentBytespaceCharactersBytesr   r   r   r   r   r   r   r   r   H  s    	
r   c               @   sX   e Zd ZdZdd Zdd Zdd Zdd	 Zd
d Zdd Z	dd Z
dd Zdd ZdS )r   z?Mini parser for detecting character encoding from meta elementsc             C   s   t || _d| _dS )z3string - the data to work on for encoding detectionN)r   r2   r   )r!   r2   r   r   r   r"     s    
zEncodingParser.__init__c             C   s   d| j fd| jfd| jfd| jfd| jfd| jff}x^| jD ]T}d}xD|D ]<\}}| jj|rJy| }P W qJ tk
r   d}P Y qJX qJW |s<P q<W | jS )	Ns   <!--s   <metas   </s   <!s   <?r   TF)	handleComment
handleMetahandlePossibleEndTaghandleOtherhandlePossibleStartTagr2   r   r   r   )r!   methodDispatchrw   keepParsingkeymethodr   r   r   r     s&    zEncodingParser.getEncodingc             C   s   | j jdS )zSkip over commentss   -->)r2   r   )r!   r   r   r   r     s    zEncodingParser.handleCommentc             C   s   | j jtkrdS d}d }x| j }|d kr.dS |d dkr^|d dk}|r|d k	r|| _dS q|d dkr|d }t|}|d k	r|| _dS q|d dkrtt|d }|j }|d k	rt|}|d k	r|r|| _dS |}qW d S )	NTFr   s
   http-equivr	   s   content-types   charsets   content)	r2   r   r   getAttributer   rV   ContentAttrParserr   parse)r!   	hasPragmapendingEncodingattrtentativeEncodingcodeccontentParserr   r   r   r     s:    zEncodingParser.handleMetac             C   s
   | j dS )NF)handlePossibleTag)r!   r   r   r   r     s    z%EncodingParser.handlePossibleStartTagc             C   s   t | j | jdS )NT)r   r2   r   )r!   r   r   r   r     s    
z#EncodingParser.handlePossibleEndTagc             C   sf   | j }|jtkr(|r$|j  | j  dS |jt}|dkrD|j  n| j }x|d k	r`| j }qNW dS )NTr   )r2   r   asciiLettersBytesr   r   r   spacesAngleBracketsr   )r!   endTagr2   r   r   r   r   r   r     s    



z EncodingParser.handlePossibleTagc             C   s   | j jdS )Nr   )r2   r   )r!   r   r   r   r     s    zEncodingParser.handleOtherc             C   s  | j }|jttdgB }|dks2t|dks2t|d	kr>dS g }g }xt|dkrX|rXP nX|tkrl|j }P nD|d
krdj|dfS |tkr|j|j	  n|dkrdS |j| t
|}qHW |dkr|j  dj|dfS t
| |j }|dkrT|}xt
|}||kr(t
| dj|dj|fS |tkrB|j|j	  n
|j|  qW nJ|dkrldj|dfS |tkr|j|j	  n|dkrdS |j| x^t
|}|tkrdj|dj|fS |tkr|j|j	  n|dkrdS |j| qW dS )z_Return a name,value pair for the next attribute in the stream,
        if one is found, or None   /Nr	   r      =r3      '   ")r   N)r   r   )r   r   )r2   r   r   	frozensetr#   r(   r4   asciiUppercaseBytesr1   r   r   r   r   )r!   r2   r   attrName	attrValue	quoteCharr   r   r   r     sh    










zEncodingParser.getAttributeN)r;   r<   r=   r>   r"   r   r   r   r   r   r   r   r   r   r   r   r   r     s   $r   c               @   s   e Zd Zdd Zdd ZdS )r   c             C   s   t |tst|| _d S )N)rB   r.   r(   r2   )r!   r2   r   r   r   r"   f  s    zContentAttrParser.__init__c             C   s  y| j jd | j  jd7  _| j j  | j jdks8d S | j  jd7  _| j j  | j jdkr| j j}| j  jd7  _| j j}| j j|r| j || j j S d S nF| j j}y| j jt | j || j j S  tk
r   | j |d  S X W n tk
r    d S X d S )Ns   charsetr	   r   r   r   )r   r   )r2   r   r    r   r   r   r   r   )r!   	quoteMarkoldPositionr   r   r   r   j  s.    

zContentAttrParser.parseN)r;   r<   r=   r"   r   r   r   r   r   r   e  s   r   c             C   s`   t | tr.y| jd} W n tk
r,   dS X | dk	rXy
tj| S  tk
rT   dS X ndS dS )z{Return the python codec name corresponding to an encoding or None if the
    string doesn't correspond to a valid encoding.r   N)rB   r   decodeUnicodeDecodeErrorwebencodingslookupAttributeError)r   r   r   r   rV     s    

rV   r   r   )3
__future__r   r   r   sixr   r   Z	six.movesr   r   r   r   r   	constantsr
   r   r   r   r   r[   r   ior   r   r   r   r   r   r   r   invalid_unicode_no_surrogaterQ   rc   r(   r   evalru   setr~   ascii_punctuation_rer   objectr   rO   rI   rJ   r.   r   r   r   rV   r   r   r   r   <module>   sX   
"








J g Ih 6'