3
(h!              $   @   sp  d Z ddlZddlZddlmZ ddlmZ ddlm	Z	 ddl
mZmZmZ yeZW n ek
rp   eefZY nX yddlmZ W n  ek
r   ddlmZ Y nX yddlmZ W n  ek
r   ddlmZ Y nX G d	d
 d
eZyddlmZ W n ek
r   Y nX G dd deZe Zdd ZdddZdddZdddZd ddZ d!ddZ!dd Z"e Z#dS )"z?
An interface to html5lib that mimics the lxml.html interface.
    N)
HTMLParser)TreeBuilder)etree)ElementXHTML_NAMESPACE_contains_block_level_tag)urlopen)urlparsec               @   s   e Zd ZdZdddZdS )r   z*An html5lib HTML parser with lxml as tree.Fc             K   s   t j| f|td| d S )N)stricttree)_HTMLParser__init__r   )selfr
   kwargs r   7/tmp/pip-install-q3hcpn_q/lxml/lxml/html/html5parser.pyr      s    zHTMLParser.__init__N)F)__name__
__module____qualname____doc__r   r   r   r   r   r      s   r   )XHTMLParserc               @   s   e Zd ZdZdddZdS )r   z+An html5lib XHTML Parser with lxml as tree.Fc             K   s   t j| f|td| d S )N)r
   r   )_XHTMLParserr   r   )r   r
   r   r   r   r   r   *   s    zXHTMLParser.__init__N)F)r   r   r   r   r   r   r   r   r   r   '   s   r   c             C   s(   | j |}|d k	r|S | j dt|f S )Nz{%s}%s)findr   )r   tagelemr   r   r   	_find_tag0   s    
r   c             C   sZ   t | tstd|dkrt}i }|dkr8t | tr8d}|dk	rH||d< |j| f|j S )z
    Parse a whole document into a string.

    If `guess_charset` is true, or if the input is not Unicode but a
    byte string, the `chardet` library will perform charset guessing
    on the string.
    zstring requiredNT
useChardet)
isinstance_strings	TypeErrorhtml_parserbytesparsegetroot)htmlguess_charsetparseroptionsr   r   r   document_fromstring7   s    
r(   Fc             C   s   t | tstd|dkrt}i }|dkr8t | tr8d}|dk	rH||d< |j| df|}|rt |d tr|r|d j rtjd|d  |d= |S )a`  Parses several HTML elements, returning a list of elements.

    The first item in the list may be a string.  If no_leading_text is true,
    then it will be an error if there is leading text, and it will always be
    a list of only elements.

    If `guess_charset` is true, the `chardet` library will perform charset
    guessing on the string.
    zstring requiredNFr   divr   zThere is leading text: %r)	r   r   r   r    r!   parseFragmentstripr   ParserError)r$   no_leading_textr%   r&   r'   childrenr   r   r   fragments_fromstringO   s"    
r/   c             C   s   t | tstdt|}t| ||| d}|rvt |ts>d}t|}|rrt |d trh|d |_|d= |j| |S |stj	dt
|dkrtj	d|d }|jr|jj rtj	d|j d	|_|S )
a  Parses a single HTML element; it is an error if there is more than
    one element, or if anything but whitespace precedes or follows the
    element.

    If 'create_parent' is true (or is a tag name) then a parent node
    will be created to encapsulate the HTML in a single element.  In
    this case, leading or trailing text is allowed.

    If `guess_charset` is true, the `chardet` library will perform charset
    guessing on the string.
    zstring required)r%   r&   r-   r)   r   zNo elements found   zMultiple elements foundzElement followed by text: %rN)r   r   r   boolr/   r   textextendr   r,   lentailr+   )r$   Zcreate_parentr%   r&   Zaccept_leading_textelementsnew_rootresultr   r   r   fragment_fromstringq   s2    






r9   c             C   s   t | tstdt| ||d}| dd }t |trB|jdd}|j j }|jdsb|jdrf|S t	|d	}t
|r||S t	|d
}t
|dkr|j s|jj  r|d j s|d jj  r|d S t|rd|_nd|_|S )a  Parse the html, returning a single element/document.

    This tries to minimally parse the chunk of text, without knowing if it
    is a fragment or a document.

    'base_url' will set the document's base_url attribute (and the tree's
    docinfo.URL)

    If `guess_charset` is true, or if the input is not Unicode but a
    byte string, the `chardet` library will perform charset guessing
    on the string.
    zstring required)r&   r%   N2   asciireplacez<htmlz	<!doctypeheadbodyr0   r   r)   spanr@   )r   r   r   r(   r!   decodelstriplower
startswithr   r4   r2   r+   r5   r   r   )r$   r%   r&   docstartr=   r>   r   r   r   
fromstring   s*    



 rG   c             C   sz   |dkrt }t| ts(| }|dkr\d}n4t| rFt| }|dkr\d}nt| d}|dkr\d}i }|rl||d< |j|f|S )a*  Parse a filename, URL, or file-like object into an HTML document
    tree.  Note: this returns a tree, not an element.  Use
    ``parse(...).getroot()`` to get the document root.

    If ``guess_charset`` is true, the ``useChardet`` option is passed into
    html5lib to enable character detection.  This option is on by default
    when parsing from URLs, off by default when parsing from file(-like)
    objects (which tend to return Unicode more often than not), and on by
    default when parsing from a file path (which is read in binary mode).
    NFTrbr   )r    r   r   _looks_like_urlr   openr"   )Zfilename_url_or_filer%   r&   fpr'   r   r   r   r"      s"    

r"   c             C   s@   t | d }|sdS tjdkr8|tjkr8t|dkr8dS dS d S )Nr   Fwin32r0   T)r	   sysplatformstringascii_lettersr4   )strschemer   r   r   rI      s    

rI   )NN)FNN)FNN)NN)NN)$r   rM   rO   html5libr   r   Z html5lib.treebuilders.etree_lxmlr   lxmlr   Z	lxml.htmlr   r   r   
basestringr   	NameErrorr!   rQ   urllib2r   ImportErrorurllib.requestr	   urllib.parser   r   Zxhtml_parserr   r(   r/   r9   rG   r"   rI   r    r   r   r   r   <module>   sF   
 
! 
+
6
$