385 lines
13 KiB
Python
385 lines
13 KiB
Python
|
|
from functools import cached_property
|
|||
|
|
from warnings import warn
|
|||
|
|
|
|||
|
|
from webencodings import ascii_lower
|
|||
|
|
|
|||
|
|
from .compiler import compile_selector_list, split_whitespace
|
|||
|
|
|
|||
|
|
|
|||
|
|
class ElementWrapper:
    """Wrapper of :class:`xml.etree.ElementTree.Element` for Selector matching.

    This class should not be instantiated directly. :meth:`from_xml_root` or
    :meth:`from_html_root` should be used for the root element of a document,
    and other elements should be accessed (and wrappers generated) using
    methods such as :meth:`iter_children` and :meth:`iter_subtree`.

    :class:`ElementWrapper` objects compare equal if their underlying
    :class:`xml.etree.ElementTree.Element` do.

    """
    @classmethod
    def from_xml_root(cls, root, content_language=None):
        """Wrap for selector matching the root of an XML or XHTML document.

        :param root:
            An ElementTree :class:`xml.etree.ElementTree.Element`
            for the root element of a document.
            If the given element is not the root,
            selector matching will behave as if it were.
            In other words, selectors will not be `scoped`_
            to the subtree rooted at that element.
        :param content_language:
            Optional fallback used by :attr:`lang` for the root element when
            the document itself declares no language (presumably the value
            of a ``Content-Language`` transport header).
        :returns:
            A new :class:`ElementWrapper`

        .. _scoped: https://drafts.csswg.org/selectors-4/#scoping

        """
        return cls._from_root(root, content_language, in_html_document=False)

    @classmethod
    def from_html_root(cls, root, content_language=None):
        """Same as :meth:`from_xml_root` with case-insensitive attribute names.

        Useful for documents parsed with an HTML parser like html5lib, which
        should be the case of documents with the ``text/html`` MIME type.

        """
        return cls._from_root(root, content_language, in_html_document=True)

    @classmethod
    def _from_root(cls, root, content_language, in_html_document=True):
        """Build the root wrapper shared by the two public constructors.

        ``root`` may be an element or any object exposing ``getroot()``
        (such as an ``ElementTree``), in which case its root is unwrapped.

        """
        if hasattr(root, 'getroot'):
            root = root.getroot()
        return cls(
            root, parent=None, index=0, previous=None,
            in_html_document=in_html_document,
            content_language=content_language)

    def __init__(self, etree_element, parent, index, previous,
                 in_html_document, content_language=None):
        #: The underlying ElementTree :class:`xml.etree.ElementTree.Element`
        self.etree_element = etree_element
        #: The parent :class:`ElementWrapper`,
        #: or :obj:`None` for the root element.
        self.parent = parent
        #: The previous sibling :class:`ElementWrapper`,
        #: or :obj:`None` for the root element or a first child.
        self.previous = previous
        if parent is not None:
            #: The :attr:`parent`’s children
            #: as a list of
            #: ElementTree :class:`xml.etree.ElementTree.Element`\ s.
            #: For the root (which has no parent),
            #: this is a list containing only the root element itself.
            self.etree_siblings = parent.etree_children
        else:
            self.etree_siblings = [etree_element]
        #: The position within the :attr:`parent`’s children, counting from 0.
        #: ``e.etree_siblings[e.index]`` is always ``e.etree_element``.
        self.index = index
        #: Whether the document is treated as HTML (see :meth:`from_html_root`).
        self.in_html_document = in_html_document
        #: Fallback language value given to the root constructor, or None.
        self.transport_content_language = content_language

        # Cache
        self._ancestors = None
        self._previous_siblings = None

    def __eq__(self, other):
        return (
            type(self) is type(other) and
            self.etree_element == other.etree_element)

    def __ne__(self, other):
        return not (self == other)

    def __hash__(self):
        return hash((type(self), self.etree_element))

    def __iter__(self):
        yield from self.iter_children()

    @property
    def ancestors(self):
        """Tuple of existing ancestors.

        Tuple of existing :class:`ElementWrapper` objects for this element’s
        ancestors, in tree order: the root comes first and :attr:`parent`
        comes last. The tuple is empty for the root element.

        """
        if self._ancestors is None:
            if self.parent is None:
                self._ancestors = ()
            else:
                # Reuse the parent's cached tuple and append the parent.
                self._ancestors = (*self.parent.ancestors, self.parent)
        return self._ancestors

    @property
    def previous_siblings(self):
        """Tuple of previous siblings.

        Tuple of existing :class:`ElementWrapper` objects for this element’s
        previous siblings, in tree order: the first sibling comes first and
        :attr:`previous` comes last. The tuple is empty when there is no
        previous sibling.

        """
        if self._previous_siblings is None:
            if self.previous is None:
                self._previous_siblings = ()
            else:
                self._previous_siblings = (
                    *self.previous.previous_siblings, self.previous)
        return self._previous_siblings

    def iter_ancestors(self):
        """Iterate over ancestors.

        Return an iterator of existing :class:`ElementWrapper` objects for this
        element’s ancestors, in reversed tree order (from :attr:`parent` to the
        root).

        The element itself is not included, this is an empty sequence for the
        root element.

        This method is deprecated and will be removed in version 0.7.0. Use
        :attr:`ancestors` instead.

        """
        # stacklevel=2 attributes the warning to the code consuming the
        # iterator rather than to this module.
        warn(
            'This method is deprecated and will be removed in version 0.7.0. '
            'Use the "ancestors" attribute instead.',
            DeprecationWarning, stacklevel=2)
        # ``ancestors`` is stored root-first; reverse it to honour the
        # documented parent-to-root order of this method.
        yield from reversed(self.ancestors)

    def iter_previous_siblings(self):
        """Iterate over previous siblings.

        Return an iterator of existing :class:`ElementWrapper` objects for this
        element’s previous siblings, in reversed tree order.

        The element itself is not included, this is an empty sequence for a
        first child or the root element.

        This method is deprecated and will be removed in version 0.7.0. Use
        :attr:`previous_siblings` instead.

        """
        warn(
            'This method is deprecated and will be removed in version 0.7.0. '
            'Use the "previous_siblings" attribute instead.',
            DeprecationWarning, stacklevel=2)
        # ``previous_siblings`` is stored first-sibling-first; reverse it to
        # honour the documented nearest-first order of this method.
        yield from reversed(self.previous_siblings)

    def iter_siblings(self):
        """Iterate over siblings.

        Return an iterator of newly-created :class:`ElementWrapper` objects for
        this element’s siblings, in tree order. For the root element, the
        only item yielded is the element itself.

        """
        if self.parent is None:
            yield self
        else:
            yield from self.parent.iter_children()

    def iter_next_siblings(self):
        """Iterate over next siblings.

        Return an iterator of newly-created :class:`ElementWrapper` objects for
        this element’s next siblings, in tree order.

        """
        found = False
        for sibling in self.iter_siblings():
            if found:
                yield sibling
            if sibling == self:
                found = True

    def iter_children(self):
        """Iterate over children.

        Return an iterator of newly-created :class:`ElementWrapper` objects for
        this element’s child elements, in tree order.

        """
        child = None
        for i, etree_child in enumerate(self.etree_children):
            # Each new wrapper is linked to the one created just before it.
            child = type(self)(
                etree_child, parent=self, index=i, previous=child,
                in_html_document=self.in_html_document)
            yield child

    def iter_subtree(self):
        """Iterate over subtree.

        Return an iterator of newly-created :class:`ElementWrapper` objects for
        the entire subtree rooted at this element, in tree order.

        Unlike in other methods, the element itself *is* included.

        This loops over an entire document:

        .. code-block:: python

            for element in ElementWrapper.from_xml_root(root).iter_subtree():
                ...

        """
        # Explicit stack of child iterators: depth-first, no recursion.
        stack = [iter([self])]
        while stack:
            element = next(stack[-1], None)
            if element is None:
                stack.pop()
            else:
                yield element
                stack.append(element.iter_children())

    @staticmethod
    def _compile(selectors):
        """Return the ``test`` callables of the usable selectors.

        Items that already have a ``test`` attribute are used as they are;
        anything else is passed to :func:`compile_selector_list`. Selectors
        with a pseudo-element, or flagged as never matching, are dropped.

        """
        return [
            compiled_selector.test
            for selector in selectors
            for compiled_selector in (
                [selector] if hasattr(selector, 'test')
                else compile_selector_list(selector))
            if compiled_selector.pseudo_element is None and
            not compiled_selector.never_matches]

    def matches(self, *selectors):
        """Return whether this element matches any of the given selectors.

        :param selectors:
            Each given selector is either a :class:`compiler.CompiledSelector`,
            or an argument to :func:`compile_selector_list`.

        """
        return any(test(self) for test in self._compile(selectors))

    def query_all(self, *selectors):
        """Return elements, in tree order, that match any of given selectors.

        Selectors are `scoped`_ to the subtree rooted at this element.

        .. _scoped: https://drafts.csswg.org/selectors-4/#scoping

        :param selectors:
            Each given selector is either a :class:`compiler.CompiledSelector`,
            or an argument to :func:`compile_selector_list`.
        :returns:
            An iterator of newly-created :class:`ElementWrapper` objects.

        """
        tests = self._compile(selectors)
        if not tests:
            # Nothing can match: do not walk the subtree at all.
            return iter(())
        if len(tests) == 1:
            return filter(tests[0], self.iter_subtree())
        return (
            element for element in self.iter_subtree()
            if any(test(element) for test in tests))

    def query(self, *selectors):
        """Return first element that matches any of given selectors.

        :param selectors:
            Each given selector is either a :class:`compiler.CompiledSelector`,
            or an argument to :func:`compile_selector_list`.
        :returns:
            A newly-created :class:`ElementWrapper` object,
            or :obj:`None` if there is no match.

        """
        return next(self.query_all(*selectors), None)

    @cached_property
    def etree_children(self):
        """Children as a list of :class:`xml.etree.ElementTree.Element`.

        Other ElementTree nodes such as
        :func:`comments <xml.etree.ElementTree.Comment>` and
        :func:`processing instructions
        <xml.etree.ElementTree.ProcessingInstruction>`
        are not included.

        """
        # Only nodes whose tag is a string are kept.
        return [
            element for element in self.etree_element
            if isinstance(element.tag, str)]

    @cached_property
    def local_name(self):
        """The local name of this element, as a string."""
        namespace_url, local_name = _split_etree_tag(self.etree_element.tag)
        # Fill the sibling cached property too: both come from one split.
        self.__dict__['namespace_url'] = namespace_url
        return local_name

    @cached_property
    def namespace_url(self):
        """The namespace URL of this element, as a string."""
        namespace_url, local_name = _split_etree_tag(self.etree_element.tag)
        # Fill the sibling cached property too: both come from one split.
        self.__dict__['local_name'] = local_name
        return namespace_url

    @cached_property
    def id(self):
        """The ID of this element, as a string (None without ``id``)."""
        return self.etree_element.get('id')

    @cached_property
    def classes(self):
        """The classes of this element, as a :class:`set` of strings."""
        return set(split_whitespace(self.etree_element.get('class', '')))

    @cached_property
    def lang(self):
        """The language of this element, as a string.

        Looked up in this order: ``xml:lang``, then ``lang`` (HTML only),
        then the parent's language, then for an HTML root the
        ``content-language`` ``<meta>`` pragma, then the transport value
        given to the root constructor. An empty string means unknown.

        """
        # http://whatwg.org/C#language
        xml_lang = self.etree_element.get(
            '{http://www.w3.org/XML/1998/namespace}lang')
        if xml_lang is not None:
            return ascii_lower(xml_lang)
        is_html = (
            self.in_html_document or
            self.namespace_url == 'http://www.w3.org/1999/xhtml')
        if is_html:
            lang = self.etree_element.get('lang')
            if lang is not None:
                return ascii_lower(lang)
        if self.parent is not None:
            return self.parent.lang
        # Root element
        if is_html:
            content_language = None
            iterator = self.etree_element.iter(
                '{http://www.w3.org/1999/xhtml}meta')
            for meta in iterator:
                http_equiv = meta.get('http-equiv', '')
                if ascii_lower(http_equiv) == 'content-language':
                    candidate = _parse_content_language(meta.get('content'))
                    # An unusable pragma is ignored; it must not erase a
                    # valid value set by an earlier <meta>.
                    if candidate is not None:
                        content_language = candidate
            if content_language is not None:
                return ascii_lower(content_language)
        # Empty string means unknown
        return _parse_content_language(self.transport_content_language) or ''

    @cached_property
    def in_disabled_fieldset(self):
        """Whether an XHTML ``fieldset`` ancestor disables this element.

        True when the parent is a ``fieldset`` with a ``disabled`` attribute,
        unless this element is that fieldset's first ``legend`` child;
        otherwise the answer is inherited from the parent.

        """
        if self.parent is None:
            return False
        fieldset = '{http://www.w3.org/1999/xhtml}fieldset'
        legend = '{http://www.w3.org/1999/xhtml}legend'
        # Read the cached tuple directly: the deprecated
        # iter_previous_siblings() would emit a DeprecationWarning here.
        disabled_fieldset = (
            self.parent.etree_element.tag == fieldset and
            self.parent.etree_element.get('disabled') is not None and (
                self.etree_element.tag != legend or any(
                    sibling.etree_element.tag == legend
                    for sibling in self.previous_siblings)))
        return disabled_fieldset or self.parent.in_disabled_fieldset
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _split_etree_tag(tag):
|
|||
|
|
position = tag.rfind('}')
|
|||
|
|
if position == -1 or tag[0] != '{':
|
|||
|
|
return '', tag
|
|||
|
|
else:
|
|||
|
|
return tag[1:position], tag[position+1:]
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _parse_content_language(value):
|
|||
|
|
if value is not None and ',' not in value:
|
|||
|
|
parts = split_whitespace(value)
|
|||
|
|
if len(parts) == 1:
|
|||
|
|
return parts[0]
|