Merge pull request from GHSA-22p3-qrh9-cx32
* Make _iterate_over_text easier to read by using simple data structures * Prefer a set of tags to ignore In my tests, it's 4x faster to check for containment in a set of this size * Add a stack size limit to _iterate_over_text * Continue accepting the case where there is no body element * Use an early return instead for None Co-authored-by: Richard van der Hoff <richard@matrix.org>
This commit is contained in:
parent
21e6c0ed64
commit
fa13080618
|
@ -12,10 +12,9 @@
|
|||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import codecs
|
||||
import itertools
|
||||
import logging
|
||||
import re
|
||||
from typing import TYPE_CHECKING, Dict, Generator, Iterable, Optional, Set, Union
|
||||
from typing import TYPE_CHECKING, Dict, Generator, Iterable, List, Optional, Set, Union
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from lxml import etree
|
||||
|
@ -276,7 +275,7 @@ def parse_html_description(tree: "etree.Element") -> Optional[str]:
|
|||
|
||||
from lxml import etree
|
||||
|
||||
TAGS_TO_REMOVE = (
|
||||
TAGS_TO_REMOVE = {
|
||||
"header",
|
||||
"nav",
|
||||
"aside",
|
||||
|
@ -291,31 +290,42 @@ def parse_html_description(tree: "etree.Element") -> Optional[str]:
|
|||
"img",
|
||||
"picture",
|
||||
etree.Comment,
|
||||
)
|
||||
}
|
||||
|
||||
# Split all the text nodes into paragraphs (by splitting on new
|
||||
# lines)
|
||||
text_nodes = (
|
||||
re.sub(r"\s+", "\n", el).strip()
|
||||
for el in _iterate_over_text(tree.find("body"), *TAGS_TO_REMOVE)
|
||||
for el in _iterate_over_text(tree.find("body"), TAGS_TO_REMOVE)
|
||||
)
|
||||
return summarize_paragraphs(text_nodes)
|
||||
|
||||
|
||||
def _iterate_over_text(
|
||||
tree: "etree.Element", *tags_to_ignore: Union[str, "etree.Comment"]
|
||||
tree: Optional["etree.Element"],
|
||||
tags_to_ignore: Set[Union[str, "etree.Comment"]],
|
||||
stack_limit: int = 1024,
|
||||
) -> Generator[str, None, None]:
|
||||
"""Iterate over the tree returning text nodes in a depth first fashion,
|
||||
skipping text nodes inside certain tags.
|
||||
|
||||
Args:
|
||||
tree: The parent element to iterate. Can be None if there isn't one.
|
||||
tags_to_ignore: Set of tags to ignore
|
||||
stack_limit: Maximum stack size limit for depth-first traversal.
|
||||
Nodes will be dropped if this limit is hit, which may truncate the
|
||||
textual result.
|
||||
Intended to limit the maximum working memory when generating a preview.
|
||||
"""
|
||||
# This is basically a stack that we extend using itertools.chain.
|
||||
# This will either consist of an element to iterate over *or* a string
|
||||
|
||||
if tree is None:
|
||||
return
|
||||
|
||||
# This is a stack whose items are elements to iterate over *or* strings
|
||||
# to be returned.
|
||||
elements = iter([tree])
|
||||
while True:
|
||||
el = next(elements, None)
|
||||
if el is None:
|
||||
return
|
||||
elements: List[Union[str, "etree.Element"]] = [tree]
|
||||
while elements:
|
||||
el = elements.pop()
|
||||
|
||||
if isinstance(el, str):
|
||||
yield el
|
||||
|
@ -329,17 +339,22 @@ def _iterate_over_text(
|
|||
if el.text:
|
||||
yield el.text
|
||||
|
||||
# We add to the stack all the elements children, interspersed with
|
||||
# each child's tail text (if it exists). The tail text of a node
|
||||
# is text that comes *after* the node, so we always include it even
|
||||
# if we ignore the child node.
|
||||
elements = itertools.chain(
|
||||
itertools.chain.from_iterable( # Basically a flatmap
|
||||
[child, child.tail] if child.tail else [child]
|
||||
for child in el.iterchildren()
|
||||
),
|
||||
elements,
|
||||
)
|
||||
# We add to the stack all the element's children, interspersed with
|
||||
# each child's tail text (if it exists).
|
||||
#
|
||||
# We iterate in reverse order so that earlier pieces of text appear
|
||||
# closer to the top of the stack.
|
||||
for child in el.iterchildren(reversed=True):
|
||||
if len(elements) > stack_limit:
|
||||
# We've hit our limit for working memory
|
||||
break
|
||||
|
||||
if child.tail:
|
||||
# The tail text of a node is text that comes *after* the node,
|
||||
# so we always include it even if we ignore the child node.
|
||||
elements.append(child.tail)
|
||||
|
||||
elements.append(child)
|
||||
|
||||
|
||||
def summarize_paragraphs(
|
||||
|
|
|
@ -370,6 +370,23 @@ class OpenGraphFromHtmlTestCase(unittest.TestCase):
|
|||
og = parse_html_to_open_graph(tree)
|
||||
self.assertEqual(og, {"og:title": "ó", "og:description": "Some text."})
|
||||
|
||||
def test_nested_nodes(self) -> None:
|
||||
"""A body with some nested nodes. Tests that we iterate over children
|
||||
in the right order (and don't reverse the order of the text)."""
|
||||
html = b"""
|
||||
<a href="somewhere">Welcome <b>the bold <u>and underlined text <svg>
|
||||
with a cheeky SVG</svg></u> and <strong>some</strong> tail text</b></a>
|
||||
"""
|
||||
tree = decode_body(html, "http://example.com/test.html")
|
||||
og = parse_html_to_open_graph(tree)
|
||||
self.assertEqual(
|
||||
og,
|
||||
{
|
||||
"og:title": None,
|
||||
"og:description": "Welcome\n\nthe bold\n\nand underlined text\n\nand\n\nsome\n\ntail text",
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
class MediaEncodingTestCase(unittest.TestCase):
|
||||
def test_meta_charset(self) -> None:
|
||||
|
|
Loading…
Reference in New Issue