Improve URL previews for some pages (#12951)
* Skip `og` and `meta` tags where the value is empty. * Fallback to the favicon if there are no other images. * Ignore tags meant for navigation.
This commit is contained in:
parent
888a29f412
commit
01df5bacac
|
@ -0,0 +1 @@
|
|||
Improve URL previews for pages with empty elements.
|
|
@ -30,6 +30,9 @@ _xml_encoding_match = re.compile(
|
|||
)
|
||||
_content_type_match = re.compile(r'.*; *charset="?(.*?)"?(;|$)', flags=re.I)
|
||||
|
||||
# Certain elements aren't meant for display.
|
||||
ARIA_ROLES_TO_IGNORE = {"directory", "menu", "menubar", "toolbar"}
|
||||
|
||||
|
||||
def _normalise_encoding(encoding: str) -> Optional[str]:
|
||||
"""Use the Python codec's name as the normalised entry."""
|
||||
|
@ -174,13 +177,15 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
|
|||
# "og:video:secure_url": "https://www.youtube.com/v/LXDBoHyjmtw?version=3",
|
||||
|
||||
og: Dict[str, Optional[str]] = {}
|
||||
for tag in tree.xpath("//*/meta[starts-with(@property, 'og:')]"):
|
||||
if "content" in tag.attrib:
|
||||
# if we've got more than 50 tags, someone is taking the piss
|
||||
if len(og) >= 50:
|
||||
logger.warning("Skipping OG for page with too many 'og:' tags")
|
||||
return {}
|
||||
og[tag.attrib["property"]] = tag.attrib["content"]
|
||||
for tag in tree.xpath(
|
||||
"//*/meta[starts-with(@property, 'og:')][@content][not(@content='')]"
|
||||
):
|
||||
# if we've got more than 50 tags, someone is taking the piss
|
||||
if len(og) >= 50:
|
||||
logger.warning("Skipping OG for page with too many 'og:' tags")
|
||||
return {}
|
||||
|
||||
og[tag.attrib["property"]] = tag.attrib["content"]
|
||||
|
||||
# TODO: grab article: meta tags too, e.g.:
|
||||
|
||||
|
@ -192,21 +197,23 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
|
|||
# "article:modified_time" content="2016-04-01T18:31:53+00:00" />
|
||||
|
||||
if "og:title" not in og:
|
||||
# do some basic spidering of the HTML
|
||||
title = tree.xpath("(//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1]")
|
||||
if title and title[0].text is not None:
|
||||
og["og:title"] = title[0].text.strip()
|
||||
# Attempt to find a title from the title tag, or the biggest header on the page.
|
||||
title = tree.xpath("((//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1])/text()")
|
||||
if title:
|
||||
og["og:title"] = title[0].strip()
|
||||
else:
|
||||
og["og:title"] = None
|
||||
|
||||
if "og:image" not in og:
|
||||
# TODO: extract a favicon failing all else
|
||||
meta_image = tree.xpath(
|
||||
"//*/meta[translate(@itemprop, 'IMAGE', 'image')='image']/@content"
|
||||
"//*/meta[translate(@itemprop, 'IMAGE', 'image')='image'][not(@content='')]/@content[1]"
|
||||
)
|
||||
# If a meta image is found, use it.
|
||||
if meta_image:
|
||||
og["og:image"] = meta_image[0]
|
||||
else:
|
||||
# Try to find images which are larger than 10px by 10px.
|
||||
#
|
||||
# TODO: consider inlined CSS styles as well as width & height attribs
|
||||
images = tree.xpath("//img[@src][number(@width)>10][number(@height)>10]")
|
||||
images = sorted(
|
||||
|
@ -215,17 +222,24 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
|
|||
-1 * float(i.attrib["width"]) * float(i.attrib["height"])
|
||||
),
|
||||
)
|
||||
# If no images were found, try to find *any* images.
|
||||
if not images:
|
||||
images = tree.xpath("//img[@src]")
|
||||
images = tree.xpath("//img[@src][1]")
|
||||
if images:
|
||||
og["og:image"] = images[0].attrib["src"]
|
||||
|
||||
# Finally, fallback to the favicon if nothing else.
|
||||
else:
|
||||
favicons = tree.xpath("//link[@href][contains(@rel, 'icon')]/@href[1]")
|
||||
if favicons:
|
||||
og["og:image"] = favicons[0]
|
||||
|
||||
if "og:description" not in og:
|
||||
# Check the first meta description tag for content.
|
||||
meta_description = tree.xpath(
|
||||
"//*/meta"
|
||||
"[translate(@name, 'DESCRIPTION', 'description')='description']"
|
||||
"/@content"
|
||||
"//*/meta[translate(@name, 'DESCRIPTION', 'description')='description'][not(@content='')]/@content[1]"
|
||||
)
|
||||
# If a meta description is found with content, use it.
|
||||
if meta_description:
|
||||
og["og:description"] = meta_description[0]
|
||||
else:
|
||||
|
@ -306,6 +320,10 @@ def _iterate_over_text(
|
|||
if isinstance(el, str):
|
||||
yield el
|
||||
elif el.tag not in tags_to_ignore:
|
||||
# If the element isn't meant for display, ignore it.
|
||||
if el.get("role") in ARIA_ROLES_TO_IGNORE:
|
||||
continue
|
||||
|
||||
# el.text is the text before the first child, so we can immediately
|
||||
# return it if the text exists.
|
||||
if el.text:
|
||||
|
|
|
@ -145,7 +145,7 @@ class SummarizeTestCase(unittest.TestCase):
|
|||
)
|
||||
|
||||
|
||||
class CalcOgTestCase(unittest.TestCase):
|
||||
class OpenGraphFromHtmlTestCase(unittest.TestCase):
|
||||
if not lxml:
|
||||
skip = "url preview feature requires lxml"
|
||||
|
||||
|
@ -235,6 +235,21 @@ class CalcOgTestCase(unittest.TestCase):
|
|||
|
||||
self.assertEqual(og, {"og:title": None, "og:description": "Some text."})
|
||||
|
||||
# Another variant is a title with no content.
|
||||
html = b"""
|
||||
<html>
|
||||
<head><title></title></head>
|
||||
<body>
|
||||
<h1>Title</h1>
|
||||
</body>
|
||||
</html>
|
||||
"""
|
||||
|
||||
tree = decode_body(html, "http://example.com/test.html")
|
||||
og = parse_html_to_open_graph(tree)
|
||||
|
||||
self.assertEqual(og, {"og:title": "Title", "og:description": "Title"})
|
||||
|
||||
def test_h1_as_title(self) -> None:
|
||||
html = b"""
|
||||
<html>
|
||||
|
@ -250,6 +265,26 @@ class CalcOgTestCase(unittest.TestCase):
|
|||
|
||||
self.assertEqual(og, {"og:title": "Title", "og:description": "Some text."})
|
||||
|
||||
def test_empty_description(self) -> None:
|
||||
"""Description tags with empty content should be ignored."""
|
||||
html = b"""
|
||||
<html>
|
||||
<meta property="og:description" content=""/>
|
||||
<meta property="og:description"/>
|
||||
<meta name="description" content=""/>
|
||||
<meta name="description"/>
|
||||
<meta name="description" content="Finally!"/>
|
||||
<body>
|
||||
<h1>Title</h1>
|
||||
</body>
|
||||
</html>
|
||||
"""
|
||||
|
||||
tree = decode_body(html, "http://example.com/test.html")
|
||||
og = parse_html_to_open_graph(tree)
|
||||
|
||||
self.assertEqual(og, {"og:title": "Title", "og:description": "Finally!"})
|
||||
|
||||
def test_missing_title_and_broken_h1(self) -> None:
|
||||
html = b"""
|
||||
<html>
|
||||
|
|
Loading…
Reference in New Issue