Improve URL previews for some pages (#12951)
* Skip `og` and `meta` tags where the value is empty. * Fallback to the favicon if there are no other images. * Ignore tags meant for navigation.
This commit is contained in:
parent
888a29f412
commit
01df5bacac
|
@ -0,0 +1 @@
|
||||||
|
Improve URL previews for pages with empty elements.
|
|
@ -30,6 +30,9 @@ _xml_encoding_match = re.compile(
|
||||||
)
|
)
|
||||||
_content_type_match = re.compile(r'.*; *charset="?(.*?)"?(;|$)', flags=re.I)
|
_content_type_match = re.compile(r'.*; *charset="?(.*?)"?(;|$)', flags=re.I)
|
||||||
|
|
||||||
|
# Certain elements aren't meant for display.
|
||||||
|
ARIA_ROLES_TO_IGNORE = {"directory", "menu", "menubar", "toolbar"}
|
||||||
|
|
||||||
|
|
||||||
def _normalise_encoding(encoding: str) -> Optional[str]:
|
def _normalise_encoding(encoding: str) -> Optional[str]:
|
||||||
"""Use the Python codec's name as the normalised entry."""
|
"""Use the Python codec's name as the normalised entry."""
|
||||||
|
@ -174,13 +177,15 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
|
||||||
# "og:video:secure_url": "https://www.youtube.com/v/LXDBoHyjmtw?version=3",
|
# "og:video:secure_url": "https://www.youtube.com/v/LXDBoHyjmtw?version=3",
|
||||||
|
|
||||||
og: Dict[str, Optional[str]] = {}
|
og: Dict[str, Optional[str]] = {}
|
||||||
for tag in tree.xpath("//*/meta[starts-with(@property, 'og:')]"):
|
for tag in tree.xpath(
|
||||||
if "content" in tag.attrib:
|
"//*/meta[starts-with(@property, 'og:')][@content][not(@content='')]"
|
||||||
# if we've got more than 50 tags, someone is taking the piss
|
):
|
||||||
if len(og) >= 50:
|
# if we've got more than 50 tags, someone is taking the piss
|
||||||
logger.warning("Skipping OG for page with too many 'og:' tags")
|
if len(og) >= 50:
|
||||||
return {}
|
logger.warning("Skipping OG for page with too many 'og:' tags")
|
||||||
og[tag.attrib["property"]] = tag.attrib["content"]
|
return {}
|
||||||
|
|
||||||
|
og[tag.attrib["property"]] = tag.attrib["content"]
|
||||||
|
|
||||||
# TODO: grab article: meta tags too, e.g.:
|
# TODO: grab article: meta tags too, e.g.:
|
||||||
|
|
||||||
|
@ -192,21 +197,23 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
|
||||||
# "article:modified_time" content="2016-04-01T18:31:53+00:00" />
|
# "article:modified_time" content="2016-04-01T18:31:53+00:00" />
|
||||||
|
|
||||||
if "og:title" not in og:
|
if "og:title" not in og:
|
||||||
# do some basic spidering of the HTML
|
# Attempt to find a title from the title tag, or the biggest header on the page.
|
||||||
title = tree.xpath("(//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1]")
|
title = tree.xpath("((//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1])/text()")
|
||||||
if title and title[0].text is not None:
|
if title:
|
||||||
og["og:title"] = title[0].text.strip()
|
og["og:title"] = title[0].strip()
|
||||||
else:
|
else:
|
||||||
og["og:title"] = None
|
og["og:title"] = None
|
||||||
|
|
||||||
if "og:image" not in og:
|
if "og:image" not in og:
|
||||||
# TODO: extract a favicon failing all else
|
|
||||||
meta_image = tree.xpath(
|
meta_image = tree.xpath(
|
||||||
"//*/meta[translate(@itemprop, 'IMAGE', 'image')='image']/@content"
|
"//*/meta[translate(@itemprop, 'IMAGE', 'image')='image'][not(@content='')]/@content[1]"
|
||||||
)
|
)
|
||||||
|
# If a meta image is found, use it.
|
||||||
if meta_image:
|
if meta_image:
|
||||||
og["og:image"] = meta_image[0]
|
og["og:image"] = meta_image[0]
|
||||||
else:
|
else:
|
||||||
|
# Try to find images which are larger than 10px by 10px.
|
||||||
|
#
|
||||||
# TODO: consider inlined CSS styles as well as width & height attribs
|
# TODO: consider inlined CSS styles as well as width & height attribs
|
||||||
images = tree.xpath("//img[@src][number(@width)>10][number(@height)>10]")
|
images = tree.xpath("//img[@src][number(@width)>10][number(@height)>10]")
|
||||||
images = sorted(
|
images = sorted(
|
||||||
|
@ -215,17 +222,24 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
|
||||||
-1 * float(i.attrib["width"]) * float(i.attrib["height"])
|
-1 * float(i.attrib["width"]) * float(i.attrib["height"])
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
# If no images were found, try to find *any* images.
|
||||||
if not images:
|
if not images:
|
||||||
images = tree.xpath("//img[@src]")
|
images = tree.xpath("//img[@src][1]")
|
||||||
if images:
|
if images:
|
||||||
og["og:image"] = images[0].attrib["src"]
|
og["og:image"] = images[0].attrib["src"]
|
||||||
|
|
||||||
|
# Finally, fallback to the favicon if nothing else.
|
||||||
|
else:
|
||||||
|
favicons = tree.xpath("//link[@href][contains(@rel, 'icon')]/@href[1]")
|
||||||
|
if favicons:
|
||||||
|
og["og:image"] = favicons[0]
|
||||||
|
|
||||||
if "og:description" not in og:
|
if "og:description" not in og:
|
||||||
|
# Check the first meta description tag for content.
|
||||||
meta_description = tree.xpath(
|
meta_description = tree.xpath(
|
||||||
"//*/meta"
|
"//*/meta[translate(@name, 'DESCRIPTION', 'description')='description'][not(@content='')]/@content[1]"
|
||||||
"[translate(@name, 'DESCRIPTION', 'description')='description']"
|
|
||||||
"/@content"
|
|
||||||
)
|
)
|
||||||
|
# If a meta description is found with content, use it.
|
||||||
if meta_description:
|
if meta_description:
|
||||||
og["og:description"] = meta_description[0]
|
og["og:description"] = meta_description[0]
|
||||||
else:
|
else:
|
||||||
|
@ -306,6 +320,10 @@ def _iterate_over_text(
|
||||||
if isinstance(el, str):
|
if isinstance(el, str):
|
||||||
yield el
|
yield el
|
||||||
elif el.tag not in tags_to_ignore:
|
elif el.tag not in tags_to_ignore:
|
||||||
|
# If the element isn't meant for display, ignore it.
|
||||||
|
if el.get("role") in ARIA_ROLES_TO_IGNORE:
|
||||||
|
continue
|
||||||
|
|
||||||
# el.text is the text before the first child, so we can immediately
|
# el.text is the text before the first child, so we can immediately
|
||||||
# return it if the text exists.
|
# return it if the text exists.
|
||||||
if el.text:
|
if el.text:
|
||||||
|
|
|
@ -145,7 +145,7 @@ class SummarizeTestCase(unittest.TestCase):
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
class CalcOgTestCase(unittest.TestCase):
|
class OpenGraphFromHtmlTestCase(unittest.TestCase):
|
||||||
if not lxml:
|
if not lxml:
|
||||||
skip = "url preview feature requires lxml"
|
skip = "url preview feature requires lxml"
|
||||||
|
|
||||||
|
@ -235,6 +235,21 @@ class CalcOgTestCase(unittest.TestCase):
|
||||||
|
|
||||||
self.assertEqual(og, {"og:title": None, "og:description": "Some text."})
|
self.assertEqual(og, {"og:title": None, "og:description": "Some text."})
|
||||||
|
|
||||||
|
# Another variant is a title with no content.
|
||||||
|
html = b"""
|
||||||
|
<html>
|
||||||
|
<head><title></title></head>
|
||||||
|
<body>
|
||||||
|
<h1>Title</h1>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
"""
|
||||||
|
|
||||||
|
tree = decode_body(html, "http://example.com/test.html")
|
||||||
|
og = parse_html_to_open_graph(tree)
|
||||||
|
|
||||||
|
self.assertEqual(og, {"og:title": "Title", "og:description": "Title"})
|
||||||
|
|
||||||
def test_h1_as_title(self) -> None:
|
def test_h1_as_title(self) -> None:
|
||||||
html = b"""
|
html = b"""
|
||||||
<html>
|
<html>
|
||||||
|
@ -250,6 +265,26 @@ class CalcOgTestCase(unittest.TestCase):
|
||||||
|
|
||||||
self.assertEqual(og, {"og:title": "Title", "og:description": "Some text."})
|
self.assertEqual(og, {"og:title": "Title", "og:description": "Some text."})
|
||||||
|
|
||||||
|
def test_empty_description(self) -> None:
|
||||||
|
"""Description tags with empty content should be ignored."""
|
||||||
|
html = b"""
|
||||||
|
<html>
|
||||||
|
<meta property="og:description" content=""/>
|
||||||
|
<meta property="og:description"/>
|
||||||
|
<meta name="description" content=""/>
|
||||||
|
<meta name="description"/>
|
||||||
|
<meta name="description" content="Finally!"/>
|
||||||
|
<body>
|
||||||
|
<h1>Title</h1>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
"""
|
||||||
|
|
||||||
|
tree = decode_body(html, "http://example.com/test.html")
|
||||||
|
og = parse_html_to_open_graph(tree)
|
||||||
|
|
||||||
|
self.assertEqual(og, {"og:title": "Title", "og:description": "Finally!"})
|
||||||
|
|
||||||
def test_missing_title_and_broken_h1(self) -> None:
|
def test_missing_title_and_broken_h1(self) -> None:
|
||||||
html = b"""
|
html = b"""
|
||||||
<html>
|
<html>
|
||||||
|
|
Loading…
Reference in New Issue