From 6dec726b1faddea054ee14312ac5c234ee338608 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Wed, 10 Dec 2025 10:24:09 -0500 Subject: [PATCH 1/6] Use BeautifulSoup instead of LXML directly. --- poetry.lock | 87 +++++--- pyproject.toml | 3 +- synapse/media/oembed.py | 95 ++++----- synapse/media/preview_html.py | 301 ++++++++------------------- synapse/media/url_previewer.py | 10 +- tests/media/test_html_preview.py | 269 +++++++++--------------- tests/rest/client/test_media.py | 2 +- tests/rest/media/test_url_preview.py | 2 +- 8 files changed, 295 insertions(+), 474 deletions(-) diff --git a/poetry.lock b/poetry.lock index 8c9256c8929..62080647c09 100644 --- a/poetry.lock +++ b/poetry.lock @@ -31,7 +31,7 @@ description = "The ultimate Python library in building OAuth and OpenID Connect optional = true python-versions = ">=3.9" groups = ["main"] -markers = "extra == \"all\" or extra == \"jwt\" or extra == \"oidc\"" +markers = "extra == \"oidc\" or extra == \"jwt\" or extra == \"all\"" files = [ {file = "authlib-1.6.5-py2.py3-none-any.whl", hash = "sha256:3e0e0507807f842b02175507bdee8957a1d5707fd4afb17c32fb43fee90b6e3a"}, {file = "authlib-1.6.5.tar.gz", hash = "sha256:6aaf9c79b7cc96c900f0b284061691c5d4e61221640a948fe690b556a6d6d10b"}, @@ -132,6 +132,30 @@ files = [ tests = ["pytest (>=3.2.1,!=3.3.0)"] typecheck = ["mypy"] +[[package]] +name = "beautifulsoup4" +version = "4.14.3" +description = "Screen-scraping library" +optional = true +python-versions = ">=3.7.0" +groups = ["main"] +markers = "extra == \"url-preview\" or extra == \"all\"" +files = [ + {file = "beautifulsoup4-4.14.3-py3-none-any.whl", hash = "sha256:0918bfe44902e6ad8d57732ba310582e98da931428d231a5ecb9e7c703a735bb"}, + {file = "beautifulsoup4-4.14.3.tar.gz", hash = "sha256:6292b1c5186d356bba669ef9f7f051757099565ad9ada5dd630bd9de5fa7fb86"}, +] + +[package.dependencies] +soupsieve = ">=1.6.1" +typing-extensions = ">=4.0.0" + +[package.extras] +cchardet = ["cchardet"] +chardet = ["chardet"] +charset-normalizer = ["charset-normalizer"] +html5lib = ["html5lib"] +lxml = ["lxml"] + [[package]] name = "bleach" version = "6.3.0" @@ -481,7 +505,7 @@ description = "XML bomb protection for Python stdlib modules" optional = true python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" groups = ["main"] -markers = "extra == \"all\" or extra == \"saml2\"" +markers = "extra == \"saml2\" or extra == \"all\"" files = [ {file = "defusedxml-0.7.1-py2.py3-none-any.whl", hash = "sha256:a352e7e428770286cc899e2542b6cdaedb2b4953ff269a210103ec58f6198a61"}, {file = "defusedxml-0.7.1.tar.gz", hash = "sha256:1bb3032db185915b62d7c6209c5a8792be6a32ab2fedacc84e01b52c51aa3e69"}, @@ -506,7 +530,7 @@ description = "XPath 1.0/2.0/3.0/3.1 parsers and selectors for ElementTree and l optional = true python-versions = ">=3.7" groups = ["main"] -markers = "extra == \"all\" or extra == \"saml2\"" +markers = "extra == \"saml2\" or extra == \"all\"" files = [ {file = "elementpath-4.1.5-py3-none-any.whl", hash = "sha256:2ac1a2fb31eb22bbbf817f8cf6752f844513216263f0e3892c8e79782fe4bb55"}, {file = "elementpath-4.1.5.tar.gz", hash = "sha256:c2d6dc524b29ef751ecfc416b0627668119d8812441c555d7471da41d4bacb8d"}, @@ -556,7 +580,7 @@ description = "Python wrapper for hiredis" optional = true python-versions = ">=3.8" groups = ["main"] -markers = "extra == \"all\" or extra == \"redis\"" +markers = "extra == \"redis\" or extra == \"all\"" files = [ {file = "hiredis-3.3.0-cp310-cp310-macosx_10_15_universal2.whl", hash = 
"sha256:9937d9b69321b393fbace69f55423480f098120bc55a3316e1ca3508c4dbbd6f"}, {file = "hiredis-3.3.0-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:50351b77f89ba6a22aff430b993653847f36b71d444509036baa0f2d79d1ebf4"}, @@ -879,7 +903,7 @@ description = "Jaeger Python OpenTracing Tracer implementation" optional = true python-versions = ">=3.7" groups = ["main"] -markers = "extra == \"all\" or extra == \"opentracing\"" +markers = "extra == \"opentracing\" or extra == \"all\"" files = [ {file = "jaeger-client-4.8.0.tar.gz", hash = "sha256:3157836edab8e2c209bd2d6ae61113db36f7ee399e66b1dcbb715d87ab49bfe0"}, ] @@ -1017,7 +1041,7 @@ description = "A strictly RFC 4510 conforming LDAP V3 pure Python client library optional = true python-versions = "*" groups = ["main"] -markers = "extra == \"all\" or extra == \"matrix-synapse-ldap3\"" +markers = "extra == \"matrix-synapse-ldap3\" or extra == \"all\"" files = [ {file = "ldap3-2.9.1-py2.py3-none-any.whl", hash = "sha256:5869596fc4948797020d3f03b7939da938778a0f9e2009f7a072ccf92b8e8d70"}, {file = "ldap3-2.9.1.tar.gz", hash = "sha256:f3e7fc4718e3f09dda568b57100095e0ce58633bcabbed8667ce3f8fbaa4229f"}, @@ -1119,7 +1143,7 @@ description = "Powerful and Pythonic XML processing library combining libxml2/li optional = true python-versions = ">=3.8" groups = ["main"] -markers = "extra == \"all\" or extra == \"url-preview\"" +markers = "extra == \"url-preview\" or extra == \"all\"" files = [ {file = "lxml-6.0.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:e77dd455b9a16bbd2a5036a63ddbd479c19572af81b624e79ef422f929eef388"}, {file = "lxml-6.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:5d444858b9f07cefff6455b983aea9a67f7462ba1f6cbe4a21e8bf6791bf2153"}, @@ -1405,7 +1429,7 @@ description = "An LDAP3 auth provider for Synapse" optional = true python-versions = ">=3.7" groups = ["main"] -markers = "extra == \"all\" or extra == \"matrix-synapse-ldap3\"" +markers = "extra == \"matrix-synapse-ldap3\" or extra == \"all\"" files = [ {file = "matrix-synapse-ldap3-0.3.0.tar.gz", hash = "sha256:8bb6517173164d4b9cc44f49de411d8cebdb2e705d5dd1ea1f38733c4a009e1d"}, {file = "matrix_synapse_ldap3-0.3.0-py3-none-any.whl", hash = "sha256:8b4d701f8702551e98cc1d8c20dbed532de5613584c08d0df22de376ba99159d"}, @@ -1648,7 +1672,7 @@ description = "OpenTracing API for Python. See documentation at http://opentraci optional = true python-versions = "*" groups = ["main"] -markers = "extra == \"all\" or extra == \"opentracing\"" +markers = "extra == \"opentracing\" or extra == \"all\"" files = [ {file = "opentracing-2.4.0.tar.gz", hash = "sha256:a173117e6ef580d55874734d1fa7ecb6f3655160b8b8974a2a1e98e5ec9c840d"}, ] @@ -1838,7 +1862,7 @@ description = "psycopg2 - Python-PostgreSQL Database Adapter" optional = true python-versions = ">=3.9" groups = ["main"] -markers = "extra == \"all\" or extra == \"postgres\"" +markers = "extra == \"postgres\" or extra == \"all\"" files = [ {file = "psycopg2-2.9.11-cp310-cp310-win_amd64.whl", hash = "sha256:103e857f46bb76908768ead4e2d0ba1d1a130e7b8ed77d3ae91e8b33481813e8"}, {file = "psycopg2-2.9.11-cp311-cp311-win_amd64.whl", hash = "sha256:210daed32e18f35e3140a1ebe059ac29209dd96468f2f7559aa59f75ee82a5cb"}, @@ -1856,7 +1880,7 @@ description = ".. 
image:: https://travis-ci.org/chtd/psycopg2cffi.svg?branch=mas optional = true python-versions = "*" groups = ["main"] -markers = "platform_python_implementation == \"PyPy\" and (extra == \"all\" or extra == \"postgres\")" +markers = "platform_python_implementation == \"PyPy\" and (extra == \"postgres\" or extra == \"all\")" files = [ {file = "psycopg2cffi-2.9.0.tar.gz", hash = "sha256:7e272edcd837de3a1d12b62185eb85c45a19feda9e62fa1b120c54f9e8d35c52"}, ] @@ -1872,7 +1896,7 @@ description = "A Simple library to enable psycopg2 compatability" optional = true python-versions = "*" groups = ["main"] -markers = "platform_python_implementation == \"PyPy\" and (extra == \"all\" or extra == \"postgres\")" +markers = "platform_python_implementation == \"PyPy\" and (extra == \"postgres\" or extra == \"all\")" files = [ {file = "psycopg2cffi-compat-1.1.tar.gz", hash = "sha256:d25e921748475522b33d13420aad5c2831c743227dc1f1f2585e0fdb5c914e05"}, ] @@ -2154,7 +2178,7 @@ description = "A development tool to measure, monitor and analyze the memory beh optional = true python-versions = ">=3.6" groups = ["main"] -markers = "extra == \"all\" or extra == \"cache-memory\"" +markers = "extra == \"cache-memory\" or extra == \"all\"" files = [ {file = "Pympler-1.0.1-py3-none-any.whl", hash = "sha256:d260dda9ae781e1eab6ea15bacb84015849833ba5555f141d2d9b7b7473b307d"}, {file = "Pympler-1.0.1.tar.gz", hash = "sha256:993f1a3599ca3f4fcd7160c7545ad06310c9e12f70174ae7ae8d4e25f6c5d3fa"}, @@ -2214,7 +2238,7 @@ description = "Python implementation of SAML Version 2 Standard" optional = true python-versions = ">=3.9,<4.0" groups = ["main"] -markers = "extra == \"all\" or extra == \"saml2\"" +markers = "extra == \"saml2\" or extra == \"all\"" files = [ {file = "pysaml2-7.5.0-py3-none-any.whl", hash = "sha256:bc6627cc344476a83c757f440a73fda1369f13b6fda1b4e16bca63ffbabb5318"}, {file = "pysaml2-7.5.0.tar.gz", hash = "sha256:f36871d4e5ee857c6b85532e942550d2cf90ea4ee943d75eb681044bbc4f54f7"}, @@ -2239,7 +2263,7 @@ description = "Extensions to the standard Python datetime module" optional = true python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" groups = ["main"] -markers = "extra == \"all\" or extra == \"saml2\"" +markers = "extra == \"saml2\" or extra == \"all\"" files = [ {file = "python-dateutil-2.8.2.tar.gz", hash = "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86"}, {file = "python_dateutil-2.8.2-py2.py3-none-any.whl", hash = "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"}, @@ -2267,7 +2291,7 @@ description = "World timezone definitions, modern and historical" optional = true python-versions = "*" groups = ["main"] -markers = "extra == \"all\" or extra == \"saml2\"" +markers = "extra == \"saml2\" or extra == \"all\"" files = [ {file = "pytz-2025.2-py2.py3-none-any.whl", hash = "sha256:5ddf76296dd8c44c26eb8f4b6f35488f3ccbf6fbbd7adee0b7262d43f0ec2f00"}, {file = "pytz-2025.2.tar.gz", hash = "sha256:360b9e3dbb49a209c21ad61809c7fb453643e048b38924c765813546746e81c3"}, @@ -2671,7 +2695,7 @@ description = "Python client for Sentry (https://sentry.io)" optional = true python-versions = ">=3.6" groups = ["main"] -markers = "extra == \"all\" or extra == \"sentry\"" +markers = "extra == \"sentry\" or extra == \"all\"" files = [ {file = "sentry_sdk-2.46.0-py2.py3-none-any.whl", hash = "sha256:4eeeb60198074dff8d066ea153fa6f241fef1668c10900ea53a4200abc8da9b1"}, {file = "sentry_sdk-2.46.0.tar.gz", hash = "sha256:91821a23460725734b7741523021601593f35731808afc0bb2ba46c27b8acd91"}, @@ -2846,6 
+2870,19 @@ files = [ {file = "sortedcontainers-2.4.0.tar.gz", hash = "sha256:25caa5a06cc30b6b83d11423433f65d1f9d76c4c6a0c90e3379eaa43b9bfdb88"}, ] +[[package]] +name = "soupsieve" +version = "2.8" +description = "A modern CSS selector implementation for Beautiful Soup." +optional = true +python-versions = ">=3.9" +groups = ["main"] +markers = "extra == \"url-preview\" or extra == \"all\"" +files = [ + {file = "soupsieve-2.8-py3-none-any.whl", hash = "sha256:0cc76456a30e20f5d7f2e14a98a4ae2ee4e5abdc7c5ea0aafe795f344bc7984c"}, + {file = "soupsieve-2.8.tar.gz", hash = "sha256:e2dd4a40a628cb5f28f6d4b0db8800b8f581b65bb380b97de22ba5ca8d72572f"}, +] + [[package]] name = "sqlglot" version = "28.0.0" @@ -2881,7 +2918,7 @@ description = "Tornado IOLoop Backed Concurrent Futures" optional = true python-versions = "*" groups = ["main"] -markers = "extra == \"all\" or extra == \"opentracing\"" +markers = "extra == \"opentracing\" or extra == \"all\"" files = [ {file = "threadloop-1.0.2-py2-none-any.whl", hash = "sha256:5c90dbefab6ffbdba26afb4829d2a9df8275d13ac7dc58dccb0e279992679599"}, {file = "threadloop-1.0.2.tar.gz", hash = "sha256:8b180aac31013de13c2ad5c834819771992d350267bddb854613ae77ef571944"}, @@ -2897,7 +2934,7 @@ description = "Python bindings for the Apache Thrift RPC system" optional = true python-versions = "*" groups = ["main"] -markers = "extra == \"all\" or extra == \"opentracing\"" +markers = "extra == \"opentracing\" or extra == \"all\"" files = [ {file = "thrift-0.16.0.tar.gz", hash = "sha256:2b5b6488fcded21f9d312aa23c9ff6a0195d0f6ae26ddbd5ad9e3e25dfc14408"}, ] @@ -2970,7 +3007,7 @@ description = "Tornado is a Python web framework and asynchronous networking lib optional = true python-versions = ">=3.9" groups = ["main"] -markers = "extra == \"all\" or extra == \"opentracing\"" +markers = "extra == \"opentracing\" or extra == \"all\"" files = [ {file = "tornado-6.5-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:f81067dad2e4443b015368b24e802d0083fecada4f0a4572fdb72fc06e54a9a6"}, {file = "tornado-6.5-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:9ac1cbe1db860b3cbb251e795c701c41d343f06a96049d6274e7c77559117e41"}, @@ -3104,7 +3141,7 @@ description = "non-blocking redis client for python" optional = true python-versions = "*" groups = ["main"] -markers = "extra == \"all\" or extra == \"redis\"" +markers = "extra == \"redis\" or extra == \"all\"" files = [ {file = "txredisapi-1.4.11-py3-none-any.whl", hash = "sha256:ac64d7a9342b58edca13ef267d4fa7637c1aa63f8595e066801c1e8b56b22d0b"}, {file = "txredisapi-1.4.11.tar.gz", hash = "sha256:3eb1af99aefdefb59eb877b1dd08861efad60915e30ad5bf3d5bf6c5cedcdbc6"}, @@ -3350,7 +3387,7 @@ description = "An XML Schema validator and decoder" optional = true python-versions = ">=3.7" groups = ["main"] -markers = "extra == \"all\" or extra == \"saml2\"" +markers = "extra == \"saml2\" or extra == \"all\"" files = [ {file = "xmlschema-2.4.0-py3-none-any.whl", hash = "sha256:dc87be0caaa61f42649899189aab2fd8e0d567f2cf548433ba7b79278d231a4a"}, {file = "xmlschema-2.4.0.tar.gz", hash = "sha256:d74cd0c10866ac609e1ef94a5a69b018ad16e39077bc6393408b40c6babee793"}, @@ -3468,7 +3505,7 @@ docs = ["Sphinx", "repoze.sphinx.autointerface"] test = ["zope.i18nmessageid", "zope.testing", "zope.testrunner"] [extras] -all = ["authlib", "hiredis", "jaeger-client", "lxml", "matrix-synapse-ldap3", "opentracing", "psycopg2", "psycopg2cffi", "psycopg2cffi-compat", "pympler", "pysaml2", "sentry-sdk", "txredisapi"] +all = ["authlib", "beautifulsoup4", "hiredis", "jaeger-client", 
"lxml", "matrix-synapse-ldap3", "opentracing", "psycopg2", "psycopg2cffi", "psycopg2cffi-compat", "pympler", "pysaml2", "sentry-sdk", "txredisapi"] cache-memory = ["pympler"] jwt = ["authlib"] matrix-synapse-ldap3 = ["matrix-synapse-ldap3"] @@ -3480,9 +3517,9 @@ saml2 = ["pysaml2"] sentry = ["sentry-sdk"] systemd = ["systemd-python"] test = ["idna", "parameterized"] -url-preview = ["lxml"] +url-preview = ["beautifulsoup4", "lxml"] [metadata] lock-version = "2.1" python-versions = ">=3.10.0,<4.0.0" -content-hash = "960ddae65fde8574f0f36b6988622fc4baf7646823c36699c5cd4773cad8b0ed" +content-hash = "cded33baf3b0eb42bba93b2e96439c10a1520f8471d1136771636e63f26523df" diff --git a/pyproject.toml b/pyproject.toml index 70d5e3d5730..3882b233780 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -110,7 +110,7 @@ oidc = ["authlib>=0.15.1"] # `systemd.journal.JournalHandler`, as is documented in # `contrib/systemd/log_config.yaml`. systemd = ["systemd-python>=231"] -url-preview = ["lxml>=4.6.3"] +url-preview = ["lxml>=4.6.3", "beautifulsoup4>=4.13.0"] sentry = ["sentry-sdk>=0.7.2"] opentracing = ["jaeger-client>=4.2.0", "opentracing>=2.2.0"] jwt = ["authlib"] @@ -144,6 +144,7 @@ all = [ "authlib>=0.15.1", # url-preview "lxml>=4.6.3", + "beautifulsoup4>=4.13.0", # sentry "sentry-sdk>=0.7.2", # opentracing diff --git a/synapse/media/oembed.py b/synapse/media/oembed.py index 7e440721302..1d48d648309 100644 --- a/synapse/media/oembed.py +++ b/synapse/media/oembed.py @@ -25,12 +25,13 @@ import attr -from synapse.media.preview_html import parse_html_description +from synapse.media.preview_html import NON_BLANK, decode_body, parse_html_description from synapse.types import JsonDict from synapse.util.json import json_decoder if TYPE_CHECKING: - from lxml import etree + from bs4 import BeautifulSoup + from bs4.element import Tag from synapse.server import HomeServer @@ -105,35 +106,25 @@ def get_oembed_url(self, url: str) -> str | None: # No match. return None - def autodiscover_from_html(self, tree: "etree._Element") -> str | None: + def autodiscover_from_html(self, soup: "BeautifulSoup") -> str | None: """ Search an HTML document for oEmbed autodiscovery information. Args: - tree: The parsed HTML body. + soup: The parsed HTML body. Returns: The URL to use for oEmbed information, or None if no URL was found. """ # Search for link elements with the proper rel and type attributes. - # Cast: the type returned by xpath depends on the xpath expression: mypy can't deduce this. - for tag in cast( - list["etree._Element"], - tree.xpath("//link[@rel='alternate'][@type='application/json+oembed']"), - ): - if "href" in tag.attrib: - return cast(str, tag.attrib["href"]) - # Some providers (e.g. Flickr) use alternative instead of alternate. - # Cast: the type returned by xpath depends on the xpath expression: mypy can't deduce this. 
- for tag in cast( - list["etree._Element"], - tree.xpath("//link[@rel='alternative'][@type='application/json+oembed']"), - ): - if "href" in tag.attrib: - return cast(str, tag.attrib["href"]) - - return None + tag = soup.find( + "link", + rel=("alternate", "alternative"), + type="application/json+oembed", + href=NON_BLANK, + ) + return cast(str, cast("Tag", tag)["href"]) if tag else None def parse_oembed_response(self, url: str, raw_body: bytes) -> OEmbedResult: """ @@ -196,7 +187,7 @@ def parse_oembed_response(self, url: str, raw_body: bytes) -> OEmbedResult: if oembed_type == "rich": html_str = oembed.get("html") if isinstance(html_str, str): - calc_description_and_urls(open_graph_response, html_str) + calc_description_and_urls(open_graph_response, html_str, url) elif oembed_type == "photo": # If this is a photo, use the full image, not the thumbnail. @@ -208,7 +199,7 @@ def parse_oembed_response(self, url: str, raw_body: bytes) -> OEmbedResult: open_graph_response["og:type"] = "video.other" html_str = oembed.get("html") if html_str and isinstance(html_str, str): - calc_description_and_urls(open_graph_response, oembed["html"]) + calc_description_and_urls(open_graph_response, oembed["html"], url) for size in ("width", "height"): val = oembed.get(size) if type(val) is int: # noqa: E721 @@ -223,55 +214,45 @@ def parse_oembed_response(self, url: str, raw_body: bytes) -> OEmbedResult: return OEmbedResult(open_graph_response, author_name, cache_age) -def _fetch_urls(tree: "etree._Element", tag_name: str) -> list[str]: - results = [] - # Cast: the type returned by xpath depends on the xpath expression: mypy can't deduce this. - for tag in cast(list["etree._Element"], tree.xpath("//*/" + tag_name)): - if "src" in tag.attrib: - results.append(cast(str, tag.attrib["src"])) - return results +def _fetch_url(soup: "BeautifulSoup", tag_name: str) -> str | None: + tag = soup.find(tag_name, src=NON_BLANK) + return cast(str, cast("Tag", tag)["src"]) if tag else None -def calc_description_and_urls(open_graph_response: JsonDict, html_body: str) -> None: +def calc_description_and_urls( + open_graph_response: JsonDict, html_body: str, url: str +) -> None: """ Calculate description for an HTML document. - This uses lxml to convert the HTML document into plaintext. If errors + This uses BeautifulSoup to convert the HTML document into plaintext. If errors occur during processing of the document, an empty response is returned. Args: open_graph_response: The current Open Graph summary. This is updated with additional fields. html_body: The HTML document, as bytes. - - Returns: - The summary + url: The URL which is being previewed (not the one which was requested). """ - # If there's no body, nothing useful is going to be found. - if not html_body: - return + soup = decode_body(html_body, url) - from lxml import etree - - # Create an HTML parser. If this fails, log and return no metadata. - parser = etree.HTMLParser(recover=True, encoding="utf-8") - - # Attempt to parse the body. If this fails, log and return no metadata. - tree = etree.fromstring(html_body, parser) - - # The data was successfully parsed, but no tree was found. - if tree is None: + # If there's no body, nothing useful is going to be found. + if not soup: return # Attempt to find interesting URLs (images, videos, embeds). 
if "og:image" not in open_graph_response: - image_urls = _fetch_urls(tree, "img") - if image_urls: - open_graph_response["og:image"] = image_urls[0] - - video_urls = _fetch_urls(tree, "video") + _fetch_urls(tree, "embed") - if video_urls: - open_graph_response["og:video"] = video_urls[0] - - description = parse_html_description(tree) + image_url = _fetch_url(soup, "img") + if image_url: + open_graph_response["og:image"] = image_url + + video_url = _fetch_url(soup, "video") + if video_url: + open_graph_response["og:video"] = video_url + else: + embed_url = _fetch_url(soup, "embed") + if embed_url: + open_graph_response["og:video"] = embed_url + + description = parse_html_description(soup) if description: open_graph_response["og:description"] = description diff --git a/synapse/media/preview_html.py b/synapse/media/preview_html.py index 22ad581f829..8f1bc2276c8 100644 --- a/synapse/media/preview_html.py +++ b/synapse/media/preview_html.py @@ -18,7 +18,7 @@ # [This file includes modifications made by New Vector Limited] # # -import codecs +import itertools import logging import re from typing import ( @@ -26,100 +26,28 @@ Callable, Generator, Iterable, + Iterator, Optional, cast, ) if TYPE_CHECKING: - from lxml import etree + from bs4 import BeautifulSoup + from bs4.element import PageElement, Tag logger = logging.getLogger(__name__) -_charset_match = re.compile( - rb'<\s*meta[^>]*charset\s*=\s*"?([a-z0-9_-]+)"?', flags=re.I -) -_xml_encoding_match = re.compile( - rb'\s*<\s*\?\s*xml[^>]*encoding="([a-z0-9_-]+)"', flags=re.I -) _content_type_match = re.compile(r'.*; *charset="?(.*?)"?(;|$)', flags=re.I) # Certain elements aren't meant for display. ARIA_ROLES_TO_IGNORE = {"directory", "menu", "menubar", "toolbar"} - -def _normalise_encoding(encoding: str) -> str | None: - """Use the Python codec's name as the normalised entry.""" - try: - return codecs.lookup(encoding).name - except LookupError: - return None +NON_BLANK = re.compile(".+") -def _get_html_media_encodings(body: bytes, content_type: str | None) -> Iterable[str]: +def decode_body(body: bytes | str, uri: str) -> Optional["BeautifulSoup"]: """ - Get potential encoding of the body based on the (presumably) HTML body or the content-type header. - - The precedence used for finding a character encoding is: - - 1. tag with a charset declared. - 2. The XML document's character encoding attribute. - 3. The Content-Type header. - 4. Fallback to utf-8. - 5. Fallback to windows-1252. - - This roughly follows the algorithm used by BeautifulSoup's bs4.dammit.EncodingDetector. - - Args: - body: The HTML document, as bytes. - content_type: The Content-Type header. - - Returns: - The character encoding of the body, as a string. - """ - # There's no point in returning an encoding more than once. - attempted_encodings: set[str] = set() - - # Limit searches to the first 1kb, since it ought to be at the top. - body_start = body[:1024] - - # Check if it has an encoding set in a meta tag. - match = _charset_match.search(body_start) - if match: - encoding = _normalise_encoding(match.group(1).decode("ascii")) - if encoding: - attempted_encodings.add(encoding) - yield encoding - - # TODO Support - - # Check if it has an XML document with an encoding. - match = _xml_encoding_match.match(body_start) - if match: - encoding = _normalise_encoding(match.group(1).decode("ascii")) - if encoding and encoding not in attempted_encodings: - attempted_encodings.add(encoding) - yield encoding - - # Check the HTTP Content-Type header for a character set. 
- if content_type: - content_match = _content_type_match.match(content_type) - if content_match: - encoding = _normalise_encoding(content_match.group(1)) - if encoding and encoding not in attempted_encodings: - attempted_encodings.add(encoding) - yield encoding - - # Finally, fallback to UTF-8, then windows-1252. - for fallback in ("utf-8", "cp1252"): - if fallback not in attempted_encodings: - yield fallback - - -def decode_body( - body: bytes, uri: str, content_type: str | None = None -) -> Optional["etree._Element"]: - """ - This uses lxml to parse the HTML document. + This uses BeautifulSoup to parse the HTML document. Args: body: The HTML document, as bytes. @@ -133,54 +61,22 @@ def decode_body( if not body: return None - # The idea here is that multiple encodings are tried until one works. - # Unfortunately the result is never used and then LXML will decode the string - # again with the found encoding. - for encoding in _get_html_media_encodings(body, content_type): - try: - body.decode(encoding) - except Exception: - pass - else: - break - else: + from bs4 import BeautifulSoup + from bs4.builder import ParserRejectedMarkup + + try: + soup = BeautifulSoup(body, "lxml") + # If an empty document is returned, convert to None. + if not len(soup): + return None + return soup + except ParserRejectedMarkup: logger.warning("Unable to decode HTML body for %s", uri) return None - from lxml import etree - - # Create an HTML parser. - parser = etree.HTMLParser(recover=True, encoding=encoding) - - # Attempt to parse the body. With `lxml` 6.0.0+, this will be an empty HTML - # tree if the body was successfully parsed, but no tree was found. In - # previous `lxml` versions, `etree.fromstring` would return `None` in that - # case. - html_tree = etree.fromstring(body, parser) - - # Account for the above referenced case where `html_tree` is an HTML tree - # with an empty body. If so, return None. - if html_tree is not None and html_tree.tag == "html": - # If the tree has only a single element and it's empty, then - # return None. - body_el = html_tree.find("body") - if body_el is not None and len(html_tree) == 1: - # Extract the content of the body tag as text. - body_text = "".join(cast(Iterable[str], body_el.itertext())) - - # Strip any undecodable Unicode characters and whitespace. - body_text = body_text.strip("\ufffd").strip() - - # If there's no text left, and there were no child tags, - # then we consider the tag empty. - if not body_text and len(body_el) == 0: - return None - - return html_tree - def _get_meta_tags( - tree: "etree._Element", + soup: "BeautifulSoup", property: str, prefix: str, property_mapper: Callable[[str], str | None] | None = None, @@ -189,7 +85,7 @@ def _get_meta_tags( Search for meta tags prefixed with a particular string. Args: - tree: The parsed HTML document. + soup: The parsed HTML document. property: The name of the property which contains the tag name, e.g. "property" for Open Graph. prefix: The prefix on the property to search for, e.g. "og" for Open Graph. @@ -199,15 +95,10 @@ def _get_meta_tags( Returns: A map of tag name to value. """ - # This actually returns dict[str, str], but the caller sets this as a variable - # which is dict[str, str | None]. results: dict[str, str | None] = {} # Cast: the type returned by xpath depends on the xpath expression: mypy can't deduce this. 
- for tag in cast( - list["etree._Element"], - tree.xpath( - f"//*/meta[starts-with(@{property}, '{prefix}:')][@content][not(@content='')]" - ), + for tag in soup.find_all( + "meta", attrs={property: re.compile(rf"^{prefix}:")}, content=NON_BLANK ): # if we've got more than 50 tags, someone is taking the piss if len(results) >= 50: @@ -217,7 +108,7 @@ def _get_meta_tags( ) return {} - key = cast(str, tag.attrib[property]) + key = tag[property] if property_mapper: new_key = property_mapper(key) # None is a special value used to ignore a value. @@ -225,7 +116,7 @@ def _get_meta_tags( continue key = new_key - results[key] = cast(str, tag.attrib["content"]) + results[key] = tag["content"] return results @@ -250,15 +141,14 @@ def _map_twitter_to_open_graph(key: str) -> str | None: return "og" + key[7:] -def parse_html_to_open_graph(tree: "etree._Element") -> dict[str, str | None]: +def parse_html_to_open_graph(soup: "BeautifulSoup") -> dict[str, str | None]: """ - Parse the HTML document into an Open Graph response. + Calculate metadata for an HTML document. - This uses lxml to search the HTML document for Open Graph data (or - synthesizes it from the document). + This uses BeautifulSoup to search the HTML document for Open Graph data. Args: - tree: The parsed HTML document. + soup: The parsed HTML document. Returns: The Open Graph response as a dictionary. @@ -278,7 +168,8 @@ def parse_html_to_open_graph(tree: "etree._Element") -> dict[str, str | None]: # "og:video:height" : "720", # "og:video:secure_url": "https://www.youtube.com/v/LXDBoHyjmtw?version=3", - og = _get_meta_tags(tree, "property", "og") + # TODO: grab article: meta tags too, e.g.: + og = _get_meta_tags(soup, "property", "og") # TODO: Search for properties specific to the different Open Graph types, # such as article: meta tags, e.g.: @@ -298,7 +189,7 @@ def parse_html_to_open_graph(tree: "etree._Element") -> dict[str, str | None]: # Twitter cards tags also duplicate Open Graph tags. # # See https://developer.twitter.com/en/docs/twitter-for-websites/cards/guides/getting-started - twitter = _get_meta_tags(tree, "name", "twitter", _map_twitter_to_open_graph) + twitter = _get_meta_tags(soup, "name", "twitter", _map_twitter_to_open_graph) # Merge the Twitter values with the Open Graph values, but do not overwrite # information from Open Graph tags. for key, value in twitter.items(): @@ -307,73 +198,69 @@ def parse_html_to_open_graph(tree: "etree._Element") -> dict[str, str | None]: if "og:title" not in og: # Attempt to find a title from the title tag, or the biggest header on the page. - # Cast: the type returned by xpath depends on the xpath expression: mypy can't deduce this. title = cast( - list["etree._ElementUnicodeResult"], - tree.xpath("((//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1])/text()"), + Optional["Tag"], soup.find(("title", "h1", "h2", "h3"), string=True) ) - if title: - og["og:title"] = title[0].strip() + if title and title.string: + og["og:title"] = title.string.strip() else: og["og:title"] = None if "og:image" not in og: - # Cast: the type returned by xpath depends on the xpath expression: mypy can't deduce this. + # Check microdata for an image. meta_image = cast( - list["etree._ElementUnicodeResult"], - tree.xpath( - "//*/meta[translate(@itemprop, 'IMAGE', 'image')='image'][not(@content='')]/@content[1]" - ), + Optional["Tag"], + soup.find("meta", itemprop=re.compile("image", re.I), content=NON_BLANK), ) # If a meta image is found, use it. 
if meta_image: - og["og:image"] = meta_image[0] + og["og:image"] = cast(str, meta_image["content"]) else: # Try to find images which are larger than 10px by 10px. - # Cast: the type returned by xpath depends on the xpath expression: mypy can't deduce this. # # TODO: consider inlined CSS styles as well as width & height attribs images = cast( - list["etree._Element"], - tree.xpath("//img[@src][number(@width)>10][number(@height)>10]"), + list["Tag"], + soup.find_all("img", src=NON_BLANK, width=NON_BLANK, height=NON_BLANK), ) images = sorted( - images, + filter( + lambda tag: int(cast(str, tag["width"])) > 10 + and int(cast(str, tag["height"])) > 10, + images, + ), key=lambda i: ( - -1 * float(i.attrib["width"]) * float(i.attrib["height"]) + -1 * float(cast(str, i["width"])) * float(cast(str, i["height"])) ), ) # If no images were found, try to find *any* images. if not images: - # Cast: the type returned by xpath depends on the xpath expression: mypy can't deduce this. - images = cast(list["etree._Element"], tree.xpath("//img[@src][1]")) + images = soup.find_all("img", src=NON_BLANK, limit=1) if images: - og["og:image"] = cast(str, images[0].attrib["src"]) + og["og:image"] = cast(str, images[0]["src"]) # Finally, fallback to the favicon if nothing else. else: - # Cast: the type returned by xpath depends on the xpath expression: mypy can't deduce this. - favicons = cast( - list["etree._ElementUnicodeResult"], - tree.xpath("//link[@href][contains(@rel, 'icon')]/@href[1]"), - ) - if favicons: - og["og:image"] = favicons[0] + favicon = cast("Tag", soup.find("link", href=NON_BLANK, rel="icon")) + if favicon: + og["og:image"] = cast(str, favicon["href"]) if "og:description" not in og: # Check the first meta description tag for content. - # Cast: the type returned by xpath depends on the xpath expression: mypy can't deduce this. meta_description = cast( - list["etree._ElementUnicodeResult"], - tree.xpath( - "//*/meta[translate(@name, 'DESCRIPTION', 'description')='description'][not(@content='')]/@content[1]" + Optional["Tag"], + soup.find( + "meta", + attrs={"name": re.compile("description", re.I)}, + content=NON_BLANK, ), ) + # If a meta description is found with content, use it. if meta_description: - og["og:description"] = meta_description[0] + og["og:description"] = cast(str, meta_description["content"]) else: - og["og:description"] = parse_html_description(tree) + og["og:description"] = parse_html_description(soup) elif og["og:description"]: # This must be a non-empty string at this point. assert isinstance(og["og:description"], str) @@ -384,7 +271,7 @@ def parse_html_to_open_graph(tree: "etree._Element") -> dict[str, str | None]: return og -def parse_html_description(tree: "etree._Element") -> str | None: +def parse_html_description(soup: "BeautifulSoup") -> str | None: """ Calculate a text description based on an HTML document. @@ -397,14 +284,11 @@ def parse_html_description(tree: "etree._Element") -> str | None: This is a very very very coarse approximation to a plain text render of the page. Args: - tree: The parsed HTML document. + soup: The parsed HTML document. Returns: The plain text description, or None if one cannot be generated. """ - # We don't just use XPATH here as that is slow on some machines. - - from lxml import etree TAGS_TO_REMOVE = { "header", @@ -423,24 +307,27 @@ def parse_html_description(tree: "etree._Element") -> str | None: # etree.Comment is a function which creates an etree._Comment element. 
# The "tag" attribute of an etree._Comment instance is confusingly the # etree.Comment function instead of a string. - etree.Comment, + # etree.Comment, + # XXX } # Split all the text nodes into paragraphs (by splitting on new # lines) text_nodes = ( re.sub(r"\s+", "\n", el).strip() - for el in _iterate_over_text(tree.find("body"), TAGS_TO_REMOVE) + for el in _iterate_over_text( + cast(Optional["Tag"], soup.find("body")), TAGS_TO_REMOVE + ) ) return summarize_paragraphs(text_nodes) def _iterate_over_text( - tree: Optional["etree._Element"], - tags_to_ignore: set[object], + soup: Optional["Tag"], + tags_to_ignore: Iterable[str], stack_limit: int = 1024, ) -> Generator[str, None, None]: - """Iterate over the tree returning text nodes in a depth first fashion, + """Iterate over the document returning text nodes in a depth first fashion, skipping text nodes inside certain tags. Args: @@ -452,43 +339,27 @@ def _iterate_over_text( Intended to limit the maximum working memory when generating a preview. """ - if tree is None: + if not soup: return - # This is a stack whose items are elements to iterate over *or* strings - # to be returned. - elements: list[str | "etree._Element"] = [tree] - while elements: - el = elements.pop() - - if isinstance(el, str): - yield el - elif el.tag not in tags_to_ignore: - # If the element isn't meant for display, ignore it. - if el.get("role") in ARIA_ROLES_TO_IGNORE: - continue - - # el.text is the text before the first child, so we can immediately - # return it if the text exists. - if el.text: - yield el.text + from bs4.element import NavigableString, Tag - # We add to the stack all the element's children, interspersed with - # each child's tail text (if it exists). - # - # We iterate in reverse order so that earlier pieces of text appear - # closer to the top of the stack. - for child in el.iterchildren(reversed=True): - if len(elements) > stack_limit: - # We've hit our limit for working memory - break - - if child.tail: - # The tail text of a node is text that comes *after* the node, - # so we always include it even if we ignore the child node. - elements.append(child.tail) - - elements.append(child) + # This is basically a stack that we extend using itertools.chain. + # This will either consist of an element to iterate over *or* a string + # to be returned. + elements: Iterator["PageElement"] = iter([soup]) + while True: + el = next(elements, None) + if el is None: + return + + # Do not consider sub-classes of NavigableString since those represent + # comments, etc. + if type(el) == NavigableString: # noqa: E721 + yield str(el) + elif isinstance(el, Tag) and el.name not in tags_to_ignore: + # We add to the stack all the element's children. 
+ elements = itertools.chain(el.contents, elements) def summarize_paragraphs( diff --git a/synapse/media/url_previewer.py b/synapse/media/url_previewer.py index 2c5e518918b..a1f941ea5b3 100644 --- a/synapse/media/url_previewer.py +++ b/synapse/media/url_previewer.py @@ -294,16 +294,16 @@ async def _do_preview(self, url: str, user: UserID, ts: int) -> bytes: # define our OG response for this media elif _is_html(media_info.media_type): - # TODO: somehow stop a big HTML tree from exploding synapse's RAM + # TODO: somehow stop a big HTML document from exploding synapse's RAM with open(media_info.filename, "rb") as file: body = file.read() - tree = decode_body(body, media_info.uri, media_info.media_type) - if tree is not None: + soup = decode_body(body, media_info.uri) + if soup is not None: # Check if this HTML document points to oEmbed information and # defer to that. - oembed_url = self._oembed.autodiscover_from_html(tree) + oembed_url = self._oembed.autodiscover_from_html(soup) og_from_oembed: JsonDict = {} # Only download to the oEmbed URL if it is allowed. if oembed_url: @@ -329,7 +329,7 @@ async def _do_preview(self, url: str, user: UserID, ts: int) -> bytes: # Parse Open Graph information from the HTML in case the oEmbed # response failed or is incomplete. - og_from_html = parse_html_to_open_graph(tree) + og_from_html = parse_html_to_open_graph(soup) # Compile the Open Graph response by using the scraped # information from the HTML and overlaying any information diff --git a/tests/media/test_html_preview.py b/tests/media/test_html_preview.py index d3f1e8833a7..e33b80739ea 100644 --- a/tests/media/test_html_preview.py +++ b/tests/media/test_html_preview.py @@ -20,7 +20,6 @@ # from synapse.media.preview_html import ( - _get_html_media_encodings, decode_body, parse_html_to_open_graph, summarize_paragraphs, @@ -166,9 +165,9 @@ def test_simple(self) -> None: """ - tree = decode_body(html, "http://example.com/test.html") - assert tree is not None - og = parse_html_to_open_graph(tree) + soup = decode_body(html, "http://example.com/test.html") + assert soup is not None + og = parse_html_to_open_graph(soup) self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."}) @@ -183,9 +182,9 @@ def test_comment(self) -> None: """ - tree = decode_body(html, "http://example.com/test.html") - assert tree is not None - og = parse_html_to_open_graph(tree) + soup = decode_body(html, "http://example.com/test.html") + assert soup is not None + og = parse_html_to_open_graph(soup) self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."}) @@ -203,9 +202,9 @@ def test_comment2(self) -> None: """ - tree = decode_body(html, "http://example.com/test.html") - assert tree is not None - og = parse_html_to_open_graph(tree) + soup = decode_body(html, "http://example.com/test.html") + assert soup is not None + og = parse_html_to_open_graph(soup) self.assertEqual( og, @@ -226,9 +225,9 @@ def test_script(self) -> None: """ - tree = decode_body(html, "http://example.com/test.html") - assert tree is not None - og = parse_html_to_open_graph(tree) + soup = decode_body(html, "http://example.com/test.html") + assert soup is not None + og = parse_html_to_open_graph(soup) self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."}) @@ -241,9 +240,9 @@ def test_missing_title(self) -> None: """ - tree = decode_body(html, "http://example.com/test.html") - assert tree is not None - og = parse_html_to_open_graph(tree) + soup = decode_body(html, "http://example.com/test.html") + assert soup is not 
None + og = parse_html_to_open_graph(soup) self.assertEqual(og, {"og:title": None, "og:description": "Some text."}) @@ -273,9 +272,9 @@ def test_h1_as_title(self) -> None: """ - tree = decode_body(html, "http://example.com/test.html") - assert tree is not None - og = parse_html_to_open_graph(tree) + soup = decode_body(html, "http://example.com/test.html") + assert soup is not None + og = parse_html_to_open_graph(soup) self.assertEqual(og, {"og:title": "Title", "og:description": "Some text."}) @@ -310,23 +309,17 @@ def test_missing_title_and_broken_h1(self) -> None: """ - tree = decode_body(html, "http://example.com/test.html") - assert tree is not None - og = parse_html_to_open_graph(tree) + soup = decode_body(html, "http://example.com/test.html") + assert soup is not None + og = parse_html_to_open_graph(soup) self.assertEqual(og, {"og:title": None, "og:description": "Some text."}) def test_empty(self) -> None: """Test a body with no data in it.""" html = b"" - tree = decode_body(html, "http://example.com/test.html") - self.assertIsNone(tree) - - def test_no_tree(self) -> None: - """A valid body with no tree in it.""" - html = b"\x00" - tree = decode_body(html, "http://example.com/test.html") - self.assertIsNone(tree) + soup = decode_body(html, "http://example.com/test.html") + self.assertIsNone(soup) def test_xml(self) -> None: """Test decoding XML and ensure it works properly.""" @@ -339,24 +332,9 @@ def test_xml(self) -> None: FooSome text. """.strip() - tree = decode_body(html, "http://example.com/test.html") - assert tree is not None - og = parse_html_to_open_graph(tree) - self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."}) - - def test_invalid_encoding(self) -> None: - """An invalid character encoding should be ignored and treated as UTF-8, if possible.""" - html = b""" - - Foo - - Some text. - - - """ - tree = decode_body(html, "http://example.com/test.html", "invalid-encoding") - assert tree is not None - og = parse_html_to_open_graph(tree) + soup = decode_body(html, "http://example.com/test.html") + assert soup is not None + og = parse_html_to_open_graph(soup) self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."}) def test_invalid_encoding2(self) -> None: @@ -370,10 +348,10 @@ def test_invalid_encoding2(self) -> None: """ - tree = decode_body(html, "http://example.com/test.html") - assert tree is not None - og = parse_html_to_open_graph(tree) - self.assertEqual(og, {"og:title": "ÿÿ Foo", "og:description": "Some text."}) + soup = decode_body(html, "http://example.com/test.html") + assert soup is not None + og = parse_html_to_open_graph(soup) + self.assertEqual(og, {"og:title": "˙˙ Foo", "og:description": "Some text."}) def test_windows_1252(self) -> None: """A body which uses cp1252, but doesn't declare that.""" @@ -385,10 +363,71 @@ def test_windows_1252(self) -> None: """ - tree = decode_body(html, "http://example.com/test.html") - assert tree is not None - og = parse_html_to_open_graph(tree) - self.assertEqual(og, {"og:title": "ó", "og:description": "Some text."}) + soup = decode_body(html, "http://example.com/test.html") + assert soup is not None + og = parse_html_to_open_graph(soup) + self.assertEqual(og, {"og:title": "�", "og:description": "Some text."}) + + def test_image(self) -> None: + """Test the spots an image can be pulled from .""" + # Ordered listed of tags, we'll pop off the top and keep testing. 
+ tags = [ + ( + b"""""", + "meta-prop", + ), + ( + b"""""", + "meta-IMAGE", + ), + ( + b"""""", + "meta-image", + ), + (b"""""", "img"), + ( + b"""""", + "img", + ), + ( + b"""""", + "img", + ), + ( + b"""""", + "img", + ), + ( + b"""""", + "img", + ), + # Put this image again since if it is the *only* image it will be used. + ( + b"""""", + "img-no-width-no-height", + ), + ( + b"""""", + "favicon", + ), + ] + + while tags: + html = b"" + b"".join(t[0] for t in tags) + b"" + tree = decode_body(html, "http://example.com/test.html") + assert tree is not None + og = parse_html_to_open_graph(tree) + self.assertEqual( + og, + { + "og:title": None, + "og:description": None, + "og:image": f"https://example.com/{tags[0][1]}.png", + }, + ) + + # Remove the highest remaining priority item. + tags.pop(0) def test_twitter_tag(self) -> None: """Twitter card tags should be used if nothing else is available.""" @@ -397,6 +436,7 @@ def test_twitter_tag(self) -> None: + """ tree = decode_body(html, "http://example.com/test.html") @@ -408,6 +448,7 @@ def test_twitter_tag(self) -> None: "og:title": None, "og:description": "Description", "og:site_name": "@matrixdotorg", + "og:image": "https://example.com/test.png", }, ) @@ -419,6 +460,8 @@ def test_twitter_tag(self) -> None: + + """ tree = decode_body(html, "http://example.com/test.html") @@ -430,6 +473,7 @@ def test_twitter_tag(self) -> None: "og:title": None, "og:description": "Real Description", "og:site_name": "matrix.org", + "og:image": "https://example.com/good.png", }, ) @@ -450,116 +494,3 @@ def test_nested_nodes(self) -> None: "og:description": "Welcome\n\nthe bold\n\nand underlined text\n\nand\n\nsome\n\ntail text", }, ) - - -class MediaEncodingTestCase(unittest.TestCase): - def test_meta_charset(self) -> None: - """A character encoding is found via the meta tag.""" - encodings = _get_html_media_encodings( - b""" - - - - - """, - "text/html", - ) - self.assertEqual(list(encodings), ["ascii", "utf-8", "cp1252"]) - - # A less well-formed version. - encodings = _get_html_media_encodings( - b""" - - < meta charset = ascii> - - - """, - "text/html", - ) - self.assertEqual(list(encodings), ["ascii", "utf-8", "cp1252"]) - - def test_meta_charset_underscores(self) -> None: - """A character encoding contains underscore.""" - encodings = _get_html_media_encodings( - b""" - - - - - """, - "text/html", - ) - self.assertEqual(list(encodings), ["shift_jis", "utf-8", "cp1252"]) - - def test_xml_encoding(self) -> None: - """A character encoding is found via the meta tag.""" - encodings = _get_html_media_encodings( - b""" - - - - """, - "text/html", - ) - self.assertEqual(list(encodings), ["ascii", "utf-8", "cp1252"]) - - def test_meta_xml_encoding(self) -> None: - """Meta tags take precedence over XML encoding.""" - encodings = _get_html_media_encodings( - b""" - - - - - - """, - "text/html", - ) - self.assertEqual(list(encodings), ["utf-16", "ascii", "utf-8", "cp1252"]) - - def test_content_type(self) -> None: - """A character encoding is found via the Content-Type header.""" - # Test a few variations of the header. 
- headers = ( - 'text/html; charset="ascii";', - "text/html;charset=ascii;", - 'text/html; charset="ascii"', - "text/html; charset=ascii", - 'text/html; charset="ascii;', - 'text/html; charset=ascii";', - ) - for header in headers: - encodings = _get_html_media_encodings(b"", header) - self.assertEqual(list(encodings), ["ascii", "utf-8", "cp1252"]) - - def test_fallback(self) -> None: - """A character encoding cannot be found in the body or header.""" - encodings = _get_html_media_encodings(b"", "text/html") - self.assertEqual(list(encodings), ["utf-8", "cp1252"]) - - def test_duplicates(self) -> None: - """Ensure each encoding is only attempted once.""" - encodings = _get_html_media_encodings( - b""" - - - - - - """, - 'text/html; charset="UTF_8"', - ) - self.assertEqual(list(encodings), ["utf-8", "cp1252"]) - - def test_unknown_invalid(self) -> None: - """A character encoding should be ignored if it is unknown or invalid.""" - encodings = _get_html_media_encodings( - b""" - - - - - """, - 'text/html; charset="invalid"', - ) - self.assertEqual(list(encodings), ["utf-8", "cp1252"]) diff --git a/tests/rest/client/test_media.py b/tests/rest/client/test_media.py index ec81b1413c2..ce006eaa75c 100644 --- a/tests/rest/client/test_media.py +++ b/tests/rest/client/test_media.py @@ -495,7 +495,7 @@ def test_non_ascii_preview_content_type(self) -> None: self.pump() self.assertEqual(channel.code, 200) - self.assertEqual(channel.json_body["og:title"], "\u0434\u043a\u0430") + self.assertEqual(channel.json_body["og:title"], "‰Íý") def test_overlong_title(self) -> None: self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")] diff --git a/tests/rest/media/test_url_preview.py b/tests/rest/media/test_url_preview.py index 32e78fc12a6..683d646942b 100644 --- a/tests/rest/media/test_url_preview.py +++ b/tests/rest/media/test_url_preview.py @@ -367,7 +367,7 @@ def test_non_ascii_preview_content_type(self) -> None: self.pump() self.assertEqual(channel.code, 200) - self.assertEqual(channel.json_body["og:title"], "\u0434\u043a\u0430") + self.assertEqual(channel.json_body["og:title"], "���") def test_overlong_title(self) -> None: self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")] From 8e9e3339bd831455db39da8a4a89e2e60437aff0 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Wed, 10 Dec 2025 10:56:44 -0500 Subject: [PATCH 2/6] Dont use lxml --- poetry.lock | 178 +-------------------------- pyproject.toml | 4 +- synapse/media/preview_html.py | 19 +-- tests/media/test_html_preview.py | 2 +- tests/rest/media/test_url_preview.py | 2 +- 5 files changed, 11 insertions(+), 194 deletions(-) diff --git a/poetry.lock b/poetry.lock index 62080647c09..faf6ed91d20 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1136,178 +1136,6 @@ files = [ {file = "librt-0.6.3.tar.gz", hash = "sha256:c724a884e642aa2bbad52bb0203ea40406ad742368a5f90da1b220e970384aae"}, ] -[[package]] -name = "lxml" -version = "6.0.2" -description = "Powerful and Pythonic XML processing library combining libxml2/libxslt with the ElementTree API." 
-optional = true -python-versions = ">=3.8" -groups = ["main"] -markers = "extra == \"url-preview\" or extra == \"all\"" -files = [ - {file = "lxml-6.0.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:e77dd455b9a16bbd2a5036a63ddbd479c19572af81b624e79ef422f929eef388"}, - {file = "lxml-6.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:5d444858b9f07cefff6455b983aea9a67f7462ba1f6cbe4a21e8bf6791bf2153"}, - {file = "lxml-6.0.2-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:f952dacaa552f3bb8834908dddd500ba7d508e6ea6eb8c52eb2d28f48ca06a31"}, - {file = "lxml-6.0.2-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:71695772df6acea9f3c0e59e44ba8ac50c4f125217e84aab21074a1a55e7e5c9"}, - {file = "lxml-6.0.2-cp310-cp310-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:17f68764f35fd78d7c4cc4ef209a184c38b65440378013d24b8aecd327c3e0c8"}, - {file = "lxml-6.0.2-cp310-cp310-manylinux_2_26_i686.manylinux_2_28_i686.whl", hash = "sha256:058027e261afed589eddcfe530fcc6f3402d7fd7e89bfd0532df82ebc1563dba"}, - {file = "lxml-6.0.2-cp310-cp310-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a8ffaeec5dfea5881d4c9d8913a32d10cfe3923495386106e4a24d45300ef79c"}, - {file = "lxml-6.0.2-cp310-cp310-manylinux_2_31_armv7l.whl", hash = "sha256:f2e3b1a6bb38de0bc713edd4d612969dd250ca8b724be8d460001a387507021c"}, - {file = "lxml-6.0.2-cp310-cp310-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:d6690ec5ec1cce0385cb20896b16be35247ac8c2046e493d03232f1c2414d321"}, - {file = "lxml-6.0.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:f2a50c3c1d11cad0ebebbac357a97b26aa79d2bcaf46f256551152aa85d3a4d1"}, - {file = "lxml-6.0.2-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:3efe1b21c7801ffa29a1112fab3b0f643628c30472d507f39544fd48e9549e34"}, - {file = "lxml-6.0.2-cp310-cp310-musllinux_1_2_riscv64.whl", hash = "sha256:59c45e125140b2c4b33920d21d83681940ca29f0b83f8629ea1a2196dc8cfe6a"}, - {file = "lxml-6.0.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:452b899faa64f1805943ec1c0c9ebeaece01a1af83e130b69cdefeda180bb42c"}, - {file = "lxml-6.0.2-cp310-cp310-win32.whl", hash = "sha256:1e786a464c191ca43b133906c6903a7e4d56bef376b75d97ccbb8ec5cf1f0a4b"}, - {file = "lxml-6.0.2-cp310-cp310-win_amd64.whl", hash = "sha256:dacf3c64ef3f7440e3167aa4b49aa9e0fb99e0aa4f9ff03795640bf94531bcb0"}, - {file = "lxml-6.0.2-cp310-cp310-win_arm64.whl", hash = "sha256:45f93e6f75123f88d7f0cfd90f2d05f441b808562bf0bc01070a00f53f5028b5"}, - {file = "lxml-6.0.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:13e35cbc684aadf05d8711a5d1b5857c92e5e580efa9a0d2be197199c8def607"}, - {file = "lxml-6.0.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3b1675e096e17c6fe9c0e8c81434f5736c0739ff9ac6123c87c2d452f48fc938"}, - {file = "lxml-6.0.2-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:8ac6e5811ae2870953390452e3476694196f98d447573234592d30488147404d"}, - {file = "lxml-6.0.2-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5aa0fc67ae19d7a64c3fe725dc9a1bb11f80e01f78289d05c6f62545affec438"}, - {file = "lxml-6.0.2-cp311-cp311-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:de496365750cc472b4e7902a485d3f152ecf57bd3ba03ddd5578ed8ceb4c5964"}, - {file = "lxml-6.0.2-cp311-cp311-manylinux_2_26_i686.manylinux_2_28_i686.whl", hash = "sha256:200069a593c5e40b8f6fc0d84d86d970ba43138c3e68619ffa234bc9bb806a4d"}, - {file = 
"lxml-6.0.2-cp311-cp311-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7d2de809c2ee3b888b59f995625385f74629707c9355e0ff856445cdcae682b7"}, - {file = "lxml-6.0.2-cp311-cp311-manylinux_2_31_armv7l.whl", hash = "sha256:b2c3da8d93cf5db60e8858c17684c47d01fee6405e554fb55018dd85fc23b178"}, - {file = "lxml-6.0.2-cp311-cp311-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:442de7530296ef5e188373a1ea5789a46ce90c4847e597856570439621d9c553"}, - {file = "lxml-6.0.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:2593c77efde7bfea7f6389f1ab249b15ed4aa5bc5cb5131faa3b843c429fbedb"}, - {file = "lxml-6.0.2-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:3e3cb08855967a20f553ff32d147e14329b3ae70ced6edc2f282b94afbc74b2a"}, - {file = "lxml-6.0.2-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:2ed6c667fcbb8c19c6791bbf40b7268ef8ddf5a96940ba9404b9f9a304832f6c"}, - {file = "lxml-6.0.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:b8f18914faec94132e5b91e69d76a5c1d7b0c73e2489ea8929c4aaa10b76bbf7"}, - {file = "lxml-6.0.2-cp311-cp311-win32.whl", hash = "sha256:6605c604e6daa9e0d7f0a2137bdc47a2e93b59c60a65466353e37f8272f47c46"}, - {file = "lxml-6.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:e5867f2651016a3afd8dd2c8238baa66f1e2802f44bc17e236f547ace6647078"}, - {file = "lxml-6.0.2-cp311-cp311-win_arm64.whl", hash = "sha256:4197fb2534ee05fd3e7afaab5d8bfd6c2e186f65ea7f9cd6a82809c887bd1285"}, - {file = "lxml-6.0.2-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:a59f5448ba2ceccd06995c95ea59a7674a10de0810f2ce90c9006f3cbc044456"}, - {file = "lxml-6.0.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:e8113639f3296706fbac34a30813929e29247718e88173ad849f57ca59754924"}, - {file = "lxml-6.0.2-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:a8bef9b9825fa8bc816a6e641bb67219489229ebc648be422af695f6e7a4fa7f"}, - {file = "lxml-6.0.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:65ea18d710fd14e0186c2f973dc60bb52039a275f82d3c44a0e42b43440ea534"}, - {file = "lxml-6.0.2-cp312-cp312-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c371aa98126a0d4c739ca93ceffa0fd7a5d732e3ac66a46e74339acd4d334564"}, - {file = "lxml-6.0.2-cp312-cp312-manylinux_2_26_i686.manylinux_2_28_i686.whl", hash = "sha256:700efd30c0fa1a3581d80a748157397559396090a51d306ea59a70020223d16f"}, - {file = "lxml-6.0.2-cp312-cp312-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c33e66d44fe60e72397b487ee92e01da0d09ba2d66df8eae42d77b6d06e5eba0"}, - {file = "lxml-6.0.2-cp312-cp312-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:90a345bbeaf9d0587a3aaffb7006aa39ccb6ff0e96a57286c0cb2fd1520ea192"}, - {file = "lxml-6.0.2-cp312-cp312-manylinux_2_31_armv7l.whl", hash = "sha256:064fdadaf7a21af3ed1dcaa106b854077fbeada827c18f72aec9346847cd65d0"}, - {file = "lxml-6.0.2-cp312-cp312-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:fbc74f42c3525ac4ffa4b89cbdd00057b6196bcefe8bce794abd42d33a018092"}, - {file = "lxml-6.0.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6ddff43f702905a4e32bc24f3f2e2edfe0f8fde3277d481bffb709a4cced7a1f"}, - {file = "lxml-6.0.2-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:6da5185951d72e6f5352166e3da7b0dc27aa70bd1090b0eb3f7f7212b53f1bb8"}, - {file = "lxml-6.0.2-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:57a86e1ebb4020a38d295c04fc79603c7899e0df71588043eb218722dabc087f"}, - {file = "lxml-6.0.2-cp312-cp312-musllinux_1_2_riscv64.whl", hash = 
"sha256:2047d8234fe735ab77802ce5f2297e410ff40f5238aec569ad7c8e163d7b19a6"}, - {file = "lxml-6.0.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:6f91fd2b2ea15a6800c8e24418c0775a1694eefc011392da73bc6cef2623b322"}, - {file = "lxml-6.0.2-cp312-cp312-win32.whl", hash = "sha256:3ae2ce7d6fedfb3414a2b6c5e20b249c4c607f72cb8d2bb7cc9c6ec7c6f4e849"}, - {file = "lxml-6.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:72c87e5ee4e58a8354fb9c7c84cbf95a1c8236c127a5d1b7683f04bed8361e1f"}, - {file = "lxml-6.0.2-cp312-cp312-win_arm64.whl", hash = "sha256:61cb10eeb95570153e0c0e554f58df92ecf5109f75eacad4a95baa709e26c3d6"}, - {file = "lxml-6.0.2-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:9b33d21594afab46f37ae58dfadd06636f154923c4e8a4d754b0127554eb2e77"}, - {file = "lxml-6.0.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:6c8963287d7a4c5c9a432ff487c52e9c5618667179c18a204bdedb27310f022f"}, - {file = "lxml-6.0.2-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:1941354d92699fb5ffe6ed7b32f9649e43c2feb4b97205f75866f7d21aa91452"}, - {file = "lxml-6.0.2-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:bb2f6ca0ae2d983ded09357b84af659c954722bbf04dea98030064996d156048"}, - {file = "lxml-6.0.2-cp313-cp313-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:eb2a12d704f180a902d7fa778c6d71f36ceb7b0d317f34cdc76a5d05aa1dd1df"}, - {file = "lxml-6.0.2-cp313-cp313-manylinux_2_26_i686.manylinux_2_28_i686.whl", hash = "sha256:6ec0e3f745021bfed19c456647f0298d60a24c9ff86d9d051f52b509663feeb1"}, - {file = "lxml-6.0.2-cp313-cp313-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:846ae9a12d54e368933b9759052d6206a9e8b250291109c48e350c1f1f49d916"}, - {file = "lxml-6.0.2-cp313-cp313-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ef9266d2aa545d7374938fb5c484531ef5a2ec7f2d573e62f8ce722c735685fd"}, - {file = "lxml-6.0.2-cp313-cp313-manylinux_2_31_armv7l.whl", hash = "sha256:4077b7c79f31755df33b795dc12119cb557a0106bfdab0d2c2d97bd3cf3dffa6"}, - {file = "lxml-6.0.2-cp313-cp313-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:a7c5d5e5f1081955358533be077166ee97ed2571d6a66bdba6ec2f609a715d1a"}, - {file = "lxml-6.0.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:8f8d0cbd0674ee89863a523e6994ac25fd5be9c8486acfc3e5ccea679bad2679"}, - {file = "lxml-6.0.2-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:2cbcbf6d6e924c28f04a43f3b6f6e272312a090f269eff68a2982e13e5d57659"}, - {file = "lxml-6.0.2-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:dfb874cfa53340009af6bdd7e54ebc0d21012a60a4e65d927c2e477112e63484"}, - {file = "lxml-6.0.2-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:fb8dae0b6b8b7f9e96c26fdd8121522ce5de9bb5538010870bd538683d30e9a2"}, - {file = "lxml-6.0.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:358d9adae670b63e95bc59747c72f4dc97c9ec58881d4627fe0120da0f90d314"}, - {file = "lxml-6.0.2-cp313-cp313-win32.whl", hash = "sha256:e8cd2415f372e7e5a789d743d133ae474290a90b9023197fd78f32e2dc6873e2"}, - {file = "lxml-6.0.2-cp313-cp313-win_amd64.whl", hash = "sha256:b30d46379644fbfc3ab81f8f82ae4de55179414651f110a1514f0b1f8f6cb2d7"}, - {file = "lxml-6.0.2-cp313-cp313-win_arm64.whl", hash = "sha256:13dcecc9946dca97b11b7c40d29fba63b55ab4170d3c0cf8c0c164343b9bfdcf"}, - {file = "lxml-6.0.2-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:b0c732aa23de8f8aec23f4b580d1e52905ef468afb4abeafd3fec77042abb6fe"}, - {file = "lxml-6.0.2-cp314-cp314-macosx_10_13_x86_64.whl", hash = 
"sha256:4468e3b83e10e0317a89a33d28f7aeba1caa4d1a6fd457d115dd4ffe90c5931d"}, - {file = "lxml-6.0.2-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:abd44571493973bad4598a3be7e1d807ed45aa2adaf7ab92ab7c62609569b17d"}, - {file = "lxml-6.0.2-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:370cd78d5855cfbffd57c422851f7d3864e6ae72d0da615fca4dad8c45d375a5"}, - {file = "lxml-6.0.2-cp314-cp314-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:901e3b4219fa04ef766885fb40fa516a71662a4c61b80c94d25336b4934b71c0"}, - {file = "lxml-6.0.2-cp314-cp314-manylinux_2_26_i686.manylinux_2_28_i686.whl", hash = "sha256:a4bf42d2e4cf52c28cc1812d62426b9503cdb0c87a6de81442626aa7d69707ba"}, - {file = "lxml-6.0.2-cp314-cp314-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:b2c7fdaa4d7c3d886a42534adec7cfac73860b89b4e5298752f60aa5984641a0"}, - {file = "lxml-6.0.2-cp314-cp314-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:98a5e1660dc7de2200b00d53fa00bcd3c35a3608c305d45a7bbcaf29fa16e83d"}, - {file = "lxml-6.0.2-cp314-cp314-manylinux_2_31_armv7l.whl", hash = "sha256:dc051506c30b609238d79eda75ee9cab3e520570ec8219844a72a46020901e37"}, - {file = "lxml-6.0.2-cp314-cp314-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:8799481bbdd212470d17513a54d568f44416db01250f49449647b5ab5b5dccb9"}, - {file = "lxml-6.0.2-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:9261bb77c2dab42f3ecd9103951aeca2c40277701eb7e912c545c1b16e0e4917"}, - {file = "lxml-6.0.2-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:65ac4a01aba353cfa6d5725b95d7aed6356ddc0a3cd734de00124d285b04b64f"}, - {file = "lxml-6.0.2-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:b22a07cbb82fea98f8a2fd814f3d1811ff9ed76d0fc6abc84eb21527596e7cc8"}, - {file = "lxml-6.0.2-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:d759cdd7f3e055d6bc8d9bec3ad905227b2e4c785dc16c372eb5b5e83123f48a"}, - {file = "lxml-6.0.2-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:945da35a48d193d27c188037a05fec5492937f66fb1958c24fc761fb9d40d43c"}, - {file = "lxml-6.0.2-cp314-cp314-win32.whl", hash = "sha256:be3aaa60da67e6153eb15715cc2e19091af5dc75faef8b8a585aea372507384b"}, - {file = "lxml-6.0.2-cp314-cp314-win_amd64.whl", hash = "sha256:fa25afbadead523f7001caf0c2382afd272c315a033a7b06336da2637d92d6ed"}, - {file = "lxml-6.0.2-cp314-cp314-win_arm64.whl", hash = "sha256:063eccf89df5b24e361b123e257e437f9e9878f425ee9aae3144c77faf6da6d8"}, - {file = "lxml-6.0.2-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:6162a86d86893d63084faaf4ff937b3daea233e3682fb4474db07395794fa80d"}, - {file = "lxml-6.0.2-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:414aaa94e974e23a3e92e7ca5b97d10c0cf37b6481f50911032c69eeb3991bba"}, - {file = "lxml-6.0.2-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:48461bd21625458dd01e14e2c38dd0aea69addc3c4f960c30d9f59d7f93be601"}, - {file = "lxml-6.0.2-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:25fcc59afc57d527cfc78a58f40ab4c9b8fd096a9a3f964d2781ffb6eb33f4ed"}, - {file = "lxml-6.0.2-cp314-cp314t-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5179c60288204e6ddde3f774a93350177e08876eaf3ab78aa3a3649d43eb7d37"}, - {file = "lxml-6.0.2-cp314-cp314t-manylinux_2_26_i686.manylinux_2_28_i686.whl", hash = "sha256:967aab75434de148ec80597b75062d8123cadf2943fb4281f385141e18b21338"}, - {file = 
"lxml-6.0.2-cp314-cp314t-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:d100fcc8930d697c6561156c6810ab4a508fb264c8b6779e6e61e2ed5e7558f9"}, - {file = "lxml-6.0.2-cp314-cp314t-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2ca59e7e13e5981175b8b3e4ab84d7da57993eeff53c07764dcebda0d0e64ecd"}, - {file = "lxml-6.0.2-cp314-cp314t-manylinux_2_31_armv7l.whl", hash = "sha256:957448ac63a42e2e49531b9d6c0fa449a1970dbc32467aaad46f11545be9af1d"}, - {file = "lxml-6.0.2-cp314-cp314t-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:b7fc49c37f1786284b12af63152fe1d0990722497e2d5817acfe7a877522f9a9"}, - {file = "lxml-6.0.2-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:e19e0643cc936a22e837f79d01a550678da8377d7d801a14487c10c34ee49c7e"}, - {file = "lxml-6.0.2-cp314-cp314t-musllinux_1_2_armv7l.whl", hash = "sha256:1db01e5cf14345628e0cbe71067204db658e2fb8e51e7f33631f5f4735fefd8d"}, - {file = "lxml-6.0.2-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:875c6b5ab39ad5291588aed6925fac99d0097af0dd62f33c7b43736043d4a2ec"}, - {file = "lxml-6.0.2-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:cdcbed9ad19da81c480dfd6dd161886db6096083c9938ead313d94b30aadf272"}, - {file = "lxml-6.0.2-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:80dadc234ebc532e09be1975ff538d154a7fa61ea5031c03d25178855544728f"}, - {file = "lxml-6.0.2-cp314-cp314t-win32.whl", hash = "sha256:da08e7bb297b04e893d91087df19638dc7a6bb858a954b0cc2b9f5053c922312"}, - {file = "lxml-6.0.2-cp314-cp314t-win_amd64.whl", hash = "sha256:252a22982dca42f6155125ac76d3432e548a7625d56f5a273ee78a5057216eca"}, - {file = "lxml-6.0.2-cp314-cp314t-win_arm64.whl", hash = "sha256:bb4c1847b303835d89d785a18801a883436cdfd5dc3d62947f9c49e24f0f5a2c"}, - {file = "lxml-6.0.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:a656ca105115f6b766bba324f23a67914d9c728dafec57638e2b92a9dcd76c62"}, - {file = "lxml-6.0.2-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:c54d83a2188a10ebdba573f16bd97135d06c9ef60c3dc495315c7a28c80a263f"}, - {file = "lxml-6.0.2-cp38-cp38-manylinux_2_26_i686.manylinux_2_28_i686.whl", hash = "sha256:1ea99340b3c729beea786f78c38f60f4795622f36e305d9c9be402201efdc3b7"}, - {file = "lxml-6.0.2-cp38-cp38-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:af85529ae8d2a453feee4c780d9406a5e3b17cee0dd75c18bd31adcd584debc3"}, - {file = "lxml-6.0.2-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:fe659f6b5d10fb5a17f00a50eb903eb277a71ee35df4615db573c069bcf967ac"}, - {file = "lxml-6.0.2-cp38-cp38-win32.whl", hash = "sha256:5921d924aa5468c939d95c9814fa9f9b5935a6ff4e679e26aaf2951f74043512"}, - {file = "lxml-6.0.2-cp38-cp38-win_amd64.whl", hash = "sha256:0aa7070978f893954008ab73bb9e3c24a7c56c054e00566a21b553dc18105fca"}, - {file = "lxml-6.0.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:2c8458c2cdd29589a8367c09c8f030f1d202be673f0ca224ec18590b3b9fb694"}, - {file = "lxml-6.0.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:3fee0851639d06276e6b387f1c190eb9d7f06f7f53514e966b26bae46481ec90"}, - {file = "lxml-6.0.2-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:b2142a376b40b6736dfc214fd2902409e9e3857eff554fed2d3c60f097e62a62"}, - {file = "lxml-6.0.2-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:a6b5b39cc7e2998f968f05309e666103b53e2edd01df8dc51b90d734c0825444"}, - {file = "lxml-6.0.2-cp39-cp39-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = 
"sha256:d4aec24d6b72ee457ec665344a29acb2d35937d5192faebe429ea02633151aad"}, - {file = "lxml-6.0.2-cp39-cp39-manylinux_2_26_i686.manylinux_2_28_i686.whl", hash = "sha256:b42f4d86b451c2f9d06ffb4f8bbc776e04df3ba070b9fe2657804b1b40277c48"}, - {file = "lxml-6.0.2-cp39-cp39-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6cdaefac66e8b8f30e37a9b4768a391e1f8a16a7526d5bc77a7928408ef68e93"}, - {file = "lxml-6.0.2-cp39-cp39-manylinux_2_31_armv7l.whl", hash = "sha256:b738f7e648735714bbb82bdfd030203360cfeab7f6e8a34772b3c8c8b820568c"}, - {file = "lxml-6.0.2-cp39-cp39-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:daf42de090d59db025af61ce6bdb2521f0f102ea0e6ea310f13c17610a97da4c"}, - {file = "lxml-6.0.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:66328dabea70b5ba7e53d94aa774b733cf66686535f3bc9250a7aab53a91caaf"}, - {file = "lxml-6.0.2-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:e237b807d68a61fc3b1e845407e27e5eb8ef69bc93fe8505337c1acb4ee300b6"}, - {file = "lxml-6.0.2-cp39-cp39-musllinux_1_2_riscv64.whl", hash = "sha256:ac02dc29fd397608f8eb15ac1610ae2f2f0154b03f631e6d724d9e2ad4ee2c84"}, - {file = "lxml-6.0.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:817ef43a0c0b4a77bd166dc9a09a555394105ff3374777ad41f453526e37f9cb"}, - {file = "lxml-6.0.2-cp39-cp39-win32.whl", hash = "sha256:bc532422ff26b304cfb62b328826bd995c96154ffd2bac4544f37dbb95ecaa8f"}, - {file = "lxml-6.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:995e783eb0374c120f528f807443ad5a83a656a8624c467ea73781fc5f8a8304"}, - {file = "lxml-6.0.2-cp39-cp39-win_arm64.whl", hash = "sha256:08b9d5e803c2e4725ae9e8559ee880e5328ed61aa0935244e0515d7d9dbec0aa"}, - {file = "lxml-6.0.2-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:e748d4cf8fef2526bb2a589a417eba0c8674e29ffcb570ce2ceca44f1e567bf6"}, - {file = "lxml-6.0.2-pp310-pypy310_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:4ddb1049fa0579d0cbd00503ad8c58b9ab34d1254c77bc6a5576d96ec7853dba"}, - {file = "lxml-6.0.2-pp310-pypy310_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:cb233f9c95f83707dae461b12b720c1af9c28c2d19208e1be03387222151daf5"}, - {file = "lxml-6.0.2-pp310-pypy310_pp73-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:bc456d04db0515ce3320d714a1eac7a97774ff0849e7718b492d957da4631dd4"}, - {file = "lxml-6.0.2-pp310-pypy310_pp73-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2613e67de13d619fd283d58bda40bff0ee07739f624ffee8b13b631abf33083d"}, - {file = "lxml-6.0.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:24a8e756c982c001ca8d59e87c80c4d9dcd4d9b44a4cbeb8d9be4482c514d41d"}, - {file = "lxml-6.0.2-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:1c06035eafa8404b5cf475bb37a9f6088b0aca288d4ccc9d69389750d5543700"}, - {file = "lxml-6.0.2-pp311-pypy311_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:c7d13103045de1bdd6fe5d61802565f1a3537d70cd3abf596aa0af62761921ee"}, - {file = "lxml-6.0.2-pp311-pypy311_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:0a3c150a95fbe5ac91de323aa756219ef9cf7fde5a3f00e2281e30f33fa5fa4f"}, - {file = "lxml-6.0.2-pp311-pypy311_pp73-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:60fa43be34f78bebb27812ed90f1925ec99560b0fa1decdb7d12b84d857d31e9"}, - {file = "lxml-6.0.2-pp311-pypy311_pp73-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:21c73b476d3cfe836be731225ec3421fa2f048d84f6df6a8e70433dff1376d5a"}, - {file = 
"lxml-6.0.2-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:27220da5be049e936c3aca06f174e8827ca6445a4353a1995584311487fc4e3e"}, - {file = "lxml-6.0.2.tar.gz", hash = "sha256:cd79f3367bd74b317dda655dc8fcfa304d9eb6e4fb06b7168c5cf27f96e0cd62"}, -] - -[package.extras] -cssselect = ["cssselect (>=0.7)"] -html-clean = ["lxml_html_clean"] -html5 = ["html5lib"] -htmlsoup = ["BeautifulSoup4"] - -[[package]] -name = "lxml-stubs" -version = "0.5.1" -description = "Type annotations for the lxml package" -optional = false -python-versions = "*" -groups = ["dev"] -files = [ - {file = "lxml-stubs-0.5.1.tar.gz", hash = "sha256:e0ec2aa1ce92d91278b719091ce4515c12adc1d564359dfaf81efa7d4feab79d"}, - {file = "lxml_stubs-0.5.1-py3-none-any.whl", hash = "sha256:1f689e5dbc4b9247cb09ae820c7d34daeb1fdbd1db06123814b856dae7787272"}, -] - -[package.extras] -test = ["coverage[toml] (>=7.2.5)", "mypy (>=1.2.0)", "pytest (>=7.3.0)", "pytest-mypy-plugins (>=1.10.1)"] - [[package]] name = "markdown-it-py" version = "4.0.0" @@ -3505,7 +3333,7 @@ docs = ["Sphinx", "repoze.sphinx.autointerface"] test = ["zope.i18nmessageid", "zope.testing", "zope.testrunner"] [extras] -all = ["authlib", "beautifulsoup4", "hiredis", "jaeger-client", "lxml", "matrix-synapse-ldap3", "opentracing", "psycopg2", "psycopg2cffi", "psycopg2cffi-compat", "pympler", "pysaml2", "sentry-sdk", "txredisapi"] +all = ["authlib", "beautifulsoup4", "hiredis", "jaeger-client", "matrix-synapse-ldap3", "opentracing", "psycopg2", "psycopg2cffi", "psycopg2cffi-compat", "pympler", "pysaml2", "sentry-sdk", "txredisapi"] cache-memory = ["pympler"] jwt = ["authlib"] matrix-synapse-ldap3 = ["matrix-synapse-ldap3"] @@ -3517,9 +3345,9 @@ saml2 = ["pysaml2"] sentry = ["sentry-sdk"] systemd = ["systemd-python"] test = ["idna", "parameterized"] -url-preview = ["beautifulsoup4", "lxml"] +url-preview = ["beautifulsoup4"] [metadata] lock-version = "2.1" python-versions = ">=3.10.0,<4.0.0" -content-hash = "cded33baf3b0eb42bba93b2e96439c10a1520f8471d1136771636e63f26523df" +content-hash = "c4bd7887db4ab253e3bd0c5acb0f6e9d1d02007ad6d0b13e0de50a8e8e840277" diff --git a/pyproject.toml b/pyproject.toml index 3882b233780..fe53f0443af 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -110,7 +110,7 @@ oidc = ["authlib>=0.15.1"] # `systemd.journal.JournalHandler`, as is documented in # `contrib/systemd/log_config.yaml`. systemd = ["systemd-python>=231"] -url-preview = ["lxml>=4.6.3", "beautifulsoup4>=4.13.0"] +url-preview = ["beautifulsoup4>=4.13.0"] sentry = ["sentry-sdk>=0.7.2"] opentracing = ["jaeger-client>=4.2.0", "opentracing>=2.2.0"] jwt = ["authlib"] @@ -143,7 +143,6 @@ all = [ # oidc and jwt "authlib>=0.15.1", # url-preview - "lxml>=4.6.3", "beautifulsoup4>=4.13.0", # sentry "sentry-sdk>=0.7.2", @@ -334,7 +333,6 @@ generate-setup-file = true ruff = "0.14.6" # Typechecking -lxml-stubs = ">=0.4.0" mypy = "*" mypy-zope = "*" types-bleach = ">=4.1.0" diff --git a/synapse/media/preview_html.py b/synapse/media/preview_html.py index 8f1bc2276c8..2d79165ca6f 100644 --- a/synapse/media/preview_html.py +++ b/synapse/media/preview_html.py @@ -65,7 +65,7 @@ def decode_body(body: bytes | str, uri: str) -> Optional["BeautifulSoup"]: from bs4.builder import ParserRejectedMarkup try: - soup = BeautifulSoup(body, "lxml") + soup = BeautifulSoup(body, "html.parser") # If an empty document is returned, convert to None. 
if not len(soup): return None @@ -291,6 +291,7 @@ def parse_html_description(soup: "BeautifulSoup") -> str | None: """ TAGS_TO_REMOVE = { + "head", "header", "nav", "aside", @@ -304,26 +305,19 @@ def parse_html_description(soup: "BeautifulSoup") -> str | None: "canvas", "img", "picture", - # etree.Comment is a function which creates an etree._Comment element. - # The "tag" attribute of an etree._Comment instance is confusingly the - # etree.Comment function instead of a string. - # etree.Comment, - # XXX } # Split all the text nodes into paragraphs (by splitting on new # lines) text_nodes = ( re.sub(r"\s+", "\n", el).strip() - for el in _iterate_over_text( - cast(Optional["Tag"], soup.find("body")), TAGS_TO_REMOVE - ) + for el in _iterate_over_text(soup, TAGS_TO_REMOVE) ) return summarize_paragraphs(text_nodes) def _iterate_over_text( - soup: Optional["Tag"], + soup: "BeautifulSoup", tags_to_ignore: Iterable[str], stack_limit: int = 1024, ) -> Generator[str, None, None]: @@ -331,7 +325,7 @@ def _iterate_over_text( skipping text nodes inside certain tags. Args: - tree: The parent element to iterate. Can be None if there isn't one. + soup: The parent element to iterate. tags_to_ignore: Set of tags to ignore stack_limit: Maximum stack size limit for depth-first traversal. Nodes will be dropped if this limit is hit, which may truncate the @@ -339,9 +333,6 @@ def _iterate_over_text( Intended to limit the maximum working memory when generating a preview. """ - if not soup: - return - from bs4.element import NavigableString, Tag # This is basically a stack that we extend using itertools.chain. diff --git a/tests/media/test_html_preview.py b/tests/media/test_html_preview.py index e33b80739ea..7fa437fdb88 100644 --- a/tests/media/test_html_preview.py +++ b/tests/media/test_html_preview.py @@ -366,7 +366,7 @@ def test_windows_1252(self) -> None: soup = decode_body(html, "http://example.com/test.html") assert soup is not None og = parse_html_to_open_graph(soup) - self.assertEqual(og, {"og:title": "�", "og:description": "Some text."}) + self.assertEqual(og, {"og:title": "¾", "og:description": "Some text."}) def test_image(self) -> None: """Test the spots an image can be pulled from .""" diff --git a/tests/rest/media/test_url_preview.py b/tests/rest/media/test_url_preview.py index 683d646942b..53338f70b31 100644 --- a/tests/rest/media/test_url_preview.py +++ b/tests/rest/media/test_url_preview.py @@ -367,7 +367,7 @@ def test_non_ascii_preview_content_type(self) -> None: self.pump() self.assertEqual(channel.code, 200) - self.assertEqual(channel.json_body["og:title"], "���") + self.assertEqual(channel.json_body["og:title"], "‰Íý") def test_overlong_title(self) -> None: self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")] From a24d2514137c1d2132400525bf2167cc60d13da3 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Wed, 10 Dec 2025 11:51:18 -0500 Subject: [PATCH 3/6] Update docs --- docs/setup/installation.md | 4 ---- 1 file changed, 4 deletions(-) diff --git a/docs/setup/installation.md b/docs/setup/installation.md index a48662362af..097ca6c44c0 100644 --- a/docs/setup/installation.md +++ b/docs/setup/installation.md @@ -633,10 +633,6 @@ This is critical from a security perspective to stop arbitrary Matrix users spidering 'internal' URLs on your network. At the very least we recommend that your loopback and RFC1918 IP addresses are blacklisted. -This also requires the optional `lxml` python dependency to be installed.
This -in turn requires the `libxml2` library to be available - on Debian/Ubuntu this -means `apt-get install libxml2-dev`, or equivalent for your OS. - ### Backups Don't forget to take [backups](../usage/administration/backups.md) of your new server! From d5332b079e1e6476eb85717b17ac336e86948358 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Thu, 11 Dec 2025 08:44:49 -0500 Subject: [PATCH 4/6] Create 19301.misc --- changelog.d/19301.misc | 1 + 1 file changed, 1 insertion(+) create mode 100644 changelog.d/19301.misc diff --git a/changelog.d/19301.misc b/changelog.d/19301.misc new file mode 100644 index 00000000000..a29625692ee --- /dev/null +++ b/changelog.d/19301.misc @@ -0,0 +1 @@ +Switch from lxml to beautifulsoup4 for URL previews. Contributed by @clokep. \ No newline at end of file From 38135374778a78f5741a6d2500130806864f3054 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Fri, 12 Dec 2025 09:28:47 -0500 Subject: [PATCH 5/6] Lint fixes --- synapse/media/oembed.py | 5 ++--- synapse/media/preview_html.py | 27 ++++++++++++--------------- 2 files changed, 14 insertions(+), 18 deletions(-) diff --git a/synapse/media/oembed.py b/synapse/media/oembed.py index 1d48d648309..ef17a2191eb 100644 --- a/synapse/media/oembed.py +++ b/synapse/media/oembed.py @@ -31,7 +31,6 @@ if TYPE_CHECKING: from bs4 import BeautifulSoup - from bs4.element import Tag from synapse.server import HomeServer @@ -124,7 +123,7 @@ def autodiscover_from_html(self, soup: "BeautifulSoup") -> str | None: type="application/json+oembed", href=NON_BLANK, ) - return cast(str, cast("Tag", tag)["href"]) if tag else None + return cast(str, tag["href"]) if tag else None def parse_oembed_response(self, url: str, raw_body: bytes) -> OEmbedResult: """ @@ -216,7 +215,7 @@ def parse_oembed_response(self, url: str, raw_body: bytes) -> OEmbedResult: def _fetch_url(soup: "BeautifulSoup", tag_name: str) -> str | None: tag = soup.find(tag_name, src=NON_BLANK) - return cast(str, cast("Tag", tag)["src"]) if tag else None + return cast(str, tag["src"]) if tag else None def calc_description_and_urls( diff --git a/synapse/media/preview_html.py b/synapse/media/preview_html.py index 2d79165ca6f..613c1a4af03 100644 --- a/synapse/media/preview_html.py +++ b/synapse/media/preview_html.py @@ -108,7 +108,7 @@ def _get_meta_tags( ) return {} - key = tag[property] + key = cast(str, tag[property]) if property_mapper: new_key = property_mapper(key) # None is a special value used to ignore a value. @@ -116,7 +116,7 @@ def _get_meta_tags( continue key = new_key - results[key] = tag["content"] + results[key] = cast(str, tag["content"]) return results @@ -198,9 +198,10 @@ def parse_html_to_open_graph(soup: "BeautifulSoup") -> dict[str, str | None]: if "og:title" not in og: # Attempt to find a title from the title tag, or the biggest header on the page. - title = cast( - Optional["Tag"], soup.find(("title", "h1", "h2", "h3"), string=True) - ) + # + # mypy doesn't like passing both name and string, but it is used to ignore + # empty elements. title = soup.find(("title", "h1", "h2", "h3"), string=True) # type: ignore[call-overload] if title and title.string: og["og:title"] = title.string.strip() else: @@ -208,9 +209,8 @@ def parse_html_to_open_graph(soup: "BeautifulSoup") -> dict[str, str | None]: if "og:image" not in og: # Check microdata for an image.
- meta_image = cast( - Optional["Tag"], - soup.find("meta", itemprop=re.compile("image", re.I), content=NON_BLANK), + meta_image = soup.find( + "meta", itemprop=re.compile("image", re.I), content=NON_BLANK ) # If a meta image is found, use it. if meta_image: @@ -247,13 +247,10 @@ def parse_html_to_open_graph(soup: "BeautifulSoup") -> dict[str, str | None]: if "og:description" not in og: # Check the first meta description tag for content. - meta_description = cast( - Optional["Tag"], - soup.find( - "meta", - attrs={"name": re.compile("description", re.I)}, - content=NON_BLANK, - ), + meta_description = soup.find( + "meta", + attrs={"name": re.compile("description", re.I)}, + content=NON_BLANK, ) # If a meta description is found with content, use it. From 5940217ffd16045193c5b732929e5d010327c47c Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Fri, 12 Dec 2025 09:53:48 -0500 Subject: [PATCH 6/6] Fix-up references --- tests/media/test_html_preview.py | 12 ++++++------ tests/media/test_oembed.py | 8 ++++---- tests/media/test_url_previewer.py | 8 ++++---- tests/rest/client/test_media.py | 8 ++++---- tests/rest/media/test_url_preview.py | 8 ++++---- 5 files changed, 22 insertions(+), 22 deletions(-) diff --git a/tests/media/test_html_preview.py b/tests/media/test_html_preview.py index 7fa437fdb88..387969522c3 100644 --- a/tests/media/test_html_preview.py +++ b/tests/media/test_html_preview.py @@ -28,14 +28,14 @@ from tests import unittest try: - import lxml + import beautifulsoup4 except ImportError: - lxml = None # type: ignore[assignment] + beautifulsoup4 = None # type: ignore[assignment] class SummarizeTestCase(unittest.TestCase): - if not lxml: - skip = "url preview feature requires lxml" + if not beautifulsoup4: + skip = "url preview feature requires beautifulsoup4" def test_long_summarize(self) -> None: example_paras = [ @@ -152,8 +152,8 @@ def test_small_then_large_summarize(self) -> None: class OpenGraphFromHtmlTestCase(unittest.TestCase): - if not lxml: - skip = "url preview feature requires lxml" + if not beautifulsoup4: + skip = "url preview feature requires beautifulsoup4" def test_simple(self) -> None: html = b""" diff --git a/tests/media/test_oembed.py b/tests/media/test_oembed.py index dc13c03df33..8181f86c576 100644 --- a/tests/media/test_oembed.py +++ b/tests/media/test_oembed.py @@ -34,14 +34,14 @@ from tests.unittest import HomeserverTestCase try: - import lxml + import beautifulsoup4 except ImportError: - lxml = None # type: ignore[assignment] + beautifulsoup4 = None # type: ignore[assignment] class OEmbedTests(HomeserverTestCase): - if not lxml: - skip = "url preview feature requires lxml" + if not beautifulsoup4: + skip = "url preview feature requires beautifulsoup4" def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None: self.oembed = OEmbedProvider(hs) diff --git a/tests/media/test_url_previewer.py b/tests/media/test_url_previewer.py index 3d706c7e90d..a89a581fc5a 100644 --- a/tests/media/test_url_previewer.py +++ b/tests/media/test_url_previewer.py @@ -29,14 +29,14 @@ from tests.unittest import override_config try: - import lxml + import beautifulsoup4 except ImportError: - lxml = None # type: ignore[assignment] + beautifulsoup4 = None # type: ignore[assignment] class URLPreviewTests(unittest.HomeserverTestCase): - if not lxml: - skip = "url preview feature requires lxml" + if not beautifulsoup4: + skip = "url preview feature requires beautifulsoup4" def make_homeserver(self, reactor: MemoryReactor, clock: Clock) -> HomeServer: config =
self.default_config() diff --git a/tests/rest/client/test_media.py b/tests/rest/client/test_media.py index ce006eaa75c..43b1acb5f84 100644 --- a/tests/rest/client/test_media.py +++ b/tests/rest/client/test_media.py @@ -77,9 +77,9 @@ from tests.unittest import override_config try: - import lxml + import beautifulsoup4 except ImportError: - lxml = None # type: ignore[assignment] + beautifulsoup4 = None # type: ignore[assignment] class MediaDomainBlockingTests(unittest.HomeserverTestCase): @@ -188,8 +188,8 @@ def test_remote_media_thumbnail_normally_unblocked(self) -> None: class URLPreviewTests(unittest.HomeserverTestCase): - if not lxml: - skip = "url preview feature requires lxml" + if not beautifulsoup4: + skip = "url preview feature requires beautifulsoup4" servlets = [media.register_servlets] hijack_auth = True diff --git a/tests/rest/media/test_url_preview.py b/tests/rest/media/test_url_preview.py index 53338f70b31..bcd6c7b75d4 100644 --- a/tests/rest/media/test_url_preview.py +++ b/tests/rest/media/test_url_preview.py @@ -45,14 +45,14 @@ from tests.unittest import override_config try: - import lxml + import beautifulsoup4 except ImportError: - lxml = None # type: ignore[assignment] + beautifulsoup4 = None # type: ignore[assignment] class URLPreviewTests(unittest.HomeserverTestCase): - if not lxml: - skip = "url preview feature requires lxml" + if not beautifulsoup4: + skip = "url preview feature requires beautifulsoup4" hijack_auth = True user_id = "@test:user"