From 6dec726b1faddea054ee14312ac5c234ee338608 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Wed, 10 Dec 2025 10:24:09 -0500 Subject: [PATCH 1/6] Use BeautifulSoup instead of LXML directly. --- poetry.lock | 87 +++++--- pyproject.toml | 3 +- synapse/media/oembed.py | 95 ++++----- synapse/media/preview_html.py | 301 ++++++++------------------- synapse/media/url_previewer.py | 10 +- tests/media/test_html_preview.py | 269 +++++++++--------------- tests/rest/client/test_media.py | 2 +- tests/rest/media/test_url_preview.py | 2 +- 8 files changed, 295 insertions(+), 474 deletions(-) diff --git a/poetry.lock b/poetry.lock index 8c9256c8929..62080647c09 100644 --- a/poetry.lock +++ b/poetry.lock @@ -31,7 +31,7 @@ description = "The ultimate Python library in building OAuth and OpenID Connect optional = true python-versions = ">=3.9" groups = ["main"] -markers = "extra == \"all\" or extra == \"jwt\" or extra == \"oidc\"" +markers = "extra == \"oidc\" or extra == \"jwt\" or extra == \"all\"" files = [ {file = "authlib-1.6.5-py2.py3-none-any.whl", hash = "sha256:3e0e0507807f842b02175507bdee8957a1d5707fd4afb17c32fb43fee90b6e3a"}, {file = "authlib-1.6.5.tar.gz", hash = "sha256:6aaf9c79b7cc96c900f0b284061691c5d4e61221640a948fe690b556a6d6d10b"}, @@ -132,6 +132,30 @@ files = [ tests = ["pytest (>=3.2.1,!=3.3.0)"] typecheck = ["mypy"] +[[package]] +name = "beautifulsoup4" +version = "4.14.3" +description = "Screen-scraping library" +optional = true +python-versions = ">=3.7.0" +groups = ["main"] +markers = "extra == \"url-preview\" or extra == \"all\"" +files = [ + {file = "beautifulsoup4-4.14.3-py3-none-any.whl", hash = "sha256:0918bfe44902e6ad8d57732ba310582e98da931428d231a5ecb9e7c703a735bb"}, + {file = "beautifulsoup4-4.14.3.tar.gz", hash = "sha256:6292b1c5186d356bba669ef9f7f051757099565ad9ada5dd630bd9de5fa7fb86"}, +] + +[package.dependencies] +soupsieve = ">=1.6.1" +typing-extensions = ">=4.0.0" + +[package.extras] +cchardet = ["cchardet"] +chardet = ["chardet"] +charset-normalizer = ["charset-normalizer"] +html5lib = ["html5lib"] +lxml = ["lxml"] + [[package]] name = "bleach" version = "6.3.0" @@ -481,7 +505,7 @@ description = "XML bomb protection for Python stdlib modules" optional = true python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" groups = ["main"] -markers = "extra == \"all\" or extra == \"saml2\"" +markers = "extra == \"saml2\" or extra == \"all\"" files = [ {file = "defusedxml-0.7.1-py2.py3-none-any.whl", hash = "sha256:a352e7e428770286cc899e2542b6cdaedb2b4953ff269a210103ec58f6198a61"}, {file = "defusedxml-0.7.1.tar.gz", hash = "sha256:1bb3032db185915b62d7c6209c5a8792be6a32ab2fedacc84e01b52c51aa3e69"}, @@ -506,7 +530,7 @@ description = "XPath 1.0/2.0/3.0/3.1 parsers and selectors for ElementTree and l optional = true python-versions = ">=3.7" groups = ["main"] -markers = "extra == \"all\" or extra == \"saml2\"" +markers = "extra == \"saml2\" or extra == \"all\"" files = [ {file = "elementpath-4.1.5-py3-none-any.whl", hash = "sha256:2ac1a2fb31eb22bbbf817f8cf6752f844513216263f0e3892c8e79782fe4bb55"}, {file = "elementpath-4.1.5.tar.gz", hash = "sha256:c2d6dc524b29ef751ecfc416b0627668119d8812441c555d7471da41d4bacb8d"}, @@ -556,7 +580,7 @@ description = "Python wrapper for hiredis" optional = true python-versions = ">=3.8" groups = ["main"] -markers = "extra == \"all\" or extra == \"redis\"" +markers = "extra == \"redis\" or extra == \"all\"" files = [ {file = "hiredis-3.3.0-cp310-cp310-macosx_10_15_universal2.whl", hash = 
"sha256:9937d9b69321b393fbace69f55423480f098120bc55a3316e1ca3508c4dbbd6f"}, {file = "hiredis-3.3.0-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:50351b77f89ba6a22aff430b993653847f36b71d444509036baa0f2d79d1ebf4"}, @@ -879,7 +903,7 @@ description = "Jaeger Python OpenTracing Tracer implementation" optional = true python-versions = ">=3.7" groups = ["main"] -markers = "extra == \"all\" or extra == \"opentracing\"" +markers = "extra == \"opentracing\" or extra == \"all\"" files = [ {file = "jaeger-client-4.8.0.tar.gz", hash = "sha256:3157836edab8e2c209bd2d6ae61113db36f7ee399e66b1dcbb715d87ab49bfe0"}, ] @@ -1017,7 +1041,7 @@ description = "A strictly RFC 4510 conforming LDAP V3 pure Python client library optional = true python-versions = "*" groups = ["main"] -markers = "extra == \"all\" or extra == \"matrix-synapse-ldap3\"" +markers = "extra == \"matrix-synapse-ldap3\" or extra == \"all\"" files = [ {file = "ldap3-2.9.1-py2.py3-none-any.whl", hash = "sha256:5869596fc4948797020d3f03b7939da938778a0f9e2009f7a072ccf92b8e8d70"}, {file = "ldap3-2.9.1.tar.gz", hash = "sha256:f3e7fc4718e3f09dda568b57100095e0ce58633bcabbed8667ce3f8fbaa4229f"}, @@ -1119,7 +1143,7 @@ description = "Powerful and Pythonic XML processing library combining libxml2/li optional = true python-versions = ">=3.8" groups = ["main"] -markers = "extra == \"all\" or extra == \"url-preview\"" +markers = "extra == \"url-preview\" or extra == \"all\"" files = [ {file = "lxml-6.0.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:e77dd455b9a16bbd2a5036a63ddbd479c19572af81b624e79ef422f929eef388"}, {file = "lxml-6.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:5d444858b9f07cefff6455b983aea9a67f7462ba1f6cbe4a21e8bf6791bf2153"}, @@ -1405,7 +1429,7 @@ description = "An LDAP3 auth provider for Synapse" optional = true python-versions = ">=3.7" groups = ["main"] -markers = "extra == \"all\" or extra == \"matrix-synapse-ldap3\"" +markers = "extra == \"matrix-synapse-ldap3\" or extra == \"all\"" files = [ {file = "matrix-synapse-ldap3-0.3.0.tar.gz", hash = "sha256:8bb6517173164d4b9cc44f49de411d8cebdb2e705d5dd1ea1f38733c4a009e1d"}, {file = "matrix_synapse_ldap3-0.3.0-py3-none-any.whl", hash = "sha256:8b4d701f8702551e98cc1d8c20dbed532de5613584c08d0df22de376ba99159d"}, @@ -1648,7 +1672,7 @@ description = "OpenTracing API for Python. See documentation at http://opentraci optional = true python-versions = "*" groups = ["main"] -markers = "extra == \"all\" or extra == \"opentracing\"" +markers = "extra == \"opentracing\" or extra == \"all\"" files = [ {file = "opentracing-2.4.0.tar.gz", hash = "sha256:a173117e6ef580d55874734d1fa7ecb6f3655160b8b8974a2a1e98e5ec9c840d"}, ] @@ -1838,7 +1862,7 @@ description = "psycopg2 - Python-PostgreSQL Database Adapter" optional = true python-versions = ">=3.9" groups = ["main"] -markers = "extra == \"all\" or extra == \"postgres\"" +markers = "extra == \"postgres\" or extra == \"all\"" files = [ {file = "psycopg2-2.9.11-cp310-cp310-win_amd64.whl", hash = "sha256:103e857f46bb76908768ead4e2d0ba1d1a130e7b8ed77d3ae91e8b33481813e8"}, {file = "psycopg2-2.9.11-cp311-cp311-win_amd64.whl", hash = "sha256:210daed32e18f35e3140a1ebe059ac29209dd96468f2f7559aa59f75ee82a5cb"}, @@ -1856,7 +1880,7 @@ description = ".. 
image:: https://travis-ci.org/chtd/psycopg2cffi.svg?branch=mas optional = true python-versions = "*" groups = ["main"] -markers = "platform_python_implementation == \"PyPy\" and (extra == \"all\" or extra == \"postgres\")" +markers = "platform_python_implementation == \"PyPy\" and (extra == \"postgres\" or extra == \"all\")" files = [ {file = "psycopg2cffi-2.9.0.tar.gz", hash = "sha256:7e272edcd837de3a1d12b62185eb85c45a19feda9e62fa1b120c54f9e8d35c52"}, ] @@ -1872,7 +1896,7 @@ description = "A Simple library to enable psycopg2 compatability" optional = true python-versions = "*" groups = ["main"] -markers = "platform_python_implementation == \"PyPy\" and (extra == \"all\" or extra == \"postgres\")" +markers = "platform_python_implementation == \"PyPy\" and (extra == \"postgres\" or extra == \"all\")" files = [ {file = "psycopg2cffi-compat-1.1.tar.gz", hash = "sha256:d25e921748475522b33d13420aad5c2831c743227dc1f1f2585e0fdb5c914e05"}, ] @@ -2154,7 +2178,7 @@ description = "A development tool to measure, monitor and analyze the memory beh optional = true python-versions = ">=3.6" groups = ["main"] -markers = "extra == \"all\" or extra == \"cache-memory\"" +markers = "extra == \"cache-memory\" or extra == \"all\"" files = [ {file = "Pympler-1.0.1-py3-none-any.whl", hash = "sha256:d260dda9ae781e1eab6ea15bacb84015849833ba5555f141d2d9b7b7473b307d"}, {file = "Pympler-1.0.1.tar.gz", hash = "sha256:993f1a3599ca3f4fcd7160c7545ad06310c9e12f70174ae7ae8d4e25f6c5d3fa"}, @@ -2214,7 +2238,7 @@ description = "Python implementation of SAML Version 2 Standard" optional = true python-versions = ">=3.9,<4.0" groups = ["main"] -markers = "extra == \"all\" or extra == \"saml2\"" +markers = "extra == \"saml2\" or extra == \"all\"" files = [ {file = "pysaml2-7.5.0-py3-none-any.whl", hash = "sha256:bc6627cc344476a83c757f440a73fda1369f13b6fda1b4e16bca63ffbabb5318"}, {file = "pysaml2-7.5.0.tar.gz", hash = "sha256:f36871d4e5ee857c6b85532e942550d2cf90ea4ee943d75eb681044bbc4f54f7"}, @@ -2239,7 +2263,7 @@ description = "Extensions to the standard Python datetime module" optional = true python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" groups = ["main"] -markers = "extra == \"all\" or extra == \"saml2\"" +markers = "extra == \"saml2\" or extra == \"all\"" files = [ {file = "python-dateutil-2.8.2.tar.gz", hash = "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86"}, {file = "python_dateutil-2.8.2-py2.py3-none-any.whl", hash = "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"}, @@ -2267,7 +2291,7 @@ description = "World timezone definitions, modern and historical" optional = true python-versions = "*" groups = ["main"] -markers = "extra == \"all\" or extra == \"saml2\"" +markers = "extra == \"saml2\" or extra == \"all\"" files = [ {file = "pytz-2025.2-py2.py3-none-any.whl", hash = "sha256:5ddf76296dd8c44c26eb8f4b6f35488f3ccbf6fbbd7adee0b7262d43f0ec2f00"}, {file = "pytz-2025.2.tar.gz", hash = "sha256:360b9e3dbb49a209c21ad61809c7fb453643e048b38924c765813546746e81c3"}, @@ -2671,7 +2695,7 @@ description = "Python client for Sentry (https://sentry.io)" optional = true python-versions = ">=3.6" groups = ["main"] -markers = "extra == \"all\" or extra == \"sentry\"" +markers = "extra == \"sentry\" or extra == \"all\"" files = [ {file = "sentry_sdk-2.46.0-py2.py3-none-any.whl", hash = "sha256:4eeeb60198074dff8d066ea153fa6f241fef1668c10900ea53a4200abc8da9b1"}, {file = "sentry_sdk-2.46.0.tar.gz", hash = "sha256:91821a23460725734b7741523021601593f35731808afc0bb2ba46c27b8acd91"}, @@ -2846,6 
+2870,19 @@ files = [ {file = "sortedcontainers-2.4.0.tar.gz", hash = "sha256:25caa5a06cc30b6b83d11423433f65d1f9d76c4c6a0c90e3379eaa43b9bfdb88"}, ] +[[package]] +name = "soupsieve" +version = "2.8" +description = "A modern CSS selector implementation for Beautiful Soup." +optional = true +python-versions = ">=3.9" +groups = ["main"] +markers = "extra == \"url-preview\" or extra == \"all\"" +files = [ + {file = "soupsieve-2.8-py3-none-any.whl", hash = "sha256:0cc76456a30e20f5d7f2e14a98a4ae2ee4e5abdc7c5ea0aafe795f344bc7984c"}, + {file = "soupsieve-2.8.tar.gz", hash = "sha256:e2dd4a40a628cb5f28f6d4b0db8800b8f581b65bb380b97de22ba5ca8d72572f"}, +] + [[package]] name = "sqlglot" version = "28.0.0" @@ -2881,7 +2918,7 @@ description = "Tornado IOLoop Backed Concurrent Futures" optional = true python-versions = "*" groups = ["main"] -markers = "extra == \"all\" or extra == \"opentracing\"" +markers = "extra == \"opentracing\" or extra == \"all\"" files = [ {file = "threadloop-1.0.2-py2-none-any.whl", hash = "sha256:5c90dbefab6ffbdba26afb4829d2a9df8275d13ac7dc58dccb0e279992679599"}, {file = "threadloop-1.0.2.tar.gz", hash = "sha256:8b180aac31013de13c2ad5c834819771992d350267bddb854613ae77ef571944"}, @@ -2897,7 +2934,7 @@ description = "Python bindings for the Apache Thrift RPC system" optional = true python-versions = "*" groups = ["main"] -markers = "extra == \"all\" or extra == \"opentracing\"" +markers = "extra == \"opentracing\" or extra == \"all\"" files = [ {file = "thrift-0.16.0.tar.gz", hash = "sha256:2b5b6488fcded21f9d312aa23c9ff6a0195d0f6ae26ddbd5ad9e3e25dfc14408"}, ] @@ -2970,7 +3007,7 @@ description = "Tornado is a Python web framework and asynchronous networking lib optional = true python-versions = ">=3.9" groups = ["main"] -markers = "extra == \"all\" or extra == \"opentracing\"" +markers = "extra == \"opentracing\" or extra == \"all\"" files = [ {file = "tornado-6.5-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:f81067dad2e4443b015368b24e802d0083fecada4f0a4572fdb72fc06e54a9a6"}, {file = "tornado-6.5-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:9ac1cbe1db860b3cbb251e795c701c41d343f06a96049d6274e7c77559117e41"}, @@ -3104,7 +3141,7 @@ description = "non-blocking redis client for python" optional = true python-versions = "*" groups = ["main"] -markers = "extra == \"all\" or extra == \"redis\"" +markers = "extra == \"redis\" or extra == \"all\"" files = [ {file = "txredisapi-1.4.11-py3-none-any.whl", hash = "sha256:ac64d7a9342b58edca13ef267d4fa7637c1aa63f8595e066801c1e8b56b22d0b"}, {file = "txredisapi-1.4.11.tar.gz", hash = "sha256:3eb1af99aefdefb59eb877b1dd08861efad60915e30ad5bf3d5bf6c5cedcdbc6"}, @@ -3350,7 +3387,7 @@ description = "An XML Schema validator and decoder" optional = true python-versions = ">=3.7" groups = ["main"] -markers = "extra == \"all\" or extra == \"saml2\"" +markers = "extra == \"saml2\" or extra == \"all\"" files = [ {file = "xmlschema-2.4.0-py3-none-any.whl", hash = "sha256:dc87be0caaa61f42649899189aab2fd8e0d567f2cf548433ba7b79278d231a4a"}, {file = "xmlschema-2.4.0.tar.gz", hash = "sha256:d74cd0c10866ac609e1ef94a5a69b018ad16e39077bc6393408b40c6babee793"}, @@ -3468,7 +3505,7 @@ docs = ["Sphinx", "repoze.sphinx.autointerface"] test = ["zope.i18nmessageid", "zope.testing", "zope.testrunner"] [extras] -all = ["authlib", "hiredis", "jaeger-client", "lxml", "matrix-synapse-ldap3", "opentracing", "psycopg2", "psycopg2cffi", "psycopg2cffi-compat", "pympler", "pysaml2", "sentry-sdk", "txredisapi"] +all = ["authlib", "beautifulsoup4", "hiredis", "jaeger-client", 
"lxml", "matrix-synapse-ldap3", "opentracing", "psycopg2", "psycopg2cffi", "psycopg2cffi-compat", "pympler", "pysaml2", "sentry-sdk", "txredisapi"] cache-memory = ["pympler"] jwt = ["authlib"] matrix-synapse-ldap3 = ["matrix-synapse-ldap3"] @@ -3480,9 +3517,9 @@ saml2 = ["pysaml2"] sentry = ["sentry-sdk"] systemd = ["systemd-python"] test = ["idna", "parameterized"] -url-preview = ["lxml"] +url-preview = ["beautifulsoup4", "lxml"] [metadata] lock-version = "2.1" python-versions = ">=3.10.0,<4.0.0" -content-hash = "960ddae65fde8574f0f36b6988622fc4baf7646823c36699c5cd4773cad8b0ed" +content-hash = "cded33baf3b0eb42bba93b2e96439c10a1520f8471d1136771636e63f26523df" diff --git a/pyproject.toml b/pyproject.toml index 70d5e3d5730..3882b233780 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -110,7 +110,7 @@ oidc = ["authlib>=0.15.1"] # `systemd.journal.JournalHandler`, as is documented in # `contrib/systemd/log_config.yaml`. systemd = ["systemd-python>=231"] -url-preview = ["lxml>=4.6.3"] +url-preview = ["lxml>=4.6.3", "beautifulsoup4>=4.13.0"] sentry = ["sentry-sdk>=0.7.2"] opentracing = ["jaeger-client>=4.2.0", "opentracing>=2.2.0"] jwt = ["authlib"] @@ -144,6 +144,7 @@ all = [ "authlib>=0.15.1", # url-preview "lxml>=4.6.3", + "beautifulsoup4>=4.13.0", # sentry "sentry-sdk>=0.7.2", # opentracing diff --git a/synapse/media/oembed.py b/synapse/media/oembed.py index 7e440721302..1d48d648309 100644 --- a/synapse/media/oembed.py +++ b/synapse/media/oembed.py @@ -25,12 +25,13 @@ import attr -from synapse.media.preview_html import parse_html_description +from synapse.media.preview_html import NON_BLANK, decode_body, parse_html_description from synapse.types import JsonDict from synapse.util.json import json_decoder if TYPE_CHECKING: - from lxml import etree + from bs4 import BeautifulSoup + from bs4.element import Tag from synapse.server import HomeServer @@ -105,35 +106,25 @@ def get_oembed_url(self, url: str) -> str | None: # No match. return None - def autodiscover_from_html(self, tree: "etree._Element") -> str | None: + def autodiscover_from_html(self, soup: "BeautifulSoup") -> str | None: """ Search an HTML document for oEmbed autodiscovery information. Args: - tree: The parsed HTML body. + soup: The parsed HTML body. Returns: The URL to use for oEmbed information, or None if no URL was found. """ # Search for link elements with the proper rel and type attributes. - # Cast: the type returned by xpath depends on the xpath expression: mypy can't deduce this. - for tag in cast( - list["etree._Element"], - tree.xpath("//link[@rel='alternate'][@type='application/json+oembed']"), - ): - if "href" in tag.attrib: - return cast(str, tag.attrib["href"]) - # Some providers (e.g. Flickr) use alternative instead of alternate. - # Cast: the type returned by xpath depends on the xpath expression: mypy can't deduce this. 
- for tag in cast( - list["etree._Element"], - tree.xpath("//link[@rel='alternative'][@type='application/json+oembed']"), - ): - if "href" in tag.attrib: - return cast(str, tag.attrib["href"]) - - return None + tag = soup.find( + "link", + rel=("alternate", "alternative"), + type="application/json+oembed", + href=NON_BLANK, + ) + return cast(str, cast("Tag", tag)["href"]) if tag else None def parse_oembed_response(self, url: str, raw_body: bytes) -> OEmbedResult: """ @@ -196,7 +187,7 @@ def parse_oembed_response(self, url: str, raw_body: bytes) -> OEmbedResult: if oembed_type == "rich": html_str = oembed.get("html") if isinstance(html_str, str): - calc_description_and_urls(open_graph_response, html_str) + calc_description_and_urls(open_graph_response, html_str, url) elif oembed_type == "photo": # If this is a photo, use the full image, not the thumbnail. @@ -208,7 +199,7 @@ def parse_oembed_response(self, url: str, raw_body: bytes) -> OEmbedResult: open_graph_response["og:type"] = "video.other" html_str = oembed.get("html") if html_str and isinstance(html_str, str): - calc_description_and_urls(open_graph_response, oembed["html"]) + calc_description_and_urls(open_graph_response, oembed["html"], url) for size in ("width", "height"): val = oembed.get(size) if type(val) is int: # noqa: E721 @@ -223,55 +214,45 @@ def parse_oembed_response(self, url: str, raw_body: bytes) -> OEmbedResult: return OEmbedResult(open_graph_response, author_name, cache_age) -def _fetch_urls(tree: "etree._Element", tag_name: str) -> list[str]: - results = [] - # Cast: the type returned by xpath depends on the xpath expression: mypy can't deduce this. - for tag in cast(list["etree._Element"], tree.xpath("//*/" + tag_name)): - if "src" in tag.attrib: - results.append(cast(str, tag.attrib["src"])) - return results +def _fetch_url(soup: "BeautifulSoup", tag_name: str) -> str | None: + tag = soup.find(tag_name, src=NON_BLANK) + return cast(str, cast("Tag", tag)["src"]) if tag else None -def calc_description_and_urls(open_graph_response: JsonDict, html_body: str) -> None: +def calc_description_and_urls( + open_graph_response: JsonDict, html_body: str, url: str +) -> None: """ Calculate description for an HTML document. - This uses lxml to convert the HTML document into plaintext. If errors + This uses BeautifulSoup to convert the HTML document into plaintext. If errors occur during processing of the document, an empty response is returned. Args: open_graph_response: The current Open Graph summary. This is updated with additional fields. html_body: The HTML document, as bytes. - - Returns: - The summary + url: The URL which is being previewed (not the one which was requested). """ - # If there's no body, nothing useful is going to be found. - if not html_body: - return + soup = decode_body(html_body, url) - from lxml import etree - - # Create an HTML parser. If this fails, log and return no metadata. - parser = etree.HTMLParser(recover=True, encoding="utf-8") - - # Attempt to parse the body. If this fails, log and return no metadata. - tree = etree.fromstring(html_body, parser) - - # The data was successfully parsed, but no tree was found. - if tree is None: + # If there's no body, nothing useful is going to be found. + if not soup: return # Attempt to find interesting URLs (images, videos, embeds). 
if "og:image" not in open_graph_response: - image_urls = _fetch_urls(tree, "img") - if image_urls: - open_graph_response["og:image"] = image_urls[0] - - video_urls = _fetch_urls(tree, "video") + _fetch_urls(tree, "embed") - if video_urls: - open_graph_response["og:video"] = video_urls[0] - - description = parse_html_description(tree) + image_url = _fetch_url(soup, "img") + if image_url: + open_graph_response["og:image"] = image_url + + video_url = _fetch_url(soup, "video") + if video_url: + open_graph_response["og:video"] = video_url + else: + embed_url = _fetch_url(soup, "embed") + if embed_url: + open_graph_response["og:video"] = embed_url + + description = parse_html_description(soup) if description: open_graph_response["og:description"] = description diff --git a/synapse/media/preview_html.py b/synapse/media/preview_html.py index 22ad581f829..8f1bc2276c8 100644 --- a/synapse/media/preview_html.py +++ b/synapse/media/preview_html.py @@ -18,7 +18,7 @@ # [This file includes modifications made by New Vector Limited] # # -import codecs +import itertools import logging import re from typing import ( @@ -26,100 +26,28 @@ Callable, Generator, Iterable, + Iterator, Optional, cast, ) if TYPE_CHECKING: - from lxml import etree + from bs4 import BeautifulSoup + from bs4.element import PageElement, Tag logger = logging.getLogger(__name__) -_charset_match = re.compile( - rb'<\s*meta[^>]*charset\s*=\s*"?([a-z0-9_-]+)"?', flags=re.I -) -_xml_encoding_match = re.compile( - rb'\s*<\s*\?\s*xml[^>]*encoding="([a-z0-9_-]+)"', flags=re.I -) _content_type_match = re.compile(r'.*; *charset="?(.*?)"?(;|$)', flags=re.I) # Certain elements aren't meant for display. ARIA_ROLES_TO_IGNORE = {"directory", "menu", "menubar", "toolbar"} - -def _normalise_encoding(encoding: str) -> str | None: - """Use the Python codec's name as the normalised entry.""" - try: - return codecs.lookup(encoding).name - except LookupError: - return None +NON_BLANK = re.compile(".+") -def _get_html_media_encodings(body: bytes, content_type: str | None) -> Iterable[str]: +def decode_body(body: bytes | str, uri: str) -> Optional["BeautifulSoup"]: """ - Get potential encoding of the body based on the (presumably) HTML body or the content-type header. - - The precedence used for finding a character encoding is: - - 1. tag with a charset declared. - 2. The XML document's character encoding attribute. - 3. The Content-Type header. - 4. Fallback to utf-8. - 5. Fallback to windows-1252. - - This roughly follows the algorithm used by BeautifulSoup's bs4.dammit.EncodingDetector. - - Args: - body: The HTML document, as bytes. - content_type: The Content-Type header. - - Returns: - The character encoding of the body, as a string. - """ - # There's no point in returning an encoding more than once. - attempted_encodings: set[str] = set() - - # Limit searches to the first 1kb, since it ought to be at the top. - body_start = body[:1024] - - # Check if it has an encoding set in a meta tag. - match = _charset_match.search(body_start) - if match: - encoding = _normalise_encoding(match.group(1).decode("ascii")) - if encoding: - attempted_encodings.add(encoding) - yield encoding - - # TODO Support - - # Check if it has an XML document with an encoding. - match = _xml_encoding_match.match(body_start) - if match: - encoding = _normalise_encoding(match.group(1).decode("ascii")) - if encoding and encoding not in attempted_encodings: - attempted_encodings.add(encoding) - yield encoding - - # Check the HTTP Content-Type header for a character set. 
- if content_type: - content_match = _content_type_match.match(content_type) - if content_match: - encoding = _normalise_encoding(content_match.group(1)) - if encoding and encoding not in attempted_encodings: - attempted_encodings.add(encoding) - yield encoding - - # Finally, fallback to UTF-8, then windows-1252. - for fallback in ("utf-8", "cp1252"): - if fallback not in attempted_encodings: - yield fallback - - -def decode_body( - body: bytes, uri: str, content_type: str | None = None -) -> Optional["etree._Element"]: - """ - This uses lxml to parse the HTML document. + This uses BeautifulSoup to parse the HTML document. Args: body: The HTML document, as bytes. @@ -133,54 +61,22 @@ def decode_body( if not body: return None - # The idea here is that multiple encodings are tried until one works. - # Unfortunately the result is never used and then LXML will decode the string - # again with the found encoding. - for encoding in _get_html_media_encodings(body, content_type): - try: - body.decode(encoding) - except Exception: - pass - else: - break - else: + from bs4 import BeautifulSoup + from bs4.builder import ParserRejectedMarkup + + try: + soup = BeautifulSoup(body, "lxml") + # If an empty document is returned, convert to None. + if not len(soup): + return None + return soup + except ParserRejectedMarkup: logger.warning("Unable to decode HTML body for %s", uri) return None - from lxml import etree - - # Create an HTML parser. - parser = etree.HTMLParser(recover=True, encoding=encoding) - - # Attempt to parse the body. With `lxml` 6.0.0+, this will be an empty HTML - # tree if the body was successfully parsed, but no tree was found. In - # previous `lxml` versions, `etree.fromstring` would return `None` in that - # case. - html_tree = etree.fromstring(body, parser) - - # Account for the above referenced case where `html_tree` is an HTML tree - # with an empty body. If so, return None. - if html_tree is not None and html_tree.tag == "html": - # If the tree has only a single element and it's empty, then - # return None. - body_el = html_tree.find("body") - if body_el is not None and len(html_tree) == 1: - # Extract the content of the body tag as text. - body_text = "".join(cast(Iterable[str], body_el.itertext())) - - # Strip any undecodable Unicode characters and whitespace. - body_text = body_text.strip("\ufffd").strip() - - # If there's no text left, and there were no child tags, - # then we consider the tag empty. - if not body_text and len(body_el) == 0: - return None - - return html_tree - def _get_meta_tags( - tree: "etree._Element", + soup: "BeautifulSoup", property: str, prefix: str, property_mapper: Callable[[str], str | None] | None = None, @@ -189,7 +85,7 @@ def _get_meta_tags( Search for meta tags prefixed with a particular string. Args: - tree: The parsed HTML document. + soup: The parsed HTML document. property: The name of the property which contains the tag name, e.g. "property" for Open Graph. prefix: The prefix on the property to search for, e.g. "og" for Open Graph. @@ -199,15 +95,10 @@ def _get_meta_tags( Returns: A map of tag name to value. """ - # This actually returns dict[str, str], but the caller sets this as a variable - # which is dict[str, str | None]. results: dict[str, str | None] = {} # Cast: the type returned by xpath depends on the xpath expression: mypy can't deduce this. 
- for tag in cast( - list["etree._Element"], - tree.xpath( - f"//*/meta[starts-with(@{property}, '{prefix}:')][@content][not(@content='')]" - ), + for tag in soup.find_all( + "meta", attrs={property: re.compile(rf"^{prefix}:")}, content=NON_BLANK ): # if we've got more than 50 tags, someone is taking the piss if len(results) >= 50: @@ -217,7 +108,7 @@ def _get_meta_tags( ) return {} - key = cast(str, tag.attrib[property]) + key = tag[property] if property_mapper: new_key = property_mapper(key) # None is a special value used to ignore a value. @@ -225,7 +116,7 @@ def _get_meta_tags( continue key = new_key - results[key] = cast(str, tag.attrib["content"]) + results[key] = tag["content"] return results @@ -250,15 +141,14 @@ def _map_twitter_to_open_graph(key: str) -> str | None: return "og" + key[7:] -def parse_html_to_open_graph(tree: "etree._Element") -> dict[str, str | None]: +def parse_html_to_open_graph(soup: "BeautifulSoup") -> dict[str, str | None]: """ - Parse the HTML document into an Open Graph response. + Calculate metadata for an HTML document. - This uses lxml to search the HTML document for Open Graph data (or - synthesizes it from the document). + This uses BeautifulSoup to search the HTML document for Open Graph data. Args: - tree: The parsed HTML document. + soup: The parsed HTML document. Returns: The Open Graph response as a dictionary. @@ -278,7 +168,8 @@ def parse_html_to_open_graph(tree: "etree._Element") -> dict[str, str | None]: # "og:video:height" : "720", # "og:video:secure_url": "https://www.youtube.com/v/LXDBoHyjmtw?version=3", - og = _get_meta_tags(tree, "property", "og") + # TODO: grab article: meta tags too, e.g.: + og = _get_meta_tags(soup, "property", "og") # TODO: Search for properties specific to the different Open Graph types, # such as article: meta tags, e.g.: @@ -298,7 +189,7 @@ def parse_html_to_open_graph(tree: "etree._Element") -> dict[str, str | None]: # Twitter cards tags also duplicate Open Graph tags. # # See https://developer.twitter.com/en/docs/twitter-for-websites/cards/guides/getting-started - twitter = _get_meta_tags(tree, "name", "twitter", _map_twitter_to_open_graph) + twitter = _get_meta_tags(soup, "name", "twitter", _map_twitter_to_open_graph) # Merge the Twitter values with the Open Graph values, but do not overwrite # information from Open Graph tags. for key, value in twitter.items(): @@ -307,73 +198,69 @@ def parse_html_to_open_graph(tree: "etree._Element") -> dict[str, str | None]: if "og:title" not in og: # Attempt to find a title from the title tag, or the biggest header on the page. - # Cast: the type returned by xpath depends on the xpath expression: mypy can't deduce this. title = cast( - list["etree._ElementUnicodeResult"], - tree.xpath("((//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1])/text()"), + Optional["Tag"], soup.find(("title", "h1", "h2", "h3"), string=True) ) - if title: - og["og:title"] = title[0].strip() + if title and title.string: + og["og:title"] = title.string.strip() else: og["og:title"] = None if "og:image" not in og: - # Cast: the type returned by xpath depends on the xpath expression: mypy can't deduce this. + # Check microdata for an image. meta_image = cast( - list["etree._ElementUnicodeResult"], - tree.xpath( - "//*/meta[translate(@itemprop, 'IMAGE', 'image')='image'][not(@content='')]/@content[1]" - ), + Optional["Tag"], + soup.find("meta", itemprop=re.compile("image", re.I), content=NON_BLANK), ) # If a meta image is found, use it. 
if meta_image: - og["og:image"] = meta_image[0] + og["og:image"] = cast(str, meta_image["content"]) else: # Try to find images which are larger than 10px by 10px. - # Cast: the type returned by xpath depends on the xpath expression: mypy can't deduce this. # # TODO: consider inlined CSS styles as well as width & height attribs images = cast( - list["etree._Element"], - tree.xpath("//img[@src][number(@width)>10][number(@height)>10]"), + list["Tag"], + soup.find_all("img", src=NON_BLANK, width=NON_BLANK, height=NON_BLANK), ) images = sorted( - images, + filter( + lambda tag: int(cast(str, tag["width"])) > 10 + and int(cast(str, tag["height"])) > 10, + images, + ), key=lambda i: ( - -1 * float(i.attrib["width"]) * float(i.attrib["height"]) + -1 * float(cast(str, i["width"])) * float(cast(str, i["height"])) ), ) # If no images were found, try to find *any* images. if not images: - # Cast: the type returned by xpath depends on the xpath expression: mypy can't deduce this. - images = cast(list["etree._Element"], tree.xpath("//img[@src][1]")) + images = soup.find_all("img", src=NON_BLANK, limit=1) if images: - og["og:image"] = cast(str, images[0].attrib["src"]) + og["og:image"] = cast(str, images[0]["src"]) # Finally, fallback to the favicon if nothing else. else: - # Cast: the type returned by xpath depends on the xpath expression: mypy can't deduce this. - favicons = cast( - list["etree._ElementUnicodeResult"], - tree.xpath("//link[@href][contains(@rel, 'icon')]/@href[1]"), - ) - if favicons: - og["og:image"] = favicons[0] + favicon = cast("Tag", soup.find("link", href=NON_BLANK, rel="icon")) + if favicon: + og["og:image"] = cast(str, favicon["href"]) if "og:description" not in og: # Check the first meta description tag for content. - # Cast: the type returned by xpath depends on the xpath expression: mypy can't deduce this. meta_description = cast( - list["etree._ElementUnicodeResult"], - tree.xpath( - "//*/meta[translate(@name, 'DESCRIPTION', 'description')='description'][not(@content='')]/@content[1]" + Optional["Tag"], + soup.find( + "meta", + attrs={"name": re.compile("description", re.I)}, + content=NON_BLANK, ), ) + # If a meta description is found with content, use it. if meta_description: - og["og:description"] = meta_description[0] + og["og:description"] = cast(str, meta_description["content"]) else: - og["og:description"] = parse_html_description(tree) + og["og:description"] = parse_html_description(soup) elif og["og:description"]: # This must be a non-empty string at this point. assert isinstance(og["og:description"], str) @@ -384,7 +271,7 @@ def parse_html_to_open_graph(tree: "etree._Element") -> dict[str, str | None]: return og -def parse_html_description(tree: "etree._Element") -> str | None: +def parse_html_description(soup: "BeautifulSoup") -> str | None: """ Calculate a text description based on an HTML document. @@ -397,14 +284,11 @@ def parse_html_description(tree: "etree._Element") -> str | None: This is a very very very coarse approximation to a plain text render of the page. Args: - tree: The parsed HTML document. + soup: The parsed HTML document. Returns: The plain text description, or None if one cannot be generated. """ - # We don't just use XPATH here as that is slow on some machines. - - from lxml import etree TAGS_TO_REMOVE = { "header", @@ -423,24 +307,27 @@ def parse_html_description(tree: "etree._Element") -> str | None: # etree.Comment is a function which creates an etree._Comment element. 
# The "tag" attribute of an etree._Comment instance is confusingly the # etree.Comment function instead of a string. - etree.Comment, + # etree.Comment, + # XXX } # Split all the text nodes into paragraphs (by splitting on new # lines) text_nodes = ( re.sub(r"\s+", "\n", el).strip() - for el in _iterate_over_text(tree.find("body"), TAGS_TO_REMOVE) + for el in _iterate_over_text( + cast(Optional["Tag"], soup.find("body")), TAGS_TO_REMOVE + ) ) return summarize_paragraphs(text_nodes) def _iterate_over_text( - tree: Optional["etree._Element"], - tags_to_ignore: set[object], + soup: Optional["Tag"], + tags_to_ignore: Iterable[str], stack_limit: int = 1024, ) -> Generator[str, None, None]: - """Iterate over the tree returning text nodes in a depth first fashion, + """Iterate over the document returning text nodes in a depth first fashion, skipping text nodes inside certain tags. Args: @@ -452,43 +339,27 @@ def _iterate_over_text( Intended to limit the maximum working memory when generating a preview. """ - if tree is None: + if not soup: return - # This is a stack whose items are elements to iterate over *or* strings - # to be returned. - elements: list[str | "etree._Element"] = [tree] - while elements: - el = elements.pop() - - if isinstance(el, str): - yield el - elif el.tag not in tags_to_ignore: - # If the element isn't meant for display, ignore it. - if el.get("role") in ARIA_ROLES_TO_IGNORE: - continue - - # el.text is the text before the first child, so we can immediately - # return it if the text exists. - if el.text: - yield el.text + from bs4.element import NavigableString, Tag - # We add to the stack all the element's children, interspersed with - # each child's tail text (if it exists). - # - # We iterate in reverse order so that earlier pieces of text appear - # closer to the top of the stack. - for child in el.iterchildren(reversed=True): - if len(elements) > stack_limit: - # We've hit our limit for working memory - break - - if child.tail: - # The tail text of a node is text that comes *after* the node, - # so we always include it even if we ignore the child node. - elements.append(child.tail) - - elements.append(child) + # This is basically a stack that we extend using itertools.chain. + # This will either consist of an element to iterate over *or* a string + # to be returned. + elements: Iterator["PageElement"] = iter([soup]) + while True: + el = next(elements, None) + if el is None: + return + + # Do not consider sub-classes of NavigableString since those represent + # comments, etc. + if type(el) == NavigableString: # noqa: E721 + yield str(el) + elif isinstance(el, Tag) and el.name not in tags_to_ignore: + # We add to the stack all the element's children. 
+ elements = itertools.chain(el.contents, elements) def summarize_paragraphs( diff --git a/synapse/media/url_previewer.py b/synapse/media/url_previewer.py index 2c5e518918b..a1f941ea5b3 100644 --- a/synapse/media/url_previewer.py +++ b/synapse/media/url_previewer.py @@ -294,16 +294,16 @@ async def _do_preview(self, url: str, user: UserID, ts: int) -> bytes: # define our OG response for this media elif _is_html(media_info.media_type): - # TODO: somehow stop a big HTML tree from exploding synapse's RAM + # TODO: somehow stop a big HTML document from exploding synapse's RAM with open(media_info.filename, "rb") as file: body = file.read() - tree = decode_body(body, media_info.uri, media_info.media_type) - if tree is not None: + soup = decode_body(body, media_info.uri) + if soup is not None: # Check if this HTML document points to oEmbed information and # defer to that. - oembed_url = self._oembed.autodiscover_from_html(tree) + oembed_url = self._oembed.autodiscover_from_html(soup) og_from_oembed: JsonDict = {} # Only download to the oEmbed URL if it is allowed. if oembed_url: @@ -329,7 +329,7 @@ async def _do_preview(self, url: str, user: UserID, ts: int) -> bytes: # Parse Open Graph information from the HTML in case the oEmbed # response failed or is incomplete. - og_from_html = parse_html_to_open_graph(tree) + og_from_html = parse_html_to_open_graph(soup) # Compile the Open Graph response by using the scraped # information from the HTML and overlaying any information diff --git a/tests/media/test_html_preview.py b/tests/media/test_html_preview.py index d3f1e8833a7..e33b80739ea 100644 --- a/tests/media/test_html_preview.py +++ b/tests/media/test_html_preview.py @@ -20,7 +20,6 @@ # from synapse.media.preview_html import ( - _get_html_media_encodings, decode_body, parse_html_to_open_graph, summarize_paragraphs, @@ -166,9 +165,9 @@ def test_simple(self) -> None: """ - tree = decode_body(html, "http://example.com/test.html") - assert tree is not None - og = parse_html_to_open_graph(tree) + soup = decode_body(html, "http://example.com/test.html") + assert soup is not None + og = parse_html_to_open_graph(soup) self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."}) @@ -183,9 +182,9 @@ def test_comment(self) -> None: """ - tree = decode_body(html, "http://example.com/test.html") - assert tree is not None - og = parse_html_to_open_graph(tree) + soup = decode_body(html, "http://example.com/test.html") + assert soup is not None + og = parse_html_to_open_graph(soup) self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."}) @@ -203,9 +202,9 @@ def test_comment2(self) -> None: """ - tree = decode_body(html, "http://example.com/test.html") - assert tree is not None - og = parse_html_to_open_graph(tree) + soup = decode_body(html, "http://example.com/test.html") + assert soup is not None + og = parse_html_to_open_graph(soup) self.assertEqual( og, @@ -226,9 +225,9 @@ def test_script(self) -> None: """ - tree = decode_body(html, "http://example.com/test.html") - assert tree is not None - og = parse_html_to_open_graph(tree) + soup = decode_body(html, "http://example.com/test.html") + assert soup is not None + og = parse_html_to_open_graph(soup) self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."}) @@ -241,9 +240,9 @@ def test_missing_title(self) -> None: """ - tree = decode_body(html, "http://example.com/test.html") - assert tree is not None - og = parse_html_to_open_graph(tree) + soup = decode_body(html, "http://example.com/test.html") + assert soup is not 
None + og = parse_html_to_open_graph(soup) self.assertEqual(og, {"og:title": None, "og:description": "Some text."}) @@ -273,9 +272,9 @@ def test_h1_as_title(self) -> None: """ - tree = decode_body(html, "http://example.com/test.html") - assert tree is not None - og = parse_html_to_open_graph(tree) + soup = decode_body(html, "http://example.com/test.html") + assert soup is not None + og = parse_html_to_open_graph(soup) self.assertEqual(og, {"og:title": "Title", "og:description": "Some text."}) @@ -310,23 +309,17 @@ def test_missing_title_and_broken_h1(self) -> None: """ - tree = decode_body(html, "http://example.com/test.html") - assert tree is not None - og = parse_html_to_open_graph(tree) + soup = decode_body(html, "http://example.com/test.html") + assert soup is not None + og = parse_html_to_open_graph(soup) self.assertEqual(og, {"og:title": None, "og:description": "Some text."}) def test_empty(self) -> None: """Test a body with no data in it.""" html = b"" - tree = decode_body(html, "http://example.com/test.html") - self.assertIsNone(tree) - - def test_no_tree(self) -> None: - """A valid body with no tree in it.""" - html = b"\x00" - tree = decode_body(html, "http://example.com/test.html") - self.assertIsNone(tree) + soup = decode_body(html, "http://example.com/test.html") + self.assertIsNone(soup) def test_xml(self) -> None: """Test decoding XML and ensure it works properly.""" @@ -339,24 +332,9 @@ def test_xml(self) -> None: FooSome text. """.strip() - tree = decode_body(html, "http://example.com/test.html") - assert tree is not None - og = parse_html_to_open_graph(tree) - self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."}) - - def test_invalid_encoding(self) -> None: - """An invalid character encoding should be ignored and treated as UTF-8, if possible.""" - html = b""" - - Foo - - Some text. - - - """ - tree = decode_body(html, "http://example.com/test.html", "invalid-encoding") - assert tree is not None - og = parse_html_to_open_graph(tree) + soup = decode_body(html, "http://example.com/test.html") + assert soup is not None + og = parse_html_to_open_graph(soup) self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."}) def test_invalid_encoding2(self) -> None: @@ -370,10 +348,10 @@ def test_invalid_encoding2(self) -> None: """ - tree = decode_body(html, "http://example.com/test.html") - assert tree is not None - og = parse_html_to_open_graph(tree) - self.assertEqual(og, {"og:title": "ÿÿ Foo", "og:description": "Some text."}) + soup = decode_body(html, "http://example.com/test.html") + assert soup is not None + og = parse_html_to_open_graph(soup) + self.assertEqual(og, {"og:title": "˙˙ Foo", "og:description": "Some text."}) def test_windows_1252(self) -> None: """A body which uses cp1252, but doesn't declare that.""" @@ -385,10 +363,71 @@ def test_windows_1252(self) -> None: """ - tree = decode_body(html, "http://example.com/test.html") - assert tree is not None - og = parse_html_to_open_graph(tree) - self.assertEqual(og, {"og:title": "ó", "og:description": "Some text."}) + soup = decode_body(html, "http://example.com/test.html") + assert soup is not None + og = parse_html_to_open_graph(soup) + self.assertEqual(og, {"og:title": "�", "og:description": "Some text."}) + + def test_image(self) -> None: + """Test the spots an image can be pulled from .""" + # Ordered listed of tags, we'll pop off the top and keep testing. 
+ tags = [ + ( + b"""""", + "meta-prop", + ), + ( + b"""""", + "meta-IMAGE", + ), + ( + b"""""", + "meta-image", + ), + (b"""""", "img"), + ( + b"""""", + "img", + ), + ( + b"""""", + "img", + ), + ( + b"""""", + "img", + ), + ( + b"""""", + "img", + ), + # Put this image again since if it is the *only* image it will be used. + ( + b"""""", + "img-no-width-no-height", + ), + ( + b"""""", + "favicon", + ), + ] + + while tags: + html = b"" + b"".join(t[0] for t in tags) + b"" + tree = decode_body(html, "http://example.com/test.html") + assert tree is not None + og = parse_html_to_open_graph(tree) + self.assertEqual( + og, + { + "og:title": None, + "og:description": None, + "og:image": f"https://example.com/{tags[0][1]}.png", + }, + ) + + # Remove the highest remaining priority item. + tags.pop(0) def test_twitter_tag(self) -> None: """Twitter card tags should be used if nothing else is available.""" @@ -397,6 +436,7 @@ def test_twitter_tag(self) -> None: + """ tree = decode_body(html, "http://example.com/test.html") @@ -408,6 +448,7 @@ def test_twitter_tag(self) -> None: "og:title": None, "og:description": "Description", "og:site_name": "@matrixdotorg", + "og:image": "https://example.com/test.png", }, ) @@ -419,6 +460,8 @@ def test_twitter_tag(self) -> None: + + """ tree = decode_body(html, "http://example.com/test.html") @@ -430,6 +473,7 @@ def test_twitter_tag(self) -> None: "og:title": None, "og:description": "Real Description", "og:site_name": "matrix.org", + "og:image": "https://example.com/good.png", }, ) @@ -450,116 +494,3 @@ def test_nested_nodes(self) -> None: "og:description": "Welcome\n\nthe bold\n\nand underlined text\n\nand\n\nsome\n\ntail text", }, ) - - -class MediaEncodingTestCase(unittest.TestCase): - def test_meta_charset(self) -> None: - """A character encoding is found via the meta tag.""" - encodings = _get_html_media_encodings( - b""" - - - - - """, - "text/html", - ) - self.assertEqual(list(encodings), ["ascii", "utf-8", "cp1252"]) - - # A less well-formed version. - encodings = _get_html_media_encodings( - b""" - - < meta charset = ascii> - - - """, - "text/html", - ) - self.assertEqual(list(encodings), ["ascii", "utf-8", "cp1252"]) - - def test_meta_charset_underscores(self) -> None: - """A character encoding contains underscore.""" - encodings = _get_html_media_encodings( - b""" - - - - - """, - "text/html", - ) - self.assertEqual(list(encodings), ["shift_jis", "utf-8", "cp1252"]) - - def test_xml_encoding(self) -> None: - """A character encoding is found via the meta tag.""" - encodings = _get_html_media_encodings( - b""" - - - - """, - "text/html", - ) - self.assertEqual(list(encodings), ["ascii", "utf-8", "cp1252"]) - - def test_meta_xml_encoding(self) -> None: - """Meta tags take precedence over XML encoding.""" - encodings = _get_html_media_encodings( - b""" - - - - - - """, - "text/html", - ) - self.assertEqual(list(encodings), ["utf-16", "ascii", "utf-8", "cp1252"]) - - def test_content_type(self) -> None: - """A character encoding is found via the Content-Type header.""" - # Test a few variations of the header. 
- headers = ( - 'text/html; charset="ascii";', - "text/html;charset=ascii;", - 'text/html; charset="ascii"', - "text/html; charset=ascii", - 'text/html; charset="ascii;', - 'text/html; charset=ascii";', - ) - for header in headers: - encodings = _get_html_media_encodings(b"", header) - self.assertEqual(list(encodings), ["ascii", "utf-8", "cp1252"]) - - def test_fallback(self) -> None: - """A character encoding cannot be found in the body or header.""" - encodings = _get_html_media_encodings(b"", "text/html") - self.assertEqual(list(encodings), ["utf-8", "cp1252"]) - - def test_duplicates(self) -> None: - """Ensure each encoding is only attempted once.""" - encodings = _get_html_media_encodings( - b""" - - - - - - """, - 'text/html; charset="UTF_8"', - ) - self.assertEqual(list(encodings), ["utf-8", "cp1252"]) - - def test_unknown_invalid(self) -> None: - """A character encoding should be ignored if it is unknown or invalid.""" - encodings = _get_html_media_encodings( - b""" - - - - - """, - 'text/html; charset="invalid"', - ) - self.assertEqual(list(encodings), ["utf-8", "cp1252"]) diff --git a/tests/rest/client/test_media.py b/tests/rest/client/test_media.py index ec81b1413c2..ce006eaa75c 100644 --- a/tests/rest/client/test_media.py +++ b/tests/rest/client/test_media.py @@ -495,7 +495,7 @@ def test_non_ascii_preview_content_type(self) -> None: self.pump() self.assertEqual(channel.code, 200) - self.assertEqual(channel.json_body["og:title"], "\u0434\u043a\u0430") + self.assertEqual(channel.json_body["og:title"], "‰Íý") def test_overlong_title(self) -> None: self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")] diff --git a/tests/rest/media/test_url_preview.py b/tests/rest/media/test_url_preview.py index 32e78fc12a6..683d646942b 100644 --- a/tests/rest/media/test_url_preview.py +++ b/tests/rest/media/test_url_preview.py @@ -367,7 +367,7 @@ def test_non_ascii_preview_content_type(self) -> None: self.pump() self.assertEqual(channel.code, 200) - self.assertEqual(channel.json_body["og:title"], "\u0434\u043a\u0430") + self.assertEqual(channel.json_body["og:title"], "���") def test_overlong_title(self) -> None: self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")] From 8e9e3339bd831455db39da8a4a89e2e60437aff0 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Wed, 10 Dec 2025 10:56:44 -0500 Subject: [PATCH 2/6] Dont use lxml --- poetry.lock | 178 +-------------------------- pyproject.toml | 4 +- synapse/media/preview_html.py | 19 +-- tests/media/test_html_preview.py | 2 +- tests/rest/media/test_url_preview.py | 2 +- 5 files changed, 11 insertions(+), 194 deletions(-) diff --git a/poetry.lock b/poetry.lock index 62080647c09..faf6ed91d20 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1136,178 +1136,6 @@ files = [ {file = "librt-0.6.3.tar.gz", hash = "sha256:c724a884e642aa2bbad52bb0203ea40406ad742368a5f90da1b220e970384aae"}, ] -[[package]] -name = "lxml" -version = "6.0.2" -description = "Powerful and Pythonic XML processing library combining libxml2/libxslt with the ElementTree API." 
-optional = true -python-versions = ">=3.8" -groups = ["main"] -markers = "extra == \"url-preview\" or extra == \"all\"" -files = [ - {file = "lxml-6.0.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:e77dd455b9a16bbd2a5036a63ddbd479c19572af81b624e79ef422f929eef388"}, - {file = "lxml-6.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:5d444858b9f07cefff6455b983aea9a67f7462ba1f6cbe4a21e8bf6791bf2153"}, - {file = "lxml-6.0.2-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:f952dacaa552f3bb8834908dddd500ba7d508e6ea6eb8c52eb2d28f48ca06a31"}, - {file = "lxml-6.0.2-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:71695772df6acea9f3c0e59e44ba8ac50c4f125217e84aab21074a1a55e7e5c9"}, - {file = "lxml-6.0.2-cp310-cp310-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:17f68764f35fd78d7c4cc4ef209a184c38b65440378013d24b8aecd327c3e0c8"}, - {file = "lxml-6.0.2-cp310-cp310-manylinux_2_26_i686.manylinux_2_28_i686.whl", hash = "sha256:058027e261afed589eddcfe530fcc6f3402d7fd7e89bfd0532df82ebc1563dba"}, - {file = "lxml-6.0.2-cp310-cp310-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a8ffaeec5dfea5881d4c9d8913a32d10cfe3923495386106e4a24d45300ef79c"}, - {file = "lxml-6.0.2-cp310-cp310-manylinux_2_31_armv7l.whl", hash = "sha256:f2e3b1a6bb38de0bc713edd4d612969dd250ca8b724be8d460001a387507021c"}, - {file = "lxml-6.0.2-cp310-cp310-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:d6690ec5ec1cce0385cb20896b16be35247ac8c2046e493d03232f1c2414d321"}, - {file = "lxml-6.0.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:f2a50c3c1d11cad0ebebbac357a97b26aa79d2bcaf46f256551152aa85d3a4d1"}, - {file = "lxml-6.0.2-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:3efe1b21c7801ffa29a1112fab3b0f643628c30472d507f39544fd48e9549e34"}, - {file = "lxml-6.0.2-cp310-cp310-musllinux_1_2_riscv64.whl", hash = "sha256:59c45e125140b2c4b33920d21d83681940ca29f0b83f8629ea1a2196dc8cfe6a"}, - {file = "lxml-6.0.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:452b899faa64f1805943ec1c0c9ebeaece01a1af83e130b69cdefeda180bb42c"}, - {file = "lxml-6.0.2-cp310-cp310-win32.whl", hash = "sha256:1e786a464c191ca43b133906c6903a7e4d56bef376b75d97ccbb8ec5cf1f0a4b"}, - {file = "lxml-6.0.2-cp310-cp310-win_amd64.whl", hash = "sha256:dacf3c64ef3f7440e3167aa4b49aa9e0fb99e0aa4f9ff03795640bf94531bcb0"}, - {file = "lxml-6.0.2-cp310-cp310-win_arm64.whl", hash = "sha256:45f93e6f75123f88d7f0cfd90f2d05f441b808562bf0bc01070a00f53f5028b5"}, - {file = "lxml-6.0.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:13e35cbc684aadf05d8711a5d1b5857c92e5e580efa9a0d2be197199c8def607"}, - {file = "lxml-6.0.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3b1675e096e17c6fe9c0e8c81434f5736c0739ff9ac6123c87c2d452f48fc938"}, - {file = "lxml-6.0.2-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:8ac6e5811ae2870953390452e3476694196f98d447573234592d30488147404d"}, - {file = "lxml-6.0.2-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5aa0fc67ae19d7a64c3fe725dc9a1bb11f80e01f78289d05c6f62545affec438"}, - {file = "lxml-6.0.2-cp311-cp311-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:de496365750cc472b4e7902a485d3f152ecf57bd3ba03ddd5578ed8ceb4c5964"}, - {file = "lxml-6.0.2-cp311-cp311-manylinux_2_26_i686.manylinux_2_28_i686.whl", hash = "sha256:200069a593c5e40b8f6fc0d84d86d970ba43138c3e68619ffa234bc9bb806a4d"}, - {file = 
"lxml-6.0.2-cp311-cp311-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7d2de809c2ee3b888b59f995625385f74629707c9355e0ff856445cdcae682b7"}, - {file = "lxml-6.0.2-cp311-cp311-manylinux_2_31_armv7l.whl", hash = "sha256:b2c3da8d93cf5db60e8858c17684c47d01fee6405e554fb55018dd85fc23b178"}, - {file = "lxml-6.0.2-cp311-cp311-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:442de7530296ef5e188373a1ea5789a46ce90c4847e597856570439621d9c553"}, - {file = "lxml-6.0.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:2593c77efde7bfea7f6389f1ab249b15ed4aa5bc5cb5131faa3b843c429fbedb"}, - {file = "lxml-6.0.2-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:3e3cb08855967a20f553ff32d147e14329b3ae70ced6edc2f282b94afbc74b2a"}, - {file = "lxml-6.0.2-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:2ed6c667fcbb8c19c6791bbf40b7268ef8ddf5a96940ba9404b9f9a304832f6c"}, - {file = "lxml-6.0.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:b8f18914faec94132e5b91e69d76a5c1d7b0c73e2489ea8929c4aaa10b76bbf7"}, - {file = "lxml-6.0.2-cp311-cp311-win32.whl", hash = "sha256:6605c604e6daa9e0d7f0a2137bdc47a2e93b59c60a65466353e37f8272f47c46"}, - {file = "lxml-6.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:e5867f2651016a3afd8dd2c8238baa66f1e2802f44bc17e236f547ace6647078"}, - {file = "lxml-6.0.2-cp311-cp311-win_arm64.whl", hash = "sha256:4197fb2534ee05fd3e7afaab5d8bfd6c2e186f65ea7f9cd6a82809c887bd1285"}, - {file = "lxml-6.0.2-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:a59f5448ba2ceccd06995c95ea59a7674a10de0810f2ce90c9006f3cbc044456"}, - {file = "lxml-6.0.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:e8113639f3296706fbac34a30813929e29247718e88173ad849f57ca59754924"}, - {file = "lxml-6.0.2-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:a8bef9b9825fa8bc816a6e641bb67219489229ebc648be422af695f6e7a4fa7f"}, - {file = "lxml-6.0.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:65ea18d710fd14e0186c2f973dc60bb52039a275f82d3c44a0e42b43440ea534"}, - {file = "lxml-6.0.2-cp312-cp312-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c371aa98126a0d4c739ca93ceffa0fd7a5d732e3ac66a46e74339acd4d334564"}, - {file = "lxml-6.0.2-cp312-cp312-manylinux_2_26_i686.manylinux_2_28_i686.whl", hash = "sha256:700efd30c0fa1a3581d80a748157397559396090a51d306ea59a70020223d16f"}, - {file = "lxml-6.0.2-cp312-cp312-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c33e66d44fe60e72397b487ee92e01da0d09ba2d66df8eae42d77b6d06e5eba0"}, - {file = "lxml-6.0.2-cp312-cp312-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:90a345bbeaf9d0587a3aaffb7006aa39ccb6ff0e96a57286c0cb2fd1520ea192"}, - {file = "lxml-6.0.2-cp312-cp312-manylinux_2_31_armv7l.whl", hash = "sha256:064fdadaf7a21af3ed1dcaa106b854077fbeada827c18f72aec9346847cd65d0"}, - {file = "lxml-6.0.2-cp312-cp312-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:fbc74f42c3525ac4ffa4b89cbdd00057b6196bcefe8bce794abd42d33a018092"}, - {file = "lxml-6.0.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6ddff43f702905a4e32bc24f3f2e2edfe0f8fde3277d481bffb709a4cced7a1f"}, - {file = "lxml-6.0.2-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:6da5185951d72e6f5352166e3da7b0dc27aa70bd1090b0eb3f7f7212b53f1bb8"}, - {file = "lxml-6.0.2-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:57a86e1ebb4020a38d295c04fc79603c7899e0df71588043eb218722dabc087f"}, - {file = "lxml-6.0.2-cp312-cp312-musllinux_1_2_riscv64.whl", hash = 
"sha256:2047d8234fe735ab77802ce5f2297e410ff40f5238aec569ad7c8e163d7b19a6"}, - {file = "lxml-6.0.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:6f91fd2b2ea15a6800c8e24418c0775a1694eefc011392da73bc6cef2623b322"}, - {file = "lxml-6.0.2-cp312-cp312-win32.whl", hash = "sha256:3ae2ce7d6fedfb3414a2b6c5e20b249c4c607f72cb8d2bb7cc9c6ec7c6f4e849"}, - {file = "lxml-6.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:72c87e5ee4e58a8354fb9c7c84cbf95a1c8236c127a5d1b7683f04bed8361e1f"}, - {file = "lxml-6.0.2-cp312-cp312-win_arm64.whl", hash = "sha256:61cb10eeb95570153e0c0e554f58df92ecf5109f75eacad4a95baa709e26c3d6"}, - {file = "lxml-6.0.2-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:9b33d21594afab46f37ae58dfadd06636f154923c4e8a4d754b0127554eb2e77"}, - {file = "lxml-6.0.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:6c8963287d7a4c5c9a432ff487c52e9c5618667179c18a204bdedb27310f022f"}, - {file = "lxml-6.0.2-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:1941354d92699fb5ffe6ed7b32f9649e43c2feb4b97205f75866f7d21aa91452"}, - {file = "lxml-6.0.2-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:bb2f6ca0ae2d983ded09357b84af659c954722bbf04dea98030064996d156048"}, - {file = "lxml-6.0.2-cp313-cp313-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:eb2a12d704f180a902d7fa778c6d71f36ceb7b0d317f34cdc76a5d05aa1dd1df"}, - {file = "lxml-6.0.2-cp313-cp313-manylinux_2_26_i686.manylinux_2_28_i686.whl", hash = "sha256:6ec0e3f745021bfed19c456647f0298d60a24c9ff86d9d051f52b509663feeb1"}, - {file = "lxml-6.0.2-cp313-cp313-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:846ae9a12d54e368933b9759052d6206a9e8b250291109c48e350c1f1f49d916"}, - {file = "lxml-6.0.2-cp313-cp313-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ef9266d2aa545d7374938fb5c484531ef5a2ec7f2d573e62f8ce722c735685fd"}, - {file = "lxml-6.0.2-cp313-cp313-manylinux_2_31_armv7l.whl", hash = "sha256:4077b7c79f31755df33b795dc12119cb557a0106bfdab0d2c2d97bd3cf3dffa6"}, - {file = "lxml-6.0.2-cp313-cp313-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:a7c5d5e5f1081955358533be077166ee97ed2571d6a66bdba6ec2f609a715d1a"}, - {file = "lxml-6.0.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:8f8d0cbd0674ee89863a523e6994ac25fd5be9c8486acfc3e5ccea679bad2679"}, - {file = "lxml-6.0.2-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:2cbcbf6d6e924c28f04a43f3b6f6e272312a090f269eff68a2982e13e5d57659"}, - {file = "lxml-6.0.2-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:dfb874cfa53340009af6bdd7e54ebc0d21012a60a4e65d927c2e477112e63484"}, - {file = "lxml-6.0.2-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:fb8dae0b6b8b7f9e96c26fdd8121522ce5de9bb5538010870bd538683d30e9a2"}, - {file = "lxml-6.0.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:358d9adae670b63e95bc59747c72f4dc97c9ec58881d4627fe0120da0f90d314"}, - {file = "lxml-6.0.2-cp313-cp313-win32.whl", hash = "sha256:e8cd2415f372e7e5a789d743d133ae474290a90b9023197fd78f32e2dc6873e2"}, - {file = "lxml-6.0.2-cp313-cp313-win_amd64.whl", hash = "sha256:b30d46379644fbfc3ab81f8f82ae4de55179414651f110a1514f0b1f8f6cb2d7"}, - {file = "lxml-6.0.2-cp313-cp313-win_arm64.whl", hash = "sha256:13dcecc9946dca97b11b7c40d29fba63b55ab4170d3c0cf8c0c164343b9bfdcf"}, - {file = "lxml-6.0.2-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:b0c732aa23de8f8aec23f4b580d1e52905ef468afb4abeafd3fec77042abb6fe"}, - {file = "lxml-6.0.2-cp314-cp314-macosx_10_13_x86_64.whl", hash = 
"sha256:4468e3b83e10e0317a89a33d28f7aeba1caa4d1a6fd457d115dd4ffe90c5931d"}, - {file = "lxml-6.0.2-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:abd44571493973bad4598a3be7e1d807ed45aa2adaf7ab92ab7c62609569b17d"}, - {file = "lxml-6.0.2-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:370cd78d5855cfbffd57c422851f7d3864e6ae72d0da615fca4dad8c45d375a5"}, - {file = "lxml-6.0.2-cp314-cp314-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:901e3b4219fa04ef766885fb40fa516a71662a4c61b80c94d25336b4934b71c0"}, - {file = "lxml-6.0.2-cp314-cp314-manylinux_2_26_i686.manylinux_2_28_i686.whl", hash = "sha256:a4bf42d2e4cf52c28cc1812d62426b9503cdb0c87a6de81442626aa7d69707ba"}, - {file = "lxml-6.0.2-cp314-cp314-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:b2c7fdaa4d7c3d886a42534adec7cfac73860b89b4e5298752f60aa5984641a0"}, - {file = "lxml-6.0.2-cp314-cp314-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:98a5e1660dc7de2200b00d53fa00bcd3c35a3608c305d45a7bbcaf29fa16e83d"}, - {file = "lxml-6.0.2-cp314-cp314-manylinux_2_31_armv7l.whl", hash = "sha256:dc051506c30b609238d79eda75ee9cab3e520570ec8219844a72a46020901e37"}, - {file = "lxml-6.0.2-cp314-cp314-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:8799481bbdd212470d17513a54d568f44416db01250f49449647b5ab5b5dccb9"}, - {file = "lxml-6.0.2-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:9261bb77c2dab42f3ecd9103951aeca2c40277701eb7e912c545c1b16e0e4917"}, - {file = "lxml-6.0.2-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:65ac4a01aba353cfa6d5725b95d7aed6356ddc0a3cd734de00124d285b04b64f"}, - {file = "lxml-6.0.2-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:b22a07cbb82fea98f8a2fd814f3d1811ff9ed76d0fc6abc84eb21527596e7cc8"}, - {file = "lxml-6.0.2-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:d759cdd7f3e055d6bc8d9bec3ad905227b2e4c785dc16c372eb5b5e83123f48a"}, - {file = "lxml-6.0.2-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:945da35a48d193d27c188037a05fec5492937f66fb1958c24fc761fb9d40d43c"}, - {file = "lxml-6.0.2-cp314-cp314-win32.whl", hash = "sha256:be3aaa60da67e6153eb15715cc2e19091af5dc75faef8b8a585aea372507384b"}, - {file = "lxml-6.0.2-cp314-cp314-win_amd64.whl", hash = "sha256:fa25afbadead523f7001caf0c2382afd272c315a033a7b06336da2637d92d6ed"}, - {file = "lxml-6.0.2-cp314-cp314-win_arm64.whl", hash = "sha256:063eccf89df5b24e361b123e257e437f9e9878f425ee9aae3144c77faf6da6d8"}, - {file = "lxml-6.0.2-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:6162a86d86893d63084faaf4ff937b3daea233e3682fb4474db07395794fa80d"}, - {file = "lxml-6.0.2-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:414aaa94e974e23a3e92e7ca5b97d10c0cf37b6481f50911032c69eeb3991bba"}, - {file = "lxml-6.0.2-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:48461bd21625458dd01e14e2c38dd0aea69addc3c4f960c30d9f59d7f93be601"}, - {file = "lxml-6.0.2-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:25fcc59afc57d527cfc78a58f40ab4c9b8fd096a9a3f964d2781ffb6eb33f4ed"}, - {file = "lxml-6.0.2-cp314-cp314t-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5179c60288204e6ddde3f774a93350177e08876eaf3ab78aa3a3649d43eb7d37"}, - {file = "lxml-6.0.2-cp314-cp314t-manylinux_2_26_i686.manylinux_2_28_i686.whl", hash = "sha256:967aab75434de148ec80597b75062d8123cadf2943fb4281f385141e18b21338"}, - {file = 
"lxml-6.0.2-cp314-cp314t-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:d100fcc8930d697c6561156c6810ab4a508fb264c8b6779e6e61e2ed5e7558f9"}, - {file = "lxml-6.0.2-cp314-cp314t-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2ca59e7e13e5981175b8b3e4ab84d7da57993eeff53c07764dcebda0d0e64ecd"}, - {file = "lxml-6.0.2-cp314-cp314t-manylinux_2_31_armv7l.whl", hash = "sha256:957448ac63a42e2e49531b9d6c0fa449a1970dbc32467aaad46f11545be9af1d"}, - {file = "lxml-6.0.2-cp314-cp314t-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:b7fc49c37f1786284b12af63152fe1d0990722497e2d5817acfe7a877522f9a9"}, - {file = "lxml-6.0.2-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:e19e0643cc936a22e837f79d01a550678da8377d7d801a14487c10c34ee49c7e"}, - {file = "lxml-6.0.2-cp314-cp314t-musllinux_1_2_armv7l.whl", hash = "sha256:1db01e5cf14345628e0cbe71067204db658e2fb8e51e7f33631f5f4735fefd8d"}, - {file = "lxml-6.0.2-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:875c6b5ab39ad5291588aed6925fac99d0097af0dd62f33c7b43736043d4a2ec"}, - {file = "lxml-6.0.2-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:cdcbed9ad19da81c480dfd6dd161886db6096083c9938ead313d94b30aadf272"}, - {file = "lxml-6.0.2-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:80dadc234ebc532e09be1975ff538d154a7fa61ea5031c03d25178855544728f"}, - {file = "lxml-6.0.2-cp314-cp314t-win32.whl", hash = "sha256:da08e7bb297b04e893d91087df19638dc7a6bb858a954b0cc2b9f5053c922312"}, - {file = "lxml-6.0.2-cp314-cp314t-win_amd64.whl", hash = "sha256:252a22982dca42f6155125ac76d3432e548a7625d56f5a273ee78a5057216eca"}, - {file = "lxml-6.0.2-cp314-cp314t-win_arm64.whl", hash = "sha256:bb4c1847b303835d89d785a18801a883436cdfd5dc3d62947f9c49e24f0f5a2c"}, - {file = "lxml-6.0.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:a656ca105115f6b766bba324f23a67914d9c728dafec57638e2b92a9dcd76c62"}, - {file = "lxml-6.0.2-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:c54d83a2188a10ebdba573f16bd97135d06c9ef60c3dc495315c7a28c80a263f"}, - {file = "lxml-6.0.2-cp38-cp38-manylinux_2_26_i686.manylinux_2_28_i686.whl", hash = "sha256:1ea99340b3c729beea786f78c38f60f4795622f36e305d9c9be402201efdc3b7"}, - {file = "lxml-6.0.2-cp38-cp38-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:af85529ae8d2a453feee4c780d9406a5e3b17cee0dd75c18bd31adcd584debc3"}, - {file = "lxml-6.0.2-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:fe659f6b5d10fb5a17f00a50eb903eb277a71ee35df4615db573c069bcf967ac"}, - {file = "lxml-6.0.2-cp38-cp38-win32.whl", hash = "sha256:5921d924aa5468c939d95c9814fa9f9b5935a6ff4e679e26aaf2951f74043512"}, - {file = "lxml-6.0.2-cp38-cp38-win_amd64.whl", hash = "sha256:0aa7070978f893954008ab73bb9e3c24a7c56c054e00566a21b553dc18105fca"}, - {file = "lxml-6.0.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:2c8458c2cdd29589a8367c09c8f030f1d202be673f0ca224ec18590b3b9fb694"}, - {file = "lxml-6.0.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:3fee0851639d06276e6b387f1c190eb9d7f06f7f53514e966b26bae46481ec90"}, - {file = "lxml-6.0.2-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:b2142a376b40b6736dfc214fd2902409e9e3857eff554fed2d3c60f097e62a62"}, - {file = "lxml-6.0.2-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:a6b5b39cc7e2998f968f05309e666103b53e2edd01df8dc51b90d734c0825444"}, - {file = "lxml-6.0.2-cp39-cp39-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = 
"sha256:d4aec24d6b72ee457ec665344a29acb2d35937d5192faebe429ea02633151aad"}, - {file = "lxml-6.0.2-cp39-cp39-manylinux_2_26_i686.manylinux_2_28_i686.whl", hash = "sha256:b42f4d86b451c2f9d06ffb4f8bbc776e04df3ba070b9fe2657804b1b40277c48"}, - {file = "lxml-6.0.2-cp39-cp39-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6cdaefac66e8b8f30e37a9b4768a391e1f8a16a7526d5bc77a7928408ef68e93"}, - {file = "lxml-6.0.2-cp39-cp39-manylinux_2_31_armv7l.whl", hash = "sha256:b738f7e648735714bbb82bdfd030203360cfeab7f6e8a34772b3c8c8b820568c"}, - {file = "lxml-6.0.2-cp39-cp39-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:daf42de090d59db025af61ce6bdb2521f0f102ea0e6ea310f13c17610a97da4c"}, - {file = "lxml-6.0.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:66328dabea70b5ba7e53d94aa774b733cf66686535f3bc9250a7aab53a91caaf"}, - {file = "lxml-6.0.2-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:e237b807d68a61fc3b1e845407e27e5eb8ef69bc93fe8505337c1acb4ee300b6"}, - {file = "lxml-6.0.2-cp39-cp39-musllinux_1_2_riscv64.whl", hash = "sha256:ac02dc29fd397608f8eb15ac1610ae2f2f0154b03f631e6d724d9e2ad4ee2c84"}, - {file = "lxml-6.0.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:817ef43a0c0b4a77bd166dc9a09a555394105ff3374777ad41f453526e37f9cb"}, - {file = "lxml-6.0.2-cp39-cp39-win32.whl", hash = "sha256:bc532422ff26b304cfb62b328826bd995c96154ffd2bac4544f37dbb95ecaa8f"}, - {file = "lxml-6.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:995e783eb0374c120f528f807443ad5a83a656a8624c467ea73781fc5f8a8304"}, - {file = "lxml-6.0.2-cp39-cp39-win_arm64.whl", hash = "sha256:08b9d5e803c2e4725ae9e8559ee880e5328ed61aa0935244e0515d7d9dbec0aa"}, - {file = "lxml-6.0.2-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:e748d4cf8fef2526bb2a589a417eba0c8674e29ffcb570ce2ceca44f1e567bf6"}, - {file = "lxml-6.0.2-pp310-pypy310_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:4ddb1049fa0579d0cbd00503ad8c58b9ab34d1254c77bc6a5576d96ec7853dba"}, - {file = "lxml-6.0.2-pp310-pypy310_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:cb233f9c95f83707dae461b12b720c1af9c28c2d19208e1be03387222151daf5"}, - {file = "lxml-6.0.2-pp310-pypy310_pp73-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:bc456d04db0515ce3320d714a1eac7a97774ff0849e7718b492d957da4631dd4"}, - {file = "lxml-6.0.2-pp310-pypy310_pp73-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2613e67de13d619fd283d58bda40bff0ee07739f624ffee8b13b631abf33083d"}, - {file = "lxml-6.0.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:24a8e756c982c001ca8d59e87c80c4d9dcd4d9b44a4cbeb8d9be4482c514d41d"}, - {file = "lxml-6.0.2-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:1c06035eafa8404b5cf475bb37a9f6088b0aca288d4ccc9d69389750d5543700"}, - {file = "lxml-6.0.2-pp311-pypy311_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:c7d13103045de1bdd6fe5d61802565f1a3537d70cd3abf596aa0af62761921ee"}, - {file = "lxml-6.0.2-pp311-pypy311_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:0a3c150a95fbe5ac91de323aa756219ef9cf7fde5a3f00e2281e30f33fa5fa4f"}, - {file = "lxml-6.0.2-pp311-pypy311_pp73-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:60fa43be34f78bebb27812ed90f1925ec99560b0fa1decdb7d12b84d857d31e9"}, - {file = "lxml-6.0.2-pp311-pypy311_pp73-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:21c73b476d3cfe836be731225ec3421fa2f048d84f6df6a8e70433dff1376d5a"}, - {file = 
"lxml-6.0.2-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:27220da5be049e936c3aca06f174e8827ca6445a4353a1995584311487fc4e3e"}, - {file = "lxml-6.0.2.tar.gz", hash = "sha256:cd79f3367bd74b317dda655dc8fcfa304d9eb6e4fb06b7168c5cf27f96e0cd62"}, -] - -[package.extras] -cssselect = ["cssselect (>=0.7)"] -html-clean = ["lxml_html_clean"] -html5 = ["html5lib"] -htmlsoup = ["BeautifulSoup4"] - -[[package]] -name = "lxml-stubs" -version = "0.5.1" -description = "Type annotations for the lxml package" -optional = false -python-versions = "*" -groups = ["dev"] -files = [ - {file = "lxml-stubs-0.5.1.tar.gz", hash = "sha256:e0ec2aa1ce92d91278b719091ce4515c12adc1d564359dfaf81efa7d4feab79d"}, - {file = "lxml_stubs-0.5.1-py3-none-any.whl", hash = "sha256:1f689e5dbc4b9247cb09ae820c7d34daeb1fdbd1db06123814b856dae7787272"}, -] - -[package.extras] -test = ["coverage[toml] (>=7.2.5)", "mypy (>=1.2.0)", "pytest (>=7.3.0)", "pytest-mypy-plugins (>=1.10.1)"] - [[package]] name = "markdown-it-py" version = "4.0.0" @@ -3505,7 +3333,7 @@ docs = ["Sphinx", "repoze.sphinx.autointerface"] test = ["zope.i18nmessageid", "zope.testing", "zope.testrunner"] [extras] -all = ["authlib", "beautifulsoup4", "hiredis", "jaeger-client", "lxml", "matrix-synapse-ldap3", "opentracing", "psycopg2", "psycopg2cffi", "psycopg2cffi-compat", "pympler", "pysaml2", "sentry-sdk", "txredisapi"] +all = ["authlib", "beautifulsoup4", "hiredis", "jaeger-client", "matrix-synapse-ldap3", "opentracing", "psycopg2", "psycopg2cffi", "psycopg2cffi-compat", "pympler", "pysaml2", "sentry-sdk", "txredisapi"] cache-memory = ["pympler"] jwt = ["authlib"] matrix-synapse-ldap3 = ["matrix-synapse-ldap3"] @@ -3517,9 +3345,9 @@ saml2 = ["pysaml2"] sentry = ["sentry-sdk"] systemd = ["systemd-python"] test = ["idna", "parameterized"] -url-preview = ["beautifulsoup4", "lxml"] +url-preview = ["beautifulsoup4"] [metadata] lock-version = "2.1" python-versions = ">=3.10.0,<4.0.0" -content-hash = "cded33baf3b0eb42bba93b2e96439c10a1520f8471d1136771636e63f26523df" +content-hash = "c4bd7887db4ab253e3bd0c5acb0f6e9d1d02007ad6d0b13e0de50a8e8e840277" diff --git a/pyproject.toml b/pyproject.toml index 3882b233780..fe53f0443af 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -110,7 +110,7 @@ oidc = ["authlib>=0.15.1"] # `systemd.journal.JournalHandler`, as is documented in # `contrib/systemd/log_config.yaml`. systemd = ["systemd-python>=231"] -url-preview = ["lxml>=4.6.3", "beautifulsoup4>=4.13.0"] +url-preview = ["beautifulsoup4>=4.13.0"] sentry = ["sentry-sdk>=0.7.2"] opentracing = ["jaeger-client>=4.2.0", "opentracing>=2.2.0"] jwt = ["authlib"] @@ -143,7 +143,6 @@ all = [ # oidc and jwt "authlib>=0.15.1", # url-preview - "lxml>=4.6.3", "beautifulsoup4>=4.13.0", # sentry "sentry-sdk>=0.7.2", @@ -334,7 +333,6 @@ generate-setup-file = true ruff = "0.14.6" # Typechecking -lxml-stubs = ">=0.4.0" mypy = "*" mypy-zope = "*" types-bleach = ">=4.1.0" diff --git a/synapse/media/preview_html.py b/synapse/media/preview_html.py index 8f1bc2276c8..2d79165ca6f 100644 --- a/synapse/media/preview_html.py +++ b/synapse/media/preview_html.py @@ -65,7 +65,7 @@ def decode_body(body: bytes | str, uri: str) -> Optional["BeautifulSoup"]: from bs4.builder import ParserRejectedMarkup try: - soup = BeautifulSoup(body, "lxml") + soup = BeautifulSoup(body, "html.parser") # If an empty document is returned, convert to None. 
if not len(soup): return None @@ -291,6 +291,7 @@ def parse_html_description(soup: "BeautifulSoup") -> str | None: """ TAGS_TO_REMOVE = { + "head", "header", "nav", "aside", @@ -304,26 +305,19 @@ def parse_html_description(soup: "BeautifulSoup") -> str | None: "canvas", "img", "picture", - # etree.Comment is a function which creates an etree._Comment element. - # The "tag" attribute of an etree._Comment instance is confusingly the - # etree.Comment function instead of a string. - # etree.Comment, - # XXX } # Split all the text nodes into paragraphs (by splitting on new # lines) text_nodes = ( re.sub(r"\s+", "\n", el).strip() - for el in _iterate_over_text( - cast(Optional["Tag"], soup.find("body")), TAGS_TO_REMOVE - ) + for el in _iterate_over_text(soup, TAGS_TO_REMOVE) ) return summarize_paragraphs(text_nodes) def _iterate_over_text( - soup: Optional["Tag"], + soup: "BeautifulSoup", tags_to_ignore: Iterable[str], stack_limit: int = 1024, ) -> Generator[str, None, None]: @@ -331,7 +325,7 @@ def _iterate_over_text( skipping text nodes inside certain tags. Args: - tree: The parent element to iterate. Can be None if there isn't one. + soup: The parent element to iterate. tags_to_ignore: Set of tags to ignore stack_limit: Maximum stack size limit for depth-first traversal. Nodes will be dropped if this limit is hit, which may truncate the @@ -339,9 +333,6 @@ def _iterate_over_text( Intended to limit the maximum working memory when generating a preview. """ - if not soup: - return - from bs4.element import NavigableString, Tag # This is basically a stack that we extend using itertools.chain. diff --git a/tests/media/test_html_preview.py b/tests/media/test_html_preview.py index e33b80739ea..7fa437fdb88 100644 --- a/tests/media/test_html_preview.py +++ b/tests/media/test_html_preview.py @@ -366,7 +366,7 @@ def test_windows_1252(self) -> None: soup = decode_body(html, "http://example.com/test.html") assert soup is not None og = parse_html_to_open_graph(soup) - self.assertEqual(og, {"og:title": "�", "og:description": "Some text."}) + self.assertEqual(og, {"og:title": "¾", "og:description": "Some text."}) def test_image(self) -> None: """Test the spots an image can be pulled from .""" diff --git a/tests/rest/media/test_url_preview.py b/tests/rest/media/test_url_preview.py index 683d646942b..53338f70b31 100644 --- a/tests/rest/media/test_url_preview.py +++ b/tests/rest/media/test_url_preview.py @@ -367,7 +367,7 @@ def test_non_ascii_preview_content_type(self) -> None: self.pump() self.assertEqual(channel.code, 200) - self.assertEqual(channel.json_body["og:title"], "���") + self.assertEqual(channel.json_body["og:title"], "‰Íý") def test_overlong_title(self) -> None: self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")] From a24d2514137c1d2132400525bf2167cc60d13da3 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Wed, 10 Dec 2025 11:51:18 -0500 Subject: [PATCH 3/6] Update docs --- docs/setup/installation.md | 4 ---- 1 file changed, 4 deletions(-) diff --git a/docs/setup/installation.md b/docs/setup/installation.md index a48662362af..097ca6c44c0 100644 --- a/docs/setup/installation.md +++ b/docs/setup/installation.md @@ -633,10 +633,6 @@ This is critical from a security perspective to stop arbitrary Matrix users spidering 'internal' URLs on your network. At the very least we recommend that your loopback and RFC1918 IP addresses are blacklisted. -This also requires the optional `lxml` python dependency to be installed.
This -in turn requires the `libxml2` library to be available - on Debian/Ubuntu this -means `apt-get install libxml2-dev`, or equivalent for your OS. - ### Backups Don't forget to take [backups](../usage/administration/backups.md) of your new server! From d5332b079e1e6476eb85717b17ac336e86948358 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Thu, 11 Dec 2025 08:44:49 -0500 Subject: [PATCH 4/6] Create 19301.misc --- changelog.d/19301.misc | 1 + 1 file changed, 1 insertion(+) create mode 100644 changelog.d/19301.misc diff --git a/changelog.d/19301.misc b/changelog.d/19301.misc new file mode 100644 index 00000000000..a29625692ee --- /dev/null +++ b/changelog.d/19301.misc @@ -0,0 +1 @@ +Switch from lxml to beautifulsoup4 for URL previews. Contributed by @clokep. \ No newline at end of file From 38135374778a78f5741a6d2500130806864f3054 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Fri, 12 Dec 2025 09:28:47 -0500 Subject: [PATCH 5/6] Lint fixes --- synapse/media/oembed.py | 5 ++--- synapse/media/preview_html.py | 27 ++++++++++++--------------- 2 files changed, 14 insertions(+), 18 deletions(-) diff --git a/synapse/media/oembed.py b/synapse/media/oembed.py index 1d48d648309..ef17a2191eb 100644 --- a/synapse/media/oembed.py +++ b/synapse/media/oembed.py @@ -31,7 +31,6 @@ if TYPE_CHECKING: from bs4 import BeautifulSoup - from bs4.element import Tag from synapse.server import HomeServer @@ -124,7 +123,7 @@ def autodiscover_from_html(self, soup: "BeautifulSoup") -> str | None: type="application/json+oembed", href=NON_BLANK, ) - return cast(str, cast("Tag", tag)["href"]) if tag else None + return cast(str, tag["href"]) if tag else None def parse_oembed_response(self, url: str, raw_body: bytes) -> OEmbedResult: """ @@ -216,7 +215,7 @@ def parse_oembed_response(self, url: str, raw_body: bytes) -> OEmbedResult: def _fetch_url(soup: "BeautifulSoup", tag_name: str) -> str | None: tag = soup.find(tag_name, src=NON_BLANK) - return cast(str, cast("Tag", tag)["src"]) if tag else None + return cast(str, tag["src"]) if tag else None def calc_description_and_urls( diff --git a/synapse/media/preview_html.py b/synapse/media/preview_html.py index 2d79165ca6f..613c1a4af03 100644 --- a/synapse/media/preview_html.py +++ b/synapse/media/preview_html.py @@ -108,7 +108,7 @@ def _get_meta_tags( ) return {} - key = tag[property] + key = cast(str, tag[property]) if property_mapper: new_key = property_mapper(key) # None is a special value used to ignore a value. @@ -116,7 +116,7 @@ def _get_meta_tags( continue key = new_key - results[key] = tag["content"] + results[key] = cast(str, tag["content"]) return results @@ -198,9 +198,10 @@ def parse_html_to_open_graph(soup: "BeautifulSoup") -> dict[str, str | None]: if "og:title" not in og: # Attempt to find a title from the title tag, or the biggest header on the page. - title = cast( - Optional["Tag"], soup.find(("title", "h1", "h2", "h3"), string=True) - ) + # + # mypy doesn't like passing both name and string, but it is used to ignore + # empty elements. title = soup.find(("title", "h1", "h2", "h3"), string=True) # type: ignore[call-overload] if title and title.string: og["og:title"] = title.string.strip() else: @@ -208,9 +209,8 @@ def parse_html_to_open_graph(soup: "BeautifulSoup") -> dict[str, str | None]: if "og:image" not in og: # Check microdata for an image.
- meta_image = cast( - Optional["Tag"], - soup.find("meta", itemprop=re.compile("image", re.I), content=NON_BLANK), + meta_image = soup.find( + "meta", itemprop=re.compile("image", re.I), content=NON_BLANK ) # If a meta image is found, use it. if meta_image: @@ -247,13 +247,10 @@ def parse_html_to_open_graph(soup: "BeautifulSoup") -> dict[str, str | None]: if "og:description" not in og: # Check the first meta description tag for content. - meta_description = cast( - Optional["Tag"], - soup.find( - "meta", - attrs={"name": re.compile("description", re.I)}, - content=NON_BLANK, - ), + meta_description = soup.find( + "meta", + attrs={"name": re.compile("description", re.I)}, + content=NON_BLANK, ) # If a meta description is found with content, use it. From 5940217ffd16045193c5b732929e5d010327c47c Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Fri, 12 Dec 2025 09:53:48 -0500 Subject: [PATCH 6/6] Fix-up references --- tests/media/test_html_preview.py | 12 ++++++------ tests/media/test_oembed.py | 8 ++++---- tests/media/test_url_previewer.py | 8 ++++---- tests/rest/client/test_media.py | 8 ++++---- tests/rest/media/test_url_preview.py | 8 ++++---- 5 files changed, 22 insertions(+), 22 deletions(-) diff --git a/tests/media/test_html_preview.py b/tests/media/test_html_preview.py index 7fa437fdb88..387969522c3 100644 --- a/tests/media/test_html_preview.py +++ b/tests/media/test_html_preview.py @@ -28,14 +28,14 @@ from tests import unittest try: - import lxml + import beautifulsoup4 except ImportError: - lxml = None # type: ignore[assignment] + beautifulsoup4 = None # type: ignore[assignment] class SummarizeTestCase(unittest.TestCase): - if not lxml: - skip = "url preview feature requires lxml" + if not beautifulsoup4: + skip = "url preview feature requires beautifulsoup4" def test_long_summarize(self) -> None: example_paras = [ @@ -152,8 +152,8 @@ def test_small_then_large_summarize(self) -> None: class OpenGraphFromHtmlTestCase(unittest.TestCase): - if not lxml: - skip = "url preview feature requires lxml" + if not beautifulsoup4: + skip = "url preview feature requires beautifulsoup4" def test_simple(self) -> None: html = b""" diff --git a/tests/media/test_oembed.py b/tests/media/test_oembed.py index dc13c03df33..8181f86c576 100644 --- a/tests/media/test_oembed.py +++ b/tests/media/test_oembed.py @@ -34,14 +34,14 @@ from tests.unittest import HomeserverTestCase try: - import lxml + import beautifulsoup4 except ImportError: - lxml = None # type: ignore[assignment] + beautifulsoup4 = None # type: ignore[assignment] class OEmbedTests(HomeserverTestCase): - if not lxml: - skip = "url preview feature requires lxml" + if not beautifulsoup4: + skip = "url preview feature requires beautifulsoup4" def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None: self.oembed = OEmbedProvider(hs) diff --git a/tests/media/test_url_previewer.py b/tests/media/test_url_previewer.py index 3d706c7e90d..a89a581fc5a 100644 --- a/tests/media/test_url_previewer.py +++ b/tests/media/test_url_previewer.py @@ -29,14 +29,14 @@ from tests.unittest import override_config try: - import lxml + import beautifulsoup4 except ImportError: - lxml = None # type: ignore[assignment] + beautifulsoup4 = None # type: ignore[assignment] class URLPreviewTests(unittest.HomeserverTestCase): - if not lxml: - skip = "url preview feature requires lxml" + if not beautifulsoup4: + skip = "url preview feature requires beautifulsoup4" def make_homeserver(self, reactor: MemoryReactor, clock: Clock) -> HomeServer: config =
self.default_config() diff --git a/tests/rest/client/test_media.py b/tests/rest/client/test_media.py index ce006eaa75c..43b1acb5f84 100644 --- a/tests/rest/client/test_media.py +++ b/tests/rest/client/test_media.py @@ -77,9 +77,9 @@ from tests.unittest import override_config try: - import lxml + import beautifulsoup4 except ImportError: - lxml = None # type: ignore[assignment] + beautifulsoup4 = None # type: ignore[assignment] class MediaDomainBlockingTests(unittest.HomeserverTestCase): @@ -188,8 +188,8 @@ def test_remote_media_thumbnail_normally_unblocked(self) -> None: class URLPreviewTests(unittest.HomeserverTestCase): - if not lxml: - skip = "url preview feature requires lxml" + if not beautifulsoup4: + skip = "url preview feature requires beautifulsoup4" servlets = [media.register_servlets] hijack_auth = True diff --git a/tests/rest/media/test_url_preview.py b/tests/rest/media/test_url_preview.py index 53338f70b31..bcd6c7b75d4 100644 --- a/tests/rest/media/test_url_preview.py +++ b/tests/rest/media/test_url_preview.py @@ -45,14 +45,14 @@ from tests.unittest import override_config try: - import lxml + import beautifulsoup4 except ImportError: - lxml = None # type: ignore[assignment] + beautifulsoup4 = None # type: ignore[assignment] class URLPreviewTests(unittest.HomeserverTestCase): - if not lxml: - skip = "url preview feature requires lxml" + if not beautifulsoup4: + skip = "url preview feature requires beautifulsoup4" hijack_auth = True user_id = "@test:user"