1 change: 1 addition & 0 deletions changelog.d/19301.misc
@@ -0,0 +1 @@
Switch to beautofulsoup4 from lxml for URL previews. Controbuted by @clokep.
Contributor:

Conflicts to resolve

Contributor Author:

Yes, I can't seem to get the same setup locally. Did something change with how to install the pinned packages?

Contributor:

`poetry install --extras all` is what I use.

Things have changed behind the scenes but shouldn't affect how you install as a developer:

Contributor Author:

That's what I did. Maybe I'll try creating a new virtualenv. 🤔

Contributor:

Suggested change:

```diff
-Switch to beautofulsoup4 from lxml for URL previews. Controbuted by @clokep.
+Switch to `beautifulsoup4` from `lxml` for URL previews. Contributed by @clokep.
```

Contributor:

I haven't evaluated whether beautifulsoup4 is a good dependency choice. The source is hosted on launchpad.net, which makes it a pain to browse casually: https://code.launchpad.net/beautifulsoup

Contributor Author:

beautifulsoup is the go-to package for parsing HTML in Python, and has been for at least a decade.

4 changes: 0 additions & 4 deletions docs/setup/installation.md
@@ -633,10 +633,6 @@ This is critical from a security perspective to stop arbitrary Matrix users
spidering 'internal' URLs on your network. At the very least we recommend that
your loopback and RFC1918 IP addresses are blacklisted.

This also requires the optional `lxml` python dependency to be installed. This
in turn requires the `libxml2` library to be available - on Debian/Ubuntu this
means `apt-get install libxml2-dev`, or equivalent for your OS.
Comment on lines -636 to -638
Contributor:

As a note, we can remove libxml2 from the flake.nix as well. Something for a nix user to do though ⏩

libxml2

Comment on lines -636 to -638
Contributor:

We can remove the `libxml2-dev` dependency in CI:

```yaml
# There aren't wheels for some of the older deps, so we need to install
# their build dependencies
- run: |
    sudo apt-get -qq update
    sudo apt-get -qq install build-essential libffi-dev python3-dev \
      libxml2-dev libxslt-dev xmlsec1 zlib1g-dev libjpeg-dev libwebp-dev

- run: sudo apt-get -qq install xmlsec1 libxml2-dev libxslt-dev
```
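For illustration, the first step might shrink to something like this once the XML build dependencies go away (a sketch; assumes nothing else in the job still needs `libxml2-dev`/`libxslt-dev`):

```sh
# Hypothetical trimmed install step, with the lxml build deps dropped.
sudo apt-get -qq update
sudo apt-get -qq install build-essential libffi-dev python3-dev \
  zlib1g-dev libjpeg-dev libwebp-dev
```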

Comment on lines -636 to -638
Contributor:

`libxml2-devel` is also mentioned in:

```sh
sudo dnf install libtiff-devel libjpeg-devel libzip-devel freetype-devel \
libwebp-devel libxml2-devel libxslt-devel libpq-devel \
python3-virtualenv libffi-devel openssl-devel python3-devel
```

### Backups

Don't forget to take [backups](../usage/administration/backups.md) of your new server!
215 changes: 40 additions & 175 deletions poetry.lock

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions pyproject.toml
@@ -134,7 +134,7 @@ oidc = ["authlib>=0.15.1"]
# `systemd.journal.JournalHandler`, as is documented in
# `contrib/systemd/log_config.yaml`.
systemd = ["systemd-python>=231"]
url-preview = ["lxml>=4.6.3"]
url-preview = ["beautifulsoup4>=4.13.0"]
sentry = ["sentry-sdk>=0.7.2"]
opentracing = [
"jaeger-client>=4.2.0",
@@ -177,7 +177,7 @@ all = [
# oidc and jwt
"authlib>=0.15.1",
# url-preview
"lxml>=4.6.3",
"beautifulsoup4>=4.13.0",
# sentry
"sentry-sdk>=0.7.2",
# opentracing
@@ -261,7 +261,6 @@ generate-setup-file = true
ruff = "0.14.6"

# Typechecking
lxml-stubs = ">=0.4.0"
mypy = "*"
mypy-zope = "*"
types-bleach = ">=4.1.0"
@@ -436,6 +435,7 @@ sdist-include = [
"rust/build.rs",
"rust/src/**",
]

sdist-exclude = ["synapse/*.so"]

[build-system]
94 changes: 37 additions & 57 deletions synapse/media/oembed.py
@@ -25,12 +25,12 @@

import attr

from synapse.media.preview_html import parse_html_description
from synapse.media.preview_html import NON_BLANK, decode_body, parse_html_description
from synapse.types import JsonDict
from synapse.util.json import json_decoder

if TYPE_CHECKING:
from lxml import etree
from bs4 import BeautifulSoup

from synapse.server import HomeServer

@@ -105,35 +105,25 @@ def get_oembed_url(self, url: str) -> str | None:
# No match.
return None

def autodiscover_from_html(self, tree: "etree._Element") -> str | None:
def autodiscover_from_html(self, soup: "BeautifulSoup") -> str | None:
"""
Search an HTML document for oEmbed autodiscovery information.

Args:
tree: The parsed HTML body.
soup: The parsed HTML body.

Returns:
The URL to use for oEmbed information, or None if no URL was found.
"""
# Search for link elements with the proper rel and type attributes.
# Cast: the type returned by xpath depends on the xpath expression: mypy can't deduce this.
for tag in cast(
list["etree._Element"],
tree.xpath("//link[@rel='alternate'][@type='application/json+oembed']"),
):
if "href" in tag.attrib:
return cast(str, tag.attrib["href"])

# Some providers (e.g. Flickr) use alternative instead of alternate.
Contributor:

Suggested change:

```diff
-# Some providers (e.g. Flickr) use alternative instead of alternate.
+# Some providers (e.g. Flickr) use `alternative` instead of `alternate`.
```

We should move this above the `rel` arg below as well.

# Cast: the type returned by xpath depends on the xpath expression: mypy can't deduce this.
for tag in cast(
list["etree._Element"],
tree.xpath("//link[@rel='alternative'][@type='application/json+oembed']"),
):
if "href" in tag.attrib:
return cast(str, tag.attrib["href"])

return None
tag = soup.find(
"link",
rel=("alternate", "alternative"),
Contributor:
Where can I find this syntax?

I've looked through https://beautiful-soup-4.readthedocs.io/en/latest/#searching-the-tree

type="application/json+oembed",
href=NON_BLANK,
)
return cast(str, tag["href"]) if tag else None
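On the syntax question above: Beautiful Soup's keyword filters accept a tuple/list of values (the attribute may match any item in it), and `href=NON_BLANK` works because a filter can be anything that matches an attribute value, such as a compiled regex. A minimal sketch (the `NON_BLANK` here is a stand-in; the real one is imported from `synapse.media.preview_html`):

```python
import re

from bs4 import BeautifulSoup

# Stand-in for Synapse's NON_BLANK: matches any attribute value that
# contains at least one non-whitespace character.
NON_BLANK = re.compile(r"\S")

html = """
<link rel="alternative" type="application/json+oembed"
      href="https://example.com/oembed.json">
"""
soup = BeautifulSoup(html, "html.parser")

# A tuple/list filter matches if the attribute matches any item in it;
# a regex filter matches if re.search() finds the pattern in the value.
tag = soup.find(
    "link",
    rel=("alternate", "alternative"),
    type="application/json+oembed",
    href=NON_BLANK,
)
print(tag["href"] if tag else None)  # https://example.com/oembed.json
```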
Contributor:

Safer lookups, to avoid the prior code needing assumptions:

Suggested change:

```diff
-return cast(str, tag["href"]) if tag else None
+tag = soup.find(
+    "link",
+    rel=("alternate", "alternative"),
+    type="application/json+oembed",
+)
+# Guard against find() returning None before reading the attribute.
+href = tag.get("href") if tag else None
+return cast(str, href) if href else None
```

Prior art, but ideally we'd do even better:

Suggested change:

```diff
-return cast(str, tag["href"]) if tag else None
+tags = soup.find_all(
+    "link",
+    rel=("alternate", "alternative"),
+    type="application/json+oembed",
+)
+if len(tags) == 0:
+    return None
+elif len(tags) == 1:
+    tag = tags[0]
+    href = tag.get("href")
+    return cast(str, href) if href else None
+else:
+    # If there are multiple tags, raise an error. We don't want to even
+    # try to pick the right one if there are multiple, as we could run into
+    # problems similar to request smuggling vulnerabilities, which rely on
+    # the mismatch of how different systems interpret information.
+    raise ValueError(
+        'Expected one `<link type="application/json+oembed">` but found multiple.'
+    )
```

Also applies elsewhere below.


def parse_oembed_response(self, url: str, raw_body: bytes) -> OEmbedResult:
"""
@@ -196,7 +186,7 @@ def parse_oembed_response(self, url: str, raw_body: bytes) -> OEmbedResult:
if oembed_type == "rich":
html_str = oembed.get("html")
if isinstance(html_str, str):
calc_description_and_urls(open_graph_response, html_str)
calc_description_and_urls(open_graph_response, html_str, url)

elif oembed_type == "photo":
# If this is a photo, use the full image, not the thumbnail.
@@ -208,7 +198,7 @@ def parse_oembed_response(self, url: str, raw_body: bytes) -> OEmbedResult:
open_graph_response["og:type"] = "video.other"
html_str = oembed.get("html")
if html_str and isinstance(html_str, str):
calc_description_and_urls(open_graph_response, oembed["html"])
calc_description_and_urls(open_graph_response, oembed["html"], url)
for size in ("width", "height"):
val = oembed.get(size)
if type(val) is int: # noqa: E721
@@ -223,55 +213,45 @@ def parse_oembed_response(self, url: str, raw_body: bytes) -> OEmbedResult:
return OEmbedResult(open_graph_response, author_name, cache_age)


def _fetch_urls(tree: "etree._Element", tag_name: str) -> list[str]:
results = []
# Cast: the type returned by xpath depends on the xpath expression: mypy can't deduce this.
for tag in cast(list["etree._Element"], tree.xpath("//*/" + tag_name)):
if "src" in tag.attrib:
results.append(cast(str, tag.attrib["src"]))
return results
def _fetch_url(soup: "BeautifulSoup", tag_name: str) -> str | None:
tag = soup.find(tag_name, src=NON_BLANK)
return cast(str, tag["src"]) if tag else None


def calc_description_and_urls(open_graph_response: JsonDict, html_body: str) -> None:
def calc_description_and_urls(
open_graph_response: JsonDict, html_body: str, url: str
) -> None:
"""
Calculate description for an HTML document.

This uses lxml to convert the HTML document into plaintext. If errors
This uses BeautifulSoup to convert the HTML document into plaintext. If errors
occur during processing of the document, an empty response is returned.

Args:
open_graph_response: The current Open Graph summary. This is updated with additional fields.
html_body: The HTML document, as bytes.

Returns:
The summary
url: The URL which is being previewed (not the one which was requested).
"""
# If there's no body, nothing useful is going to be found.
if not html_body:
return
soup = decode_body(html_body, url)

from lxml import etree

# Create an HTML parser. If this fails, log and return no metadata.
parser = etree.HTMLParser(recover=True, encoding="utf-8")

# Attempt to parse the body. If this fails, log and return no metadata.
tree = etree.fromstring(html_body, parser)

# The data was successfully parsed, but no tree was found.
if tree is None:
# If there's no body, nothing useful is going to be found.
if not soup:
return

# Attempt to find interesting URLs (images, videos, embeds).
if "og:image" not in open_graph_response:
image_urls = _fetch_urls(tree, "img")
if image_urls:
open_graph_response["og:image"] = image_urls[0]

video_urls = _fetch_urls(tree, "video") + _fetch_urls(tree, "embed")
if video_urls:
open_graph_response["og:video"] = video_urls[0]

description = parse_html_description(tree)
image_url = _fetch_url(soup, "img")
if image_url:
open_graph_response["og:image"] = image_url

video_url = _fetch_url(soup, "video")
if video_url:
open_graph_response["og:video"] = video_url
else:
embed_url = _fetch_url(soup, "embed")
if embed_url:
open_graph_response["og:video"] = embed_url

description = parse_html_description(soup)
if description:
open_graph_response["og:description"] = description
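As an end-to-end illustration of the new lookup behaviour, a standalone sketch (the `decode_body` and `NON_BLANK` here are simplified stand-ins for the helpers Synapse imports from `synapse.media.preview_html`):

```python
from __future__ import annotations

import re

from bs4 import BeautifulSoup

# Simplified stand-ins for the helpers in synapse.media.preview_html.
NON_BLANK = re.compile(r"\S")


def decode_body(html_body: str, url: str) -> BeautifulSoup | None:
    # Synapse's real decode_body also handles raw bytes and charset
    # detection; this sketch just parses, treating an empty body as "nothing".
    if not html_body:
        return None
    return BeautifulSoup(html_body, "html.parser")


def _fetch_url(soup: BeautifulSoup, tag_name: str) -> str | None:
    # find() returns the first match in document order (mirroring the old
    # _fetch_urls(...)[0]), and the NON_BLANK filter skips blank src values.
    tag = soup.find(tag_name, src=NON_BLANK)
    return tag["src"] if tag else None


soup = decode_body(
    '<img src=""><img src="https://example.com/a.png">'
    '<embed src="https://example.com/a.mp4">',
    "https://example.com/",
)
assert soup is not None
print(_fetch_url(soup, "img"))    # https://example.com/a.png (blank src skipped)
print(_fetch_url(soup, "video"))  # None
print(_fetch_url(soup, "embed"))  # https://example.com/a.mp4
```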