From 04266f38096b3cbcddbc1747b5025dd87fcac1ad Mon Sep 17 00:00:00 2001
From: Martin Burchell <mb2353@cam.ac.uk>
Date: Tue, 29 Apr 2025 13:36:42 +0100
Subject: [PATCH 01/39] Add tests __init__.py

This will cause the tests directory to be treated as a package and allow
two test files to have the same name in different directories.
---
 cardinal_pythonlib/tests/__init__.py | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)
 create mode 100644 cardinal_pythonlib/tests/__init__.py

diff --git a/cardinal_pythonlib/tests/__init__.py b/cardinal_pythonlib/tests/__init__.py
new file mode 100644
index 0000000..a5311f0
--- /dev/null
+++ b/cardinal_pythonlib/tests/__init__.py
@@ -0,0 +1,27 @@
+# cardinal_pythonlib/tests/__init__.py
+
+"""
+===============================================================================
+
+    Original code copyright (C) 2009-2022 Rudolf Cardinal (rudolf@pobox.com).
+
+    This file is part of cardinal_pythonlib.
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        https://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+
+===============================================================================
+
+The mere existence of this file makes Python treat the directory as a
+package.
+
+"""

From 6e9f343dd82206024377a574c56a5b8eaf1ffa19 Mon Sep 17 00:00:00 2001
From: Martin Burchell <mb2353@cam.ac.uk>
Date: Tue, 29 Apr 2025 14:17:38 +0100
Subject: [PATCH 02/39] Test document_to_text exceptions

---
 .../tests/extract_text_tests.py               | 60 +++++++++++++++++++
 1 file changed, 60 insertions(+)
 create mode 100644 cardinal_pythonlib/tests/extract_text_tests.py

diff --git a/cardinal_pythonlib/tests/extract_text_tests.py b/cardinal_pythonlib/tests/extract_text_tests.py
new file mode 100644
index 0000000..4576e0f
--- /dev/null
+++ b/cardinal_pythonlib/tests/extract_text_tests.py
@@ -0,0 +1,60 @@
+# cardinal_pythonlib/tests/extract_text_tests.py
+
+"""
+===============================================================================
+
+    Original code copyright (C) 2009-2022 Rudolf Cardinal (rudolf@pobox.com).
+
+    This file is part of cardinal_pythonlib.
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        https://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+
+===============================================================================
+
+**Text extraction tests.**
+
+"""
+
+import os
+from tempfile import TemporaryDirectory
+from unittest import TestCase
+
+from cardinal_pythonlib.extract_text import document_to_text
+
+
+class DocumentToTextTests(TestCase):
+    def test_raises_when_no_filename_or_blob(self) -> None:
+        with self.assertRaises(ValueError) as cm:
+            document_to_text()
+
+        self.assertIn("no filename and no blob", str(cm.exception))
+
+    def test_raises_when_filename_and_blob(self) -> None:
+        with self.assertRaises(ValueError) as cm:
+            document_to_text(filename="foo", blob="bar")
+
+        self.assertIn("specify either filename or blob", str(cm.exception))
+
+    def test_raises_when_blob_but_no_extension(self) -> None:
+        with self.assertRaises(ValueError) as cm:
+            document_to_text(blob="bar")
+
+        self.assertIn("need extension hint for blob", str(cm.exception))
+
+    def test_raises_when_not_a_file(self) -> None:
+        with self.assertRaises(ValueError) as cm:
+            with TemporaryDirectory() as temp_dir_name:
+                filename = os.path.join(temp_dir_name, "foo")
+                document_to_text(filename=filename)
+
+        self.assertIn("no such file", str(cm.exception))

From 9d78d2ccce3cdc9e100bbf3981a0faa7f635f24f Mon Sep 17 00:00:00 2001
From: Martin Burchell <mb2353@cam.ac.uk>
Date: Tue, 29 Apr 2025 14:51:47 +0100
Subject: [PATCH 03/39] Test document_to_text CSV extraction

---
 cardinal_pythonlib/tests/extract_text_tests.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/cardinal_pythonlib/tests/extract_text_tests.py b/cardinal_pythonlib/tests/extract_text_tests.py
index 4576e0f..5a2acf8 100644
--- a/cardinal_pythonlib/tests/extract_text_tests.py
+++ b/cardinal_pythonlib/tests/extract_text_tests.py
@@ -26,7 +26,7 @@
 """
 
 import os
-from tempfile import TemporaryDirectory
+from tempfile import TemporaryDirectory, NamedTemporaryFile
 from unittest import TestCase
 
 from cardinal_pythonlib.extract_text import document_to_text
@@ -58,3 +58,13 @@ def test_raises_when_not_a_file(self) -> None:
                 document_to_text(filename=filename)
 
         self.assertIn("no such file", str(cm.exception))
+
+    def test_csv_converted(self) -> None:
+        content = "one,two,three"
+
+        with NamedTemporaryFile(suffix=".csv", delete=False) as temp_file:
+            temp_file.write(content.encode("utf-8"))
+            temp_file.close()
+            text = document_to_text(temp_file.name)
+
+        self.assertEqual(text, content)

From 5a8d542439ce6fccafcb92bfeb0e50f481dacb30 Mon Sep 17 00:00:00 2001
From: Martin Burchell <mb2353@cam.ac.uk>
Date: Tue, 29 Apr 2025 15:35:40 +0100
Subject: [PATCH 04/39] Test doc extraction

---
 .../tests/extract_text_tests.py               | 47 ++++++++++++++++++-
 1 file changed, 45 insertions(+), 2 deletions(-)

diff --git a/cardinal_pythonlib/tests/extract_text_tests.py b/cardinal_pythonlib/tests/extract_text_tests.py
index 5a2acf8..090842c 100644
--- a/cardinal_pythonlib/tests/extract_text_tests.py
+++ b/cardinal_pythonlib/tests/extract_text_tests.py
@@ -26,13 +26,34 @@
 """
 
 import os
+import subprocess
 from tempfile import TemporaryDirectory, NamedTemporaryFile
-from unittest import TestCase
+from unittest import mock, TestCase
 
-from cardinal_pythonlib.extract_text import document_to_text
+from cardinal_pythonlib.extract_text import (
+    document_to_text,
+    TextProcessingConfig,
+    update_external_tools,
+)
 
 
 class DocumentToTextTests(TestCase):
+    def setUp(self) -> None:
+        update_external_tools(
+            {
+                "antiword": "/path/to/antiword",
+            }
+        )
+
+        self.config = TextProcessingConfig()
+
+        mock_decode = mock.Mock(return_value="")
+        mock_stdout = mock.Mock(decode=mock_decode)
+        mock_communicate = mock.Mock(return_value=(mock_stdout, None))
+        self.mock_popen = mock.Mock(
+            return_value=mock.Mock(communicate=mock_communicate)
+        )
+
     def test_raises_when_no_filename_or_blob(self) -> None:
         with self.assertRaises(ValueError) as cm:
             document_to_text()
@@ -68,3 +89,25 @@ def test_csv_converted(self) -> None:
             text = document_to_text(temp_file.name)
 
         self.assertEqual(text, content)
+
+    def test_doc_converted_with_antiword(self) -> None:
+        with mock.patch.multiple(
+            "cardinal_pythonlib.extract_text.subprocess",
+            Popen=self.mock_popen,
+        ):
+            with NamedTemporaryFile(suffix=".doc", delete=False) as temp_file:
+                temp_file.close()
+                document_to_text(temp_file.name)
+
+        expected_calls = [
+            mock.call(
+                (
+                    "/path/to/antiword",
+                    "-w",
+                    str(self.config.width),
+                    temp_file.name,
+                ),
+                stdout=subprocess.PIPE,
+            ),
+        ]
+        self.mock_popen.assert_has_calls(expected_calls)

From a8f8cb54d421e80ea1f05f30a9bb978c128d51d7 Mon Sep 17 00:00:00 2001
From: Martin Burchell <mb2353@cam.ac.uk>
Date: Tue, 29 Apr 2025 15:48:26 +0100
Subject: [PATCH 05/39] Test dot file extraction

---
 .../tests/extract_text_tests.py               | 25 +++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/cardinal_pythonlib/tests/extract_text_tests.py b/cardinal_pythonlib/tests/extract_text_tests.py
index 090842c..437c111 100644
--- a/cardinal_pythonlib/tests/extract_text_tests.py
+++ b/cardinal_pythonlib/tests/extract_text_tests.py
@@ -38,6 +38,8 @@
 
 
 class DocumentToTextTests(TestCase):
+    # For external tools we assume the tools are running correctly
+    # and we just check that they are invoked with the correct arguments.
     def setUp(self) -> None:
         update_external_tools(
             {
@@ -47,6 +49,7 @@ def setUp(self) -> None:
 
         self.config = TextProcessingConfig()
 
+        # Some mock empty output that we don't check
         mock_decode = mock.Mock(return_value="")
         mock_stdout = mock.Mock(decode=mock_decode)
         mock_communicate = mock.Mock(return_value=(mock_stdout, None))
@@ -111,3 +114,25 @@ def test_doc_converted_with_antiword(self) -> None:
             ),
         ]
         self.mock_popen.assert_has_calls(expected_calls)
+
+    def test_dot_converted_with_antiword(self) -> None:
+        with mock.patch.multiple(
+            "cardinal_pythonlib.extract_text.subprocess",
+            Popen=self.mock_popen,
+        ):
+            with NamedTemporaryFile(suffix=".dot", delete=False) as temp_file:
+                temp_file.close()
+                document_to_text(temp_file.name)
+
+        expected_calls = [
+            mock.call(
+                (
+                    "/path/to/antiword",
+                    "-w",
+                    str(self.config.width),
+                    temp_file.name,
+                ),
+                stdout=subprocess.PIPE,
+            ),
+        ]
+        self.mock_popen.assert_has_calls(expected_calls)

From 2cb286625a90324aba9f7edc1f9051416c3fb82e Mon Sep 17 00:00:00 2001
From: Martin Burchell <mb2353@cam.ac.uk>
Date: Tue, 29 Apr 2025 16:37:21 +0100
Subject: [PATCH 06/39] Update docs

---
 docs/source/autodoc/_index.rst                |  1 +
 .../autodoc/tests/extract_text_tests.py.rst   | 25 +++++++++++++++++++
 2 files changed, 26 insertions(+)
 create mode 100644 docs/source/autodoc/tests/extract_text_tests.py.rst

diff --git a/docs/source/autodoc/_index.rst b/docs/source/autodoc/_index.rst
index 4adf8f6..3a7fb7e 100644
--- a/docs/source/autodoc/_index.rst
+++ b/docs/source/autodoc/_index.rst
@@ -178,6 +178,7 @@ Automatic documentation of source code
     tee.py.rst
     tests/datetimefunc_tests.py.rst
     tests/dogpile_cache_tests.py.rst
+    tests/extract_text_tests.py.rst
     tests/interval_tests.py.rst
     tests/lists_tests.py.rst
     tests/pdf_tests.py.rst
diff --git a/docs/source/autodoc/tests/extract_text_tests.py.rst b/docs/source/autodoc/tests/extract_text_tests.py.rst
new file mode 100644
index 0000000..b1a6abb
--- /dev/null
+++ b/docs/source/autodoc/tests/extract_text_tests.py.rst
@@ -0,0 +1,25 @@
+.. docs/source/autodoc/tests/extract_text_tests.py.rst
+
+.. THIS FILE IS AUTOMATICALLY GENERATED. DO NOT EDIT.
+
+
+..  Copyright (C) 2009-2020 Rudolf Cardinal (rudolf@pobox.com).
+    .
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+    .
+        https://www.apache.org/licenses/LICENSE-2.0
+    .
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+
+
+cardinal_pythonlib.tests.extract_text_tests
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. automodule:: cardinal_pythonlib.tests.extract_text_tests
+    :members:

From 699645d199823b7c81ed1b15eb6df88cde6dfaa2 Mon Sep 17 00:00:00 2001
From: Martin Burchell <mb2353@cam.ac.uk>
Date: Wed, 30 Apr 2025 08:02:35 +0100
Subject: [PATCH 07/39] Test DOCX conversion

---
 .../tests/extract_text_tests.py               | 23 ++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/cardinal_pythonlib/tests/extract_text_tests.py b/cardinal_pythonlib/tests/extract_text_tests.py
index 437c111..11a2c9a 100644
--- a/cardinal_pythonlib/tests/extract_text_tests.py
+++ b/cardinal_pythonlib/tests/extract_text_tests.py
@@ -30,6 +30,9 @@
 from tempfile import TemporaryDirectory, NamedTemporaryFile
 from unittest import mock, TestCase
 
+from faker import Faker
+from faker_file.providers.docx_file import DocxFileProvider
+
 from cardinal_pythonlib.extract_text import (
     document_to_text,
     TextProcessingConfig,
@@ -57,6 +60,9 @@ def setUp(self) -> None:
             return_value=mock.Mock(communicate=mock_communicate)
         )
 
+        self.fake = Faker()
+        self.fake.add_provider(DocxFileProvider)
+
     def test_raises_when_no_filename_or_blob(self) -> None:
         with self.assertRaises(ValueError) as cm:
             document_to_text()
@@ -89,7 +95,7 @@ def test_csv_converted(self) -> None:
         with NamedTemporaryFile(suffix=".csv", delete=False) as temp_file:
             temp_file.write(content.encode("utf-8"))
             temp_file.close()
-            text = document_to_text(temp_file.name)
+            text = document_to_text(filename=temp_file.name)
 
         self.assertEqual(text, content)
 
@@ -100,7 +106,7 @@ def test_doc_converted_with_antiword(self) -> None:
         ):
             with NamedTemporaryFile(suffix=".doc", delete=False) as temp_file:
                 temp_file.close()
-                document_to_text(temp_file.name)
+                document_to_text(filename=temp_file.name, config=self.config)
 
         expected_calls = [
             mock.call(
@@ -122,7 +128,7 @@ def test_dot_converted_with_antiword(self) -> None:
         ):
             with NamedTemporaryFile(suffix=".dot", delete=False) as temp_file:
                 temp_file.close()
-                document_to_text(temp_file.name)
+                document_to_text(filename=temp_file.name)
 
         expected_calls = [
             mock.call(
@@ -136,3 +142,14 @@ def test_dot_converted_with_antiword(self) -> None:
             ),
         ]
         self.mock_popen.assert_has_calls(expected_calls)
+
+    def test_docx_converted(self) -> None:
+        content = self.fake.paragraph(nb_sentences=10)
+
+        docx = self.fake.docx_file(content=content)
+        self.config.width = 0
+        text = document_to_text(
+            docx.data["filename"], extension="docx", config=self.config
+        )
+
+        self.assertEqual(text.strip(), content)

From 78873eb9933c2cadb45b2e1709b8f991e60d7ca8 Mon Sep 17 00:00:00 2001
From: Martin Burchell <mb2353@cam.ac.uk>
Date: Wed, 30 Apr 2025 08:13:19 +0100
Subject: [PATCH 08/39] Test HTML conversion

---
 .../tests/extract_text_tests.py               | 21 +++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/cardinal_pythonlib/tests/extract_text_tests.py b/cardinal_pythonlib/tests/extract_text_tests.py
index 11a2c9a..6dfcb55 100644
--- a/cardinal_pythonlib/tests/extract_text_tests.py
+++ b/cardinal_pythonlib/tests/extract_text_tests.py
@@ -148,8 +148,25 @@ def test_docx_converted(self) -> None:
 
         docx = self.fake.docx_file(content=content)
         self.config.width = 0
+        text = document_to_text(docx.data["filename"], config=self.config)
+
+        self.assertEqual(text.strip(), content)
+
+    def test_htm_converted(self) -> None:
+        content = self.fake.paragraph(nb_sentences=10)
+
+        html = f"""
+<!DOCTYPE html>
+<html>
+<head>
+</head>
+<body>
+{content}
+</body>
+</html>
+"""
+
         text = document_to_text(
-            docx.data["filename"], extension="docx", config=self.config
+            blob=html.encode("utf-8"), extension="htm", config=self.config
         )
-
         self.assertEqual(text.strip(), content)

From 9219cab1bd7ba72fea6c10e46b5c5c5d67e6631d Mon Sep 17 00:00:00 2001
From: Martin Burchell <mb2353@cam.ac.uk>
Date: Wed, 30 Apr 2025 08:26:31 +0100
Subject: [PATCH 09/39] Test log file conversion

---
 cardinal_pythonlib/tests/extract_text_tests.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/cardinal_pythonlib/tests/extract_text_tests.py b/cardinal_pythonlib/tests/extract_text_tests.py
index 6dfcb55..5b51964 100644
--- a/cardinal_pythonlib/tests/extract_text_tests.py
+++ b/cardinal_pythonlib/tests/extract_text_tests.py
@@ -170,3 +170,15 @@ def test_htm_converted(self) -> None:
             blob=html.encode("utf-8"), extension="htm", config=self.config
         )
         self.assertEqual(text.strip(), content)
+
+    def test_log_converted(self) -> None:
+        content = """
+2025-04-02 06:05:43,772 INFO Starting unattended upgrades script
+2025-04-02 06:05:43,772 INFO Allowed origins are: o=Ubuntu,a=focal, o=Ubuntu,a=focal-security, o=UbuntuESMApps,a=focal-apps-security, o=UbuntuESM,a=focal-infra-security
+"""  # noqa: E501
+
+        text = document_to_text(
+            blob=content.encode("utf-8"), extension="log", config=self.config
+        )
+
+        self.assertEqual(text.strip(), content.strip())

From 04b0c3739fe5f4539bece63d701de81c2af95f3a Mon Sep 17 00:00:00 2001
From: Martin Burchell <mb2353@cam.ac.uk>
Date: Wed, 30 Apr 2025 08:32:19 +0100
Subject: [PATCH 10/39] Test ODT file conversion

---
 cardinal_pythonlib/tests/extract_text_tests.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/cardinal_pythonlib/tests/extract_text_tests.py b/cardinal_pythonlib/tests/extract_text_tests.py
index 5b51964..c106d19 100644
--- a/cardinal_pythonlib/tests/extract_text_tests.py
+++ b/cardinal_pythonlib/tests/extract_text_tests.py
@@ -32,6 +32,7 @@
 
 from faker import Faker
 from faker_file.providers.docx_file import DocxFileProvider
+from faker_file.providers.odt_file import OdtFileProvider
 
 from cardinal_pythonlib.extract_text import (
     document_to_text,
@@ -62,6 +63,7 @@ def setUp(self) -> None:
 
         self.fake = Faker()
         self.fake.add_provider(DocxFileProvider)
+        self.fake.add_provider(OdtFileProvider)
 
     def test_raises_when_no_filename_or_blob(self) -> None:
         with self.assertRaises(ValueError) as cm:
@@ -182,3 +184,12 @@ def test_log_converted(self) -> None:
         )
 
         self.assertEqual(text.strip(), content.strip())
+
+    def test_odt_converted(self) -> None:
+        content = self.fake.paragraph(nb_sentences=10)
+
+        odt = self.fake.odt_file(content=content)
+        self.config.width = 0
+        text = document_to_text(odt.data["filename"], config=self.config)
+
+        self.assertEqual(text.strip(), content)

From 21e2b81c03aa8927d996af0f092cab5085cab6cf Mon Sep 17 00:00:00 2001
From: Martin Burchell <mb2353@cam.ac.uk>
Date: Wed, 30 Apr 2025 08:54:15 +0100
Subject: [PATCH 11/39] Test PDF file conversion

---
 .../tests/extract_text_tests.py               | 65 +++++++++++++++----
 1 file changed, 52 insertions(+), 13 deletions(-)

diff --git a/cardinal_pythonlib/tests/extract_text_tests.py b/cardinal_pythonlib/tests/extract_text_tests.py
index c106d19..5dfaf8d 100644
--- a/cardinal_pythonlib/tests/extract_text_tests.py
+++ b/cardinal_pythonlib/tests/extract_text_tests.py
@@ -27,12 +27,13 @@
 
 import os
 import subprocess
-from tempfile import TemporaryDirectory, NamedTemporaryFile
+from tempfile import mkdtemp, NamedTemporaryFile
 from unittest import mock, TestCase
 
 from faker import Faker
 from faker_file.providers.docx_file import DocxFileProvider
 from faker_file.providers.odt_file import OdtFileProvider
+from faker_file.providers.pdf_file import PdfFileProvider
 
 from cardinal_pythonlib.extract_text import (
     document_to_text,
@@ -42,17 +43,15 @@
 
 
 class DocumentToTextTests(TestCase):
-    # For external tools we assume the tools are running correctly
-    # and we just check that they are invoked with the correct arguments.
     def setUp(self) -> None:
-        update_external_tools(
-            {
-                "antiword": "/path/to/antiword",
-            }
-        )
+        self.empty_dir = mkdtemp()
 
+        self._replace_external_tools_with_fakes()
         self.config = TextProcessingConfig()
+        self._create_mock_objects()
+        self._register_faker_providers()
 
+    def _create_mock_objects(self) -> None:
         # Some mock empty output that we don't check
         mock_decode = mock.Mock(return_value="")
         mock_stdout = mock.Mock(decode=mock_decode)
@@ -61,9 +60,29 @@ def setUp(self) -> None:
             return_value=mock.Mock(communicate=mock_communicate)
         )
 
+    def _register_faker_providers(self) -> None:
         self.fake = Faker()
         self.fake.add_provider(DocxFileProvider)
         self.fake.add_provider(OdtFileProvider)
+        self.fake.add_provider(PdfFileProvider)
+
+    def _replace_external_tools_with_fakes(self) -> None:
+        # For external tools we assume the tools are running correctly
+        # and we just check that they are invoked with the correct arguments.
+
+        tool_names = [
+            "antiword",
+            "pdftotext",
+            "strings",
+            "strings2",
+            "unrtf",
+        ]
+
+        tools_dir = {t: os.path.join(self.empty_dir, t) for t in tool_names}
+        update_external_tools(tools_dir)
+
+    def tearDown(self) -> None:
+        os.rmdir(self.empty_dir)
 
     def test_raises_when_no_filename_or_blob(self) -> None:
         with self.assertRaises(ValueError) as cm:
@@ -85,9 +104,8 @@ def test_raises_when_blob_but_no_extension(self) -> None:
 
     def test_raises_when_not_a_file(self) -> None:
         with self.assertRaises(ValueError) as cm:
-            with TemporaryDirectory() as temp_dir_name:
-                filename = os.path.join(temp_dir_name, "foo")
-                document_to_text(filename=filename)
+            filename = os.path.join(self.empty_dir, "foo")
+            document_to_text(filename=filename)
 
         self.assertIn("no such file", str(cm.exception))
 
@@ -113,7 +131,7 @@ def test_doc_converted_with_antiword(self) -> None:
         expected_calls = [
             mock.call(
                 (
-                    "/path/to/antiword",
+                    f"{self.empty_dir}/antiword",
                     "-w",
                     str(self.config.width),
                     temp_file.name,
@@ -135,7 +153,7 @@ def test_dot_converted_with_antiword(self) -> None:
         expected_calls = [
             mock.call(
                 (
-                    "/path/to/antiword",
+                    f"{self.empty_dir}/antiword",
                     "-w",
                     str(self.config.width),
                     temp_file.name,
@@ -193,3 +211,24 @@ def test_odt_converted(self) -> None:
         text = document_to_text(odt.data["filename"], config=self.config)
 
         self.assertEqual(text.strip(), content)
+
+    def test_pdf_converted(self) -> None:
+        with mock.patch.multiple(
+            "cardinal_pythonlib.extract_text.subprocess",
+            Popen=self.mock_popen,
+        ):
+            with NamedTemporaryFile(suffix=".pdf", delete=False) as temp_file:
+                temp_file.close()
+                document_to_text(filename=temp_file.name, config=self.config)
+
+        expected_calls = [
+            mock.call(
+                (
+                    f"{self.empty_dir}/pdftotext",
+                    temp_file.name,
+                    "-",
+                ),
+                stdout=subprocess.PIPE,
+            ),
+        ]
+        self.mock_popen.assert_has_calls(expected_calls)

From 82737b17df937cf89358eb71a0c580edf8a933c8 Mon Sep 17 00:00:00 2001
From: Martin Burchell <mb2353@cam.ac.uk>
Date: Wed, 30 Apr 2025 09:41:57 +0100
Subject: [PATCH 12/39] Test RTF file conversion

---
 .../tests/extract_text_tests.py               | 30 +++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/cardinal_pythonlib/tests/extract_text_tests.py b/cardinal_pythonlib/tests/extract_text_tests.py
index 5dfaf8d..a5dd724 100644
--- a/cardinal_pythonlib/tests/extract_text_tests.py
+++ b/cardinal_pythonlib/tests/extract_text_tests.py
@@ -232,3 +232,33 @@ def test_pdf_converted(self) -> None:
             ),
         ]
         self.mock_popen.assert_has_calls(expected_calls)
+
+    def test_rtf_converted(self) -> None:
+        with mock.patch(
+            "cardinal_pythonlib.extract_text.UNRTF_SUPPORTS_QUIET", True
+        ):
+            with mock.patch.multiple(
+                "cardinal_pythonlib.extract_text.subprocess",
+                Popen=self.mock_popen,
+            ):
+                with NamedTemporaryFile(
+                    suffix=".rtf", delete=False
+                ) as temp_file:
+                    temp_file.close()
+                    document_to_text(
+                        filename=temp_file.name, config=self.config
+                    )
+
+        expected_calls = [
+            mock.call(
+                (
+                    f"{self.empty_dir}/unrtf",
+                    "--text",
+                    "--nopict",
+                    "--quiet",
+                    temp_file.name,
+                ),
+                stdout=subprocess.PIPE,
+            ),
+        ]
+        self.mock_popen.assert_has_calls(expected_calls)

From 1427e823c8799cf47103b917c9dfb313e07e13ce Mon Sep 17 00:00:00 2001
From: Martin Burchell <mb2353@cam.ac.uk>
Date: Wed, 30 Apr 2025 14:45:59 +0100
Subject: [PATCH 13/39] Install Faker when building docs and running tests

---
 .github/scripts/install_test_python_packages.sh | 1 +
 docs/docs_requirements.txt                      | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/.github/scripts/install_test_python_packages.sh b/.github/scripts/install_test_python_packages.sh
index 12eb372..ef6fbf9 100755
--- a/.github/scripts/install_test_python_packages.sh
+++ b/.github/scripts/install_test_python_packages.sh
@@ -10,3 +10,4 @@ ${PYTHON} -m pip install xlrd
 ${PYTHON} -m pip install dogpile.cache==0.9.2  # Later versions incompatible
 ${PYTHON} -m pip install pytest
 ${PYTHON} -m pip install xhtml2pdf weasyprint pdfkit  # For PDF tests
+${PYTHON} -m pip install faker==13.3.1 faker-file'[common]'==0.17.13
diff --git a/docs/docs_requirements.txt b/docs/docs_requirements.txt
index 9ffc9b2..f489b7f 100644
--- a/docs/docs_requirements.txt
+++ b/docs/docs_requirements.txt
@@ -6,6 +6,8 @@ deform
 dogpile.cache==0.9.2
 # CRATE is on 4.2
 Django>=4.2,<5.0
+faker==13.3.1
+faker-file[common]==0.17.13
 libChEBIpy
 pdfkit
 pyramid==1.10.8

From d1f89770bc20414ebfcc966908ab42370c2234ae Mon Sep 17 00:00:00 2001
From: Martin Burchell <mb2353@cam.ac.uk>
Date: Wed, 30 Apr 2025 15:13:49 +0100
Subject: [PATCH 14/39] Test TXT file conversion

---
 cardinal_pythonlib/tests/extract_text_tests.py | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/cardinal_pythonlib/tests/extract_text_tests.py b/cardinal_pythonlib/tests/extract_text_tests.py
index a5dd724..5f467aa 100644
--- a/cardinal_pythonlib/tests/extract_text_tests.py
+++ b/cardinal_pythonlib/tests/extract_text_tests.py
@@ -34,6 +34,7 @@
 from faker_file.providers.docx_file import DocxFileProvider
 from faker_file.providers.odt_file import OdtFileProvider
 from faker_file.providers.pdf_file import PdfFileProvider
+from faker_file.providers.txt_file import TxtFileProvider
 
 from cardinal_pythonlib.extract_text import (
     document_to_text,
@@ -65,6 +66,7 @@ def _register_faker_providers(self) -> None:
         self.fake.add_provider(DocxFileProvider)
         self.fake.add_provider(OdtFileProvider)
         self.fake.add_provider(PdfFileProvider)
+        self.fake.add_provider(TxtFileProvider)
 
     def _replace_external_tools_with_fakes(self) -> None:
         # For external tools we assume the tools are running correctly
@@ -168,7 +170,9 @@ def test_docx_converted(self) -> None:
 
         docx = self.fake.docx_file(content=content)
         self.config.width = 0
-        text = document_to_text(docx.data["filename"], config=self.config)
+        text = document_to_text(
+            filename=docx.data["filename"], config=self.config
+        )
 
         self.assertEqual(text.strip(), content)
 
@@ -208,7 +212,9 @@ def test_odt_converted(self) -> None:
 
         odt = self.fake.odt_file(content=content)
         self.config.width = 0
-        text = document_to_text(odt.data["filename"], config=self.config)
+        text = document_to_text(
+            filename=odt.data["filename"], config=self.config
+        )
 
         self.assertEqual(text.strip(), content)
 
@@ -262,3 +268,10 @@ def test_rtf_converted(self) -> None:
             ),
         ]
         self.mock_popen.assert_has_calls(expected_calls)
+
+    def test_txt_converted(self) -> None:
+        content = self.fake.paragraph(nb_sentences=10)
+        txt_file = self.fake.txt_file(content=content)
+        text = document_to_text(filename=txt_file.data["filename"])
+
+        self.assertEqual(text.strip(), content)

From 10b1ac011f6c8d6cca7100026f1cd02e215d449e Mon Sep 17 00:00:00 2001
From: Martin Burchell <mb2353@cam.ac.uk>
Date: Wed, 30 Apr 2025 15:35:43 +0100
Subject: [PATCH 15/39] Test XML and anything else converted to text

---
 .../tests/extract_text_tests.py               | 39 ++++++++++++++++++-
 1 file changed, 38 insertions(+), 1 deletion(-)

diff --git a/cardinal_pythonlib/tests/extract_text_tests.py b/cardinal_pythonlib/tests/extract_text_tests.py
index 5f467aa..b715ba3 100644
--- a/cardinal_pythonlib/tests/extract_text_tests.py
+++ b/cardinal_pythonlib/tests/extract_text_tests.py
@@ -35,6 +35,7 @@
 from faker_file.providers.odt_file import OdtFileProvider
 from faker_file.providers.pdf_file import PdfFileProvider
 from faker_file.providers.txt_file import TxtFileProvider
+from faker_file.providers.xml_file import XmlFileProvider
 
 from cardinal_pythonlib.extract_text import (
     document_to_text,
@@ -62,11 +63,12 @@ def _create_mock_objects(self) -> None:
         )
 
     def _register_faker_providers(self) -> None:
-        self.fake = Faker()
+        self.fake = Faker("en-GB")
         self.fake.add_provider(DocxFileProvider)
         self.fake.add_provider(OdtFileProvider)
         self.fake.add_provider(PdfFileProvider)
         self.fake.add_provider(TxtFileProvider)
+        self.fake.add_provider(XmlFileProvider)
 
     def _replace_external_tools_with_fakes(self) -> None:
         # For external tools we assume the tools are running correctly
@@ -275,3 +277,38 @@ def test_txt_converted(self) -> None:
         text = document_to_text(filename=txt_file.data["filename"])
 
         self.assertEqual(text.strip(), content)
+
+    def test_xml_converted(self) -> None:
+        name = self.fake.name()
+        address = self.fake.address()
+
+        xml_file = self.fake.xml_file(
+            num_rows=1,
+            data_columns={
+                "name": name,
+                "address": address,
+            },
+        )
+        text = document_to_text(filename=xml_file.data["filename"])
+
+        self.assertEqual(text.strip(), f"{name}{address}")
+
+    def test_unsupported_converted(self) -> None:
+        with mock.patch.multiple(
+            "cardinal_pythonlib.extract_text.subprocess",
+            Popen=self.mock_popen,
+        ):
+            with NamedTemporaryFile(suffix=".exe", delete=False) as temp_file:
+                temp_file.close()
+                document_to_text(filename=temp_file.name, config=self.config)
+
+        expected_calls = [
+            mock.call(
+                (
+                    f"{self.empty_dir}/strings",
+                    temp_file.name,
+                ),
+                stdout=subprocess.PIPE,
+            ),
+        ]
+        self.mock_popen.assert_has_calls(expected_calls)

From 37d3257fb667c9d32066aeb741533972e202501c Mon Sep 17 00:00:00 2001
From: Martin Burchell <mb2353@cam.ac.uk>
Date: Tue, 6 May 2025 10:23:53 +0100
Subject: [PATCH 16/39] Fix name clashes with python built-ins

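The cardinal_pythonlib/email package can be found in place of the standard
library's email package (for example, when a cardinal_pythonlib file is run
directly), which prevents us from doing this in extract_text.py:

    from email.parser import Parser as EmailParser

Rename email to email_utils, and likewise json to json_utils and profile.py
to profiling.py, so that none of them share a name with a standard library
module.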
---
 .pre-commit-config.yaml                                |  4 ++++
 cardinal_pythonlib/django/fields/jsonclassfield.py     |  2 +-
 cardinal_pythonlib/django/function_cache.py            |  2 +-
 cardinal_pythonlib/django/middleware.py                | 10 +++++-----
 cardinal_pythonlib/{email => email_utils}/__init__.py  |  0
 .../{email => email_utils}/mailboxpurge.py             |  0
 cardinal_pythonlib/{email => email_utils}/sendmail.py  |  0
 .../{email => email_utils}/tests/sendmail_tests.py     |  0
 cardinal_pythonlib/{json => json_utils}/__init__.py    |  2 +-
 cardinal_pythonlib/{json => json_utils}/serialize.py   |  2 +-
 .../{json => json_utils}/typing_helpers.py             |  2 +-
 cardinal_pythonlib/{profile.py => profiling.py}        |  0
 .../tools/explore_clang_format_config.py               |  6 +++++-
 docs/source/conf.py                                    |  2 +-
 setup.cfg                                              |  2 +-
 15 files changed, 21 insertions(+), 13 deletions(-)
 rename cardinal_pythonlib/{email => email_utils}/__init__.py (100%)
 rename cardinal_pythonlib/{email => email_utils}/mailboxpurge.py (100%)
 rename cardinal_pythonlib/{email => email_utils}/sendmail.py (100%)
 rename cardinal_pythonlib/{email => email_utils}/tests/sendmail_tests.py (100%)
 rename cardinal_pythonlib/{json => json_utils}/__init__.py (95%)
 rename cardinal_pythonlib/{json => json_utils}/serialize.py (99%)
 rename cardinal_pythonlib/{json => json_utils}/typing_helpers.py (96%)
 rename cardinal_pythonlib/{profile.py => profiling.py} (100%)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 2c39bb5..1af96a3 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -18,10 +18,14 @@ repos:
     rev: 5.0.4
     hooks:
     -   id: flake8
+        additional_dependencies:
+        - flake8-builtins==2.5.0
 -   repo: https://github.com/asottile/yesqa
     rev: v1.5.0
     hooks:
     -   id: yesqa
+        additional_dependencies:
+        - flake8-builtins==2.5.0
 -   repo: https://github.com/pre-commit/pygrep-hooks
     rev: v1.9.0
     hooks:
diff --git a/cardinal_pythonlib/django/fields/jsonclassfield.py b/cardinal_pythonlib/django/fields/jsonclassfield.py
index 43fbaf5..2240707 100644
--- a/cardinal_pythonlib/django/fields/jsonclassfield.py
+++ b/cardinal_pythonlib/django/fields/jsonclassfield.py
@@ -130,7 +130,7 @@ def my_decoder_hook(d: Dict) -> Any:
 # noinspection PyUnresolvedReferences
 from django.db.models import TextField
 
-from cardinal_pythonlib.json.serialize import json_decode, json_encode
+from cardinal_pythonlib.json_utils.serialize import json_decode, json_encode
 
 
 # =============================================================================
diff --git a/cardinal_pythonlib/django/function_cache.py b/cardinal_pythonlib/django/function_cache.py
index 11127b1..a2416ab 100644
--- a/cardinal_pythonlib/django/function_cache.py
+++ b/cardinal_pythonlib/django/function_cache.py
@@ -36,7 +36,7 @@
 from django.core.cache import cache  # default cache
 
 from cardinal_pythonlib.logs import get_brace_style_log_with_null_handler
-from cardinal_pythonlib.json.serialize import json_encode
+from cardinal_pythonlib.json_utils.serialize import json_encode
 
 log = get_brace_style_log_with_null_handler(__name__)
 
diff --git a/cardinal_pythonlib/django/middleware.py b/cardinal_pythonlib/django/middleware.py
index b201a08..3d614a7 100644
--- a/cardinal_pythonlib/django/middleware.py
+++ b/cardinal_pythonlib/django/middleware.py
@@ -28,7 +28,7 @@
 
 import logging
 import os
-from re import compile
+import re
 import sys
 from typing import Optional
 
@@ -107,9 +107,9 @@ def process_exception(
 Modified according to: https://djangosnippets.org/snippets/2845/
 """
 
-# EXEMPT_URLS = [compile(settings.LOGIN_URL.lstrip('/'))]
+# EXEMPT_URLS = [re.compile(settings.LOGIN_URL.lstrip('/'))]
 # if hasattr(settings, 'LOGIN_EXEMPT_URLS'):
-#     EXEMPT_URLS += [compile(expr) for expr in settings.LOGIN_EXEMPT_URLS]
+#     EXEMPT_URLS += [re.compile(expr) for expr in settings.LOGIN_EXEMPT_URLS]
 #
 #
 # class LoginRequiredMiddleware:
@@ -166,10 +166,10 @@ def process_exception(
 # 3. RNC; composite of those patterns.
 # -----------------------------------------------------------------------------
 
-EXEMPT_URLS = [compile(settings.LOGIN_URL.lstrip("/"))]
+EXEMPT_URLS = [re.compile(settings.LOGIN_URL.lstrip("/"))]
 if hasattr(settings, "LOGIN_EXEMPT_URLS"):
     EXEMPT_URLS += [
-        compile(expr.lstrip("/")) for expr in settings.LOGIN_EXEMPT_URLS
+        re.compile(expr.lstrip("/")) for expr in settings.LOGIN_EXEMPT_URLS
     ]
 
 
diff --git a/cardinal_pythonlib/email/__init__.py b/cardinal_pythonlib/email_utils/__init__.py
similarity index 100%
rename from cardinal_pythonlib/email/__init__.py
rename to cardinal_pythonlib/email_utils/__init__.py
diff --git a/cardinal_pythonlib/email/mailboxpurge.py b/cardinal_pythonlib/email_utils/mailboxpurge.py
similarity index 100%
rename from cardinal_pythonlib/email/mailboxpurge.py
rename to cardinal_pythonlib/email_utils/mailboxpurge.py
diff --git a/cardinal_pythonlib/email/sendmail.py b/cardinal_pythonlib/email_utils/sendmail.py
similarity index 100%
rename from cardinal_pythonlib/email/sendmail.py
rename to cardinal_pythonlib/email_utils/sendmail.py
diff --git a/cardinal_pythonlib/email/tests/sendmail_tests.py b/cardinal_pythonlib/email_utils/tests/sendmail_tests.py
similarity index 100%
rename from cardinal_pythonlib/email/tests/sendmail_tests.py
rename to cardinal_pythonlib/email_utils/tests/sendmail_tests.py
diff --git a/cardinal_pythonlib/json/__init__.py b/cardinal_pythonlib/json_utils/__init__.py
similarity index 95%
rename from cardinal_pythonlib/json/__init__.py
rename to cardinal_pythonlib/json_utils/__init__.py
index 3256199..51cc3bd 100644
--- a/cardinal_pythonlib/json/__init__.py
+++ b/cardinal_pythonlib/json_utils/__init__.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-# cardinal_pythonlib/json/__init__.py
+# cardinal_pythonlib/json_utils/__init__.py
 
 """
 ===============================================================================
diff --git a/cardinal_pythonlib/json/serialize.py b/cardinal_pythonlib/json_utils/serialize.py
similarity index 99%
rename from cardinal_pythonlib/json/serialize.py
rename to cardinal_pythonlib/json_utils/serialize.py
index 3103a6e..eb3a434 100644
--- a/cardinal_pythonlib/json/serialize.py
+++ b/cardinal_pythonlib/json_utils/serialize.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-# cardinal_pythonlib/json/serialize.py
+# cardinal_pythonlib/json_utils/serialize.py
 
 """
 ===============================================================================
diff --git a/cardinal_pythonlib/json/typing_helpers.py b/cardinal_pythonlib/json_utils/typing_helpers.py
similarity index 96%
rename from cardinal_pythonlib/json/typing_helpers.py
rename to cardinal_pythonlib/json_utils/typing_helpers.py
index d5c6c18..47c7161 100644
--- a/cardinal_pythonlib/json/typing_helpers.py
+++ b/cardinal_pythonlib/json_utils/typing_helpers.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-# cardinal_pythonlib/json/typing_helpers.py
+# cardinal_pythonlib/json_utils/typing_helpers.py
 
 """
 ===============================================================================
diff --git a/cardinal_pythonlib/profile.py b/cardinal_pythonlib/profiling.py
similarity index 100%
rename from cardinal_pythonlib/profile.py
rename to cardinal_pythonlib/profiling.py
diff --git a/cardinal_pythonlib/tools/explore_clang_format_config.py b/cardinal_pythonlib/tools/explore_clang_format_config.py
index 94cb8b0..c6de7cc 100644
--- a/cardinal_pythonlib/tools/explore_clang_format_config.py
+++ b/cardinal_pythonlib/tools/explore_clang_format_config.py
@@ -60,7 +60,11 @@ def monitor_diff(filenames: List[str], meld_exe: str) -> subprocess.Popen:
 
 
 def clang_format(
-    config: str, src: str, dest: str, dir: str, clang_format_exe: str
+    config: str,
+    src: str,
+    dest: str,
+    dir: str,  # noqa: A002
+    clang_format_exe: str,
 ) -> None:
     """
     Rungs clang-format, formatting a source file to a destination file using a
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 9254629..f45f913 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -32,7 +32,7 @@
 
 project = "cardinal_pythonlib"
 # noinspection PyShadowingBuiltins
-copyright = "2009-2020, Rudolf Cardinal"
+copyright = "2009-2020, Rudolf Cardinal"  # noqa: A001
 author = "Rudolf Cardinal"
 
 # The short X.Y version
diff --git a/setup.cfg b/setup.cfg
index 7985ef3..0652958 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -13,7 +13,7 @@ linters=pycodestyle,pyflakes
 max-line-length=79
 # Not compatible with Black and not PEP8 apparently
 # E203: Whitespace before ':'
-extend-ignore = E203
+extend-ignore = A003,E203
 
 [mypy]
 # MyPy is a static type checker. It will not execute the code!

From 4ba8610400e8d53093d212f7caf766ae13f9fdd0 Mon Sep 17 00:00:00 2001
From: Martin Burchell <mb2353@cam.ac.uk>
Date: Tue, 6 May 2025 10:45:35 +0100
Subject: [PATCH 17/39] Ignore shadowing of python built-ins

---
 cardinal_pythonlib/sizeformatter.py              | 2 +-
 cardinal_pythonlib/tools/convert_mdb_to_mysql.py | 4 ++--
 setup.py                                         | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/cardinal_pythonlib/sizeformatter.py b/cardinal_pythonlib/sizeformatter.py
index 10169fe..69345e2 100644
--- a/cardinal_pythonlib/sizeformatter.py
+++ b/cardinal_pythonlib/sizeformatter.py
@@ -69,7 +69,7 @@ def sizeof_fmt(num: float, suffix: str = "B") -> str:
 
 def bytes2human(
     n: Union[int, float],
-    format: str = "%(value).1f %(symbol)s",
+    format: str = "%(value).1f %(symbol)s",  # noqa: A002
     symbols: str = "customary",
 ) -> str:
     """
diff --git a/cardinal_pythonlib/tools/convert_mdb_to_mysql.py b/cardinal_pythonlib/tools/convert_mdb_to_mysql.py
index 8f2b397..da85905 100644
--- a/cardinal_pythonlib/tools/convert_mdb_to_mysql.py
+++ b/cardinal_pythonlib/tools/convert_mdb_to_mysql.py
@@ -156,9 +156,9 @@ def __init__(
         nargs: Union[int, str] = "?",  # 0 or 1
         default: Any = None,
         required: bool = False,
-        type: Callable[[str], Any] = None,
+        type: Callable[[str], Any] = None,  # noqa: A002
         metavar: str = None,
-        help: str = None,
+        help: str = None,  # noqa: A002
     ) -> None:
         super(PasswordPromptAction, self).__init__(
             option_strings=option_strings,
diff --git a/setup.py b/setup.py
index 26489c1..2a3e6d4 100644
--- a/setup.py
+++ b/setup.py
@@ -17,7 +17,7 @@
 """
 
 from setuptools import setup, find_packages
-from codecs import open
+from codecs import open  # noqa: A004
 from os import path
 
 from cardinal_pythonlib.version_string import VERSION_STRING

From b0520cb813c651e82ee43c23a34054abdb9430ae Mon Sep 17 00:00:00 2001
From: Martin Burchell <mb2353@cam.ac.uk>
Date: Tue, 6 May 2025 10:47:53 +0100
Subject: [PATCH 18/39] Remove check for conflicting email import

---
 .../ensure_test_executed_correctly.py         | 40 -------------------
 cardinal_pythonlib/module_version.py          |  3 --
 2 files changed, 43 deletions(-)
 delete mode 100644 cardinal_pythonlib/ensure_test_executed_correctly.py

diff --git a/cardinal_pythonlib/ensure_test_executed_correctly.py b/cardinal_pythonlib/ensure_test_executed_correctly.py
deleted file mode 100644
index 6ae15f4..0000000
--- a/cardinal_pythonlib/ensure_test_executed_correctly.py
+++ /dev/null
@@ -1,40 +0,0 @@
-#!/usr/bin/env python
-# cardinal_pythonlib/module_version.py
-
-"""
-===============================================================================
-
-    Original code copyright (C) 2009-2022 Rudolf Cardinal (rudolf@pobox.com).
-
-    This file is part of cardinal_pythonlib.
-
-    Licensed under the Apache License, Version 2.0 (the "License");
-    you may not use this file except in compliance with the License.
-    You may obtain a copy of the License at
-
-        https://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software
-    distributed under the License is distributed on an "AS IS" BASIS,
-    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-    See the License for the specific language governing permissions and
-    limitations under the License.
-
-===============================================================================
-
-**Ensure that a library module is executed properly, and not via a way that
-breaks imports.**
-
-"""
-
-try:
-    # we want the stdlib email package!
-    from email import message_from_string  # noqa: F401
-except ImportError:
-    raise ImportError(
-        "A test of importing 'email' has found "
-        "cardinal_pythonlib/email/__init__.py, not the email package from "
-        "stdlib. You are probably running a cardinal_pythonlib file directly, "
-        "e.g. with 'python somefile.py' or '/path/somefile.py'. Instead, use "
-        "'python -m cardinal_pythonlib.somefile'."
-    )
diff --git a/cardinal_pythonlib/module_version.py b/cardinal_pythonlib/module_version.py
index f18f9a0..cb251fe 100644
--- a/cardinal_pythonlib/module_version.py
+++ b/cardinal_pythonlib/module_version.py
@@ -39,9 +39,6 @@
 
 from semantic_version import Version
 
-# noinspection PyUnresolvedReferences
-import cardinal_pythonlib.ensure_test_executed_correctly  # noqa: F401
-
 
 # =============================================================================
 # Report Python module versions

From d1b00b034e70289e360c467201da7a470233d576 Mon Sep 17 00:00:00 2001
From: Martin Burchell <mb2353@cam.ac.uk>
Date: Tue, 6 May 2025 10:55:42 +0100
Subject: [PATCH 19/39] Update docs

---
 docs/source/autodoc/_index.rst                | 13 +++++-----
 .../mailboxpurge.py.rst                       |  8 +++---
 .../{email => email_utils}/sendmail.py.rst    |  8 +++---
 .../tests/sendmail_tests.py.rst               |  8 +++---
 .../ensure_test_executed_correctly.py.rst     | 25 -------------------
 .../{json => json_utils}/serialize.py.rst     |  8 +++---
 .../typing_helpers.py.rst                     |  8 +++---
 .../{profile.py.rst => profiling.py.rst}      |  8 +++---
 8 files changed, 30 insertions(+), 56 deletions(-)
 rename docs/source/autodoc/{email => email_utils}/mailboxpurge.py.rst (77%)
 rename docs/source/autodoc/{email => email_utils}/sendmail.py.rst (79%)
 rename docs/source/autodoc/{email => email_utils}/tests/sendmail_tests.py.rst (75%)
 delete mode 100644 docs/source/autodoc/ensure_test_executed_correctly.py.rst
 rename docs/source/autodoc/{json => json_utils}/serialize.py.rst (79%)
 rename docs/source/autodoc/{json => json_utils}/typing_helpers.py.rst (77%)
 rename docs/source/autodoc/{profile.py.rst => profiling.py.rst} (83%)

diff --git a/docs/source/autodoc/_index.rst b/docs/source/autodoc/_index.rst
index 3a7fb7e..d910bf4 100644
--- a/docs/source/autodoc/_index.rst
+++ b/docs/source/autodoc/_index.rst
@@ -66,10 +66,9 @@ Automatic documentation of source code
     docker.py.rst
     dogpile_cache.py.rst
     dsp.py.rst
-    email/mailboxpurge.py.rst
-    email/sendmail.py.rst
-    email/tests/sendmail_tests.py.rst
-    ensure_test_executed_correctly.py.rst
+    email_utils/mailboxpurge.py.rst
+    email_utils/sendmail.py.rst
+    email_utils/tests/sendmail_tests.py.rst
     enumlike.py.rst
     excel.py.rst
     exceptions.py.rst
@@ -82,8 +81,8 @@ Automatic documentation of source code
     httpconst.py.rst
     interval.py.rst
     iterhelp.py.rst
-    json/serialize.py.rst
-    json/typing_helpers.py.rst
+    json_utils/serialize.py.rst
+    json_utils/typing_helpers.py.rst
     lang.py.rst
     lists.py.rst
     logs.py.rst
@@ -104,7 +103,7 @@ Automatic documentation of source code
     plot.py.rst
     probability.py.rst
     process.py.rst
-    profile.py.rst
+    profiling.py.rst
     progress.py.rst
     psychiatry/drugs.py.rst
     psychiatry/mk_r_druglists.py.rst
diff --git a/docs/source/autodoc/email/mailboxpurge.py.rst b/docs/source/autodoc/email_utils/mailboxpurge.py.rst
similarity index 77%
rename from docs/source/autodoc/email/mailboxpurge.py.rst
rename to docs/source/autodoc/email_utils/mailboxpurge.py.rst
index c49e933..a4c86da 100644
--- a/docs/source/autodoc/email/mailboxpurge.py.rst
+++ b/docs/source/autodoc/email_utils/mailboxpurge.py.rst
@@ -1,4 +1,4 @@
-.. docs/source/autodoc/email/mailboxpurge.py.rst
+.. docs/source/autodoc/email_utils/mailboxpurge.py.rst
 
 .. THIS FILE IS AUTOMATICALLY GENERATED. DO NOT EDIT.
 
@@ -18,8 +18,8 @@
     limitations under the License.
 
 
-cardinal_pythonlib.email.mailboxpurge
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+cardinal_pythonlib.email_utils.mailboxpurge
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. automodule:: cardinal_pythonlib.email.mailboxpurge
+.. automodule:: cardinal_pythonlib.email_utils.mailboxpurge
     :members:
diff --git a/docs/source/autodoc/email/sendmail.py.rst b/docs/source/autodoc/email_utils/sendmail.py.rst
similarity index 79%
rename from docs/source/autodoc/email/sendmail.py.rst
rename to docs/source/autodoc/email_utils/sendmail.py.rst
index 82327dc..e090e97 100644
--- a/docs/source/autodoc/email/sendmail.py.rst
+++ b/docs/source/autodoc/email_utils/sendmail.py.rst
@@ -1,4 +1,4 @@
-.. docs/source/autodoc/email/sendmail.py.rst
+.. docs/source/autodoc/email_utils/sendmail.py.rst
 
 .. THIS FILE IS AUTOMATICALLY GENERATED. DO NOT EDIT.
 
@@ -18,8 +18,8 @@
     limitations under the License.
 
 
-cardinal_pythonlib.email.sendmail
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+cardinal_pythonlib.email_utils.sendmail
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. automodule:: cardinal_pythonlib.email.sendmail
+.. automodule:: cardinal_pythonlib.email_utils.sendmail
     :members:
diff --git a/docs/source/autodoc/email/tests/sendmail_tests.py.rst b/docs/source/autodoc/email_utils/tests/sendmail_tests.py.rst
similarity index 75%
rename from docs/source/autodoc/email/tests/sendmail_tests.py.rst
rename to docs/source/autodoc/email_utils/tests/sendmail_tests.py.rst
index f209e33..22fa863 100644
--- a/docs/source/autodoc/email/tests/sendmail_tests.py.rst
+++ b/docs/source/autodoc/email_utils/tests/sendmail_tests.py.rst
@@ -1,4 +1,4 @@
-.. docs/source/autodoc/email/tests/sendmail_tests.py.rst
+.. docs/source/autodoc/email_utils/tests/sendmail_tests.py.rst
 
 .. THIS FILE IS AUTOMATICALLY GENERATED. DO NOT EDIT.
 
@@ -18,8 +18,8 @@
     limitations under the License.
 
 
-cardinal_pythonlib.email.tests.sendmail_tests
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+cardinal_pythonlib.email_utils.tests.sendmail_tests
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. automodule:: cardinal_pythonlib.email.tests.sendmail_tests
+.. automodule:: cardinal_pythonlib.email_utils.tests.sendmail_tests
     :members:
diff --git a/docs/source/autodoc/ensure_test_executed_correctly.py.rst b/docs/source/autodoc/ensure_test_executed_correctly.py.rst
deleted file mode 100644
index efde8de..0000000
--- a/docs/source/autodoc/ensure_test_executed_correctly.py.rst
+++ /dev/null
@@ -1,25 +0,0 @@
-.. docs/source/autodoc/ensure_test_executed_correctly.py.rst
-
-.. THIS FILE IS AUTOMATICALLY GENERATED. DO NOT EDIT.
-
-
-..  Copyright (C) 2009-2020 Rudolf Cardinal (rudolf@pobox.com).
-    .
-    Licensed under the Apache License, Version 2.0 (the "License");
-    you may not use this file except in compliance with the License.
-    You may obtain a copy of the License at
-    .
-        https://www.apache.org/licenses/LICENSE-2.0
-    .
-    Unless required by applicable law or agreed to in writing, software
-    distributed under the License is distributed on an "AS IS" BASIS,
-    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-    See the License for the specific language governing permissions and
-    limitations under the License.
-
-
-cardinal_pythonlib.ensure_test_executed_correctly
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. automodule:: cardinal_pythonlib.ensure_test_executed_correctly
-    :members:
diff --git a/docs/source/autodoc/json/serialize.py.rst b/docs/source/autodoc/json_utils/serialize.py.rst
similarity index 79%
rename from docs/source/autodoc/json/serialize.py.rst
rename to docs/source/autodoc/json_utils/serialize.py.rst
index 1868956..15e18c3 100644
--- a/docs/source/autodoc/json/serialize.py.rst
+++ b/docs/source/autodoc/json_utils/serialize.py.rst
@@ -1,4 +1,4 @@
-.. docs/source/autodoc/json/serialize.py.rst
+.. docs/source/autodoc/json_utils/serialize.py.rst
 
 .. THIS FILE IS AUTOMATICALLY GENERATED. DO NOT EDIT.
 
@@ -18,8 +18,8 @@
     limitations under the License.
 
 
-cardinal_pythonlib.json.serialize
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+cardinal_pythonlib.json_utils.serialize
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. automodule:: cardinal_pythonlib.json.serialize
+.. automodule:: cardinal_pythonlib.json_utils.serialize
     :members:
diff --git a/docs/source/autodoc/json/typing_helpers.py.rst b/docs/source/autodoc/json_utils/typing_helpers.py.rst
similarity index 77%
rename from docs/source/autodoc/json/typing_helpers.py.rst
rename to docs/source/autodoc/json_utils/typing_helpers.py.rst
index d6125b1..e53154e 100644
--- a/docs/source/autodoc/json/typing_helpers.py.rst
+++ b/docs/source/autodoc/json_utils/typing_helpers.py.rst
@@ -1,4 +1,4 @@
-.. docs/source/autodoc/json/typing_helpers.py.rst
+.. docs/source/autodoc/json_utils/typing_helpers.py.rst
 
 .. THIS FILE IS AUTOMATICALLY GENERATED. DO NOT EDIT.
 
@@ -18,8 +18,8 @@
     limitations under the License.
 
 
-cardinal_pythonlib.json.typing_helpers
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+cardinal_pythonlib.json_utils.typing_helpers
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. automodule:: cardinal_pythonlib.json.typing_helpers
+.. automodule:: cardinal_pythonlib.json_utils.typing_helpers
     :members:
diff --git a/docs/source/autodoc/profile.py.rst b/docs/source/autodoc/profiling.py.rst
similarity index 83%
rename from docs/source/autodoc/profile.py.rst
rename to docs/source/autodoc/profiling.py.rst
index 6149c87..1c40074 100644
--- a/docs/source/autodoc/profile.py.rst
+++ b/docs/source/autodoc/profiling.py.rst
@@ -1,4 +1,4 @@
-.. docs/source/autodoc/profile.py.rst
+.. docs/source/autodoc/profiling.py.rst
 
 .. THIS FILE IS AUTOMATICALLY GENERATED. DO NOT EDIT.
 
@@ -18,8 +18,8 @@
     limitations under the License.
 
 
-cardinal_pythonlib.profile
-~~~~~~~~~~~~~~~~~~~~~~~~~~
+cardinal_pythonlib.profiling
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. automodule:: cardinal_pythonlib.profile
+.. automodule:: cardinal_pythonlib.profiling
     :members:

From dc511c77cbc3e906465740c48ce8863ef6ef16c4 Mon Sep 17 00:00:00 2001
From: Martin Burchell <mb2353@cam.ac.uk>
Date: Tue, 6 May 2025 11:03:07 +0100
Subject: [PATCH 20/39] Fixups following module renaming

---
 cardinal_pythonlib/bulk_email/main.py                  | 2 +-
 cardinal_pythonlib/bulk_email/models.py                | 2 +-
 cardinal_pythonlib/email_utils/__init__.py             | 2 +-
 cardinal_pythonlib/email_utils/mailboxpurge.py         | 2 +-
 cardinal_pythonlib/email_utils/sendmail.py             | 2 +-
 cardinal_pythonlib/email_utils/tests/sendmail_tests.py | 4 ++--
 cardinal_pythonlib/profiling.py                        | 2 +-
 setup.py                                               | 2 +-
 8 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/cardinal_pythonlib/bulk_email/main.py b/cardinal_pythonlib/bulk_email/main.py
index 7757574..a70f19e 100644
--- a/cardinal_pythonlib/bulk_email/main.py
+++ b/cardinal_pythonlib/bulk_email/main.py
@@ -57,7 +57,7 @@
     Recipient,
     SendAttempt,
 )
-from cardinal_pythonlib.email.sendmail import (
+from cardinal_pythonlib.email_utils.sendmail import (
     CONTENT_TYPE_HTML,
     CONTENT_TYPE_TEXT,
     is_email_valid,
diff --git a/cardinal_pythonlib/bulk_email/models.py b/cardinal_pythonlib/bulk_email/models.py
index c6d210f..a01b76c 100644
--- a/cardinal_pythonlib/bulk_email/models.py
+++ b/cardinal_pythonlib/bulk_email/models.py
@@ -63,7 +63,7 @@
     USERNAME_MAX_LENGTH,
 )
 from cardinal_pythonlib.colander_utils import EMAIL_ADDRESS_MAX_LEN
-from cardinal_pythonlib.email.sendmail import (
+from cardinal_pythonlib.email_utils.sendmail import (
     ASCII,
     CONTENT_TYPE_TEXT,
     is_email_valid,
diff --git a/cardinal_pythonlib/email_utils/__init__.py b/cardinal_pythonlib/email_utils/__init__.py
index 61be984..c94078a 100644
--- a/cardinal_pythonlib/email_utils/__init__.py
+++ b/cardinal_pythonlib/email_utils/__init__.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-# cardinal_pythonlib/email/__init__.py
+# cardinal_pythonlib/email_utils/__init__.py
 
 """
 ===============================================================================
diff --git a/cardinal_pythonlib/email_utils/mailboxpurge.py b/cardinal_pythonlib/email_utils/mailboxpurge.py
index cae5c07..1f52b51 100755
--- a/cardinal_pythonlib/email_utils/mailboxpurge.py
+++ b/cardinal_pythonlib/email_utils/mailboxpurge.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-# cardinal_pythonlib/email/mailboxpurge.py
+# cardinal_pythonlib/email_utils/mailboxpurge.py
 
 """
 Remove all binary attachments from email messages
diff --git a/cardinal_pythonlib/email_utils/sendmail.py b/cardinal_pythonlib/email_utils/sendmail.py
index a286fb8..edebe34 100755
--- a/cardinal_pythonlib/email_utils/sendmail.py
+++ b/cardinal_pythonlib/email_utils/sendmail.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-# cardinal_pythonlib/email/sendmail.py
+# cardinal_pythonlib/email_utils/sendmail.py
 
 """
 ===============================================================================
diff --git a/cardinal_pythonlib/email_utils/tests/sendmail_tests.py b/cardinal_pythonlib/email_utils/tests/sendmail_tests.py
index 7e3107d..3c4eb37 100644
--- a/cardinal_pythonlib/email_utils/tests/sendmail_tests.py
+++ b/cardinal_pythonlib/email_utils/tests/sendmail_tests.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-# cardinal_pythonlib/email/tests/sendmail_tests.py
+# cardinal_pythonlib/email_utils/tests/sendmail_tests.py
 
 """
 ===============================================================================
@@ -28,7 +28,7 @@
 
 import unittest
 
-from cardinal_pythonlib.email.sendmail import is_email_valid
+from cardinal_pythonlib.email_utils.sendmail import is_email_valid
 
 
 class TestIsEmailValid(unittest.TestCase):
diff --git a/cardinal_pythonlib/profiling.py b/cardinal_pythonlib/profiling.py
index 558e13a..a06074e 100644
--- a/cardinal_pythonlib/profiling.py
+++ b/cardinal_pythonlib/profiling.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-# cardinal_pythonlib/profile.py
+# cardinal_pythonlib/profiling.py
 
 """
 ===============================================================================
diff --git a/setup.py b/setup.py
index 2a3e6d4..553ff11 100644
--- a/setup.py
+++ b/setup.py
@@ -192,7 +192,7 @@
             "cardinalpythonlib_chebi=cardinal_pythonlib.chebi:main",
             (
                 "cardinalpythonlib_email="
-                "cardinal_pythonlib.email.sendmail:main"
+                "cardinal_pythonlib.email_utils.sendmail:main"
             ),
             (
                 "cardinalpythonlib_extract_text="

From be15403e7d5ad8273d08935bab812d8e9e9b51c8 Mon Sep 17 00:00:00 2001
From: Martin Burchell <mb2353@cam.ac.uk>
Date: Wed, 7 May 2025 06:51:47 +0100
Subject: [PATCH 21/39] extract_text.py type hints

---
 cardinal_pythonlib/extract_text.py | 17 +++++++++--------
 setup.cfg                          |  3 +++
 2 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/cardinal_pythonlib/extract_text.py b/cardinal_pythonlib/extract_text.py
index 41dbd52..96df52d 100755
--- a/cardinal_pythonlib/extract_text.py
+++ b/cardinal_pythonlib/extract_text.py
@@ -87,6 +87,7 @@
 import sys
 import textwrap
 from typing import (
+    Any,
     BinaryIO,
     Dict,
     Generator,
@@ -205,9 +206,9 @@ def __init__(
         plain: bool = False,
         semiplain: bool = False,
         docx_in_order: bool = True,
-        horizontal_char="─",
-        vertical_char="│",
-        junction_char="┼",
+        horizontal_char: str = "─",
+        vertical_char: str = "│",
+        junction_char: str = "┼",
         plain_table_start: str = None,
         plain_table_end: str = None,
         plain_table_col_boundary: str = None,
@@ -445,7 +446,7 @@ def get_file_contents_text(
     )
 
 
-def get_cmd_output(*args, encoding: str = SYS_ENCODING) -> str:
+def get_cmd_output(*args: Any, encoding: str = SYS_ENCODING) -> str:
     """
     Returns text output of a command.
     """
@@ -456,7 +457,7 @@ def get_cmd_output(*args, encoding: str = SYS_ENCODING) -> str:
 
 
 def get_cmd_output_from_stdin(
-    stdint_content_binary: bytes, *args, encoding: str = SYS_ENCODING
+    stdint_content_binary: bytes, *args: Any, encoding: str = SYS_ENCODING
 ) -> str:
     """
     Returns text output of a command, passing binary data in via stdin.
@@ -559,7 +560,7 @@ def availability_pdf() -> bool:
 )
 
 
-def docx_qn(tagroot):
+def docx_qn(tagroot: str) -> str:
     return f"{{{DOCX_SCHEMA_URL}}}{tagroot}"
 
 
@@ -624,7 +625,7 @@ def docx_gen_wordwrapped_fragments(
     """
     to_wrap = []  # type: List[DocxFragment]
 
-    def yield_wrapped():
+    def yield_wrapped() -> Generator[str, None, None]:
         """
         Yield the word-wrapped stuff to date.
         """
@@ -1267,7 +1268,7 @@ def availability_anything() -> bool:
 # Decider
 # =============================================================================
 
-ext_map = {
+ext_map: dict[str, dict[str, Any]] = {
     # Converter functions must be of the form: func(filename, blob, config).
     # Availability must be either a boolean literal or a function that takes no
     # params.
diff --git a/setup.cfg b/setup.cfg
index 0652958..b922b87 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -22,3 +22,6 @@ no_strict_optional = True
 allow_redefinition = True
 disallow_untyped_defs = True
 disallow_incomplete_defs = True
+
+[mypy-semantic_version.*]
+ignore_missing_imports = True

From c9a06cec691dbbe33dfd42367ca3623994a5c179 Mon Sep 17 00:00:00 2001
From: Martin Burchell <mb2353@cam.ac.uk>
Date: Wed, 7 May 2025 06:52:38 +0100
Subject: [PATCH 22/39] Use html.parser for BeautifulSoup

---
 cardinal_pythonlib/extract_text.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cardinal_pythonlib/extract_text.py b/cardinal_pythonlib/extract_text.py
index 96df52d..fc2431a 100755
--- a/cardinal_pythonlib/extract_text.py
+++ b/cardinal_pythonlib/extract_text.py
@@ -1134,7 +1134,7 @@ def convert_html_to_text(
     Converts HTML to text.
     """
     with get_filelikeobject(filename, blob) as fp:
-        soup = bs4.BeautifulSoup(fp)
+        soup = bs4.BeautifulSoup(fp, "html.parser")
         return soup.get_text()
 
 

From 761e404fc1752f8e360f1d06a46d1a0745a0b7a0 Mon Sep 17 00:00:00 2001
From: Martin Burchell <mb2353@cam.ac.uk>
Date: Fri, 9 May 2025 16:53:31 +0100
Subject: [PATCH 23/39] Support .eml text extraction

---
 .../scripts/install_test_python_packages.sh   |   2 +-
 cardinal_pythonlib/extract_text.py            |  69 +++++++++
 .../tests/extract_text_tests.py               | 137 +++++++++++++++++-
 3 files changed, 204 insertions(+), 4 deletions(-)

diff --git a/.github/scripts/install_test_python_packages.sh b/.github/scripts/install_test_python_packages.sh
index ef6fbf9..129d97e 100755
--- a/.github/scripts/install_test_python_packages.sh
+++ b/.github/scripts/install_test_python_packages.sh
@@ -10,4 +10,4 @@ ${PYTHON} -m pip install xlrd
 ${PYTHON} -m pip install dogpile.cache==0.9.2  # Later versions incompatible
 ${PYTHON} -m pip install pytest
 ${PYTHON} -m pip install xhtml2pdf weasyprint pdfkit  # For PDF tests
-${PYTHON} -m pip install faker==13.3.1 faker-file'[common]'==0.17.13
+${PYTHON} -m pip install faker==13.3.1 faker-file'[common]'==0.18.3
diff --git a/cardinal_pythonlib/extract_text.py b/cardinal_pythonlib/extract_text.py
index fc2431a..22d1ef0 100755
--- a/cardinal_pythonlib/extract_text.py
+++ b/cardinal_pythonlib/extract_text.py
@@ -77,9 +77,14 @@
 # =============================================================================
 
 import argparse
+import base64
+from email import policy
+from email.message import EmailMessage
+from email.parser import BytesParser
 from io import StringIO
 import io
 import logging
+from mimetypes import guess_extension
 import os
 import re
 import shutil
@@ -1230,6 +1235,69 @@ def availability_doc() -> bool:
     return bool(antiword)
 
 
+# =============================================================================
+# EML
+# =============================================================================
+
+
+def convert_eml_to_text(
+    filename: str = None,
+    blob: bytes = None,
+    config: TextProcessingConfig = _DEFAULT_CONFIG,
+) -> str:
+    email_content_list: list[str] = []
+
+    with get_filelikeobject(filename, blob) as fp:
+        parser = BytesParser(policy=policy.default)  # type: ignore[arg-type]
+        message = parser.parse(fp)
+
+        for email_content in _gen_email_content(message, config):
+            if email_content is not None:
+                email_content_list.append(email_content)
+
+    text = "\n".join(email_content_list)
+
+    return text
+
+
+def _gen_email_content(
+    message: EmailMessage, config: TextProcessingConfig
+) -> Generator[Optional[str], None, None]:
+    body = message.get_body(
+        preferencelist=(
+            "html",
+            "plain",
+        )
+    )  # type: ignore[attr-defined]
+    if body is not None:
+        yield _get_email_content(body, config)
+
+    for part in message.iter_attachments():  # type: ignore[attr-defined]
+        yield _get_email_content(part, config)
+
+
+def _get_email_content(
+    message: EmailMessage,
+    config: TextProcessingConfig,
+) -> Optional[str]:
+    content_type = message.get_content_type()
+    ext = guess_extension(content_type)
+
+    if ext is not None and ext in ext_map:
+        content = message.get_content()
+        if isinstance(content, str):
+            charset = message["Content-Type"].params["charset"]
+            blob = content.encode(charset)
+        elif isinstance(content, EmailMessage):
+            blob = content.as_bytes()
+            if message.get("Content-Transfer-Encoding") == "base64":
+                blob = base64.b64decode(blob)
+        else:
+            blob = content
+
+        return document_to_text(blob=blob, extension=ext, config=config)
+
+
 # =============================================================================
 # Anything
 # =============================================================================
@@ -1277,6 +1345,7 @@ def availability_anything() -> bool:
     ".docm": {CONVERTER: convert_docx_to_text, AVAILABILITY: True},
     ".docx": {CONVERTER: convert_docx_to_text, AVAILABILITY: True},
     ".dot": {CONVERTER: convert_doc_to_text, AVAILABILITY: availability_doc},
+    ".eml": {CONVERTER: convert_eml_to_text, AVAILABILITY: True},
     ".htm": {CONVERTER: convert_html_to_text, AVAILABILITY: True},
     ".html": {CONVERTER: convert_html_to_text, AVAILABILITY: True},
     ".log": {CONVERTER: get_file_contents_text, AVAILABILITY: True},
diff --git a/cardinal_pythonlib/tests/extract_text_tests.py b/cardinal_pythonlib/tests/extract_text_tests.py
index b715ba3..f81808b 100644
--- a/cardinal_pythonlib/tests/extract_text_tests.py
+++ b/cardinal_pythonlib/tests/extract_text_tests.py
@@ -25,6 +25,8 @@
 
 """
 
+from email import message_from_string, policy
+from email.message import EmailMessage
 import os
 import subprocess
 from tempfile import mkdtemp, NamedTemporaryFile
@@ -32,8 +34,12 @@
 
 from faker import Faker
 from faker_file.providers.docx_file import DocxFileProvider
+from faker_file.providers.eml_file import EmlFileProvider
+from faker_file.providers.helpers.inner import (
+    create_inner_docx_file,
+    create_inner_eml_file,
+)
 from faker_file.providers.odt_file import OdtFileProvider
-from faker_file.providers.pdf_file import PdfFileProvider
 from faker_file.providers.txt_file import TxtFileProvider
 from faker_file.providers.xml_file import XmlFileProvider
 
@@ -63,10 +69,11 @@ def _create_mock_objects(self) -> None:
         )
 
     def _register_faker_providers(self) -> None:
-        self.fake = Faker("en-GB")
+        self.fake = Faker("en-US")  # To avoid Lorem Ipsum
+        self.fake.seed_instance(12345)
         self.fake.add_provider(DocxFileProvider)
+        self.fake.add_provider(EmlFileProvider)
         self.fake.add_provider(OdtFileProvider)
-        self.fake.add_provider(PdfFileProvider)
         self.fake.add_provider(TxtFileProvider)
         self.fake.add_provider(XmlFileProvider)
 
@@ -293,6 +300,130 @@ def test_xml_converted(self) -> None:
 
         self.assertEqual(text.strip(), f"{name}{address}")
 
+    def test_eml_converted(self) -> None:
+        content = self.fake.paragraph(nb_sentences=10)
+        eml_file = self.fake.eml_file(content=content)
+        text = document_to_text(filename=eml_file.data["filename"])
+
+        self.assertEqual(text.strip(), content)
+
+    def test_eml_with_docx_attachment_converted(self) -> None:
+        body_content = self.fake.paragraph(nb_sentences=10)
+        docx_content = self.fake.paragraph(nb_sentences=10)
+
+        docx_file_args = dict(content=docx_content)
+        options = dict(
+            count=1,
+            create_inner_file_func=create_inner_docx_file,
+            create_inner_file_args=docx_file_args,
+        )
+
+        eml_file = self.fake.eml_file(
+            content=body_content,
+            options=options,
+        )
+        self.config.width = 0
+        text = document_to_text(
+            filename=eml_file.data["filename"], config=self.config
+        )
+
+        self.assertIn(body_content, text)
+        self.assertIn(docx_content, text)
+
+    def test_eml_with_nested_docx_attachment_converted(self) -> None:
+        outer_email_content = self.fake.paragraph(nb_sentences=10)
+        inner_email_content = self.fake.paragraph(nb_sentences=10)
+
+        docx_content = self.fake.paragraph(nb_sentences=10)
+
+        docx_file_args = dict(content=docx_content)
+        docx_options = dict(
+            count=1,
+            create_inner_file_func=create_inner_docx_file,
+            create_inner_file_args=docx_file_args,
+        )
+        eml_file_args = dict(
+            content=inner_email_content,
+            options=docx_options,
+        )
+        eml_options = dict(
+            count=1,
+            create_inner_file_func=create_inner_eml_file,
+            create_inner_file_args=eml_file_args,
+        )
+
+        eml_file = self.fake.eml_file(
+            content=outer_email_content,
+            options=eml_options,
+        )
+
+        self.config.width = 0
+        text = document_to_text(
+            filename=eml_file.data["filename"], config=self.config
+        )
+
+        self.assertIn(outer_email_content, text)
+        self.assertIn(inner_email_content, text)
+        self.assertIn(docx_content, text)
+
+    def test_eml_html_body_preferred_over_text(self) -> None:
+        # Contrived example. Normally these would have the same content
+        text_content = self.fake.paragraph(nb_sentences=10)
+        html_content = self.fake.paragraph(nb_sentences=10)
+        html = f"""
+<!DOCTYPE html>
+<html>
+<head>
+</head>
+<body>
+{html_content}
+</body>
+</html>
+"""
+        # faker-file can't do this yet
+        message = EmailMessage()
+        message.set_content(text_content)
+        message.add_alternative(html, subtype="html")
+        blob = message.as_bytes()
+
+        text = document_to_text(
+            blob=blob, extension=".eml", config=self.config
+        )
+
+        self.assertIn(html_content, text)
+        self.assertNotIn(text_content, text)
+
+    def test_eml_latin1_html_decoded_correctly(self) -> None:
+        content = """From: foo@example.org
+To: bar@example.org
+Subject: Latin-1 test
+Content-Type: multipart/mixed; boundary="==="
+MIME-Version: 1.0
+
+--===
+Content-Type: text/html; charset="iso-8859-1"
+Content-Transfer-Encoding: quoted-printable
+
+<html><head>
+<meta http-equiv=3D"Content-Type" content=3D"text/html; charset=3Diso-8859-=
+1">
+</head>
+<body lang=3D"EN-GB">
+Caf=E9
+</body>
+</html>
+--===--
+"""
+
+        message = message_from_string(content, policy=policy.default)
+        blob = message.as_bytes()
+
+        text = document_to_text(
+            blob=blob, extension=".eml", config=self.config
+        )
+
+        self.assertIn("Café", text)
+
     def test_unsupported_converted(self) -> None:
         with mock.patch.multiple(
             "cardinal_pythonlib.extract_text.subprocess",

From 75b9ce6b99922aab05f8ef183dea808a76ce96fa Mon Sep 17 00:00:00 2001
From: Martin Burchell <mb2353@cam.ac.uk>
Date: Fri, 9 May 2025 16:54:05 +0100
Subject: [PATCH 24/39] Replace deprecated BeautifulStoneSoup as advised

---
 cardinal_pythonlib/extract_text.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cardinal_pythonlib/extract_text.py b/cardinal_pythonlib/extract_text.py
index 22d1ef0..0a035d9 100755
--- a/cardinal_pythonlib/extract_text.py
+++ b/cardinal_pythonlib/extract_text.py
@@ -1158,7 +1158,7 @@ def convert_xml_to_text(
     Converts XML to text.
     """
     with get_filelikeobject(filename, blob) as fp:
-        soup = bs4.BeautifulStoneSoup(fp)
+        soup = bs4.BeautifulSoup(fp, features="xml")
         return soup.get_text()
 
 

From e58d8fddfdc0c4f4e2f81f6dfd7a3032b5449b68 Mon Sep 17 00:00:00 2001
From: Martin Burchell <mb2353@cam.ac.uk>
Date: Fri, 9 May 2025 20:33:19 +0100
Subject: [PATCH 25/39] Default to UTF-8 when no charset in emails

---
 cardinal_pythonlib/extract_text.py            |  2 +-
 .../tests/extract_text_tests.py               | 27 +++++++++++++++++++
 2 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/cardinal_pythonlib/extract_text.py b/cardinal_pythonlib/extract_text.py
index 0a035d9..4657580 100755
--- a/cardinal_pythonlib/extract_text.py
+++ b/cardinal_pythonlib/extract_text.py
@@ -1286,7 +1286,7 @@ def _get_email_content(
     if ext is not None and ext in ext_map:
         content = message.get_content()
         if isinstance(content, str):
-            charset = message["Content-Type"].params["charset"]
+            charset = message["Content-Type"].params.get("charset", "utf-8")
             blob = content.encode(charset)
         elif isinstance(content, EmailMessage):
             blob = content.as_bytes()
diff --git a/cardinal_pythonlib/tests/extract_text_tests.py b/cardinal_pythonlib/tests/extract_text_tests.py
index f81808b..77feacc 100644
--- a/cardinal_pythonlib/tests/extract_text_tests.py
+++ b/cardinal_pythonlib/tests/extract_text_tests.py
@@ -424,6 +424,33 @@ def test_eml_latin1_html_decoded_correctly(self) -> None:
 
         self.assertIn("Café", text)
 
+    def test_eml_with_no_charset_converted(self) -> None:
+        text_content = self.fake.paragraph(nb_sentences=10)
+
+        content = f"""From: bar@example.org
+Subject: No charset
+To: foo@example.org
+Mime-Version: 1.0
+Content-Type: multipart/mixed;boundary="==="
+
+--===
+Content-Type: text/plain
+
+{text_content}
+
+--===--
+
+"""
+
+        message = message_from_string(content, policy=policy.default)
+        blob = message.as_bytes()
+
+        text = document_to_text(
+            blob=blob, extension=".eml", config=self.config
+        )
+
+        self.assertIn(text_content, text)
+
     def test_unsupported_converted(self) -> None:
         with mock.patch.multiple(
             "cardinal_pythonlib.extract_text.subprocess",

From 4a11b4983a93b0829025e8d6c27ddd755e548f05 Mon Sep 17 00:00:00 2001
From: Martin Burchell <mb2353@cam.ac.uk>
Date: Sat, 10 May 2025 06:57:24 +0100
Subject: [PATCH 26/39] Default to UTF-8 when no content type header in emails

---
 cardinal_pythonlib/extract_text.py            |  5 +++-
 .../tests/extract_text_tests.py               | 26 +++++++++++++++++++
 2 files changed, 30 insertions(+), 1 deletion(-)

diff --git a/cardinal_pythonlib/extract_text.py b/cardinal_pythonlib/extract_text.py
index 4657580..b311ca1 100755
--- a/cardinal_pythonlib/extract_text.py
+++ b/cardinal_pythonlib/extract_text.py
@@ -1286,7 +1286,10 @@ def _get_email_content(
     if ext is not None and ext in ext_map:
         content = message.get_content()
         if isinstance(content, str):
-            charset = message["Content-Type"].params.get("charset", "utf-8")
+            charset = "utf-8"
+            content_type_header = message.get("Content-Type")
+            if content_type_header:
+                charset = content_type_header.params.get("charset", "utf-8")
             blob = content.encode(charset)
         elif isinstance(content, EmailMessage):
             blob = content.as_bytes()
diff --git a/cardinal_pythonlib/tests/extract_text_tests.py b/cardinal_pythonlib/tests/extract_text_tests.py
index 77feacc..752c41d 100644
--- a/cardinal_pythonlib/tests/extract_text_tests.py
+++ b/cardinal_pythonlib/tests/extract_text_tests.py
@@ -440,6 +440,32 @@ def test_eml_with_no_charset_converted(self) -> None:
 
 --===--
 
+"""
+
+        message = message_from_string(content, policy=policy.default)
+        blob = message.as_bytes()
+
+        text = document_to_text(
+            blob=blob, extension=".eml", config=self.config
+        )
+
+        self.assertIn(text_content, text)
+
+    def test_eml_with_no_content_type_converted(self) -> None:
+        text_content = self.fake.paragraph(nb_sentences=10)
+
+        content = f"""From: bar@example.org
+Subject: No content type
+To: foo@example.org
+Mime-Version: 1.0
+Content-Type: multipart/mixed;boundary="==="
+
+--===
+
+{text_content}
+
+--===--
+
 """
 
         message = message_from_string(content, policy=policy.default)

From 5fb204f16565d6d86aeb84ab9685fad1f24f5fb3 Mon Sep 17 00:00:00 2001
From: Martin Burchell <mb2353@cam.ac.uk>
Date: Mon, 12 May 2025 11:23:30 +0100
Subject: [PATCH 27/39] Allow docx files to include document files with
 document[nn].xml form

I don't know if this is deviating from the standard but I have seen one example of this in the real world
---
 cardinal_pythonlib/extract_text.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/cardinal_pythonlib/extract_text.py b/cardinal_pythonlib/extract_text.py
index b311ca1..3432080 100755
--- a/cardinal_pythonlib/extract_text.py
+++ b/cardinal_pythonlib/extract_text.py
@@ -555,10 +555,10 @@ def availability_pdf() -> bool:
 # -----------------------------------------------------------------------------
 # In a D.I.Y. fashion
 # -----------------------------------------------------------------------------
-# DOCX specification: http://www.ecma-international.org/news/TC45_current_work/TC45_available_docs.htm  # noqa: E501
+# DOCX specification: https://ecma-international.org/publications-and-standards/standards/ecma-376/  # noqa: E501
 
 DOCX_HEADER_FILE_REGEX = re.compile("word/header[0-9]*.xml")
-DOCX_DOC_FILE = "word/document.xml"
+DOCX_DOCUMENT_FILE_REGEX = re.compile("word/document[0-9]*.xml")
 DOCX_FOOTER_FILE_REGEX = re.compile("word/footer[0-9]*.xml")
 DOCX_SCHEMA_URL = (
     "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
@@ -601,7 +601,9 @@ def gen_xml_files_from_docx(fp: BinaryIO) -> Iterator[str]:
         for filename in filelist:
             if DOCX_HEADER_FILE_REGEX.match(filename):
                 yield z.read(filename).decode("utf8")
-        yield z.read(DOCX_DOC_FILE)
+        for filename in filelist:
+            if DOCX_DOCUMENT_FILE_REGEX.match(filename):
+                yield z.read(filename)
         for filename in filelist:
             if DOCX_FOOTER_FILE_REGEX.match(filename):
                 yield z.read(filename).decode("utf8")

From de72344f763ddf6ba3c8dd6d006192bb713abb3a Mon Sep 17 00:00:00 2001
From: Martin Burchell <mb2353@cam.ac.uk>
Date: Mon, 12 May 2025 15:00:41 +0100
Subject: [PATCH 28/39] Allow blobs to be empty when extracting text

It is possible to have an email with an empty body. Other scenarios (empty HTML, docx etc)
are pretty unlikely
---
 cardinal_pythonlib/extract_text.py            |  8 +++---
 .../tests/extract_text_tests.py               | 25 +++++++++++++++++++
 2 files changed, 29 insertions(+), 4 deletions(-)

diff --git a/cardinal_pythonlib/extract_text.py b/cardinal_pythonlib/extract_text.py
index 3432080..f6c0250 100755
--- a/cardinal_pythonlib/extract_text.py
+++ b/cardinal_pythonlib/extract_text.py
@@ -358,7 +358,7 @@ def get_filelikeobject(filename: str = None, blob: bytes = None) -> BinaryIO:
     Returns:
         a :class:`BinaryIO` object
     """
-    if not filename and not blob:
+    if filename is None and blob is None:
         raise ValueError("no filename and no blob")
     if filename and blob:
         raise ValueError("specify either filename or blob")
@@ -373,11 +373,11 @@ def get_file_contents(filename: str = None, blob: bytes = None) -> bytes:
     """
     Returns the binary contents of a file, or of a BLOB.
     """
-    if not filename and not blob:
+    if filename is None and blob is None:
         raise ValueError("no filename and no blob")
     if filename and blob:
         raise ValueError("specify either filename or blob")
-    if blob:
+    if blob is not None:
         return blob
     with open(filename, "rb") as f:
         return f.read()
@@ -1408,7 +1408,7 @@ def document_to_text(
         Raises an exception for malformed arguments, missing files, bad
         filetypes, etc.
     """
-    if not filename and not blob:
+    if filename is None and blob is None:
         raise ValueError("document_to_text: no filename and no blob")
     if filename and blob:
         raise ValueError("document_to_text: specify either filename or blob")
diff --git a/cardinal_pythonlib/tests/extract_text_tests.py b/cardinal_pythonlib/tests/extract_text_tests.py
index 752c41d..92c17c2 100644
--- a/cardinal_pythonlib/tests/extract_text_tests.py
+++ b/cardinal_pythonlib/tests/extract_text_tests.py
@@ -204,6 +204,12 @@ def test_htm_converted(self) -> None:
         )
         self.assertEqual(text.strip(), content)
 
+    def test_empty_htm_converted(self) -> None:
+        text = document_to_text(
+            blob="".encode("utf-8"), extension="htm", config=self.config
+        )
+        self.assertEqual(text, "")
+
     def test_log_converted(self) -> None:
         content = """
 2025-04-02 06:05:43,772 INFO Starting unattended upgrades script
@@ -477,6 +483,25 @@ def test_eml_with_no_content_type_converted(self) -> None:
 
         self.assertIn(text_content, text)
 
+    def test_eml_with_empty_body_converted(self) -> None:
+        content = """From: bar@example.org
+Subject: No body
+To: foo@example.org
+Mime-Version: 1.0
+Content-Type: multipart/mixed;boundary="==="
+
+--===
+--===--
+"""
+        message = message_from_string(content, policy=policy.default)
+        blob = message.as_bytes()
+
+        text = document_to_text(
+            blob=blob, extension=".eml", config=self.config
+        )
+
+        self.assertEqual("", text)
+
     def test_unsupported_converted(self) -> None:
         with mock.patch.multiple(
             "cardinal_pythonlib.extract_text.subprocess",

From 87f7754ce85d613d3d7b3ddc5d70f7d7e485da7f Mon Sep 17 00:00:00 2001
From: Martin Burchell <mb2353@cam.ac.uk>
Date: Mon, 12 May 2025 15:21:56 +0100
Subject: [PATCH 29/39] Fix docx XML generation to yield string, not bytes

---
 cardinal_pythonlib/extract_text.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cardinal_pythonlib/extract_text.py b/cardinal_pythonlib/extract_text.py
index f6c0250..7ab576e 100755
--- a/cardinal_pythonlib/extract_text.py
+++ b/cardinal_pythonlib/extract_text.py
@@ -603,7 +603,7 @@ def gen_xml_files_from_docx(fp: BinaryIO) -> Iterator[str]:
                 yield z.read(filename).decode("utf8")
         for filename in filelist:
             if DOCX_DOCUMENT_FILE_REGEX.match(filename):
-                yield z.read(filename)
+                yield z.read(filename).decode("utf8")
         for filename in filelist:
             if DOCX_FOOTER_FILE_REGEX.match(filename):
                 yield z.read(filename).decode("utf8")

From e17023e900585340962a73e9cf6960aa66b5ca3b Mon Sep 17 00:00:00 2001
From: Martin Burchell <mb2353@cam.ac.uk>
Date: Mon, 12 May 2025 15:22:27 +0100
Subject: [PATCH 30/39] Fix missing return value

---
 cardinal_pythonlib/extract_text.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/cardinal_pythonlib/extract_text.py b/cardinal_pythonlib/extract_text.py
index 7ab576e..f36cba6 100755
--- a/cardinal_pythonlib/extract_text.py
+++ b/cardinal_pythonlib/extract_text.py
@@ -1302,6 +1302,8 @@ def _get_email_content(
 
         return document_to_text(blob=blob, extension=ext, config=config)
 
+    return None
+
 
 # =============================================================================
 # Anything

From bdc9983e1cb1e2cf07783284903cab1a789a2b9e Mon Sep 17 00:00:00 2001
From: Martin Burchell <mb2353@cam.ac.uk>
Date: Mon, 12 May 2025 16:24:52 +0100
Subject: [PATCH 31/39] Work around BeautifulSoup not handling empty byte array
 correctly

---
 cardinal_pythonlib/extract_text.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/cardinal_pythonlib/extract_text.py b/cardinal_pythonlib/extract_text.py
index f36cba6..4172b99 100755
--- a/cardinal_pythonlib/extract_text.py
+++ b/cardinal_pythonlib/extract_text.py
@@ -1140,6 +1140,12 @@ def convert_html_to_text(
     """
     Converts HTML to text.
     """
+
+    # beautifulsoup4==4.13.4 returns "b''" for an empty bytes array
+    # So we just workaround this here:
+    if blob is not None and len(blob) == 0:
+        return ""
+
     with get_filelikeobject(filename, blob) as fp:
         soup = bs4.BeautifulSoup(fp, "html.parser")
         return soup.get_text()

From 499f994606c5fb27000db61a8c7aebbd21808700 Mon Sep 17 00:00:00 2001
From: Martin Burchell <mb2353@cam.ac.uk>
Date: Mon, 12 May 2025 17:18:27 +0100
Subject: [PATCH 32/39] Note BS4 bug report

---
 cardinal_pythonlib/extract_text.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cardinal_pythonlib/extract_text.py b/cardinal_pythonlib/extract_text.py
index 4172b99..cb6f108 100755
--- a/cardinal_pythonlib/extract_text.py
+++ b/cardinal_pythonlib/extract_text.py
@@ -1141,6 +1141,7 @@ def convert_html_to_text(
     Converts HTML to text.
     """
 
+    # https://bugs.launchpad.net/beautifulsoup/+bug/2110492
     # beautifulsoup4==4.13.4 returns "b''" for an empty bytes array
     # So we just workaround this here:
     if blob is not None and len(blob) == 0:

From dc92a17bdec317b1f9a8db2cb241ab0687efeaa1 Mon Sep 17 00:00:00 2001
From: Martin Burchell <mb2353@cam.ac.uk>
Date: Tue, 13 May 2025 10:04:11 +0100
Subject: [PATCH 33/39] Replace illegal multibyte sequences when encoding
 emails

---
 cardinal_pythonlib/extract_text.py            |  2 +-
 .../tests/extract_text_tests.py               | 29 +++++++++++++++++++
 2 files changed, 30 insertions(+), 1 deletion(-)

diff --git a/cardinal_pythonlib/extract_text.py b/cardinal_pythonlib/extract_text.py
index cb6f108..26f339a 100755
--- a/cardinal_pythonlib/extract_text.py
+++ b/cardinal_pythonlib/extract_text.py
@@ -1299,7 +1299,7 @@ def _get_email_content(
             content_type_header = message.get("Content-Type")
             if content_type_header:
                 charset = content_type_header.params.get("charset", "utf-8")
-            blob = content.encode(charset)
+            blob = content.encode(charset, "replace")
         elif isinstance(content, EmailMessage):
             blob = content.as_bytes()
             if message.get("Content-Transfer-Encoding") == "base64":
diff --git a/cardinal_pythonlib/tests/extract_text_tests.py b/cardinal_pythonlib/tests/extract_text_tests.py
index 92c17c2..d37546e 100644
--- a/cardinal_pythonlib/tests/extract_text_tests.py
+++ b/cardinal_pythonlib/tests/extract_text_tests.py
@@ -502,6 +502,35 @@ def test_eml_with_empty_body_converted(self) -> None:
 
         self.assertEqual("", text)
 
+    def test_eml_with_illegal_multibyte_sequence_replaced(self) -> None:
+        content = """From: bar@example.org
+Subject: Illegal multibyte sequence
+To: foo@example.org
+Mime-Version: 1.0
+Content-Type: multipart/mixed;boundary="==="
+
+--===
+Content-Type: text/html; charset="big5"
+Content-Transfer-Encoding: quoted-printable
+
+<html><head>
+<meta http-equiv=3D"Content-Type" content=3D"text/html; charset=3Dbig5">
+</head>
+<body>
+=F9=F9
+</body>
+</html>
+--===--
+"""
+        message = message_from_string(content, policy=policy.default)
+        blob = message.as_bytes()
+
+        text = document_to_text(
+            blob=blob, extension=".eml", config=self.config
+        )
+
+        self.assertEqual(text.strip(), "??")
+
     def test_unsupported_converted(self) -> None:
         with mock.patch.multiple(
             "cardinal_pythonlib.extract_text.subprocess",

From 51e9295292f5e92c0dd7bafac38f3ce33b4f661c Mon Sep 17 00:00:00 2001
From: Martin Burchell <mb2353@cam.ac.uk>
Date: Tue, 13 May 2025 16:51:18 +0100
Subject: [PATCH 34/39] Handle invalid surrogate characters in HTML conversion

---
 cardinal_pythonlib/extract_text.py            | 10 ++++++-
 .../tests/extract_text_tests.py               | 30 +++++++++++++++++++
 2 files changed, 39 insertions(+), 1 deletion(-)

diff --git a/cardinal_pythonlib/extract_text.py b/cardinal_pythonlib/extract_text.py
index 26f339a..eb57d60 100755
--- a/cardinal_pythonlib/extract_text.py
+++ b/cardinal_pythonlib/extract_text.py
@@ -1149,7 +1149,15 @@ def convert_html_to_text(
 
     with get_filelikeobject(filename, blob) as fp:
         soup = bs4.BeautifulSoup(fp, "html.parser")
-        return soup.get_text()
+
+        # In the real world we can end up with UTF-16 characters embedded as
+        # numbered entities in Windows-1252 encoded HTML such as
+        # &#55357;&#56898; "Slightly smiling face". Replacing these here
+        # avoids "UnicodeEncodeError: 'utf-8' codec can't encode characters in
+        # position ... surrogates not allowed".
+        text = soup.get_text().encode(errors="replace").decode()
+
+        return text
 
 
 # =============================================================================
diff --git a/cardinal_pythonlib/tests/extract_text_tests.py b/cardinal_pythonlib/tests/extract_text_tests.py
index d37546e..c0e8590 100644
--- a/cardinal_pythonlib/tests/extract_text_tests.py
+++ b/cardinal_pythonlib/tests/extract_text_tests.py
@@ -531,6 +531,36 @@ def test_eml_with_illegal_multibyte_sequence_replaced(self) -> None:
 
         self.assertEqual(text.strip(), "??")
 
+    def test_eml_invalid_surrogate_characters_replaced(self) -> None:
+        content = """From: bar@example.org
+Subject: Invalid surrogate characters
+To: foo@example.org
+Mime-Version: 1.0
+Content-Type: multipart/mixed;boundary="==="
+
+--===
+Content-Type: text/html; charset="windows-1252"
+Content-Transfer-Encoding: quoted-printable
+
+<html><head>
+<meta http-equiv=3D"Content-Type" content=3D"text/html; charset=3DWindows-1=
+252">
+</head>
+<body>
+&#55357;&#56898;
+</body>
+</html>
+--===--
+"""
+        message = message_from_string(content, policy=policy.default)
+        blob = message.as_bytes()
+
+        text = document_to_text(
+            blob=blob, extension=".eml", config=self.config
+        )
+
+        self.assertEqual(text.strip(), "??")
+
     def test_unsupported_converted(self) -> None:
         with mock.patch.multiple(
             "cardinal_pythonlib.extract_text.subprocess",

From fdccb7640b6e3651fcb772e11c359dbfd9182c3f Mon Sep 17 00:00:00 2001
From: Martin Burchell <mb2353@cam.ac.uk>
Date: Tue, 13 May 2025 20:56:14 +0100
Subject: [PATCH 35/39] Better names for test methods

---
 cardinal_pythonlib/tests/extract_text_tests.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/cardinal_pythonlib/tests/extract_text_tests.py b/cardinal_pythonlib/tests/extract_text_tests.py
index c0e8590..9485043 100644
--- a/cardinal_pythonlib/tests/extract_text_tests.py
+++ b/cardinal_pythonlib/tests/extract_text_tests.py
@@ -130,7 +130,7 @@ def test_csv_converted(self) -> None:
 
         self.assertEqual(text, content)
 
-    def test_doc_converted_with_antiword(self) -> None:
+    def test_doc_will_be_converted_with_antiword(self) -> None:
         with mock.patch.multiple(
             "cardinal_pythonlib.extract_text.subprocess",
             Popen=self.mock_popen,
@@ -152,7 +152,7 @@ def test_doc_converted_with_antiword(self) -> None:
         ]
         self.mock_popen.assert_has_calls(expected_calls)
 
-    def test_dot_converted_with_antiword(self) -> None:
+    def test_dot_will_be_converted_with_antiword(self) -> None:
         with mock.patch.multiple(
             "cardinal_pythonlib.extract_text.subprocess",
             Popen=self.mock_popen,
@@ -233,7 +233,7 @@ def test_odt_converted(self) -> None:
 
         self.assertEqual(text.strip(), content)
 
-    def test_pdf_converted(self) -> None:
+    def test_pdf_will_be_converted_with_pdftotext(self) -> None:
         with mock.patch.multiple(
             "cardinal_pythonlib.extract_text.subprocess",
             Popen=self.mock_popen,
@@ -254,7 +254,7 @@ def test_pdf_converted(self) -> None:
         ]
         self.mock_popen.assert_has_calls(expected_calls)
 
-    def test_rtf_converted(self) -> None:
+    def test_rtf_will_be_converted_with_unrtf(self) -> None:
         with mock.patch(
             "cardinal_pythonlib.extract_text.UNRTF_SUPPORTS_QUIET", True
         ):
@@ -561,7 +561,7 @@ def test_eml_invalid_surrogate_characters_replaced(self) -> None:
 
         self.assertEqual(text.strip(), "??")
 
-    def test_unsupported_converted(self) -> None:
+    def test_unsupported_will_be_converted_with_strings(self) -> None:
         with mock.patch.multiple(
             "cardinal_pythonlib.extract_text.subprocess",
             Popen=self.mock_popen,

From dba72a96ea27618fb511242ebd93a42b0f9d32c9 Mon Sep 17 00:00:00 2001
From: Martin Burchell <mb2353@cam.ac.uk>
Date: Tue, 13 May 2025 21:12:41 +0100
Subject: [PATCH 36/39] Fix test comment

---
 cardinal_pythonlib/tests/extract_text_tests.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cardinal_pythonlib/tests/extract_text_tests.py b/cardinal_pythonlib/tests/extract_text_tests.py
index 9485043..6e5c9ee 100644
--- a/cardinal_pythonlib/tests/extract_text_tests.py
+++ b/cardinal_pythonlib/tests/extract_text_tests.py
@@ -1,4 +1,4 @@
-# cardinal_pythonlib/tests/datetimefunc_tests.py
+# cardinal_pythonlib/tests/extract_text_tests.py
 
 """
 ===============================================================================

From 97b5a0a79e98c6525a412219c40d842c576092a2 Mon Sep 17 00:00:00 2001
From: Martin Burchell <mb2353@cam.ac.uk>
Date: Wed, 14 May 2025 06:35:04 +0100
Subject: [PATCH 37/39] Update changelog

---
 docs/source/changelog.rst | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst
index deb6c93..056ac0b 100644
--- a/docs/source/changelog.rst
+++ b/docs/source/changelog.rst
@@ -889,3 +889,16 @@ Quick links:
 **2.0.5 (2025-04-07)**
 
 - Add VARCHAR to valid Databricks types.
+
+**2.1.0 (2025-05-13)**
+
+- **BREAKING CHANGE**: Rename modules to avoid conflicts with the Python
+  standard library:
+
+  - :mod:`cardinal_pythonlib.email` is now :mod:`cardinal_pythonlib.email_utils`
+  - :mod:`cardinal_pythonlib.json` is now :mod:`cardinal_pythonlib.json_utils`
+  - :mod:`cardinal_pythonlib.profile` is now :mod:`cardinal_pythonlib.profiling`
+
+- Add support for ``.eml`` files with attachments processed by supported
+  document converters (``.docx``, ``.pdf``, ``.odt`` etc.) to
+  :func:`cardinal_pythonlib.extract_text.document_to_text`.

From 32cfc583b7b1d4dca45f7628be6e1ff3e8a46146 Mon Sep 17 00:00:00 2001
From: Martin Burchell <mb2353@cam.ac.uk>
Date: Wed, 14 May 2025 06:47:27 +0100
Subject: [PATCH 38/39] Align version of faker-file used in docs to that used
 in tests

---
 docs/docs_requirements.txt | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/docs/docs_requirements.txt b/docs/docs_requirements.txt
index f489b7f..c88fa73 100644
--- a/docs/docs_requirements.txt
+++ b/docs/docs_requirements.txt
@@ -7,18 +7,12 @@ dogpile.cache==0.9.2
 # CRATE is on 4.2
 Django>=4.2,<5.0
 faker==13.3.1
-faker-file[common]==0.17.13
+faker-file[common]==0.18.3
 libChEBIpy
 pdfkit
 pyramid==1.10.8
 pytest
-# sphinx==4.2.0
 sphinx==7.1.2
-# sphinxcontrib-applehelp==1.0.4
-# sphinxcontrib-devhelp==1.0.2
-# sphinxcontrib-htmlhelp==2.0.1
-# sphinxcontrib-serializinghtml==1.1.5
-# sphinxcontrib-qthelp==1.0.3
 sphinx-paramlinks==0.6.0
 sphinx_rtd_theme==2.0.0
 weasyprint

From b00e82e0e15f75afddd3a642b0ac3e3977a05742 Mon Sep 17 00:00:00 2001
From: Martin Burchell <mb2353@cam.ac.uk>
Date: Wed, 14 May 2025 09:38:16 +0100
Subject: [PATCH 39/39] Revert empty filename check when extracting text

---
 cardinal_pythonlib/extract_text.py             | 4 ++--
 cardinal_pythonlib/tests/extract_text_tests.py | 6 ++++++
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/cardinal_pythonlib/extract_text.py b/cardinal_pythonlib/extract_text.py
index eb57d60..0c24121 100755
--- a/cardinal_pythonlib/extract_text.py
+++ b/cardinal_pythonlib/extract_text.py
@@ -358,7 +358,7 @@ def get_filelikeobject(filename: str = None, blob: bytes = None) -> BinaryIO:
     Returns:
         a :class:`BinaryIO` object
     """
-    if filename is None and blob is None:
+    if not filename and blob is None:
         raise ValueError("no filename and no blob")
     if filename and blob:
         raise ValueError("specify either filename or blob")
@@ -1425,7 +1425,7 @@ def document_to_text(
         Raises an exception for malformed arguments, missing files, bad
         filetypes, etc.
     """
-    if filename is None and blob is None:
+    if not filename and blob is None:
         raise ValueError("document_to_text: no filename and no blob")
     if filename and blob:
         raise ValueError("document_to_text: specify either filename or blob")
diff --git a/cardinal_pythonlib/tests/extract_text_tests.py b/cardinal_pythonlib/tests/extract_text_tests.py
index 6e5c9ee..3a64b7b 100644
--- a/cardinal_pythonlib/tests/extract_text_tests.py
+++ b/cardinal_pythonlib/tests/extract_text_tests.py
@@ -101,6 +101,12 @@ def test_raises_when_no_filename_or_blob(self) -> None:
 
         self.assertIn("no filename and no blob", str(cm.exception))
 
+    def test_raises_when_filename_empty(self) -> None:
+        with self.assertRaises(ValueError) as cm:
+            document_to_text(filename="")
+
+        self.assertIn("no filename and no blob", str(cm.exception))
+
     def test_raises_when_filename_and_blob(self) -> None:
         with self.assertRaises(ValueError) as cm:
             document_to_text(filename="foo", blob="bar")