diff --git a/.github/scripts/install_test_python_packages.sh b/.github/scripts/install_test_python_packages.sh
index 12eb3720..129d97ef 100755
--- a/.github/scripts/install_test_python_packages.sh
+++ b/.github/scripts/install_test_python_packages.sh
@@ -10,3 +10,4 @@ ${PYTHON} -m pip install xlrd
 ${PYTHON} -m pip install dogpile.cache==0.9.2  # Later versions incompatible
 ${PYTHON} -m pip install pytest
 ${PYTHON} -m pip install xhtml2pdf weasyprint pdfkit  # For PDF tests
+${PYTHON} -m pip install faker==13.3.1 faker-file'[common]'==0.18.3
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 2c39bb5d..1af96a37 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -18,10 +18,14 @@ repos:
     rev: 5.0.4
     hooks:
     -   id: flake8
+        additional_dependencies:
+        - flake8-builtins==2.5.0
 -   repo: https://github.com/asottile/yesqa
     rev: v1.5.0
     hooks:
     -   id: yesqa
+        additional_dependencies:
+        - flake8-builtins==2.5.0
 -   repo: https://github.com/pre-commit/pygrep-hooks
     rev: v1.9.0
     hooks:
diff --git a/cardinal_pythonlib/bulk_email/main.py b/cardinal_pythonlib/bulk_email/main.py
index 7757574f..a70f19e8 100644
--- a/cardinal_pythonlib/bulk_email/main.py
+++ b/cardinal_pythonlib/bulk_email/main.py
@@ -57,7 +57,7 @@
     Recipient,
     SendAttempt,
 )
-from cardinal_pythonlib.email.sendmail import (
+from cardinal_pythonlib.email_utils.sendmail import (
     CONTENT_TYPE_HTML,
     CONTENT_TYPE_TEXT,
     is_email_valid,
diff --git a/cardinal_pythonlib/bulk_email/models.py b/cardinal_pythonlib/bulk_email/models.py
index c6d210f5..a01b76c4 100644
--- a/cardinal_pythonlib/bulk_email/models.py
+++ b/cardinal_pythonlib/bulk_email/models.py
@@ -63,7 +63,7 @@
     USERNAME_MAX_LENGTH,
 )
 from cardinal_pythonlib.colander_utils import EMAIL_ADDRESS_MAX_LEN
-from cardinal_pythonlib.email.sendmail import (
+from cardinal_pythonlib.email_utils.sendmail import (
     ASCII,
     CONTENT_TYPE_TEXT,
     is_email_valid,
diff --git a/cardinal_pythonlib/django/fields/jsonclassfield.py b/cardinal_pythonlib/django/fields/jsonclassfield.py
index 43fbaf50..2240707b 100644
--- a/cardinal_pythonlib/django/fields/jsonclassfield.py
+++ b/cardinal_pythonlib/django/fields/jsonclassfield.py
@@ -130,7 +130,7 @@ def my_decoder_hook(d: Dict) -> Any:
 # noinspection PyUnresolvedReferences
 from django.db.models import TextField
 
-from cardinal_pythonlib.json.serialize import json_decode, json_encode
+from cardinal_pythonlib.json_utils.serialize import json_decode, json_encode
 
 
 # =============================================================================
diff --git a/cardinal_pythonlib/django/function_cache.py b/cardinal_pythonlib/django/function_cache.py
index 11127b11..a2416ab4 100644
--- a/cardinal_pythonlib/django/function_cache.py
+++ b/cardinal_pythonlib/django/function_cache.py
@@ -36,7 +36,7 @@
 from django.core.cache import cache  # default cache
 
 from cardinal_pythonlib.logs import get_brace_style_log_with_null_handler
-from cardinal_pythonlib.json.serialize import json_encode
+from cardinal_pythonlib.json_utils.serialize import json_encode
 
 log = get_brace_style_log_with_null_handler(__name__)
 
diff --git a/cardinal_pythonlib/django/middleware.py b/cardinal_pythonlib/django/middleware.py
index b201a089..3d614a79 100644
--- a/cardinal_pythonlib/django/middleware.py
+++ b/cardinal_pythonlib/django/middleware.py
@@ -28,7 +28,7 @@
 
 import logging
 import os
-from re import compile
+import re
 import sys
 from typing import Optional
 
@@ -107,9 +107,9 @@ def process_exception(
 Modified according to: https://djangosnippets.org/snippets/2845/
 """
 
-# EXEMPT_URLS = [compile(settings.LOGIN_URL.lstrip('/'))]
+# EXEMPT_URLS = [re.compile(settings.LOGIN_URL.lstrip('/'))]
 # if hasattr(settings, 'LOGIN_EXEMPT_URLS'):
-#     EXEMPT_URLS += [compile(expr) for expr in settings.LOGIN_EXEMPT_URLS]
+#     EXEMPT_URLS += [re.compile(expr) for expr in settings.LOGIN_EXEMPT_URLS]
 #
 #
 # class LoginRequiredMiddleware:
@@ -166,10 +166,10 @@ def process_exception(
 # 3. RNC; composite of those patterns.
 # -----------------------------------------------------------------------------
 
-EXEMPT_URLS = [compile(settings.LOGIN_URL.lstrip("/"))]
+EXEMPT_URLS = [re.compile(settings.LOGIN_URL.lstrip("/"))]
 if hasattr(settings, "LOGIN_EXEMPT_URLS"):
     EXEMPT_URLS += [
-        compile(expr.lstrip("/")) for expr in settings.LOGIN_EXEMPT_URLS
+        re.compile(expr.lstrip("/")) for expr in settings.LOGIN_EXEMPT_URLS
     ]
 
 
diff --git a/cardinal_pythonlib/email/__init__.py b/cardinal_pythonlib/email_utils/__init__.py
similarity index 95%
rename from cardinal_pythonlib/email/__init__.py
rename to cardinal_pythonlib/email_utils/__init__.py
index 61be9840..c94078aa 100644
--- a/cardinal_pythonlib/email/__init__.py
+++ b/cardinal_pythonlib/email_utils/__init__.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-# cardinal_pythonlib/email/__init__.py
+# cardinal_pythonlib/email_utils/__init__.py
 
 """
 ===============================================================================
diff --git a/cardinal_pythonlib/email/mailboxpurge.py b/cardinal_pythonlib/email_utils/mailboxpurge.py
similarity index 98%
rename from cardinal_pythonlib/email/mailboxpurge.py
rename to cardinal_pythonlib/email_utils/mailboxpurge.py
index cae5c07d..1f52b518 100755
--- a/cardinal_pythonlib/email/mailboxpurge.py
+++ b/cardinal_pythonlib/email_utils/mailboxpurge.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-# cardinal_pythonlib/email/mailboxpurge.py
+# cardinal_pythonlib/email_utils/mailboxpurge.py
 
 """
 Remove all binary attachments from email messages
diff --git a/cardinal_pythonlib/email/sendmail.py b/cardinal_pythonlib/email_utils/sendmail.py
similarity index 99%
rename from cardinal_pythonlib/email/sendmail.py
rename to cardinal_pythonlib/email_utils/sendmail.py
index a286fb8b..edebe34c 100755
--- a/cardinal_pythonlib/email/sendmail.py
+++ b/cardinal_pythonlib/email_utils/sendmail.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-# cardinal_pythonlib/email/sendmail.py
+# cardinal_pythonlib/email_utils/sendmail.py
 
 """
 ===============================================================================
diff --git a/cardinal_pythonlib/email/tests/sendmail_tests.py b/cardinal_pythonlib/email_utils/tests/sendmail_tests.py
similarity index 93%
rename from cardinal_pythonlib/email/tests/sendmail_tests.py
rename to cardinal_pythonlib/email_utils/tests/sendmail_tests.py
index 7e3107df..3c4eb374 100644
--- a/cardinal_pythonlib/email/tests/sendmail_tests.py
+++ b/cardinal_pythonlib/email_utils/tests/sendmail_tests.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-# cardinal_pythonlib/email/tests/sendmail_tests.py
+# cardinal_pythonlib/email_utils/tests/sendmail_tests.py
 
 """
 ===============================================================================
@@ -28,7 +28,7 @@
 
 import unittest
 
-from cardinal_pythonlib.email.sendmail import is_email_valid
+from cardinal_pythonlib.email_utils.sendmail import is_email_valid
 
 
 class TestIsEmailValid(unittest.TestCase):
diff --git a/cardinal_pythonlib/ensure_test_executed_correctly.py b/cardinal_pythonlib/ensure_test_executed_correctly.py
deleted file mode 100644
index 6ae15f40..00000000
--- a/cardinal_pythonlib/ensure_test_executed_correctly.py
+++ /dev/null
@@ -1,40 +0,0 @@
-#!/usr/bin/env python
-# cardinal_pythonlib/module_version.py
-
-"""
-===============================================================================
-
-    Original code copyright (C) 2009-2022 Rudolf Cardinal (rudolf@pobox.com).
-
-    This file is part of cardinal_pythonlib.
-
-    Licensed under the Apache License, Version 2.0 (the "License");
-    you may not use this file except in compliance with the License.
-    You may obtain a copy of the License at
-
-        https://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software
-    distributed under the License is distributed on an "AS IS" BASIS,
-    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-    See the License for the specific language governing permissions and
-    limitations under the License.
-
-===============================================================================
-
-**Ensure that a library module is executed properly, and not via a way that
-breaks imports.**
-
-"""
-
-try:
-    # we want the stdlib email package!
-    from email import message_from_string  # noqa: F401
-except ImportError:
-    raise ImportError(
-        "A test of importing 'email' has found "
-        "cardinal_pythonlib/email/__init__.py, not the email package from "
-        "stdlib. You are probably running a cardinal_pythonlib file directly, "
-        "e.g. with 'python somefile.py' or '/path/somefile.py'. Instead, use "
-        "'python -m cardinal_pythonlib.somefile'."
-    )
diff --git a/cardinal_pythonlib/extract_text.py b/cardinal_pythonlib/extract_text.py
index 41dbd52e..0c24121e 100755
--- a/cardinal_pythonlib/extract_text.py
+++ b/cardinal_pythonlib/extract_text.py
@@ -77,9 +77,14 @@
 # =============================================================================
 
 import argparse
+import base64
+from email import policy
+from email.message import EmailMessage
+from email.parser import BytesParser
 from io import StringIO
 import io
 import logging
+from mimetypes import guess_extension
 import os
 import re
 import shutil
@@ -87,6 +92,7 @@
 import sys
 import textwrap
 from typing import (
+    Any,
     BinaryIO,
     Dict,
     Generator,
@@ -205,9 +211,9 @@ def __init__(
         plain: bool = False,
         semiplain: bool = False,
         docx_in_order: bool = True,
-        horizontal_char="─",
-        vertical_char="│",
-        junction_char="┼",
+        horizontal_char: str = "─",
+        vertical_char: str = "│",
+        junction_char: str = "┼",
         plain_table_start: str = None,
         plain_table_end: str = None,
         plain_table_col_boundary: str = None,
@@ -352,7 +358,7 @@ def get_filelikeobject(filename: str = None, blob: bytes = None) -> BinaryIO:
     Returns:
         a :class:`BinaryIO` object
     """
-    if not filename and not blob:
+    if not filename and blob is None:
         raise ValueError("no filename and no blob")
     if filename and blob:
         raise ValueError("specify either filename or blob")
@@ -367,11 +373,11 @@ def get_file_contents(filename: str = None, blob: bytes = None) -> bytes:
     """
     Returns the binary contents of a file, or of a BLOB.
     """
-    if not filename and not blob:
+    if not filename and blob is None:
         raise ValueError("no filename and no blob")
     if filename and blob:
         raise ValueError("specify either filename or blob")
-    if blob:
+    if blob is not None:
         return blob
     with open(filename, "rb") as f:
         return f.read()
@@ -445,7 +451,7 @@ def get_file_contents_text(
     )
 
 
-def get_cmd_output(*args, encoding: str = SYS_ENCODING) -> str:
+def get_cmd_output(*args: Any, encoding: str = SYS_ENCODING) -> str:
     """
     Returns text output of a command.
     """
@@ -456,7 +462,7 @@ def get_cmd_output(*args, encoding: str = SYS_ENCODING) -> str:
 
 
 def get_cmd_output_from_stdin(
-    stdint_content_binary: bytes, *args, encoding: str = SYS_ENCODING
+    stdint_content_binary: bytes, *args: Any, encoding: str = SYS_ENCODING
 ) -> str:
     """
     Returns text output of a command, passing binary data in via stdin.
@@ -549,17 +555,17 @@ def availability_pdf() -> bool:
 # -----------------------------------------------------------------------------
 # In a D.I.Y. fashion
 # -----------------------------------------------------------------------------
-# DOCX specification: http://www.ecma-international.org/news/TC45_current_work/TC45_available_docs.htm  # noqa: E501
+# DOCX specification: https://ecma-international.org/publications-and-standards/standards/ecma-376/  # noqa: E501
 
 DOCX_HEADER_FILE_REGEX = re.compile("word/header[0-9]*.xml")
-DOCX_DOC_FILE = "word/document.xml"
+DOCX_DOCUMENT_FILE_REGEX = re.compile("word/document[0-9]*.xml")
 DOCX_FOOTER_FILE_REGEX = re.compile("word/footer[0-9]*.xml")
 DOCX_SCHEMA_URL = (
     "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
 )
 
 
-def docx_qn(tagroot):
+def docx_qn(tagroot: str) -> str:
     return f"{{{DOCX_SCHEMA_URL}}}{tagroot}"
 
 
@@ -595,7 +601,9 @@ def gen_xml_files_from_docx(fp: BinaryIO) -> Iterator[str]:
         for filename in filelist:
             if DOCX_HEADER_FILE_REGEX.match(filename):
                 yield z.read(filename).decode("utf8")
-        yield z.read(DOCX_DOC_FILE)
+        for filename in filelist:
+            if DOCX_DOCUMENT_FILE_REGEX.match(filename):
+                yield z.read(filename).decode("utf8")
         for filename in filelist:
             if DOCX_FOOTER_FILE_REGEX.match(filename):
                 yield z.read(filename).decode("utf8")
@@ -624,7 +632,7 @@ def docx_gen_wordwrapped_fragments(
     """
     to_wrap = []  # type: List[DocxFragment]
 
-    def yield_wrapped():
+    def yield_wrapped() -> Generator[str, None, None]:
         """
         Yield the word-wrapped stuff to date.
         """
@@ -1132,9 +1140,24 @@ def convert_html_to_text(
     """
     Converts HTML to text.
     """
+
+    # https://bugs.launchpad.net/beautifulsoup/+bug/2110492
+    # beautifulsoup4==4.13.4 returns "b''" for an empty bytes array,
+    # so work around it here (blob is None when a filename is given):
+    if blob is not None and len(blob) == 0:
+        return ""
+
     with get_filelikeobject(filename, blob) as fp:
-        soup = bs4.BeautifulSoup(fp)
-        return soup.get_text()
+        soup = bs4.BeautifulSoup(fp, "html.parser")
+
+        # In the real world we can end up with UTF-16 characters embedded as
+        # numbered entities in Windows-1252 encoded HTML such as
+        # &#55357;&#56898; "Slightly smiling face". Replacing these here
+        # avoids "UnicodeEncodeError: 'utf-8' codec can't encode characters in
+        # position ... surrogates not allowed".
+        text = soup.get_text().encode(errors="replace").decode()
+
+        return text
 
 
 # =============================================================================
@@ -1152,7 +1175,7 @@ def convert_xml_to_text(
     Converts XML to text.
     """
     with get_filelikeobject(filename, blob) as fp:
-        soup = bs4.BeautifulStoneSoup(fp)
+        soup = bs4.BeautifulSoup(fp, features="xml")
         return soup.get_text()
 
 
@@ -1229,6 +1252,74 @@ def availability_doc() -> bool:
     return bool(antiword)
 
 
+# =============================================================================
+# EML
+# =============================================================================
+
+
+def convert_eml_to_text(
+    filename: str = None,
+    blob: bytes = None,
+    config: TextProcessingConfig = _DEFAULT_CONFIG,
+) -> str:
+    email_content_list: list[str] = []
+
+    with get_filelikeobject(filename, blob) as fp:
+        parser = BytesParser(policy=policy.default)  # type: ignore[arg-type]
+        message = parser.parse(fp)
+
+        for email_content in _gen_email_content(message, config):
+            if email_content is not None:
+                email_content_list.append(email_content)
+
+    text = "\n".join(email_content_list)
+
+    return text
+
+
+def _gen_email_content(
+    message: EmailMessage, config: TextProcessingConfig
+) -> Generator[Optional[str], None, None]:
+    body = message.get_body(
+        preferencelist=(
+            "html",
+            "plain",
+        )
+    )  # type: ignore[attr-defined]
+    if body is not None:
+        yield _get_email_content(body, config)
+
+    for part in message.iter_attachments():  # type: ignore[attr-defined]
+        yield _get_email_content(part, config)
+
+
+def _get_email_content(
+    message: EmailMessage,
+    config: TextProcessingConfig,
+) -> Optional[str]:
+    content_type = message.get_content_type()
+    ext = guess_extension(content_type)
+
+    if ext is not None and ext in ext_map:
+        content = message.get_content()
+        if isinstance(content, str):
+            charset = "utf-8"
+            content_type_header = message.get("Content-Type")
+            if content_type_header:
+                charset = content_type_header.params.get("charset", "utf-8")
+            blob = content.encode(charset, "replace")
+        elif isinstance(content, EmailMessage):
+            blob = content.as_bytes()
+            if message.get("Content-Transfer-Encoding") == "base64":
+                blob = base64.b64decode(blob)
+        else:
+            blob = content
+
+        return document_to_text(blob=blob, extension=ext, config=config)
+
+    return None
+
+
 # =============================================================================
 # Anything
 # =============================================================================
@@ -1267,7 +1358,7 @@ def availability_anything() -> bool:
 # Decider
 # =============================================================================
 
-ext_map = {
+ext_map: dict[str, dict[str, Any]] = {
     # Converter functions must be of the form: func(filename, blob, config).
     # Availability must be either a boolean literal or a function that takes no
     # params.
@@ -1276,6 +1367,7 @@ def availability_anything() -> bool:
     ".docm": {CONVERTER: convert_docx_to_text, AVAILABILITY: True},
     ".docx": {CONVERTER: convert_docx_to_text, AVAILABILITY: True},
     ".dot": {CONVERTER: convert_doc_to_text, AVAILABILITY: availability_doc},
+    ".eml": {CONVERTER: convert_eml_to_text, AVAILABILITY: True},
     ".htm": {CONVERTER: convert_html_to_text, AVAILABILITY: True},
     ".html": {CONVERTER: convert_html_to_text, AVAILABILITY: True},
     ".log": {CONVERTER: get_file_contents_text, AVAILABILITY: True},
@@ -1333,7 +1425,7 @@ def document_to_text(
         Raises an exception for malformed arguments, missing files, bad
         filetypes, etc.
     """
-    if not filename and not blob:
+    if not filename and blob is None:
         raise ValueError("document_to_text: no filename and no blob")
     if filename and blob:
         raise ValueError("document_to_text: specify either filename or blob")
diff --git a/cardinal_pythonlib/json/__init__.py b/cardinal_pythonlib/json_utils/__init__.py
similarity index 95%
rename from cardinal_pythonlib/json/__init__.py
rename to cardinal_pythonlib/json_utils/__init__.py
index 3256199d..51cc3bd1 100644
--- a/cardinal_pythonlib/json/__init__.py
+++ b/cardinal_pythonlib/json_utils/__init__.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-# cardinal_pythonlib/json/__init__.py
+# cardinal_pythonlib/json_utils/__init__.py
 
 """
 ===============================================================================
diff --git a/cardinal_pythonlib/json/serialize.py b/cardinal_pythonlib/json_utils/serialize.py
similarity index 99%
rename from cardinal_pythonlib/json/serialize.py
rename to cardinal_pythonlib/json_utils/serialize.py
index 3103a6ee..eb3a434f 100644
--- a/cardinal_pythonlib/json/serialize.py
+++ b/cardinal_pythonlib/json_utils/serialize.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-# cardinal_pythonlib/json/serialize.py
+# cardinal_pythonlib/json_utils/serialize.py
 
 """
 ===============================================================================
diff --git a/cardinal_pythonlib/json/typing_helpers.py b/cardinal_pythonlib/json_utils/typing_helpers.py
similarity index 96%
rename from cardinal_pythonlib/json/typing_helpers.py
rename to cardinal_pythonlib/json_utils/typing_helpers.py
index d5c6c18a..47c7161f 100644
--- a/cardinal_pythonlib/json/typing_helpers.py
+++ b/cardinal_pythonlib/json_utils/typing_helpers.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-# cardinal_pythonlib/json/typing_helpers.py
+# cardinal_pythonlib/json_utils/typing_helpers.py
 
 """
 ===============================================================================
diff --git a/cardinal_pythonlib/module_version.py b/cardinal_pythonlib/module_version.py
index f18f9a0a..cb251fe6 100644
--- a/cardinal_pythonlib/module_version.py
+++ b/cardinal_pythonlib/module_version.py
@@ -39,9 +39,6 @@
 
 from semantic_version import Version
 
-# noinspection PyUnresolvedReferences
-import cardinal_pythonlib.ensure_test_executed_correctly  # noqa: F401
-
 
 # =============================================================================
 # Report Python module versions
diff --git a/cardinal_pythonlib/profile.py b/cardinal_pythonlib/profiling.py
similarity index 97%
rename from cardinal_pythonlib/profile.py
rename to cardinal_pythonlib/profiling.py
index 558e13af..a06074e8 100644
--- a/cardinal_pythonlib/profile.py
+++ b/cardinal_pythonlib/profiling.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-# cardinal_pythonlib/profile.py
+# cardinal_pythonlib/profiling.py
 
 """
 ===============================================================================
diff --git a/cardinal_pythonlib/sizeformatter.py b/cardinal_pythonlib/sizeformatter.py
index 10169fea..69345e2a 100644
--- a/cardinal_pythonlib/sizeformatter.py
+++ b/cardinal_pythonlib/sizeformatter.py
@@ -69,7 +69,7 @@ def sizeof_fmt(num: float, suffix: str = "B") -> str:
 
 def bytes2human(
     n: Union[int, float],
-    format: str = "%(value).1f %(symbol)s",
+    format: str = "%(value).1f %(symbol)s",  # noqa: A002
     symbols: str = "customary",
 ) -> str:
     """
diff --git a/cardinal_pythonlib/tests/__init__.py b/cardinal_pythonlib/tests/__init__.py
new file mode 100644
index 00000000..a5311f01
--- /dev/null
+++ b/cardinal_pythonlib/tests/__init__.py
@@ -0,0 +1,27 @@
+# cardinal_pythonlib/tests/__init__.py
+
+"""
+===============================================================================
+
+    Original code copyright (C) 2009-2022 Rudolf Cardinal (rudolf@pobox.com).
+
+    This file is part of cardinal_pythonlib.
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        https://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+
+===============================================================================
+
+The mere existence of this file makes Python treat the directory as a
+package.
+
+"""
diff --git a/cardinal_pythonlib/tests/extract_text_tests.py b/cardinal_pythonlib/tests/extract_text_tests.py
new file mode 100644
index 00000000..3a64b7b8
--- /dev/null
+++ b/cardinal_pythonlib/tests/extract_text_tests.py
@@ -0,0 +1,588 @@
+# cardinal_pythonlib/tests/extract_text_tests.py
+
+"""
+===============================================================================
+
+    Original code copyright (C) 2009-2022 Rudolf Cardinal (rudolf@pobox.com).
+
+    This file is part of cardinal_pythonlib.
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        https://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+
+===============================================================================
+
+**Text extraction tests.**
+
+"""
+
+from email import message_from_string, policy
+from email.message import EmailMessage
+import os
+import subprocess
+from tempfile import mkdtemp, NamedTemporaryFile
+from unittest import mock, TestCase
+
+from faker import Faker
+from faker_file.providers.docx_file import DocxFileProvider
+from faker_file.providers.eml_file import EmlFileProvider
+from faker_file.providers.helpers.inner import (
+    create_inner_docx_file,
+    create_inner_eml_file,
+)
+from faker_file.providers.odt_file import OdtFileProvider
+from faker_file.providers.txt_file import TxtFileProvider
+from faker_file.providers.xml_file import XmlFileProvider
+
+from cardinal_pythonlib.extract_text import (
+    document_to_text,
+    TextProcessingConfig,
+    update_external_tools,
+)
+
+
+class DocumentToTextTests(TestCase):
+    def setUp(self) -> None:
+        self.empty_dir = mkdtemp()
+
+        self._replace_external_tools_with_fakes()
+        self.config = TextProcessingConfig()
+        self._create_mock_objects()
+        self._register_faker_providers()
+
+    def _create_mock_objects(self) -> None:
+        # Some mock empty output that we don't check
+        mock_decode = mock.Mock(return_value="")
+        mock_stdout = mock.Mock(decode=mock_decode)
+        mock_communicate = mock.Mock(return_value=(mock_stdout, None))
+        self.mock_popen = mock.Mock(
+            return_value=mock.Mock(communicate=mock_communicate)
+        )
+
+    def _register_faker_providers(self) -> None:
+        self.fake = Faker("en-US")  # To avoid Lorem Ipsum
+        self.fake.seed_instance(12345)
+        self.fake.add_provider(DocxFileProvider)
+        self.fake.add_provider(EmlFileProvider)
+        self.fake.add_provider(OdtFileProvider)
+        self.fake.add_provider(TxtFileProvider)
+        self.fake.add_provider(XmlFileProvider)
+
+    def _replace_external_tools_with_fakes(self) -> None:
+        # For external tools we assume the tools are running correctly
+        # and we just check that they are invoked with the correct arguments.
+
+        tool_names = [
+            "antiword",
+            "pdftotext",
+            "strings",
+            "strings2",
+            "unrtf",
+        ]
+
+        tools_dir = {t: os.path.join(self.empty_dir, t) for t in tool_names}
+        update_external_tools(tools_dir)
+
+    def tearDown(self) -> None:
+        os.rmdir(self.empty_dir)
+
+    def test_raises_when_no_filename_or_blob(self) -> None:
+        with self.assertRaises(ValueError) as cm:
+            document_to_text()
+
+        self.assertIn("no filename and no blob", str(cm.exception))
+
+    def test_raises_when_filename_empty(self) -> None:
+        with self.assertRaises(ValueError) as cm:
+            document_to_text(filename="")
+
+        self.assertIn("no filename and no blob", str(cm.exception))
+
+    def test_raises_when_filename_and_blob(self) -> None:
+        with self.assertRaises(ValueError) as cm:
+            document_to_text(filename="foo", blob="bar")
+
+        self.assertIn("specify either filename or blob", str(cm.exception))
+
+    def test_raises_when_blob_but_no_extension(self) -> None:
+        with self.assertRaises(ValueError) as cm:
+            document_to_text(blob="bar")
+
+        self.assertIn("need extension hint for blob", str(cm.exception))
+
+    def test_raises_when_not_a_file(self) -> None:
+        with self.assertRaises(ValueError) as cm:
+            filename = os.path.join(self.empty_dir, "foo")
+            document_to_text(filename=filename)
+
+        self.assertIn("no such file", str(cm.exception))
+
+    def test_csv_converted(self) -> None:
+        content = "one,two,three"
+
+        with NamedTemporaryFile(suffix=".csv", delete=False) as temp_file:
+            temp_file.write(content.encode("utf-8"))
+            temp_file.close()
+            text = document_to_text(filename=temp_file.name)
+
+        self.assertEqual(text, content)
+
+    def test_doc_will_be_converted_with_antiword(self) -> None:
+        with mock.patch.multiple(
+            "cardinal_pythonlib.extract_text.subprocess",
+            Popen=self.mock_popen,
+        ):
+            with NamedTemporaryFile(suffix=".doc", delete=False) as temp_file:
+                temp_file.close()
+                document_to_text(filename=temp_file.name, config=self.config)
+
+        expected_calls = [
+            mock.call(
+                (
+                    os.path.join(self.empty_dir, "antiword"),
+                    "-w",
+                    str(self.config.width),
+                    temp_file.name,
+                ),
+                stdout=subprocess.PIPE,
+            ),
+        ]
+        self.mock_popen.assert_has_calls(expected_calls)
+
+    def test_dot_will_be_converted_with_antiword(self) -> None:
+        with mock.patch.multiple(
+            "cardinal_pythonlib.extract_text.subprocess",
+            Popen=self.mock_popen,
+        ):
+            with NamedTemporaryFile(suffix=".dot", delete=False) as temp_file:
+                temp_file.close()
+                document_to_text(filename=temp_file.name, config=self.config)
+
+        expected_calls = [
+            mock.call(
+                (
+                    os.path.join(self.empty_dir, "antiword"),
+                    "-w",
+                    str(self.config.width),
+                    temp_file.name,
+                ),
+                stdout=subprocess.PIPE,
+            ),
+        ]
+        self.mock_popen.assert_has_calls(expected_calls)
+
+    def test_docx_converted(self) -> None:
+        content = self.fake.paragraph(nb_sentences=10)
+
+        docx = self.fake.docx_file(content=content)
+        self.config.width = 0
+        text = document_to_text(
+            filename=docx.data["filename"], config=self.config
+        )
+
+        self.assertEqual(text.strip(), content)
+
+    def test_htm_converted(self) -> None:
+        content = self.fake.paragraph(nb_sentences=10)
+
+        html = f"""
+<!DOCTYPE html>
+<html>
+<head>
+</head>
+<body>
+{content}
+</body>
+</html>
+"""
+
+        text = document_to_text(
+            blob=html.encode("utf-8"), extension="htm", config=self.config
+        )
+        self.assertEqual(text.strip(), content)
+
+    def test_empty_htm_converted(self) -> None:
+        text = document_to_text(
+            blob="".encode("utf-8"), extension="htm", config=self.config
+        )
+        self.assertEqual(text, "")
+
+    def test_log_converted(self) -> None:
+        content = """
+2025-04-02 06:05:43,772 INFO Starting unattended upgrades script
+2025-04-02 06:05:43,772 INFO Allowed origins are: o=Ubuntu,a=focal, o=Ubuntu,a=focal-security, o=UbuntuESMApps,a=focal-apps-security, o=UbuntuESM,a=focal-infra-security
+"""  # noqa: E501
+
+        text = document_to_text(
+            blob=content.encode("utf-8"), extension="log", config=self.config
+        )
+
+        self.assertEqual(text.strip(), content.strip())
+
+    def test_odt_converted(self) -> None:
+        content = self.fake.paragraph(nb_sentences=10)
+
+        odt = self.fake.odt_file(content=content)
+        self.config.width = 0
+        text = document_to_text(
+            filename=odt.data["filename"], config=self.config
+        )
+
+        self.assertEqual(text.strip(), content)
+
+    def test_pdf_will_be_converted_with_pdftotext(self) -> None:
+        with mock.patch.multiple(
+            "cardinal_pythonlib.extract_text.subprocess",
+            Popen=self.mock_popen,
+        ):
+            with NamedTemporaryFile(suffix=".pdf", delete=False) as temp_file:
+                temp_file.close()
+                document_to_text(filename=temp_file.name, config=self.config)
+
+        expected_calls = [
+            mock.call(
+                (
+                    os.path.join(self.empty_dir, "pdftotext"),
+                    temp_file.name,
+                    "-",
+                ),
+                stdout=subprocess.PIPE,
+            ),
+        ]
+        self.mock_popen.assert_has_calls(expected_calls)
+
+    def test_rtf_will_be_converted_with_unrtf(self) -> None:
+        with mock.patch(
+            "cardinal_pythonlib.extract_text.UNRTF_SUPPORTS_QUIET", True
+        ):
+            with mock.patch.multiple(
+                "cardinal_pythonlib.extract_text.subprocess",
+                Popen=self.mock_popen,
+            ):
+                with NamedTemporaryFile(
+                    suffix=".rtf", delete=False
+                ) as temp_file:
+                    temp_file.close()
+                    document_to_text(
+                        filename=temp_file.name, config=self.config
+                    )
+
+        expected_calls = [
+            mock.call(
+                (
+                    os.path.join(self.empty_dir, "unrtf"),
+                    "--text",
+                    "--nopict",
+                    "--quiet",
+                    temp_file.name,
+                ),
+                stdout=subprocess.PIPE,
+            ),
+        ]
+        self.mock_popen.assert_has_calls(expected_calls)
+
+    def test_txt_converted(self) -> None:
+        paragraph = self.fake.paragraph(nb_sentences=10)
+        txt_filename = self.fake.txt_file(content=paragraph).data["filename"]
+        extracted = document_to_text(filename=txt_filename)
+
+        self.assertEqual(extracted.strip(), paragraph)
+
+    def test_xml_converted(self) -> None:
+        person = self.fake.name()
+        street = self.fake.address()
+        columns = {
+            "name": person,
+            "address": street,
+        }
+
+        xml_file = self.fake.xml_file(num_rows=1, data_columns=columns)
+        xml_filename = xml_file.data["filename"]
+        extracted = document_to_text(filename=xml_filename)
+
+        # A single row: the expected text is the two values concatenated.
+        self.assertEqual(extracted.strip(), f"{person}{street}")
+
+    def test_eml_converted(self) -> None:
+        body = self.fake.paragraph(nb_sentences=10)
+        eml_filename = self.fake.eml_file(content=body).data["filename"]
+        extracted = document_to_text(filename=eml_filename)
+
+        self.assertEqual(extracted.strip(), body)
+
+    def test_eml_with_docx_attachment_converted(self) -> None:
+        email_body = self.fake.paragraph(nb_sentences=10)
+        attachment_text = self.fake.paragraph(nb_sentences=10)
+
+        # Attachment options: count=1, built by create_inner_docx_file.
+        attachment_options = {
+            "count": 1,
+            "create_inner_file_func": create_inner_docx_file,
+            "create_inner_file_args": {"content": attachment_text},
+        }
+
+        eml_file = self.fake.eml_file(
+            content=email_body,
+            options=attachment_options,
+        )
+
+        self.config.width = 0
+        eml_filename = eml_file.data["filename"]
+        text = document_to_text(filename=eml_filename, config=self.config)
+
+        self.assertIn(email_body, text)
+        self.assertIn(attachment_text, text)
+
+    def test_eml_with_nested_docx_attachment_converted(self) -> None:
+        outer_body = self.fake.paragraph(nb_sentences=10)
+        inner_body = self.fake.paragraph(nb_sentences=10)
+
+        attachment_text = self.fake.paragraph(nb_sentences=10)
+
+        # Built innermost first: docx options nest inside the inner email's.
+        docx_options = {
+            "count": 1,
+            "create_inner_file_func": create_inner_docx_file,
+            "create_inner_file_args": {"content": attachment_text},
+        }
+
+        inner_eml_options = {
+            "count": 1,
+            "create_inner_file_func": create_inner_eml_file,
+            "create_inner_file_args": {
+                "content": inner_body,
+                "options": docx_options,
+            },
+        }
+
+        eml_file = self.fake.eml_file(
+            content=outer_body,
+            options=inner_eml_options,
+        )
+
+        self.config.width = 0
+        text = document_to_text(
+            filename=eml_file.data["filename"], config=self.config
+        )
+
+        self.assertIn(outer_body, text)
+        self.assertIn(inner_body, text)
+        self.assertIn(attachment_text, text)
+
+    def test_eml_html_body_preferred_over_text(self) -> None:
+        # Contrived: normally both parts would carry the same content.
+        plain_body = self.fake.paragraph(nb_sentences=10)
+        html_body = self.fake.paragraph(nb_sentences=10)
+        html_document = f"""
+<!DOCTYPE html>
+<html>
+<head>
+</head>
+<body>
+{html_body}
+</body>
+</html>
+"""
+        # Built by hand: faker-file can't create text + HTML alternatives yet.
+        msg = EmailMessage()
+        msg.set_content(plain_body)
+        msg.add_alternative(html_document, subtype="html")
+        blob = msg.as_bytes()
+
+        extracted = document_to_text(
+            blob=blob, extension=".eml", config=self.config
+        )
+
+        self.assertIn(html_body, extracted)
+        self.assertNotIn(plain_body, extracted)
+
+    def test_eml_latin1_html_decoded_correctly(self) -> None:
+        raw_email = """From: foo@example.org
+To: bar@example.org
+Subject: Latin-1 test
+Content-Type: multipart/mixed; boundary="==="
+MIME-Version: 1.0
+
+--===
+Content-Type: text/html; charset="iso-8859-1"
+Content-Transfer-Encoding: quoted-printable
+
+<html><head>
+<meta http-equiv=3D"Content-Type" content=3D"text/html; charset=3Diso-8859-=
+1">
+</head>
+<body lang=3D"EN-GB">
+Caf=E9
+</body>
+</html>
+--===--
+"""
+
+        parsed = message_from_string(raw_email, policy=policy.default)
+
+        extracted = document_to_text(
+            blob=parsed.as_bytes(), extension=".eml", config=self.config
+        )
+
+        # 0xE9 ("=E9" in the body above) is "é" in ISO-8859-1.
+        self.assertIn("Café", extracted)
+
+    def test_eml_with_no_charset_converted(self) -> None:
+        body = self.fake.paragraph(nb_sentences=10)
+
+        raw_email = f"""From: bar@example.org
+Subject: No charset
+To: foo@example.org
+Mime-Version: 1.0
+Content-Type: multipart/mixed;boundary="==="
+
+--===
+Content-Type: text/plain
+
+{body}
+
+--===--
+
+"""
+
+        parsed = message_from_string(raw_email, policy=policy.default)
+        raw_bytes = parsed.as_bytes()
+
+        extracted = document_to_text(
+            blob=raw_bytes, extension=".eml", config=self.config
+        )
+
+        self.assertIn(body, extracted)
+
+    def test_eml_with_no_content_type_converted(self) -> None:
+        body = self.fake.paragraph(nb_sentences=10)
+
+        raw_email = f"""From: bar@example.org
+Subject: No content type
+To: foo@example.org
+Mime-Version: 1.0
+Content-Type: multipart/mixed;boundary="==="
+
+--===
+
+{body}
+
+--===--
+
+"""
+
+        parsed = message_from_string(raw_email, policy=policy.default)
+        raw_bytes = parsed.as_bytes()
+
+        extracted = document_to_text(
+            blob=raw_bytes, extension=".eml", config=self.config
+        )
+
+        self.assertIn(body, extracted)
+
+    def test_eml_with_empty_body_converted(self) -> None:
+        raw_email = """From: bar@example.org
+Subject: No body
+To: foo@example.org
+Mime-Version: 1.0
+Content-Type: multipart/mixed;boundary="==="
+
+--===
+--===--
+"""
+        parsed = message_from_string(raw_email, policy=policy.default)
+        raw_bytes = parsed.as_bytes()
+
+        extracted = document_to_text(
+            blob=raw_bytes, extension=".eml", config=self.config
+        )
+
+        self.assertEqual("", extracted)
+
+    def test_eml_with_illegal_multibyte_sequence_replaced(self) -> None:
+        raw_email = """From: bar@example.org
+Subject: Illegal multibyte sequence
+To: foo@example.org
+Mime-Version: 1.0
+Content-Type: multipart/mixed;boundary="==="
+
+--===
+Content-Type: text/html; charset="big5"
+Content-Transfer-Encoding: quoted-printable
+
+<html><head>
+<meta http-equiv=3D"Content-Type" content=3D"text/html; charset=3Dbig5">
+</head>
+<body>
+=F9=F9
+</body>
+</html>
+--===--
+"""
+        parsed = message_from_string(raw_email, policy=policy.default)
+
+        extracted = document_to_text(
+            blob=parsed.as_bytes(), extension=".eml", config=self.config
+        )
+
+        # The "=F9=F9" body is expected to come back as two "?" characters.
+        self.assertEqual(extracted.strip(), "??")
+
+    def test_eml_invalid_surrogate_characters_replaced(self) -> None:
+        raw_email = """From: bar@example.org
+Subject: Invalid surrogate characters
+To: foo@example.org
+Mime-Version: 1.0
+Content-Type: multipart/mixed;boundary="==="
+
+--===
+Content-Type: text/html; charset="windows-1252"
+Content-Transfer-Encoding: quoted-printable
+
+<html><head>
+<meta http-equiv=3D"Content-Type" content=3D"text/html; charset=3DWindows-1=
+252">
+</head>
+<body>
+&#55357;&#56898;
+</body>
+</html>
+--===--
+"""
+        parsed = message_from_string(raw_email, policy=policy.default)
+
+        extracted = document_to_text(
+            blob=parsed.as_bytes(), extension=".eml", config=self.config
+        )
+
+        # &#55357; and &#56898; are UTF-16 surrogates (U+D83D, U+DE42).
+        self.assertEqual(extracted.strip(), "??")
+
+    def test_unsupported_will_be_converted_with_strings(self) -> None:
+        with mock.patch.multiple(
+            "cardinal_pythonlib.extract_text.subprocess",
+            Popen=self.mock_popen,
+        ):
+            with NamedTemporaryFile(suffix=".exe", delete=False) as exe_file:
+                exe_file.close()
+                document_to_text(filename=exe_file.name, config=self.config)
+
+        # Popen is mocked, so only the command line is checked.
+        strings_args = (
+            f"{self.empty_dir}/strings",
+            exe_file.name,
+        )
+        self.mock_popen.assert_has_calls(
+            [
+                mock.call(strings_args, stdout=subprocess.PIPE),
+            ]
+        )
diff --git a/cardinal_pythonlib/tools/convert_mdb_to_mysql.py b/cardinal_pythonlib/tools/convert_mdb_to_mysql.py
index 8f2b3975..da859058 100644
--- a/cardinal_pythonlib/tools/convert_mdb_to_mysql.py
+++ b/cardinal_pythonlib/tools/convert_mdb_to_mysql.py
@@ -156,9 +156,9 @@ def __init__(
         nargs: Union[int, str] = "?",  # 0 or 1
         default: Any = None,
         required: bool = False,
-        type: Callable[[str], Any] = None,
+        type: Callable[[str], Any] = None,  # noqa: A002
         metavar: str = None,
-        help: str = None,
+        help: str = None,  # noqa: A002
     ) -> None:
         super(PasswordPromptAction, self).__init__(
             option_strings=option_strings,
diff --git a/cardinal_pythonlib/tools/explore_clang_format_config.py b/cardinal_pythonlib/tools/explore_clang_format_config.py
index 94cb8b0d..c6de7cc6 100644
--- a/cardinal_pythonlib/tools/explore_clang_format_config.py
+++ b/cardinal_pythonlib/tools/explore_clang_format_config.py
@@ -60,7 +60,11 @@ def monitor_diff(filenames: List[str], meld_exe: str) -> subprocess.Popen:
 
 
 def clang_format(
-    config: str, src: str, dest: str, dir: str, clang_format_exe: str
+    config: str,
+    src: str,
+    dest: str,
+    dir: str,  # noqa: A002
+    clang_format_exe: str,
 ) -> None:
     """
     Rungs clang-format, formatting a source file to a destination file using a
diff --git a/docs/docs_requirements.txt b/docs/docs_requirements.txt
index 9ffc9b2e..c88fa733 100644
--- a/docs/docs_requirements.txt
+++ b/docs/docs_requirements.txt
@@ -6,17 +6,13 @@ deform
 dogpile.cache==0.9.2
 # CRATE is on 4.2
 Django>=4.2,<5.0
+faker==13.3.1
+faker-file[common]==0.18.3
 libChEBIpy
 pdfkit
 pyramid==1.10.8
 pytest
-# sphinx==4.2.0
 sphinx==7.1.2
-# sphinxcontrib-applehelp==1.0.4
-# sphinxcontrib-devhelp==1.0.2
-# sphinxcontrib-htmlhelp==2.0.1
-# sphinxcontrib-serializinghtml==1.1.5
-# sphinxcontrib-qthelp==1.0.3
 sphinx-paramlinks==0.6.0
 sphinx_rtd_theme==2.0.0
 weasyprint
diff --git a/docs/source/autodoc/_index.rst b/docs/source/autodoc/_index.rst
index 4adf8f62..d910bf49 100644
--- a/docs/source/autodoc/_index.rst
+++ b/docs/source/autodoc/_index.rst
@@ -66,10 +66,9 @@ Automatic documentation of source code
     docker.py.rst
     dogpile_cache.py.rst
     dsp.py.rst
-    email/mailboxpurge.py.rst
-    email/sendmail.py.rst
-    email/tests/sendmail_tests.py.rst
-    ensure_test_executed_correctly.py.rst
+    email_utils/mailboxpurge.py.rst
+    email_utils/sendmail.py.rst
+    email_utils/tests/sendmail_tests.py.rst
     enumlike.py.rst
     excel.py.rst
     exceptions.py.rst
@@ -82,8 +81,8 @@ Automatic documentation of source code
     httpconst.py.rst
     interval.py.rst
     iterhelp.py.rst
-    json/serialize.py.rst
-    json/typing_helpers.py.rst
+    json_utils/serialize.py.rst
+    json_utils/typing_helpers.py.rst
     lang.py.rst
     lists.py.rst
     logs.py.rst
@@ -104,7 +103,7 @@ Automatic documentation of source code
     plot.py.rst
     probability.py.rst
     process.py.rst
-    profile.py.rst
+    profiling.py.rst
     progress.py.rst
     psychiatry/drugs.py.rst
     psychiatry/mk_r_druglists.py.rst
@@ -178,6 +177,7 @@ Automatic documentation of source code
     tee.py.rst
     tests/datetimefunc_tests.py.rst
     tests/dogpile_cache_tests.py.rst
+    tests/extract_text_tests.py.rst
     tests/interval_tests.py.rst
     tests/lists_tests.py.rst
     tests/pdf_tests.py.rst
diff --git a/docs/source/autodoc/email/mailboxpurge.py.rst b/docs/source/autodoc/email_utils/mailboxpurge.py.rst
similarity index 77%
rename from docs/source/autodoc/email/mailboxpurge.py.rst
rename to docs/source/autodoc/email_utils/mailboxpurge.py.rst
index c49e933f..a4c86daa 100644
--- a/docs/source/autodoc/email/mailboxpurge.py.rst
+++ b/docs/source/autodoc/email_utils/mailboxpurge.py.rst
@@ -1,4 +1,4 @@
-.. docs/source/autodoc/email/mailboxpurge.py.rst
+.. docs/source/autodoc/email_utils/mailboxpurge.py.rst
 
 .. THIS FILE IS AUTOMATICALLY GENERATED. DO NOT EDIT.
 
@@ -18,8 +18,8 @@
     limitations under the License.
 
 
-cardinal_pythonlib.email.mailboxpurge
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+cardinal_pythonlib.email_utils.mailboxpurge
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. automodule:: cardinal_pythonlib.email.mailboxpurge
+.. automodule:: cardinal_pythonlib.email_utils.mailboxpurge
     :members:
diff --git a/docs/source/autodoc/email/sendmail.py.rst b/docs/source/autodoc/email_utils/sendmail.py.rst
similarity index 79%
rename from docs/source/autodoc/email/sendmail.py.rst
rename to docs/source/autodoc/email_utils/sendmail.py.rst
index 82327dc5..e090e976 100644
--- a/docs/source/autodoc/email/sendmail.py.rst
+++ b/docs/source/autodoc/email_utils/sendmail.py.rst
@@ -1,4 +1,4 @@
-.. docs/source/autodoc/email/sendmail.py.rst
+.. docs/source/autodoc/email_utils/sendmail.py.rst
 
 .. THIS FILE IS AUTOMATICALLY GENERATED. DO NOT EDIT.
 
@@ -18,8 +18,8 @@
     limitations under the License.
 
 
-cardinal_pythonlib.email.sendmail
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+cardinal_pythonlib.email_utils.sendmail
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. automodule:: cardinal_pythonlib.email.sendmail
+.. automodule:: cardinal_pythonlib.email_utils.sendmail
     :members:
diff --git a/docs/source/autodoc/email/tests/sendmail_tests.py.rst b/docs/source/autodoc/email_utils/tests/sendmail_tests.py.rst
similarity index 75%
rename from docs/source/autodoc/email/tests/sendmail_tests.py.rst
rename to docs/source/autodoc/email_utils/tests/sendmail_tests.py.rst
index f209e33e..22fa8634 100644
--- a/docs/source/autodoc/email/tests/sendmail_tests.py.rst
+++ b/docs/source/autodoc/email_utils/tests/sendmail_tests.py.rst
@@ -1,4 +1,4 @@
-.. docs/source/autodoc/email/tests/sendmail_tests.py.rst
+.. docs/source/autodoc/email_utils/tests/sendmail_tests.py.rst
 
 .. THIS FILE IS AUTOMATICALLY GENERATED. DO NOT EDIT.
 
@@ -18,8 +18,8 @@
     limitations under the License.
 
 
-cardinal_pythonlib.email.tests.sendmail_tests
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+cardinal_pythonlib.email_utils.tests.sendmail_tests
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. automodule:: cardinal_pythonlib.email.tests.sendmail_tests
+.. automodule:: cardinal_pythonlib.email_utils.tests.sendmail_tests
     :members:
diff --git a/docs/source/autodoc/json/serialize.py.rst b/docs/source/autodoc/json_utils/serialize.py.rst
similarity index 79%
rename from docs/source/autodoc/json/serialize.py.rst
rename to docs/source/autodoc/json_utils/serialize.py.rst
index 18689562..15e18c33 100644
--- a/docs/source/autodoc/json/serialize.py.rst
+++ b/docs/source/autodoc/json_utils/serialize.py.rst
@@ -1,4 +1,4 @@
-.. docs/source/autodoc/json/serialize.py.rst
+.. docs/source/autodoc/json_utils/serialize.py.rst
 
 .. THIS FILE IS AUTOMATICALLY GENERATED. DO NOT EDIT.
 
@@ -18,8 +18,8 @@
     limitations under the License.
 
 
-cardinal_pythonlib.json.serialize
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+cardinal_pythonlib.json_utils.serialize
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. automodule:: cardinal_pythonlib.json.serialize
+.. automodule:: cardinal_pythonlib.json_utils.serialize
     :members:
diff --git a/docs/source/autodoc/json/typing_helpers.py.rst b/docs/source/autodoc/json_utils/typing_helpers.py.rst
similarity index 77%
rename from docs/source/autodoc/json/typing_helpers.py.rst
rename to docs/source/autodoc/json_utils/typing_helpers.py.rst
index d6125b12..e53154ee 100644
--- a/docs/source/autodoc/json/typing_helpers.py.rst
+++ b/docs/source/autodoc/json_utils/typing_helpers.py.rst
@@ -1,4 +1,4 @@
-.. docs/source/autodoc/json/typing_helpers.py.rst
+.. docs/source/autodoc/json_utils/typing_helpers.py.rst
 
 .. THIS FILE IS AUTOMATICALLY GENERATED. DO NOT EDIT.
 
@@ -18,8 +18,8 @@
     limitations under the License.
 
 
-cardinal_pythonlib.json.typing_helpers
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+cardinal_pythonlib.json_utils.typing_helpers
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. automodule:: cardinal_pythonlib.json.typing_helpers
+.. automodule:: cardinal_pythonlib.json_utils.typing_helpers
     :members:
diff --git a/docs/source/autodoc/profile.py.rst b/docs/source/autodoc/profiling.py.rst
similarity index 83%
rename from docs/source/autodoc/profile.py.rst
rename to docs/source/autodoc/profiling.py.rst
index 6149c87a..1c40074b 100644
--- a/docs/source/autodoc/profile.py.rst
+++ b/docs/source/autodoc/profiling.py.rst
@@ -1,4 +1,4 @@
-.. docs/source/autodoc/profile.py.rst
+.. docs/source/autodoc/profiling.py.rst
 
 .. THIS FILE IS AUTOMATICALLY GENERATED. DO NOT EDIT.
 
@@ -18,8 +18,8 @@
     limitations under the License.
 
 
-cardinal_pythonlib.profile
-~~~~~~~~~~~~~~~~~~~~~~~~~~
+cardinal_pythonlib.profiling
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. automodule:: cardinal_pythonlib.profile
+.. automodule:: cardinal_pythonlib.profiling
     :members:
diff --git a/docs/source/autodoc/ensure_test_executed_correctly.py.rst b/docs/source/autodoc/tests/extract_text_tests.py.rst
similarity index 75%
rename from docs/source/autodoc/ensure_test_executed_correctly.py.rst
rename to docs/source/autodoc/tests/extract_text_tests.py.rst
index efde8de6..b1a6abb5 100644
--- a/docs/source/autodoc/ensure_test_executed_correctly.py.rst
+++ b/docs/source/autodoc/tests/extract_text_tests.py.rst
@@ -1,4 +1,4 @@
-.. docs/source/autodoc/ensure_test_executed_correctly.py.rst
+.. docs/source/autodoc/tests/extract_text_tests.py.rst
 
 .. THIS FILE IS AUTOMATICALLY GENERATED. DO NOT EDIT.
 
@@ -18,8 +18,8 @@
     limitations under the License.
 
 
-cardinal_pythonlib.ensure_test_executed_correctly
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+cardinal_pythonlib.tests.extract_text_tests
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. automodule:: cardinal_pythonlib.ensure_test_executed_correctly
+.. automodule:: cardinal_pythonlib.tests.extract_text_tests
     :members:
diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst
index deb6c932..056ac0b4 100644
--- a/docs/source/changelog.rst
+++ b/docs/source/changelog.rst
@@ -889,3 +889,16 @@ Quick links:
 **2.0.5 (2025-04-07)**
 
 - Add VARCHAR to valid Databricks types.
+
+**2.1.0 (2025-05-13)**
+
+- **BREAKING CHANGE**: Rename modules to avoid conflicts with the Python
+  standard library:
+
+   - :mod:`cardinal_pythonlib.email` is now :mod:`cardinal_pythonlib.email_utils`
+   - :mod:`cardinal_pythonlib.json` is now :mod:`cardinal_pythonlib.json_utils`
+   - :mod:`cardinal_pythonlib.profile` is now :mod:`cardinal_pythonlib.profiling`
+
+- Add ``.eml`` support to
+  :func:`cardinal_pythonlib.extract_text.document_to_text`; attachments are
+  processed by the supported converters (``.docx``, ``.pdf``, ``.odt`` etc.).
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 92546291..f45f913a 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -32,7 +32,7 @@
 
 project = "cardinal_pythonlib"
 # noinspection PyShadowingBuiltins
-copyright = "2009-2020, Rudolf Cardinal"
+copyright = "2009-2020, Rudolf Cardinal"  # noqa: A001
 author = "Rudolf Cardinal"
 
 # The short X.Y version
diff --git a/setup.cfg b/setup.cfg
index 7985ef36..b922b871 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -13,7 +13,7 @@ linters=pycodestyle,pyflakes
 max-line-length=79
 # Not compatible with Black and not PEP8 apparently
 # E203: Whitespace before ':'
-extend-ignore = E203
+extend-ignore = A003,E203
 
 [mypy]
 # MyPy is a static type checker. It will not execute the code!
@@ -22,3 +22,6 @@ no_strict_optional = True
 allow_redefinition = True
 disallow_untyped_defs = True
 disallow_incomplete_defs = True
+
+[mypy-semantic_version.*]
+ignore_missing_imports = True
diff --git a/setup.py b/setup.py
index 26489c1b..553ff116 100644
--- a/setup.py
+++ b/setup.py
@@ -17,7 +17,7 @@
 """
 
 from setuptools import setup, find_packages
-from codecs import open
+from codecs import open  # noqa: A004
 from os import path
 
 from cardinal_pythonlib.version_string import VERSION_STRING
@@ -192,7 +192,7 @@
             "cardinalpythonlib_chebi=cardinal_pythonlib.chebi:main",
             (
                 "cardinalpythonlib_email="
-                "cardinal_pythonlib.email.sendmail:main"
+                "cardinal_pythonlib.email_utils.sendmail:main"
             ),
             (
                 "cardinalpythonlib_extract_text="