diff --git a/.github/scripts/install_test_python_packages.sh b/.github/scripts/install_test_python_packages.sh index 12eb3720..129d97ef 100755 --- a/.github/scripts/install_test_python_packages.sh +++ b/.github/scripts/install_test_python_packages.sh @@ -10,3 +10,4 @@ ${PYTHON} -m pip install xlrd ${PYTHON} -m pip install dogpile.cache==0.9.2 # Later versions incompatible ${PYTHON} -m pip install pytest ${PYTHON} -m pip install xhtml2pdf weasyprint pdfkit # For PDF tests +${PYTHON} -m pip install faker==13.3.1 faker-file'[common]'==0.18.3 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2c39bb5d..1af96a37 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -18,10 +18,14 @@ repos: rev: 5.0.4 hooks: - id: flake8 + additional_dependencies: + - flake8-builtins==2.5.0 - repo: https://github.com/asottile/yesqa rev: v1.5.0 hooks: - id: yesqa + additional_dependencies: + - flake8-builtins==2.5.0 - repo: https://github.com/pre-commit/pygrep-hooks rev: v1.9.0 hooks: diff --git a/cardinal_pythonlib/bulk_email/main.py b/cardinal_pythonlib/bulk_email/main.py index 7757574f..a70f19e8 100644 --- a/cardinal_pythonlib/bulk_email/main.py +++ b/cardinal_pythonlib/bulk_email/main.py @@ -57,7 +57,7 @@ Recipient, SendAttempt, ) -from cardinal_pythonlib.email.sendmail import ( +from cardinal_pythonlib.email_utils.sendmail import ( CONTENT_TYPE_HTML, CONTENT_TYPE_TEXT, is_email_valid, diff --git a/cardinal_pythonlib/bulk_email/models.py b/cardinal_pythonlib/bulk_email/models.py index c6d210f5..a01b76c4 100644 --- a/cardinal_pythonlib/bulk_email/models.py +++ b/cardinal_pythonlib/bulk_email/models.py @@ -63,7 +63,7 @@ USERNAME_MAX_LENGTH, ) from cardinal_pythonlib.colander_utils import EMAIL_ADDRESS_MAX_LEN -from cardinal_pythonlib.email.sendmail import ( +from cardinal_pythonlib.email_utils.sendmail import ( ASCII, CONTENT_TYPE_TEXT, is_email_valid, diff --git a/cardinal_pythonlib/django/fields/jsonclassfield.py 
b/cardinal_pythonlib/django/fields/jsonclassfield.py index 43fbaf50..2240707b 100644 --- a/cardinal_pythonlib/django/fields/jsonclassfield.py +++ b/cardinal_pythonlib/django/fields/jsonclassfield.py @@ -130,7 +130,7 @@ def my_decoder_hook(d: Dict) -> Any: # noinspection PyUnresolvedReferences from django.db.models import TextField -from cardinal_pythonlib.json.serialize import json_decode, json_encode +from cardinal_pythonlib.json_utils.serialize import json_decode, json_encode # ============================================================================= diff --git a/cardinal_pythonlib/django/function_cache.py b/cardinal_pythonlib/django/function_cache.py index 11127b11..a2416ab4 100644 --- a/cardinal_pythonlib/django/function_cache.py +++ b/cardinal_pythonlib/django/function_cache.py @@ -36,7 +36,7 @@ from django.core.cache import cache # default cache from cardinal_pythonlib.logs import get_brace_style_log_with_null_handler -from cardinal_pythonlib.json.serialize import json_encode +from cardinal_pythonlib.json_utils.serialize import json_encode log = get_brace_style_log_with_null_handler(__name__) diff --git a/cardinal_pythonlib/django/middleware.py b/cardinal_pythonlib/django/middleware.py index b201a089..3d614a79 100644 --- a/cardinal_pythonlib/django/middleware.py +++ b/cardinal_pythonlib/django/middleware.py @@ -28,7 +28,7 @@ import logging import os -from re import compile +import re import sys from typing import Optional @@ -107,9 +107,9 @@ def process_exception( Modified according to: https://djangosnippets.org/snippets/2845/ """ -# EXEMPT_URLS = [compile(settings.LOGIN_URL.lstrip('/'))] +# EXEMPT_URLS = [re.compile(settings.LOGIN_URL.lstrip('/'))] # if hasattr(settings, 'LOGIN_EXEMPT_URLS'): -# EXEMPT_URLS += [compile(expr) for expr in settings.LOGIN_EXEMPT_URLS] +# EXEMPT_URLS += [re.compile(expr) for expr in settings.LOGIN_EXEMPT_URLS] # # # class LoginRequiredMiddleware: @@ -166,10 +166,10 @@ def process_exception( # 3. 
RNC; composite of those patterns. # ----------------------------------------------------------------------------- -EXEMPT_URLS = [compile(settings.LOGIN_URL.lstrip("/"))] +EXEMPT_URLS = [re.compile(settings.LOGIN_URL.lstrip("/"))] if hasattr(settings, "LOGIN_EXEMPT_URLS"): EXEMPT_URLS += [ - compile(expr.lstrip("/")) for expr in settings.LOGIN_EXEMPT_URLS + re.compile(expr.lstrip("/")) for expr in settings.LOGIN_EXEMPT_URLS ] diff --git a/cardinal_pythonlib/email/__init__.py b/cardinal_pythonlib/email_utils/__init__.py similarity index 95% rename from cardinal_pythonlib/email/__init__.py rename to cardinal_pythonlib/email_utils/__init__.py index 61be9840..c94078aa 100644 --- a/cardinal_pythonlib/email/__init__.py +++ b/cardinal_pythonlib/email_utils/__init__.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -# cardinal_pythonlib/email/__init__.py +# cardinal_pythonlib/email_utils/__init__.py """ =============================================================================== diff --git a/cardinal_pythonlib/email/mailboxpurge.py b/cardinal_pythonlib/email_utils/mailboxpurge.py similarity index 98% rename from cardinal_pythonlib/email/mailboxpurge.py rename to cardinal_pythonlib/email_utils/mailboxpurge.py index cae5c07d..1f52b518 100755 --- a/cardinal_pythonlib/email/mailboxpurge.py +++ b/cardinal_pythonlib/email_utils/mailboxpurge.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -# cardinal_pythonlib/email/mailboxpurge.py +# cardinal_pythonlib/email_utils/mailboxpurge.py """ Remove all binary attachments from email messages diff --git a/cardinal_pythonlib/email/sendmail.py b/cardinal_pythonlib/email_utils/sendmail.py similarity index 99% rename from cardinal_pythonlib/email/sendmail.py rename to cardinal_pythonlib/email_utils/sendmail.py index a286fb8b..edebe34c 100755 --- a/cardinal_pythonlib/email/sendmail.py +++ b/cardinal_pythonlib/email_utils/sendmail.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -# cardinal_pythonlib/email/sendmail.py +# cardinal_pythonlib/email_utils/sendmail.py 
""" =============================================================================== diff --git a/cardinal_pythonlib/email/tests/sendmail_tests.py b/cardinal_pythonlib/email_utils/tests/sendmail_tests.py similarity index 93% rename from cardinal_pythonlib/email/tests/sendmail_tests.py rename to cardinal_pythonlib/email_utils/tests/sendmail_tests.py index 7e3107df..3c4eb374 100644 --- a/cardinal_pythonlib/email/tests/sendmail_tests.py +++ b/cardinal_pythonlib/email_utils/tests/sendmail_tests.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -# cardinal_pythonlib/email/tests/sendmail_tests.py +# cardinal_pythonlib/email_utils/tests/sendmail_tests.py """ =============================================================================== @@ -28,7 +28,7 @@ import unittest -from cardinal_pythonlib.email.sendmail import is_email_valid +from cardinal_pythonlib.email_utils.sendmail import is_email_valid class TestIsEmailValid(unittest.TestCase): diff --git a/cardinal_pythonlib/ensure_test_executed_correctly.py b/cardinal_pythonlib/ensure_test_executed_correctly.py deleted file mode 100644 index 6ae15f40..00000000 --- a/cardinal_pythonlib/ensure_test_executed_correctly.py +++ /dev/null @@ -1,40 +0,0 @@ -#!/usr/bin/env python -# cardinal_pythonlib/module_version.py - -""" -=============================================================================== - - Original code copyright (C) 2009-2022 Rudolf Cardinal (rudolf@pobox.com). - - This file is part of cardinal_pythonlib. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - https://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- See the License for the specific language governing permissions and - limitations under the License. - -=============================================================================== - -**Ensure that a library module is executed properly, and not via a way that -breaks imports.** - -""" - -try: - # we want the stdlib email package! - from email import message_from_string # noqa: F401 -except ImportError: - raise ImportError( - "A test of importing 'email' has found " - "cardinal_pythonlib/email/__init__.py, not the email package from " - "stdlib. You are probably running a cardinal_pythonlib file directly, " - "e.g. with 'python somefile.py' or '/path/somefile.py'. Instead, use " - "'python -m cardinal_pythonlib.somefile'." - ) diff --git a/cardinal_pythonlib/extract_text.py b/cardinal_pythonlib/extract_text.py index 41dbd52e..0c24121e 100755 --- a/cardinal_pythonlib/extract_text.py +++ b/cardinal_pythonlib/extract_text.py @@ -77,9 +77,14 @@ # ============================================================================= import argparse +import base64 +from email import policy +from email.message import EmailMessage +from email.parser import BytesParser from io import StringIO import io import logging +from mimetypes import guess_extension import os import re import shutil @@ -87,6 +92,7 @@ import sys import textwrap from typing import ( + Any, BinaryIO, Dict, Generator, @@ -205,9 +211,9 @@ def __init__( plain: bool = False, semiplain: bool = False, docx_in_order: bool = True, - horizontal_char="─", - vertical_char="│", - junction_char="┼", + horizontal_char: str = "─", + vertical_char: str = "│", + junction_char: str = "┼", plain_table_start: str = None, plain_table_end: str = None, plain_table_col_boundary: str = None, @@ -352,7 +358,7 @@ def get_filelikeobject(filename: str = None, blob: bytes = None) -> BinaryIO: Returns: a :class:`BinaryIO` object """ - if not filename and not blob: + if not filename and blob is None: raise ValueError("no filename and no 
blob") if filename and blob: raise ValueError("specify either filename or blob") @@ -367,11 +373,11 @@ def get_file_contents(filename: str = None, blob: bytes = None) -> bytes: """ Returns the binary contents of a file, or of a BLOB. """ - if not filename and not blob: + if not filename and blob is None: raise ValueError("no filename and no blob") if filename and blob: raise ValueError("specify either filename or blob") - if blob: + if blob is not None: return blob with open(filename, "rb") as f: return f.read() @@ -445,7 +451,7 @@ def get_file_contents_text( ) -def get_cmd_output(*args, encoding: str = SYS_ENCODING) -> str: +def get_cmd_output(*args: Any, encoding: str = SYS_ENCODING) -> str: """ Returns text output of a command. """ @@ -456,7 +462,7 @@ def get_cmd_output(*args, encoding: str = SYS_ENCODING) -> str: def get_cmd_output_from_stdin( - stdint_content_binary: bytes, *args, encoding: str = SYS_ENCODING + stdint_content_binary: bytes, *args: Any, encoding: str = SYS_ENCODING ) -> str: """ Returns text output of a command, passing binary data in via stdin. @@ -549,17 +555,17 @@ def availability_pdf() -> bool: # ----------------------------------------------------------------------------- # In a D.I.Y.
fashion # ----------------------------------------------------------------------------- -# DOCX specification: http://www.ecma-international.org/news/TC45_current_work/TC45_available_docs.htm # noqa: E501 +# DOCX specification: https://ecma-international.org/publications-and-standards/standards/ecma-376/ # noqa: E501 DOCX_HEADER_FILE_REGEX = re.compile("word/header[0-9]*.xml") -DOCX_DOC_FILE = "word/document.xml" +DOCX_DOCUMENT_FILE_REGEX = re.compile("word/document[0-9]*.xml") DOCX_FOOTER_FILE_REGEX = re.compile("word/footer[0-9]*.xml") DOCX_SCHEMA_URL = ( "http://schemas.openxmlformats.org/wordprocessingml/2006/main" ) -def docx_qn(tagroot): +def docx_qn(tagroot: str) -> str: return f"{{{DOCX_SCHEMA_URL}}}{tagroot}" @@ -595,7 +601,9 @@ def gen_xml_files_from_docx(fp: BinaryIO) -> Iterator[str]: for filename in filelist: if DOCX_HEADER_FILE_REGEX.match(filename): yield z.read(filename).decode("utf8") - yield z.read(DOCX_DOC_FILE) + for filename in filelist: + if DOCX_DOCUMENT_FILE_REGEX.match(filename): + yield z.read(filename).decode("utf8") for filename in filelist: if DOCX_FOOTER_FILE_REGEX.match(filename): yield z.read(filename).decode("utf8") @@ -624,7 +632,7 @@ def docx_gen_wordwrapped_fragments( """ to_wrap = [] # type: List[DocxFragment] - def yield_wrapped(): + def yield_wrapped() -> Generator[str, None, None]: """ Yield the word-wrapped stuff to date. """ @@ -1132,9 +1140,24 @@ def convert_html_to_text( """ Converts HTML to text. 
""" + + # https://bugs.launchpad.net/beautifulsoup/+bug/2110492 + # beautifulsoup4==4.13.4 returns "b''" for an empty bytes array + # So we just workaround this here: + if bytes is not None and len(blob) == 0: + return "" + with get_filelikeobject(filename, blob) as fp: - soup = bs4.BeautifulSoup(fp) - return soup.get_text() + soup = bs4.BeautifulSoup(fp, "html.parser") + + # In the real world we can end up with UTF-16 characters embedded as + # numbered entities in Windows-1252 encoded HTML such as + # �� "Slightly smiling face". Replacing these here + # avoids "UnicodeEncodeError: 'utf-8' codec can't encode characters in + # position ... surrogates not allowed". + text = soup.get_text().encode(errors="replace").decode() + + return text # ============================================================================= @@ -1152,7 +1175,7 @@ def convert_xml_to_text( Converts XML to text. """ with get_filelikeobject(filename, blob) as fp: - soup = bs4.BeautifulStoneSoup(fp) + soup = bs4.BeautifulSoup(fp, features="xml") return soup.get_text() @@ -1229,6 +1252,74 @@ def availability_doc() -> bool: return bool(antiword) +# ============================================================================= +# EML +# ============================================================================= + + +def convert_eml_to_text( + filename: str = None, + blob: bytes = None, + config: TextProcessingConfig = _DEFAULT_CONFIG, +) -> str: + email_content_list: list[str] = [] + + with get_filelikeobject(filename, blob) as fp: + parser = BytesParser(policy=policy.default) # type: ignore[arg-type] + message = parser.parse(fp) + + for email_content in _gen_email_content(message, config): + if email_content is not None: + email_content_list.append(email_content) + + text = "\n".join(email_content_list) + + return text + + +def _gen_email_content( + message: EmailMessage, config: TextProcessingConfig +) -> Generator[Optional[str], None, None]: + body = message.get_body( + preferencelist=( + 
"html", + "plain", + ) + ) # type: ignore[attr-defined] + if body is not None: + yield _get_email_content(body, config) + + for part in message.iter_attachments(): # type: ignore[attr-defined] + yield _get_email_content(part, config) + + +def _get_email_content( + message: EmailMessage, + config: TextProcessingConfig, +) -> Optional[str]: + content_type = message.get_content_type() + ext = guess_extension(content_type) + + if ext is not None and ext in ext_map: + content = message.get_content() + if isinstance(content, str): + charset = "utf-8" + content_type_header = message.get("Content-Type") + if content_type_header: + charset = content_type_header.params.get("charset", "utf-8") + blob = content.encode(charset, "replace") + elif isinstance(content, EmailMessage): + blob = content.as_bytes() + if message.get("Content-Transfer-Encoding") == "base64": + blob = base64.b64decode(blob) + else: + blob = content + + return document_to_text(blob=blob, extension=ext, config=config) + + return None + + # ============================================================================= # Anything # ============================================================================= @@ -1267,7 +1358,7 @@ def availability_anything() -> bool: # Decider # ============================================================================= -ext_map = { +ext_map: dict[str, dict[str, Any]] = { # Converter functions must be of the form: func(filename, blob, config). # Availability must be either a boolean literal or a function that takes no # params. 
@@ -1276,6 +1367,7 @@ def availability_anything() -> bool: ".docm": {CONVERTER: convert_docx_to_text, AVAILABILITY: True}, ".docx": {CONVERTER: convert_docx_to_text, AVAILABILITY: True}, ".dot": {CONVERTER: convert_doc_to_text, AVAILABILITY: availability_doc}, + ".eml": {CONVERTER: convert_eml_to_text, AVAILABILITY: True}, ".htm": {CONVERTER: convert_html_to_text, AVAILABILITY: True}, ".html": {CONVERTER: convert_html_to_text, AVAILABILITY: True}, ".log": {CONVERTER: get_file_contents_text, AVAILABILITY: True}, @@ -1333,7 +1425,7 @@ def document_to_text( Raises an exception for malformed arguments, missing files, bad filetypes, etc. """ - if not filename and not blob: + if not filename and blob is None: raise ValueError("document_to_text: no filename and no blob") if filename and blob: raise ValueError("document_to_text: specify either filename or blob") diff --git a/cardinal_pythonlib/json/__init__.py b/cardinal_pythonlib/json_utils/__init__.py similarity index 95% rename from cardinal_pythonlib/json/__init__.py rename to cardinal_pythonlib/json_utils/__init__.py index 3256199d..51cc3bd1 100644 --- a/cardinal_pythonlib/json/__init__.py +++ b/cardinal_pythonlib/json_utils/__init__.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -# cardinal_pythonlib/json/__init__.py +# cardinal_pythonlib/json_utils/__init__.py """ =============================================================================== diff --git a/cardinal_pythonlib/json/serialize.py b/cardinal_pythonlib/json_utils/serialize.py similarity index 99% rename from cardinal_pythonlib/json/serialize.py rename to cardinal_pythonlib/json_utils/serialize.py index 3103a6ee..eb3a434f 100644 --- a/cardinal_pythonlib/json/serialize.py +++ b/cardinal_pythonlib/json_utils/serialize.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -# cardinal_pythonlib/json/serialize.py +# cardinal_pythonlib/json_utils/serialize.py """ =============================================================================== diff --git 
a/cardinal_pythonlib/json/typing_helpers.py b/cardinal_pythonlib/json_utils/typing_helpers.py similarity index 96% rename from cardinal_pythonlib/json/typing_helpers.py rename to cardinal_pythonlib/json_utils/typing_helpers.py index d5c6c18a..47c7161f 100644 --- a/cardinal_pythonlib/json/typing_helpers.py +++ b/cardinal_pythonlib/json_utils/typing_helpers.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -# cardinal_pythonlib/json/typing_helpers.py +# cardinal_pythonlib/json_utils/typing_helpers.py """ =============================================================================== diff --git a/cardinal_pythonlib/module_version.py b/cardinal_pythonlib/module_version.py index f18f9a0a..cb251fe6 100644 --- a/cardinal_pythonlib/module_version.py +++ b/cardinal_pythonlib/module_version.py @@ -39,9 +39,6 @@ from semantic_version import Version -# noinspection PyUnresolvedReferences -import cardinal_pythonlib.ensure_test_executed_correctly # noqa: F401 - # ============================================================================= # Report Python module versions diff --git a/cardinal_pythonlib/profile.py b/cardinal_pythonlib/profiling.py similarity index 97% rename from cardinal_pythonlib/profile.py rename to cardinal_pythonlib/profiling.py index 558e13af..a06074e8 100644 --- a/cardinal_pythonlib/profile.py +++ b/cardinal_pythonlib/profiling.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -# cardinal_pythonlib/profile.py +# cardinal_pythonlib/profiling.py """ =============================================================================== diff --git a/cardinal_pythonlib/sizeformatter.py b/cardinal_pythonlib/sizeformatter.py index 10169fea..69345e2a 100644 --- a/cardinal_pythonlib/sizeformatter.py +++ b/cardinal_pythonlib/sizeformatter.py @@ -69,7 +69,7 @@ def sizeof_fmt(num: float, suffix: str = "B") -> str: def bytes2human( n: Union[int, float], - format: str = "%(value).1f %(symbol)s", + format: str = "%(value).1f %(symbol)s", # noqa: A002 symbols: str = "customary", ) -> str: """ 
diff --git a/cardinal_pythonlib/tests/__init__.py b/cardinal_pythonlib/tests/__init__.py new file mode 100644 index 00000000..a5311f01 --- /dev/null +++ b/cardinal_pythonlib/tests/__init__.py @@ -0,0 +1,27 @@ +# cardinal_pythonlib/tests/__init__.py + +""" +=============================================================================== + + Original code copyright (C) 2009-2022 Rudolf Cardinal (rudolf@pobox.com). + + This file is part of cardinal_pythonlib. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +=============================================================================== + +The mere existence of this file makes Python treat the directory as a +package. + +""" diff --git a/cardinal_pythonlib/tests/extract_text_tests.py b/cardinal_pythonlib/tests/extract_text_tests.py new file mode 100644 index 00000000..3a64b7b8 --- /dev/null +++ b/cardinal_pythonlib/tests/extract_text_tests.py @@ -0,0 +1,588 @@ +# cardinal_pythonlib/tests/extract_text_tests.py + +""" +=============================================================================== + + Original code copyright (C) 2009-2022 Rudolf Cardinal (rudolf@pobox.com). + + This file is part of cardinal_pythonlib. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +=============================================================================== + +**Text extraction tests.** + +""" + +from email import message_from_string, policy +from email.message import EmailMessage +import os +import subprocess +from tempfile import mkdtemp, NamedTemporaryFile +from unittest import mock, TestCase + +from faker import Faker +from faker_file.providers.docx_file import DocxFileProvider +from faker_file.providers.eml_file import EmlFileProvider +from faker_file.providers.helpers.inner import ( + create_inner_docx_file, + create_inner_eml_file, +) +from faker_file.providers.odt_file import OdtFileProvider +from faker_file.providers.txt_file import TxtFileProvider +from faker_file.providers.xml_file import XmlFileProvider + +from cardinal_pythonlib.extract_text import ( + document_to_text, + TextProcessingConfig, + update_external_tools, +) + + +class DocumentToTextTests(TestCase): + def setUp(self) -> None: + self.empty_dir = mkdtemp() + + self._replace_external_tools_with_fakes() + self.config = TextProcessingConfig() + self._create_mock_objects() + self._register_faker_providers() + + def _create_mock_objects(self) -> None: + # Some mock empty output that we don't check + mock_decode = mock.Mock(return_value="") + mock_stdout = mock.Mock(decode=mock_decode) + mock_communicate = mock.Mock(return_value=(mock_stdout, None)) + self.mock_popen = mock.Mock( + return_value=mock.Mock(communicate=mock_communicate) + ) + + def _register_faker_providers(self) -> None: + self.fake = Faker("en-US") # To avoid Lorem Ipsum + self.fake.seed_instance(12345) 
+ self.fake.add_provider(DocxFileProvider) + self.fake.add_provider(EmlFileProvider) + self.fake.add_provider(OdtFileProvider) + self.fake.add_provider(TxtFileProvider) + self.fake.add_provider(XmlFileProvider) + + def _replace_external_tools_with_fakes(self) -> None: + # For external tools we assume the tools are running correctly + # and we just check that they are invoked with the correct arguments. + + tool_names = [ + "antiword", + "pdftotext", + "strings", + "strings2", + "unrtf", + ] + + tools_dir = {t: os.path.join(self.empty_dir, t) for t in tool_names} + update_external_tools(tools_dir) + + def tearDown(self) -> None: + os.rmdir(self.empty_dir) + + def test_raises_when_no_filename_or_blob(self) -> None: + with self.assertRaises(ValueError) as cm: + document_to_text() + + self.assertIn("no filename and no blob", str(cm.exception)) + + def test_raises_when_filename_empty(self) -> None: + with self.assertRaises(ValueError) as cm: + document_to_text(filename="") + + self.assertIn("no filename and no blob", str(cm.exception)) + + def test_raises_when_filename_and_blob(self) -> None: + with self.assertRaises(ValueError) as cm: + document_to_text(filename="foo", blob="bar") + + self.assertIn("specify either filename or blob", str(cm.exception)) + + def test_raises_when_blob_but_no_extension(self) -> None: + with self.assertRaises(ValueError) as cm: + document_to_text(blob="bar") + + self.assertIn("need extension hint for blob", str(cm.exception)) + + def test_raises_when_not_a_file(self) -> None: + with self.assertRaises(ValueError) as cm: + filename = os.path.join(self.empty_dir, "foo") + document_to_text(filename=filename) + + self.assertIn("no such file", str(cm.exception)) + + def test_csv_converted(self) -> None: + content = "one,two,three" + + with NamedTemporaryFile(suffix=".csv", delete=False) as temp_file: + temp_file.write(content.encode("utf-8")) + temp_file.close() + text = document_to_text(filename=temp_file.name) + + self.assertEqual(text, content) 
+ + def test_doc_will_be_converted_with_antiword(self) -> None: + with mock.patch.multiple( + "cardinal_pythonlib.extract_text.subprocess", + Popen=self.mock_popen, + ): + with NamedTemporaryFile(suffix=".doc", delete=False) as temp_file: + temp_file.close() + document_to_text(filename=temp_file.name, config=self.config) + + expected_calls = [ + mock.call( + ( + f"{self.empty_dir}/antiword", + "-w", + str(self.config.width), + temp_file.name, + ), + stdout=subprocess.PIPE, + ), + ] + self.mock_popen.assert_has_calls(expected_calls) + + def test_dot_will_be_converted_with_antiword(self) -> None: + with mock.patch.multiple( + "cardinal_pythonlib.extract_text.subprocess", + Popen=self.mock_popen, + ): + with NamedTemporaryFile(suffix=".dot", delete=False) as temp_file: + temp_file.close() + document_to_text(filename=temp_file.name) + + expected_calls = [ + mock.call( + ( + f"{self.empty_dir}/antiword", + "-w", + str(self.config.width), + temp_file.name, + ), + stdout=subprocess.PIPE, + ), + ] + self.mock_popen.assert_has_calls(expected_calls) + + def test_docx_converted(self) -> None: + content = self.fake.paragraph(nb_sentences=10) + + docx = self.fake.docx_file(content=content) + self.config.width = 0 + text = document_to_text( + filename=docx.data["filename"], config=self.config + ) + + self.assertEqual(text.strip(), content) + + def test_htm_converted(self) -> None: + content = self.fake.paragraph(nb_sentences=10) + + html = f""" + + + + + +{content} + + +""" + + text = document_to_text( + blob=html.encode("utf-8"), extension="htm", config=self.config + ) + self.assertEqual(text.strip(), content) + + def test_empty_htm_converted(self) -> None: + text = document_to_text( + blob="".encode("utf-8"), extension="htm", config=self.config + ) + self.assertEqual(text, "") + + def test_log_converted(self) -> None: + content = """ +2025-04-02 06:05:43,772 INFO Starting unattended upgrades script +2025-04-02 06:05:43,772 INFO Allowed origins are: o=Ubuntu,a=focal, 
o=Ubuntu,a=focal-security, o=UbuntuESMApps,a=focal-apps-security, o=UbuntuESM,a=focal-infra-security +""" # noqa: E501 + + text = document_to_text( + blob=content.encode("utf-8"), extension="log", config=self.config + ) + + self.assertEqual(text.strip(), content.strip()) + + def test_odt_converted(self) -> None: + content = self.fake.paragraph(nb_sentences=10) + + odt = self.fake.odt_file(content=content) + self.config.width = 0 + text = document_to_text( + filename=odt.data["filename"], config=self.config + ) + + self.assertEqual(text.strip(), content) + + def test_pdf_will_be_converted_with_pdftotext(self) -> None: + with mock.patch.multiple( + "cardinal_pythonlib.extract_text.subprocess", + Popen=self.mock_popen, + ): + with NamedTemporaryFile(suffix=".pdf", delete=False) as temp_file: + temp_file.close() + document_to_text(filename=temp_file.name, config=self.config) + + expected_calls = [ + mock.call( + ( + f"{self.empty_dir}/pdftotext", + temp_file.name, + "-", + ), + stdout=subprocess.PIPE, + ), + ] + self.mock_popen.assert_has_calls(expected_calls) + + def test_rtf_will_be_converted_with_unrtf(self) -> None: + with mock.patch( + "cardinal_pythonlib.extract_text.UNRTF_SUPPORTS_QUIET", True + ): + with mock.patch.multiple( + "cardinal_pythonlib.extract_text.subprocess", + Popen=self.mock_popen, + ): + with NamedTemporaryFile( + suffix=".rtf", delete=False + ) as temp_file: + temp_file.close() + document_to_text( + filename=temp_file.name, config=self.config + ) + + expected_calls = [ + mock.call( + ( + f"{self.empty_dir}/unrtf", + "--text", + "--nopict", + "--quiet", + temp_file.name, + ), + stdout=subprocess.PIPE, + ), + ] + self.mock_popen.assert_has_calls(expected_calls) + + def test_txt_converted(self) -> None: + content = self.fake.paragraph(nb_sentences=10) + txt_file = self.fake.txt_file(content=content) + text = document_to_text(filename=txt_file.data["filename"]) + + self.assertEqual(text.strip(), content) + + def test_xml_converted(self) -> None: + 
name = self.fake.name() + address = self.fake.address() + + xml_file = self.fake.xml_file( + num_rows=1, + data_columns={ + "name": name, + "address": address, + }, + ) + text = document_to_text(filename=xml_file.data["filename"]) + + self.assertEqual(text.strip(), f"{name}{address}") + + def test_eml_converted(self) -> None: + content = self.fake.paragraph(nb_sentences=10) + eml_file = self.fake.eml_file(content=content) + text = document_to_text(filename=eml_file.data["filename"]) + + self.assertEqual(text.strip(), content) + + def test_eml_with_docx_attachment_converted(self) -> None: + body_content = self.fake.paragraph(nb_sentences=10) + docx_content = self.fake.paragraph(nb_sentences=10) + + docx_file_args = dict(content=docx_content) + options = dict( + count=1, + create_inner_file_func=create_inner_docx_file, + create_inner_file_args=docx_file_args, + ) + + eml_file = self.fake.eml_file( + content=body_content, + options=options, + ) + self.config.width = 0 + text = document_to_text( + filename=eml_file.data["filename"], config=self.config + ) + + self.assertIn(body_content, text) + self.assertIn(docx_content, text) + + def test_eml_with_nested_docx_attachment_converted(self) -> None: + outer_email_content = self.fake.paragraph(nb_sentences=10) + inner_email_content = self.fake.paragraph(nb_sentences=10) + + docx_content = self.fake.paragraph(nb_sentences=10) + + docx_file_args = dict(content=docx_content) + docx_options = dict( + count=1, + create_inner_file_func=create_inner_docx_file, + create_inner_file_args=docx_file_args, + ) + eml_file_args = dict( + content=inner_email_content, + options=docx_options, + ) + eml_options = dict( + count=1, + create_inner_file_func=create_inner_eml_file, + create_inner_file_args=eml_file_args, + ) + + eml_file = self.fake.eml_file( + content=outer_email_content, + options=eml_options, + ) + + self.config.width = 0 + text = document_to_text( + filename=eml_file.data["filename"], config=self.config + ) + + 
self.assertIn(outer_email_content, text) + self.assertIn(inner_email_content, text) + self.assertIn(docx_content, text) + + def test_eml_html_body_preferred_over_text(self) -> None: + # Contrived example. Normally these would have the same content + text_content = self.fake.paragraph(nb_sentences=10) + html_content = self.fake.paragraph(nb_sentences=10) + html = f""" + + + + + +{html_content} + + +""" + # faker-file can't do this yet + message = EmailMessage() + message.set_content(text_content) + message.add_alternative(html, subtype="html") + blob = message.as_bytes() + + text = document_to_text( + blob=blob, extension=".eml", config=self.config + ) + + self.assertIn(html_content, text) + self.assertNotIn(text_content, text) + + def test_eml_latin1_html_decoded_correctly(self) -> None: + content = """From: foo@example.org +To: bar@example.org +Subject: Latin-1 test +Content-Type: multipart/mixed; boundary="===" +MIME-Version: 1.0 + +--=== +Content-Type: text/html; charset="iso-8859-1" +Content-Transfer-Encoding: quoted-printable + + + + + +Caf=E9 + + +--===-- +""" + + message = message_from_string(content, policy=policy.default) + blob = message.as_bytes() + + text = document_to_text( + blob=blob, extension=".eml", config=self.config + ) + + self.assertIn("Café", text) + + def test_eml_with_no_charset_converted(self) -> None: + text_content = self.fake.paragraph(nb_sentences=10) + + content = f"""From: bar@example.org +Subject: No charset +To: foo@example.org +Mime-Version: 1.0 +Content-Type: multipart/mixed;boundary="===" + +--=== +Content-Type: text/plain + +{text_content} + +--===-- + +""" + + message = message_from_string(content, policy=policy.default) + blob = message.as_bytes() + + text = document_to_text( + blob=blob, extension=".eml", config=self.config + ) + + self.assertIn(text_content, text) + + def test_eml_with_no_content_type_converted(self) -> None: + text_content = self.fake.paragraph(nb_sentences=10) + + content = f"""From: bar@example.org 
+Subject: No content type +To: foo@example.org +Mime-Version: 1.0 +Content-Type: multipart/mixed;boundary="===" + +--=== + +{text_content} + +--===-- + +""" + + message = message_from_string(content, policy=policy.default) + blob = message.as_bytes() + + text = document_to_text( + blob=blob, extension=".eml", config=self.config + ) + + self.assertIn(text_content, text) + + def test_eml_with_empty_body_converted(self) -> None: + content = """From: bar@example.org +Subject: No body +To: foo@example.org +Mime-Version: 1.0 +Content-Type: multipart/mixed;boundary="===" + +--=== +--===-- +""" + message = message_from_string(content, policy=policy.default) + blob = message.as_bytes() + + text = document_to_text( + blob=blob, extension=".eml", config=self.config + ) + + self.assertEqual("", text) + + def test_eml_with_illegal_multibyte_sequence_replaced(self) -> None: + content = """From: bar@example.org +Subject: Illegal multibyte sequence +To: foo@example.org +Mime-Version: 1.0 +Content-Type: multipart/mixed;boundary="===" + +--=== +Content-Type: text/html; charset="big5" +Content-Transfer-Encoding: quoted-printable + + + + + +=F9=F9 + + +--===-- +""" + message = message_from_string(content, policy=policy.default) + blob = message.as_bytes() + + text = document_to_text( + blob=blob, extension=".eml", config=self.config + ) + + self.assertEqual(text.strip(), "??") + + def test_eml_invalid_surrogate_characters_replaced(self) -> None: + content = """From: bar@example.org +Subject: Invalid surrogate characters +To: foo@example.org +Mime-Version: 1.0 +Content-Type: multipart/mixed;boundary="===" + +--=== +Content-Type: text/html; charset="windows-1252" +Content-Transfer-Encoding: quoted-printable + + + + + +�� + + +--===-- +""" + message = message_from_string(content, policy=policy.default) + blob = message.as_bytes() + + text = document_to_text( + blob=blob, extension=".eml", config=self.config + ) + + self.assertEqual(text.strip(), "??") + + def 
test_unsupported_will_be_converted_with_strings(self) -> None: + with mock.patch.multiple( + "cardinal_pythonlib.extract_text.subprocess", + Popen=self.mock_popen, + ): + with NamedTemporaryFile(suffix=".exe", delete=False) as temp_file: + temp_file.close() + document_to_text(filename=temp_file.name, config=self.config) + + expected_calls = [ + mock.call( + ( + f"{self.empty_dir}/strings", + temp_file.name, + ), + stdout=subprocess.PIPE, + ), + ] + self.mock_popen.assert_has_calls(expected_calls) diff --git a/cardinal_pythonlib/tools/convert_mdb_to_mysql.py b/cardinal_pythonlib/tools/convert_mdb_to_mysql.py index 8f2b3975..da859058 100644 --- a/cardinal_pythonlib/tools/convert_mdb_to_mysql.py +++ b/cardinal_pythonlib/tools/convert_mdb_to_mysql.py @@ -156,9 +156,9 @@ def __init__( nargs: Union[int, str] = "?", # 0 or 1 default: Any = None, required: bool = False, - type: Callable[[str], Any] = None, + type: Callable[[str], Any] = None, # noqa: A002 metavar: str = None, - help: str = None, + help: str = None, # noqa: A002 ) -> None: super(PasswordPromptAction, self).__init__( option_strings=option_strings, diff --git a/cardinal_pythonlib/tools/explore_clang_format_config.py b/cardinal_pythonlib/tools/explore_clang_format_config.py index 94cb8b0d..c6de7cc6 100644 --- a/cardinal_pythonlib/tools/explore_clang_format_config.py +++ b/cardinal_pythonlib/tools/explore_clang_format_config.py @@ -60,7 +60,11 @@ def monitor_diff(filenames: List[str], meld_exe: str) -> subprocess.Popen: def clang_format( - config: str, src: str, dest: str, dir: str, clang_format_exe: str + config: str, + src: str, + dest: str, + dir: str, # noqa: A002 + clang_format_exe: str, ) -> None: """ Rungs clang-format, formatting a source file to a destination file using a diff --git a/docs/docs_requirements.txt b/docs/docs_requirements.txt index 9ffc9b2e..c88fa733 100644 --- a/docs/docs_requirements.txt +++ b/docs/docs_requirements.txt @@ -6,17 +6,13 @@ deform dogpile.cache==0.9.2 # CRATE is on 4.2 
Django>=4.2,<5.0 +faker==13.3.1 +faker-file[common]==0.18.3 libChEBIpy pdfkit pyramid==1.10.8 pytest -# sphinx==4.2.0 sphinx==7.1.2 -# sphinxcontrib-applehelp==1.0.4 -# sphinxcontrib-devhelp==1.0.2 -# sphinxcontrib-htmlhelp==2.0.1 -# sphinxcontrib-serializinghtml==1.1.5 -# sphinxcontrib-qthelp==1.0.3 sphinx-paramlinks==0.6.0 sphinx_rtd_theme==2.0.0 weasyprint diff --git a/docs/source/autodoc/_index.rst b/docs/source/autodoc/_index.rst index 4adf8f62..d910bf49 100644 --- a/docs/source/autodoc/_index.rst +++ b/docs/source/autodoc/_index.rst @@ -66,10 +66,9 @@ Automatic documentation of source code docker.py.rst dogpile_cache.py.rst dsp.py.rst - email/mailboxpurge.py.rst - email/sendmail.py.rst - email/tests/sendmail_tests.py.rst - ensure_test_executed_correctly.py.rst + email_utils/mailboxpurge.py.rst + email_utils/sendmail.py.rst + email_utils/tests/sendmail_tests.py.rst enumlike.py.rst excel.py.rst exceptions.py.rst @@ -82,8 +81,8 @@ Automatic documentation of source code httpconst.py.rst interval.py.rst iterhelp.py.rst - json/serialize.py.rst - json/typing_helpers.py.rst + json_utils/serialize.py.rst + json_utils/typing_helpers.py.rst lang.py.rst lists.py.rst logs.py.rst @@ -104,7 +103,7 @@ Automatic documentation of source code plot.py.rst probability.py.rst process.py.rst - profile.py.rst + profiling.py.rst progress.py.rst psychiatry/drugs.py.rst psychiatry/mk_r_druglists.py.rst @@ -178,6 +177,7 @@ Automatic documentation of source code tee.py.rst tests/datetimefunc_tests.py.rst tests/dogpile_cache_tests.py.rst + tests/extract_text_tests.py.rst tests/interval_tests.py.rst tests/lists_tests.py.rst tests/pdf_tests.py.rst diff --git a/docs/source/autodoc/email/mailboxpurge.py.rst b/docs/source/autodoc/email_utils/mailboxpurge.py.rst similarity index 77% rename from docs/source/autodoc/email/mailboxpurge.py.rst rename to docs/source/autodoc/email_utils/mailboxpurge.py.rst index c49e933f..a4c86daa 100644 --- a/docs/source/autodoc/email/mailboxpurge.py.rst +++ 
b/docs/source/autodoc/email_utils/mailboxpurge.py.rst @@ -1,4 +1,4 @@ -.. docs/source/autodoc/email/mailboxpurge.py.rst +.. docs/source/autodoc/email_utils/mailboxpurge.py.rst .. THIS FILE IS AUTOMATICALLY GENERATED. DO NOT EDIT. @@ -18,8 +18,8 @@ limitations under the License. -cardinal_pythonlib.email.mailboxpurge -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +cardinal_pythonlib.email_utils.mailboxpurge +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. automodule:: cardinal_pythonlib.email.mailboxpurge +.. automodule:: cardinal_pythonlib.email_utils.mailboxpurge :members: diff --git a/docs/source/autodoc/email/sendmail.py.rst b/docs/source/autodoc/email_utils/sendmail.py.rst similarity index 79% rename from docs/source/autodoc/email/sendmail.py.rst rename to docs/source/autodoc/email_utils/sendmail.py.rst index 82327dc5..e090e976 100644 --- a/docs/source/autodoc/email/sendmail.py.rst +++ b/docs/source/autodoc/email_utils/sendmail.py.rst @@ -1,4 +1,4 @@ -.. docs/source/autodoc/email/sendmail.py.rst +.. docs/source/autodoc/email_utils/sendmail.py.rst .. THIS FILE IS AUTOMATICALLY GENERATED. DO NOT EDIT. @@ -18,8 +18,8 @@ limitations under the License. -cardinal_pythonlib.email.sendmail -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +cardinal_pythonlib.email_utils.sendmail +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. automodule:: cardinal_pythonlib.email.sendmail +.. automodule:: cardinal_pythonlib.email_utils.sendmail :members: diff --git a/docs/source/autodoc/email/tests/sendmail_tests.py.rst b/docs/source/autodoc/email_utils/tests/sendmail_tests.py.rst similarity index 75% rename from docs/source/autodoc/email/tests/sendmail_tests.py.rst rename to docs/source/autodoc/email_utils/tests/sendmail_tests.py.rst index f209e33e..22fa8634 100644 --- a/docs/source/autodoc/email/tests/sendmail_tests.py.rst +++ b/docs/source/autodoc/email_utils/tests/sendmail_tests.py.rst @@ -1,4 +1,4 @@ -.. docs/source/autodoc/email/tests/sendmail_tests.py.rst +.. 
docs/source/autodoc/email_utils/tests/sendmail_tests.py.rst .. THIS FILE IS AUTOMATICALLY GENERATED. DO NOT EDIT. @@ -18,8 +18,8 @@ limitations under the License. -cardinal_pythonlib.email.tests.sendmail_tests -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +cardinal_pythonlib.email_utils.tests.sendmail_tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. automodule:: cardinal_pythonlib.email.tests.sendmail_tests +.. automodule:: cardinal_pythonlib.email_utils.tests.sendmail_tests :members: diff --git a/docs/source/autodoc/json/serialize.py.rst b/docs/source/autodoc/json_utils/serialize.py.rst similarity index 79% rename from docs/source/autodoc/json/serialize.py.rst rename to docs/source/autodoc/json_utils/serialize.py.rst index 18689562..15e18c33 100644 --- a/docs/source/autodoc/json/serialize.py.rst +++ b/docs/source/autodoc/json_utils/serialize.py.rst @@ -1,4 +1,4 @@ -.. docs/source/autodoc/json/serialize.py.rst +.. docs/source/autodoc/json_utils/serialize.py.rst .. THIS FILE IS AUTOMATICALLY GENERATED. DO NOT EDIT. @@ -18,8 +18,8 @@ limitations under the License. -cardinal_pythonlib.json.serialize -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +cardinal_pythonlib.json_utils.serialize +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. automodule:: cardinal_pythonlib.json.serialize +.. automodule:: cardinal_pythonlib.json_utils.serialize :members: diff --git a/docs/source/autodoc/json/typing_helpers.py.rst b/docs/source/autodoc/json_utils/typing_helpers.py.rst similarity index 77% rename from docs/source/autodoc/json/typing_helpers.py.rst rename to docs/source/autodoc/json_utils/typing_helpers.py.rst index d6125b12..e53154ee 100644 --- a/docs/source/autodoc/json/typing_helpers.py.rst +++ b/docs/source/autodoc/json_utils/typing_helpers.py.rst @@ -1,4 +1,4 @@ -.. docs/source/autodoc/json/typing_helpers.py.rst +.. docs/source/autodoc/json_utils/typing_helpers.py.rst .. THIS FILE IS AUTOMATICALLY GENERATED. DO NOT EDIT. @@ -18,8 +18,8 @@ limitations under the License. 
-cardinal_pythonlib.json.typing_helpers -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +cardinal_pythonlib.json_utils.typing_helpers +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. automodule:: cardinal_pythonlib.json.typing_helpers +.. automodule:: cardinal_pythonlib.json_utils.typing_helpers :members: diff --git a/docs/source/autodoc/profile.py.rst b/docs/source/autodoc/profiling.py.rst similarity index 83% rename from docs/source/autodoc/profile.py.rst rename to docs/source/autodoc/profiling.py.rst index 6149c87a..1c40074b 100644 --- a/docs/source/autodoc/profile.py.rst +++ b/docs/source/autodoc/profiling.py.rst @@ -1,4 +1,4 @@ -.. docs/source/autodoc/profile.py.rst +.. docs/source/autodoc/profiling.py.rst .. THIS FILE IS AUTOMATICALLY GENERATED. DO NOT EDIT. @@ -18,8 +18,8 @@ limitations under the License. -cardinal_pythonlib.profile -~~~~~~~~~~~~~~~~~~~~~~~~~~ +cardinal_pythonlib.profiling +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. automodule:: cardinal_pythonlib.profile +.. automodule:: cardinal_pythonlib.profiling :members: diff --git a/docs/source/autodoc/ensure_test_executed_correctly.py.rst b/docs/source/autodoc/tests/extract_text_tests.py.rst similarity index 75% rename from docs/source/autodoc/ensure_test_executed_correctly.py.rst rename to docs/source/autodoc/tests/extract_text_tests.py.rst index efde8de6..b1a6abb5 100644 --- a/docs/source/autodoc/ensure_test_executed_correctly.py.rst +++ b/docs/source/autodoc/tests/extract_text_tests.py.rst @@ -1,4 +1,4 @@ -.. docs/source/autodoc/ensure_test_executed_correctly.py.rst +.. docs/source/autodoc/tests/extract_text_tests.py.rst .. THIS FILE IS AUTOMATICALLY GENERATED. DO NOT EDIT. @@ -18,8 +18,8 @@ limitations under the License. -cardinal_pythonlib.ensure_test_executed_correctly -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +cardinal_pythonlib.tests.extract_text_tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. automodule:: cardinal_pythonlib.ensure_test_executed_correctly +.. 
automodule:: cardinal_pythonlib.tests.extract_text_tests :members: diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst index deb6c932..056ac0b4 100644 --- a/docs/source/changelog.rst +++ b/docs/source/changelog.rst @@ -889,3 +889,16 @@ Quick links: **2.0.5 (2025-04-07)** - Add VARCHAR to valid Databricks types. + +**2.1.0 (2025-05-13)** + +- **BREAKING CHANGE**: Rename modules to avoid conflicts with the Python + standard library: + + - :mod:`cardinal_pythonlib.email` is now :mod:`cardinal_pythonlib.email_utils` + - :mod:`cardinal_pythonlib.json` is now :mod:`cardinal_pythonlib.json_utils` + - :mod:`cardinal_pythonlib.profile` is now :mod:`cardinal_pythonlib.profiling` + +- Add support for ``.eml`` files with attachments processed by supported + document converters (``.docx``, ``.pdf``, ``.odt`` etc.) to + :func:`cardinal_pythonlib.extract_text.document_to_text`. diff --git a/docs/source/conf.py b/docs/source/conf.py index 92546291..f45f913a 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -32,7 +32,7 @@ project = "cardinal_pythonlib" # noinspection PyShadowingBuiltins -copyright = "2009-2020, Rudolf Cardinal" +copyright = "2009-2020, Rudolf Cardinal" # noqa: A001 author = "Rudolf Cardinal" # The short X.Y version diff --git a/setup.cfg b/setup.cfg index 7985ef36..b922b871 100644 --- a/setup.cfg +++ b/setup.cfg @@ -13,7 +13,7 @@ linters=pycodestyle,pyflakes max-line-length=79 # Not compatible with Black and not PEP8 apparently # E203: Whitespace before ':' -extend-ignore = E203 +extend-ignore = A003,E203 [mypy] # MyPy is a static type checker. It will not execute the code! 
@@ -22,3 +22,6 @@ no_strict_optional = True allow_redefinition = True disallow_untyped_defs = True disallow_incomplete_defs = True + +[mypy-semantic_version.*] +ignore_missing_imports = True diff --git a/setup.py b/setup.py index 26489c1b..553ff116 100644 --- a/setup.py +++ b/setup.py @@ -17,7 +17,7 @@ """ from setuptools import setup, find_packages -from codecs import open +from codecs import open # noqa: A004 from os import path from cardinal_pythonlib.version_string import VERSION_STRING @@ -192,7 +192,7 @@ "cardinalpythonlib_chebi=cardinal_pythonlib.chebi:main", ( "cardinalpythonlib_email=" - "cardinal_pythonlib.email.sendmail:main" + "cardinal_pythonlib.email_utils.sendmail:main" ), ( "cardinalpythonlib_extract_text="