From 04266f38096b3cbcddbc1747b5025dd87fcac1ad Mon Sep 17 00:00:00 2001 From: Martin Burchell Date: Tue, 29 Apr 2025 13:36:42 +0100 Subject: [PATCH 01/39] Add tests __init__.py This will cause the tests directory to be treated as a package and allow two test files to have the same name in different directories. --- cardinal_pythonlib/tests/__init__.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 cardinal_pythonlib/tests/__init__.py diff --git a/cardinal_pythonlib/tests/__init__.py b/cardinal_pythonlib/tests/__init__.py new file mode 100644 index 0000000..a5311f0 --- /dev/null +++ b/cardinal_pythonlib/tests/__init__.py @@ -0,0 +1,27 @@ +# cardinal_pythonlib/tests/__init__.py + +""" +=============================================================================== + + Original code copyright (C) 2009-2022 Rudolf Cardinal (rudolf@pobox.com). + + This file is part of cardinal_pythonlib. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +=============================================================================== + +The mere existence of this file makes Python treat the directory as a +package. 
+ +""" From 6e9f343dd82206024377a574c56a5b8eaf1ffa19 Mon Sep 17 00:00:00 2001 From: Martin Burchell Date: Tue, 29 Apr 2025 14:17:38 +0100 Subject: [PATCH 02/39] Test document_to_text exceptions --- .../tests/extract_text_tests.py | 60 +++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 cardinal_pythonlib/tests/extract_text_tests.py diff --git a/cardinal_pythonlib/tests/extract_text_tests.py b/cardinal_pythonlib/tests/extract_text_tests.py new file mode 100644 index 0000000..4576e0f --- /dev/null +++ b/cardinal_pythonlib/tests/extract_text_tests.py @@ -0,0 +1,60 @@ +# cardinal_pythonlib/tests/extract_text_tests.py + +""" +=============================================================================== + + Original code copyright (C) 2009-2022 Rudolf Cardinal (rudolf@pobox.com). + + This file is part of cardinal_pythonlib. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+ +=============================================================================== + +**Text extraction tests.** + +""" + +import os +from tempfile import TemporaryDirectory +from unittest import TestCase + +from cardinal_pythonlib.extract_text import document_to_text + + +class DocumentToTextTests(TestCase): + def test_raises_when_no_filename_or_blob(self) -> None: + with self.assertRaises(ValueError) as cm: + document_to_text() + + self.assertIn("no filename and no blob", str(cm.exception)) + + def test_raises_when_filename_and_blob(self) -> None: + with self.assertRaises(ValueError) as cm: + document_to_text(filename="foo", blob="bar") + + self.assertIn("specify either filename or blob", str(cm.exception)) + + def test_raises_when_blob_but_no_extension(self) -> None: + with self.assertRaises(ValueError) as cm: + document_to_text(blob="bar") + + self.assertIn("need extension hint for blob", str(cm.exception)) + + def test_raises_when_not_a_file(self) -> None: + with self.assertRaises(ValueError) as cm: + with TemporaryDirectory() as temp_dir_name: + filename = os.path.join(temp_dir_name, "foo") + document_to_text(filename=filename) + + self.assertIn("no such file", str(cm.exception)) From 9d78d2ccce3cdc9e100bbf3981a0faa7f635f24f Mon Sep 17 00:00:00 2001 From: Martin Burchell Date: Tue, 29 Apr 2025 14:51:47 +0100 Subject: [PATCH 03/39] Test document_to_text CSV extraction --- cardinal_pythonlib/tests/extract_text_tests.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/cardinal_pythonlib/tests/extract_text_tests.py b/cardinal_pythonlib/tests/extract_text_tests.py index 4576e0f..5a2acf8 100644 --- a/cardinal_pythonlib/tests/extract_text_tests.py +++ b/cardinal_pythonlib/tests/extract_text_tests.py @@ -26,7 +26,7 @@ """ import os -from tempfile import TemporaryDirectory +from tempfile import TemporaryDirectory, NamedTemporaryFile from unittest import TestCase from cardinal_pythonlib.extract_text import document_to_text @@ -58,3 +58,13 
@@ def test_raises_when_not_a_file(self) -> None: document_to_text(filename=filename) self.assertIn("no such file", str(cm.exception)) + + def test_csv_converted(self) -> None: + content = "one,two,three" + + with NamedTemporaryFile(suffix=".csv", delete=False) as temp_file: + temp_file.write(content.encode("utf-8")) + temp_file.close() + text = document_to_text(temp_file.name) + + self.assertEqual(text, content) From 5a8d542439ce6fccafcb92bfeb0e50f481dacb30 Mon Sep 17 00:00:00 2001 From: Martin Burchell Date: Tue, 29 Apr 2025 15:35:40 +0100 Subject: [PATCH 04/39] Test doc extraction --- .../tests/extract_text_tests.py | 47 ++++++++++++++++++- 1 file changed, 45 insertions(+), 2 deletions(-) diff --git a/cardinal_pythonlib/tests/extract_text_tests.py b/cardinal_pythonlib/tests/extract_text_tests.py index 5a2acf8..090842c 100644 --- a/cardinal_pythonlib/tests/extract_text_tests.py +++ b/cardinal_pythonlib/tests/extract_text_tests.py @@ -26,13 +26,34 @@ """ import os +import subprocess from tempfile import TemporaryDirectory, NamedTemporaryFile -from unittest import TestCase +from unittest import mock, TestCase -from cardinal_pythonlib.extract_text import document_to_text +from cardinal_pythonlib.extract_text import ( + document_to_text, + TextProcessingConfig, + update_external_tools, +) class DocumentToTextTests(TestCase): + def setUp(self) -> None: + update_external_tools( + { + "antiword": "/path/to/antiword", + } + ) + + self.config = TextProcessingConfig() + + mock_decode = mock.Mock(return_value="") + mock_stdout = mock.Mock(decode=mock_decode) + mock_communicate = mock.Mock(return_value=(mock_stdout, None)) + self.mock_popen = mock.Mock( + return_value=mock.Mock(communicate=mock_communicate) + ) + def test_raises_when_no_filename_or_blob(self) -> None: with self.assertRaises(ValueError) as cm: document_to_text() @@ -68,3 +89,25 @@ def test_csv_converted(self) -> None: text = document_to_text(temp_file.name) self.assertEqual(text, content) + + def 
test_doc_converted_with_antiword(self) -> None: + with mock.patch.multiple( + "cardinal_pythonlib.extract_text.subprocess", + Popen=self.mock_popen, + ): + with NamedTemporaryFile(suffix=".doc", delete=False) as temp_file: + temp_file.close() + document_to_text(temp_file.name) + + expected_calls = [ + mock.call( + ( + "/path/to/antiword", + "-w", + str(self.config.width), + temp_file.name, + ), + stdout=subprocess.PIPE, + ), + ] + self.mock_popen.assert_has_calls(expected_calls) From a8f8cb54d421e80ea1f05f30a9bb978c128d51d7 Mon Sep 17 00:00:00 2001 From: Martin Burchell Date: Tue, 29 Apr 2025 15:48:26 +0100 Subject: [PATCH 05/39] Test dot file extraction --- .../tests/extract_text_tests.py | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/cardinal_pythonlib/tests/extract_text_tests.py b/cardinal_pythonlib/tests/extract_text_tests.py index 090842c..437c111 100644 --- a/cardinal_pythonlib/tests/extract_text_tests.py +++ b/cardinal_pythonlib/tests/extract_text_tests.py @@ -38,6 +38,8 @@ class DocumentToTextTests(TestCase): + # For external tools we assume the tools are running correctly + # and we just check that they are invoked with the correct arguments. 
def setUp(self) -> None: update_external_tools( { @@ -47,6 +49,7 @@ def setUp(self) -> None: self.config = TextProcessingConfig() + # Some mock empty output that we don't check mock_decode = mock.Mock(return_value="") mock_stdout = mock.Mock(decode=mock_decode) mock_communicate = mock.Mock(return_value=(mock_stdout, None)) @@ -111,3 +114,25 @@ def test_doc_converted_with_antiword(self) -> None: ), ] self.mock_popen.assert_has_calls(expected_calls) + + def test_dot_converted_with_antiword(self) -> None: + with mock.patch.multiple( + "cardinal_pythonlib.extract_text.subprocess", + Popen=self.mock_popen, + ): + with NamedTemporaryFile(suffix=".dot", delete=False) as temp_file: + temp_file.close() + document_to_text(temp_file.name) + + expected_calls = [ + mock.call( + ( + "/path/to/antiword", + "-w", + str(self.config.width), + temp_file.name, + ), + stdout=subprocess.PIPE, + ), + ] + self.mock_popen.assert_has_calls(expected_calls) From 2cb286625a90324aba9f7edc1f9051416c3fb82e Mon Sep 17 00:00:00 2001 From: Martin Burchell Date: Tue, 29 Apr 2025 16:37:21 +0100 Subject: [PATCH 06/39] Update docs --- docs/source/autodoc/_index.rst | 1 + .../autodoc/tests/extract_text_tests.py.rst | 25 +++++++++++++++++++ 2 files changed, 26 insertions(+) create mode 100644 docs/source/autodoc/tests/extract_text_tests.py.rst diff --git a/docs/source/autodoc/_index.rst b/docs/source/autodoc/_index.rst index 4adf8f6..3a7fb7e 100644 --- a/docs/source/autodoc/_index.rst +++ b/docs/source/autodoc/_index.rst @@ -178,6 +178,7 @@ Automatic documentation of source code tee.py.rst tests/datetimefunc_tests.py.rst tests/dogpile_cache_tests.py.rst + tests/extract_text_tests.py.rst tests/interval_tests.py.rst tests/lists_tests.py.rst tests/pdf_tests.py.rst diff --git a/docs/source/autodoc/tests/extract_text_tests.py.rst b/docs/source/autodoc/tests/extract_text_tests.py.rst new file mode 100644 index 0000000..b1a6abb --- /dev/null +++ b/docs/source/autodoc/tests/extract_text_tests.py.rst @@ -0,0 +1,25 
@@ +.. docs/source/autodoc/tests/extract_text_tests.py.rst + +.. THIS FILE IS AUTOMATICALLY GENERATED. DO NOT EDIT. + + +.. Copyright (C) 2009-2020 Rudolf Cardinal (rudolf@pobox.com). + . + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + . + https://www.apache.org/licenses/LICENSE-2.0 + . + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + +cardinal_pythonlib.tests.extract_text_tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. automodule:: cardinal_pythonlib.tests.extract_text_tests + :members: From 699645d199823b7c81ed1b15eb6df88cde6dfaa2 Mon Sep 17 00:00:00 2001 From: Martin Burchell Date: Wed, 30 Apr 2025 08:02:35 +0100 Subject: [PATCH 07/39] Test DOCX conversion --- .../tests/extract_text_tests.py | 23 ++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/cardinal_pythonlib/tests/extract_text_tests.py b/cardinal_pythonlib/tests/extract_text_tests.py index 437c111..11a2c9a 100644 --- a/cardinal_pythonlib/tests/extract_text_tests.py +++ b/cardinal_pythonlib/tests/extract_text_tests.py @@ -30,6 +30,9 @@ from tempfile import TemporaryDirectory, NamedTemporaryFile from unittest import mock, TestCase +from faker import Faker +from faker_file.providers.docx_file import DocxFileProvider + from cardinal_pythonlib.extract_text import ( document_to_text, TextProcessingConfig, @@ -57,6 +60,9 @@ def setUp(self) -> None: return_value=mock.Mock(communicate=mock_communicate) ) + self.fake = Faker() + self.fake.add_provider(DocxFileProvider) + def test_raises_when_no_filename_or_blob(self) -> None: with self.assertRaises(ValueError) as cm: 
document_to_text() @@ -89,7 +95,7 @@ def test_csv_converted(self) -> None: with NamedTemporaryFile(suffix=".csv", delete=False) as temp_file: temp_file.write(content.encode("utf-8")) temp_file.close() - text = document_to_text(temp_file.name) + text = document_to_text(filename=temp_file.name) self.assertEqual(text, content) @@ -100,7 +106,7 @@ def test_doc_converted_with_antiword(self) -> None: ): with NamedTemporaryFile(suffix=".doc", delete=False) as temp_file: temp_file.close() - document_to_text(temp_file.name) + document_to_text(filename=temp_file.name, config=self.config) expected_calls = [ mock.call( @@ -122,7 +128,7 @@ def test_dot_converted_with_antiword(self) -> None: ): with NamedTemporaryFile(suffix=".dot", delete=False) as temp_file: temp_file.close() - document_to_text(temp_file.name) + document_to_text(filename=temp_file.name) expected_calls = [ mock.call( @@ -136,3 +142,14 @@ def test_dot_converted_with_antiword(self) -> None: ), ] self.mock_popen.assert_has_calls(expected_calls) + + def test_docx_converted(self) -> None: + content = self.fake.paragraph(nb_sentences=10) + + docx = self.fake.docx_file(content=content) + self.config.width = 0 + text = document_to_text( + docx.data["filename"], extension="docx", config=self.config + ) + + self.assertEqual(text.strip(), content) From 78873eb9933c2cadb45b2e1709b8f991e60d7ca8 Mon Sep 17 00:00:00 2001 From: Martin Burchell Date: Wed, 30 Apr 2025 08:13:19 +0100 Subject: [PATCH 08/39] Test HTML conversion --- .../tests/extract_text_tests.py | 21 +++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/cardinal_pythonlib/tests/extract_text_tests.py b/cardinal_pythonlib/tests/extract_text_tests.py index 11a2c9a..6dfcb55 100644 --- a/cardinal_pythonlib/tests/extract_text_tests.py +++ b/cardinal_pythonlib/tests/extract_text_tests.py @@ -148,8 +148,25 @@ def test_docx_converted(self) -> None: docx = self.fake.docx_file(content=content) self.config.width = 0 + text = 
document_to_text(docx.data["filename"], config=self.config) + + self.assertEqual(text.strip(), content) + + def test_htm_converted(self) -> None: + content = self.fake.paragraph(nb_sentences=10) + + html = f""" + + + + + +{content} + + +""" + text = document_to_text( - docx.data["filename"], extension="docx", config=self.config + blob=html.encode("utf-8"), extension="htm", config=self.config ) - self.assertEqual(text.strip(), content) From 9219cab1bd7ba72fea6c10e46b5c5c5d67e6631d Mon Sep 17 00:00:00 2001 From: Martin Burchell Date: Wed, 30 Apr 2025 08:26:31 +0100 Subject: [PATCH 09/39] Test log file conversion --- cardinal_pythonlib/tests/extract_text_tests.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/cardinal_pythonlib/tests/extract_text_tests.py b/cardinal_pythonlib/tests/extract_text_tests.py index 6dfcb55..5b51964 100644 --- a/cardinal_pythonlib/tests/extract_text_tests.py +++ b/cardinal_pythonlib/tests/extract_text_tests.py @@ -170,3 +170,15 @@ def test_htm_converted(self) -> None: blob=html.encode("utf-8"), extension="htm", config=self.config ) self.assertEqual(text.strip(), content) + + def test_log_converted(self) -> None: + content = """ +2025-04-02 06:05:43,772 INFO Starting unattended upgrades script +2025-04-02 06:05:43,772 INFO Allowed origins are: o=Ubuntu,a=focal, o=Ubuntu,a=focal-security, o=UbuntuESMApps,a=focal-apps-security, o=UbuntuESM,a=focal-infra-security +""" # noqa: E501 + + text = document_to_text( + blob=content.encode("utf-8"), extension="log", config=self.config + ) + + self.assertEqual(text.strip(), content.strip()) From 04b0c3739fe5f4539bece63d701de81c2af95f3a Mon Sep 17 00:00:00 2001 From: Martin Burchell Date: Wed, 30 Apr 2025 08:32:19 +0100 Subject: [PATCH 10/39] Test ODT file conversion --- cardinal_pythonlib/tests/extract_text_tests.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/cardinal_pythonlib/tests/extract_text_tests.py b/cardinal_pythonlib/tests/extract_text_tests.py index 
5b51964..c106d19 100644 --- a/cardinal_pythonlib/tests/extract_text_tests.py +++ b/cardinal_pythonlib/tests/extract_text_tests.py @@ -32,6 +32,7 @@ from faker import Faker from faker_file.providers.docx_file import DocxFileProvider +from faker_file.providers.odt_file import OdtFileProvider from cardinal_pythonlib.extract_text import ( document_to_text, @@ -62,6 +63,7 @@ def setUp(self) -> None: self.fake = Faker() self.fake.add_provider(DocxFileProvider) + self.fake.add_provider(OdtFileProvider) def test_raises_when_no_filename_or_blob(self) -> None: with self.assertRaises(ValueError) as cm: @@ -182,3 +184,12 @@ def test_log_converted(self) -> None: ) self.assertEqual(text.strip(), content.strip()) + + def test_odt_converted(self) -> None: + content = self.fake.paragraph(nb_sentences=10) + + odt = self.fake.odt_file(content=content) + self.config.width = 0 + text = document_to_text(odt.data["filename"], config=self.config) + + self.assertEqual(text.strip(), content) From 21e2b81c03aa8927d996af0f092cab5085cab6cf Mon Sep 17 00:00:00 2001 From: Martin Burchell Date: Wed, 30 Apr 2025 08:54:15 +0100 Subject: [PATCH 11/39] Test PDF file conversion --- .../tests/extract_text_tests.py | 65 +++++++++++++++---- 1 file changed, 52 insertions(+), 13 deletions(-) diff --git a/cardinal_pythonlib/tests/extract_text_tests.py b/cardinal_pythonlib/tests/extract_text_tests.py index c106d19..5dfaf8d 100644 --- a/cardinal_pythonlib/tests/extract_text_tests.py +++ b/cardinal_pythonlib/tests/extract_text_tests.py @@ -27,12 +27,13 @@ import os import subprocess -from tempfile import TemporaryDirectory, NamedTemporaryFile +from tempfile import mkdtemp, NamedTemporaryFile from unittest import mock, TestCase from faker import Faker from faker_file.providers.docx_file import DocxFileProvider from faker_file.providers.odt_file import OdtFileProvider +from faker_file.providers.pdf_file import PdfFileProvider from cardinal_pythonlib.extract_text import ( document_to_text, @@ -42,17 +43,15 @@ 
class DocumentToTextTests(TestCase): - # For external tools we assume the tools are running correctly - # and we just check that they are invoked with the correct arguments. def setUp(self) -> None: - update_external_tools( - { - "antiword": "/path/to/antiword", - } - ) + self.empty_dir = mkdtemp() + self._replace_external_tools_with_fakes() self.config = TextProcessingConfig() + self._create_mock_objects() + self._register_faker_providers() + def _create_mock_objects(self) -> None: # Some mock empty output that we don't check mock_decode = mock.Mock(return_value="") mock_stdout = mock.Mock(decode=mock_decode) @@ -61,9 +60,29 @@ def setUp(self) -> None: return_value=mock.Mock(communicate=mock_communicate) ) + def _register_faker_providers(self) -> None: self.fake = Faker() self.fake.add_provider(DocxFileProvider) self.fake.add_provider(OdtFileProvider) + self.fake.add_provider(PdfFileProvider) + + def _replace_external_tools_with_fakes(self) -> None: + # For external tools we assume the tools are running correctly + # and we just check that they are invoked with the correct arguments. 
+ + tool_names = [ + "antiword", + "pdftotext", + "strings", + "strings2", + "unrtf", + ] + + tools_dir = {t: os.path.join(self.empty_dir, t) for t in tool_names} + update_external_tools(tools_dir) + + def tearDown(self) -> None: + os.rmdir(self.empty_dir) def test_raises_when_no_filename_or_blob(self) -> None: with self.assertRaises(ValueError) as cm: @@ -85,9 +104,8 @@ def test_raises_when_blob_but_no_extension(self) -> None: def test_raises_when_not_a_file(self) -> None: with self.assertRaises(ValueError) as cm: - with TemporaryDirectory() as temp_dir_name: - filename = os.path.join(temp_dir_name, "foo") - document_to_text(filename=filename) + filename = os.path.join(self.empty_dir, "foo") + document_to_text(filename=filename) self.assertIn("no such file", str(cm.exception)) @@ -113,7 +131,7 @@ def test_doc_converted_with_antiword(self) -> None: expected_calls = [ mock.call( ( - "/path/to/antiword", + f"{self.empty_dir}/antiword", "-w", str(self.config.width), temp_file.name, @@ -135,7 +153,7 @@ def test_dot_converted_with_antiword(self) -> None: expected_calls = [ mock.call( ( - "/path/to/antiword", + f"{self.empty_dir}/antiword", "-w", str(self.config.width), temp_file.name, @@ -193,3 +211,24 @@ def test_odt_converted(self) -> None: text = document_to_text(odt.data["filename"], config=self.config) self.assertEqual(text.strip(), content) + + def test_pdf_converted(self) -> None: + with mock.patch.multiple( + "cardinal_pythonlib.extract_text.subprocess", + Popen=self.mock_popen, + ): + with NamedTemporaryFile(suffix=".pdf", delete=False) as temp_file: + temp_file.close() + document_to_text(filename=temp_file.name, config=self.config) + + expected_calls = [ + mock.call( + ( + f"{self.empty_dir}/pdftotext", + temp_file.name, + "-", + ), + stdout=subprocess.PIPE, + ), + ] + self.mock_popen.assert_has_calls(expected_calls) From 82737b17df937cf89358eb71a0c580edf8a933c8 Mon Sep 17 00:00:00 2001 From: Martin Burchell Date: Wed, 30 Apr 2025 09:41:57 +0100 Subject: 
[PATCH 12/39] Test RTF file conversion --- .../tests/extract_text_tests.py | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/cardinal_pythonlib/tests/extract_text_tests.py b/cardinal_pythonlib/tests/extract_text_tests.py index 5dfaf8d..a5dd724 100644 --- a/cardinal_pythonlib/tests/extract_text_tests.py +++ b/cardinal_pythonlib/tests/extract_text_tests.py @@ -232,3 +232,33 @@ def test_pdf_converted(self) -> None: ), ] self.mock_popen.assert_has_calls(expected_calls) + + def test_rtf_converted(self) -> None: + with mock.patch( + "cardinal_pythonlib.extract_text.UNRTF_SUPPORTS_QUIET", True + ): + with mock.patch.multiple( + "cardinal_pythonlib.extract_text.subprocess", + Popen=self.mock_popen, + ): + with NamedTemporaryFile( + suffix=".rtf", delete=False + ) as temp_file: + temp_file.close() + document_to_text( + filename=temp_file.name, config=self.config + ) + + expected_calls = [ + mock.call( + ( + f"{self.empty_dir}/unrtf", + "--text", + "--nopict", + "--quiet", + temp_file.name, + ), + stdout=subprocess.PIPE, + ), + ] + self.mock_popen.assert_has_calls(expected_calls) From 1427e823c8799cf47103b917c9dfb313e07e13ce Mon Sep 17 00:00:00 2001 From: Martin Burchell Date: Wed, 30 Apr 2025 14:45:59 +0100 Subject: [PATCH 13/39] Install Faker when building docs and running tests --- .github/scripts/install_test_python_packages.sh | 1 + docs/docs_requirements.txt | 2 ++ 2 files changed, 3 insertions(+) diff --git a/.github/scripts/install_test_python_packages.sh b/.github/scripts/install_test_python_packages.sh index 12eb372..ef6fbf9 100755 --- a/.github/scripts/install_test_python_packages.sh +++ b/.github/scripts/install_test_python_packages.sh @@ -10,3 +10,4 @@ ${PYTHON} -m pip install xlrd ${PYTHON} -m pip install dogpile.cache==0.9.2 # Later versions incompatible ${PYTHON} -m pip install pytest ${PYTHON} -m pip install xhtml2pdf weasyprint pdfkit # For PDF tests +${PYTHON} -m pip install faker==13.3.1 faker-file'[common]'==0.17.13 diff --git 
a/docs/docs_requirements.txt b/docs/docs_requirements.txt index 9ffc9b2..f489b7f 100644 --- a/docs/docs_requirements.txt +++ b/docs/docs_requirements.txt @@ -6,6 +6,8 @@ deform dogpile.cache==0.9.2 # CRATE is on 4.2 Django>=4.2,<5.0 +faker==13.3.1 +faker-file[common]==0.17.13 libChEBIpy pdfkit pyramid==1.10.8 From d1f89770bc20414ebfcc966908ab42370c2234ae Mon Sep 17 00:00:00 2001 From: Martin Burchell Date: Wed, 30 Apr 2025 15:13:49 +0100 Subject: [PATCH 14/39] Test TXT file conversion --- cardinal_pythonlib/tests/extract_text_tests.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/cardinal_pythonlib/tests/extract_text_tests.py b/cardinal_pythonlib/tests/extract_text_tests.py index a5dd724..5f467aa 100644 --- a/cardinal_pythonlib/tests/extract_text_tests.py +++ b/cardinal_pythonlib/tests/extract_text_tests.py @@ -34,6 +34,7 @@ from faker_file.providers.docx_file import DocxFileProvider from faker_file.providers.odt_file import OdtFileProvider from faker_file.providers.pdf_file import PdfFileProvider +from faker_file.providers.txt_file import TxtFileProvider from cardinal_pythonlib.extract_text import ( document_to_text, @@ -65,6 +66,7 @@ def _register_faker_providers(self) -> None: self.fake.add_provider(DocxFileProvider) self.fake.add_provider(OdtFileProvider) self.fake.add_provider(PdfFileProvider) + self.fake.add_provider(TxtFileProvider) def _replace_external_tools_with_fakes(self) -> None: # For external tools we assume the tools are running correctly @@ -168,7 +170,9 @@ def test_docx_converted(self) -> None: docx = self.fake.docx_file(content=content) self.config.width = 0 - text = document_to_text(docx.data["filename"], config=self.config) + text = document_to_text( + filename=docx.data["filename"], config=self.config + ) self.assertEqual(text.strip(), content) @@ -208,7 +212,9 @@ def test_odt_converted(self) -> None: odt = self.fake.odt_file(content=content) self.config.width = 0 - text = 
document_to_text(odt.data["filename"], config=self.config) + text = document_to_text( + filename=odt.data["filename"], config=self.config + ) self.assertEqual(text.strip(), content) @@ -262,3 +268,10 @@ def test_rtf_converted(self) -> None: ), ] self.mock_popen.assert_has_calls(expected_calls) + + def test_txt_converted(self) -> None: + content = self.fake.paragraph(nb_sentences=10) + txt_file = self.fake.txt_file(content=content) + text = document_to_text(filename=txt_file.data["filename"]) + + self.assertEqual(text.strip(), content) From 10b1ac011f6c8d6cca7100026f1cd02e215d449e Mon Sep 17 00:00:00 2001 From: Martin Burchell Date: Wed, 30 Apr 2025 15:35:43 +0100 Subject: [PATCH 15/39] Test XML and anything else converted to text --- .../tests/extract_text_tests.py | 39 ++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/cardinal_pythonlib/tests/extract_text_tests.py b/cardinal_pythonlib/tests/extract_text_tests.py index 5f467aa..b715ba3 100644 --- a/cardinal_pythonlib/tests/extract_text_tests.py +++ b/cardinal_pythonlib/tests/extract_text_tests.py @@ -35,6 +35,7 @@ from faker_file.providers.odt_file import OdtFileProvider from faker_file.providers.pdf_file import PdfFileProvider from faker_file.providers.txt_file import TxtFileProvider +from faker_file.providers.xml_file import XmlFileProvider from cardinal_pythonlib.extract_text import ( document_to_text, @@ -62,11 +63,12 @@ def _create_mock_objects(self) -> None: ) def _register_faker_providers(self) -> None: - self.fake = Faker() + self.fake = Faker("en-GB") self.fake.add_provider(DocxFileProvider) self.fake.add_provider(OdtFileProvider) self.fake.add_provider(PdfFileProvider) self.fake.add_provider(TxtFileProvider) + self.fake.add_provider(XmlFileProvider) def _replace_external_tools_with_fakes(self) -> None: # For external tools we assume the tools are running correctly @@ -275,3 +277,38 @@ def test_txt_converted(self) -> None: text = 
document_to_text(filename=txt_file.data["filename"]) self.assertEqual(text.strip(), content) + + def test_xml_converted(self) -> None: + name = self.fake.name() + address = self.fake.address() + + xml_file = self.fake.xml_file( + num_rows=1, + data_columns={ + "name": name, + "address": address, + }, + ) + text = document_to_text(filename=xml_file.data["filename"]) + + self.assertEqual(text.strip(), f"{name}{address}") + + def test_unsupported_converted(self) -> None: + with mock.patch.multiple( + "cardinal_pythonlib.extract_text.subprocess", + Popen=self.mock_popen, + ): + with NamedTemporaryFile(suffix=".exe", delete=False) as temp_file: + temp_file.close() + document_to_text(filename=temp_file.name, config=self.config) + + expected_calls = [ + mock.call( + ( + f"{self.empty_dir}/strings", + temp_file.name, + ), + stdout=subprocess.PIPE, + ), + ] + self.mock_popen.assert_has_calls(expected_calls) From 37d3257fb667c9d32066aeb741533972e202501c Mon Sep 17 00:00:00 2001 From: Martin Burchell Date: Tue, 6 May 2025 10:23:53 +0100 Subject: [PATCH 16/39] Fix name clashes with python built-ins This is preventing us from doing this in extract_text.py: from email.parser import Parser as EmailParser Because email already exists as a module --- .pre-commit-config.yaml | 4 ++++ cardinal_pythonlib/django/fields/jsonclassfield.py | 2 +- cardinal_pythonlib/django/function_cache.py | 2 +- cardinal_pythonlib/django/middleware.py | 10 +++++----- cardinal_pythonlib/{email => email_utils}/__init__.py | 0 .../{email => email_utils}/mailboxpurge.py | 0 cardinal_pythonlib/{email => email_utils}/sendmail.py | 0 .../{email => email_utils}/tests/sendmail_tests.py | 0 cardinal_pythonlib/{json => json_utils}/__init__.py | 2 +- cardinal_pythonlib/{json => json_utils}/serialize.py | 2 +- .../{json => json_utils}/typing_helpers.py | 2 +- cardinal_pythonlib/{profile.py => profiling.py} | 0 .../tools/explore_clang_format_config.py | 6 +++++- docs/source/conf.py | 2 +- setup.cfg | 2 +- 15 files 
changed, 21 insertions(+), 13 deletions(-) rename cardinal_pythonlib/{email => email_utils}/__init__.py (100%) rename cardinal_pythonlib/{email => email_utils}/mailboxpurge.py (100%) rename cardinal_pythonlib/{email => email_utils}/sendmail.py (100%) rename cardinal_pythonlib/{email => email_utils}/tests/sendmail_tests.py (100%) rename cardinal_pythonlib/{json => json_utils}/__init__.py (95%) rename cardinal_pythonlib/{json => json_utils}/serialize.py (99%) rename cardinal_pythonlib/{json => json_utils}/typing_helpers.py (96%) rename cardinal_pythonlib/{profile.py => profiling.py} (100%) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2c39bb5..1af96a3 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -18,10 +18,14 @@ repos: rev: 5.0.4 hooks: - id: flake8 + additional_dependencies: + - flake8-builtins==2.5.0 - repo: https://github.com/asottile/yesqa rev: v1.5.0 hooks: - id: yesqa + additional_dependencies: + - flake8-builtins==2.5.0 - repo: https://github.com/pre-commit/pygrep-hooks rev: v1.9.0 hooks: diff --git a/cardinal_pythonlib/django/fields/jsonclassfield.py b/cardinal_pythonlib/django/fields/jsonclassfield.py index 43fbaf5..2240707 100644 --- a/cardinal_pythonlib/django/fields/jsonclassfield.py +++ b/cardinal_pythonlib/django/fields/jsonclassfield.py @@ -130,7 +130,7 @@ def my_decoder_hook(d: Dict) -> Any: # noinspection PyUnresolvedReferences from django.db.models import TextField -from cardinal_pythonlib.json.serialize import json_decode, json_encode +from cardinal_pythonlib.json_utils.serialize import json_decode, json_encode # ============================================================================= diff --git a/cardinal_pythonlib/django/function_cache.py b/cardinal_pythonlib/django/function_cache.py index 11127b1..a2416ab 100644 --- a/cardinal_pythonlib/django/function_cache.py +++ b/cardinal_pythonlib/django/function_cache.py @@ -36,7 +36,7 @@ from django.core.cache import cache # default cache from 
cardinal_pythonlib.logs import get_brace_style_log_with_null_handler -from cardinal_pythonlib.json.serialize import json_encode +from cardinal_pythonlib.json_utils.serialize import json_encode log = get_brace_style_log_with_null_handler(__name__) diff --git a/cardinal_pythonlib/django/middleware.py b/cardinal_pythonlib/django/middleware.py index b201a08..3d614a7 100644 --- a/cardinal_pythonlib/django/middleware.py +++ b/cardinal_pythonlib/django/middleware.py @@ -28,7 +28,7 @@ import logging import os -from re import compile +import re import sys from typing import Optional @@ -107,9 +107,9 @@ def process_exception( Modified according to: https://djangosnippets.org/snippets/2845/ """ -# EXEMPT_URLS = [compile(settings.LOGIN_URL.lstrip('/'))] +# EXEMPT_URLS = [re.compile(settings.LOGIN_URL.lstrip('/'))] # if hasattr(settings, 'LOGIN_EXEMPT_URLS'): -# EXEMPT_URLS += [compile(expr) for expr in settings.LOGIN_EXEMPT_URLS] +# EXEMPT_URLS += [re.compile(expr) for expr in settings.LOGIN_EXEMPT_URLS] # # # class LoginRequiredMiddleware: @@ -166,10 +166,10 @@ def process_exception( # 3. RNC; composite of those patterns. 
# ----------------------------------------------------------------------------- -EXEMPT_URLS = [compile(settings.LOGIN_URL.lstrip("/"))] +EXEMPT_URLS = [re.compile(settings.LOGIN_URL.lstrip("/"))] if hasattr(settings, "LOGIN_EXEMPT_URLS"): EXEMPT_URLS += [ - compile(expr.lstrip("/")) for expr in settings.LOGIN_EXEMPT_URLS + re.compile(expr.lstrip("/")) for expr in settings.LOGIN_EXEMPT_URLS ] diff --git a/cardinal_pythonlib/email/__init__.py b/cardinal_pythonlib/email_utils/__init__.py similarity index 100% rename from cardinal_pythonlib/email/__init__.py rename to cardinal_pythonlib/email_utils/__init__.py diff --git a/cardinal_pythonlib/email/mailboxpurge.py b/cardinal_pythonlib/email_utils/mailboxpurge.py similarity index 100% rename from cardinal_pythonlib/email/mailboxpurge.py rename to cardinal_pythonlib/email_utils/mailboxpurge.py diff --git a/cardinal_pythonlib/email/sendmail.py b/cardinal_pythonlib/email_utils/sendmail.py similarity index 100% rename from cardinal_pythonlib/email/sendmail.py rename to cardinal_pythonlib/email_utils/sendmail.py diff --git a/cardinal_pythonlib/email/tests/sendmail_tests.py b/cardinal_pythonlib/email_utils/tests/sendmail_tests.py similarity index 100% rename from cardinal_pythonlib/email/tests/sendmail_tests.py rename to cardinal_pythonlib/email_utils/tests/sendmail_tests.py diff --git a/cardinal_pythonlib/json/__init__.py b/cardinal_pythonlib/json_utils/__init__.py similarity index 95% rename from cardinal_pythonlib/json/__init__.py rename to cardinal_pythonlib/json_utils/__init__.py index 3256199..51cc3bd 100644 --- a/cardinal_pythonlib/json/__init__.py +++ b/cardinal_pythonlib/json_utils/__init__.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -# cardinal_pythonlib/json/__init__.py +# cardinal_pythonlib/json_utils/__init__.py """ =============================================================================== diff --git a/cardinal_pythonlib/json/serialize.py b/cardinal_pythonlib/json_utils/serialize.py similarity index 99% rename 
from cardinal_pythonlib/json/serialize.py rename to cardinal_pythonlib/json_utils/serialize.py index 3103a6e..eb3a434 100644 --- a/cardinal_pythonlib/json/serialize.py +++ b/cardinal_pythonlib/json_utils/serialize.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -# cardinal_pythonlib/json/serialize.py +# cardinal_pythonlib/json_utils/serialize.py """ =============================================================================== diff --git a/cardinal_pythonlib/json/typing_helpers.py b/cardinal_pythonlib/json_utils/typing_helpers.py similarity index 96% rename from cardinal_pythonlib/json/typing_helpers.py rename to cardinal_pythonlib/json_utils/typing_helpers.py index d5c6c18..47c7161 100644 --- a/cardinal_pythonlib/json/typing_helpers.py +++ b/cardinal_pythonlib/json_utils/typing_helpers.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -# cardinal_pythonlib/json/typing_helpers.py +# cardinal_pythonlib/json_utils/typing_helpers.py """ =============================================================================== diff --git a/cardinal_pythonlib/profile.py b/cardinal_pythonlib/profiling.py similarity index 100% rename from cardinal_pythonlib/profile.py rename to cardinal_pythonlib/profiling.py diff --git a/cardinal_pythonlib/tools/explore_clang_format_config.py b/cardinal_pythonlib/tools/explore_clang_format_config.py index 94cb8b0..c6de7cc 100644 --- a/cardinal_pythonlib/tools/explore_clang_format_config.py +++ b/cardinal_pythonlib/tools/explore_clang_format_config.py @@ -60,7 +60,11 @@ def monitor_diff(filenames: List[str], meld_exe: str) -> subprocess.Popen: def clang_format( - config: str, src: str, dest: str, dir: str, clang_format_exe: str + config: str, + src: str, + dest: str, + dir: str, # noqa: A002 + clang_format_exe: str, ) -> None: """ Rungs clang-format, formatting a source file to a destination file using a diff --git a/docs/source/conf.py b/docs/source/conf.py index 9254629..f45f913 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -32,7 +32,7 @@ project 
= "cardinal_pythonlib" # noinspection PyShadowingBuiltins -copyright = "2009-2020, Rudolf Cardinal" +copyright = "2009-2020, Rudolf Cardinal" # noqa: A001 author = "Rudolf Cardinal" # The short X.Y version diff --git a/setup.cfg b/setup.cfg index 7985ef3..0652958 100644 --- a/setup.cfg +++ b/setup.cfg @@ -13,7 +13,7 @@ linters=pycodestyle,pyflakes max-line-length=79 # Not compatible with Black and not PEP8 apparently # E203: Whitespace before ':' -extend-ignore = E203 +extend-ignore = A003,E203 [mypy] # MyPy is a static type checker. It will not execute the code! From 4ba8610400e8d53093d212f7caf766ae13f9fdd0 Mon Sep 17 00:00:00 2001 From: Martin Burchell Date: Tue, 6 May 2025 10:45:35 +0100 Subject: [PATCH 17/39] Ignore shadowing of python built-ins --- cardinal_pythonlib/sizeformatter.py | 2 +- cardinal_pythonlib/tools/convert_mdb_to_mysql.py | 4 ++-- setup.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cardinal_pythonlib/sizeformatter.py b/cardinal_pythonlib/sizeformatter.py index 10169fe..69345e2 100644 --- a/cardinal_pythonlib/sizeformatter.py +++ b/cardinal_pythonlib/sizeformatter.py @@ -69,7 +69,7 @@ def sizeof_fmt(num: float, suffix: str = "B") -> str: def bytes2human( n: Union[int, float], - format: str = "%(value).1f %(symbol)s", + format: str = "%(value).1f %(symbol)s", # noqa: A002 symbols: str = "customary", ) -> str: """ diff --git a/cardinal_pythonlib/tools/convert_mdb_to_mysql.py b/cardinal_pythonlib/tools/convert_mdb_to_mysql.py index 8f2b397..da85905 100644 --- a/cardinal_pythonlib/tools/convert_mdb_to_mysql.py +++ b/cardinal_pythonlib/tools/convert_mdb_to_mysql.py @@ -156,9 +156,9 @@ def __init__( nargs: Union[int, str] = "?", # 0 or 1 default: Any = None, required: bool = False, - type: Callable[[str], Any] = None, + type: Callable[[str], Any] = None, # noqa: A002 metavar: str = None, - help: str = None, + help: str = None, # noqa: A002 ) -> None: super(PasswordPromptAction, self).__init__( option_strings=option_strings, 
diff --git a/setup.py b/setup.py index 26489c1..2a3e6d4 100644 --- a/setup.py +++ b/setup.py @@ -17,7 +17,7 @@ """ from setuptools import setup, find_packages -from codecs import open +from codecs import open # noqa: A004 from os import path from cardinal_pythonlib.version_string import VERSION_STRING From b0520cb813c651e82ee43c23a34054abdb9430ae Mon Sep 17 00:00:00 2001 From: Martin Burchell Date: Tue, 6 May 2025 10:47:53 +0100 Subject: [PATCH 18/39] Remove check for conflicting email import --- .../ensure_test_executed_correctly.py | 40 ------------------- cardinal_pythonlib/module_version.py | 3 -- 2 files changed, 43 deletions(-) delete mode 100644 cardinal_pythonlib/ensure_test_executed_correctly.py diff --git a/cardinal_pythonlib/ensure_test_executed_correctly.py b/cardinal_pythonlib/ensure_test_executed_correctly.py deleted file mode 100644 index 6ae15f4..0000000 --- a/cardinal_pythonlib/ensure_test_executed_correctly.py +++ /dev/null @@ -1,40 +0,0 @@ -#!/usr/bin/env python -# cardinal_pythonlib/module_version.py - -""" -=============================================================================== - - Original code copyright (C) 2009-2022 Rudolf Cardinal (rudolf@pobox.com). - - This file is part of cardinal_pythonlib. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - https://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
- -=============================================================================== - -**Ensure that a library module is executed properly, and not via a way that -breaks imports.** - -""" - -try: - # we want the stdlib email package! - from email import message_from_string # noqa: F401 -except ImportError: - raise ImportError( - "A test of importing 'email' has found " - "cardinal_pythonlib/email/__init__.py, not the email package from " - "stdlib. You are probably running a cardinal_pythonlib file directly, " - "e.g. with 'python somefile.py' or '/path/somefile.py'. Instead, use " - "'python -m cardinal_pythonlib.somefile'." - ) diff --git a/cardinal_pythonlib/module_version.py b/cardinal_pythonlib/module_version.py index f18f9a0..cb251fe 100644 --- a/cardinal_pythonlib/module_version.py +++ b/cardinal_pythonlib/module_version.py @@ -39,9 +39,6 @@ from semantic_version import Version -# noinspection PyUnresolvedReferences -import cardinal_pythonlib.ensure_test_executed_correctly # noqa: F401 - # ============================================================================= # Report Python module versions From d1b00b034e70289e360c467201da7a470233d576 Mon Sep 17 00:00:00 2001 From: Martin Burchell Date: Tue, 6 May 2025 10:55:42 +0100 Subject: [PATCH 19/39] Update docs --- docs/source/autodoc/_index.rst | 13 +++++----- .../mailboxpurge.py.rst | 8 +++--- .../{email => email_utils}/sendmail.py.rst | 8 +++--- .../tests/sendmail_tests.py.rst | 8 +++--- .../ensure_test_executed_correctly.py.rst | 25 ------------------- .../{json => json_utils}/serialize.py.rst | 8 +++--- .../typing_helpers.py.rst | 8 +++--- .../{profile.py.rst => profiling.py.rst} | 8 +++--- 8 files changed, 30 insertions(+), 56 deletions(-) rename docs/source/autodoc/{email => email_utils}/mailboxpurge.py.rst (77%) rename docs/source/autodoc/{email => email_utils}/sendmail.py.rst (79%) rename docs/source/autodoc/{email => email_utils}/tests/sendmail_tests.py.rst (75%) delete mode 100644 
docs/source/autodoc/ensure_test_executed_correctly.py.rst rename docs/source/autodoc/{json => json_utils}/serialize.py.rst (79%) rename docs/source/autodoc/{json => json_utils}/typing_helpers.py.rst (77%) rename docs/source/autodoc/{profile.py.rst => profiling.py.rst} (83%) diff --git a/docs/source/autodoc/_index.rst b/docs/source/autodoc/_index.rst index 3a7fb7e..d910bf4 100644 --- a/docs/source/autodoc/_index.rst +++ b/docs/source/autodoc/_index.rst @@ -66,10 +66,9 @@ Automatic documentation of source code docker.py.rst dogpile_cache.py.rst dsp.py.rst - email/mailboxpurge.py.rst - email/sendmail.py.rst - email/tests/sendmail_tests.py.rst - ensure_test_executed_correctly.py.rst + email_utils/mailboxpurge.py.rst + email_utils/sendmail.py.rst + email_utils/tests/sendmail_tests.py.rst enumlike.py.rst excel.py.rst exceptions.py.rst @@ -82,8 +81,8 @@ Automatic documentation of source code httpconst.py.rst interval.py.rst iterhelp.py.rst - json/serialize.py.rst - json/typing_helpers.py.rst + json_utils/serialize.py.rst + json_utils/typing_helpers.py.rst lang.py.rst lists.py.rst logs.py.rst @@ -104,7 +103,7 @@ Automatic documentation of source code plot.py.rst probability.py.rst process.py.rst - profile.py.rst + profiling.py.rst progress.py.rst psychiatry/drugs.py.rst psychiatry/mk_r_druglists.py.rst diff --git a/docs/source/autodoc/email/mailboxpurge.py.rst b/docs/source/autodoc/email_utils/mailboxpurge.py.rst similarity index 77% rename from docs/source/autodoc/email/mailboxpurge.py.rst rename to docs/source/autodoc/email_utils/mailboxpurge.py.rst index c49e933..a4c86da 100644 --- a/docs/source/autodoc/email/mailboxpurge.py.rst +++ b/docs/source/autodoc/email_utils/mailboxpurge.py.rst @@ -1,4 +1,4 @@ -.. docs/source/autodoc/email/mailboxpurge.py.rst +.. docs/source/autodoc/email_utils/mailboxpurge.py.rst .. THIS FILE IS AUTOMATICALLY GENERATED. DO NOT EDIT. @@ -18,8 +18,8 @@ limitations under the License. 
-cardinal_pythonlib.email.mailboxpurge -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +cardinal_pythonlib.email_utils.mailboxpurge +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. automodule:: cardinal_pythonlib.email.mailboxpurge +.. automodule:: cardinal_pythonlib.email_utils.mailboxpurge :members: diff --git a/docs/source/autodoc/email/sendmail.py.rst b/docs/source/autodoc/email_utils/sendmail.py.rst similarity index 79% rename from docs/source/autodoc/email/sendmail.py.rst rename to docs/source/autodoc/email_utils/sendmail.py.rst index 82327dc..e090e97 100644 --- a/docs/source/autodoc/email/sendmail.py.rst +++ b/docs/source/autodoc/email_utils/sendmail.py.rst @@ -1,4 +1,4 @@ -.. docs/source/autodoc/email/sendmail.py.rst +.. docs/source/autodoc/email_utils/sendmail.py.rst .. THIS FILE IS AUTOMATICALLY GENERATED. DO NOT EDIT. @@ -18,8 +18,8 @@ limitations under the License. -cardinal_pythonlib.email.sendmail -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +cardinal_pythonlib.email_utils.sendmail +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. automodule:: cardinal_pythonlib.email.sendmail +.. automodule:: cardinal_pythonlib.email_utils.sendmail :members: diff --git a/docs/source/autodoc/email/tests/sendmail_tests.py.rst b/docs/source/autodoc/email_utils/tests/sendmail_tests.py.rst similarity index 75% rename from docs/source/autodoc/email/tests/sendmail_tests.py.rst rename to docs/source/autodoc/email_utils/tests/sendmail_tests.py.rst index f209e33..22fa863 100644 --- a/docs/source/autodoc/email/tests/sendmail_tests.py.rst +++ b/docs/source/autodoc/email_utils/tests/sendmail_tests.py.rst @@ -1,4 +1,4 @@ -.. docs/source/autodoc/email/tests/sendmail_tests.py.rst +.. docs/source/autodoc/email_utils/tests/sendmail_tests.py.rst .. THIS FILE IS AUTOMATICALLY GENERATED. DO NOT EDIT. @@ -18,8 +18,8 @@ limitations under the License. 
-cardinal_pythonlib.email.tests.sendmail_tests -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +cardinal_pythonlib.email_utils.tests.sendmail_tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. automodule:: cardinal_pythonlib.email.tests.sendmail_tests +.. automodule:: cardinal_pythonlib.email_utils.tests.sendmail_tests :members: diff --git a/docs/source/autodoc/ensure_test_executed_correctly.py.rst b/docs/source/autodoc/ensure_test_executed_correctly.py.rst deleted file mode 100644 index efde8de..0000000 --- a/docs/source/autodoc/ensure_test_executed_correctly.py.rst +++ /dev/null @@ -1,25 +0,0 @@ -.. docs/source/autodoc/ensure_test_executed_correctly.py.rst - -.. THIS FILE IS AUTOMATICALLY GENERATED. DO NOT EDIT. - - -.. Copyright (C) 2009-2020 Rudolf Cardinal (rudolf@pobox.com). - . - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - . - https://www.apache.org/licenses/LICENSE-2.0 - . - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. - - -cardinal_pythonlib.ensure_test_executed_correctly -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. automodule:: cardinal_pythonlib.ensure_test_executed_correctly - :members: diff --git a/docs/source/autodoc/json/serialize.py.rst b/docs/source/autodoc/json_utils/serialize.py.rst similarity index 79% rename from docs/source/autodoc/json/serialize.py.rst rename to docs/source/autodoc/json_utils/serialize.py.rst index 1868956..15e18c3 100644 --- a/docs/source/autodoc/json/serialize.py.rst +++ b/docs/source/autodoc/json_utils/serialize.py.rst @@ -1,4 +1,4 @@ -.. docs/source/autodoc/json/serialize.py.rst +.. 
docs/source/autodoc/json_utils/serialize.py.rst .. THIS FILE IS AUTOMATICALLY GENERATED. DO NOT EDIT. @@ -18,8 +18,8 @@ limitations under the License. -cardinal_pythonlib.json.serialize -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +cardinal_pythonlib.json_utils.serialize +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. automodule:: cardinal_pythonlib.json.serialize +.. automodule:: cardinal_pythonlib.json_utils.serialize :members: diff --git a/docs/source/autodoc/json/typing_helpers.py.rst b/docs/source/autodoc/json_utils/typing_helpers.py.rst similarity index 77% rename from docs/source/autodoc/json/typing_helpers.py.rst rename to docs/source/autodoc/json_utils/typing_helpers.py.rst index d6125b1..e53154e 100644 --- a/docs/source/autodoc/json/typing_helpers.py.rst +++ b/docs/source/autodoc/json_utils/typing_helpers.py.rst @@ -1,4 +1,4 @@ -.. docs/source/autodoc/json/typing_helpers.py.rst +.. docs/source/autodoc/json_utils/typing_helpers.py.rst .. THIS FILE IS AUTOMATICALLY GENERATED. DO NOT EDIT. @@ -18,8 +18,8 @@ limitations under the License. -cardinal_pythonlib.json.typing_helpers -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +cardinal_pythonlib.json_utils.typing_helpers +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. automodule:: cardinal_pythonlib.json.typing_helpers +.. automodule:: cardinal_pythonlib.json_utils.typing_helpers :members: diff --git a/docs/source/autodoc/profile.py.rst b/docs/source/autodoc/profiling.py.rst similarity index 83% rename from docs/source/autodoc/profile.py.rst rename to docs/source/autodoc/profiling.py.rst index 6149c87..1c40074 100644 --- a/docs/source/autodoc/profile.py.rst +++ b/docs/source/autodoc/profiling.py.rst @@ -1,4 +1,4 @@ -.. docs/source/autodoc/profile.py.rst +.. docs/source/autodoc/profiling.py.rst .. THIS FILE IS AUTOMATICALLY GENERATED. DO NOT EDIT. @@ -18,8 +18,8 @@ limitations under the License. -cardinal_pythonlib.profile -~~~~~~~~~~~~~~~~~~~~~~~~~~ +cardinal_pythonlib.profiling +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. 
automodule:: cardinal_pythonlib.profile +.. automodule:: cardinal_pythonlib.profiling :members: From dc511c77cbc3e906465740c48ce8863ef6ef16c4 Mon Sep 17 00:00:00 2001 From: Martin Burchell Date: Tue, 6 May 2025 11:03:07 +0100 Subject: [PATCH 20/39] Fixups following module renaming --- cardinal_pythonlib/bulk_email/main.py | 2 +- cardinal_pythonlib/bulk_email/models.py | 2 +- cardinal_pythonlib/email_utils/__init__.py | 2 +- cardinal_pythonlib/email_utils/mailboxpurge.py | 2 +- cardinal_pythonlib/email_utils/sendmail.py | 2 +- cardinal_pythonlib/email_utils/tests/sendmail_tests.py | 4 ++-- cardinal_pythonlib/profiling.py | 2 +- setup.py | 2 +- 8 files changed, 9 insertions(+), 9 deletions(-) diff --git a/cardinal_pythonlib/bulk_email/main.py b/cardinal_pythonlib/bulk_email/main.py index 7757574..a70f19e 100644 --- a/cardinal_pythonlib/bulk_email/main.py +++ b/cardinal_pythonlib/bulk_email/main.py @@ -57,7 +57,7 @@ Recipient, SendAttempt, ) -from cardinal_pythonlib.email.sendmail import ( +from cardinal_pythonlib.email_utils.sendmail import ( CONTENT_TYPE_HTML, CONTENT_TYPE_TEXT, is_email_valid, diff --git a/cardinal_pythonlib/bulk_email/models.py b/cardinal_pythonlib/bulk_email/models.py index c6d210f..a01b76c 100644 --- a/cardinal_pythonlib/bulk_email/models.py +++ b/cardinal_pythonlib/bulk_email/models.py @@ -63,7 +63,7 @@ USERNAME_MAX_LENGTH, ) from cardinal_pythonlib.colander_utils import EMAIL_ADDRESS_MAX_LEN -from cardinal_pythonlib.email.sendmail import ( +from cardinal_pythonlib.email_utils.sendmail import ( ASCII, CONTENT_TYPE_TEXT, is_email_valid, diff --git a/cardinal_pythonlib/email_utils/__init__.py b/cardinal_pythonlib/email_utils/__init__.py index 61be984..c94078a 100644 --- a/cardinal_pythonlib/email_utils/__init__.py +++ b/cardinal_pythonlib/email_utils/__init__.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -# cardinal_pythonlib/email/__init__.py +# cardinal_pythonlib/email_utils/__init__.py """ 
=============================================================================== diff --git a/cardinal_pythonlib/email_utils/mailboxpurge.py b/cardinal_pythonlib/email_utils/mailboxpurge.py index cae5c07..1f52b51 100755 --- a/cardinal_pythonlib/email_utils/mailboxpurge.py +++ b/cardinal_pythonlib/email_utils/mailboxpurge.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -# cardinal_pythonlib/email/mailboxpurge.py +# cardinal_pythonlib/email_utils/mailboxpurge.py """ Remove all binary attachments from email messages diff --git a/cardinal_pythonlib/email_utils/sendmail.py b/cardinal_pythonlib/email_utils/sendmail.py index a286fb8..edebe34 100755 --- a/cardinal_pythonlib/email_utils/sendmail.py +++ b/cardinal_pythonlib/email_utils/sendmail.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -# cardinal_pythonlib/email/sendmail.py +# cardinal_pythonlib/email_utils/sendmail.py """ =============================================================================== diff --git a/cardinal_pythonlib/email_utils/tests/sendmail_tests.py b/cardinal_pythonlib/email_utils/tests/sendmail_tests.py index 7e3107d..3c4eb37 100644 --- a/cardinal_pythonlib/email_utils/tests/sendmail_tests.py +++ b/cardinal_pythonlib/email_utils/tests/sendmail_tests.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -# cardinal_pythonlib/email/tests/sendmail_tests.py +# cardinal_pythonlib/email_utils/tests/sendmail_tests.py """ =============================================================================== @@ -28,7 +28,7 @@ import unittest -from cardinal_pythonlib.email.sendmail import is_email_valid +from cardinal_pythonlib.email_utils.sendmail import is_email_valid class TestIsEmailValid(unittest.TestCase): diff --git a/cardinal_pythonlib/profiling.py b/cardinal_pythonlib/profiling.py index 558e13a..a06074e 100644 --- a/cardinal_pythonlib/profiling.py +++ b/cardinal_pythonlib/profiling.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -# cardinal_pythonlib/profile.py +# cardinal_pythonlib/profiling.py """ 
=============================================================================== diff --git a/setup.py b/setup.py index 2a3e6d4..553ff11 100644 --- a/setup.py +++ b/setup.py @@ -192,7 +192,7 @@ "cardinalpythonlib_chebi=cardinal_pythonlib.chebi:main", ( "cardinalpythonlib_email=" - "cardinal_pythonlib.email.sendmail:main" + "cardinal_pythonlib.email_utils.sendmail:main" ), ( "cardinalpythonlib_extract_text=" From be15403e7d5ad8273d08935bab812d8e9e9b51c8 Mon Sep 17 00:00:00 2001 From: Martin Burchell Date: Wed, 7 May 2025 06:51:47 +0100 Subject: [PATCH 21/39] extract_text.py type hints --- cardinal_pythonlib/extract_text.py | 17 +++++++++-------- setup.cfg | 3 +++ 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/cardinal_pythonlib/extract_text.py b/cardinal_pythonlib/extract_text.py index 41dbd52..96df52d 100755 --- a/cardinal_pythonlib/extract_text.py +++ b/cardinal_pythonlib/extract_text.py @@ -87,6 +87,7 @@ import sys import textwrap from typing import ( + Any, BinaryIO, Dict, Generator, @@ -205,9 +206,9 @@ def __init__( plain: bool = False, semiplain: bool = False, docx_in_order: bool = True, - horizontal_char="─", - vertical_char="│", - junction_char="┼", + horizontal_char: str = "─", + vertical_char: str = "│", + junction_char: str = "┼", plain_table_start: str = None, plain_table_end: str = None, plain_table_col_boundary: str = None, @@ -445,7 +446,7 @@ def get_file_contents_text( ) -def get_cmd_output(*args, encoding: str = SYS_ENCODING) -> str: +def get_cmd_output(*args: Any, encoding: str = SYS_ENCODING) -> str: """ Returns text output of a command. """ @@ -456,7 +457,7 @@ def get_cmd_output(*args, encoding: str = SYS_ENCODING) -> str: def get_cmd_output_from_stdin( - stdint_content_binary: bytes, *args, encoding: str = SYS_ENCODING + stdint_content_binary: bytes, *args: Any, encoding: str = SYS_ENCODING ) -> str: """ Returns text output of a command, passing binary data in via stdin. 
@@ -559,7 +560,7 @@ def availability_pdf() -> bool: ) -def docx_qn(tagroot): +def docx_qn(tagroot: str) -> str: return f"{{{DOCX_SCHEMA_URL}}}{tagroot}" @@ -624,7 +625,7 @@ def docx_gen_wordwrapped_fragments( """ to_wrap = [] # type: List[DocxFragment] - def yield_wrapped(): + def yield_wrapped() -> Generator[str, None, None]: """ Yield the word-wrapped stuff to date. """ @@ -1267,7 +1268,7 @@ def availability_anything() -> bool: # Decider # ============================================================================= -ext_map = { +ext_map: dict[str, dict[str, Any]] = { # Converter functions must be of the form: func(filename, blob, config). # Availability must be either a boolean literal or a function that takes no # params. diff --git a/setup.cfg b/setup.cfg index 0652958..b922b87 100644 --- a/setup.cfg +++ b/setup.cfg @@ -22,3 +22,6 @@ no_strict_optional = True allow_redefinition = True disallow_untyped_defs = True disallow_incomplete_defs = True + +[mypy-semantic_version.*] +ignore_missing_imports = True From c9a06cec691dbbe33dfd42367ca3623994a5c179 Mon Sep 17 00:00:00 2001 From: Martin Burchell Date: Wed, 7 May 2025 06:52:38 +0100 Subject: [PATCH 22/39] Use html.parser for BeautifulSoup --- cardinal_pythonlib/extract_text.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cardinal_pythonlib/extract_text.py b/cardinal_pythonlib/extract_text.py index 96df52d..fc2431a 100755 --- a/cardinal_pythonlib/extract_text.py +++ b/cardinal_pythonlib/extract_text.py @@ -1134,7 +1134,7 @@ def convert_html_to_text( Converts HTML to text. 
""" with get_filelikeobject(filename, blob) as fp: - soup = bs4.BeautifulSoup(fp) + soup = bs4.BeautifulSoup(fp, "html.parser") return soup.get_text() From 761e404fc1752f8e360f1d06a46d1a0745a0b7a0 Mon Sep 17 00:00:00 2001 From: Martin Burchell Date: Fri, 9 May 2025 16:53:31 +0100 Subject: [PATCH 23/39] Support .eml text extraction --- .../scripts/install_test_python_packages.sh | 2 +- cardinal_pythonlib/extract_text.py | 69 +++++++++ .../tests/extract_text_tests.py | 137 +++++++++++++++++- 3 files changed, 204 insertions(+), 4 deletions(-) diff --git a/.github/scripts/install_test_python_packages.sh b/.github/scripts/install_test_python_packages.sh index ef6fbf9..129d97e 100755 --- a/.github/scripts/install_test_python_packages.sh +++ b/.github/scripts/install_test_python_packages.sh @@ -10,4 +10,4 @@ ${PYTHON} -m pip install xlrd ${PYTHON} -m pip install dogpile.cache==0.9.2 # Later versions incompatible ${PYTHON} -m pip install pytest ${PYTHON} -m pip install xhtml2pdf weasyprint pdfkit # For PDF tests -${PYTHON} -m pip install faker==13.3.1 faker-file'[common]'==0.17.13 +${PYTHON} -m pip install faker==13.3.1 faker-file'[common]'==0.18.3 diff --git a/cardinal_pythonlib/extract_text.py b/cardinal_pythonlib/extract_text.py index fc2431a..22d1ef0 100755 --- a/cardinal_pythonlib/extract_text.py +++ b/cardinal_pythonlib/extract_text.py @@ -77,9 +77,14 @@ # ============================================================================= import argparse +import base64 +from email import policy +from email.message import EmailMessage +from email.parser import BytesParser from io import StringIO import io import logging +from mimetypes import guess_extension import os import re import shutil @@ -1230,6 +1235,69 @@ def availability_doc() -> bool: return bool(antiword) +# ============================================================================= +# EML +# ============================================================================= + + +def convert_eml_to_text( + filename: 
str = None, + blob: bytes = None, + config: TextProcessingConfig = _DEFAULT_CONFIG, +) -> str: + email_content_list: list[str] = [] + + with get_filelikeobject(filename, blob) as fp: + parser = BytesParser(policy=policy.default) # type: ignore[arg-type] + message = parser.parse(fp) + + for email_content in _gen_email_content(message, config): + if email_content is not None: + email_content_list.append(email_content) + + text = "\n".join(email_content_list) + + return text + + +def _gen_email_content( + message: EmailMessage, config: TextProcessingConfig +) -> Generator[Optional[str], None, None]: + body = message.get_body( + preferencelist=( + "html", + "plain", + ) + ) # type: ignore[attr-defined] + if body is not None: + yield _get_email_content(body, config) + + for part in message.iter_attachments(): # type: ignore[attr-defined] + yield _get_email_content(part, config) + + +def _get_email_content( + message: EmailMessage, + config: TextProcessingConfig, +) -> Optional[str]: + content_type = message.get_content_type() + ext = guess_extension(content_type) + + if ext is not None and ext in ext_map: + content = message.get_content() + if isinstance(content, str): + charset = message["Content-Type"].params["charset"] + blob = content.encode(charset) + elif isinstance(content, EmailMessage): + blob = content.as_bytes() + if message.get("Content-Transfer-Encoding") == "base64": + blob = base64.b64decode(blob) + else: + blob = content + + return document_to_text(blob=blob, extension=ext, config=config) + + # ============================================================================= # Anything # ============================================================================= @@ -1277,6 +1345,7 @@ def availability_anything() -> bool: ".docm": {CONVERTER: convert_docx_to_text, AVAILABILITY: True}, ".docx": {CONVERTER: convert_docx_to_text, AVAILABILITY: True}, ".dot": {CONVERTER: convert_doc_to_text, AVAILABILITY: availability_doc}, + ".eml": {CONVERTER: 
convert_eml_to_text, AVAILABILITY: True}, ".htm": {CONVERTER: convert_html_to_text, AVAILABILITY: True}, ".html": {CONVERTER: convert_html_to_text, AVAILABILITY: True}, ".log": {CONVERTER: get_file_contents_text, AVAILABILITY: True}, diff --git a/cardinal_pythonlib/tests/extract_text_tests.py b/cardinal_pythonlib/tests/extract_text_tests.py index b715ba3..f81808b 100644 --- a/cardinal_pythonlib/tests/extract_text_tests.py +++ b/cardinal_pythonlib/tests/extract_text_tests.py @@ -25,6 +25,8 @@ """ +from email import message_from_string, policy +from email.message import EmailMessage import os import subprocess from tempfile import mkdtemp, NamedTemporaryFile @@ -32,8 +34,12 @@ from faker import Faker from faker_file.providers.docx_file import DocxFileProvider +from faker_file.providers.eml_file import EmlFileProvider +from faker_file.providers.helpers.inner import ( + create_inner_docx_file, + create_inner_eml_file, +) from faker_file.providers.odt_file import OdtFileProvider -from faker_file.providers.pdf_file import PdfFileProvider from faker_file.providers.txt_file import TxtFileProvider from faker_file.providers.xml_file import XmlFileProvider @@ -63,10 +69,11 @@ def _create_mock_objects(self) -> None: ) def _register_faker_providers(self) -> None: - self.fake = Faker("en-GB") + self.fake = Faker("en-US") # To avoid Lorem Ipsum + self.fake.seed_instance(12345) self.fake.add_provider(DocxFileProvider) + self.fake.add_provider(EmlFileProvider) self.fake.add_provider(OdtFileProvider) - self.fake.add_provider(PdfFileProvider) self.fake.add_provider(TxtFileProvider) self.fake.add_provider(XmlFileProvider) @@ -293,6 +300,130 @@ def test_xml_converted(self) -> None: self.assertEqual(text.strip(), f"{name}{address}") + def test_eml_converted(self) -> None: + content = self.fake.paragraph(nb_sentences=10) + eml_file = self.fake.eml_file(content=content) + text = document_to_text(filename=eml_file.data["filename"]) + + self.assertEqual(text.strip(), content) + + def 
test_eml_with_docx_attachment_converted(self) -> None: + body_content = self.fake.paragraph(nb_sentences=10) + docx_content = self.fake.paragraph(nb_sentences=10) + + docx_file_args = dict(content=docx_content) + options = dict( + count=1, + create_inner_file_func=create_inner_docx_file, + create_inner_file_args=docx_file_args, + ) + + eml_file = self.fake.eml_file( + content=body_content, + options=options, + ) + self.config.width = 0 + text = document_to_text( + filename=eml_file.data["filename"], config=self.config + ) + + self.assertIn(body_content, text) + self.assertIn(docx_content, text) + + def test_eml_with_nested_docx_attachment_converted(self) -> None: + outer_email_content = self.fake.paragraph(nb_sentences=10) + inner_email_content = self.fake.paragraph(nb_sentences=10) + + docx_content = self.fake.paragraph(nb_sentences=10) + + docx_file_args = dict(content=docx_content) + docx_options = dict( + count=1, + create_inner_file_func=create_inner_docx_file, + create_inner_file_args=docx_file_args, + ) + eml_file_args = dict( + content=inner_email_content, + options=docx_options, + ) + eml_options = dict( + count=1, + create_inner_file_func=create_inner_eml_file, + create_inner_file_args=eml_file_args, + ) + + eml_file = self.fake.eml_file( + content=outer_email_content, + options=eml_options, + ) + + self.config.width = 0 + text = document_to_text( + filename=eml_file.data["filename"], config=self.config + ) + + self.assertIn(outer_email_content, text) + self.assertIn(inner_email_content, text) + self.assertIn(docx_content, text) + + def test_eml_html_body_preferred_over_text(self) -> None: + # Contrived example. 
Normally these would have the same content + text_content = self.fake.paragraph(nb_sentences=10) + html_content = self.fake.paragraph(nb_sentences=10) + html = f""" + + + + + +{html_content} + + +""" + # faker-file can't do this yet + message = EmailMessage() + message.set_content(text_content) + message.add_alternative(html, subtype="html") + blob = message.as_bytes() + + text = document_to_text( + blob=blob, extension=".eml", config=self.config + ) + + self.assertIn(html_content, text) + self.assertNotIn(text_content, text) + + def test_eml_latin1_html_decoded_correctly(self) -> None: + content = """From: foo@example.org +To: bar@example.org +Subject: Latin-1 test +Content-Type: multipart/mixed; boundary="===" +MIME-Version: 1.0 + +--=== +Content-Type: text/html; charset="iso-8859-1" +Content-Transfer-Encoding: quoted-printable + + + + + +Caf=E9 + + +--===-- +""" + + message = message_from_string(content, policy=policy.default) + blob = message.as_bytes() + + text = document_to_text( + blob=blob, extension=".eml", config=self.config + ) + + self.assertIn("Café", text) + def test_unsupported_converted(self) -> None: with mock.patch.multiple( "cardinal_pythonlib.extract_text.subprocess", From 75b9ce6b99922aab05f8ef183dea808a76ce96fa Mon Sep 17 00:00:00 2001 From: Martin Burchell Date: Fri, 9 May 2025 16:54:05 +0100 Subject: [PATCH 24/39] Replace deprecated BeautifulStoneSoup as advised --- cardinal_pythonlib/extract_text.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cardinal_pythonlib/extract_text.py b/cardinal_pythonlib/extract_text.py index 22d1ef0..0a035d9 100755 --- a/cardinal_pythonlib/extract_text.py +++ b/cardinal_pythonlib/extract_text.py @@ -1158,7 +1158,7 @@ def convert_xml_to_text( Converts XML to text. 
""" with get_filelikeobject(filename, blob) as fp: - soup = bs4.BeautifulStoneSoup(fp) + soup = bs4.BeautifulSoup(fp, features="xml") return soup.get_text() From e58d8fddfdc0c4f4e2f81f6dfd7a3032b5449b68 Mon Sep 17 00:00:00 2001 From: Martin Burchell Date: Fri, 9 May 2025 20:33:19 +0100 Subject: [PATCH 25/39] Default to UTF-8 when no charset in emails --- cardinal_pythonlib/extract_text.py | 2 +- .../tests/extract_text_tests.py | 27 +++++++++++++++++++ 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/cardinal_pythonlib/extract_text.py b/cardinal_pythonlib/extract_text.py index 0a035d9..4657580 100755 --- a/cardinal_pythonlib/extract_text.py +++ b/cardinal_pythonlib/extract_text.py @@ -1286,7 +1286,7 @@ def _get_email_content( if ext is not None and ext in ext_map: content = message.get_content() if isinstance(content, str): - charset = message["Content-Type"].params["charset"] + charset = message["Content-Type"].params.get("charset", "utf-8") blob = content.encode(charset) elif isinstance(content, EmailMessage): blob = content.as_bytes() diff --git a/cardinal_pythonlib/tests/extract_text_tests.py b/cardinal_pythonlib/tests/extract_text_tests.py index f81808b..77feacc 100644 --- a/cardinal_pythonlib/tests/extract_text_tests.py +++ b/cardinal_pythonlib/tests/extract_text_tests.py @@ -424,6 +424,33 @@ def test_eml_latin1_html_decoded_correctly(self) -> None: self.assertIn("Café", text) + def test_eml_with_no_charset_converted(self) -> None: + text_content = self.fake.paragraph(nb_sentences=10) + + content = f"""From: bar@example.org +Subject: No charset +To: foo@example.org +Mime-Version: 1.0 +Content-Type: multipart/mixed;boundary="===" + +--=== +Content-Type: text/plain + +{text_content} + +--===-- + +""" + + message = message_from_string(content, policy=policy.default) + blob = message.as_bytes() + + text = document_to_text( + blob=blob, extension=".eml", config=self.config + ) + + self.assertIn(text_content, text) + def test_unsupported_converted(self) 
-> None: with mock.patch.multiple( "cardinal_pythonlib.extract_text.subprocess", From 4a11b4983a93b0829025e8d6c27ddd755e548f05 Mon Sep 17 00:00:00 2001 From: Martin Burchell Date: Sat, 10 May 2025 06:57:24 +0100 Subject: [PATCH 26/39] Default to UTF-8 when no content type header in emails --- cardinal_pythonlib/extract_text.py | 5 +++- .../tests/extract_text_tests.py | 26 +++++++++++++++++++ 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/cardinal_pythonlib/extract_text.py b/cardinal_pythonlib/extract_text.py index 4657580..b311ca1 100755 --- a/cardinal_pythonlib/extract_text.py +++ b/cardinal_pythonlib/extract_text.py @@ -1286,7 +1286,10 @@ def _get_email_content( if ext is not None and ext in ext_map: content = message.get_content() if isinstance(content, str): - charset = message["Content-Type"].params.get("charset", "utf-8") + charset = "utf-8" + content_type_header = message.get("Content-Type") + if content_type_header: + charset = content_type_header.params.get("charset", "utf-8") blob = content.encode(charset) elif isinstance(content, EmailMessage): blob = content.as_bytes() diff --git a/cardinal_pythonlib/tests/extract_text_tests.py b/cardinal_pythonlib/tests/extract_text_tests.py index 77feacc..752c41d 100644 --- a/cardinal_pythonlib/tests/extract_text_tests.py +++ b/cardinal_pythonlib/tests/extract_text_tests.py @@ -440,6 +440,32 @@ def test_eml_with_no_charset_converted(self) -> None: --===-- +""" + + message = message_from_string(content, policy=policy.default) + blob = message.as_bytes() + + text = document_to_text( + blob=blob, extension=".eml", config=self.config + ) + + self.assertIn(text_content, text) + + def test_eml_with_no_content_type_converted(self) -> None: + text_content = self.fake.paragraph(nb_sentences=10) + + content = f"""From: bar@example.org +Subject: No content type +To: foo@example.org +Mime-Version: 1.0 +Content-Type: multipart/mixed;boundary="===" + +--=== + +{text_content} + +--===-- + """ message = 
message_from_string(content, policy=policy.default) From 5fb204f16565d6d86aeb84ab9685fad1f24f5fb3 Mon Sep 17 00:00:00 2001 From: Martin Burchell Date: Mon, 12 May 2025 11:23:30 +0100 Subject: [PATCH 27/39] Allow docx files to include document files with document[nn].xml form I don't know if this is deviating from the standard but I have seen one example of this in the real world --- cardinal_pythonlib/extract_text.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/cardinal_pythonlib/extract_text.py b/cardinal_pythonlib/extract_text.py index b311ca1..3432080 100755 --- a/cardinal_pythonlib/extract_text.py +++ b/cardinal_pythonlib/extract_text.py @@ -555,10 +555,10 @@ def availability_pdf() -> bool: # ----------------------------------------------------------------------------- # In a D.I.Y. fashion # ----------------------------------------------------------------------------- -# DOCX specification: http://www.ecma-international.org/news/TC45_current_work/TC45_available_docs.htm # noqa: E501 +# DOCX specification: https://ecma-international.org/publications-and-standards/standards/ecma-376/ # noqa: E501 DOCX_HEADER_FILE_REGEX = re.compile("word/header[0-9]*.xml") -DOCX_DOC_FILE = "word/document.xml" +DOCX_DOCUMENT_FILE_REGEX = re.compile("word/document[0-9]*.xml") DOCX_FOOTER_FILE_REGEX = re.compile("word/footer[0-9]*.xml") DOCX_SCHEMA_URL = ( "http://schemas.openxmlformats.org/wordprocessingml/2006/main" @@ -601,7 +601,9 @@ def gen_xml_files_from_docx(fp: BinaryIO) -> Iterator[str]: for filename in filelist: if DOCX_HEADER_FILE_REGEX.match(filename): yield z.read(filename).decode("utf8") - yield z.read(DOCX_DOC_FILE) + for filename in filelist: + if DOCX_DOCUMENT_FILE_REGEX.match(filename): + yield z.read(filename) for filename in filelist: if DOCX_FOOTER_FILE_REGEX.match(filename): yield z.read(filename).decode("utf8") From de72344f763ddf6ba3c8dd6d006192bb713abb3a Mon Sep 17 00:00:00 2001 From: Martin Burchell Date: Mon, 12 May 2025 
15:00:41 +0100 Subject: [PATCH 28/39] Allow blobs to be empty when extracting text It is possible to have an email with an empty body. Other scenarios (empty HTML, docx etc) are pretty unlikely --- cardinal_pythonlib/extract_text.py | 8 +++--- .../tests/extract_text_tests.py | 25 +++++++++++++++++++ 2 files changed, 29 insertions(+), 4 deletions(-) diff --git a/cardinal_pythonlib/extract_text.py b/cardinal_pythonlib/extract_text.py index 3432080..f6c0250 100755 --- a/cardinal_pythonlib/extract_text.py +++ b/cardinal_pythonlib/extract_text.py @@ -358,7 +358,7 @@ def get_filelikeobject(filename: str = None, blob: bytes = None) -> BinaryIO: Returns: a :class:`BinaryIO` object """ - if not filename and not blob: + if filename is None and blob is None: raise ValueError("no filename and no blob") if filename and blob: raise ValueError("specify either filename or blob") @@ -373,11 +373,11 @@ def get_file_contents(filename: str = None, blob: bytes = None) -> bytes: """ Returns the binary contents of a file, or of a BLOB. """ - if not filename and not blob: + if filename is None and blob is None: raise ValueError("no filename and no blob") if filename and blob: raise ValueError("specify either filename or blob") - if blob: + if blob is not None: return blob with open(filename, "rb") as f: return f.read() @@ -1408,7 +1408,7 @@ def document_to_text( Raises an exception for malformed arguments, missing files, bad filetypes, etc. 
""" - if not filename and not blob: + if filename is None and blob is None: raise ValueError("document_to_text: no filename and no blob") if filename and blob: raise ValueError("document_to_text: specify either filename or blob") diff --git a/cardinal_pythonlib/tests/extract_text_tests.py b/cardinal_pythonlib/tests/extract_text_tests.py index 752c41d..92c17c2 100644 --- a/cardinal_pythonlib/tests/extract_text_tests.py +++ b/cardinal_pythonlib/tests/extract_text_tests.py @@ -204,6 +204,12 @@ def test_htm_converted(self) -> None: ) self.assertEqual(text.strip(), content) + def test_empty_htm_converted(self) -> None: + text = document_to_text( + blob="".encode("utf-8"), extension="htm", config=self.config + ) + self.assertEqual(text, "") + def test_log_converted(self) -> None: content = """ 2025-04-02 06:05:43,772 INFO Starting unattended upgrades script @@ -477,6 +483,25 @@ def test_eml_with_no_content_type_converted(self) -> None: self.assertIn(text_content, text) + def test_eml_with_empty_body_converted(self) -> None: + content = """From: bar@example.org +Subject: No body +To: foo@example.org +Mime-Version: 1.0 +Content-Type: multipart/mixed;boundary="===" + +--=== +--===-- +""" + message = message_from_string(content, policy=policy.default) + blob = message.as_bytes() + + text = document_to_text( + blob=blob, extension=".eml", config=self.config + ) + + self.assertEqual("", text) + def test_unsupported_converted(self) -> None: with mock.patch.multiple( "cardinal_pythonlib.extract_text.subprocess", From 87f7754ce85d613d3d7b3ddc5d70f7d7e485da7f Mon Sep 17 00:00:00 2001 From: Martin Burchell Date: Mon, 12 May 2025 15:21:56 +0100 Subject: [PATCH 29/39] Fix docx filename generation to yield string, not bytes --- cardinal_pythonlib/extract_text.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cardinal_pythonlib/extract_text.py b/cardinal_pythonlib/extract_text.py index f6c0250..7ab576e 100755 --- a/cardinal_pythonlib/extract_text.py +++ 
b/cardinal_pythonlib/extract_text.py @@ -603,7 +603,7 @@ def gen_xml_files_from_docx(fp: BinaryIO) -> Iterator[str]: yield z.read(filename).decode("utf8") for filename in filelist: if DOCX_DOCUMENT_FILE_REGEX.match(filename): - yield z.read(filename) + yield z.read(filename).decode("utf8") for filename in filelist: if DOCX_FOOTER_FILE_REGEX.match(filename): yield z.read(filename).decode("utf8") From e17023e900585340962a73e9cf6960aa66b5ca3b Mon Sep 17 00:00:00 2001 From: Martin Burchell Date: Mon, 12 May 2025 15:22:27 +0100 Subject: [PATCH 30/39] Fix missing return value --- cardinal_pythonlib/extract_text.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cardinal_pythonlib/extract_text.py b/cardinal_pythonlib/extract_text.py index 7ab576e..f36cba6 100755 --- a/cardinal_pythonlib/extract_text.py +++ b/cardinal_pythonlib/extract_text.py @@ -1302,6 +1302,8 @@ def _get_email_content( return document_to_text(blob=blob, extension=ext, config=config) + return None + # ============================================================================= # Anything From bdc9983e1cb1e2cf07783284903cab1a789a2b9e Mon Sep 17 00:00:00 2001 From: Martin Burchell Date: Mon, 12 May 2025 16:24:52 +0100 Subject: [PATCH 31/39] Workaround BeautifulSoup not handling empty byte array correctly --- cardinal_pythonlib/extract_text.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cardinal_pythonlib/extract_text.py b/cardinal_pythonlib/extract_text.py index f36cba6..4172b99 100755 --- a/cardinal_pythonlib/extract_text.py +++ b/cardinal_pythonlib/extract_text.py @@ -1140,6 +1140,12 @@ def convert_html_to_text( """ Converts HTML to text. 
""" + + # beautifulsoup4==4.13.4 returns "b''" for an empty bytes array + # So we just workaround this here: + if bytes is not None and len(blob) == 0: + return "" + with get_filelikeobject(filename, blob) as fp: soup = bs4.BeautifulSoup(fp, "html.parser") return soup.get_text() From 499f994606c5fb27000db61a8c7aebbd21808700 Mon Sep 17 00:00:00 2001 From: Martin Burchell Date: Mon, 12 May 2025 17:18:27 +0100 Subject: [PATCH 32/39] Note BS4 bug report --- cardinal_pythonlib/extract_text.py | 1 + 1 file changed, 1 insertion(+) diff --git a/cardinal_pythonlib/extract_text.py b/cardinal_pythonlib/extract_text.py index 4172b99..cb6f108 100755 --- a/cardinal_pythonlib/extract_text.py +++ b/cardinal_pythonlib/extract_text.py @@ -1141,6 +1141,7 @@ def convert_html_to_text( Converts HTML to text. """ + # https://bugs.launchpad.net/beautifulsoup/+bug/2110492 # beautifulsoup4==4.13.4 returns "b''" for an empty bytes array # So we just workaround this here: if bytes is not None and len(blob) == 0: From dc92a17bdec317b1f9a8db2cb241ab0687efeaa1 Mon Sep 17 00:00:00 2001 From: Martin Burchell Date: Tue, 13 May 2025 10:04:11 +0100 Subject: [PATCH 33/39] Replace illegal multibyte sequences when encoding emails --- cardinal_pythonlib/extract_text.py | 2 +- .../tests/extract_text_tests.py | 29 +++++++++++++++++++ 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/cardinal_pythonlib/extract_text.py b/cardinal_pythonlib/extract_text.py index cb6f108..26f339a 100755 --- a/cardinal_pythonlib/extract_text.py +++ b/cardinal_pythonlib/extract_text.py @@ -1299,7 +1299,7 @@ def _get_email_content( content_type_header = message.get("Content-Type") if content_type_header: charset = content_type_header.params.get("charset", "utf-8") - blob = content.encode(charset) + blob = content.encode(charset, "replace") elif isinstance(content, EmailMessage): blob = content.as_bytes() if message.get("Content-Transfer-Encoding") == "base64": diff --git 
a/cardinal_pythonlib/tests/extract_text_tests.py b/cardinal_pythonlib/tests/extract_text_tests.py index 92c17c2..d37546e 100644 --- a/cardinal_pythonlib/tests/extract_text_tests.py +++ b/cardinal_pythonlib/tests/extract_text_tests.py @@ -502,6 +502,35 @@ def test_eml_with_empty_body_converted(self) -> None: self.assertEqual("", text) + def test_eml_with_illegal_multibyte_sequence_replaced(self) -> None: + content = """From: bar@example.org +Subject: Illegal multibyte sequence +To: foo@example.org +Mime-Version: 1.0 +Content-Type: multipart/mixed;boundary="===" + +--=== +Content-Type: text/html; charset="big5" +Content-Transfer-Encoding: quoted-printable + + + + + +=F9=F9 + + +--===-- +""" + message = message_from_string(content, policy=policy.default) + blob = message.as_bytes() + + text = document_to_text( + blob=blob, extension=".eml", config=self.config + ) + + self.assertEqual(text.strip(), "??") + def test_unsupported_converted(self) -> None: with mock.patch.multiple( "cardinal_pythonlib.extract_text.subprocess", From 51e9295292f5e92c0dd7bafac38f3ce33b4f661c Mon Sep 17 00:00:00 2001 From: Martin Burchell Date: Tue, 13 May 2025 16:51:18 +0100 Subject: [PATCH 34/39] Handle invalid surrogate characters in HTML conversion --- cardinal_pythonlib/extract_text.py | 10 ++++++- .../tests/extract_text_tests.py | 30 +++++++++++++++++++ 2 files changed, 39 insertions(+), 1 deletion(-) diff --git a/cardinal_pythonlib/extract_text.py b/cardinal_pythonlib/extract_text.py index 26f339a..eb57d60 100755 --- a/cardinal_pythonlib/extract_text.py +++ b/cardinal_pythonlib/extract_text.py @@ -1149,7 +1149,15 @@ def convert_html_to_text( with get_filelikeobject(filename, blob) as fp: soup = bs4.BeautifulSoup(fp, "html.parser") - return soup.get_text() + + # In the real world we can end up with UTF-16 characters embedded as + # numbered entities in Windows-1252 encoded HTML such as + # &#55357;&#56898; "Slightly smiling face".
Replacing these here + # avoids "UnicodeEncodeError: 'utf-8' codec can't encode characters in + # position ... surrogates not allowed". + text = soup.get_text().encode(errors="replace").decode() + + return text # ============================================================================= diff --git a/cardinal_pythonlib/tests/extract_text_tests.py b/cardinal_pythonlib/tests/extract_text_tests.py index d37546e..c0e8590 100644 --- a/cardinal_pythonlib/tests/extract_text_tests.py +++ b/cardinal_pythonlib/tests/extract_text_tests.py @@ -531,6 +531,36 @@ def test_eml_with_illegal_multibyte_sequence_replaced(self) -> None: self.assertEqual(text.strip(), "??") + def test_eml_invalid_surrogate_characters_replaced(self) -> None: + content = """From: bar@example.org +Subject: Invalid surrogate characters +To: foo@example.org +Mime-Version: 1.0 +Content-Type: multipart/mixed;boundary="===" + +--=== +Content-Type: text/html; charset="windows-1252" +Content-Transfer-Encoding: quoted-printable + + + + + +�� + + +--===-- +""" + message = message_from_string(content, policy=policy.default) + blob = message.as_bytes() + + text = document_to_text( + blob=blob, extension=".eml", config=self.config + ) + + self.assertEqual(text.strip(), "??") + def test_unsupported_converted(self) -> None: with mock.patch.multiple( "cardinal_pythonlib.extract_text.subprocess", From fdccb7640b6e3651fcb772e11c359dbfd9182c3f Mon Sep 17 00:00:00 2001 From: Martin Burchell Date: Tue, 13 May 2025 20:56:14 +0100 Subject: [PATCH 35/39] Better names for test methods --- cardinal_pythonlib/tests/extract_text_tests.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cardinal_pythonlib/tests/extract_text_tests.py b/cardinal_pythonlib/tests/extract_text_tests.py index c0e8590..9485043 100644 --- a/cardinal_pythonlib/tests/extract_text_tests.py +++ b/cardinal_pythonlib/tests/extract_text_tests.py @@ -130,7 +130,7 @@ def test_csv_converted(self) -> None: self.assertEqual(text, content) - 
def test_doc_converted_with_antiword(self) -> None: + def test_doc_will_be_converted_with_antiword(self) -> None: with mock.patch.multiple( "cardinal_pythonlib.extract_text.subprocess", Popen=self.mock_popen, @@ -152,7 +152,7 @@ def test_doc_converted_with_antiword(self) -> None: ] self.mock_popen.assert_has_calls(expected_calls) - def test_dot_converted_with_antiword(self) -> None: + def test_dot_will_be_converted_with_antiword(self) -> None: with mock.patch.multiple( "cardinal_pythonlib.extract_text.subprocess", Popen=self.mock_popen, @@ -233,7 +233,7 @@ def test_odt_converted(self) -> None: self.assertEqual(text.strip(), content) - def test_pdf_converted(self) -> None: + def test_pdf_will_be_converted_with_pdftotext(self) -> None: with mock.patch.multiple( "cardinal_pythonlib.extract_text.subprocess", Popen=self.mock_popen, @@ -254,7 +254,7 @@ def test_pdf_converted(self) -> None: ] self.mock_popen.assert_has_calls(expected_calls) - def test_rtf_converted(self) -> None: + def test_rtf_will_be_converted_with_unrtf(self) -> None: with mock.patch( "cardinal_pythonlib.extract_text.UNRTF_SUPPORTS_QUIET", True ): @@ -561,7 +561,7 @@ def test_eml_invalid_surrogate_characters_replaced(self) -> None: self.assertEqual(text.strip(), "??") - def test_unsupported_converted(self) -> None: + def test_unsupported_will_be_converted_with_strings(self) -> None: with mock.patch.multiple( "cardinal_pythonlib.extract_text.subprocess", Popen=self.mock_popen, From dba72a96ea27618fb511242ebd93a42b0f9d32c9 Mon Sep 17 00:00:00 2001 From: Martin Burchell Date: Tue, 13 May 2025 21:12:41 +0100 Subject: [PATCH 36/39] Fix test comment --- cardinal_pythonlib/tests/extract_text_tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cardinal_pythonlib/tests/extract_text_tests.py b/cardinal_pythonlib/tests/extract_text_tests.py index 9485043..6e5c9ee 100644 --- a/cardinal_pythonlib/tests/extract_text_tests.py +++ b/cardinal_pythonlib/tests/extract_text_tests.py @@ -1,4 +1,4 @@ 
-# cardinal_pythonlib/tests/datetimefunc_tests.py +# cardinal_pythonlib/tests/extract_text_tests.py """ =============================================================================== From 97b5a0a79e98c6525a412219c40d842c576092a2 Mon Sep 17 00:00:00 2001 From: Martin Burchell Date: Wed, 14 May 2025 06:35:04 +0100 Subject: [PATCH 37/39] Update changelog --- docs/source/changelog.rst | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst index deb6c93..056ac0b 100644 --- a/docs/source/changelog.rst +++ b/docs/source/changelog.rst @@ -889,3 +889,16 @@ Quick links: **2.0.5 (2025-04-07)** - Add VARCHAR to valid Databricks types. + +**2.1.0 (2025-05-13)** + +- **BREAKING CHANGE**: Rename modules to avoid conflicts with the Python + standard library: + + - :mod:`cardinal_pythonlib.email` is now :mod:`cardinal_pythonlib.email_utils` + - :mod:`cardinal_pythonlib.json` is now :mod:`cardinal_pythonlib.json_utils` + - :mod:`cardinal_pythonlib.profile` is now :mod:`cardinal_pythonlib.profiling` + +- Add support for ``.eml`` files with attachments processed by supported + document converters (``.docx``, ``.pdf``, ``.odt`` etc.) to + :func:`cardinal_pythonlib.extract_text.document_to_text`. 
From 32cfc583b7b1d4dca45f7628be6e1ff3e8a46146 Mon Sep 17 00:00:00 2001 From: Martin Burchell Date: Wed, 14 May 2025 06:47:27 +0100 Subject: [PATCH 38/39] Align version of faker-file used in docs to that used in tests --- docs/docs_requirements.txt | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/docs/docs_requirements.txt b/docs/docs_requirements.txt index f489b7f..c88fa73 100644 --- a/docs/docs_requirements.txt +++ b/docs/docs_requirements.txt @@ -7,18 +7,12 @@ dogpile.cache==0.9.2 # CRATE is on 4.2 Django>=4.2,<5.0 faker==13.3.1 -faker-file[common]==0.17.13 +faker-file[common]==0.18.3 libChEBIpy pdfkit pyramid==1.10.8 pytest -# sphinx==4.2.0 sphinx==7.1.2 -# sphinxcontrib-applehelp==1.0.4 -# sphinxcontrib-devhelp==1.0.2 -# sphinxcontrib-htmlhelp==2.0.1 -# sphinxcontrib-serializinghtml==1.1.5 -# sphinxcontrib-qthelp==1.0.3 sphinx-paramlinks==0.6.0 sphinx_rtd_theme==2.0.0 weasyprint From b00e82e0e15f75afddd3a642b0ac3e3977a05742 Mon Sep 17 00:00:00 2001 From: Martin Burchell Date: Wed, 14 May 2025 09:38:16 +0100 Subject: [PATCH 39/39] Revert empty filename check when extracting text --- cardinal_pythonlib/extract_text.py | 4 ++-- cardinal_pythonlib/tests/extract_text_tests.py | 6 ++++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/cardinal_pythonlib/extract_text.py b/cardinal_pythonlib/extract_text.py index eb57d60..0c24121 100755 --- a/cardinal_pythonlib/extract_text.py +++ b/cardinal_pythonlib/extract_text.py @@ -358,7 +358,7 @@ def get_filelikeobject(filename: str = None, blob: bytes = None) -> BinaryIO: Returns: a :class:`BinaryIO` object """ - if filename is None and blob is None: + if not filename and blob is None: raise ValueError("no filename and no blob") if filename and blob: raise ValueError("specify either filename or blob") @@ -1425,7 +1425,7 @@ def document_to_text( Raises an exception for malformed arguments, missing files, bad filetypes, etc. 
""" - if filename is None and blob is None: + if not filename and blob is None: raise ValueError("document_to_text: no filename and no blob") if filename and blob: raise ValueError("document_to_text: specify either filename or blob") diff --git a/cardinal_pythonlib/tests/extract_text_tests.py b/cardinal_pythonlib/tests/extract_text_tests.py index 6e5c9ee..3a64b7b 100644 --- a/cardinal_pythonlib/tests/extract_text_tests.py +++ b/cardinal_pythonlib/tests/extract_text_tests.py @@ -101,6 +101,12 @@ def test_raises_when_no_filename_or_blob(self) -> None: self.assertIn("no filename and no blob", str(cm.exception)) + def test_raises_when_filename_empty(self) -> None: + with self.assertRaises(ValueError) as cm: + document_to_text(filename="") + + self.assertIn("no filename and no blob", str(cm.exception)) + def test_raises_when_filename_and_blob(self) -> None: with self.assertRaises(ValueError) as cm: document_to_text(filename="foo", blob="bar")