diff --git a/HISTORY.rst b/HISTORY.rst index 97ef39f7..3671b423 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -13,6 +13,13 @@ version 2.0.2 .. This document is user facing. Please word the changes in such a way .. that users understand how the changes affect the new version. +version 2.1.0-dev +--------------------------- ++ Add extract_md5sum check on uncompressed contents of compressed output files. + Gzipped files contain a timestamp which makes it hard to directly compare the + md5sums of gzipped files. ++ Document naming conventions for Python test discovery + version 2.0.1 --------------------------- + Fixed a bug where pytest-workflow would crash on logs that used non-ASCII diff --git a/README.rst b/README.rst index c53eb866..3e19015c 100644 --- a/README.rst +++ b/README.rst @@ -127,6 +127,7 @@ predefined tests as well as custom tests are possible. - path: "TomCruise.txt.gz" # Gzipped files can also be searched, provided their extension is '.gz' contains: - "starring" + extract_md5sum: e27c52f6b5f8152aa3ef58be7bdacc4d # Md5sum of the uncompressed file (optional) stderr: # Options for testing stderr (optional) contains: # A list of strings which should be in stderr (optional) - "BSOD error, please contact the IT crowd" diff --git a/docs/writing_tests.rst b/docs/writing_tests.rst index 9d75d435..0a3896d1 100644 --- a/docs/writing_tests.rst +++ b/docs/writing_tests.rst @@ -64,6 +64,7 @@ Test options - path: "TomCruise.txt.gz" # Gzipped files can also be searched, provided their extension is '.gz' contains: - "starring" + extract_md5sum: e27c52f6b5f8152aa3ef58be7bdacc4d # Md5sum of the uncompressed file (optional) stderr: # Options for testing stderr (optional) contains: # A list of strings which should be in stderr (optional) - "BSOD error, please contact the IT crowd" @@ -89,6 +90,12 @@ Please see the `Python documentation on regular expressions <https://docs.python.org/3/library/re.html>`_ to see how Python handles escape sequences. 
+The ``extract_md5sum`` option is used to uncompress a file and then compare +the md5sum of the uncompressed file with the supplied md5sum. This option is +particularly useful when testing gzipped files, which may contain a file +creation timestamp in the gzip header. The supported compressed file +formats for this option are gzip, bzip2, xz and Zstandard. + .. note:: Workflow names must be unique. Pytest workflow will crash when multiple workflows have the same name, even if they are in different files. @@ -160,6 +167,10 @@ Multiple workflows can use the same custom test like this: points to the folder where the named workflow was executed. This allows writing of advanced python tests for each file produced by the workflow. +Custom tests must follow the `conventions for Python test discovery +<https://docs.pytest.org/en/stable/explanation/goodpractices.html#conventions-for-python-test-discovery>`_, +which constrain the names of files and functions containing custom tests. + .. note:: stdout and stderr are available as files in the root of the diff --git a/requirements.txt b/requirements.txt index 2c5d3bff..884f5ec3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,5 @@ pyyaml pytest>=7.0.0 -jsonschema \ No newline at end of file +jsonschema +xopen>=1.7.0 +zstandard diff --git a/setup.py b/setup.py index 015d52ba..e4de914f 100644 --- a/setup.py +++ b/setup.py @@ -20,7 +20,7 @@ setup( name="pytest-workflow", - version="2.0.1", + version="2.1.0-dev", description="A pytest plugin for configuring workflow/pipeline tests " "using YAML files", author="Leiden University Medical Center", @@ -54,7 +54,9 @@ install_requires=[ "pytest>=7.0.0", # To use pathlib Path's in pytest "pyyaml", - "jsonschema" + "jsonschema", + "xopen>=1.4.0", + "zstandard", ], # This line makes sure the plugin is automatically loaded when it is # installed in the same environment as pytest. 
No need to configure diff --git a/src/pytest_workflow/file_tests.py b/src/pytest_workflow/file_tests.py index 17642fc8..1f98b1c2 100644 --- a/src/pytest_workflow/file_tests.py +++ b/src/pytest_workflow/file_tests.py @@ -22,7 +22,7 @@ from .content_tests import ContentTestCollector from .schema import FileTest -from .util import file_md5sum +from .util import extract_md5sum, file_md5sum from .workflow import Workflow @@ -76,7 +76,16 @@ def collect(self): parent=self, filepath=filepath, md5sum=self.filetest.md5sum, - workflow=self.workflow)] + workflow=self.workflow, + extract=False)] + + if self.filetest.extract_md5sum: + tests += [FileMd5.from_parent( + parent=self, + filepath=filepath, + md5sum=self.filetest.extract_md5sum, + workflow=self.workflow, + extract=True)] return tests @@ -119,20 +128,22 @@ def repr_failure(self, excinfo, style=None): class FileMd5(pytest.Item): def __init__(self, parent: pytest.Collector, filepath: Path, - md5sum: str, workflow: Workflow): + md5sum: str, workflow: Workflow, extract: bool): """ Create a tests for the file md5sum. :param parent: The collector that started this item :param filepath: The path to the file :param md5sum: The expected md5sum :param workflow: The workflow running to generate the file + :param extract: Whether the file should be extracted before calculating """ - name = "md5sum" + name = "extract_md5sum" if extract else "md5sum" super().__init__(name, parent) self.filepath = filepath self.expected_md5sum = md5sum self.observed_md5sum = None self.workflow = workflow + self.extract = extract def runtest(self): # Wait for the workflow to finish before we check the md5sum of a file. 
@@ -140,11 +151,14 @@ def runtest(self): if not self.workflow.matching_exitcode(): pytest.skip(f"'{self.parent.workflow.name}' did not exit with" f"desired exit code.") - self.observed_md5sum = file_md5sum(self.filepath) + sum_func = extract_md5sum if self.extract else file_md5sum + self.observed_md5sum = sum_func(self.filepath) assert self.observed_md5sum == self.expected_md5sum def repr_failure(self, excinfo, style=None): + metric = "extract_md5sum" if self.extract else "md5sum" return ( - f"Observed md5sum '{self.observed_md5sum}' not equal to expected " - f"md5sum '{self.expected_md5sum}' for file '{self.filepath}'" - ) + f"Observed {metric} '{self.observed_md5sum}' not equal to " + f"expected {metric} '{self.expected_md5sum}' for file " + f"'{self.filepath}'" + ) diff --git a/src/pytest_workflow/schema.py b/src/pytest_workflow/schema.py index c8c3e40f..499dd416 100644 --- a/src/pytest_workflow/schema.py +++ b/src/pytest_workflow/schema.py @@ -125,6 +125,7 @@ def __init__(self, contains: Optional[List[str]] = None, class FileTest(ContentTest): """A class that contains all the properties of a to be tested file.""" def __init__(self, path: str, md5sum: Optional[str] = None, + extract_md5sum: Optional[str] = None, should_exist: bool = DEFAULT_FILE_SHOULD_EXIST, contains: Optional[List[str]] = None, must_not_contain: Optional[List[str]] = None, @@ -135,6 +136,7 @@ def __init__(self, path: str, md5sum: Optional[str] = None, A container object :param path: the path to the file :param md5sum: md5sum of the file contents + :param extract_md5sum: md5sum of the extracted file contents :param should_exist: whether the file should exist or not :param contains: a list of strings that should be present in the file :param must_not_contain: a list of strings that should not be present @@ -150,6 +152,7 @@ def __init__(self, path: str, md5sum: Optional[str] = None, encoding=encoding) self.path = Path(path) self.md5sum = md5sum + self.extract_md5sum = extract_md5sum 
self.should_exist = should_exist diff --git a/src/pytest_workflow/schema/schema.json b/src/pytest_workflow/schema/schema.json index 9ead66bb..718b6c25 100644 --- a/src/pytest_workflow/schema/schema.json +++ b/src/pytest_workflow/schema/schema.json @@ -123,6 +123,10 @@ "should_exist": { "type": "boolean" }, + "extract_md5sum": { + "type": "string", + "pattern": "^[a-f0-9]{32}$" + }, "contains": { "type": "array", "items": { diff --git a/src/pytest_workflow/util.py b/src/pytest_workflow/util.py index a7c91bc0..beed52af 100644 --- a/src/pytest_workflow/util.py +++ b/src/pytest_workflow/util.py @@ -7,7 +7,10 @@ import sys import warnings from pathlib import Path -from typing import Callable, Iterator, List, Optional, Set, Tuple, Union +from typing import Callable, IO, Iterator, List, Optional, Set, Tuple, Union, \ + cast + +from xopen import xopen Filepath = Union[str, os.PathLike] @@ -204,10 +207,32 @@ def file_md5sum(filepath: Path, block_size=64 * 1024) -> str: :param block_size: Block size in bytes :return: a md5sum as hexadecimal string. """ - hasher = hashlib.md5() with filepath.open('rb') as file_handler: # Read the file in bytes - for block in iter(lambda: file_handler.read(block_size), b''): - hasher.update(block) + return file_handle_md5sum(file_handler, block_size) + + +def extract_md5sum(filepath: Path, block_size=64 * 1024) -> str: + """ + Generates a md5sum for the uncompressed contents of a compressed file. + Reads file in blocks to save memory. + :param filepath: a pathlib.Path to the compressed file + :param block_size: Block size in bytes + :return: a md5sum as hexadecimal string. + """ + with xopen(filepath, 'rb') as file_handler: # Read the file in bytes + return file_handle_md5sum(cast(IO[bytes], file_handler), block_size) + + +def file_handle_md5sum(file_handler: IO[bytes], block_size) -> str: + """ + Generates a md5sum for a file handle. Reads file in blocks to save memory. 
+ :param file_handler: a readable binary file handler + :param block_size: Block size in bytes + :return: a md5sum as hexadecimal string. + """ + hasher = hashlib.md5() + for block in iter(lambda: file_handler.read(block_size), b''): + hasher.update(block) return hasher.hexdigest() diff --git a/tests/test_schema.py b/tests/test_schema.py index 378be288..8defda21 100644 --- a/tests/test_schema.py +++ b/tests/test_schema.py @@ -171,6 +171,7 @@ def test_filetest_defaults(): assert file_test.contains_regex == [] assert file_test.must_not_contain_regex == [] assert file_test.md5sum is None + assert file_test.extract_md5sum is None assert file_test.should_exist diff --git a/tests/test_utils.py b/tests/test_utils.py index 45f789c3..574225bd 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -13,6 +13,7 @@ # # You should have received a copy of the GNU Affero General Public License # along with pytest-workflow. If not, see