Merge branch 'develop' into release_2.0.2

LUMC · Mar 4, 2024 · 8386a25 · 8386a25
2 parents a76d1a5 + dd69ee7
commit 8386a25
Show file tree

Hide file tree

Showing 11 changed files with 95 additions and 16 deletions.
diff --git a/HISTORY.rst b/HISTORY.rst
@@ -13,6 +13,13 @@ version 2.0.2
 .. This document is user facing. Please word the changes in such a way
 .. that users understand how the changes affect the new version.
 
+version 2.1.0-dev
+---------------------------
++ Add extract_md5sum check on uncompressed contents of compressed output files.
+  Gzipped files contain a timestamp which makes it hard to directly compare the
+  md5sums of gzipped files.
++ Document naming conventions for Python test discovery.
+
 version 2.0.1
 ---------------------------
 + Fixed a bug where pytest-workflow would crash on logs that used non-ASCII

diff --git a/README.rst b/README.rst
@@ -127,6 +127,7 @@ predefined tests as well as custom tests are possible.
       - path: "TomCruise.txt.gz"       # Gzipped files can also be searched, provided their extension is '.gz'
         contains:
           - "starring"
+        extract_md5sum: e27c52f6b5f8152aa3ef58be7bdacc4d   # Md5sum of the uncompressed file (optional)
     stderr:                            # Options for testing stderr (optional)
       contains:                        # A list of strings which should be in stderr (optional)
         - "BSOD error, please contact the IT crowd"

diff --git a/docs/writing_tests.rst b/docs/writing_tests.rst
@@ -64,6 +64,7 @@ Test options
       - path: "TomCruise.txt.gz"       # Gzipped files can also be searched, provided their extension is '.gz'
         contains:
           - "starring"
+        extract_md5sum: e27c52f6b5f8152aa3ef58be7bdacc4d   # Md5sum of the uncompressed file (optional)
     stderr:                            # Options for testing stderr (optional)
       contains:                        # A list of strings which should be in stderr (optional)
         - "BSOD error, please contact the IT crowd"
@@ -89,6 +90,12 @@ Please see the `Python documentation on regular expressions
 <https://docs.python.org/3/library/re.html>`_ to see how Python handles escape
 sequences.
 
+The ``extract_md5sum`` option is used to uncompress a file and then compare
+the md5sum of the uncompressed file with the supplied md5sum. This option is
+particularly useful when testing gzipped files, which may contain a file
+creation timestamp in the gzip header. The supported compressed file
+formats for this option are gzip, bzip2, xz and Zstandard.
+
 .. note::
     Workflow names must be unique. Pytest workflow will crash when multiple
     workflows have the same name, even if they are in different files.
@@ -160,6 +167,10 @@ Multiple workflows can use the same custom test like this:
 points to the folder where the named workflow was executed. This allows writing
 of advanced python tests for each file produced by the workflow.
 
+Custom tests must follow the `conventions for Python test discovery
+<https://docs.pytest.org/en/latest/explanation/goodpractices.html#conventions-for-python-test-discovery>`_,
+which constrain the names of files and functions containing custom tests.
+
 .. note::
 
     stdout and stderr are available as files in the root of the

diff --git a/requirements.txt b/requirements.txt
@@ -1,3 +1,5 @@
 pyyaml
 pytest>=7.0.0
-jsonschema
+jsonschema
+xopen>=1.7.0
+zstandard
diff --git a/setup.py b/setup.py
@@ -20,7 +20,7 @@
 
 setup(
     name="pytest-workflow",
-    version="2.0.1",
+    version="2.1.0-dev",
     description="A pytest plugin for configuring workflow/pipeline tests "
                 "using YAML files",
     author="Leiden University Medical Center",
@@ -54,7 +54,9 @@
     install_requires=[
         "pytest>=7.0.0",  # To use pathlib Path's in pytest
         "pyyaml",
-        "jsonschema"
+        "jsonschema",
+        "xopen>=1.4.0",
+        "zstandard",
     ],
     # This line makes sure the plugin is automatically loaded when it is
     # installed in the same environment as pytest. No need to configure

diff --git a/src/pytest_workflow/file_tests.py b/src/pytest_workflow/file_tests.py
@@ -22,7 +22,7 @@
 
 from .content_tests import ContentTestCollector
 from .schema import FileTest
-from .util import file_md5sum
+from .util import extract_md5sum, file_md5sum
 from .workflow import Workflow
 
 
@@ -76,7 +76,16 @@ def collect(self):
                 parent=self,
                 filepath=filepath,
                 md5sum=self.filetest.md5sum,
-                workflow=self.workflow)]
+                workflow=self.workflow,
+                extract=False)]
+
+        if self.filetest.extract_md5sum:
+            tests += [FileMd5.from_parent(
+                parent=self,
+                filepath=filepath,
+                md5sum=self.filetest.extract_md5sum,
+                workflow=self.workflow,
+                extract=True)]
 
         return tests
 
@@ -119,32 +128,37 @@ def repr_failure(self, excinfo, style=None):
 
 class FileMd5(pytest.Item):
     def __init__(self, parent: pytest.Collector, filepath: Path,
-                 md5sum: str, workflow: Workflow):
+                 md5sum: str, workflow: Workflow, extract: bool):
         """
         Create a tests for the file md5sum.
         :param parent: The collector that started this item
         :param filepath: The path to the file
         :param md5sum:  The expected md5sum
         :param workflow: The workflow running to generate the file
+        :param extract: Whether to decompress the file before hashing it
         """
-        name = "md5sum"
+        name = "extract_md5sum" if extract else "md5sum"
         super().__init__(name, parent)
         self.filepath = filepath
         self.expected_md5sum = md5sum
         self.observed_md5sum = None
         self.workflow = workflow
+        self.extract = extract
 
     def runtest(self):
         # Wait for the workflow to finish before we check the md5sum of a file.
         self.workflow.wait()
         if not self.workflow.matching_exitcode():
             pytest.skip(f"'{self.parent.workflow.name}' did not exit with"
                         f"desired exit code.")
-        self.observed_md5sum = file_md5sum(self.filepath)
+        sum_func = extract_md5sum if self.extract else file_md5sum
+        self.observed_md5sum = sum_func(self.filepath)
         assert self.observed_md5sum == self.expected_md5sum
 
     def repr_failure(self, excinfo, style=None):
+        metric = "extract_md5sum" if self.extract else "md5sum"
         return (
-            f"Observed md5sum '{self.observed_md5sum}' not equal to expected "
-            f"md5sum '{self.expected_md5sum}' for file '{self.filepath}'"
-        )
+            f"Observed {metric} '{self.observed_md5sum}' not equal to "
+            f"expected {metric} '{self.expected_md5sum}' for file "
+            f"'{self.filepath}'"
+         )
diff --git a/src/pytest_workflow/schema.py b/src/pytest_workflow/schema.py
@@ -125,6 +125,7 @@ def __init__(self, contains: Optional[List[str]] = None,
 class FileTest(ContentTest):
     """A class that contains all the properties of a to be tested file."""
     def __init__(self, path: str, md5sum: Optional[str] = None,
+                 extract_md5sum: Optional[str] = None,
                  should_exist: bool = DEFAULT_FILE_SHOULD_EXIST,
                  contains: Optional[List[str]] = None,
                  must_not_contain: Optional[List[str]] = None,
@@ -135,6 +136,7 @@ def __init__(self, path: str, md5sum: Optional[str] = None,
         A container object
         :param path: the path to the file
         :param md5sum: md5sum of the file contents
+        :param extract_md5sum: md5sum of the extracted file contents
         :param should_exist: whether the file should exist or not
         :param contains: a list of strings that should be present in the file
         :param must_not_contain: a list of strings that should not be present
@@ -150,6 +152,7 @@ def __init__(self, path: str, md5sum: Optional[str] = None,
                          encoding=encoding)
         self.path = Path(path)
         self.md5sum = md5sum
+        self.extract_md5sum = extract_md5sum
         self.should_exist = should_exist
 
 

diff --git a/src/pytest_workflow/schema/schema.json b/src/pytest_workflow/schema/schema.json
@@ -123,6 +123,10 @@
             "should_exist": {
               "type": "boolean"
             },
+            "extract_md5sum": {
+              "type": "string",
+              "pattern": "^[a-f0-9]{32}$"
+            },
             "contains": {
               "type": "array",
               "items": {

diff --git a/src/pytest_workflow/util.py b/src/pytest_workflow/util.py
@@ -7,7 +7,10 @@
 import sys
 import warnings
 from pathlib import Path
-from typing import Callable, Iterator, List, Optional, Set, Tuple, Union
+from typing import Callable, IO, Iterator, List, Optional, Set, Tuple, Union, \
+                   cast
+
+from xopen import xopen
 
 Filepath = Union[str, os.PathLike]
 
@@ -204,10 +207,32 @@ def file_md5sum(filepath: Path, block_size=64 * 1024) -> str:
     :param block_size: Block size in bytes
     :return: a md5sum as hexadecimal string.
     """
-    hasher = hashlib.md5()
     with filepath.open('rb') as file_handler:  # Read the file in bytes
-        for block in iter(lambda: file_handler.read(block_size), b''):
-            hasher.update(block)
+        return file_handle_md5sum(file_handler, block_size)
+
+
+def extract_md5sum(filepath: Path, block_size=64 * 1024) -> str:
+    """
+    Generates a md5sum for the uncompressed contents of a compressed file.
+    Reads file in blocks to save memory.
+    :param filepath: a pathlib.Path to the compressed file
+    :param block_size: Block size in bytes
+    :return: a md5sum as hexadecimal string.
+    """
+    with xopen(filepath, 'rb') as file_handler:  # Read the file in bytes
+        return file_handle_md5sum(cast(IO[bytes], file_handler), block_size)
+
+
+def file_handle_md5sum(file_handler: IO[bytes], block_size) -> str:
+    """
+    Generates a md5sum for a file handle. Reads file in blocks to save memory.
+    :param file_handler: a readable binary file handler
+    :param block_size: Block size in bytes
+    :return: a md5sum as hexadecimal string.
+    """
+    hasher = hashlib.md5()
+    for block in iter(lambda: file_handler.read(block_size), b''):
+        hasher.update(block)
     return hasher.hexdigest()
 
 

diff --git a/tests/test_schema.py b/tests/test_schema.py
@@ -171,6 +171,7 @@ def test_filetest_defaults():
     assert file_test.contains_regex == []
     assert file_test.must_not_contain_regex == []
     assert file_test.md5sum is None
+    assert file_test.extract_md5sum is None
     assert file_test.should_exist
 
 

diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -13,6 +13,7 @@
 #
 # You should have received a copy of the GNU Affero General Public License
 # along with pytest-workflow.  If not, see <https://www.gnu.org/licenses/
+import gzip
 import hashlib
 import itertools
 import os
@@ -25,7 +26,7 @@
 import pytest
 
 from pytest_workflow.util import decode_unaligned, duplicate_tree, \
-    file_md5sum, git_check_submodules_cloned, git_root, \
+    extract_md5sum, file_md5sum, git_check_submodules_cloned, git_root, \
     is_in_dir, link_tree, replace_whitespace
 
 WHITESPACE_TESTS = [
@@ -163,6 +164,14 @@ def test_file_md5sum(hash_file: Path):
     assert whole_file_md5 == per_line_md5
 
 
+def test_extract_md5sum():
+    hash_file = HASH_FILE_DIR / "LICENSE.gz"
+    with gzip.open(hash_file, "rb") as contents_fh:
+        whole_file_md5 = hashlib.md5(contents_fh.read()).hexdigest()
+    per_line_md5 = extract_md5sum(hash_file)
+    assert whole_file_md5 == per_line_md5
+
+
 def create_git_repo(path):
     dir = Path(path)
     os.mkdir(dir)