Skip to content

Commit

Permalink
Merge branch 'develop' into release_2.0.2
Browse files Browse the repository at this point in the history
  • Loading branch information
JAlvarezJarreta authored Mar 4, 2024
2 parents a76d1a5 + dd69ee7 commit 8386a25
Show file tree
Hide file tree
Showing 11 changed files with 95 additions and 16 deletions.
7 changes: 7 additions & 0 deletions HISTORY.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,13 @@ version 2.0.2
.. This document is user facing. Please word the changes in such a way
.. that users understand how the changes affect the new version.
version 2.1.0-dev
---------------------------
+ Add an ``extract_md5sum`` check on the uncompressed contents of compressed
output files. Gzipped files contain a timestamp, which makes it hard to
directly compare the md5sums of the gzipped files themselves.
+ Document naming conventions for Python test discovery

version 2.0.1
---------------------------
+ Fixed a bug where pytest-workflow would crash on logs that used non-ASCII
Expand Down
1 change: 1 addition & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,7 @@ predefined tests as well as custom tests are possible.
- path: "TomCruise.txt.gz" # Gzipped files can also be searched, provided their extension is '.gz'
contains:
- "starring"
extract_md5sum: e27c52f6b5f8152aa3ef58be7bdacc4d # Md5sum of the uncompressed file (optional)
stderr: # Options for testing stderr (optional)
contains: # A list of strings which should be in stderr (optional)
- "BSOD error, please contact the IT crowd"
Expand Down
11 changes: 11 additions & 0 deletions docs/writing_tests.rst
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ Test options
- path: "TomCruise.txt.gz" # Gzipped files can also be searched, provided their extension is '.gz'
contains:
- "starring"
extract_md5sum: e27c52f6b5f8152aa3ef58be7bdacc4d # Md5sum of the uncompressed file (optional)
stderr: # Options for testing stderr (optional)
contains: # A list of strings which should be in stderr (optional)
- "BSOD error, please contact the IT crowd"
Expand All @@ -89,6 +90,12 @@ Please see the `Python documentation on regular expressions
<https://docs.python.org/3/library/re.html>`_ to see how Python handles escape
sequences.

The ``extract_md5sum`` option is used to uncompress a file and then compare
the md5sum of the uncompressed file with the supplied md5sum. This option is
particularly useful when testing gzipped files, which may contain a file
creation timestamp in the gzip header. The supported compressed file
formats for this option are gzip, bzip2, xz and Zstandard.

.. note::
Workflow names must be unique. Pytest workflow will crash when multiple
workflows have the same name, even if they are in different files.
Expand Down Expand Up @@ -160,6 +167,10 @@ Multiple workflows can use the same custom test like this:
points to the folder where the named workflow was executed. This allows writing
of advanced python tests for each file produced by the workflow.

Custom tests must follow the `conventions for Python test discovery
<https://docs.pytest.org/en/latest/explanation/goodpractices.html#conventions-for-python-test-discovery>`_,
which constrain the names of files and functions containing custom tests.

.. note::

stdout and stderr are available as files in the root of the
Expand Down
4 changes: 3 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
pyyaml
pytest>=7.0.0
jsonschema
jsonschema
xopen>=1.7.0
zstandard
6 changes: 4 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@

setup(
name="pytest-workflow",
version="2.0.1",
version="2.1.0-dev",
description="A pytest plugin for configuring workflow/pipeline tests "
"using YAML files",
author="Leiden University Medical Center",
Expand Down Expand Up @@ -54,7 +54,9 @@
install_requires=[
"pytest>=7.0.0", # To use pathlib Path's in pytest
"pyyaml",
"jsonschema"
"jsonschema",
"xopen>=1.4.0",
"zstandard",
],
# This line makes sure the plugin is automatically loaded when it is
# installed in the same environment as pytest. No need to configure
Expand Down
30 changes: 22 additions & 8 deletions src/pytest_workflow/file_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@

from .content_tests import ContentTestCollector
from .schema import FileTest
from .util import file_md5sum
from .util import extract_md5sum, file_md5sum
from .workflow import Workflow


Expand Down Expand Up @@ -76,7 +76,16 @@ def collect(self):
parent=self,
filepath=filepath,
md5sum=self.filetest.md5sum,
workflow=self.workflow)]
workflow=self.workflow,
extract=False)]

if self.filetest.extract_md5sum:
tests += [FileMd5.from_parent(
parent=self,
filepath=filepath,
md5sum=self.filetest.extract_md5sum,
workflow=self.workflow,
extract=True)]

return tests

Expand Down Expand Up @@ -119,32 +128,37 @@ def repr_failure(self, excinfo, style=None):

class FileMd5(pytest.Item):
    """
    Pytest item that compares the md5sum of a workflow output file with an
    expected md5sum, either of the file as-is or of its extracted contents.
    """

    def __init__(self, parent: pytest.Collector, filepath: Path,
                 md5sum: str, workflow: Workflow, extract: bool = False):
        """
        Create a test for the file md5sum.
        :param parent: The collector that started this item
        :param filepath: The path to the file
        :param md5sum: The expected md5sum
        :param workflow: The workflow running to generate the file
        :param extract: Whether the file should be extracted before
                        calculating the md5sum. Defaults to False so that
                        callers using the four-argument form keep working.
        """
        # The item name tells the two kinds of md5sum test apart in reports.
        name = "extract_md5sum" if extract else "md5sum"
        super().__init__(name, parent)
        self.filepath = filepath
        self.expected_md5sum = md5sum
        # Filled in by runtest(); stays None when the test never ran.
        self.observed_md5sum = None
        self.workflow = workflow
        self.extract = extract

    def runtest(self):
        """
        Wait for the workflow, then assert that the observed md5sum equals
        the expected one. The test is skipped when the workflow did not exit
        with the desired exit code.
        """
        # Wait for the workflow to finish before we check the md5sum of a file.
        self.workflow.wait()
        if not self.workflow.matching_exitcode():
            # The trailing space matters: the two literals are concatenated.
            pytest.skip(f"'{self.parent.workflow.name}' did not exit with "
                        f"desired exit code.")
        sum_func = extract_md5sum if self.extract else file_md5sum
        self.observed_md5sum = sum_func(self.filepath)
        assert self.observed_md5sum == self.expected_md5sum

    def repr_failure(self, excinfo, style=None):
        """
        Build the failure message shown when the md5sums differ.
        :param excinfo: exception information supplied by pytest (unused)
        :param style: traceback style supplied by pytest (unused)
        :return: a message naming the observed and expected md5sum
        """
        metric = "extract_md5sum" if self.extract else "md5sum"
        return (
            f"Observed {metric} '{self.observed_md5sum}' not equal to "
            f"expected {metric} '{self.expected_md5sum}' for file "
            f"'{self.filepath}'"
        )
3 changes: 3 additions & 0 deletions src/pytest_workflow/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,7 @@ def __init__(self, contains: Optional[List[str]] = None,
class FileTest(ContentTest):
"""A class that contains all the properties of a to be tested file."""
def __init__(self, path: str, md5sum: Optional[str] = None,
extract_md5sum: Optional[str] = None,
should_exist: bool = DEFAULT_FILE_SHOULD_EXIST,
contains: Optional[List[str]] = None,
must_not_contain: Optional[List[str]] = None,
Expand All @@ -135,6 +136,7 @@ def __init__(self, path: str, md5sum: Optional[str] = None,
A container object
:param path: the path to the file
:param md5sum: md5sum of the file contents
:param extract_md5sum: md5sum of the extracted file contents
:param should_exist: whether the file should exist or not
:param contains: a list of strings that should be present in the file
:param must_not_contain: a list of strings that should not be present
Expand All @@ -150,6 +152,7 @@ def __init__(self, path: str, md5sum: Optional[str] = None,
encoding=encoding)
self.path = Path(path)
self.md5sum = md5sum
self.extract_md5sum = extract_md5sum
self.should_exist = should_exist


Expand Down
4 changes: 4 additions & 0 deletions src/pytest_workflow/schema/schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,10 @@
"should_exist": {
"type": "boolean"
},
"extract_md5sum": {
"type": "string",
"pattern": "^[a-f0-9]{32}$"
},
"contains": {
"type": "array",
"items": {
Expand Down
33 changes: 29 additions & 4 deletions src/pytest_workflow/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,10 @@
import sys
import warnings
from pathlib import Path
from typing import Callable, Iterator, List, Optional, Set, Tuple, Union
from typing import Callable, IO, Iterator, List, Optional, Set, Tuple, Union, \
cast

from xopen import xopen

Filepath = Union[str, os.PathLike]

Expand Down Expand Up @@ -204,10 +207,32 @@ def file_md5sum(filepath: Path, block_size=64 * 1024) -> str:
:param block_size: Block size in bytes
:return: a md5sum as hexadecimal string.
"""
hasher = hashlib.md5()
with filepath.open('rb') as file_handler: # Read the file in bytes
for block in iter(lambda: file_handler.read(block_size), b''):
hasher.update(block)
return file_handle_md5sum(file_handler, block_size)


def extract_md5sum(filepath: Path, block_size=64 * 1024) -> str:
    """
    Calculate the md5sum of the uncompressed contents of a compressed file.
    The contents are consumed block by block to save memory.
    :param filepath: a pathlib.Path to the compressed file
    :param block_size: Block size in bytes
    :return: a md5sum as hexadecimal string.
    """
    # Open in binary mode so the hash is taken over raw bytes.
    with xopen(filepath, 'rb') as opened_file:
        # cast() is a no-op at runtime; it only informs the type checker
        # that the handle yields bytes.
        binary_handle = cast(IO[bytes], opened_file)
        digest = file_handle_md5sum(binary_handle, block_size)
    return digest


def file_handle_md5sum(file_handler: IO[bytes],
                       block_size: int = 64 * 1024) -> str:
    """
    Generates a md5sum for a file handle. Reads file in blocks to save memory.
    :param file_handler: a readable binary file handler
    :param block_size: Block size in bytes. Must not be 0.
    :return: a md5sum as hexadecimal string.
    :raises ValueError: when block_size is 0
    """
    if block_size == 0:
        # read(0) returns b'' immediately, which would end the loop below
        # before any data is hashed and yield the md5sum of empty input.
        raise ValueError("block_size must not be 0")
    hasher = hashlib.md5()
    read_block = file_handler.read
    # iter() with a sentinel keeps calling read until it returns b'' (EOF).
    for block in iter(lambda: read_block(block_size), b''):
        hasher.update(block)
    return hasher.hexdigest()


Expand Down
1 change: 1 addition & 0 deletions tests/test_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,7 @@ def test_filetest_defaults():
assert file_test.contains_regex == []
assert file_test.must_not_contain_regex == []
assert file_test.md5sum is None
assert file_test.extract_md5sum is None
assert file_test.should_exist


Expand Down
11 changes: 10 additions & 1 deletion tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#
# You should have received a copy of the GNU Affero General Public License
# along with pytest-workflow. If not, see <https://www.gnu.org/licenses/>.
import gzip
import hashlib
import itertools
import os
Expand All @@ -25,7 +26,7 @@
import pytest

from pytest_workflow.util import decode_unaligned, duplicate_tree, \
file_md5sum, git_check_submodules_cloned, git_root, \
extract_md5sum, file_md5sum, git_check_submodules_cloned, git_root, \
is_in_dir, link_tree, replace_whitespace

WHITESPACE_TESTS = [
Expand Down Expand Up @@ -163,6 +164,14 @@ def test_file_md5sum(hash_file: Path):
assert whole_file_md5 == per_line_md5


def test_extract_md5sum():
    """extract_md5sum must hash the uncompressed contents of a gzip file."""
    compressed_file = HASH_FILE_DIR / "LICENSE.gz"
    # Reference value: decompress everything with the stdlib and hash it.
    with gzip.open(compressed_file, "rb") as uncompressed:
        expected_md5 = hashlib.md5(uncompressed.read()).hexdigest()
    observed_md5 = extract_md5sum(compressed_file)
    assert expected_md5 == observed_md5


def create_git_repo(path):
dir = Path(path)
os.mkdir(dir)
Expand Down

0 comments on commit 8386a25

Please sign in to comment.