From 4add25ae3364e4c8fd3b611566aaa898b8af2815 Mon Sep 17 00:00:00 2001
From: Ruben Vorderman <r.h.p.vorderman@lumc.nl>
Date: Fri, 13 Jan 2023 12:07:13 +0100
Subject: [PATCH 1/8] New version

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 015d52ba..d4642b64 100644
--- a/setup.py
+++ b/setup.py
@@ -20,7 +20,7 @@
 
 setup(
     name="pytest-workflow",
-    version="2.0.1",
+    version="2.1.0-dev",
     description="A pytest plugin for configuring workflow/pipeline tests "
                 "using YAML files",
     author="Leiden University Medical Center",

From f81a588f634cd9fe048ea2e79c7934e1cde210d2 Mon Sep 17 00:00:00 2001
From: Will Holtz <wholtz@gmail.com>
Date: Fri, 20 Jan 2023 00:49:35 -0800
Subject: [PATCH 2/8] Document test discovery naming conventions (#172)

* docs: conventions for test discovery

* docs: update HISTORY.rst

* add back end of file newline

* use version-independent URL
---
 HISTORY.rst            | 4 ++++
 docs/writing_tests.rst | 4 ++++
 2 files changed, 8 insertions(+)

diff --git a/HISTORY.rst b/HISTORY.rst
index d05d7d92..3a71c2b1 100644
--- a/HISTORY.rst
+++ b/HISTORY.rst
@@ -7,6 +7,10 @@ Changelog
 .. This document is user facing. Please word the changes in such a way
 .. that users understand how the changes affect the new version.
 
+version 2.1.0-dev
+---------------------------
+* Document naming conventions for Python test discovery
+
 version 2.0.1
 ---------------------------
 + Fixed a bug where pytest-workflow would crash on logs that used non-ASCII
diff --git a/docs/writing_tests.rst b/docs/writing_tests.rst
index 9d75d435..db09e66f 100644
--- a/docs/writing_tests.rst
+++ b/docs/writing_tests.rst
@@ -160,6 +160,10 @@ Multiple workflows can use the same custom test like this:
 points to the folder where the named workflow was executed. This allows writing
 of advanced python tests for each file produced by the workflow.
 
+Custom tests must follow the `conventions for Python test discovery
+<https://docs.pytest.org/en/latest/explanation/goodpractices.html#conventions-for-python-test-discovery>`_,
+which constrain the names of files and functions containing custom tests.
+
 .. note::
 
     stdout and stderr are available as files in the root of the

From ec8be3c6f83720047fafd7691bad924e225d9aac Mon Sep 17 00:00:00 2001
From: Will Holtz <wholtz@gmail.com>
Date: Wed, 21 Jun 2023 17:07:12 -0700
Subject: [PATCH 3/8] Add ungzip_md5sum check

---
 HISTORY.rst                       |  5 ++++-
 docs/writing_tests.rst            |  1 +
 src/pytest_workflow/file_tests.py | 30 ++++++++++++++++++++++--------
 src/pytest_workflow/schema.py     |  3 +++
 src/pytest_workflow/util.py       | 31 +++++++++++++++++++++++++++----
 tests/test_schema.py              |  1 +
 tests/test_utils.py               | 11 ++++++++++-
 7 files changed, 68 insertions(+), 14 deletions(-)

diff --git a/HISTORY.rst b/HISTORY.rst
index 3a71c2b1..8b2b8ef0 100644
--- a/HISTORY.rst
+++ b/HISTORY.rst
@@ -9,7 +9,10 @@ Changelog
 
 version 2.1.0-dev
 ---------------------------
-* Document naming conventions for Python test discovery
++ Add md5sum checking on unzipped contents of gzipped output files. Gzipped
+  files contain a timestamp which makes it hard to directly compare the md5sums
+  of gzipped files.
++ Document naming conventions for Python test discovery
 
 version 2.0.1
 ---------------------------
diff --git a/docs/writing_tests.rst b/docs/writing_tests.rst
index db09e66f..0e11a7d7 100644
--- a/docs/writing_tests.rst
+++ b/docs/writing_tests.rst
@@ -64,6 +64,7 @@ Test options
       - path: "TomCruise.txt.gz"       # Gzipped files can also be searched, provided their extension is '.gz'
         contains:
           - "starring"
+        ungzip_md5sum: e27c52f6b5f8152aa3ef58be7bdacc4d   # Md5sum of the ungzipped file (optional)
     stderr:                            # Options for testing stderr (optional)
       contains:                        # A list of strings which should be in stderr (optional)
         - "BSOD error, please contact the IT crowd"
diff --git a/src/pytest_workflow/file_tests.py b/src/pytest_workflow/file_tests.py
index 17642fc8..5503154b 100644
--- a/src/pytest_workflow/file_tests.py
+++ b/src/pytest_workflow/file_tests.py
@@ -22,7 +22,7 @@
 
 from .content_tests import ContentTestCollector
 from .schema import FileTest
-from .util import file_md5sum
+from .util import file_md5sum, gzip_md5sum
 from .workflow import Workflow
 
 
@@ -76,7 +76,16 @@ def collect(self):
                 parent=self,
                 filepath=filepath,
                 md5sum=self.filetest.md5sum,
-                workflow=self.workflow)]
+                workflow=self.workflow,
+                ungzip=False)]
+
+        if self.filetest.ungzip_md5sum:
+            tests += [FileMd5.from_parent(
+                parent=self,
+                filepath=filepath,
+                md5sum=self.filetest.ungzip_md5sum,
+                workflow=self.workflow,
+                ungzip=True)]
 
         return tests
 
@@ -119,20 +128,22 @@ def repr_failure(self, excinfo, style=None):
 
 class FileMd5(pytest.Item):
     def __init__(self, parent: pytest.Collector, filepath: Path,
-                 md5sum: str, workflow: Workflow):
+                 md5sum: str, workflow: Workflow, ungzip: bool):
         """
         Create a tests for the file md5sum.
         :param parent: The collector that started this item
         :param filepath: The path to the file
         :param md5sum:  The expected md5sum
         :param workflow: The workflow running to generate the file
+        :param ungzip: Whether the file should be ungzipped before calculating
         """
-        name = "md5sum"
+        name = "unzip_md5sum" if ungzip else "md5sum"
         super().__init__(name, parent)
         self.filepath = filepath
         self.expected_md5sum = md5sum
         self.observed_md5sum = None
         self.workflow = workflow
+        self.ungzip = ungzip
 
     def runtest(self):
         # Wait for the workflow to finish before we check the md5sum of a file.
@@ -140,11 +151,14 @@ def runtest(self):
         if not self.workflow.matching_exitcode():
             pytest.skip(f"'{self.parent.workflow.name}' did not exit with"
                         f"desired exit code.")
-        self.observed_md5sum = file_md5sum(self.filepath)
+        sum_func = gzip_md5sum if self.ungzip else file_md5sum
+        self.observed_md5sum = sum_func(self.filepath)
         assert self.observed_md5sum == self.expected_md5sum
 
     def repr_failure(self, excinfo, style=None):
+        metric = "ungzip_md5sum" if self.ungzip else "md5sum"
         return (
-            f"Observed md5sum '{self.observed_md5sum}' not equal to expected "
-            f"md5sum '{self.expected_md5sum}' for file '{self.filepath}'"
-        )
+            f"Observed {metric} '{self.observed_md5sum}' not equal to "
+            f"expected {metric} '{self.expected_md5sum}' for file "
+            f"'{self.filepath}'"
+         )
diff --git a/src/pytest_workflow/schema.py b/src/pytest_workflow/schema.py
index c8c3e40f..ed74b6c3 100644
--- a/src/pytest_workflow/schema.py
+++ b/src/pytest_workflow/schema.py
@@ -125,6 +125,7 @@ def __init__(self, contains: Optional[List[str]] = None,
 class FileTest(ContentTest):
     """A class that contains all the properties of a to be tested file."""
     def __init__(self, path: str, md5sum: Optional[str] = None,
+                 ungzip_md5sum: Optional[str] = None,
                  should_exist: bool = DEFAULT_FILE_SHOULD_EXIST,
                  contains: Optional[List[str]] = None,
                  must_not_contain: Optional[List[str]] = None,
@@ -135,6 +136,7 @@ def __init__(self, path: str, md5sum: Optional[str] = None,
         A container object
         :param path: the path to the file
         :param md5sum: md5sum of the file contents
+        :param unzip_md5sum: md5sum of the unzipped file contents
         :param should_exist: whether the file should exist or not
         :param contains: a list of strings that should be present in the file
         :param must_not_contain: a list of strings that should not be present
@@ -150,6 +152,7 @@ def __init__(self, path: str, md5sum: Optional[str] = None,
                          encoding=encoding)
         self.path = Path(path)
         self.md5sum = md5sum
+        self.ungzip_md5sum = ungzip_md5sum
         self.should_exist = should_exist
 
 
diff --git a/src/pytest_workflow/util.py b/src/pytest_workflow/util.py
index a7c91bc0..a91c9b5c 100644
--- a/src/pytest_workflow/util.py
+++ b/src/pytest_workflow/util.py
@@ -1,4 +1,5 @@
 import functools
+import gzip
 import hashlib
 import os
 import re
@@ -7,7 +8,7 @@
 import sys
 import warnings
 from pathlib import Path
-from typing import Callable, Iterator, List, Optional, Set, Tuple, Union
+from typing import BinaryIO, Callable, Iterator, List, Optional, Set, Tuple, Union
 
 Filepath = Union[str, os.PathLike]
 
@@ -204,10 +205,32 @@ def file_md5sum(filepath: Path, block_size=64 * 1024) -> str:
     :param block_size: Block size in bytes
     :return: a md5sum as hexadecimal string.
     """
-    hasher = hashlib.md5()
     with filepath.open('rb') as file_handler:  # Read the file in bytes
-        for block in iter(lambda: file_handler.read(block_size), b''):
-            hasher.update(block)
+        return file_handle_md5sum(file_handler, block_size)
+
+
+def gzip_md5sum(filepath: Path, block_size=64 * 1024) -> str:
+    """
+    Generates a md5sum for the uncompressed contents of gzipped file.
+    Reads file in blocks to save memory.
+    :param filepath: a pathlib. Path to the gzipped file
+    :param block_size: Block size in bytes
+    :return: a md5sum as hexadecimal string.
+    """
+    with gzip.open(filepath) as file_handler:  # Read the file in bytes
+        return file_handle_md5sum(file_handler, block_size)
+
+
+def file_handle_md5sum(file_handler: BinaryIO, block_size) -> str:
+    """
+    Generates a md5sum for a file handle. Reads file in blocks to save memory.
+    :param file_handler: a readable binary file handler
+    :param block_size: Block size in bytes
+    :return: a md5sum as hexadecimal string.
+    """
+    hasher = hashlib.md5()
+    for block in iter(lambda: file_handler.read(block_size), b''):
+        hasher.update(block)
     return hasher.hexdigest()
 
 
diff --git a/tests/test_schema.py b/tests/test_schema.py
index 378be288..98edfc95 100644
--- a/tests/test_schema.py
+++ b/tests/test_schema.py
@@ -171,6 +171,7 @@ def test_filetest_defaults():
     assert file_test.contains_regex == []
     assert file_test.must_not_contain_regex == []
     assert file_test.md5sum is None
+    assert file_test.ungzip_md5sum is None
     assert file_test.should_exist
 
 
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 45f789c3..b2f9f376 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -13,6 +13,7 @@
 #
 # You should have received a copy of the GNU Affero General Public License
 # along with pytest-workflow.  If not, see <https://www.gnu.org/licenses/
+import gzip
 import hashlib
 import itertools
 import os
@@ -26,7 +27,7 @@
 
 from pytest_workflow.util import decode_unaligned, duplicate_tree, \
     file_md5sum, git_check_submodules_cloned, git_root, \
-    is_in_dir, link_tree, replace_whitespace
+    gzip_md5sum, is_in_dir, link_tree, replace_whitespace
 
 WHITESPACE_TESTS = [
     ("bla\nbla", "bla_bla"),
@@ -163,6 +164,14 @@ def test_file_md5sum(hash_file: Path):
     assert whole_file_md5 == per_line_md5
 
 
+def test_gzip_md5sum():
+    hash_file = HASH_FILE_DIR / "LICENSE.gz"
+    with gzip.open(hash_file, "rb") as contents_fh:
+        whole_file_md5 = hashlib.md5(contents_fh.read()).hexdigest()
+    per_line_md5 = gzip_md5sum(hash_file)
+    assert whole_file_md5 == per_line_md5
+
+
 def create_git_repo(path):
     dir = Path(path)
     os.mkdir(dir)

From b83cf3dba7c8f697c34bbaef79d664cc53b1e50d Mon Sep 17 00:00:00 2001
From: Will Holtz <wholtz@gmail.com>
Date: Wed, 21 Jun 2023 17:36:15 -0700
Subject: [PATCH 4/8] add ungzip_md5sum to schema.json

---
 src/pytest_workflow/schema/schema.json | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/pytest_workflow/schema/schema.json b/src/pytest_workflow/schema/schema.json
index 9ead66bb..82ec80c1 100644
--- a/src/pytest_workflow/schema/schema.json
+++ b/src/pytest_workflow/schema/schema.json
@@ -123,6 +123,10 @@
             "should_exist": {
               "type": "boolean"
             },
+            "ungzip_md5sum": {
+              "type": "string",
+              "pattern": "^[a-f0-9]{32}$"
+            },
             "contains": {
               "type": "array",
               "items": {

From 684f314de1b3da2234e53c9de923eb037652169d Mon Sep 17 00:00:00 2001
From: Will Holtz <wholtz@gmail.com>
Date: Wed, 21 Jun 2023 17:54:59 -0700
Subject: [PATCH 5/8] fix unzip_md5sum to ungzip_md5sum

---
 src/pytest_workflow/file_tests.py | 2 +-
 src/pytest_workflow/schema.py     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/pytest_workflow/file_tests.py b/src/pytest_workflow/file_tests.py
index 5503154b..99030c79 100644
--- a/src/pytest_workflow/file_tests.py
+++ b/src/pytest_workflow/file_tests.py
@@ -137,7 +137,7 @@ def __init__(self, parent: pytest.Collector, filepath: Path,
         :param workflow: The workflow running to generate the file
         :param ungzip: Whether the file should be ungzipped before calculating
         """
-        name = "unzip_md5sum" if ungzip else "md5sum"
+        name = "ungzip_md5sum" if ungzip else "md5sum"
         super().__init__(name, parent)
         self.filepath = filepath
         self.expected_md5sum = md5sum
diff --git a/src/pytest_workflow/schema.py b/src/pytest_workflow/schema.py
index ed74b6c3..d4eada2f 100644
--- a/src/pytest_workflow/schema.py
+++ b/src/pytest_workflow/schema.py
@@ -136,7 +136,7 @@ def __init__(self, path: str, md5sum: Optional[str] = None,
         A container object
         :param path: the path to the file
         :param md5sum: md5sum of the file contents
-        :param unzip_md5sum: md5sum of the unzipped file contents
+        :param ungzip_md5sum: md5sum of the unzipped file contents
         :param should_exist: whether the file should exist or not
         :param contains: a list of strings that should be present in the file
         :param must_not_contain: a list of strings that should not be present

From a22950cf58a531d7db6a5cc29d8d032f0634f98f Mon Sep 17 00:00:00 2001
From: Will Holtz <wholtz@gmail.com>
Date: Wed, 21 Jun 2023 18:16:28 -0700
Subject: [PATCH 6/8] fix typing for gzip file handles

---
 src/pytest_workflow/util.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/pytest_workflow/util.py b/src/pytest_workflow/util.py
index a91c9b5c..491033d0 100644
--- a/src/pytest_workflow/util.py
+++ b/src/pytest_workflow/util.py
@@ -8,7 +8,8 @@
 import sys
 import warnings
 from pathlib import Path
-from typing import BinaryIO, Callable, Iterator, List, Optional, Set, Tuple, Union
+from typing import Callable, IO, Iterator, List, Optional, Set, Tuple, Union, \
+                   cast
 
 Filepath = Union[str, os.PathLike]
 
@@ -218,10 +219,10 @@ def gzip_md5sum(filepath: Path, block_size=64 * 1024) -> str:
     :return: a md5sum as hexadecimal string.
     """
     with gzip.open(filepath) as file_handler:  # Read the file in bytes
-        return file_handle_md5sum(file_handler, block_size)
+        return file_handle_md5sum(cast(IO[bytes], file_handler), block_size)
 
 
-def file_handle_md5sum(file_handler: BinaryIO, block_size) -> str:
+def file_handle_md5sum(file_handler: IO[bytes], block_size) -> str:
     """
     Generates a md5sum for a file handle. Reads file in blocks to save memory.
     :param file_handler: a readable binary file handler

From 2f583bf633750556f65fd89242f1e077382d4632 Mon Sep 17 00:00:00 2001
From: Will Holtz <wholtz@gmail.com>
Date: Fri, 23 Jun 2023 12:52:41 -0700
Subject: [PATCH 7/8] move to extract_md5sum using xopen

---
 HISTORY.rst                            |  6 +++---
 docs/writing_tests.rst                 |  8 +++++++-
 requirements.txt                       |  4 +++-
 setup.py                               |  4 +++-
 src/pytest_workflow/file_tests.py      | 22 +++++++++++-----------
 src/pytest_workflow/schema.py          |  6 +++---
 src/pytest_workflow/schema/schema.json |  2 +-
 src/pytest_workflow/util.py            | 11 ++++++-----
 tests/test_schema.py                   |  2 +-
 tests/test_utils.py                    |  8 ++++----
 10 files changed, 42 insertions(+), 31 deletions(-)

diff --git a/HISTORY.rst b/HISTORY.rst
index 8b2b8ef0..455177f1 100644
--- a/HISTORY.rst
+++ b/HISTORY.rst
@@ -9,9 +9,9 @@ Changelog
 
 version 2.1.0-dev
 ---------------------------
-+ Add md5sum checking on unzipped contents of gzipped output files. Gzipped
-  files contain a timestamp which makes it hard to directly compare the md5sums
-  of gzipped files.
++ Add extract_md5sum check on uncompressed contents of compressed output files.
+  Gzipped files contain a timestamp which makes it hard to directly compare the
+  md5sums of gzipped files.
 + Document naming conventions for Python test discovery
 
 version 2.0.1
diff --git a/docs/writing_tests.rst b/docs/writing_tests.rst
index 0e11a7d7..0a3896d1 100644
--- a/docs/writing_tests.rst
+++ b/docs/writing_tests.rst
@@ -64,7 +64,7 @@ Test options
       - path: "TomCruise.txt.gz"       # Gzipped files can also be searched, provided their extension is '.gz'
         contains:
           - "starring"
-        ungzip_md5sum: e27c52f6b5f8152aa3ef58be7bdacc4d   # Md5sum of the ungzipped file (optional)
+        extract_md5sum: e27c52f6b5f8152aa3ef58be7bdacc4d   # Md5sum of the uncompressed file (optional)
     stderr:                            # Options for testing stderr (optional)
       contains:                        # A list of strings which should be in stderr (optional)
         - "BSOD error, please contact the IT crowd"
@@ -90,6 +90,12 @@ Please see the `Python documentation on regular expressions
 <https://docs.python.org/3/library/re.html>`_ to see how Python handles escape
 sequences.
 
+The ``extract_md5sum`` option is used to uncompress a file and then compare
+the md5sum of the uncompressed file with the supplied md5sum. This option is
+particularly useful when testing gzipped files, which may contain a file
+creation timestamp in the gzip header. The supported compressed file
+formats for this option are gzip, bzip2, xz and Zstandard.
+
 .. note::
     Workflow names must be unique. Pytest workflow will crash when multiple
     workflows have the same name, even if they are in different files.
diff --git a/requirements.txt b/requirements.txt
index 2c5d3bff..884f5ec3 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,5 @@
 pyyaml
 pytest>=7.0.0
-jsonschema
\ No newline at end of file
+jsonschema
+xopen>=1.7.0
+zstandard
diff --git a/setup.py b/setup.py
index d4642b64..e4de914f 100644
--- a/setup.py
+++ b/setup.py
@@ -54,7 +54,9 @@
     install_requires=[
         "pytest>=7.0.0",  # To use pathlib Path's in pytest
         "pyyaml",
-        "jsonschema"
+        "jsonschema",
+        "xopen>=1.7.0",  # Same minimum version as requirements.txt
+        "zstandard",
     ],
     # This line makes sure the plugin is automatically loaded when it is
     # installed in the same environment as pytest. No need to configure
diff --git a/src/pytest_workflow/file_tests.py b/src/pytest_workflow/file_tests.py
index 99030c79..1f98b1c2 100644
--- a/src/pytest_workflow/file_tests.py
+++ b/src/pytest_workflow/file_tests.py
@@ -22,7 +22,7 @@
 
 from .content_tests import ContentTestCollector
 from .schema import FileTest
-from .util import file_md5sum, gzip_md5sum
+from .util import extract_md5sum, file_md5sum
 from .workflow import Workflow
 
 
@@ -77,15 +77,15 @@ def collect(self):
                 filepath=filepath,
                 md5sum=self.filetest.md5sum,
                 workflow=self.workflow,
-                ungzip=False)]
+                extract=False)]
 
-        if self.filetest.ungzip_md5sum:
+        if self.filetest.extract_md5sum:
             tests += [FileMd5.from_parent(
                 parent=self,
                 filepath=filepath,
-                md5sum=self.filetest.ungzip_md5sum,
+                md5sum=self.filetest.extract_md5sum,
                 workflow=self.workflow,
-                ungzip=True)]
+                extract=True)]
 
         return tests
 
@@ -128,22 +128,22 @@ def repr_failure(self, excinfo, style=None):
 
 class FileMd5(pytest.Item):
     def __init__(self, parent: pytest.Collector, filepath: Path,
-                 md5sum: str, workflow: Workflow, ungzip: bool):
+                 md5sum: str, workflow: Workflow, extract: bool):
         """
         Create a tests for the file md5sum.
         :param parent: The collector that started this item
         :param filepath: The path to the file
         :param md5sum:  The expected md5sum
         :param workflow: The workflow running to generate the file
-        :param ungzip: Whether the file should be ungzipped before calculating
+        :param extract: Whether the file should be extracted before calculating
         """
-        name = "ungzip_md5sum" if ungzip else "md5sum"
+        name = "extract_md5sum" if extract else "md5sum"
         super().__init__(name, parent)
         self.filepath = filepath
         self.expected_md5sum = md5sum
         self.observed_md5sum = None
         self.workflow = workflow
-        self.ungzip = ungzip
+        self.extract = extract
 
     def runtest(self):
         # Wait for the workflow to finish before we check the md5sum of a file.
@@ -151,12 +151,12 @@ def runtest(self):
         if not self.workflow.matching_exitcode():
             pytest.skip(f"'{self.parent.workflow.name}' did not exit with"
                         f"desired exit code.")
-        sum_func = gzip_md5sum if self.ungzip else file_md5sum
+        sum_func = extract_md5sum if self.extract else file_md5sum
         self.observed_md5sum = sum_func(self.filepath)
         assert self.observed_md5sum == self.expected_md5sum
 
     def repr_failure(self, excinfo, style=None):
-        metric = "ungzip_md5sum" if self.ungzip else "md5sum"
+        metric = "extract_md5sum" if self.extract else "md5sum"
         return (
             f"Observed {metric} '{self.observed_md5sum}' not equal to "
             f"expected {metric} '{self.expected_md5sum}' for file "
diff --git a/src/pytest_workflow/schema.py b/src/pytest_workflow/schema.py
index d4eada2f..499dd416 100644
--- a/src/pytest_workflow/schema.py
+++ b/src/pytest_workflow/schema.py
@@ -125,7 +125,7 @@ def __init__(self, contains: Optional[List[str]] = None,
 class FileTest(ContentTest):
     """A class that contains all the properties of a to be tested file."""
     def __init__(self, path: str, md5sum: Optional[str] = None,
-                 ungzip_md5sum: Optional[str] = None,
+                 extract_md5sum: Optional[str] = None,
                  should_exist: bool = DEFAULT_FILE_SHOULD_EXIST,
                  contains: Optional[List[str]] = None,
                  must_not_contain: Optional[List[str]] = None,
@@ -136,7 +136,7 @@ def __init__(self, path: str, md5sum: Optional[str] = None,
         A container object
         :param path: the path to the file
         :param md5sum: md5sum of the file contents
-        :param ungzip_md5sum: md5sum of the unzipped file contents
+        :param extract_md5sum: md5sum of the extracted file contents
         :param should_exist: whether the file should exist or not
         :param contains: a list of strings that should be present in the file
         :param must_not_contain: a list of strings that should not be present
@@ -152,7 +152,7 @@ def __init__(self, path: str, md5sum: Optional[str] = None,
                          encoding=encoding)
         self.path = Path(path)
         self.md5sum = md5sum
-        self.ungzip_md5sum = ungzip_md5sum
+        self.extract_md5sum = extract_md5sum
         self.should_exist = should_exist
 
 
diff --git a/src/pytest_workflow/schema/schema.json b/src/pytest_workflow/schema/schema.json
index 82ec80c1..718b6c25 100644
--- a/src/pytest_workflow/schema/schema.json
+++ b/src/pytest_workflow/schema/schema.json
@@ -123,7 +123,7 @@
             "should_exist": {
               "type": "boolean"
             },
-            "ungzip_md5sum": {
+            "extract_md5sum": {
               "type": "string",
               "pattern": "^[a-f0-9]{32}$"
             },
diff --git a/src/pytest_workflow/util.py b/src/pytest_workflow/util.py
index 491033d0..beed52af 100644
--- a/src/pytest_workflow/util.py
+++ b/src/pytest_workflow/util.py
@@ -1,5 +1,4 @@
 import functools
-import gzip
 import hashlib
 import os
 import re
@@ -11,6 +10,8 @@
 from typing import Callable, IO, Iterator, List, Optional, Set, Tuple, Union, \
                    cast
 
+from xopen import xopen
+
 Filepath = Union[str, os.PathLike]
 
 
@@ -210,15 +211,15 @@ def file_md5sum(filepath: Path, block_size=64 * 1024) -> str:
         return file_handle_md5sum(file_handler, block_size)
 
 
-def gzip_md5sum(filepath: Path, block_size=64 * 1024) -> str:
+def extract_md5sum(filepath: Path, block_size=64 * 1024) -> str:
     """
-    Generates a md5sum for the uncompressed contents of gzipped file.
+    Generates a md5sum for the uncompressed contents of a compressed file.
     Reads file in blocks to save memory.
-    :param filepath: a pathlib. Path to the gzipped file
+    :param filepath: a pathlib. Path to the compressed file
     :param block_size: Block size in bytes
     :return: a md5sum as hexadecimal string.
     """
-    with gzip.open(filepath) as file_handler:  # Read the file in bytes
+    with xopen(filepath, 'rb') as file_handler:  # Read the file in bytes
         return file_handle_md5sum(cast(IO[bytes], file_handler), block_size)
 
 
diff --git a/tests/test_schema.py b/tests/test_schema.py
index 98edfc95..8defda21 100644
--- a/tests/test_schema.py
+++ b/tests/test_schema.py
@@ -171,7 +171,7 @@ def test_filetest_defaults():
     assert file_test.contains_regex == []
     assert file_test.must_not_contain_regex == []
     assert file_test.md5sum is None
-    assert file_test.ungzip_md5sum is None
+    assert file_test.extract_md5sum is None
     assert file_test.should_exist
 
 
diff --git a/tests/test_utils.py b/tests/test_utils.py
index b2f9f376..574225bd 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -26,8 +26,8 @@
 import pytest
 
 from pytest_workflow.util import decode_unaligned, duplicate_tree, \
-    file_md5sum, git_check_submodules_cloned, git_root, \
-    gzip_md5sum, is_in_dir, link_tree, replace_whitespace
+    extract_md5sum, file_md5sum, git_check_submodules_cloned, git_root, \
+    is_in_dir, link_tree, replace_whitespace
 
 WHITESPACE_TESTS = [
     ("bla\nbla", "bla_bla"),
@@ -164,11 +164,11 @@ def test_file_md5sum(hash_file: Path):
     assert whole_file_md5 == per_line_md5
 
 
-def test_gzip_md5sum():
+def test_extract_md5sum():
     hash_file = HASH_FILE_DIR / "LICENSE.gz"
     with gzip.open(hash_file, "rb") as contents_fh:
         whole_file_md5 = hashlib.md5(contents_fh.read()).hexdigest()
-    per_line_md5 = gzip_md5sum(hash_file)
+    per_line_md5 = extract_md5sum(hash_file)
     assert whole_file_md5 == per_line_md5
 
 

From dd69ee7153313631b09ebf178eedddf537820cfe Mon Sep 17 00:00:00 2001
From: Ruben Vorderman <r.h.p.vorderman@lumc.nl>
Date: Mon, 26 Jun 2023 08:51:27 +0200
Subject: [PATCH 8/8] Also include extract_md5sum keyword on the readme

---
 README.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.rst b/README.rst
index c53eb866..3e19015c 100644
--- a/README.rst
+++ b/README.rst
@@ -127,6 +127,7 @@ predefined tests as well as custom tests are possible.
       - path: "TomCruise.txt.gz"       # Gzipped files can also be searched, provided their extension is '.gz'
         contains:
           - "starring"
+        extract_md5sum: e27c52f6b5f8152aa3ef58be7bdacc4d   # Md5sum of the uncompressed file (optional)
     stderr:                            # Options for testing stderr (optional)
       contains:                        # A list of strings which should be in stderr (optional)
         - "BSOD error, please contact the IT crowd"