Ignore n levels instead of just leaves

blingenf · Apr 6, 2024 · e183099 · e183099
1 parent ba07281
commit e183099
Show file tree

Hide file tree

Showing 6 changed files with 103 additions and 49 deletions.
diff --git a/README.md b/README.md
@@ -31,7 +31,8 @@ Configuration options can be provided either by using the command line arguments
 - `display_threshold` (`-d`, `--display-thresh`): the similarity percentage cutoff for displaying similar files on the detector report.
 - `force_language` (`-o`, `--force-language`): forces the tokenizer to tokenize input as a specific language, rather than automatically detecting the language using the file extension.
 - `same_name_only` (`-s`, `--same-name`): if `true`, the detector will only compare files that have the same name (for example, `decision_tree.py` will not be compared to `k_nn.py`). Note that this also means that, for example, `bryson_k_nn.py` will not be compared to `sara_k_nn.py`.
-- `ignore_leaf` (`-l`, `--ignore-leaf`):  if `true`, the detector will not compare files located in the same leaf directory.
+- `ignore_leaf` (`-l`, `--ignore-leaf`):  if `true`, the detector will not compare files located in the same leaf directory (equivalent to setting `--ignore-depth 1`).
+- `ignore_depth` (`--ignore-depth`):  if set to `n`, the detector will not compare files whose n'th parent directory resolves to the same path.
 - `disable_filtering` (`-f`, `--disable-filter`):  if `true`, the detector will not tokenize and filter code before generating file fingerprints.
 - `disable_autoopen` (`-a`, `--disable-autoopen`):  if `true`, the detector will not automatically open a browser window to display the report.
 - `truncate` (`-T`, `--truncate`):  if `true`, highlighted code will be truncated to remove non-highlighted regions from the displayed output (sections not within 10 lines of highlighted code will be replaced with "...").

diff --git a/copydetect/__main__.py b/copydetect/__main__.py
@@ -59,10 +59,16 @@ def main():
     parser.add_argument("-s", '--same-name', dest='same_name',
                         action='store_true', default=False,
                         help="only compare files which have the same name")
-    parser.add_argument("-l", '--ignore-leaf', dest='ignore_leaf',
-                        action='store_true', default=False,
-                        help="don't compare files located in the same "
-                        "leaf directory")
+
+    # --ignore-leaf and --ignore-depth both write to ``ignore_depth``.
+    # argparse takes the default for a dest from the first action that
+    # was registered for it, so the default has to be given on
+    # --ignore-leaf as well; otherwise the value would be None whenever
+    # neither option is passed.
+    grouping = parser.add_mutually_exclusive_group()
+    grouping.add_argument("-l", '--ignore-leaf', dest='ignore_depth',
+                          action='store_const', const=1, default=0,
+                          help="don't compare files located in the same "
+                          "leaf directory")
+    grouping.add_argument('--ignore-depth', dest='ignore_depth',
+                          type=int, help="don't compare files whose n'th parent "
+                          "is the same folder", default=0)
+
     parser.add_argument("-f", '--disable-filter', dest='filter',
                         action='store_true', default=False,
                         help="disable code tokenization and filtering")
@@ -111,7 +117,7 @@ def main():
           "display_threshold" : args.display_thresh,
           "force_language" : args.language,
           "same_name_only" : args.same_name,
-          "ignore_leaf" : args.ignore_leaf,
+          "ignore_depth" : args.ignore_depth,
           "disable_filtering" : args.filter,
           "disable_autoopen" : args.autoopen,
           "truncate" : args.truncate,

diff --git a/copydetect/_config.py b/copydetect/_config.py
@@ -20,7 +20,7 @@ class CopydetectConfig:
     guarantee_t: int = defaults.GUARANTEE_THRESHOLD
     display_t: float = defaults.DISPLAY_THRESHOLD
     same_name_only: bool = False
-    ignore_leaf: bool = False
+    ignore_depth: int = 0
     autoopen: bool = True
     disable_filtering: bool = False
     force_language: Optional[str] = None
@@ -52,8 +52,8 @@ def _check_arguments(self):
             raise TypeError("Boilerplate directories must be a list")
         if not isinstance(self.same_name_only, bool):
             raise TypeError("same_name_only must be true or false")
-        if not isinstance(self.ignore_leaf, bool):
-            raise TypeError("ignore_leaf must be true or false")
+        if not isinstance(self.ignore_depth, int):
+            raise TypeError("ignore_depth must be an integer")
         if not isinstance(self.disable_filtering, bool):
             raise TypeError("disable_filtering must be true or false")
         if not isinstance(self.autoopen, bool):

diff --git a/copydetect/detector.py b/copydetect/detector.py
@@ -2,7 +2,7 @@
 a set of test files (files to check for plagiarism) and a set of
 reference files (files that might have been plagiarised from).
 """
-
+from collections import defaultdict
 from pathlib import Path
 import time
 import logging
@@ -221,9 +221,9 @@ class CopyDetector:
     same_name_only : bool
         If true, the detector will only compare files that have the
         same name
-    ignore_leaf : bool
-        If true, the detector will not compare files located in the
-        same leaf directory.
+    ignore_depth : int
+        If set to n > 0, the detector will not compare files whose
+        n'th parent directories resolve to the same path (1 means
+        the same leaf directory). 0 disables this check.
     autoopen : bool
         If true, the detector will automatically open a webbrowser to
         display the results of generate_html_report
@@ -252,7 +252,7 @@ def __init__(self, test_dirs=None, ref_dirs=None,
                  noise_t=defaults.NOISE_THRESHOLD,
                  guarantee_t=defaults.GUARANTEE_THRESHOLD,
                  display_t=defaults.DISPLAY_THRESHOLD,
-                 same_name_only=False, ignore_leaf=False, autoopen=True,
+                 same_name_only=False, ignore_depth=0, autoopen=True,
                  disable_filtering=False, force_language=None,
                  truncate=False, out_file="./report.html", css_files=None,
                  silent=False, encoding: str = "utf-8"):
@@ -322,27 +322,29 @@ def _get_file_list(self, dirs, exts):
         # convert to a set to remove duplicates, then back to a list
         return list(set(file_list))
 
-    def add_file(self, filename, type="testref"):
+    def add_file(self, filename, file_type="testref"):
         """Adds a file to the list of test files, reference files, or
         boilerplate files.
 
         Parameters
         ----------
         filename : str
             Name of file to add.
-        type : {"testref", "test", "ref", "boilerplate"}
+        file_type : {"testref", "test", "ref", "boilerplate"}
             Type of file to add. "testref" will add the file as both a
             test and reference file.
         """
-        if type == "testref":
+        if file_type == "testref":
             self.test_files.append(filename)
             self.ref_files.append(filename)
-        elif type == "test":
+        elif file_type == "test":
             self.test_files.append(filename)
-        elif type == "ref":
+        elif file_type == "ref":
             self.ref_files.append(filename)
-        elif type == "boilerplate":
+        elif file_type == "boilerplate":
             self.boilerplate_files.append(filename)
+        else:
+            raise ValueError(file_type)
 
     def _get_boilerplate_hashes(self):
         """Generates a list of hashes of the boilerplate text. Returns
@@ -409,35 +411,30 @@ def _comparison_loop(self):
         # test and reference files
         comparisons = {}
 
-        for i, test_f in enumerate(
-            tqdm(self.test_files,
-                 bar_format= '   {l_bar}{bar}{r_bar}',
-                 disable=self.conf.silent)
-        ):
-            for j, ref_f in enumerate(self.ref_files):
-                if (test_f not in self.file_data
-                        or ref_f not in self.file_data
-                        or test_f == ref_f
-                        or (self.conf.same_name_only
-                            and (Path(test_f).name != Path(ref_f).name))
-                        or (self.conf.ignore_leaf
-                            and (Path(test_f).parent == Path(ref_f).parent))):
-                    continue
+        test_indices = {f: i for i, f in enumerate(self.test_files)}
+        ref_indices = {f: i for i, f in enumerate(self.ref_files)}
 
-                if (ref_f, test_f) in comparisons:
-                    ref_idx, test_idx = comparisons[(ref_f, test_f)]
-                    overlap = self.token_overlap_matrix[ref_idx, test_idx]
-                    sim2, sim1 = self.similarity_matrix[ref_idx, test_idx]
-                else:
-                    overlap, (sim1, sim2), (slices1, slices2) = compare_files(
-                        self.file_data[test_f], self.file_data[ref_f]
-                    )
-                    comparisons[(test_f, ref_f)] = (i, j)
-                    if slices1.shape[0] != 0:
-                        self.slice_matrix[(test_f, ref_f)] = [slices1, slices2]
+        for test_f, ref_f in tqdm(self.get_comparison_pairs(),
+                                  bar_format='   {l_bar}{bar}{r_bar}',
+                                  disable=self.conf.silent
+                                  ):
+            i = test_indices[test_f]
+            j = ref_indices[ref_f]
+
+            if (ref_f, test_f) in comparisons:
+                ref_idx, test_idx = comparisons[(ref_f, test_f)]
+                overlap = self.token_overlap_matrix[ref_idx, test_idx]
+                sim2, sim1 = self.similarity_matrix[ref_idx, test_idx]
+            else:
+                overlap, (sim1, sim2), (slices1, slices2) = compare_files(
+                    self.file_data[test_f], self.file_data[ref_f]
+                )
+                comparisons[(test_f, ref_f)] = (i, j)
+                if slices1.shape[0] != 0:
+                    self.slice_matrix[(test_f, ref_f)] = [slices1, slices2]
 
-                self.similarity_matrix[i, j] = np.array([sim1, sim2])
-                self.token_overlap_matrix[i, j] = overlap
+            self.similarity_matrix[i, j] = np.array([sim1, sim2])
+            self.token_overlap_matrix[i, j] = overlap
 
     def run(self):
         """Runs the copy detection loop for detecting overlap between
@@ -467,6 +464,37 @@ def run(self):
             if not self.conf.silent:
                 print(f"{time.time()-start_time:6.2f}: Code comparison completed")
 
+    def get_comparison_pairs(self):
+        """Get a set of file pairs that are considered during the
+        comparison.
+
+        A (test file, reference file) pair is left out when either
+        file has no entry in ``self.file_data``, when both entries are
+        the same file name, when ``same_name_only`` is set and the
+        names of the resolved paths differ, or when ``ignore_depth``
+        is ``n > 0`` and the n'th parent directories of the resolved
+        paths are equal.
+
+        Returns
+        -------
+        set
+            set of ``(test_f, ref_f)`` tuples that are considered
+            during comparison. A set has no defined iteration order.
+        """
+        # Index into Path.parents: parents[0] is the directory holding
+        # the file, so ignore_depth=1 (same leaf) maps to index 0. A
+        # value below 0 means the depth check is disabled.
+        depth = self.conf.ignore_depth - 1
+
+        # Resolve every usable path once up front instead of once per
+        # (test, ref) pair; resolve() touches the file system.
+        test_paths = {f: Path(f).resolve() for f in self.test_files
+                      if f in self.file_data}
+        ref_paths = {f: Path(f).resolve() for f in self.ref_files
+                     if f in self.file_data}
+
+        compared_files = set()
+
+        for test_f, test_path in test_paths.items():
+            test_parents = test_path.parents
+            for ref_f, ref_path in ref_paths.items():
+                if test_f == ref_f:
+                    continue
+                if (self.conf.same_name_only
+                        and test_path.name != ref_path.name):
+                    continue
+                if depth >= 0:
+                    ref_parents = ref_path.parents
+                    # Strictly greater: parents[depth] only exists when
+                    # the path has more than ``depth`` ancestors. Paths
+                    # that are too shallow can never share that parent.
+                    if (len(test_parents) > depth
+                            and len(ref_parents) > depth
+                            and test_parents[depth] == ref_parents[depth]):
+                        continue
+                compared_files.add((test_f, ref_f))
+        return compared_files
+
     def get_copied_code_list(self):
         """Get a list of copied code to display on the output report.
         Returns a list of tuples containing the similarity score, the

diff --git a/docs/cmdline.rst b/docs/cmdline.rst
@@ -27,7 +27,8 @@ Configuration options can be provided either by using the command line arguments
 - ``display_threshold`` (``-d``, ``--display-thresh``): the similarity percentage cutoff for displaying similar files on the detector report.
 - ``force_language`` (``-o``, ``--force-language``): forces the tokenizer to tokenize input as a specific language, rather than automatically detecting the language using the file extension.
 - ``same_name_only`` (``-s``, ``--same-name``): if ``true``, the detector will only compare files that have the same name (for example, ``decision_tree.py`` will not be compared to ``k_nn.py``). Note that this also means that, for example, ``bryson_k_nn.py`` will not be compared to ``sara_k_nn.py``.
-- ``ignore_leaf`` (``-l``, ``--ignore-leaf``):  if ``true``, the detector will not compare files located in the same leaf directory.
+- ``ignore_leaf`` (``-l``, ``--ignore-leaf``):  if ``true``, the detector will not compare files located in the same leaf directory (equivalent to setting ``--ignore-depth 1``).
+- ``ignore_depth`` (``--ignore-depth``):  if set to ``n``, the detector will not compare files whose n'th parent directory resolves to the same path.
 - ``disable_filtering`` (``-f``, ``--disable-filter``):  if ``true``, the detector will not tokenize and filter code before generating file fingerprints.
 - ``disable_autoopen`` (``-a``, ``--disable-autoopen``):  if ``true``, the detector will not automatically open a browser window to display the report.
 - ``truncate`` (``-T``, ``--truncate``):  if ``true``, highlighted code will be truncated to remove non-highlighted regions from the displayed output (sections not within 10 lines of highlighted code will be replaced with "...").

diff --git a/tests/test_detector.py b/tests/test_detector.py
@@ -171,14 +171,32 @@ def test_compare_boilerplate(self):
 class TestParameters():
     """Test cases for individual parameters"""
     def test_ignore_leaf(self):
+        # TODO Once ignore_leaf is added with deprecation warning, also test it
         detector = CopyDetector(test_dirs=[TESTS_DIR + "/sample_py"],
-                                ignore_leaf=True, silent=True)
+                                ignore_depth=1, silent=True)
         detector.run()
 
         # sample1 and sample2 should not have been compared
         # + 4 self compares = 6 total skips
         assert np.sum(detector.similarity_matrix[:,:,0] == -1) == 6
 
+    def test_ignore_depth_empty(self):
+        """ignore_depth=2 on two sibling sample directories leaves no
+        pair to compare."""
+        sample_dirs = [TESTS_DIR + "/sample_other",
+                       TESTS_DIR + "/sample_sanity_check"]
+        detector = CopyDetector(test_dirs=sample_dirs, silent=True,
+                                ignore_depth=2)
+        detector.run()
+
+        # No files should be compared
+        first_scores = detector.similarity_matrix[:, :, 0]
+        assert (first_scores == -1).all()
+        assert not detector.get_comparison_pairs()
+
+    def test_ignore_depth(self):
+        """ignore_depth=2 on sample_py fills in exactly six entries of
+        the similarity matrix."""
+        detector = CopyDetector(test_dirs=[TESTS_DIR + "/sample_py"],
+                                silent=True, ignore_depth=2)
+        detector.run()
+
+        # Only compare /handout.py with the files in /boilerplate and /code
+        was_compared = detector.similarity_matrix[:, :, 0] != -1
+        assert np.sum(was_compared) == 6
+
     def test_same_name_only(self):
         detector = CopyDetector(test_dirs=[TESTS_DIR + "/sample_py"],
                                 same_name_only=True, silent=True)