Skip to content

Commit

Permalink
Ignore n levels instead of just leaves
Browse files Browse the repository at this point in the history
  • Loading branch information
incaseoftrouble committed Apr 6, 2024
1 parent ba07281 commit e183099
Show file tree
Hide file tree
Showing 6 changed files with 103 additions and 49 deletions.
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@ Configuration options can be provided either by using the command line arguments
- `display_threshold` (`-d`, `--display-thresh`): the similarity percentage cutoff for displaying similar files on the detector report.
- `force_language` (`-o`, `--force-language`): forces the tokenizer to tokenize input as a specific language, rather than automatically detecting the language using the file extension.
- `same_name_only` (`-s`, `--same-name`): if `true`, the detector will only compare files that have the same name (for example, `decision_tree.py` will not be compared to `k_nn.py`). Note that this also means that, for example, `bryson_k_nn.py` will not be compared to `sara_k_nn.py`.
- `ignore_leaf` (`-l`, `--ignore-leaf`): if `true`, the detector will not compare files located in the same leaf directory.
- `ignore_leaf` (`-l`, `--ignore-leaf`): if `true`, the detector will not compare files located in the same leaf directory (equivalent to setting `--ignore-depth 1`).
- `ignore_depth` (`--ignore-depth`): if set to `n`, the detector will not compare files whose n'th parent directory resolves to the same path (`n = 1` is the directory that directly contains the file; the default, `0`, disables this check).
- `disable_filtering` (`-f`, `--disable-filter`): if `true`, the detector will not tokenize and filter code before generating file fingerprints.
- `disable_autoopen` (`-a`, `--disable-autoopen`): if `true`, the detector will not automatically open a browser window to display the report.
- `truncate` (`-T`, `--truncate`): if `true`, highlighted code will be truncated to remove non-highlighted regions from the displayed output (sections not within 10 lines of highlighted code will be replaced with "...").
Expand Down
16 changes: 11 additions & 5 deletions copydetect/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,10 +59,16 @@ def main():
parser.add_argument("-s", '--same-name', dest='same_name',
action='store_true', default=False,
help="only compare files which have the same name")
parser.add_argument("-l", '--ignore-leaf', dest='ignore_leaf',
action='store_true', default=False,
help="don't compare files located in the same "
"leaf directory")

# -l/--ignore-leaf and --ignore-depth are mutually exclusive and both
# store into ``ignore_depth``. argparse takes the default for a shared
# dest from the first action registered for it, so the default of 0
# has to be given on -l as well; otherwise ``args.ignore_depth`` would
# be None (not an int) when neither option is passed.
grouping = parser.add_mutually_exclusive_group()
grouping.add_argument("-l", '--ignore-leaf', dest='ignore_depth',
                      action='store_const', const=1, default=0,
                      help="don't compare files located in the same "
                      "leaf directory")
grouping.add_argument('--ignore-depth', dest='ignore_depth',
                      type=int, help="don't compare files whose n'th parent "
                      "is the same folder", default=0)

parser.add_argument("-f", '--disable-filter', dest='filter',
action='store_true', default=False,
help="disable code tokenization and filtering")
Expand Down Expand Up @@ -111,7 +117,7 @@ def main():
"display_threshold" : args.display_thresh,
"force_language" : args.language,
"same_name_only" : args.same_name,
"ignore_leaf" : args.ignore_leaf,
"ignore_depth" : args.ignore_depth,
"disable_filtering" : args.filter,
"disable_autoopen" : args.autoopen,
"truncate" : args.truncate,
Expand Down
6 changes: 3 additions & 3 deletions copydetect/_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ class CopydetectConfig:
guarantee_t: int = defaults.GUARANTEE_THRESHOLD
display_t: float = defaults.DISPLAY_THRESHOLD
same_name_only: bool = False
ignore_leaf: bool = False
ignore_depth: int = 0
autoopen: bool = True
disable_filtering: bool = False
force_language: Optional[str] = None
Expand Down Expand Up @@ -52,8 +52,8 @@ def _check_arguments(self):
raise TypeError("Boilerplate directories must be a list")
if not isinstance(self.same_name_only, bool):
raise TypeError("same_name_only must be true or false")
if not isinstance(self.ignore_leaf, bool):
raise TypeError("ignore_leaf must be true or false")
if not isinstance(self.ignore_depth, int):
raise TypeError("ignore_depth must be an integer")
if not isinstance(self.disable_filtering, bool):
raise TypeError("disable_filtering must be true or false")
if not isinstance(self.autoopen, bool):
Expand Down
104 changes: 66 additions & 38 deletions copydetect/detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
a set of test files (files to check for plagiarism) and a set of
reference files (files that might have been plagiarised from).
"""

from collections import defaultdict
from pathlib import Path
import time
import logging
Expand Down Expand Up @@ -221,9 +221,9 @@ class CopyDetector:
same_name_only : bool
If true, the detector will only compare files that have the
same name
ignore_leaf : bool
If true, the detector will not compare files located in the
same leaf directory.
ignore_depth : int
The detector will not compare files whose n'th parent folders
are equal.
autoopen : bool
If true, the detector will automatically open a webbrowser to
display the results of generate_html_report
Expand Down Expand Up @@ -252,7 +252,7 @@ def __init__(self, test_dirs=None, ref_dirs=None,
noise_t=defaults.NOISE_THRESHOLD,
guarantee_t=defaults.GUARANTEE_THRESHOLD,
display_t=defaults.DISPLAY_THRESHOLD,
same_name_only=False, ignore_leaf=False, autoopen=True,
same_name_only=False, ignore_depth=0, autoopen=True,
disable_filtering=False, force_language=None,
truncate=False, out_file="./report.html", css_files=None,
silent=False, encoding: str = "utf-8"):
Expand Down Expand Up @@ -322,27 +322,29 @@ def _get_file_list(self, dirs, exts):
# convert to a set to remove duplicates, then back to a list
return list(set(file_list))

def add_file(self, filename, type="testref"):
def add_file(self, filename, file_type="testref"):
"""Adds a file to the list of test files, reference files, or
boilerplate files.
Parameters
----------
filename : str
Name of file to add.
type : {"testref", "test", "ref", "boilerplate"}
file_type : {"testref", "test", "ref", "boilerplate"}
Type of file to add. "testref" will add the file as both a
test and reference file.
"""
if type == "testref":
if file_type == "testref":
self.test_files.append(filename)
self.ref_files.append(filename)
elif type == "test":
elif file_type == "test":
self.test_files.append(filename)
elif type == "ref":
elif file_type == "ref":
self.ref_files.append(filename)
elif type == "boilerplate":
elif file_type == "boilerplate":
self.boilerplate_files.append(filename)
else:
raise ValueError(file_type)

def _get_boilerplate_hashes(self):
"""Generates a list of hashes of the boilerplate text. Returns
Expand Down Expand Up @@ -409,35 +411,30 @@ def _comparison_loop(self):
# test and reference files
comparisons = {}

for i, test_f in enumerate(
tqdm(self.test_files,
bar_format= ' {l_bar}{bar}{r_bar}',
disable=self.conf.silent)
):
for j, ref_f in enumerate(self.ref_files):
if (test_f not in self.file_data
or ref_f not in self.file_data
or test_f == ref_f
or (self.conf.same_name_only
and (Path(test_f).name != Path(ref_f).name))
or (self.conf.ignore_leaf
and (Path(test_f).parent == Path(ref_f).parent))):
continue
test_indices = {f: i for i, f in enumerate(self.test_files)}
ref_indices = {f: i for i, f in enumerate(self.ref_files)}

if (ref_f, test_f) in comparisons:
ref_idx, test_idx = comparisons[(ref_f, test_f)]
overlap = self.token_overlap_matrix[ref_idx, test_idx]
sim2, sim1 = self.similarity_matrix[ref_idx, test_idx]
else:
overlap, (sim1, sim2), (slices1, slices2) = compare_files(
self.file_data[test_f], self.file_data[ref_f]
)
comparisons[(test_f, ref_f)] = (i, j)
if slices1.shape[0] != 0:
self.slice_matrix[(test_f, ref_f)] = [slices1, slices2]
for test_f, ref_f in tqdm(self.get_comparison_pairs(),
bar_format=' {l_bar}{bar}{r_bar}',
disable=self.conf.silent
):
i = test_indices[test_f]
j = ref_indices[ref_f]

if (ref_f, test_f) in comparisons:
ref_idx, test_idx = comparisons[(ref_f, test_f)]
overlap = self.token_overlap_matrix[ref_idx, test_idx]
sim2, sim1 = self.similarity_matrix[ref_idx, test_idx]
else:
overlap, (sim1, sim2), (slices1, slices2) = compare_files(
self.file_data[test_f], self.file_data[ref_f]
)
comparisons[(test_f, ref_f)] = (i, j)
if slices1.shape[0] != 0:
self.slice_matrix[(test_f, ref_f)] = [slices1, slices2]

self.similarity_matrix[i, j] = np.array([sim1, sim2])
self.token_overlap_matrix[i, j] = overlap
self.similarity_matrix[i, j] = np.array([sim1, sim2])
self.token_overlap_matrix[i, j] = overlap

def run(self):
"""Runs the copy detection loop for detecting overlap between
Expand Down Expand Up @@ -467,6 +464,37 @@ def run(self):
if not self.conf.silent:
print(f"{time.time()-start_time:6.2f}: Code comparison completed")

def get_comparison_pairs(self):
    """Get the set of file pairs that are considered during the
    comparison.

    A (test, reference) pair is left out when either file has no entry
    in ``self.file_data``, when both file names are the same string,
    when ``same_name_only`` is set and the base names differ, or when
    ``ignore_depth`` is a non-zero ``n`` and the n'th parent directory
    of both resolved paths is the same (``n = 1`` is the directory
    directly containing the file).

    Returns
    -------
    set
        Set of ``(test_file, ref_file)`` tuples that are considered
        during comparison.
    """
    same_name_only = self.conf.same_name_only
    ignore_depth = self.conf.ignore_depth

    # Resolve every usable reference path once, instead of once per
    # (test, reference) combination.
    ref_paths = [(ref_f, Path(ref_f).resolve())
                 for ref_f in self.ref_files
                 if ref_f in self.file_data]

    compared_files = set()
    for test_f in self.test_files:
        if test_f not in self.file_data:
            continue
        test_path = Path(test_f).resolve()
        for ref_f, ref_path in ref_paths:
            if test_f == ref_f:
                continue
            if same_name_only and test_path.name != ref_path.name:
                continue
            if ignore_depth:
                depth = ignore_depth - 1
                test_parents = test_path.parents
                ref_parents = ref_path.parents
                # parents[depth] only exists when there are MORE than
                # `depth` ancestors. A file with too few ancestors has
                # no n'th parent to match on, so it is never skipped.
                if (len(test_parents) > depth
                        and len(ref_parents) > depth
                        and test_parents[depth] == ref_parents[depth]):
                    continue
            compared_files.add((test_f, ref_f))
    return compared_files

def get_copied_code_list(self):
"""Get a list of copied code to display on the output report.
Returns a list of tuples containing the similarity score, the
Expand Down
3 changes: 2 additions & 1 deletion docs/cmdline.rst
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,8 @@ Configuration options can be provided either by using the command line arguments
- ``display_threshold`` (``-d``, ``--display-thresh``): the similarity percentage cutoff for displaying similar files on the detector report.
- ``force_language`` (``-o``, ``--force-language``): forces the tokenizer to tokenize input as a specific language, rather than automatically detecting the language using the file extension.
- ``same_name_only`` (``-s``, ``--same-name``): if ``true``, the detector will only compare files that have the same name (for example, ``decision_tree.py`` will not be compared to ``k_nn.py``). Note that this also means that, for example, ``bryson_k_nn.py`` will not be compared to ``sara_k_nn.py``.
- ``ignore_leaf`` (``-l``, ``--ignore-leaf``): if ``true``, the detector will not compare files located in the same leaf directory.
- ``ignore_leaf`` (``-l``, ``--ignore-leaf``): if ``true``, the detector will not compare files located in the same leaf directory (equivalent to setting ``--ignore-depth 1``).
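- ``ignore_depth`` (``--ignore-depth``): if set to ``n``, the detector will not compare files whose n'th parent directory resolves to the same path (``n = 1`` is the directory that directly contains the file; the default, ``0``, disables this check).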
- ``disable_filtering`` (``-f``, ``--disable-filter``): if ``true``, the detector will not tokenize and filter code before generating file fingerprints.
- ``disable_autoopen`` (``-a``, ``--disable-autoopen``): if ``true``, the detector will not automatically open a browser window to display the report.
- ``truncate`` (``-T``, ``--truncate``): if ``true``, highlighted code will be truncated to remove non-highlighted regions from the displayed output (sections not within 10 lines of highlighted code will be replaced with "...").
Expand Down
20 changes: 19 additions & 1 deletion tests/test_detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,14 +171,32 @@ def test_compare_boilerplate(self):
class TestParameters():
"""Test cases for individual parameters"""
def test_ignore_leaf(self):
# TODO Once ignore_leaf is added with deprecation warning, also test it
detector = CopyDetector(test_dirs=[TESTS_DIR + "/sample_py"],
ignore_leaf=True, silent=True)
ignore_depth=1, silent=True)
detector.run()

# sample1 and sample2 should not have been compared
# + 4 self compares = 6 total skips
assert np.sum(detector.similarity_matrix[:,:,0] == -1) == 6

def test_ignore_depth_empty(self):
    """ignore_depth=2 over two sample directories leaves no pair to
    compare."""
    sample_dirs = [
        TESTS_DIR + "/sample_other",
        TESTS_DIR + "/sample_sanity_check",
    ]
    detector = CopyDetector(test_dirs=sample_dirs, silent=True,
                            ignore_depth=2)
    detector.run()

    # No files should be compared: every entry keeps the value -1
    # and the set of comparison pairs is empty
    first_scores = detector.similarity_matrix[:, :, 0]
    assert (first_scores == -1).all()
    assert len(detector.get_comparison_pairs()) == 0

def test_ignore_depth(self):
    """ignore_depth=2 on the Python samples leaves exactly six
    compared entries."""
    sample_dirs = [TESTS_DIR + "/sample_py"]
    detector = CopyDetector(test_dirs=sample_dirs, silent=True,
                            ignore_depth=2)
    detector.run()

    # Only compare /handout.py with the files in /boilerplate and /code
    was_compared = detector.similarity_matrix[:, :, 0] != -1
    assert np.sum(was_compared) == 6

def test_same_name_only(self):
detector = CopyDetector(test_dirs=[TESTS_DIR + "/sample_py"],
same_name_only=True, silent=True)
Expand Down

0 comments on commit e183099

Please sign in to comment.