Merge pull request #48 from blingenf/feature/duplicate_hashes
Improved duplicate hash handling
blingenf authored Sep 4, 2023
2 parents 974fb41 + 87749cb commit d3667ff
Showing 4 changed files with 70 additions and 31 deletions.
14 changes: 7 additions & 7 deletions copydetect/detector.py
@@ -65,20 +65,20 @@ class CodeFingerprint:
The cumulative number of characters removed during filtering at
each index of the filtered code. Used for translating locations
in the filtered code to locations in the unfiltered code.
hashes : 1D array of ints
List of fingerprints extracted from the filtered code.
hash_idx : 1D array of ints
List of indexes of the selected fingerprints. Used for
translating hash indexes to indexes in the filtered code.
hashes : Set[int]
Set of fingerprint hashes extracted from the filtered code.
hash_idx : Dict[int, List[int]]
Mapping of each fingerprint hash back to all indexes in the
original code in which this fingerprint appeared.
k : int
Value of provided k argument.
language : str
If set, will force the tokenizer to use the provided language
rather than guessing from the file extension.
token_coverage : int
The number of tokens in the tokenized code which are considered
for fingerprint comparison, after dropping duplicate k-grams and
performing winnowing.
for fingerprint comparison, after performing winnowing and
removing boilerplate.
"""
def __init__(self, file, k, win_size, boilerplate=None, filter=True,
language=None, fp=None, encoding: str = "utf-8"):
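
For illustration, a minimal sketch of how the reworked attributes can be inspected, assuming CodeFingerprint is importable from the package top level (as in the tests below) and using the same sample file and k/win_size values those tests use; the hash value in the final comment is made up:

    from copydetect import CodeFingerprint

    fp = CodeFingerprint("tests/sample_py/code/sample1.py", k=25, win_size=1)
    print(type(fp.hashes))    # set: fingerprint hashes of the filtered code
    print(type(fp.hash_idx))  # dict: hash -> list of indexes where it appears
    # a duplicated fingerprint now keeps every occurrence, e.g. {1234: [3, 41]}
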
33 changes: 26 additions & 7 deletions copydetect/utils.py
@@ -5,6 +5,7 @@

import logging
import warnings
from typing import Dict, List

from pygments import lexers, token
import pygments.util
@@ -135,24 +136,38 @@ def get_document_fingerprints(doc, k, window_size, boilerplate=None):
"""
if boilerplate is None:
boilerplate = []
hashes, idx = winnow(hashed_kgrams(doc, k=k), window_size=window_size)
all_hashes = hashed_kgrams(doc, k=k)
hashes, idx = winnow(
all_hashes, window_size=window_size, remove_duplicates=False
)
if len(boilerplate) > 0:
_, overlap_idx, _ = np.intersect1d(hashes, boilerplate,
return_indices=True,
assume_unique=True)
idx = np.delete(idx, overlap_idx)
hashes = np.delete(hashes, overlap_idx)
return hashes, idx

hash_dict = {}
for hash_val, i in zip(hashes, idx):
if hash_val not in hash_dict:
hash_dict[hash_val] = [i]
else:
hash_dict[hash_val].append(i)
return set(hashes), hash_dict
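
The grouping loop above builds the hash-to-index mapping with a plain dict; a functionally equivalent sketch using collections.defaultdict, shown here only for illustration:

    from collections import defaultdict

    hash_dict = defaultdict(list)
    for hash_val, i in zip(hashes, idx):
        # every occurrence of a duplicate hash keeps its own index
        hash_dict[hash_val].append(i)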

def find_fingerprint_overlap(hashes1, hashes2, idx1, idx2):
"""Finds the indexes of overlapping values between two lists of
hashes. Returns two lists of indexes, one for the first hash list
and one for the second. The indexes of the original hashes are
provided in case boilerplate results in gaps.
"""
overlap, ol_idx1, ol_idx2 = np.intersect1d(hashes1, hashes2,
return_indices=True, assume_unique=True)
return idx1[ol_idx1], idx2[ol_idx2]
intersection = hashes1.intersection(hashes2)
if len(intersection) > 0:
overlap_1 = np.concatenate([np.array(idx1[i]) for i in intersection])
overlap_2 = np.concatenate([np.array(idx2[i]) for i in intersection])
return overlap_1.flatten(), overlap_2.flatten()
else:
return np.array([], dtype=int), np.array([], dtype=int)
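
A toy example of the new overlap logic, using made-up hash values shaped like the set/dict pair that get_document_fingerprints now returns:

    from copydetect.utils import find_fingerprint_overlap

    hashes1, idx1 = {11, 22, 33}, {11: [0], 22: [3, 7], 33: [9]}
    hashes2, idx2 = {22, 44}, {22: [2], 44: [5]}

    overlap_1, overlap_2 = find_fingerprint_overlap(hashes1, hashes2, idx1, idx2)
    # the shared hash 22 occurs twice on the first side, so both indexes survive:
    # overlap_1 -> array([3, 7]), overlap_2 -> array([2])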

def highlight_overlap(doc, slices, left_hl, right_hl,
truncate=-1, escape_html=False):
@@ -206,11 +221,15 @@ def highlight_overlap(doc, slices, left_hl, right_hl,

return new_doc, hl_percent

def get_token_coverage(idx, k, token_len):
def get_token_coverage(idx: Dict[int, List[int]], k: int, token_len: int):
"""Determines the number of tokens in the original document which
are included in the winnowed indices
"""
if len(idx) > 0:
idx_arr = np.concatenate([np.array(i) for i in idx.values()])
else:
idx_arr = np.array([], dtype=int)
coverage = np.zeros(token_len)
for offset in range(k):
coverage[idx + offset] = 1
coverage[idx_arr + offset] = 1
return np.sum(coverage)
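
A small worked example of the coverage computation with the new dict-based idx (the hash key is made up):

    from copydetect.utils import get_token_coverage

    # one fingerprint hash seen at token positions 0 and 2, with k = 3:
    # the two 3-grams cover tokens 0-2 and 2-4, i.e. 5 of the 6 tokens
    get_token_coverage({7: [0, 2]}, k=3, token_len=6)  # -> 5.0
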
50 changes: 35 additions & 15 deletions tests/test_detector.py
@@ -7,12 +7,22 @@

TESTS_DIR = str(Path(__file__).parent)

class TestTwoFileDetection():

@pytest.fixture
def sample_file_metrics():
return {
"file1_len": 2052,
"file2_len": 1257,
"token_overlap": 1155
}
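
With these fixture values, the expected similarities work out to token_overlap divided by each file's token count: 1155/2052 ≈ 0.563 for the first file and 1155/1257 ≈ 0.919 for the second, which is what the matrices in the tests below are built from.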


class TestTwoFileDetection:
"""Test of the user-facing copydetect code for a simple two-file
case. The two files both use several sections from a boilerplate
file but are otherwise different.
"""
def test_compare(self):
def test_compare(self, sample_file_metrics):
config = {
"test_directories" : [TESTS_DIR + "/sample_py/code"],
"reference_directories" : [TESTS_DIR + "/sample_py/code"],
@@ -25,14 +35,18 @@ def test_compare(self):
detector = CopyDetector.from_config(config)
detector.run()

overlap = sample_file_metrics["token_overlap"]
sim1 = overlap / sample_file_metrics["file1_len"]
sim2 = overlap / sample_file_metrics["file2_len"]

# file order is not guaranteed, so there are two possible
# similarity matrices depending on the order of the files
possible_mtx_1 = np.array([[[-1, -1], [1137/1829,1137/1257]],
[[1137/1257,1137/1829], [-1, -1]]])
possible_mtx_1 = np.array([[[-1, -1], [sim1, sim2]],
[[sim2, sim1], [-1, -1]]])
possible_mtx_2 = np.flip(possible_mtx_1, 2)
assert (np.array_equal(possible_mtx_1, detector.similarity_matrix)
or np.array_equal(possible_mtx_2, detector.similarity_matrix))
assert np.array_equal(np.array([[-1, 1137],[1137,-1]]),
assert np.array_equal(np.array([[-1, overlap],[overlap,-1]]),
detector.token_overlap_matrix)

html_out = detector.generate_html_report(output_mode="return")
Expand All @@ -46,16 +60,20 @@ def test_compare(self):
assert test_str2 in html_out
assert test_str3 in html_out

def test_compare_manual_config(self):
def test_compare_manual_config(self, sample_file_metrics):
detector = CopyDetector(noise_t=25, guarantee_t=25, silent=True)
detector.add_file(TESTS_DIR + "/sample_py/code/sample1.py")
detector.add_file(TESTS_DIR + "/sample_py/code/sample2.py")
detector.run()

assert np.array_equal(np.array([[[-1, -1], [1137/1829,1137/1257]],
[[1137/1257,1137/1829], [-1, -1]]]),
overlap = sample_file_metrics["token_overlap"]
sim1 = overlap / sample_file_metrics["file1_len"]
sim2 = overlap / sample_file_metrics["file2_len"]

assert np.array_equal(np.array([[[-1, -1], [sim1, sim2]],
[[sim2, sim1], [-1, -1]]]),
detector.similarity_matrix)
assert np.array_equal(np.array([[-1,1137],[1137,-1]]),
assert np.array_equal(np.array([[-1,overlap],[overlap,-1]]),
detector.token_overlap_matrix)

def test_compare_saving(self, tmpdir):
@@ -125,22 +143,24 @@ class TestTwoFileAPIDetection():
"""Performs the same checks as the other two-file check, but uses
the API instead of the command line code.
"""
def test_compare(self):
def test_compare(self, sample_file_metrics):
fp1 = CodeFingerprint(TESTS_DIR+"/sample_py/code/sample1.py", 25, 1)
fp2 = CodeFingerprint(TESTS_DIR+"/sample_py/code/sample2.py", 25, 1)
token_overlap, similarities, slices = compare_files(fp1, fp2)

assert token_overlap == 1137
assert similarities[0] == 1137/1829
assert similarities[1] == 1137/1257
overlap = sample_file_metrics["token_overlap"]

assert token_overlap == overlap
assert similarities[0] == overlap / sample_file_metrics["file1_len"]
assert similarities[1] == overlap / sample_file_metrics["file2_len"]

def test_compare_boilerplate(self):
bp_fingerprint = CodeFingerprint(
TESTS_DIR + "/sample_py/boilerplate/handout.py", 25, 1)
fp1 = CodeFingerprint(TESTS_DIR+"/sample_py/code/sample1.py", 25, 1,
bp_fingerprint.hashes)
np.array(list(bp_fingerprint.hashes)))
fp2 = CodeFingerprint(TESTS_DIR+"/sample_py/code/sample2.py", 25, 1,
bp_fingerprint.hashes)
np.array(list(bp_fingerprint.hashes)))

token_overlap, similarities, slices = compare_files(fp1, fp2)

4 changes: 2 additions & 2 deletions tests/test_utils.py
@@ -175,7 +175,7 @@ def test_java_tokenization(self):

def test_get_token_coverage(self):
sample = "0123456789"
idx1 = np.array([0, 5])
idx1 = {0: [0], 1: [5]}

# two 5-grams starting at 0 and 5 cover all 10 tokens
assert cd.get_token_coverage(idx1, 5, len(sample)) == len(sample)
@@ -187,5 +187,5 @@ def test_get_token_coverage(self):
assert cd.get_token_coverage(idx1, 1, len(sample)) == 2

# k-gram overlap shouldn't matter
idx = np.arange(8)
idx = {i: [i] for i in range(8)}
assert cd.get_token_coverage(idx, 3, len(sample)) == len(sample)
