Merge pull request #48 from blingenf/feature/duplicate_hashes
Improved duplicate hash handling
blingenf authored Sep 4, 2023
2 parents 974fb41 + 87749cb commit d3667ff
Showing 4 changed files with 70 additions and 31 deletions.
14 changes: 7 additions & 7 deletions copydetect/detector.py
@@ -65,20 +65,20 @@ class CodeFingerprint:
The cumulative number of characters removed during filtering at
each index of the filtered code. Used for translating locations
in the filtered code to locations in the unfiltered code.
hashes : 1D array of ints
List of fingerprints extracted from the filtered code.
hash_idx : 1D array of ints
List of indexes of the selected fingerprints. Used for
translating hash indexes to indexes in the filtered code.
hashes : Set[int]
Set of fingerprint hashes extracted from the filtered code.
hash_idx : Dict[int, List[int]]
Mapping of each fingerprint hash back to all indexes in the
original code in which this fingerprint appeared.
k : int
Value of provided k argument.
language : str
If set, will force the tokenizer to use the provided language
rather than guessing from the file extension.
token_coverage : int
The number of tokens in the tokenized code which are considered
for fingerprint comparison, after dropping duplicate k-grams and
performing winnowing.
for fingerprint comparison, after performing winnowing and
removing boilerplate.
"""
def __init__(self, file, k, win_size, boilerplate=None, filter=True,
language=None, fp=None, encoding: str = "utf-8"):
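
For illustration, a minimal sketch of how the reworked attributes can be inspected, assuming CodeFingerprint is importable from the package top level (as in the tests below) and using the same sample file and k/win_size values those tests use; the hash value in the final comment is made up:

    from copydetect import CodeFingerprint

    fp = CodeFingerprint("tests/sample_py/code/sample1.py", k=25, win_size=1)
    print(type(fp.hashes))    # set: fingerprint hashes of the filtered code
    print(type(fp.hash_idx))  # dict: hash -> list of indexes where it appears
    # a duplicated fingerprint now keeps every occurrence, e.g. {1234: [3, 41]}
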
33 changes: 26 additions & 7 deletions copydetect/utils.py
@@ -5,6 +5,7 @@

import logging
import warnings
from typing import Dict, List

from pygments import lexers, token
import pygments.util
@@ -135,24 +136,38 @@ def get_document_fingerprints(doc, k, window_size, boilerplate=None):
"""
if boilerplate is None:
boilerplate = []
hashes, idx = winnow(hashed_kgrams(doc, k=k), window_size=window_size)
all_hashes = hashed_kgrams(doc, k=k)
hashes, idx = winnow(
all_hashes, window_size=window_size, remove_duplicates=False
)
if len(boilerplate) > 0:
_, overlap_idx, _ = np.intersect1d(hashes, boilerplate,
return_indices=True,
assume_unique=True)
idx = np.delete(idx, overlap_idx)
hashes = np.delete(hashes, overlap_idx)
return hashes, idx

hash_dict = {}
for hash_val, i in zip(hashes, idx):
if hash_val not in hash_dict:
hash_dict[hash_val] = [i]
else:
hash_dict[hash_val].append(i)
return set(hashes), hash_dict
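
The grouping loop above builds the hash-to-index mapping with a plain dict; a functionally equivalent sketch using collections.defaultdict, shown here only for illustration:

    from collections import defaultdict

    hash_dict = defaultdict(list)
    for hash_val, i in zip(hashes, idx):
        # every occurrence of a duplicate hash keeps its own index
        hash_dict[hash_val].append(i)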

def find_fingerprint_overlap(hashes1, hashes2, idx1, idx2):
"""Finds the indexes of overlapping values between two lists of
hashes. Returns two lists of indexes, one for the first hash list
and one for the second. The indexes of the original hashes are
provided in case boilerplate results in gaps.
"""
overlap, ol_idx1, ol_idx2 = np.intersect1d(hashes1, hashes2,
return_indices=True, assume_unique=True)
return idx1[ol_idx1], idx2[ol_idx2]
intersection = hashes1.intersection(hashes2)
if len(intersection) > 0:
overlap_1 = np.concatenate([np.array(idx1[i]) for i in intersection])
overlap_2 = np.concatenate([np.array(idx2[i]) for i in intersection])
return overlap_1.flatten(), overlap_2.flatten()
else:
return np.array([], dtype=int), np.array([], dtype=int)
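
A toy example of the new overlap logic, using made-up hash values shaped like the set/dict pair that get_document_fingerprints now returns:

    from copydetect.utils import find_fingerprint_overlap

    hashes1, idx1 = {11, 22, 33}, {11: [0], 22: [3, 7], 33: [9]}
    hashes2, idx2 = {22, 44}, {22: [2], 44: [5]}

    overlap_1, overlap_2 = find_fingerprint_overlap(hashes1, hashes2, idx1, idx2)
    # the shared hash 22 occurs twice on the first side, so both indexes survive:
    # overlap_1 -> array([3, 7]), overlap_2 -> array([2])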

def highlight_overlap(doc, slices, left_hl, right_hl,
truncate=-1, escape_html=False):
@@ -206,11 +221,15 @@ def highlight_overlap(doc, slices, left_hl, right_hl,

return new_doc, hl_percent

def get_token_coverage(idx, k, token_len):
def get_token_coverage(idx: Dict[int, List[int]], k: int, token_len: int):
"""Determines the number of tokens in the original document which
are included in the winnowed indices
"""
if len(idx) > 0:
idx_arr = np.concatenate([np.array(i) for i in idx.values()])
else:
idx_arr = np.array([], dtype=int)
coverage = np.zeros(token_len)
for offset in range(k):
coverage[idx + offset] = 1
coverage[idx_arr + offset] = 1
return np.sum(coverage)
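
A small worked example of the coverage computation with the new dict-based idx (the hash key is made up):

    from copydetect.utils import get_token_coverage

    # one fingerprint hash seen at token positions 0 and 2, with k = 3:
    # the two 3-grams cover tokens 0-2 and 2-4, i.e. 5 of the 6 tokens
    get_token_coverage({7: [0, 2]}, k=3, token_len=6)  # -> 5.0
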
50 changes: 35 additions & 15 deletions tests/test_detector.py
@@ -7,12 +7,22 @@

TESTS_DIR = str(Path(__file__).parent)

class TestTwoFileDetection():

@pytest.fixture
def sample_file_metrics():
return {
"file1_len": 2052,
"file2_len": 1257,
"token_overlap": 1155
}
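
With these fixture values, the expected similarities work out to token_overlap divided by each file's token count: 1155/2052 ≈ 0.563 for the first file and 1155/1257 ≈ 0.919 for the second, which is what the matrices in the tests below are built from.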


class TestTwoFileDetection:
"""Test of the user-facing copydetect code for a simple two-file
case. The two files both use several sections from a boilerplate
file but are otherwise different.
"""
def test_compare(self):
def test_compare(self, sample_file_metrics):
config = {
"test_directories" : [TESTS_DIR + "/sample_py/code"],
"reference_directories" : [TESTS_DIR + "/sample_py/code"],
@@ -25,14 +35,18 @@ def test_compare(self):
detector = CopyDetector.from_config(config)
detector.run()

overlap = sample_file_metrics["token_overlap"]
sim1 = overlap / sample_file_metrics["file1_len"]
sim2 = overlap / sample_file_metrics["file2_len"]

# file order is not guaranteed, so there are two possible
# similarity matrices depending on the order of the files
possible_mtx_1 = np.array([[[-1, -1], [1137/1829,1137/1257]],
[[1137/1257,1137/1829], [-1, -1]]])
possible_mtx_1 = np.array([[[-1, -1], [sim1, sim2]],
[[sim2, sim1], [-1, -1]]])
possible_mtx_2 = np.flip(possible_mtx_1, 2)
assert (np.array_equal(possible_mtx_1, detector.similarity_matrix)
or np.array_equal(possible_mtx_2, detector.similarity_matrix))
assert np.array_equal(np.array([[-1, 1137],[1137,-1]]),
assert np.array_equal(np.array([[-1, overlap],[overlap,-1]]),
detector.token_overlap_matrix)

html_out = detector.generate_html_report(output_mode="return")
Expand All @@ -46,16 +60,20 @@ def test_compare(self):
assert test_str2 in html_out
assert test_str3 in html_out

def test_compare_manual_config(self):
def test_compare_manual_config(self, sample_file_metrics):
detector = CopyDetector(noise_t=25, guarantee_t=25, silent=True)
detector.add_file(TESTS_DIR + "/sample_py/code/sample1.py")
detector.add_file(TESTS_DIR + "/sample_py/code/sample2.py")
detector.run()

assert np.array_equal(np.array([[[-1, -1], [1137/1829,1137/1257]],
[[1137/1257,1137/1829], [-1, -1]]]),
overlap = sample_file_metrics["token_overlap"]
sim1 = overlap / sample_file_metrics["file1_len"]
sim2 = overlap / sample_file_metrics["file2_len"]

assert np.array_equal(np.array([[[-1, -1], [sim1, sim2]],
[[sim2, sim1], [-1, -1]]]),
detector.similarity_matrix)
assert np.array_equal(np.array([[-1,1137],[1137,-1]]),
assert np.array_equal(np.array([[-1,overlap],[overlap,-1]]),
detector.token_overlap_matrix)

def test_compare_saving(self, tmpdir):
@@ -125,22 +143,24 @@ class TestTwoFileAPIDetection():
"""Performs the same checks as the other two-file check, but uses
the API instead of the command line code.
"""
def test_compare(self):
def test_compare(self, sample_file_metrics):
fp1 = CodeFingerprint(TESTS_DIR+"/sample_py/code/sample1.py", 25, 1)
fp2 = CodeFingerprint(TESTS_DIR+"/sample_py/code/sample2.py", 25, 1)
token_overlap, similarities, slices = compare_files(fp1, fp2)

assert token_overlap == 1137
assert similarities[0] == 1137/1829
assert similarities[1] == 1137/1257
overlap = sample_file_metrics["token_overlap"]

assert token_overlap == overlap
assert similarities[0] == overlap / sample_file_metrics["file1_len"]
assert similarities[1] == overlap / sample_file_metrics["file2_len"]

def test_compare_boilerplate(self):
bp_fingerprint = CodeFingerprint(
TESTS_DIR + "/sample_py/boilerplate/handout.py", 25, 1)
fp1 = CodeFingerprint(TESTS_DIR+"/sample_py/code/sample1.py", 25, 1,
bp_fingerprint.hashes)
np.array(list(bp_fingerprint.hashes)))
fp2 = CodeFingerprint(TESTS_DIR+"/sample_py/code/sample2.py", 25, 1,
bp_fingerprint.hashes)
np.array(list(bp_fingerprint.hashes)))

token_overlap, similarities, slices = compare_files(fp1, fp2)

4 changes: 2 additions & 2 deletions tests/test_utils.py
@@ -175,7 +175,7 @@ def test_java_tokenization(self):

def test_get_token_coverage(self):
sample = "0123456789"
idx1 = np.array([0, 5])
idx1 = {0: [0], 1: [5]}

# two 5-grams starting at 0 and 5 cover all 10 tokens
assert cd.get_token_coverage(idx1, 5, len(sample)) == len(sample)
@@ -187,5 +187,5 @@ def test_get_token_coverage(self):
assert cd.get_token_coverage(idx1, 1, len(sample)) == 2

# k-gram overlap shouldn't matter
idx = np.arange(8)
idx = {i: [i] for i in range(8)}
assert cd.get_token_coverage(idx, 3, len(sample)) == len(sample)
