Merge pull request #43 from blingenf/fix/other_encodings

Add encoding control
blingenf · Jul 9, 2023 · e92ab23 · e92ab23
2 parents 9414fdd + f803ab5
commit e92ab23
Show file tree

Hide file tree

Showing 5 changed files with 52 additions and 7 deletions.
diff --git a/copydetect/__main__.py b/copydetect/__main__.py
@@ -83,6 +83,11 @@ def main():
     parser.add_argument('--version', action='version',
                         version="copydetect v" + __version__,
                         help="print version number and exit")
+    parser.add_argument("--encoding", default="utf-8",
+                        help="encoding to use for reading files. If files use "
+                        "varying encodings, --encoding DETECT can be used to "
+                        "detect the encoding of all files (requires the "
+                        "chardet package)")
     args = parser.parse_args()
 
     if args.conf:
@@ -106,6 +111,7 @@ def main():
           "disable_autoopen" : args.autoopen,
           "truncate" : args.truncate,
           "out_file" : args.out_file,
+          "encoding": args.encoding,
         }
     else:
         parser.error("either a path to a configuration file (-c) or a "

diff --git a/copydetect/detector.py b/copydetect/detector.py
@@ -45,6 +45,10 @@ class CodeFingerprint:
         the "file" argument will not be used to load a file from disk
         but will still be used for language detection and displayed on
         the report.
+    encoding : str, default="utf-8"
+        Text encoding to use for reading the file. If "DETECT", the
+        chardet library (which must be installed) is used to
+        automatically detect the file's encoding.
 
     Attributes
     ----------
@@ -75,11 +79,28 @@ class CodeFingerprint:
         performing winnowing.
     """
     def __init__(self, file, k, win_size, boilerplate=[], filter=True,
-                 language=None, fp=None):
+                 language=None, fp=None, encoding: str = "utf-8"):
         if fp is not None:
             code = fp.read()
+        elif encoding == "DETECT":
+            try:
+                import chardet
+                with open(file, "rb") as code_fp:
+                    code = code_fp.read()
+                detected_encoding = chardet.detect(code)["encoding"]
+                if detected_encoding is not None:
+                    code = code.decode(detected_encoding)
+                else:
+                    # if encoding can't be detected, just use the default
+                    # encoding (the file may be empty)
+                    code = code.decode()
+            except ModuleNotFoundError as e:
+                logging.error(
+                    "encoding detection requires chardet to be installed"
+                )
+                raise e
         else:
-            with open(file, encoding="utf-8") as code_fp:
+            with open(file, encoding=encoding) as code_fp:
                 code = code_fp.read()
         if filter:
             filtered_code, offsets = filter_code(code, file, language)
@@ -223,6 +244,10 @@ class CopyDetector:
         Path to output report file.
     silent : bool
         If true, all logging output will be suppressed.
+    encoding : str, default="utf-8"
+        Text encoding to use for reading files. If "DETECT", the
+        chardet library (which must be installed) is used to
+        automatically detect each file's encoding.
     """
     def __init__(self, config=None, test_dirs=[], ref_dirs=[],
                  boilerplate_dirs=[], extensions=["*"],
@@ -231,7 +256,8 @@ def __init__(self, config=None, test_dirs=[], ref_dirs=[],
                  display_t=defaults.DISPLAY_THRESHOLD,
                  same_name_only=False, ignore_leaf=False, autoopen=True,
                  disable_filtering=False, force_language=None,
-                 truncate=False, out_file="./report.html", silent=False):
+                 truncate=False, out_file="./report.html", silent=False,
+                 encoding: str = "utf-8"):
         if config is not None:
             # temporary workaround to ensure existing code continues
             # to work
@@ -265,6 +291,7 @@ def __init__(self, config=None, test_dirs=[], ref_dirs=[],
         self.force_language = force_language
         self.truncate = truncate
         self.out_file = out_file
+        self.encoding = encoding
 
         self._check_arguments()
 
@@ -458,10 +485,11 @@ def _get_boilerplate_hashes(self):
             try:
                 fingerprint=CodeFingerprint(file, self.noise_t, 1,
                                             filter=not self.disable_filtering,
-                                            language=self.force_language)
+                                            language=self.force_language,
+                                            encoding=self.encoding)
                 boilerplate_hashes.extend(fingerprint.hashes)
             except UnicodeDecodeError:
-                logging.warning(f"Skipping {file}: file not ASCII text")
+                logging.warning(f"Skipping {file}: file not UTF-8 text")
                 continue
 
         return np.unique(np.array(boilerplate_hashes))
@@ -479,10 +507,10 @@ def _preprocess_code(self, file_list):
                     self.file_data[code_f] = CodeFingerprint(
                         code_f, self.noise_t, self.window_size,
                         boilerplate_hashes, not self.disable_filtering,
-                        self.force_language)
+                        self.force_language, encoding=self.encoding)
 
                 except UnicodeDecodeError:
-                    logging.warning(f"Skipping {code_f}: file not ASCII text")
+                    logging.warning(f"Skipping {code_f}: file not UTF-8 text")
                     continue
 
     def _comparison_loop(self):

diff --git a/docs/cmdline.rst b/docs/cmdline.rst
@@ -32,3 +32,4 @@ Configuration options can be provided either by using the command line arguments
 - ``disable_autoopen`` (``-a``, ``--disable-autoopen``):  if ``true``, the detector will not automatically open a browser window to display the report.
 - ``truncate`` (``-T``, ``--truncate``):  if ``true``, highlighted code will be truncated to remove non-highlighted regions from the displayed output (sections not within 10 lines of highlighted code will be replaced with "...").
 - ``out_file`` (``-O``, ``--out-file``): path to save output report to. A '.html' extension will be added to the path if not provided. If a directory is provided instead of a file, the report will be saved to that directory as report.html.
+- ``encoding`` (``--encoding``): encoding to use for reading files (the default is UTF-8). If files use varying encodings, ``--encoding DETECT`` can be used to detect the encoding of each file *(note: encoding detection requires the chardet package)*.
diff --git a/tests/sample_other/c_sample_utf16.c b/tests/sample_other/c_sample_utf16.c
diff --git a/tests/test_detector.py b/tests/test_detector.py
@@ -211,3 +211,13 @@ def test_out_file(self, tmpdir):
         detector.generate_html_report()
 
         assert Path(tmpdir + "/report.html").exists()
+
+    def test_encoding_specification(self):
+        detector = CopyDetector(test_dirs=[TESTS_DIR + "/sample_py"],
+                                extensions=["c"], encoding="utf-16",
+                                silent=True)
+        detector.add_file(TESTS_DIR + "/sample_other/c_sample_utf16.c")
+        detector.run()
+
+        # make sure utf-16 file was loaded correctly
+        assert len(list(detector.file_data.values())[0].raw_code) > 0