Skip to content

Commit

Permalink
Merge pull request #43 from blingenf/fix/other_encodings
Browse files Browse the repository at this point in the history
Add encoding control
  • Loading branch information
blingenf authored Jul 9, 2023
2 parents 9414fdd + f803ab5 commit e92ab23
Show file tree
Hide file tree
Showing 5 changed files with 52 additions and 7 deletions.
6 changes: 6 additions & 0 deletions copydetect/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,11 @@ def main():
parser.add_argument('--version', action='version',
version="copydetect v" + __version__,
help="print version number and exit")
parser.add_argument("--encoding", default="utf-8",
help="encoding to use for reading files. If files use "
"varying encodings, --encoding DETECT can be used to "
"detect the encoding of all files (requires the "
"chardet package)")
args = parser.parse_args()

if args.conf:
Expand All @@ -106,6 +111,7 @@ def main():
"disable_autoopen" : args.autoopen,
"truncate" : args.truncate,
"out_file" : args.out_file,
"encoding": args.encoding,
}
else:
parser.error("either a path to a configuration file (-c) or a "
Expand Down
42 changes: 35 additions & 7 deletions copydetect/detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,10 @@ class CodeFingerprint:
the "file" argument will not be used to load a file from disk
but will still be used for language detection and displayed on
the report.
encoding : str, default="utf-8"
Text encoding to use for reading the file. If "DETECT", the
chardet library (which must be installed) will be used to
automatically detect the file encoding.
Attributes
----------
Expand Down Expand Up @@ -75,11 +79,28 @@ class CodeFingerprint:
performing winnowing.
"""
def __init__(self, file, k, win_size, boilerplate=[], filter=True,
language=None, fp=None):
language=None, fp=None, encoding: str = "utf-8"):
if fp is not None:
code = fp.read()
elif encoding == "DETECT":
try:
import chardet
with open(file, "rb") as code_fp:
code = code_fp.read()
detected_encoding = chardet.detect(code)["encoding"]
if detected_encoding is not None:
code = code.decode(detected_encoding)
else:
# if encoding can't be detected, just use the default
# encoding (the file may be empty)
code = code.decode()
except ModuleNotFoundError as e:
logging.error(
"encoding detection requires chardet to be installed"
)
raise e
else:
with open(file, encoding="utf-8") as code_fp:
with open(file, encoding=encoding) as code_fp:
code = code_fp.read()
if filter:
filtered_code, offsets = filter_code(code, file, language)
Expand Down Expand Up @@ -223,6 +244,10 @@ class CopyDetector:
Path to output report file.
silent : bool
If true, all logging output will be suppressed.
encoding : str, default="utf-8"
Text encoding to use for reading the file. If "DETECT", the
chardet library (which must be installed) will be used to
automatically detect the file encoding.
"""
def __init__(self, config=None, test_dirs=[], ref_dirs=[],
boilerplate_dirs=[], extensions=["*"],
Expand All @@ -231,7 +256,8 @@ def __init__(self, config=None, test_dirs=[], ref_dirs=[],
display_t=defaults.DISPLAY_THRESHOLD,
same_name_only=False, ignore_leaf=False, autoopen=True,
disable_filtering=False, force_language=None,
truncate=False, out_file="./report.html", silent=False):
truncate=False, out_file="./report.html", silent=False,
encoding: str = "utf-8"):
if config is not None:
# temporary workaround to ensure existing code continues
# to work
Expand Down Expand Up @@ -265,6 +291,7 @@ def __init__(self, config=None, test_dirs=[], ref_dirs=[],
self.force_language = force_language
self.truncate = truncate
self.out_file = out_file
self.encoding = encoding

self._check_arguments()

Expand Down Expand Up @@ -458,10 +485,11 @@ def _get_boilerplate_hashes(self):
try:
fingerprint=CodeFingerprint(file, self.noise_t, 1,
filter=not self.disable_filtering,
language=self.force_language)
language=self.force_language,
encoding=self.encoding)
boilerplate_hashes.extend(fingerprint.hashes)
except UnicodeDecodeError:
logging.warning(f"Skipping {file}: file not ASCII text")
logging.warning(f"Skipping {file}: file not UTF-8 text")
continue

return np.unique(np.array(boilerplate_hashes))
Expand All @@ -479,10 +507,10 @@ def _preprocess_code(self, file_list):
self.file_data[code_f] = CodeFingerprint(
code_f, self.noise_t, self.window_size,
boilerplate_hashes, not self.disable_filtering,
self.force_language)
self.force_language, encoding=self.encoding)

except UnicodeDecodeError:
logging.warning(f"Skipping {code_f}: file not ASCII text")
logging.warning(f"Skipping {code_f}: file not UTF-8 text")
continue

def _comparison_loop(self):
Expand Down
1 change: 1 addition & 0 deletions docs/cmdline.rst
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,4 @@ Configuration options can be provided either by using the command line arguments
- ``disable_autoopen`` (``-a``, ``--disable-autoopen``): if ``true``, the detector will not automatically open a browser window to display the report.
- ``truncate`` (``-T``, ``--truncate``): if ``true``, highlighted code will be truncated to remove non-highlighted regions from the displayed output (sections not within 10 lines of highlighted code will be replaced with "...").
- ``out_file`` (``-O``, ``--out-file``): path to save output report to. A '.html' extension will be added to the path if not provided. If a directory is provided instead of a file, the report will be saved to that directory as report.html.
- ``encoding`` (``--encoding``): encoding to use for reading files (the default is UTF-8). If files use varying encodings, ``--encoding DETECT`` can be used to detect the encoding of each file individually *(note: encoding detection requires the chardet package)*.
Binary file added tests/sample_other/c_sample_utf16.c
Binary file not shown.
10 changes: 10 additions & 0 deletions tests/test_detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -211,3 +211,13 @@ def test_out_file(self, tmpdir):
detector.generate_html_report()

assert Path(tmpdir + "/report.html").exists()

def test_encoding_specification(self):
    """Check that a file is readable when a non-default encoding is given.

    A detector is configured with ``encoding="utf-16"``, a UTF-16 C
    sample is added explicitly, and the detector is run. The test then
    checks that the first fingerprint stored in ``file_data`` holds
    non-empty raw code.
    """
    utf16_sample = TESTS_DIR + "/sample_other/c_sample_utf16.c"
    detector = CopyDetector(
        test_dirs=[TESTS_DIR + "/sample_py"],
        extensions=["c"],
        encoding="utf-16",
        silent=True,
    )
    detector.add_file(utf16_sample)
    detector.run()

    # make sure utf-16 file was loaded correctly
    fingerprints = list(detector.file_data.values())
    first_fingerprint = fingerprints[0]
    assert len(first_fingerprint.raw_code) > 0

0 comments on commit e92ab23

Please sign in to comment.