diff --git a/floss/language/go/extract.py b/floss/language/go/extract.py
index 9aa0ffd27..c1e5bf95f 100644
--- a/floss/language/go/extract.py
+++ b/floss/language/go/extract.py
@@ -37,6 +37,9 @@ def find_stack_strings_with_regex(
         if not binary_string:
             continue
 
+        if binary_string.endswith(b"\x00"):
+            binary_string = binary_string[:-1]
+
         addr = m.start()
         # need to subtract opcode bytes offset
         off_regex = len(m.group(0)) - len(binary_string)
@@ -98,6 +101,9 @@ def find_i386_stackstrings(section_data, offset, min_length):
 def get_stackstrings(pe: pefile.PE, min_length: int) -> Iterable[StaticString]:
     """
     Find stackstrings in the given PE file.
+
+    TODO(mr-tz): algorithms need improvements / rethinking of approach
+    https://github.com/mandiant/flare-floss/issues/828
     """
 
     for section in pe.sections:
@@ -269,7 +275,9 @@ def get_string_blob_strings(pe: pefile.PE, min_length) -> Iterable[StaticString]
     with floss.utils.timing("find struct string candidates"):
         struct_strings = list(sorted(set(get_struct_string_candidates(pe)), key=lambda s: s.address))
         if not struct_strings:
-            logger.warning("Failed to find struct string candidates: Is this a Go binary?")
+            logger.warning(
+                "Failed to find struct string candidates: Is this a Go binary? If so, the Go version may be unsupported."
+            )
             return
 
     with floss.utils.timing("find string blob"):
@@ -354,12 +362,14 @@ def get_string_blob_strings(pe: pefile.PE, min_length) -> Iterable[StaticString]
     last_buf = string_blob_buf[last_pointer_offset:]
     for size in range(len(last_buf), 0, -1):
         try:
-            s = last_buf[:size].decode("utf-8")
+            _ = last_buf[:size].decode("utf-8")
         except UnicodeDecodeError:
             continue
         else:
             try:
-                string = StaticString.from_utf8(last_buf[:size], last_pointer, min_length)
+                string = StaticString.from_utf8(
+                    last_buf[:size], pe.get_offset_from_rva(last_pointer - image_base), min_length
+                )
                 yield string
             except ValueError:
                 pass
@@ -382,6 +392,25 @@ def extract_go_strings(sample, min_length) -> List[StaticString]:
     return go_strings
 
 
+def get_static_strings_from_blob_range(sample: pathlib.Path, static_strings: List[StaticString]) -> List[StaticString]:
+    pe = pefile.PE(data=pathlib.Path(sample).read_bytes(), fast_load=True)
+
+    struct_strings = list(sorted(set(get_struct_string_candidates(pe)), key=lambda s: s.address))
+    if not struct_strings:
+        return []
+
+    try:
+        string_blob_start, string_blob_end = find_string_blob_range(pe, struct_strings)
+    except ValueError:
+        return []
+
+    image_base = pe.OPTIONAL_HEADER.ImageBase
+    string_blob_start = pe.get_offset_from_rva(string_blob_start - image_base)
+    string_blob_end = pe.get_offset_from_rva(string_blob_end - image_base)
+
+    return list(filter(lambda s: string_blob_start <= s.offset < string_blob_end, static_strings))
+
+
 def main(argv=None):
     parser = argparse.ArgumentParser(description="Get Go strings")
     parser.add_argument("path", help="file or path to analyze")
diff --git a/floss/language/rust/extract.py b/floss/language/rust/extract.py
index 3d3dc3960..71c3495e0 100644
--- a/floss/language/rust/extract.py
+++ b/floss/language/rust/extract.py
@@ -123,6 +123,20 @@ def extract_rust_strings(sample: pathlib.Path, min_length: int) -> List[StaticSt
     return rust_strings
 
 
+def get_static_strings_from_rdata(sample, static_strings) -> List[StaticString]:
+    pe = pefile.PE(data=pathlib.Path(sample).read_bytes(), fast_load=True)
+
+    try:
+        rdata_section = get_rdata_section(pe)
+    except ValueError:
+        return []
+
+    start_rdata = rdata_section.PointerToRawData
+    end_rdata = start_rdata + rdata_section.SizeOfRawData
+
+    return list(filter(lambda s: start_rdata <= s.offset < end_rdata, static_strings))
+
+
 def get_string_blob_strings(pe: pefile.PE, min_length: int) -> Iterable[StaticString]:
     image_base = pe.OPTIONAL_HEADER.ImageBase
 
@@ -145,6 +159,11 @@ def get_string_blob_strings(pe: pefile.PE, min_length: int) -> Iterable[StaticSt
     # select only UTF-8 strings and adjust offset
     static_strings = filter_and_transform_utf8_strings(fixed_strings, start_rdata)
 
+    # TODO(mr-tz) - handle miss in rust-hello64.exe
+    # .rdata:00000001400C1270 0A aPanickedAfterP db 0Ah ; DATA XREF: .rdata:00000001400C12B8↓o
+    # .rdata:00000001400C1271 70 61 6E 69 63 6B 65 64… db 'panicked after panic::always_abort(), aborting.',0Ah,0
+    # .rdata:00000001400C12A2 00 00 00 00 00 00 align 8
+
     struct_string_addrs = map(lambda c: c.address, get_struct_string_candidates(pe))
 
     if pe.FILE_HEADER.Machine == pefile.MACHINE_TYPE["IMAGE_FILE_MACHINE_I386"]:
@@ -157,6 +176,11 @@ def get_string_blob_strings(pe: pefile.PE, min_length: int) -> Iterable[StaticSt
         xrefs_lea = find_lea_xrefs(pe)
         xrefs = itertools.chain(struct_string_addrs, xrefs_lea)
 
+        # TODO(mr-tz) - handle movdqa rust-hello64.exe
+        # .text:0000000140026046 66 0F 6F 05 02 71 09 00 movdqa xmm0, cs:xmmword_1400BD150
+        # .text:000000014002604E 66 0F 6F 0D 0A 71 09 00 movdqa xmm1, cs:xmmword_1400BD160
+        # .text:0000000140026056 66 0F 6F 15 12 71 09 00 movdqa xmm2, cs:xmmword_1400BD170
+
     else:
         logger.error("unsupported architecture: %s", pe.FILE_HEADER.Machine)
         return []
diff --git a/floss/main.py b/floss/main.py
index acd288acf..b5026143d 100644
--- a/floss/main.py
+++ b/floss/main.py
@@ -559,6 +559,10 @@ def main(argv=None) -> int:
     else:
         lang_id = identify_language(sample, static_strings)
 
+    # TODO(mr-tz): verify user-selected language makes sense and at least warn user
+    # include language version in results, if available
+    # https://github.com/mandiant/flare-floss/issues/900
+
     if lang_id == Language.GO:
         if analysis.enable_tight_strings or analysis.enable_stack_strings or analysis.enable_decoded_strings:
             logger.warning(
@@ -621,8 +625,11 @@ def main(argv=None) -> int:
             results.strings.language_strings = floss.language.go.extract.extract_go_strings(sample, args.min_length)
             results.metadata.runtime.language_strings = get_runtime_diff(interim)
 
+            # missed strings only includes non-identified strings in searched range
+            # here currently only focus on strings in string blob range
+            string_blob_strings = floss.language.go.extract.get_static_strings_from_blob_range(sample, static_strings)
             results.strings.language_strings_missed = floss.language.utils.get_missed_strings(
-                static_strings, results.strings.language_strings, args.min_length
+                string_blob_strings, results.strings.language_strings, args.min_length
             )
 
     elif lang_id == Language.RUST:
@@ -634,8 +641,10 @@ def main(argv=None) -> int:
             )
             results.metadata.runtime.language_strings = get_runtime_diff(interim)
 
+            # currently Rust strings are only extracted from the .rdata section
+            rdata_strings = floss.language.rust.extract.get_static_strings_from_rdata(sample, static_strings)
             results.strings.language_strings_missed = floss.language.utils.get_missed_strings(
-                static_strings, results.strings.language_strings, args.min_length
+                rdata_strings, results.strings.language_strings, args.min_length
             )
     if (
         results.analysis.enable_decoded_strings
diff --git a/floss/results.py b/floss/results.py
index 21a88834c..b6f6c397a 100644
--- a/floss/results.py
+++ b/floss/results.py
@@ -1,5 +1,6 @@
 # Copyright (C) 2021 Mandiant, Inc. All Rights Reserved.
 
+import re
 import json
 import datetime
 from enum import Enum
@@ -139,6 +140,9 @@ def from_utf8(cls, buf, addr, min_length):
         except UnicodeDecodeError:
             raise ValueError("not utf-8")
 
+        if not re.sub(r"[\r\n\t]", "", decoded_string).isprintable():
+            raise ValueError("not printable")
+
         if len(decoded_string) < min_length:
             raise ValueError("too short")
         return cls(string=decoded_string, offset=addr, encoding=StringEncoding.UTF8)
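
Note on the new StaticString.from_utf8 check above (illustration only, not part of the patch): str.isprintable() returns False for any string containing carriage returns, newlines, or tabs, which are legitimate in extracted strings, so those characters are stripped before the check while other control characters still cause the candidate to be rejected. A minimal sketch:

import re

assert not "config\nflag".isprintable()                         # a bare newline already fails isprintable()
assert re.sub(r"[\r\n\t]", "", "config\nflag").isprintable()    # stripped copy passes, so the string is kept
assert not re.sub(r"[\r\n\t]", "", "bad\x07bell").isprintable() # other control characters are still rejected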
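
The new range helpers can also be exercised on their own. A minimal sketch, assuming FLOSS is importable and that static_strings already holds the StaticString results of the static pass; report_missed_rust_strings and its parameters are illustrative names, not part of the patch:

import pathlib
from typing import List

import floss.language.rust.extract
import floss.language.utils
from floss.results import StaticString


def report_missed_rust_strings(sample: pathlib.Path, static_strings: List[StaticString], min_length: int) -> None:
    # limit the comparison to the searched range: only static strings whose
    # file offset falls inside the .rdata section are considered
    rdata_strings = floss.language.rust.extract.get_static_strings_from_rdata(sample, static_strings)

    rust_strings = floss.language.rust.extract.extract_rust_strings(sample, min_length)
    missed = floss.language.utils.get_missed_strings(rdata_strings, rust_strings, min_length)
    print(f"{len(missed)} static strings in .rdata were not recovered by the Rust extractor")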