From 9d70e5fc2246a7b38fe74f5e62c3e44543b4800f Mon Sep 17 00:00:00 2001 From: Volker Simonis Date: Sun, 14 Jan 2024 20:55:34 +0100 Subject: [PATCH] Don't dump pages which only contain zero bytes Introduces a new command line option '--skip-zero-pages' which detects pages which only contain zero bytes and prevents them from being dumped into the process image file. It is a potentially expensive operation because it checks for every single process page if it contains only zeros, but it can significantly decrease the image size and improve the startup-time if many such pages exist. It effectively replaces such pages with the kernel's zero-page on restore. Signed-off-by: Volker Simonis --- Documentation/criu.txt | 8 + criu/config.c | 1 + criu/cr-service.c | 3 + criu/crtools.c | 1 + criu/include/cr_options.h | 1 + criu/include/stats.h | 2 + criu/mem.c | 53 +++++- criu/stats.c | 7 + images/rpc.proto | 1 + images/stats.proto | 2 + lib/c/criu.c | 11 ++ test/javaTests/pom.xml | 1 + .../criu/java/tests/CheckpointRestore.java | 2 +- test/javaTests/test-zero.xml | 89 ++++++++++ test/zdtm.py | 9 +- test/zdtm/static/Makefile | 1 + test/zdtm/static/zero_pages.c | 160 ++++++++++++++++++ test/zdtm/static/zero_pages.desc | 1 + 18 files changed, 346 insertions(+), 7 deletions(-) create mode 100644 test/javaTests/test-zero.xml create mode 100644 test/zdtm/static/zero_pages.c create mode 100644 test/zdtm/static/zero_pages.desc diff --git a/Documentation/criu.txt b/Documentation/criu.txt index 606935790b..d1d34045a0 100644 --- a/Documentation/criu.txt +++ b/Documentation/criu.txt @@ -369,6 +369,14 @@ mount -t cgroup -o devices,freezer none devices,freezer Deduplicate "old" data in pages images of previous *dump*. This option implies incremental *dump* mode (see the *pre-dump* command). +*--skip-zero-pages*:: + Don't dump pages containing only zero bytes. 
This is a + potentially expensive operation because it checks for + every single process page if it contains only zeros, but + it can significantly decrease the image size and improve the + startup-time if many such pages exist. It effectively + replaces such pages with the kernel's zero-page on restore. + *-l*, *--file-locks*:: Dump file locks. It is necessary to make sure that all file lock users are taken into dump, so it is only safe to use this for enclosed containers diff --git a/criu/config.c b/criu/config.c index 1322a490ab..961bc4367b 100644 --- a/criu/config.c +++ b/criu/config.c @@ -650,6 +650,7 @@ int parse_options(int argc, char **argv, bool *usage_error, bool *has_exec_cmd, { "ms", no_argument, 0, 1054 }, BOOL_OPT("track-mem", &opts.track_mem), BOOL_OPT("auto-dedup", &opts.auto_dedup), + BOOL_OPT("skip-zero-pages", &opts.skip_zero_pages), { "libdir", required_argument, 0, 'L' }, { "cpu-cap", optional_argument, 0, 1057 }, BOOL_OPT("force-irmap", &opts.force_irmap), diff --git a/criu/cr-service.c b/criu/cr-service.c index 61a04c5ffe..19a1448efa 100644 --- a/criu/cr-service.c +++ b/criu/cr-service.c @@ -541,6 +541,9 @@ static int setup_opts_from_req(int sk, CriuOpts *req) if (req->has_auto_dedup) opts.auto_dedup = req->auto_dedup; + if (req->has_skip_zero_pages) + opts.skip_zero_pages = req->skip_zero_pages; + if (req->has_force_irmap) opts.force_irmap = req->force_irmap; diff --git a/criu/crtools.c b/criu/crtools.c index 94657f4186..5de3ab724e 100644 --- a/criu/crtools.c +++ b/criu/crtools.c @@ -541,6 +541,7 @@ int main(int argc, char *argv[], char *envp[]) " pages images of previous dump\n" " when used on restore, as soon as page is restored, it\n" " will be punched from the image\n" + " --skip-zero-pages don't dump pages containing only zero bytes.\n" " --pre-dump-mode splice - parasite based pre-dumping (default)\n" " read - process_vm_readv syscall based pre-dumping\n" "\n" diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h index 
60cf9437e6..b3a477550d 100644 --- a/criu/include/cr_options.h +++ b/criu/include/cr_options.h @@ -157,6 +157,7 @@ struct cr_options { int track_mem; char *img_parent; int auto_dedup; + int skip_zero_pages; unsigned int cpu_cap; int force_irmap; char **exec_cmd; diff --git a/criu/include/stats.h b/criu/include/stats.h index d8dd159989..841ebdfe01 100644 --- a/criu/include/stats.h +++ b/criu/include/stats.h @@ -33,6 +33,8 @@ enum { CNT_SHPAGES_SKIPPED_PARENT, CNT_SHPAGES_WRITTEN, + CNT_SKIPPED_ZERO_PAGES, + DUMP_CNT_NR_STATS, }; diff --git a/criu/mem.c b/criu/mem.c index f56ed826b3..6ac7adb6b8 100644 --- a/criu/mem.c +++ b/criu/mem.c @@ -3,8 +3,10 @@ #include #include #include +#include #include #include +#include #include "types.h" #include "cr_options.h" @@ -31,6 +33,7 @@ #include "prctl.h" #include "compel/infect-util.h" #include "pidfd-store.h" +#include "xmalloc.h" #include "protobuf.h" #include "images/pagemap.pb-c.h" @@ -191,11 +194,33 @@ static int generate_iovs(struct pstree_item *item, struct vma_area *vma, struct bool has_parent) { unsigned long nr_scanned; - unsigned long pages[3] = {}; + /* Counters for PAGES_SKIPPED_PARENT, PAGES_LAZY, PAGES_WRITTEN and SKIPPED_ZERO_PAGES */ + unsigned long pages[4] = {}; unsigned long vaddr; bool dump_all_pages; int ret = 0; + static char *zero_page = NULL; + static char *remote_page = NULL; + int zero = 0; + static struct iovec local[2]; /* static: filled in only once, when zero_page is allocated */ + static struct iovec remote[1]; /* static: iov_len is set once, iov_base per scanned page */ + int nread = 0; + if (opts.skip_zero_pages && zero_page == NULL) { + zero_page = xmalloc(PAGE_SIZE); + remote_page = xmalloc(PAGE_SIZE); + if (zero_page == NULL || remote_page == NULL) { + pr_warn("Can't allocate memory - disabling --skip-zero-pages\n"); + opts.skip_zero_pages = 0; + } else { + memzero(zero_page, PAGE_SIZE); + local[0].iov_base = remote_page; + local[0].iov_len = PAGE_SIZE; + remote[0].iov_base = (void *)0x0; + remote[0].iov_len = PAGE_SIZE; + } + } + dump_all_pages = should_dump_entire_vma(vma->e); nr_scanned = 0; @@ -207,9 +232,25 @@ static int 
generate_iovs(struct pstree_item *item, struct vma_area *vma, struct /* If dump_all_pages is true, should_dump_page is called to get pme. */ next = should_dump_page(pmc, vma->e, vaddr, &softdirty); - if (!dump_all_pages && next != vaddr) { - vaddr = next - PAGE_SIZE; - continue; + if (!dump_all_pages) { + if (next != vaddr) { + vaddr = next - PAGE_SIZE; + continue; + } else if (opts.skip_zero_pages) { + remote[0].iov_base = (void *)vaddr; + nread = process_vm_readv(item->pid->real, local, 1, remote, 1, 0); + if (nread == PAGE_SIZE) { + zero = memcmp(zero_page, remote_page, PAGE_SIZE); + /* + * If the page contains just zeros we can treat it like the zero page and skip it. + * At restore it will be replaced by a reference to the zero page and COWed if accessed. + */ + if (zero == 0) { + pages[3]++; + continue; + } + } + } } if (vma_entry_can_be_lazy(vma->e) && !is_stack(item, vaddr)) @@ -247,8 +288,10 @@ static int generate_iovs(struct pstree_item *item, struct vma_area *vma, struct cnt_add(CNT_PAGES_SKIPPED_PARENT, pages[0]); cnt_add(CNT_PAGES_LAZY, pages[1]); cnt_add(CNT_PAGES_WRITTEN, pages[2]); + cnt_add(CNT_SKIPPED_ZERO_PAGES, pages[3]); - pr_info("Pagemap generated: %lu pages (%lu lazy) %lu holes\n", pages[2] + pages[1], pages[1], pages[0]); + pr_info("Pagemap generated: %lu pages (%lu lazy) %lu holes %lu skipped zero\n", + pages[2] + pages[1], pages[1], pages[0], pages[3]); return ret; } diff --git a/criu/stats.c b/criu/stats.c index 0a9b4f5d47..40b90aaf2f 100644 --- a/criu/stats.c +++ b/criu/stats.c @@ -134,6 +134,9 @@ static void display_stats(int what, StatsEntry *stats) stats->dump->pages_skipped_parent, stats->dump->pages_skipped_parent); pr_msg("Memory pages written: %" PRIu64 " (0x%" PRIx64 ")\n", stats->dump->pages_written, stats->dump->pages_written); + if (stats->dump->has_skipped_zero_pages) + pr_msg("Memory pages skipped because zero: %" PRIu64 " (0x%" PRIx64 ")\n", + stats->dump->skipped_zero_pages, stats->dump->skipped_zero_pages); pr_msg("Lazy 
memory pages: %" PRIu64 " (0x%" PRIx64 ")\n", stats->dump->pages_lazy, stats->dump->pages_lazy); } else if (what == RESTORE_STATS) { @@ -178,6 +181,10 @@ void write_stats(int what) ds_entry.has_page_pipes = true; ds_entry.page_pipe_bufs = dstats->counts[CNT_PAGE_PIPE_BUFS]; ds_entry.has_page_pipe_bufs = true; + if (opts.skip_zero_pages) { + ds_entry.has_skipped_zero_pages = true; + ds_entry.skipped_zero_pages = dstats->counts[CNT_SKIPPED_ZERO_PAGES]; + } ds_entry.shpages_scanned = dstats->counts[CNT_SHPAGES_SCANNED]; ds_entry.has_shpages_scanned = true; diff --git a/images/rpc.proto b/images/rpc.proto index 1a4722a9ce..b73799a8c2 100644 --- a/images/rpc.proto +++ b/images/rpc.proto @@ -145,6 +145,7 @@ message criu_opts { optional bool leave_stopped = 69; optional bool display_stats = 70; optional bool log_to_stderr = 71; + optional bool skip_zero_pages = 72; /* optional bool check_mounts = 128; */ } diff --git a/images/stats.proto b/images/stats.proto index 64e46181da..aafd89992d 100644 --- a/images/stats.proto +++ b/images/stats.proto @@ -22,6 +22,8 @@ message dump_stats_entry { optional uint64 shpages_scanned = 12; optional uint64 shpages_skipped_parent = 13; optional uint64 shpages_written = 14; + + optional uint64 skipped_zero_pages = 15; } message restore_stats_entry { diff --git a/lib/c/criu.c b/lib/c/criu.c index 7f766db857..a8402233a9 100644 --- a/lib/c/criu.c +++ b/lib/c/criu.c @@ -387,6 +387,17 @@ void criu_set_auto_dedup(bool auto_dedup) criu_local_set_auto_dedup(global_opts, auto_dedup); } +void criu_local_set_skip_zero_pages(criu_opts *opts, bool skip_zero_pages) +{ + opts->rpc->has_skip_zero_pages = true; + opts->rpc->skip_zero_pages = skip_zero_pages; +} + +void criu_set_skip_zero_pages(bool skip_zero_pages) +{ + criu_local_set_skip_zero_pages(global_opts, skip_zero_pages); +} + void criu_local_set_force_irmap(criu_opts *opts, bool force_irmap) { opts->rpc->has_force_irmap = true; diff --git a/test/javaTests/pom.xml b/test/javaTests/pom.xml index 
ddb6c89cf1..8a0b5bd4b3 100644 --- a/test/javaTests/pom.xml +++ b/test/javaTests/pom.xml @@ -18,6 +18,7 @@ test.xml + test-zero.xml diff --git a/test/javaTests/src/org/criu/java/tests/CheckpointRestore.java b/test/javaTests/src/org/criu/java/tests/CheckpointRestore.java index 860619c267..6c22c7260b 100644 --- a/test/javaTests/src/org/criu/java/tests/CheckpointRestore.java +++ b/test/javaTests/src/org/criu/java/tests/CheckpointRestore.java @@ -112,7 +112,7 @@ public void runtest(String testName, String checkpointOpt, String restoreOpt) th String pid; int exitCode; - System.out.println("======= Testing " + testName + " ========"); + System.out.println("======= Testing " + testName + " " + checkpointOpt + " ========"); testSetup(testName); diff --git a/test/javaTests/test-zero.xml b/test/javaTests/test-zero.xml new file mode 100644 index 0000000000..fe4d65471e --- /dev/null +++ b/test/javaTests/test-zero.xml @@ -0,0 +1,89 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/test/zdtm.py b/test/zdtm.py index 7a7cdfd3b6..d7f1191566 100755 --- a/test/zdtm.py +++ b/test/zdtm.py @@ -1052,6 +1052,7 @@ def __init__(self, opts): self.__sat = bool(opts['sat']) self.__dedup = bool(opts['dedup']) self.__mdedup = bool(opts['noauto_dedup']) + self.__skip_zero_pages = bool(opts['skip_zero_pages']) self.__user = bool(opts['user']) self.__rootless = bool(opts['rootless']) self.__leave_stopped = bool(opts['stop']) @@ -1381,6 +1382,9 @@ def dump(self, action, opts=[]): if self.__dedup: a_opts += ["--auto-dedup"] + if self.__skip_zero_pages: + a_opts += ["--skip-zero-pages"] + a_opts += ["--timeout", "10"] criu_dir = os.path.dirname(os.getcwd()) @@ -2083,7 +2087,7 @@ def run_test(self, name, desc, flavor): 'dedup', 'sbs', 'freezecg', 'user', 'dry_run', 'noauto_dedup', 'remote_lazy_pages', 'show_stats', 'lazy_migrate', 'stream', 'tls', 'criu_bin', 
'crit_bin', 'pre_dump_mode', 'mntns_compat_mode', - 'rootless') + 'rootless', 'skip_zero_pages') arg = repr((name, desc, flavor, {d: self.__opts[d] for d in nd})) if self.__use_log: @@ -2697,6 +2701,9 @@ def get_cli_args(): rp.add_argument("--noauto-dedup", help="Manual deduplicate images on iterations", action='store_true') + rp.add_argument("--skip-zero-pages", + help="Don't dump pages containing only zero bytes", + action='store_true') rp.add_argument("--nocr", help="Do not CR anything, just check test works", action='store_true') diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index fb856d55b4..9689a939a3 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -271,6 +271,7 @@ TST_NOFILE := \ sigtrap01 \ change_mnt_context \ fd_offset \ + zero_pages \ # jobctl00 \ PKG_CONFIG ?= pkg-config diff --git a/test/zdtm/static/zero_pages.c b/test/zdtm/static/zero_pages.c new file mode 100644 index 0000000000..f65667904d --- /dev/null +++ b/test/zdtm/static/zero_pages.c @@ -0,0 +1,160 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check the --skip-zero-pages flag"; +const char *test_author = "Volker Simonis "; + +#define PME_PFRAME_MASK ((1ULL << 55) - 1) +uint64_t zero_page_pfn; +int page_size; +int pagemap; + +static uint64_t vaddr_to_pfn(unsigned long vaddr) +{ + uint64_t pfn; + off_t off = (vaddr / page_size) * sizeof(uint64_t); + if (pread(pagemap, &pfn, sizeof(pfn), off) != sizeof(pfn)) { + pr_perror("Can't read pme"); + exit(1); + } else { + return (pfn & PME_PFRAME_MASK); + } +} + +static void init_zero_page_pfn(void) +{ + void *addr; + if ((pagemap = open("/proc/self/pagemap", O_RDONLY)) == -1) { + pr_perror("Can't open /proc/self/pagemap"); + exit(1); + } + if ((addr = mmap(NULL, page_size, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0)) == MAP_FAILED) { + pr_perror("Unable to map zero page"); + exit(1); + } + if (*((int *)addr) 
!= 0) { + pr_err("Newly mapped page must be zero\n"); + exit(1); + } + zero_page_pfn = vaddr_to_pfn((unsigned long)addr); + munmap(addr, page_size); + + if (zero_page_pfn == 0) { + pr_err("zero_page_pfn is invalid.\n"); + exit(1); + } + fprintf(stderr, "zero_page_pfn = %llu\n", (unsigned long long)zero_page_pfn); +} + +static int pages_in_mem(char *addr, int nr_of_pages) +{ + int counter = 0; + unsigned char pages[nr_of_pages]; + if (mincore(addr, page_size * nr_of_pages, pages) == -1) { + pr_perror("Can't call mincore"); + exit(1); + } + for (int i = 0; i < nr_of_pages; i++) { + if ((pages[i] & 0x1)) { + counter++; + } + } + return counter; +} + +static int zero_pages(char *addr, int nr_of_pages) +{ + int counter = 0; + for (int i = 0; i < nr_of_pages; i++, addr += page_size) { + if (vaddr_to_pfn((unsigned long)addr) == zero_page_pfn) { + counter++; + } + } + return counter; +} + +int main(int argc, char **argv) +{ + char *addr; + int nr_of_pages = 64; + + test_init(argc, argv); + + page_size = sysconf(_SC_PAGESIZE); + + init_zero_page_pfn(); + + addr = (char *)mmap(NULL, page_size * nr_of_pages, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (addr == MAP_FAILED) { + pr_perror("Can't mmap %d bytes", page_size * nr_of_pages); + exit(1); + } + /* Check that pages are not in memory yet */ + if (pages_in_mem(addr, nr_of_pages) != 0) { + pr_err("Pages shouldn't be in memory yet.\n"); + exit(1); + } + for (int i = 0; i < nr_of_pages; i++) { + /* Read pages to bring them into memory */ + if (addr[i * page_size] != 0) { + pr_err("All pages should have zero content.\n"); + exit(1); + } + } + /* Check that all pages reference the zero page */ + if (zero_pages(addr, nr_of_pages) != nr_of_pages) { + pr_err("All pages should reference the zero page.\n"); + exit(1); + } + for (int i = 0; i < nr_of_pages; i++) { + /* Write pages to COW them */ + addr[i * page_size] = 0; + } + /* Check that mincore reports all written pages as resident */ + if (pages_in_mem(addr, nr_of_pages) != 
nr_of_pages) { + pr_err("All pages should be in memory.\n"); + exit(1); + } + + test_daemon(); + test_waitsig(); + + /* Check that pages are not in memory yet */ + if (pages_in_mem(addr, nr_of_pages) != 0) { + fail("Pages shouldn't be in memory yet.\n"); + goto out; + } + for (int i = 0; i < nr_of_pages; i++) { + /* Read pages to bring them into memory */ + if (addr[i * page_size] != 0) { + fail("All pages should have zero content.\n"); + } + } + /* Check that all pages reference the zero page */ + if (zero_pages(addr, nr_of_pages) != nr_of_pages) { + fail("All pages should reference the zero page.\n"); + goto out; + } + for (int i = 0; i < nr_of_pages; i++) { + /* Write pages to COW them */ + addr[i * page_size] = 0; + } + /* Check that all pages are mapped to distinct physical pages */ + if (pages_in_mem(addr, nr_of_pages) != nr_of_pages) { + fail("All pages should be in memory.\n"); + goto out; + } + + pass(); +out: + return 0; +} diff --git a/test/zdtm/static/zero_pages.desc b/test/zdtm/static/zero_pages.desc new file mode 100644 index 0000000000..b9804bf35b --- /dev/null +++ b/test/zdtm/static/zero_pages.desc @@ -0,0 +1 @@ +{'flavor': 'h', 'opts': '--skip-zero-pages', 'flags': 'suid'}