diff --git a/criu/include/mem.h b/criu/include/mem.h
index 03574ea3d7..3618c9cc3b 100644
--- a/criu/include/mem.h
+++ b/criu/include/mem.h
@@ -7,6 +7,7 @@
 #include "pid.h"
 #include "proc_parse.h"
 #include "inventory.pb-c.h"
+#include "pagemap-cache.h"
 
 struct parasite_ctl;
 struct vm_area_list;
@@ -47,5 +48,6 @@ int open_vmas(struct pstree_item *t);
 int prepare_vmas(struct pstree_item *t, struct task_restore_args *ta);
 int unmap_guard_pages(struct pstree_item *t);
 int prepare_mappings(struct pstree_item *t);
-bool should_dump_page(VmaEntry *vmae, u64 pme);
+
+u64 should_dump_page(pmc_t *pmc, VmaEntry *vmae, u64 vaddr, bool *softdirty);
 #endif /* __CR_MEM_H__ */
diff --git a/criu/include/pagemap-cache.h b/criu/include/pagemap-cache.h
index 1d8bbffaf6..875e69e560 100644
--- a/criu/include/pagemap-cache.h
+++ b/criu/include/pagemap-cache.h
@@ -1,10 +1,12 @@
 #ifndef __CR_PAGEMAP_H__
 #define __CR_PAGEMAP_H__
 
+#include 
 #include 
 
 #include "int.h"
 #include "common/list.h"
+#include "pagemap_scan.h"
 
 struct vma_area;
@@ -15,9 +17,15 @@ typedef struct {
 	unsigned long start;			/* start of area */
 	unsigned long end;			/* end of area */
 	const struct list_head *vma_head;	/* list head of VMAs we're serving */
+	int fd;					/* file to read PMs from */
+
 	u64 *map;				/* local buffer */
 	size_t map_len;				/* length of a buffer */
-	int fd;					/* file to read PMs from */
+
+	struct page_region *regs;		/* buffer for the PAGEMAP_SCAN ioctl */
+	size_t regs_len;			/* actual length of regs */
+	size_t regs_max_len;			/* maximum length of regs */
+	size_t regs_idx;			/* current index in the regs array */
 } pmc_t;
 
 #define PMC_INIT \
@@ -26,7 +34,8 @@
 	}
 
 extern int pmc_init(pmc_t *pmc, pid_t pid, const struct list_head *vma_head, size_t size);
-extern u64 *pmc_get_map(pmc_t *pmc, const struct vma_area *vma);
+extern int pmc_get_map(pmc_t *pmc, const struct vma_area *vma);
 extern void pmc_fini(pmc_t *pmc);
+extern int pmc_fill(pmc_t *pmc, u64 start, u64 end);
 
 #endif /* __CR_PAGEMAP_H__ */
diff --git a/criu/include/shmem.h b/criu/include/shmem.h
index 813ef630ef..15cab11464 100644
--- a/criu/include/shmem.h
+++ b/criu/include/shmem.h
@@ -4,13 +4,14 @@
 #include "int.h"
 #include "common/lock.h"
 #include "images/vma.pb-c.h"
+#include "pagemap-cache.h"
 
 struct vma_area;
 
 extern int collect_shmem(int pid, struct vma_area *vma);
 extern int collect_sysv_shmem(unsigned long shmid, unsigned long size);
 extern int cr_dump_shmem(void);
-extern int add_shmem_area(pid_t pid, VmaEntry *vma, u64 *map);
+extern int add_shmem_area(pid_t pid, VmaEntry *vma, pmc_t *pmc);
 extern int fixup_sysv_shmems(void);
 extern int dump_one_memfd_shmem(int fd, unsigned long shmid, unsigned long size);
 extern int dump_one_sysv_shmem(void *addr, unsigned long size, unsigned long shmid);
diff --git a/criu/mem.c b/criu/mem.c
index 417e0a21de..f56ed826b3 100644
--- a/criu/mem.c
+++ b/criu/mem.c
@@ -99,7 +99,7 @@ static inline bool __page_in_parent(bool dirty)
 	return opts.track_mem && opts.img_parent && !dirty;
 }
 
-bool should_dump_page(VmaEntry *vmae, u64 pme)
+static bool should_dump_entire_vma(VmaEntry *vmae)
 {
 	/*
	 * vDSO area must be always dumped because on restore
@@ -107,30 +107,53 @@ bool should_dump_page(VmaEntry *vmae, u64 pme)
 	 */
 	if (vma_entry_is(vmae, VMA_AREA_VDSO))
 		return true;
-	/*
-	 * In turn VVAR area is special and referenced from
-	 * vDSO area by IP addressing (at least on x86) thus
-	 * never ever dump its content but always use one provided
-	 * by the kernel on restore, ie runtime VVAR area must
-	 * be remapped into proper place..
-	 */
-	if (vma_entry_is(vmae, VMA_AREA_VVAR))
-		return false;
-
-	/*
-	 * Optimisation for private mapping pages, that haven't
-	 * yet being COW-ed
-	 */
-	if (vma_entry_is(vmae, VMA_FILE_PRIVATE) && (pme & PME_FILE))
-		return false;
 
 	if (vma_entry_is(vmae, VMA_AREA_AIORING))
 		return true;
-	if ((pme & (PME_PRESENT | PME_SWAP)) && !__page_is_zero(pme))
-		return true;
 	return false;
 }
 
+/*
+ * should_dump_page returns vaddr if the addressed page has to be dumped.
+ * Otherwise, it returns the address that has to be inspected next.
+ */
+u64 should_dump_page(pmc_t *pmc, VmaEntry *vmae, u64 vaddr, bool *softdirty)
+{
+	if (vaddr >= pmc->end && pmc_fill(pmc, vaddr, vmae->end))
+		return -1;
+
+	if (pmc->regs) {
+		while (1) {
+			if (pmc->regs_idx == pmc->regs_len)
+				return pmc->end;
+			if (vaddr < pmc->regs[pmc->regs_idx].end)
+				break;
+			pmc->regs_idx++;
+		}
+		if (vaddr < pmc->regs[pmc->regs_idx].start)
+			return pmc->regs[pmc->regs_idx].start;
+		if (softdirty)
+			*softdirty = pmc->regs[pmc->regs_idx].categories & PAGE_IS_SOFT_DIRTY;
+		return vaddr;
+	} else {
+		u64 pme = pmc->map[PAGE_PFN(vaddr - pmc->start)];
+
+		/*
+		 * Optimisation for private mapping pages that haven't
+		 * been COW-ed yet.
+		 */
+		if (vma_entry_is(vmae, VMA_FILE_PRIVATE) && (pme & PME_FILE))
+			return vaddr + PAGE_SIZE;
+		if ((pme & (PME_PRESENT | PME_SWAP)) && !__page_is_zero(pme)) {
+			if (softdirty)
+				*softdirty = pme & PME_SOFT_DIRTY;
+			return vaddr;
+		}
+
+		return vaddr + PAGE_SIZE;
+	}
+}
+
 bool page_is_zero(u64 pme)
 {
 	return __page_is_zero(pme);
@@ -164,25 +187,30 @@ static bool is_stack(struct pstree_item *item, unsigned long vaddr)
  * the memory contents is present in the parent image set.
  */
 
-static int generate_iovs(struct pstree_item *item, struct vma_area *vma, struct page_pipe *pp, u64 *map, u64 *off,
+static int generate_iovs(struct pstree_item *item, struct vma_area *vma, struct page_pipe *pp, pmc_t *pmc, u64 *pvaddr,
 			 bool has_parent)
 {
-	u64 *at = &map[PAGE_PFN(*off)];
-	unsigned long pfn, nr_to_scan;
+	unsigned long nr_scanned;
 	unsigned long pages[3] = {};
+	unsigned long vaddr;
+	bool dump_all_pages;
 	int ret = 0;
 
-	nr_to_scan = (vma_area_len(vma) - *off) / PAGE_SIZE;
+	dump_all_pages = should_dump_entire_vma(vma->e);
 
-	for (pfn = 0; pfn < nr_to_scan; pfn++) {
-		unsigned long vaddr;
+	nr_scanned = 0;
+	for (vaddr = *pvaddr; vaddr < vma->e->end; vaddr += PAGE_SIZE, nr_scanned++) {
 		unsigned int ppb_flags = 0;
+		bool softdirty = false;
+		u64 next;
 		int st;
 
-		if (!should_dump_page(vma->e, at[pfn]))
+		/* Even when dump_all_pages is set, should_dump_page() is called to fill in softdirty. */
+		next = should_dump_page(pmc, vma->e, vaddr, &softdirty);
+		if (!dump_all_pages && next != vaddr) {
+			vaddr = next - PAGE_SIZE;
 			continue;
-
-		vaddr = vma->e->start + *off + pfn * PAGE_SIZE;
+		}
 
 		if (vma_entry_can_be_lazy(vma->e) && !is_stack(item, vaddr))
 			ppb_flags |= PPB_LAZY;
@@ -194,7 +222,7 @@ static int generate_iovs(struct pstree_item *item, struct vma_area *vma, struct
 		 * page. The latter would be checked in page-xfer.
 		 */
 
-		if (has_parent && page_in_parent(at[pfn] & PME_SOFT_DIRTY)) {
+		if (has_parent && page_in_parent(softdirty)) {
 			ret = page_pipe_add_hole(pp, vaddr, PP_HOLE_PARENT);
 			st = 0;
 		} else {
@@ -214,9 +242,8 @@ static int generate_iovs(struct pstree_item *item, struct vma_area *vma, struct
 		pages[st]++;
 	}
 
-	*off += pfn * PAGE_SIZE;
-
-	cnt_add(CNT_PAGES_SCANNED, nr_to_scan);
+	*pvaddr = vaddr;
+	cnt_add(CNT_PAGES_SCANNED, nr_scanned);
 	cnt_add(CNT_PAGES_SKIPPED_PARENT, pages[0]);
 	cnt_add(CNT_PAGES_LAZY, pages[1]);
 	cnt_add(CNT_PAGES_WRITTEN, pages[2]);
@@ -356,12 +383,20 @@ static int generate_vma_iovs(struct pstree_item *item, struct vma_area *vma, str
 			      struct page_xfer *xfer, struct parasite_dump_pages_args *args, struct parasite_ctl *ctl,
 			      pmc_t *pmc, bool has_parent, bool pre_dump, int parent_predump_mode)
 {
-	u64 off = 0;
-	u64 *map;
+	u64 vaddr;
 	int ret;
 
 	if (!vma_area_is_private(vma, kdat.task_size) && !vma_area_is(vma, VMA_ANON_SHARED))
 		return 0;
 
+	/*
+	 * In turn VVAR area is special and referenced from
+	 * vDSO area by IP addressing (at least on x86) thus
+	 * never ever dump its content but always use one provided
+	 * by the kernel on restore, i.e. the runtime VVAR area must
+	 * be remapped into its proper place.
+	 */
+	if (vma_entry_is(vma->e, VMA_AREA_VVAR))
+		return 0;
 	/*
 	 * To facilitate any combination of pre-dump modes to run after
@@ -421,15 +456,14 @@ static int generate_vma_iovs(struct pstree_item *item, struct vma_area *vma, str
 		has_parent = false;
 	}
 
-	map = pmc_get_map(pmc, vma);
-	if (!map)
+	if (pmc_get_map(pmc, vma))
 		return -1;
 
 	if (vma_area_is(vma, VMA_ANON_SHARED))
-		return add_shmem_area(item->pid->real, vma->e, map);
-
+		return add_shmem_area(item->pid->real, vma->e, pmc);
+	vaddr = vma->e->start;
 again:
-	ret = generate_iovs(item, vma, pp, map, &off, has_parent);
+	ret = generate_iovs(item, vma, pp, pmc, &vaddr, has_parent);
 	if (ret == -EAGAIN) {
 		BUG_ON(!(pp->flags & PP_CHUNK_MODE));
 
diff --git a/criu/pagemap-cache.c b/criu/pagemap-cache.c
index 09dbc6a363..d9bd1bc86a 100644
--- a/criu/pagemap-cache.c
+++ b/criu/pagemap-cache.c
@@ -1,5 +1,6 @@
 #include 
 #include 
+#include <sys/ioctl.h>
 
 #include "page.h"
 #include "pagemap-cache.h"
@@ -22,6 +23,8 @@
 
 #define PAGEMAP_LEN(addr) (PAGE_PFN(addr) * sizeof(u64))
 
+#define PAGE_REGIONS_MAX_NR 32768
+
 /*
  * It's a workaround for a kernel bug. In the 3.19 kernel when pagemap are read
  * for a few vma-s for one read call, it returns incorrect data.
@@ -50,10 +53,23 @@ int pmc_init(pmc_t *pmc, pid_t pid, const struct list_head *vma_head, size_t siz
 	pmc->pid = pid;
 	pmc->map_len = PAGEMAP_LEN(map_size);
 	pmc->vma_head = vma_head;
-
-	pmc->map = xmalloc(pmc->map_len);
-	if (!pmc->map)
-		goto err;
+	pmc->regs_max_len = PAGE_PFN(map_size);
+	if (pmc->regs_max_len > PAGE_REGIONS_MAX_NR)
+		pmc->regs_max_len = PAGE_REGIONS_MAX_NR;
+	pmc->regs_len = 0;
+	pmc->regs_idx = 0;
+	pmc->regs = NULL;
+	pmc->map = NULL;
+
+	if (kdat.has_pagemap_scan) {
+		pmc->regs = xmalloc(pmc->regs_max_len * sizeof(struct page_region));
+		if (!pmc->regs)
+			goto err;
+	} else {
+		pmc->map = xmalloc(pmc->map_len);
+		if (!pmc->map)
+			goto err;
+	}
 
 	if (pagemap_cache_disabled)
 		pr_warn_once("The pagemap cache is disabled\n");
@@ -87,17 +103,11 @@ int pmc_init(pmc_t *pmc, pid_t pid, const struct list_head *vma_head, size_t siz
 	return -1;
 }
 
-static inline u64 *__pmc_get_map(pmc_t *pmc, unsigned long addr)
-{
-	return &pmc->map[PAGE_PFN(addr - pmc->start)];
-}
-
 static int pmc_fill_cache(pmc_t *pmc, const struct vma_area *vma)
 {
 	unsigned long low = vma->e->start & PMC_MASK;
 	unsigned long high = low + PMC_SIZE;
 	size_t len = vma_area_len(vma);
-	size_t size_map;
 
 	if (high > kdat.task_size)
 		high = kdat.task_size;
@@ -149,39 +159,79 @@ static int pmc_fill_cache(pmc_t *pmc, const struct vma_area *vma)
 		pr_debug("\t%d: simple mode [l:%lx h:%lx]\n", pmc->pid, pmc->start, pmc->end);
 	}
 
+	return pmc_fill(pmc, pmc->start, pmc->end);
+}
+
+int pmc_fill(pmc_t *pmc, u64 start, u64 end)
+{
+	size_t size_map;
+
+	pmc->start = start;
+	pmc->end = end;
+
 	size_map = PAGEMAP_LEN(pmc->end - pmc->start);
 	BUG_ON(pmc->map_len < size_map);
 	BUG_ON(pmc->fd < 0);
 
-	if (pread(pmc->fd, pmc->map, size_map, PAGEMAP_PFN_OFF(pmc->start)) != size_map) {
-		pmc_zap(pmc);
-		pr_perror("Can't read %d's pagemap file", pmc->pid);
-		return -1;
+	if (pmc->regs) {
+		struct pm_scan_arg args = {
+			.size = sizeof(struct pm_scan_arg),
+			.flags = 0,
+			.start = pmc->start,
+			.end = pmc->end,
+			.vec = (long)pmc->regs,
+			.vec_len = pmc->regs_max_len,
+			.max_pages = 0,
+			/*
+			 * Request pages that are in RAM or swap, excluding
+			 * zero-filled and file-backed pages.
+ */ + .category_inverted = PAGE_IS_PFNZERO | PAGE_IS_FILE, + .category_mask = PAGE_IS_PFNZERO | PAGE_IS_FILE, + .category_anyof_mask = PAGE_IS_PRESENT | PAGE_IS_SWAPPED, + .return_mask = PAGE_IS_PRESENT | PAGE_IS_SWAPPED | PAGE_IS_SOFT_DIRTY, + }; + long ret; + + ret = ioctl(pmc->fd, PAGEMAP_SCAN, &args); + if (ret == -1) { + pr_perror("PAGEMAP_SCAN"); + pmc_zap(pmc); + return -1; + } + pmc->regs_len = ret; + pmc->regs_idx = 0; + pmc->end = args.walk_end; + } else { + if (pread(pmc->fd, pmc->map, size_map, PAGEMAP_PFN_OFF(pmc->start)) != size_map) { + pmc_zap(pmc); + pr_perror("Can't read %d's pagemap file", pmc->pid); + return -1; + } } return 0; } -u64 *pmc_get_map(pmc_t *pmc, const struct vma_area *vma) +int pmc_get_map(pmc_t *pmc, const struct vma_area *vma) { /* Hit */ if (likely(pmc->start <= vma->e->start && pmc->end >= vma->e->end)) - return __pmc_get_map(pmc, vma->e->start); + return 0; /* Miss, refill the cache */ if (pmc_fill_cache(pmc, vma)) { pr_err("Failed to fill cache for %d (%lx-%lx)\n", pmc->pid, (long)vma->e->start, (long)vma->e->end); - return NULL; + return -1; } - - /* Hit for sure */ - return __pmc_get_map(pmc, vma->e->start); + return 0; } void pmc_fini(pmc_t *pmc) { close_safe(&pmc->fd); xfree(pmc->map); + xfree(pmc->regs); pmc_reset(pmc); } diff --git a/criu/shmem.c b/criu/shmem.c index c13a39b660..9e3178352d 100644 --- a/criu/shmem.c +++ b/criu/shmem.c @@ -206,23 +206,28 @@ static int expand_shmem(struct shmem_info *si, unsigned long new_size) return 0; } -static void update_shmem_pmaps(struct shmem_info *si, u64 *map, VmaEntry *vma) +static void update_shmem_pmaps(struct shmem_info *si, pmc_t *pmc, VmaEntry *vma) { unsigned long shmem_pfn, vma_pfn, vma_pgcnt; + u64 vaddr; if (!is_shmem_tracking_en()) return; vma_pgcnt = DIV_ROUND_UP(si->size - vma->pgoff, PAGE_SIZE); - for (vma_pfn = 0; vma_pfn < vma_pgcnt; ++vma_pfn) { - if (!should_dump_page(vma, map[vma_pfn])) + for (vma_pfn = 0, vaddr = vma->start; vma_pfn < vma_pgcnt; ++vma_pfn, vaddr += PAGE_SIZE) { + bool softdirty = false; + u64 next; + + next = should_dump_page(pmc, vma, vaddr, &softdirty); + if (next != vaddr) { + vaddr = next - PAGE_SIZE; continue; + } shmem_pfn = vma_pfn + DIV_ROUND_UP(vma->pgoff, PAGE_SIZE); - if (map[vma_pfn] & PME_SOFT_DIRTY) + if (softdirty) set_pstate(si->pstate_map, shmem_pfn, PST_DIRTY); - else if (page_is_zero(map[vma_pfn])) - set_pstate(si->pstate_map, shmem_pfn, PST_ZERO); else set_pstate(si->pstate_map, shmem_pfn, PST_DUMP); } @@ -648,7 +653,7 @@ static int open_shmem(int pid, struct vma_area *vma) return -1; } -int add_shmem_area(pid_t pid, VmaEntry *vma, u64 *map) +int add_shmem_area(pid_t pid, VmaEntry *vma, pmc_t *pmc) { struct shmem_info *si; unsigned long size = vma->pgoff + (vma->end - vma->start); @@ -662,7 +667,7 @@ int add_shmem_area(pid_t pid, VmaEntry *vma, u64 *map) if (expand_shmem(si, size)) return -1; } - update_shmem_pmaps(si, map, vma); + update_shmem_pmaps(si, pmc, vma); return 0; } @@ -679,7 +684,7 @@ int add_shmem_area(pid_t pid, VmaEntry *vma, u64 *map) if (expand_shmem(si, size)) return -1; - update_shmem_pmaps(si, map, vma); + update_shmem_pmaps(si, pmc, vma); return 0; }