diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c index c65b790433cb..f89568bfa397 100644 --- a/accel/kvm/kvm-all.c +++ b/accel/kvm/kvm-all.c @@ -1288,7 +1288,7 @@ static void kvm_unpoison_all(void *param) QLIST_FOREACH_SAFE(page, &hwpoison_page_list, list, next_page) { QLIST_REMOVE(page, list); - qemu_ram_remap(page->ram_addr, TARGET_PAGE_SIZE); + qemu_ram_remap(page->ram_addr); g_free(page); } } diff --git a/hw/core/cpu-system.c b/hw/core/cpu-system.c index 6aae28a349a7..6e307c89597f 100644 --- a/hw/core/cpu-system.c +++ b/hw/core/cpu-system.c @@ -51,13 +51,18 @@ hwaddr cpu_get_phys_page_attrs_debug(CPUState *cpu, vaddr addr, MemTxAttrs *attrs) { CPUClass *cc = CPU_GET_CLASS(cpu); + hwaddr paddr; if (cc->sysemu_ops->get_phys_page_attrs_debug) { - return cc->sysemu_ops->get_phys_page_attrs_debug(cpu, addr, attrs); + paddr = cc->sysemu_ops->get_phys_page_attrs_debug(cpu, addr, attrs); + } else { + /* Fallback for CPUs which don't implement the _attrs_ hook */ + *attrs = MEMTXATTRS_UNSPECIFIED; + paddr = cc->sysemu_ops->get_phys_page_debug(cpu, addr); } - /* Fallback for CPUs which don't implement the _attrs_ hook */ - *attrs = MEMTXATTRS_UNSPECIFIED; - return cc->sysemu_ops->get_phys_page_debug(cpu, addr); + /* Indicate that this is a debug access. */ + attrs->debug = 1; + return paddr; } hwaddr cpu_get_phys_page_debug(CPUState *cpu, vaddr addr) diff --git a/hw/core/loader.c b/hw/core/loader.c index fd25c5e01bd9..332b879a0bf0 100644 --- a/hw/core/loader.c +++ b/hw/core/loader.c @@ -144,7 +144,7 @@ ssize_t load_image_mr(const char *filename, MemoryRegion *mr) { ssize_t size; - if (!memory_access_is_direct(mr, false)) { + if (!memory_access_is_direct(mr, false, MEMTXATTRS_UNSPECIFIED)) { /* Can only load an image into RAM or ROM */ return -1; } diff --git a/hw/display/apple-gfx.m b/hw/display/apple-gfx.m index aa1455b62955..1554f3b8016b 100644 --- a/hw/display/apple-gfx.m +++ b/hw/display/apple-gfx.m @@ -137,7 +137,8 @@ static void apple_gfx_destroy_task(AppleGFXState *s, PGTask_t *task) MEMTXATTRS_UNSPECIFIED); if (!ram_region || ram_region_length < length || - !memory_access_is_direct(ram_region, !read_only)) { + !memory_access_is_direct(ram_region, !read_only, + MEMTXATTRS_UNSPECIFIED)) { return NULL; } diff --git a/hw/remote/vfio-user-obj.c b/hw/remote/vfio-user-obj.c index 9e5ff6d87a9d..6e51a92856fb 100644 --- a/hw/remote/vfio-user-obj.c +++ b/hw/remote/vfio-user-obj.c @@ -358,7 +358,7 @@ static int vfu_object_mr_rw(MemoryRegion *mr, uint8_t *buf, hwaddr offset, int access_size; uint64_t val; - if (memory_access_is_direct(mr, is_write)) { + if (memory_access_is_direct(mr, is_write, MEMTXATTRS_UNSPECIFIED)) { /** * Some devices expose a PCI expansion ROM, which could be buffer * based as compared to other regions which are primarily based on diff --git a/hw/virtio/virtio-mem.c b/hw/virtio/virtio-mem.c index b1a003736b0a..7b140add765c 100644 --- a/hw/virtio/virtio-mem.c +++ b/hw/virtio/virtio-mem.c @@ -991,7 +991,7 @@ static void virtio_mem_device_realize(DeviceState *dev, Error **errp) return; } - if (enable_mlock) { + if (should_mlock(mlock_state)) { error_setg(errp, "Incompatible with mlock"); return; } diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h index b1d76d698508..3771b2130c26 100644 --- a/include/exec/cpu-common.h +++ b/include/exec/cpu-common.h @@ -67,7 +67,7 @@ typedef uintptr_t ram_addr_t; /* memory API */ -void qemu_ram_remap(ram_addr_t addr, ram_addr_t length); +void qemu_ram_remap(ram_addr_t addr); /* This should not be used by devices. */ ram_addr_t qemu_ram_addr_from_host(void *ptr); ram_addr_t qemu_ram_addr_from_host_nofail(void *ptr); diff --git a/include/exec/memattrs.h b/include/exec/memattrs.h index 060b7e713149..8db1d3046479 100644 --- a/include/exec/memattrs.h +++ b/include/exec/memattrs.h @@ -44,6 +44,8 @@ typedef struct MemTxAttrs { * (see MEMTX_ACCESS_ERROR). */ unsigned int memory:1; + /* Debug access that can even write to ROM. */ + unsigned int debug:1; /* Requester ID (for MSI for example) */ unsigned int requester_id:16; @@ -56,7 +58,8 @@ typedef struct MemTxAttrs { * Bus masters which don't specify any attributes will get this * (via the MEMTXATTRS_UNSPECIFIED constant), so that we can * distinguish "all attributes deliberately clear" from - * "didn't specify" if necessary. + * "didn't specify" if necessary. "debug" can be set alongside + * "unspecified". */ bool unspecified; diff --git a/include/exec/memory.h b/include/exec/memory.h index 9f73b5986726..78c4e0aec8d1 100644 --- a/include/exec/memory.h +++ b/include/exec/memory.h @@ -2995,15 +2995,34 @@ MemTxResult address_space_write_cached_slow(MemoryRegionCache *cache, int memory_access_size(MemoryRegion *mr, unsigned l, hwaddr addr); bool prepare_mmio_access(MemoryRegion *mr); -static inline bool memory_access_is_direct(MemoryRegion *mr, bool is_write) +static inline bool memory_region_supports_direct_access(MemoryRegion *mr) { - if (is_write) { - return memory_region_is_ram(mr) && !mr->readonly && - !mr->rom_device && !memory_region_is_ram_device(mr); - } else { - return (memory_region_is_ram(mr) && !memory_region_is_ram_device(mr)) || - memory_region_is_romd(mr); + /* ROM DEVICE regions only allow direct access if in ROMD mode. */ + if (memory_region_is_romd(mr)) { + return true; + } + if (!memory_region_is_ram(mr)) { + return false; + } + /* + * RAM DEVICE regions can be accessed directly using memcpy, but it might + * be MMIO and access using mempy can be wrong (e.g., using instructions not + * intended for MMIO access). So we treat this as IO. + */ + return !memory_region_is_ram_device(mr); +} + +static inline bool memory_access_is_direct(MemoryRegion *mr, bool is_write, + MemTxAttrs attrs) +{ + if (!memory_region_supports_direct_access(mr)) { + return false; + } + /* Debug access can write to ROM. */ + if (is_write && !attrs.debug) { + return !mr->readonly && !mr->rom_device; } + return true; } /** @@ -3036,7 +3055,7 @@ MemTxResult address_space_read(AddressSpace *as, hwaddr addr, fv = address_space_to_flatview(as); l = len; mr = flatview_translate(fv, addr, &addr1, &l, false, attrs); - if (len == l && memory_access_is_direct(mr, false)) { + if (len == l && memory_access_is_direct(mr, false, attrs)) { ptr = qemu_map_ram_ptr(mr->ram_block, addr1); memcpy(buf, ptr, len); } else { diff --git a/include/system/os-posix.h b/include/system/os-posix.h index b881ac6c6f74..ce5b3bccf8db 100644 --- a/include/system/os-posix.h +++ b/include/system/os-posix.h @@ -53,7 +53,7 @@ bool os_set_runas(const char *user_id); void os_set_chroot(const char *path); void os_setup_limits(void); void os_setup_post(void); -int os_mlock(void); +int os_mlock(bool on_fault); /** * qemu_alloc_stack: diff --git a/include/system/os-win32.h b/include/system/os-win32.h index b82a5d3ad93c..bc623061d821 100644 --- a/include/system/os-win32.h +++ b/include/system/os-win32.h @@ -123,7 +123,7 @@ static inline bool is_daemonized(void) return false; } -static inline int os_mlock(void) +static inline int os_mlock(bool on_fault G_GNUC_UNUSED) { return -ENOSYS; } diff --git a/include/system/system.h b/include/system/system.h index 0cbb43ec303b..a7effe7dfd8b 100644 --- a/include/system/system.h +++ b/include/system/system.h @@ -44,10 +44,20 @@ extern int display_opengl; extern const char *keyboard_layout; extern int old_param; extern uint8_t *boot_splash_filedata; -extern bool enable_mlock; extern bool enable_cpu_pm; extern QEMUClockType rtc_clock; +typedef enum { + MLOCK_OFF = 0, + MLOCK_ON, + MLOCK_ON_FAULT, +} MlockState; + +bool should_mlock(MlockState); +bool is_mlock_on_fault(MlockState); + +extern MlockState mlock_state; + #define MAX_OPTION_ROMS 16 typedef struct QEMUOptionRom { const char *name; diff --git a/meson.build b/meson.build index 8ed10b6624e7..0ee79c664d39 100644 --- a/meson.build +++ b/meson.build @@ -2885,6 +2885,12 @@ config_host_data.set('HAVE_MLOCKALL', cc.links(gnu_source_prefix + ''' return mlockall(MCL_FUTURE); }''')) +config_host_data.set('HAVE_MLOCK_ONFAULT', cc.links(gnu_source_prefix + ''' + #include + int main(void) { + return mlockall(MCL_FUTURE | MCL_ONFAULT); + }''')) + have_l2tpv3 = false if get_option('l2tpv3').allowed() and have_system have_l2tpv3 = cc.has_type('struct mmsghdr', diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c index 6a6da6ba7f3a..5d3edfcfec73 100644 --- a/migration/postcopy-ram.c +++ b/migration/postcopy-ram.c @@ -651,8 +651,8 @@ int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis) mis->have_fault_thread = false; } - if (enable_mlock) { - if (os_mlock() < 0) { + if (should_mlock(mlock_state)) { + if (os_mlock(is_mlock_on_fault(mlock_state)) < 0) { error_report("mlock: %s", strerror(errno)); /* * It doesn't feel right to fail at this point, we have a valid diff --git a/monitor/hmp-cmds-target.c b/monitor/hmp-cmds-target.c index 27ffe61818db..239c2a61a451 100644 --- a/monitor/hmp-cmds-target.c +++ b/monitor/hmp-cmds-target.c @@ -301,7 +301,6 @@ void hmp_gpa2hva(Monitor *mon, const QDict *qdict) void hmp_gva2gpa(Monitor *mon, const QDict *qdict) { target_ulong addr = qdict_get_int(qdict, "addr"); - MemTxAttrs attrs; CPUState *cs = mon_get_cpu(mon); hwaddr gpa; @@ -310,7 +309,7 @@ void hmp_gva2gpa(Monitor *mon, const QDict *qdict) return; } - gpa = cpu_get_phys_page_attrs_debug(cs, addr & TARGET_PAGE_MASK, &attrs); + gpa = cpu_get_phys_page_debug(cs, addr & TARGET_PAGE_MASK); if (gpa == -1) { monitor_printf(mon, "Unmapped\n"); } else { diff --git a/os-posix.c b/os-posix.c index 9cce55ff2f7e..52925c23d3d9 100644 --- a/os-posix.c +++ b/os-posix.c @@ -327,18 +327,29 @@ void os_set_line_buffering(void) setvbuf(stdout, NULL, _IOLBF, 0); } -int os_mlock(void) +int os_mlock(bool on_fault) { #ifdef HAVE_MLOCKALL int ret = 0; + int flags = MCL_CURRENT | MCL_FUTURE; - ret = mlockall(MCL_CURRENT | MCL_FUTURE); + if (on_fault) { +#ifdef HAVE_MLOCK_ONFAULT + flags |= MCL_ONFAULT; +#else + error_report("mlockall: on_fault not supported"); + return -EINVAL; +#endif + } + + ret = mlockall(flags); if (ret < 0) { error_report("mlockall: %s", strerror(errno)); } return ret; #else + (void)on_fault; return -ENOSYS; #endif } diff --git a/qemu-options.hx b/qemu-options.hx index 1b26ad53bda7..61270e320670 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -4632,21 +4632,25 @@ SRST ERST DEF("overcommit", HAS_ARG, QEMU_OPTION_overcommit, - "-overcommit [mem-lock=on|off][cpu-pm=on|off]\n" + "-overcommit [mem-lock=on|off|on-fault][cpu-pm=on|off]\n" " run qemu with overcommit hints\n" - " mem-lock=on|off controls memory lock support (default: off)\n" + " mem-lock=on|off|on-fault controls memory lock support (default: off)\n" " cpu-pm=on|off controls cpu power management (default: off)\n", QEMU_ARCH_ALL) SRST -``-overcommit mem-lock=on|off`` +``-overcommit mem-lock=on|off|on-fault`` \ ``-overcommit cpu-pm=on|off`` Run qemu with hints about host resource overcommit. The default is to assume that host overcommits all resources. Locking qemu and guest memory can be enabled via ``mem-lock=on`` - (disabled by default). This works when host memory is not - overcommitted and reduces the worst-case latency for guest. + or ``mem-lock=on-fault`` (disabled by default). This works when + host memory is not overcommitted and reduces the worst-case latency for + guest. The on-fault option is better for reducing the memory footprint + since it makes allocations lazy, but the pages still get locked in place + once faulted by the guest or QEMU. Note that the two options are mutually + exclusive. Guest ability to manage power state of host cpus (increasing latency for other processes on the same host cpu, but decreasing latency for diff --git a/system/globals.c b/system/globals.c index 4867c93ca6b9..316623bd20af 100644 --- a/system/globals.c +++ b/system/globals.c @@ -31,10 +31,20 @@ #include "system/cpus.h" #include "system/system.h" +bool should_mlock(MlockState state) +{ + return state == MLOCK_ON || state == MLOCK_ON_FAULT; +} + +bool is_mlock_on_fault(MlockState state) +{ + return state == MLOCK_ON_FAULT; +} + enum vga_retrace_method vga_retrace_method = VGA_RETRACE_DUMB; int display_opengl; const char* keyboard_layout; -bool enable_mlock; +MlockState mlock_state; bool enable_cpu_pm; int autostart = 1; int vga_interface_type = VGA_NONE; diff --git a/system/memory_ldst.c.inc b/system/memory_ldst.c.inc index 0e6f3940a9a1..7f32d3d9ff39 100644 --- a/system/memory_ldst.c.inc +++ b/system/memory_ldst.c.inc @@ -34,7 +34,7 @@ static inline uint32_t glue(address_space_ldl_internal, SUFFIX)(ARG1_DECL, RCU_READ_LOCK(); mr = TRANSLATE(addr, &addr1, &l, false, attrs); - if (l < 4 || !memory_access_is_direct(mr, false)) { + if (l < 4 || !memory_access_is_direct(mr, false, attrs)) { release_lock |= prepare_mmio_access(mr); /* I/O case */ @@ -103,7 +103,7 @@ static inline uint64_t glue(address_space_ldq_internal, SUFFIX)(ARG1_DECL, RCU_READ_LOCK(); mr = TRANSLATE(addr, &addr1, &l, false, attrs); - if (l < 8 || !memory_access_is_direct(mr, false)) { + if (l < 8 || !memory_access_is_direct(mr, false, attrs)) { release_lock |= prepare_mmio_access(mr); /* I/O case */ @@ -170,7 +170,7 @@ uint8_t glue(address_space_ldub, SUFFIX)(ARG1_DECL, RCU_READ_LOCK(); mr = TRANSLATE(addr, &addr1, &l, false, attrs); - if (!memory_access_is_direct(mr, false)) { + if (!memory_access_is_direct(mr, false, attrs)) { release_lock |= prepare_mmio_access(mr); /* I/O case */ @@ -207,7 +207,7 @@ static inline uint16_t glue(address_space_lduw_internal, SUFFIX)(ARG1_DECL, RCU_READ_LOCK(); mr = TRANSLATE(addr, &addr1, &l, false, attrs); - if (l < 2 || !memory_access_is_direct(mr, false)) { + if (l < 2 || !memory_access_is_direct(mr, false, attrs)) { release_lock |= prepare_mmio_access(mr); /* I/O case */ @@ -277,7 +277,7 @@ void glue(address_space_stl_notdirty, SUFFIX)(ARG1_DECL, RCU_READ_LOCK(); mr = TRANSLATE(addr, &addr1, &l, true, attrs); - if (l < 4 || !memory_access_is_direct(mr, true)) { + if (l < 4 || !memory_access_is_direct(mr, true, attrs)) { release_lock |= prepare_mmio_access(mr); r = memory_region_dispatch_write(mr, addr1, val, MO_32, attrs); @@ -314,7 +314,7 @@ static inline void glue(address_space_stl_internal, SUFFIX)(ARG1_DECL, RCU_READ_LOCK(); mr = TRANSLATE(addr, &addr1, &l, true, attrs); - if (l < 4 || !memory_access_is_direct(mr, true)) { + if (l < 4 || !memory_access_is_direct(mr, true, attrs)) { release_lock |= prepare_mmio_access(mr); r = memory_region_dispatch_write(mr, addr1, val, MO_32 | devend_memop(endian), attrs); @@ -377,7 +377,7 @@ void glue(address_space_stb, SUFFIX)(ARG1_DECL, RCU_READ_LOCK(); mr = TRANSLATE(addr, &addr1, &l, true, attrs); - if (!memory_access_is_direct(mr, true)) { + if (!memory_access_is_direct(mr, true, attrs)) { release_lock |= prepare_mmio_access(mr); r = memory_region_dispatch_write(mr, addr1, val, MO_8, attrs); } else { @@ -410,7 +410,7 @@ static inline void glue(address_space_stw_internal, SUFFIX)(ARG1_DECL, RCU_READ_LOCK(); mr = TRANSLATE(addr, &addr1, &l, true, attrs); - if (l < 2 || !memory_access_is_direct(mr, true)) { + if (l < 2 || !memory_access_is_direct(mr, true, attrs)) { release_lock |= prepare_mmio_access(mr); r = memory_region_dispatch_write(mr, addr1, val, MO_16 | devend_memop(endian), attrs); @@ -474,7 +474,7 @@ static void glue(address_space_stq_internal, SUFFIX)(ARG1_DECL, RCU_READ_LOCK(); mr = TRANSLATE(addr, &addr1, &l, true, attrs); - if (l < 8 || !memory_access_is_direct(mr, true)) { + if (l < 8 || !memory_access_is_direct(mr, true, attrs)) { release_lock |= prepare_mmio_access(mr); r = memory_region_dispatch_write(mr, addr1, val, MO_64 | devend_memop(endian), attrs); diff --git a/system/physmem.c b/system/physmem.c index 67c9db9daadb..67bdf631e60c 100644 --- a/system/physmem.c +++ b/system/physmem.c @@ -573,7 +573,7 @@ MemoryRegion *flatview_translate(FlatView *fv, hwaddr addr, hwaddr *xlat, is_write, true, &as, attrs); mr = section.mr; - if (xen_enabled() && memory_access_is_direct(mr, is_write)) { + if (xen_enabled() && memory_access_is_direct(mr, is_write, attrs)) { hwaddr page = ((addr & TARGET_PAGE_MASK) + TARGET_PAGE_SIZE) - addr; *plen = MIN(page, *plen); } @@ -2275,45 +2275,80 @@ void qemu_ram_free(RAMBlock *block) } #ifndef _WIN32 -void qemu_ram_remap(ram_addr_t addr, ram_addr_t length) +/* Simply remap the given VM memory location from start to start+length */ +static int qemu_ram_remap_mmap(RAMBlock *block, uint64_t start, size_t length) +{ + int flags, prot; + void *area; + void *host_startaddr = block->host + start; + + assert(block->fd < 0); + flags = MAP_FIXED | MAP_ANONYMOUS; + flags |= block->flags & RAM_SHARED ? MAP_SHARED : MAP_PRIVATE; + flags |= block->flags & RAM_NORESERVE ? MAP_NORESERVE : 0; + prot = PROT_READ; + prot |= block->flags & RAM_READONLY ? 0 : PROT_WRITE; + area = mmap(host_startaddr, length, prot, flags, -1, 0); + return area != host_startaddr ? -errno : 0; +} + +/* + * qemu_ram_remap - remap a single RAM page + * + * @addr: address in ram_addr_t address space. + * + * This function will try remapping a single page of guest RAM identified by + * @addr, essentially discarding memory to recover from previously poisoned + * memory (MCE). The page size depends on the RAMBlock (i.e., hugetlb). @addr + * does not have to point at the start of the page. + * + * This function is only to be used during system resets; it will kill the + * VM if remapping failed. + */ +void qemu_ram_remap(ram_addr_t addr) { RAMBlock *block; - ram_addr_t offset; - int flags; - void *area, *vaddr; - int prot; + uint64_t offset; + void *vaddr; + size_t page_size; RAMBLOCK_FOREACH(block) { offset = addr - block->offset; if (offset < block->max_length) { + /* Respect the pagesize of our RAMBlock */ + page_size = qemu_ram_pagesize(block); + offset = QEMU_ALIGN_DOWN(offset, page_size); + vaddr = ramblock_ptr(block, offset); if (block->flags & RAM_PREALLOC) { ; } else if (xen_enabled()) { abort(); } else { - flags = MAP_FIXED; - flags |= block->flags & RAM_SHARED ? - MAP_SHARED : MAP_PRIVATE; - flags |= block->flags & RAM_NORESERVE ? MAP_NORESERVE : 0; - prot = PROT_READ; - prot |= block->flags & RAM_READONLY ? 0 : PROT_WRITE; - if (block->fd >= 0) { - area = mmap(vaddr, length, prot, flags, block->fd, - offset + block->fd_offset); - } else { - flags |= MAP_ANONYMOUS; - area = mmap(vaddr, length, prot, flags, -1, 0); - } - if (area != vaddr) { - error_report("Could not remap addr: " - RAM_ADDR_FMT "@" RAM_ADDR_FMT "", - length, addr); - exit(1); + if (ram_block_discard_range(block, offset, page_size) != 0) { + /* + * Fall back to using mmap() only for anonymous mapping, + * as if a backing file is associated we may not be able + * to recover the memory in all cases. + * So don't take the risk of using only mmap and fail now. + */ + if (block->fd >= 0) { + error_report("Could not remap RAM %s:%" PRIx64 "+%" + PRIx64 " +%zx", block->idstr, offset, + block->fd_offset, page_size); + exit(1); + } + if (qemu_ram_remap_mmap(block, offset, page_size) != 0) { + error_report("Could not remap RAM %s:%" PRIx64 " +%zx", + block->idstr, offset, page_size); + exit(1); + } } - memory_try_enable_merging(vaddr, length); - qemu_ram_setup_dump(vaddr, length); + memory_try_enable_merging(vaddr, page_size); + qemu_ram_setup_dump(vaddr, page_size); } + + break; } } } @@ -2869,7 +2904,7 @@ static MemTxResult flatview_write_continue_step(MemTxAttrs attrs, return MEMTX_ACCESS_ERROR; } - if (!memory_access_is_direct(mr, true)) { + if (!memory_access_is_direct(mr, true, attrs)) { uint64_t val; MemTxResult result; bool release_lock = prepare_mmio_access(mr); @@ -2965,7 +3000,7 @@ static MemTxResult flatview_read_continue_step(MemTxAttrs attrs, uint8_t *buf, return MEMTX_ACCESS_ERROR; } - if (!memory_access_is_direct(mr, false)) { + if (!memory_access_is_direct(mr, false, attrs)) { /* I/O case */ uint64_t val; MemTxResult result; @@ -3137,8 +3172,7 @@ static inline MemTxResult address_space_write_rom_internal(AddressSpace *as, l = len; mr = address_space_translate(as, addr, &addr1, &l, true, attrs); - if (!(memory_region_is_ram(mr) || - memory_region_is_romd(mr))) { + if (!memory_region_supports_direct_access(mr)) { l = memory_access_size(mr, l, addr1); } else { /* ROM/RAM case */ @@ -3275,7 +3309,7 @@ static bool flatview_access_valid(FlatView *fv, hwaddr addr, hwaddr len, while (len > 0) { l = len; mr = flatview_translate(fv, addr, &xlat, &l, is_write, attrs); - if (!memory_access_is_direct(mr, is_write)) { + if (!memory_access_is_direct(mr, is_write, attrs)) { l = memory_access_size(mr, l, addr); if (!memory_region_access_valid(mr, xlat, l, is_write, attrs)) { return false; @@ -3355,7 +3389,7 @@ void *address_space_map(AddressSpace *as, fv = address_space_to_flatview(as); mr = flatview_translate(fv, addr, &xlat, &l, is_write, attrs); - if (!memory_access_is_direct(mr, is_write)) { + if (!memory_access_is_direct(mr, is_write, attrs)) { size_t used = qatomic_read(&as->bounce_buffer_size); for (;;) { hwaddr alloc = MIN(as->max_bounce_buffer_size - used, l); @@ -3488,7 +3522,7 @@ int64_t address_space_cache_init(MemoryRegionCache *cache, mr = cache->mrs.mr; memory_region_ref(mr); - if (memory_access_is_direct(mr, is_write)) { + if (memory_access_is_direct(mr, is_write, MEMTXATTRS_UNSPECIFIED)) { /* We don't care about the memory attributes here as we're only * doing this if we found actual RAM, which behaves the same * regardless of attributes; so UNSPECIFIED is fine. @@ -3681,13 +3715,8 @@ int cpu_memory_rw_debug(CPUState *cpu, vaddr addr, if (l > len) l = len; phys_addr += (addr & ~TARGET_PAGE_MASK); - if (is_write) { - res = address_space_write_rom(cpu->cpu_ases[asidx].as, phys_addr, - attrs, buf, l); - } else { - res = address_space_read(cpu->cpu_ases[asidx].as, phys_addr, - attrs, buf, l); - } + res = address_space_rw(cpu->cpu_ases[asidx].as, phys_addr, attrs, buf, + l, is_write); if (res != MEMTX_OK) { return -1; } @@ -3797,18 +3826,19 @@ int ram_block_discard_range(RAMBlock *rb, uint64_t start, size_t length) } ret = fallocate(rb->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, - start, length); + start + rb->fd_offset, length); if (ret) { ret = -errno; - error_report("%s: Failed to fallocate %s:%" PRIx64 " +%zx (%d)", - __func__, rb->idstr, start, length, ret); + error_report("%s: Failed to fallocate %s:%" PRIx64 "+%" PRIx64 + " +%zx (%d)", __func__, rb->idstr, start, + rb->fd_offset, length, ret); goto err; } #else ret = -ENOSYS; error_report("%s: fallocate not available/file" - "%s:%" PRIx64 " +%zx (%d)", - __func__, rb->idstr, start, length, ret); + "%s:%" PRIx64 "+%" PRIx64 " +%zx (%d)", __func__, + rb->idstr, start, rb->fd_offset, length, ret); goto err; #endif } @@ -3855,6 +3885,7 @@ int ram_block_discard_guest_memfd_range(RAMBlock *rb, uint64_t start, int ret = -1; #ifdef CONFIG_FALLOCATE_PUNCH_HOLE + /* ignore fd_offset with guest_memfd */ ret = fallocate(rb->guest_memfd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, start, length); diff --git a/system/vl.c b/system/vl.c index 2a570ed9ff4f..8f776684ec87 100644 --- a/system/vl.c +++ b/system/vl.c @@ -352,7 +352,7 @@ static QemuOptsList qemu_overcommit_opts = { .desc = { { .name = "mem-lock", - .type = QEMU_OPT_BOOL, + .type = QEMU_OPT_STRING, }, { .name = "cpu-pm", @@ -797,8 +797,8 @@ static QemuOptsList qemu_run_with_opts = { static void realtime_init(void) { - if (enable_mlock) { - if (os_mlock() < 0) { + if (should_mlock(mlock_state)) { + if (os_mlock(is_mlock_on_fault(mlock_state)) < 0) { error_report("locking memory failed"); exit(1); } @@ -1876,6 +1876,44 @@ static void object_option_parse(const char *str) visit_free(v); } +static void overcommit_parse(const char *str) +{ + QemuOpts *opts; + const char *mem_lock_opt; + + opts = qemu_opts_parse_noisily(qemu_find_opts("overcommit"), + str, false); + if (!opts) { + exit(1); + } + + enable_cpu_pm = qemu_opt_get_bool(opts, "cpu-pm", enable_cpu_pm); + + mem_lock_opt = qemu_opt_get(opts, "mem-lock"); + if (!mem_lock_opt) { + return; + } + + if (strcmp(mem_lock_opt, "on") == 0) { + mlock_state = MLOCK_ON; + return; + } + + if (strcmp(mem_lock_opt, "off") == 0) { + mlock_state = MLOCK_OFF; + return; + } + + if (strcmp(mem_lock_opt, "on-fault") == 0) { + mlock_state = MLOCK_ON_FAULT; + return; + } + + error_report("parameter 'mem-lock' expects one of " + "'on', 'off', 'on-fault'"); + exit(1); +} + /* * Very early object creation, before the sandbox options have been activated. */ @@ -3591,13 +3629,7 @@ void qemu_init(int argc, char **argv) object_option_parse(optarg); break; case QEMU_OPTION_overcommit: - opts = qemu_opts_parse_noisily(qemu_find_opts("overcommit"), - optarg, false); - if (!opts) { - exit(1); - } - enable_mlock = qemu_opt_get_bool(opts, "mem-lock", enable_mlock); - enable_cpu_pm = qemu_opt_get_bool(opts, "cpu-pm", enable_cpu_pm); + overcommit_parse(optarg); break; case QEMU_OPTION_compat: {