commit    8f9e92b36972415b97abc598bbdf1908806b1882
tree      efa8d96a10a6cbc587e1e4fc1befaf18402b8806
parent    minmax: simplify min()/max()/clamp() implementation
authored  2022-12-12 11:14:19 -0700
committed 2024-07-29 18:15:05 +0200
x86: mm: Skip faulting instruction for VM_DROPPABLE faults
VM_DROPPABLE allocations can, by definition, be dropped and then filled
with zeros at any time. For reads, this is working as intended:
userspace reads memory, and at some point it comes back as zeros.
Writes, however, are another story.
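
As a concrete illustration of those read semantics, here is a minimal
userspace sketch (MAP_DROPPABLE and its 0x08 value come from the
droppable-mapping uAPI; treat the fallback define as an assumption):

    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>

    #ifndef MAP_DROPPABLE
    #define MAP_DROPPABLE 0x08  /* assumed value from the droppable-mapping uAPI */
    #endif

    int main(void)
    {
        size_t len = 1 << 20;
        unsigned char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                                MAP_ANONYMOUS | MAP_PRIVATE | MAP_DROPPABLE,
                                -1, 0);
        if (p == MAP_FAILED)
            return 1;
        memset(p, 0xff, len);
        /* Under memory pressure the kernel may reclaim these pages at
         * any time, so this may print 0xff now and 0 on a later read. */
        printf("p[0] = %#x\n", p[0]);
        return 0;
    }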
If the kernel has dropped the memory and userspace writes to those
addresses, the page fault handler traps, the memory is allocated, and
control is returned to userspace to retry the write, which succeeds.
But if the allocation fails while handling the trap, control is still
returned to userspace to retry the write, and the whole cycle repeats.
Killing the process doesn't make sense for a droppable mapping, so the
task just keeps faulting until, eventually, enough memory is available
and the write succeeds.
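
As a condensed sketch of that pre-patch loop (not the literal fault.c
code): OOM is masked off for droppable mappings, so the handler returns
with regs->ip unchanged and the CPU re-executes the same store:

    /* Pre-patch behavior, condensed: an allocation failure looks like
     * success, and the faulting store simply runs again. */
    fault = handle_mm_fault(vma, address, flags, regs);
    if (is_droppable)
        fault &= ~VM_FAULT_OOM;
    if (!(fault & VM_FAULT_ERROR))
        return;  /* back to userspace at the same %rip */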
However, that retry loop is unnecessary. Instead, the write can simply
be skipped, because skipping a write is indistinguishable from the
write succeeding and the kernel immediately dropping the page
afterwards, so that subsequent reads return zeros.
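
Viewed from userspace, the two histories below are indistinguishable
for a droppable page, which is what makes the skip legal:

    /* History A: the store retires, then the kernel drops the page. */
    p[i] = 0xff;     /* write succeeds */
                     /* ...page reclaimed; later loads of p[i] see 0 */

    /* History B: the faulting store is skipped outright. */
                     /* write elided; later loads of p[i] see 0 */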
So, rather than merely returning to userspace at the same write
instruction to be retried, skip that write instruction.
Implement this on x86, where instructions are variable length, by
copying the instruction bytes at userspace's %rip into a stack buffer
of size MAX_INSN_SIZE, decoding the instruction, and then adding the
decoded instruction's length to userspace's %rip. If any of these steps
fail, just fall back to not advancing %rip and retrying the fault.
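
For reference, a minimal kernel-side sketch of the length computation,
using the generic insn_decode() rather than the regs-aware helper the
patch uses, and assuming a 64-bit task and an already-fetched buffer:

    #include <asm/insn.h>

    /* Sketch: given up to MAX_INSN_SIZE bytes copied from userspace's
     * %rip, return how far to advance %rip, or 0 to fall back and let
     * the fault be retried. */
    static int skip_len(const u8 *buf, int nr_copied)
    {
        struct insn insn;

        /* insn_decode() returns 0 on success. */
        if (insn_decode(&insn, buf, nr_copied, INSN_MODE_64) < 0)
            return 0;          /* undecodable: retry at the same %rip */
        return insn.length;    /* 1..15 bytes on x86 */
    }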
Cc: linux-mm@kvack.org
Cc: x86@kernel.org
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
 arch/x86/mm/fault.c      | 19 +++++++++++++++++++
 include/linux/mm_types.h |  5 ++++-
 mm/memory.c              |  4 +++-
 mm/mempolicy.c           |  2 +-
 4 files changed, 27 insertions(+), 3 deletions(-)
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index e6c469b323cc..8887c6d63ef2 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -37,6 +37,8 @@
 #include <asm/irq_stack.h>
 #include <asm/fred.h>
 #include <asm/sev.h>			/* snp_dump_hva_rmpentry() */
+#include <asm/insn.h>			/* struct insn */
+#include <asm/insn-eval.h>		/* insn_fetch_from_user(), ... */
 
 #define CREATE_TRACE_POINTS
 #include <asm/trace/exceptions.h>
@@ -1415,6 +1417,23 @@ retry:
 	}
 
 	mmap_read_unlock(mm);
+
+	if (fault & VM_FAULT_SKIP_INSN) {
+		u8 buf[MAX_INSN_SIZE];
+		struct insn insn;
+		int nr_copied;
+
+		nr_copied = insn_fetch_from_user(regs, buf);
+		if (nr_copied <= 0)
+			return;
+
+		if (!insn_decode_from_regs(&insn, regs, buf, nr_copied))
+			return;
+
+		regs->ip += insn.length;
+		return;
+	}
+
 done:
 	if (likely(!(fault & VM_FAULT_ERROR)))
 		return;
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 485424979254..a2dda933b09b 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -1249,6 +1249,7 @@ typedef __bitwise unsigned int vm_fault_t;
  *				fsync() to complete (for synchronous page faults
  *				in DAX)
  * @VM_FAULT_COMPLETED:		->fault completed, meanwhile mmap lock released
+ * @VM_FAULT_SKIP_INSN:		->handle the fault by skipping faulting instruction
  * @VM_FAULT_HINDEX_MASK:	mask HINDEX value
  *
  */
@@ -1266,6 +1267,7 @@ enum vm_fault_reason {
 	VM_FAULT_DONE_COW       = (__force vm_fault_t)0x001000,
 	VM_FAULT_NEEDDSYNC      = (__force vm_fault_t)0x002000,
 	VM_FAULT_COMPLETED      = (__force vm_fault_t)0x004000,
+	VM_FAULT_SKIP_INSN      = (__force vm_fault_t)0x008000,
 	VM_FAULT_HINDEX_MASK    = (__force vm_fault_t)0x0f0000,
 };
 
@@ -1290,7 +1292,8 @@ enum vm_fault_reason {
 	{ VM_FAULT_FALLBACK,            "FALLBACK" },	\
 	{ VM_FAULT_DONE_COW,            "DONE_COW" },	\
 	{ VM_FAULT_NEEDDSYNC,           "NEEDDSYNC" },	\
-	{ VM_FAULT_COMPLETED,           "COMPLETED" }
+	{ VM_FAULT_COMPLETED,           "COMPLETED" },	\
+	{ VM_FAULT_SKIP_INSN,           "SKIP_INSN" }
 
 struct vm_special_mapping {
 	const char *name;	/* The name, e.g. "[vdso]". */
diff --git a/mm/memory.c b/mm/memory.c
index 34f8402d2046..405d9c56fa35 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -5841,8 +5841,10 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
 	lru_gen_exit_fault();
 
 	/* If the mapping is droppable, then errors due to OOM aren't fatal. */
-	if (is_droppable)
+	if (is_droppable && (ret & VM_FAULT_OOM)) {
 		ret &= ~VM_FAULT_OOM;
+		ret |= VM_FAULT_SKIP_INSN;
+	}
 
 	if (flags & FAULT_FLAG_USER) {
 		mem_cgroup_exit_user_fault();
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index b858e22b259d..d75e0ee19aa3 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2306,7 +2306,7 @@ struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct
 	struct folio *folio;
 
 	if (vma->vm_flags & VM_DROPPABLE)
-		gfp |= __GFP_NOWARN;
+		gfp |= __GFP_NOWARN | __GFP_NORETRY;
 
 	pol = get_vma_policy(vma, addr, order, &ilx);
 	folio = folio_alloc_mpol_noprof(gfp, order, pol, ilx, numa_node_id());
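
Not part of the patch, but a hypothetical userspace check of the new
write behavior: with this change a store to a droppable page may be
elided instead of looping in the fault handler, so the only values ever
observable are the written byte or zero, and the task always makes
forward progress:

    #include <assert.h>
    #include <sys/mman.h>

    #ifndef MAP_DROPPABLE
    #define MAP_DROPPABLE 0x08  /* assumed uAPI value */
    #endif

    int main(void)
    {
        unsigned char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                                MAP_ANONYMOUS | MAP_PRIVATE | MAP_DROPPABLE,
                                -1, 0);
        if (p == MAP_FAILED)
            return 1;
        p[0] = 0xff;  /* may fault; under OOM the store is now skipped */
        assert(p[0] == 0xff || p[0] == 0);
        return 0;
    }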