aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJason A. Donenfeld <Jason@zx2c4.com>2022-12-12 11:14:19 -0700
committerJason A. Donenfeld <Jason@zx2c4.com>2024-07-29 18:15:05 +0200
commit8f9e92b36972415b97abc598bbdf1908806b1882 (patch)
treeefa8d96a10a6cbc587e1e4fc1befaf18402b8806
parentminmax: simplify min()/max()/clamp() implementation (diff)
downloadlinux-rng-jd/vdso-skip-insn.tar.xz
linux-rng-jd/vdso-skip-insn.zip
x86: mm: Skip faulting instruction for VM_DROPPABLE faultsjd/vdso-skip-insn
VM_DROPPABLE allocations can, by definition, be dropped and then filled with zeros at any time. For reads, this is working as intended: userspace reads memory, and at some point it comes back as zeros. Writes, however, are another story. If the kernel has dropped the memory and userspace writes to those addresses, the page fault handler traps, the memory is allocated, and control is returned to userspace to retry the write, which succeeds. But if the memory fails to be allocated when trapping, control is still returned to userspace to retry the write, and the whole thing happens again. And it doesn't make sense to kill the process for a droppable mapping. Eventually hopefully there's enough memory and this succeeds. However, that retry loop really is unnecessary. Instead, the write could just be skipped, because skipping a write is the same as the write succeeding and then immediately after the kernel dropping the page so that subsequent reads return zeros. So, rather than merely returning to userspace at the same write instruction to be retried, skip that write instruction. Implement it on x86, where instructions are variable size, by copying userspace's %rip to a stack buffer of size MAX_INSN_SIZE, decoding it, and then adding the length of the decoded instruction to userspace's %rip. In the event any of these fail, just fall back to not advancing %rip and trying again. Cc: linux-mm@kvack.org Cc: x86@kernel.org Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
-rw-r--r--arch/x86/mm/fault.c19
-rw-r--r--include/linux/mm_types.h5
-rw-r--r--mm/memory.c4
-rw-r--r--mm/mempolicy.c2
4 files changed, 27 insertions, 3 deletions
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index e6c469b323cc..8887c6d63ef2 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -37,6 +37,8 @@
#include <asm/irq_stack.h>
#include <asm/fred.h>
#include <asm/sev.h> /* snp_dump_hva_rmpentry() */
+#include <asm/insn.h> /* struct insn */
+#include <asm/insn-eval.h> /* insn_fetch_from_user(), ... */
#define CREATE_TRACE_POINTS
#include <asm/trace/exceptions.h>
@@ -1415,6 +1417,23 @@ retry:
}
mmap_read_unlock(mm);
+
+ if (fault & VM_FAULT_SKIP_INSN) {
+ u8 buf[MAX_INSN_SIZE];
+ struct insn insn;
+ int nr_copied;
+
+ nr_copied = insn_fetch_from_user(regs, buf);
+ if (nr_copied <= 0)
+ return;
+
+ if (!insn_decode_from_regs(&insn, regs, buf, nr_copied))
+ return;
+
+ regs->ip += insn.length;
+ return;
+ }
+
done:
if (likely(!(fault & VM_FAULT_ERROR)))
return;
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 485424979254..a2dda933b09b 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -1249,6 +1249,7 @@ typedef __bitwise unsigned int vm_fault_t;
* fsync() to complete (for synchronous page faults
* in DAX)
* @VM_FAULT_COMPLETED: ->fault completed, meanwhile mmap lock released
+ * @VM_FAULT_SKIP_INSN: handle the fault by skipping the faulting instruction
* @VM_FAULT_HINDEX_MASK: mask HINDEX value
*
*/
@@ -1266,6 +1267,7 @@ enum vm_fault_reason {
VM_FAULT_DONE_COW = (__force vm_fault_t)0x001000,
VM_FAULT_NEEDDSYNC = (__force vm_fault_t)0x002000,
VM_FAULT_COMPLETED = (__force vm_fault_t)0x004000,
+ VM_FAULT_SKIP_INSN = (__force vm_fault_t)0x008000,
VM_FAULT_HINDEX_MASK = (__force vm_fault_t)0x0f0000,
};
@@ -1290,7 +1292,8 @@ enum vm_fault_reason {
{ VM_FAULT_FALLBACK, "FALLBACK" }, \
{ VM_FAULT_DONE_COW, "DONE_COW" }, \
{ VM_FAULT_NEEDDSYNC, "NEEDDSYNC" }, \
- { VM_FAULT_COMPLETED, "COMPLETED" }
+ { VM_FAULT_COMPLETED, "COMPLETED" }, \
+ { VM_FAULT_SKIP_INSN, "SKIP_INSN" }
struct vm_special_mapping {
const char *name; /* The name, e.g. "[vdso]". */
diff --git a/mm/memory.c b/mm/memory.c
index 34f8402d2046..405d9c56fa35 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -5841,8 +5841,10 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
lru_gen_exit_fault();
/* If the mapping is droppable, then errors due to OOM aren't fatal. */
- if (is_droppable)
+ if (is_droppable && (ret & VM_FAULT_OOM)) {
ret &= ~VM_FAULT_OOM;
+ ret |= VM_FAULT_SKIP_INSN;
+ }
if (flags & FAULT_FLAG_USER) {
mem_cgroup_exit_user_fault();
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index b858e22b259d..d75e0ee19aa3 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2306,7 +2306,7 @@ struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct
struct folio *folio;
if (vma->vm_flags & VM_DROPPABLE)
- gfp |= __GFP_NOWARN;
+ gfp |= __GFP_NOWARN | __GFP_NORETRY;
pol = get_vma_policy(vma, addr, order, &ilx);
folio = folio_alloc_mpol_noprof(gfp, order, pol, ilx, numa_node_id());