diff --git a/Documentation/devicetree/bindings/interrupt-controller/ingenic,intc.yaml b/Documentation/devicetree/bindings/interrupt-controller/ingenic,intc.yaml index 0a046be8d1cd..0358a7739c8e 100644 --- a/Documentation/devicetree/bindings/interrupt-controller/ingenic,intc.yaml +++ b/Documentation/devicetree/bindings/interrupt-controller/ingenic,intc.yaml @@ -23,6 +23,7 @@ properties: - enum: - ingenic,jz4775-intc - ingenic,jz4770-intc + - ingenic,jz4760b-intc - const: ingenic,jz4760-intc - items: - const: ingenic,x1000-intc diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 1a2b5210cdbf..38e327d4b479 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -182,6 +182,9 @@ is dependent on the CPU capability and the kernel configuration. The limit can be retrieved using KVM_CAP_ARM_VM_IPA_SIZE of the KVM_CHECK_EXTENSION ioctl() at run-time. +Creation of the VM will fail if the requested IPA size (whether it is +implicit or explicit) is unsupported on the host. + Please note that configuring the IPA size does not affect the capability exposed by the guest CPUs in ID_AA64MMFR0_EL1[PARange]. It only affects size of the address translated by the stage2 level (guest physical to diff --git a/MAINTAINERS b/MAINTAINERS index 80476bee0d67..915e12d3f03c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -261,8 +261,8 @@ ABI/API L: linux-api@vger.kernel.org F: include/linux/syscalls.h F: kernel/sys_ni.c -F: include/uapi/ -F: arch/*/include/uapi/ +X: include/uapi/ +X: arch/*/include/uapi/ ABIT UGURU 1,2 HARDWARE MONITOR DRIVER M: Hans de Goede diff --git a/Makefile b/Makefile index 1bc96ba26886..fc8076bb42ee 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 5 PATCHLEVEL = 12 SUBLEVEL = 0 -EXTRAVERSION = -rc2 +EXTRAVERSION = -rc3 NAME = Frozen Wasteland # *DOCUMENTATION* diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 81b0d62874f7..d762a4c8dc74 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -347,6 +347,7 @@ config ARCH_EP93XX select ARM_AMBA imply ARM_PATCH_PHYS_VIRT select ARM_VIC + select GENERIC_IRQ_MULTI_HANDLER select AUTO_ZRELADDR select CLKDEV_LOOKUP select CLKSRC_MMIO diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 22d933e9b59e..a7ab84f781f7 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -47,10 +47,10 @@ #define __KVM_HOST_SMCCC_FUNC___kvm_flush_vm_context 2 #define __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid_ipa 3 #define __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid 4 -#define __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_local_vmid 5 +#define __KVM_HOST_SMCCC_FUNC___kvm_flush_cpu_context 5 #define __KVM_HOST_SMCCC_FUNC___kvm_timer_set_cntvoff 6 #define __KVM_HOST_SMCCC_FUNC___kvm_enable_ssbs 7 -#define __KVM_HOST_SMCCC_FUNC___vgic_v3_get_ich_vtr_el2 8 +#define __KVM_HOST_SMCCC_FUNC___vgic_v3_get_gic_config 8 #define __KVM_HOST_SMCCC_FUNC___vgic_v3_read_vmcr 9 #define __KVM_HOST_SMCCC_FUNC___vgic_v3_write_vmcr 10 #define __KVM_HOST_SMCCC_FUNC___vgic_v3_init_lrs 11 @@ -183,16 +183,16 @@ DECLARE_KVM_HYP_SYM(__bp_harden_hyp_vecs); #define __bp_harden_hyp_vecs CHOOSE_HYP_SYM(__bp_harden_hyp_vecs) extern void __kvm_flush_vm_context(void); +extern void __kvm_flush_cpu_context(struct kvm_s2_mmu *mmu); extern void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, phys_addr_t ipa, int level); extern void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu); -extern void __kvm_tlb_flush_local_vmid(struct kvm_s2_mmu *mmu); extern void __kvm_timer_set_cntvoff(u64 cntvoff); extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu); -extern u64 __vgic_v3_get_ich_vtr_el2(void); +extern u64 __vgic_v3_get_gic_config(void); extern u64 __vgic_v3_read_vmcr(void); extern void __vgic_v3_write_vmcr(u32 vmcr); extern void __vgic_v3_init_lrs(void); diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index c0450828378b..32ae676236b6 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -83,6 +83,11 @@ void sysreg_restore_guest_state_vhe(struct kvm_cpu_context *ctxt); void __debug_switch_to_guest(struct kvm_vcpu *vcpu); void __debug_switch_to_host(struct kvm_vcpu *vcpu); +#ifdef __KVM_NVHE_HYPERVISOR__ +void __debug_save_host_buffers_nvhe(struct kvm_vcpu *vcpu); +void __debug_restore_host_buffers_nvhe(struct kvm_vcpu *vcpu); +#endif + void __fpsimd_save_state(struct user_fpsimd_state *fp_regs); void __fpsimd_restore_state(struct user_fpsimd_state *fp_regs); @@ -97,7 +102,8 @@ bool kvm_host_psci_handler(struct kvm_cpu_context *host_ctxt); void __noreturn hyp_panic(void); #ifdef __KVM_NVHE_HYPERVISOR__ -void __noreturn __hyp_do_panic(bool restore_host, u64 spsr, u64 elr, u64 par); +void __noreturn __hyp_do_panic(struct kvm_cpu_context *host_ctxt, u64 spsr, + u64 elr, u64 par); #endif #endif /* __ARM64_KVM_HYP_H__ */ diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index 23f1a557bd9f..5aa9ed1e9ec6 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -101,6 +101,9 @@ KVM_NVHE_ALIAS(__stop___kvm_ex_table); /* Array containing bases of nVHE per-CPU memory regions. */ KVM_NVHE_ALIAS(kvm_arm_hyp_percpu_base); +/* PMU available static key */ +KVM_NVHE_ALIAS(kvm_arm_pmu_available); + #endif /* CONFIG_KVM */ #endif /* __ARM64_KERNEL_IMAGE_VARS_H */ diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index fc4c95dd2d26..7f06ba76698d 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -385,11 +385,16 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) last_ran = this_cpu_ptr(mmu->last_vcpu_ran); /* + * We guarantee that both TLBs and I-cache are private to each + * vcpu. If detecting that a vcpu from the same VM has + * previously run on the same physical CPU, call into the + * hypervisor code to nuke the relevant contexts. + * * We might get preempted before the vCPU actually runs, but * over-invalidation doesn't affect correctness. */ if (*last_ran != vcpu->vcpu_id) { - kvm_call_hyp(__kvm_tlb_flush_local_vmid, mmu); + kvm_call_hyp(__kvm_flush_cpu_context, mmu); *last_ran = vcpu->vcpu_id; } diff --git a/arch/arm64/kvm/hyp/entry.S b/arch/arm64/kvm/hyp/entry.S index b0afad7a99c6..e831d3dfd50d 100644 --- a/arch/arm64/kvm/hyp/entry.S +++ b/arch/arm64/kvm/hyp/entry.S @@ -85,8 +85,10 @@ SYM_INNER_LABEL(__guest_exit_panic, SYM_L_GLOBAL) // If the hyp context is loaded, go straight to hyp_panic get_loaded_vcpu x0, x1 - cbz x0, hyp_panic + cbnz x0, 1f + b hyp_panic +1: // The hyp context is saved so make sure it is restored to allow // hyp_panic to run at hyp and, subsequently, panic to run in the host. // This makes use of __guest_exit to avoid duplication but sets the @@ -94,7 +96,7 @@ SYM_INNER_LABEL(__guest_exit_panic, SYM_L_GLOBAL) // current state is saved to the guest context but it will only be // accurate if the guest had been completely restored. adr_this_cpu x0, kvm_hyp_ctxt, x1 - adr x1, hyp_panic + adr_l x1, hyp_panic str x1, [x0, #CPU_XREG_OFFSET(30)] get_vcpu_ptr x1, x0 @@ -146,7 +148,7 @@ SYM_INNER_LABEL(__guest_exit, SYM_L_GLOBAL) // Now restore the hyp regs restore_callee_saved_regs x2 - set_loaded_vcpu xzr, x1, x2 + set_loaded_vcpu xzr, x2, x3 alternative_if ARM64_HAS_RAS_EXTN // If we have the RAS extensions we can consume a pending error diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 54f4860cd87c..6c1f51f25eb3 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -90,15 +90,18 @@ static inline void __activate_traps_common(struct kvm_vcpu *vcpu) * counter, which could make a PMXEVCNTR_EL0 access UNDEF at * EL1 instead of being trapped to EL2. */ - write_sysreg(0, pmselr_el0); - write_sysreg(ARMV8_PMU_USERENR_MASK, pmuserenr_el0); + if (kvm_arm_support_pmu_v3()) { + write_sysreg(0, pmselr_el0); + write_sysreg(ARMV8_PMU_USERENR_MASK, pmuserenr_el0); + } write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2); } static inline void __deactivate_traps_common(void) { write_sysreg(0, hstr_el2); - write_sysreg(0, pmuserenr_el0); + if (kvm_arm_support_pmu_v3()) + write_sysreg(0, pmuserenr_el0); } static inline void ___activate_traps(struct kvm_vcpu *vcpu) diff --git a/arch/arm64/kvm/hyp/nvhe/debug-sr.c b/arch/arm64/kvm/hyp/nvhe/debug-sr.c index 91a711aa8382..f401724f12ef 100644 --- a/arch/arm64/kvm/hyp/nvhe/debug-sr.c +++ b/arch/arm64/kvm/hyp/nvhe/debug-sr.c @@ -58,16 +58,24 @@ static void __debug_restore_spe(u64 pmscr_el1) write_sysreg_s(pmscr_el1, SYS_PMSCR_EL1); } -void __debug_switch_to_guest(struct kvm_vcpu *vcpu) +void __debug_save_host_buffers_nvhe(struct kvm_vcpu *vcpu) { /* Disable and flush SPE data generation */ __debug_save_spe(&vcpu->arch.host_debug_state.pmscr_el1); +} + +void __debug_switch_to_guest(struct kvm_vcpu *vcpu) +{ __debug_switch_to_guest_common(vcpu); } +void __debug_restore_host_buffers_nvhe(struct kvm_vcpu *vcpu) +{ + __debug_restore_spe(vcpu->arch.host_debug_state.pmscr_el1); +} + void __debug_switch_to_host(struct kvm_vcpu *vcpu) { - __debug_restore_spe(vcpu->arch.host_debug_state.pmscr_el1); __debug_switch_to_host_common(vcpu); } diff --git a/arch/arm64/kvm/hyp/nvhe/host.S b/arch/arm64/kvm/hyp/nvhe/host.S index 6585a7cbbc56..5d94584840cc 100644 --- a/arch/arm64/kvm/hyp/nvhe/host.S +++ b/arch/arm64/kvm/hyp/nvhe/host.S @@ -71,7 +71,8 @@ SYM_FUNC_START(__host_enter) SYM_FUNC_END(__host_enter) /* - * void __noreturn __hyp_do_panic(bool restore_host, u64 spsr, u64 elr, u64 par); + * void __noreturn __hyp_do_panic(struct kvm_cpu_context *host_ctxt, u64 spsr, + * u64 elr, u64 par); */ SYM_FUNC_START(__hyp_do_panic) /* Prepare and exit to the host's panic funciton. */ @@ -82,9 +83,11 @@ SYM_FUNC_START(__hyp_do_panic) hyp_kimg_va lr, x6 msr elr_el2, lr - /* Set the panic format string. Use the, now free, LR as scratch. */ - ldr lr, =__hyp_panic_string - hyp_kimg_va lr, x6 + mov x29, x0 + + /* Load the format string into x0 and arguments into x1-7 */ + ldr x0, =__hyp_panic_string + hyp_kimg_va x0, x6 /* Load the format arguments into x1-7. */ mov x6, x3 @@ -94,9 +97,7 @@ SYM_FUNC_START(__hyp_do_panic) mrs x5, hpfar_el2 /* Enter the host, conditionally restoring the host context. */ - cmp x0, xzr - mov x0, lr - b.eq __host_enter_without_restoring + cbz x29, __host_enter_without_restoring b __host_enter_for_panic SYM_FUNC_END(__hyp_do_panic) diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index f012f8665ecc..936328207bde 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -46,11 +46,11 @@ static void handle___kvm_tlb_flush_vmid(struct kvm_cpu_context *host_ctxt) __kvm_tlb_flush_vmid(kern_hyp_va(mmu)); } -static void handle___kvm_tlb_flush_local_vmid(struct kvm_cpu_context *host_ctxt) +static void handle___kvm_flush_cpu_context(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(struct kvm_s2_mmu *, mmu, host_ctxt, 1); - __kvm_tlb_flush_local_vmid(kern_hyp_va(mmu)); + __kvm_flush_cpu_context(kern_hyp_va(mmu)); } static void handle___kvm_timer_set_cntvoff(struct kvm_cpu_context *host_ctxt) @@ -67,9 +67,9 @@ static void handle___kvm_enable_ssbs(struct kvm_cpu_context *host_ctxt) write_sysreg_el2(tmp, SYS_SCTLR); } -static void handle___vgic_v3_get_ich_vtr_el2(struct kvm_cpu_context *host_ctxt) +static void handle___vgic_v3_get_gic_config(struct kvm_cpu_context *host_ctxt) { - cpu_reg(host_ctxt, 1) = __vgic_v3_get_ich_vtr_el2(); + cpu_reg(host_ctxt, 1) = __vgic_v3_get_gic_config(); } static void handle___vgic_v3_read_vmcr(struct kvm_cpu_context *host_ctxt) @@ -115,10 +115,10 @@ static const hcall_t host_hcall[] = { HANDLE_FUNC(__kvm_flush_vm_context), HANDLE_FUNC(__kvm_tlb_flush_vmid_ipa), HANDLE_FUNC(__kvm_tlb_flush_vmid), - HANDLE_FUNC(__kvm_tlb_flush_local_vmid), + HANDLE_FUNC(__kvm_flush_cpu_context), HANDLE_FUNC(__kvm_timer_set_cntvoff), HANDLE_FUNC(__kvm_enable_ssbs), - HANDLE_FUNC(__vgic_v3_get_ich_vtr_el2), + HANDLE_FUNC(__vgic_v3_get_gic_config), HANDLE_FUNC(__vgic_v3_read_vmcr), HANDLE_FUNC(__vgic_v3_write_vmcr), HANDLE_FUNC(__vgic_v3_init_lrs), diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index f3d0e9eca56c..68ab6b4d5141 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -192,6 +192,14 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) pmu_switch_needed = __pmu_switch_to_guest(host_ctxt); __sysreg_save_state_nvhe(host_ctxt); + /* + * We must flush and disable the SPE buffer for nVHE, as + * the translation regime(EL1&0) is going to be loaded with + * that of the guest. And we must do this before we change the + * translation regime to EL2 (via MDCR_EL2_E2PB == 0) and + * before we load guest Stage1. + */ + __debug_save_host_buffers_nvhe(vcpu); __adjust_pc(vcpu); @@ -234,11 +242,12 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) if (vcpu->arch.flags & KVM_ARM64_FP_ENABLED) __fpsimd_save_fpexc32(vcpu); + __debug_switch_to_host(vcpu); /* * This must come after restoring the host sysregs, since a non-VHE * system may enable SPE here and make use of the TTBRs. */ - __debug_switch_to_host(vcpu); + __debug_restore_host_buffers_nvhe(vcpu); if (pmu_switch_needed) __pmu_switch_to_host(host_ctxt); @@ -257,7 +266,6 @@ void __noreturn hyp_panic(void) u64 spsr = read_sysreg_el2(SYS_SPSR); u64 elr = read_sysreg_el2(SYS_ELR); u64 par = read_sysreg_par(); - bool restore_host = true; struct kvm_cpu_context *host_ctxt; struct kvm_vcpu *vcpu; @@ -271,7 +279,7 @@ void __noreturn hyp_panic(void) __sysreg_restore_state_nvhe(host_ctxt); } - __hyp_do_panic(restore_host, spsr, elr, par); + __hyp_do_panic(host_ctxt, spsr, elr, par); unreachable(); } diff --git a/arch/arm64/kvm/hyp/nvhe/tlb.c b/arch/arm64/kvm/hyp/nvhe/tlb.c index fbde89a2c6e8..229b06748c20 100644 --- a/arch/arm64/kvm/hyp/nvhe/tlb.c +++ b/arch/arm64/kvm/hyp/nvhe/tlb.c @@ -123,7 +123,7 @@ void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu) __tlb_switch_to_host(&cxt); } -void __kvm_tlb_flush_local_vmid(struct kvm_s2_mmu *mmu) +void __kvm_flush_cpu_context(struct kvm_s2_mmu *mmu) { struct tlb_inv_context cxt; @@ -131,6 +131,7 @@ void __kvm_tlb_flush_local_vmid(struct kvm_s2_mmu *mmu) __tlb_switch_to_guest(mmu, &cxt); __tlbi(vmalle1); + asm volatile("ic iallu"); dsb(nsh); isb(); diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 4d177ce1d536..926fc07074f5 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -223,6 +223,7 @@ static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data, goto out; if (!table) { + data->addr = ALIGN_DOWN(data->addr, kvm_granule_size(level)); data->addr += kvm_granule_size(level); goto out; } diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c index 80406f463c28..ee3682b9873c 100644 --- a/arch/arm64/kvm/hyp/vgic-v3-sr.c +++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c @@ -405,9 +405,45 @@ void __vgic_v3_init_lrs(void) __gic_v3_set_lr(0, i); } -u64 __vgic_v3_get_ich_vtr_el2(void) +/* + * Return the GIC CPU configuration: + * - [31:0] ICH_VTR_EL2 + * - [62:32] RES0 + * - [63] MMIO (GICv2) capable + */ +u64 __vgic_v3_get_gic_config(void) { - return read_gicreg(ICH_VTR_EL2); + u64 val, sre = read_gicreg(ICC_SRE_EL1); + unsigned long flags = 0; + + /* + * To check whether we have a MMIO-based (GICv2 compatible) + * CPU interface, we need to disable the system register + * view. To do that safely, we have to prevent any interrupt + * from firing (which would be deadly). + * + * Note that this only makes sense on VHE, as interrupts are + * already masked for nVHE as part of the exception entry to + * EL2. + */ + if (has_vhe()) + flags = local_daif_save(); + + write_gicreg(0, ICC_SRE_EL1); + isb(); + + val = read_gicreg(ICC_SRE_EL1); + + write_gicreg(sre, ICC_SRE_EL1); + isb(); + + if (has_vhe()) + local_daif_restore(flags); + + val = (val & ICC_SRE_EL1_SRE) ? 0 : (1ULL << 63); + val |= read_gicreg(ICH_VTR_EL2); + + return val; } u64 __vgic_v3_read_vmcr(void) diff --git a/arch/arm64/kvm/hyp/vhe/tlb.c b/arch/arm64/kvm/hyp/vhe/tlb.c index fd7895945bbc..66f17349f0c3 100644 --- a/arch/arm64/kvm/hyp/vhe/tlb.c +++ b/arch/arm64/kvm/hyp/vhe/tlb.c @@ -127,7 +127,7 @@ void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu) __tlb_switch_to_host(&cxt); } -void __kvm_tlb_flush_local_vmid(struct kvm_s2_mmu *mmu) +void __kvm_flush_cpu_context(struct kvm_s2_mmu *mmu) { struct tlb_inv_context cxt; @@ -135,6 +135,7 @@ void __kvm_tlb_flush_local_vmid(struct kvm_s2_mmu *mmu) __tlb_switch_to_guest(mmu, &cxt); __tlbi(vmalle1); + asm volatile("ic iallu"); dsb(nsh); isb(); diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 77cb2d28f2a4..8711894db8c2 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1312,8 +1312,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, * Prevent userspace from creating a memory region outside of the IPA * space addressable by the KVM guest IPA space. */ - if (memslot->base_gfn + memslot->npages >= - (kvm_phys_size(kvm) >> PAGE_SHIFT)) + if ((memslot->base_gfn + memslot->npages) > (kvm_phys_size(kvm) >> PAGE_SHIFT)) return -EFAULT; mmap_read_lock(current->mm); diff --git a/arch/arm64/kvm/perf.c b/arch/arm64/kvm/perf.c index d45b8b9a4415..739164324afe 100644 --- a/arch/arm64/kvm/perf.c +++ b/arch/arm64/kvm/perf.c @@ -11,6 +11,8 @@ #include +DEFINE_STATIC_KEY_FALSE(kvm_arm_pmu_available); + static int kvm_is_in_guest(void) { return kvm_get_running_vcpu() != NULL; @@ -48,6 +50,14 @@ static struct perf_guest_info_callbacks kvm_guest_cbs = { int kvm_perf_init(void) { + /* + * Check if HW_PERF_EVENTS are supported by checking the number of + * hardware performance counters. This could ensure the presence of + * a physical PMU and CONFIG_PERF_EVENT is selected. + */ + if (IS_ENABLED(CONFIG_ARM_PMU) && perf_num_counters() > 0) + static_branch_enable(&kvm_arm_pmu_available); + return perf_register_guest_info_callbacks(&kvm_guest_cbs); } diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index e9ec08b0b070..e32c6e139a09 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -823,16 +823,6 @@ u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1) return val & mask; } -bool kvm_arm_support_pmu_v3(void) -{ - /* - * Check if HW_PERF_EVENTS are supported by checking the number of - * hardware performance counters. This could ensure the presence of - * a physical PMU and CONFIG_PERF_EVENT is selected. - */ - return (perf_num_counters() > 0); -} - int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu) { if (!kvm_vcpu_has_pmu(vcpu)) diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index e81c7ec9e102..bd354cd45d28 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -326,10 +326,9 @@ int kvm_set_ipa_limit(void) } kvm_ipa_limit = id_aa64mmfr0_parange_to_phys_shift(parange); - WARN(kvm_ipa_limit < KVM_PHYS_SHIFT, - "KVM IPA Size Limit (%d bits) is smaller than default size\n", - kvm_ipa_limit); - kvm_info("IPA Size Limit: %d bits\n", kvm_ipa_limit); + kvm_info("IPA Size Limit: %d bits%s\n", kvm_ipa_limit, + ((kvm_ipa_limit < KVM_PHYS_SHIFT) ? + " (Reduced IPA size, limited VM/VMM compatibility)" : "")); return 0; } @@ -358,6 +357,11 @@ int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type) return -EINVAL; } else { phys_shift = KVM_PHYS_SHIFT; + if (phys_shift > kvm_ipa_limit) { + pr_warn_once("%s using unsupported default IPA limit, upgrade your VMM\n", + current->comm); + return -EINVAL; + } } mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index 52915b342351..6f530925a231 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -574,9 +574,13 @@ early_param("kvm-arm.vgic_v4_enable", early_gicv4_enable); */ int vgic_v3_probe(const struct gic_kvm_info *info) { - u32 ich_vtr_el2 = kvm_call_hyp_ret(__vgic_v3_get_ich_vtr_el2); + u64 ich_vtr_el2 = kvm_call_hyp_ret(__vgic_v3_get_gic_config); + bool has_v2; int ret; + has_v2 = ich_vtr_el2 >> 63; + ich_vtr_el2 = (u32)ich_vtr_el2; + /* * The ListRegs field is 5 bits, but there is an architectural * maximum of 16 list registers. Just ignore bit 4... @@ -594,13 +598,15 @@ int vgic_v3_probe(const struct gic_kvm_info *info) gicv4_enable ? "en" : "dis"); } + kvm_vgic_global_state.vcpu_base = 0; + if (!info->vcpu.start) { kvm_info("GICv3: no GICV resource entry\n"); - kvm_vgic_global_state.vcpu_base = 0; + } else if (!has_v2) { + pr_warn(FW_BUG "CPU interface incapable of MMIO access\n"); } else if (!PAGE_ALIGNED(info->vcpu.start)) { pr_warn("GICV physical address 0x%llx not page aligned\n", (unsigned long long)info->vcpu.start); - kvm_vgic_global_state.vcpu_base = 0; } else { kvm_vgic_global_state.vcpu_base = info->vcpu.start; kvm_vgic_global_state.can_emulate_gicv2 = true; diff --git a/arch/ia64/include/asm/syscall.h b/arch/ia64/include/asm/syscall.h index 6c6f16e409a8..0d23c0049301 100644 --- a/arch/ia64/include/asm/syscall.h +++ b/arch/ia64/include/asm/syscall.h @@ -32,7 +32,7 @@ static inline void syscall_rollback(struct task_struct *task, static inline long syscall_get_error(struct task_struct *task, struct pt_regs *regs) { - return regs->r10 == -1 ? regs->r8:0; + return regs->r10 == -1 ? -regs->r8:0; } static inline long syscall_get_return_value(struct task_struct *task, diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c index c3490ee2daa5..e14f5653393a 100644 --- a/arch/ia64/kernel/ptrace.c +++ b/arch/ia64/kernel/ptrace.c @@ -2013,27 +2013,39 @@ static void syscall_get_set_args_cb(struct unw_frame_info *info, void *data) { struct syscall_get_set_args *args = data; struct pt_regs *pt = args->regs; - unsigned long *krbs, cfm, ndirty; + unsigned long *krbs, cfm, ndirty, nlocals, nouts; int i, count; if (unw_unwind_to_user(info) < 0) return; + /* + * We get here via a few paths: + * - break instruction: cfm is shared with caller. + * syscall args are in out= regs, locals are non-empty. + * - epsinstruction: cfm is set by br.call + * locals don't exist. + * + * For both cases argguments are reachable in cfm.sof - cfm.sol. + * CFM: [ ... | sor: 17..14 | sol : 13..7 | sof : 6..0 ] + */ cfm = pt->cr_ifs; + nlocals = (cfm >> 7) & 0x7f; /* aka sol */ + nouts = (cfm & 0x7f) - nlocals; /* aka sof - sol */ krbs = (unsigned long *)info->task + IA64_RBS_OFFSET/8; ndirty = ia64_rse_num_regs(krbs, krbs + (pt->loadrs >> 19)); count = 0; if (in_syscall(pt)) - count = min_t(int, args->n, cfm & 0x7f); + count = min_t(int, args->n, nouts); + /* Iterate over outs. */ for (i = 0; i < count; i++) { + int j = ndirty + nlocals + i + args->i; if (args->rw) - *ia64_rse_skip_regs(krbs, ndirty + i + args->i) = - args->args[i]; + *ia64_rse_skip_regs(krbs, j) = args->args[i]; else - args->args[i] = *ia64_rse_skip_regs(krbs, - ndirty + i + args->i); + args->args[i] = *ia64_rse_skip_regs(krbs, j); } if (!args->rw) { diff --git a/arch/powerpc/include/asm/code-patching.h b/arch/powerpc/include/asm/code-patching.h index eacc9102c251..f1d029bf906e 100644 --- a/arch/powerpc/include/asm/code-patching.h +++ b/arch/powerpc/include/asm/code-patching.h @@ -73,9 +73,10 @@ void __patch_exception(int exc, unsigned long addr); #endif #define OP_RT_RA_MASK 0xffff0000UL -#define LIS_R2 0x3c020000UL -#define ADDIS_R2_R12 0x3c4c0000UL -#define ADDI_R2_R2 0x38420000UL +#define LIS_R2 (PPC_INST_ADDIS | __PPC_RT(R2)) +#define ADDIS_R2_R12 (PPC_INST_ADDIS | __PPC_RT(R2) | __PPC_RA(R12)) +#define ADDI_R2_R2 (PPC_INST_ADDI | __PPC_RT(R2) | __PPC_RA(R2)) + static inline unsigned long ppc_function_entry(void *func) { diff --git a/arch/powerpc/include/asm/interrupt.h b/arch/powerpc/include/asm/interrupt.h index aedfba29e43a..e8d09a841373 100644 --- a/arch/powerpc/include/asm/interrupt.h +++ b/arch/powerpc/include/asm/interrupt.h @@ -410,7 +410,6 @@ DECLARE_INTERRUPT_HANDLER(altivec_assist_exception); DECLARE_INTERRUPT_HANDLER(CacheLockingException); DECLARE_INTERRUPT_HANDLER(SPEFloatingPointException); DECLARE_INTERRUPT_HANDLER(SPEFloatingPointRoundException); -DECLARE_INTERRUPT_HANDLER(unrecoverable_exception); DECLARE_INTERRUPT_HANDLER(WatchdogException); DECLARE_INTERRUPT_HANDLER(kernel_bad_stack); @@ -437,6 +436,8 @@ DECLARE_INTERRUPT_HANDLER_NMI(hmi_exception_realmode); DECLARE_INTERRUPT_HANDLER_ASYNC(TAUException); +void unrecoverable_exception(struct pt_regs *regs); + void replay_system_reset(void); void replay_soft_interrupts(void); diff --git a/arch/powerpc/include/asm/ptrace.h b/arch/powerpc/include/asm/ptrace.h index 975ba260006a..1499e928ea6a 100644 --- a/arch/powerpc/include/asm/ptrace.h +++ b/arch/powerpc/include/asm/ptrace.h @@ -195,7 +195,7 @@ static inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc) #define TRAP_FLAGS_MASK 0x11 #define TRAP(regs) ((regs)->trap & ~TRAP_FLAGS_MASK) #define FULL_REGS(regs) (((regs)->trap & 1) == 0) -#define SET_FULL_REGS(regs) ((regs)->trap |= 1) +#define SET_FULL_REGS(regs) ((regs)->trap &= ~1) #endif #define CHECK_FULL_REGS(regs) BUG_ON(!FULL_REGS(regs)) #define NV_REG_POISON 0xdeadbeefdeadbeefUL @@ -210,7 +210,7 @@ static inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc) #define TRAP_FLAGS_MASK 0x1F #define TRAP(regs) ((regs)->trap & ~TRAP_FLAGS_MASK) #define FULL_REGS(regs) (((regs)->trap & 1) == 0) -#define SET_FULL_REGS(regs) ((regs)->trap |= 1) +#define SET_FULL_REGS(regs) ((regs)->trap &= ~1) #define IS_CRITICAL_EXC(regs) (((regs)->trap & 2) != 0) #define IS_MCHECK_EXC(regs) (((regs)->trap & 4) != 0) #define IS_DEBUG_EXC(regs) (((regs)->trap & 8) != 0) diff --git a/arch/powerpc/include/asm/switch_to.h b/arch/powerpc/include/asm/switch_to.h index fdab93428372..9d1fbd8be1c7 100644 --- a/arch/powerpc/include/asm/switch_to.h +++ b/arch/powerpc/include/asm/switch_to.h @@ -71,6 +71,16 @@ static inline void disable_kernel_vsx(void) { msr_check_and_clear(MSR_FP|MSR_VEC|MSR_VSX); } +#else +static inline void enable_kernel_vsx(void) +{ + BUILD_BUG(); +} + +static inline void disable_kernel_vsx(void) +{ + BUILD_BUG(); +} #endif #ifdef CONFIG_SPE diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 60d3051a8bc8..8082b690e874 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -466,7 +466,7 @@ DEFINE_FIXED_SYMBOL(\name\()_common_real) ld r10,PACAKMSR(r13) /* get MSR value for kernel */ /* MSR[RI] is clear iff using SRR regs */ - .if IHSRR == EXC_HV_OR_STD + .if IHSRR_IF_HVMODE BEGIN_FTR_SECTION xori r10,r10,MSR_RI END_FTR_SECTION_IFCLR(CPU_FTR_HVMODE) diff --git a/arch/powerpc/kernel/interrupt.c b/arch/powerpc/kernel/interrupt.c index 2ef3c4051bb9..c475a229a42a 100644 --- a/arch/powerpc/kernel/interrupt.c +++ b/arch/powerpc/kernel/interrupt.c @@ -436,7 +436,6 @@ notrace unsigned long interrupt_exit_user_prepare(struct pt_regs *regs, unsigned return ret; } -void unrecoverable_exception(struct pt_regs *regs); void preempt_schedule_irq(void); notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs, unsigned long msr) diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 1583fd1c6010..a44a30b0688c 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -2170,7 +2170,7 @@ DEFINE_INTERRUPT_HANDLER(SPEFloatingPointRoundException) * in the MSR is 0. This indicates that SRR0/1 are live, and that * we therefore lost state by taking this exception. */ -DEFINE_INTERRUPT_HANDLER(unrecoverable_exception) +void unrecoverable_exception(struct pt_regs *regs) { pr_emerg("Unrecoverable exception %lx at %lx (msr=%lx)\n", regs->trap, regs->nip, regs->msr); diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index a2433ae8a65e..4efd39aacb9f 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -128,7 +128,8 @@ static noinstr bool __do_fast_syscall_32(struct pt_regs *regs) regs->ax = -EFAULT; instrumentation_end(); - syscall_exit_to_user_mode(regs); + local_irq_disable(); + irqentry_exit_to_user_mode(regs); return false; } diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 541fdaf64045..0051cf5c792d 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -210,6 +210,8 @@ SYM_CODE_START(entry_SYSCALL_compat) /* Switch to the kernel stack */ movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp +SYM_INNER_LABEL(entry_SYSCALL_compat_safe_stack, SYM_L_GLOBAL) + /* Construct struct pt_regs on stack */ pushq $__USER32_DS /* pt_regs->ss */ pushq %r8 /* pt_regs->sp */ diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 6ddeed3cd2ac..18df17129695 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -81,7 +81,11 @@ DEFINE_STATIC_CALL_NULL(x86_pmu_swap_task_ctx, *x86_pmu.swap_task_ctx); DEFINE_STATIC_CALL_NULL(x86_pmu_drain_pebs, *x86_pmu.drain_pebs); DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_aliases, *x86_pmu.pebs_aliases); -DEFINE_STATIC_CALL_NULL(x86_pmu_guest_get_msrs, *x86_pmu.guest_get_msrs); +/* + * This one is magic, it will get called even when PMU init fails (because + * there is no PMU), in which case it should simply return NULL. + */ +DEFINE_STATIC_CALL_RET0(x86_pmu_guest_get_msrs, *x86_pmu.guest_get_msrs); u64 __read_mostly hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] @@ -1944,13 +1948,6 @@ static void _x86_pmu_read(struct perf_event *event) x86_perf_event_update(event); } -static inline struct perf_guest_switch_msr * -perf_guest_get_msrs_nop(int *nr) -{ - *nr = 0; - return NULL; -} - static int __init init_hw_perf_events(void) { struct x86_pmu_quirk *quirk; @@ -2025,7 +2022,7 @@ static int __init init_hw_perf_events(void) x86_pmu.read = _x86_pmu_read; if (!x86_pmu.guest_get_msrs) - x86_pmu.guest_get_msrs = perf_guest_get_msrs_nop; + x86_pmu.guest_get_msrs = (void *)&__static_call_return0; x86_pmu_static_call_update(); diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 5bac48d5c18e..7bbb5bb98d8c 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3662,8 +3662,10 @@ static int intel_pmu_hw_config(struct perf_event *event) if (!(event->attr.freq || (event->attr.wakeup_events && !event->attr.watermark))) { event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD; if (!(event->attr.sample_type & - ~intel_pmu_large_pebs_flags(event))) + ~intel_pmu_large_pebs_flags(event))) { event->hw.flags |= PERF_X86_EVENT_LARGE_PEBS; + event->attach_state |= PERF_ATTACH_SCHED_CB; + } } if (x86_pmu.pebs_aliases) x86_pmu.pebs_aliases(event); @@ -3676,6 +3678,7 @@ static int intel_pmu_hw_config(struct perf_event *event) ret = intel_pmu_setup_lbr_filter(event); if (ret) return ret; + event->attach_state |= PERF_ATTACH_SCHED_CB; /* * BTS is set up earlier in this path, so don't account twice diff --git a/arch/x86/include/asm/insn-eval.h b/arch/x86/include/asm/insn-eval.h index a0f839aa144d..98b4dae5e8bc 100644 --- a/arch/x86/include/asm/insn-eval.h +++ b/arch/x86/include/asm/insn-eval.h @@ -23,6 +23,8 @@ unsigned long insn_get_seg_base(struct pt_regs *regs, int seg_reg_idx); int insn_get_code_seg_params(struct pt_regs *regs); int insn_fetch_from_user(struct pt_regs *regs, unsigned char buf[MAX_INSN_SIZE]); +int insn_fetch_from_user_inatomic(struct pt_regs *regs, + unsigned char buf[MAX_INSN_SIZE]); bool insn_decode(struct insn *insn, struct pt_regs *regs, unsigned char buf[MAX_INSN_SIZE], int buf_size); diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 877a4025d8da..9bc091ecaaeb 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -963,7 +963,7 @@ struct kvm_arch { struct kvm_pit *vpit; atomic_t vapics_in_nmi_mode; struct mutex apic_map_lock; - struct kvm_apic_map *apic_map; + struct kvm_apic_map __rcu *apic_map; atomic_t apic_map_dirty; bool apic_access_page_done; @@ -1036,7 +1036,7 @@ struct kvm_arch { bool bus_lock_detection_enabled; - struct kvm_pmu_event_filter *pmu_event_filter; + struct kvm_pmu_event_filter __rcu *pmu_event_filter; struct task_struct *nx_lpage_recovery_thread; #ifdef CONFIG_X86_64 diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h index 2c35f1c01a2d..b6a9d51d1d79 100644 --- a/arch/x86/include/asm/proto.h +++ b/arch/x86/include/asm/proto.h @@ -25,6 +25,7 @@ void __end_SYSENTER_singlestep_region(void); void entry_SYSENTER_compat(void); void __end_entry_SYSENTER_compat(void); void entry_SYSCALL_compat(void); +void entry_SYSCALL_compat_safe_stack(void); void entry_INT80_compat(void); #ifdef CONFIG_XEN_PV void xen_entry_INT80_compat(void); diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index d8324a236696..409f661481e1 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -94,6 +94,8 @@ struct pt_regs { #include #endif +#include + struct cpuinfo_x86; struct task_struct; @@ -175,6 +177,19 @@ static inline bool any_64bit_mode(struct pt_regs *regs) #ifdef CONFIG_X86_64 #define current_user_stack_pointer() current_pt_regs()->sp #define compat_user_stack_pointer() current_pt_regs()->sp + +static inline bool ip_within_syscall_gap(struct pt_regs *regs) +{ + bool ret = (regs->ip >= (unsigned long)entry_SYSCALL_64 && + regs->ip < (unsigned long)entry_SYSCALL_64_safe_stack); + +#ifdef CONFIG_IA32_EMULATION + ret = ret || (regs->ip >= (unsigned long)entry_SYSCALL_compat && + regs->ip < (unsigned long)entry_SYSCALL_compat_safe_stack); +#endif + + return ret; +} #endif static inline unsigned long kernel_stack_pointer(struct pt_regs *regs) diff --git a/arch/x86/include/asm/smap.h b/arch/x86/include/asm/smap.h index 8b58d6975d5d..0bc9b0895f33 100644 --- a/arch/x86/include/asm/smap.h +++ b/arch/x86/include/asm/smap.h @@ -58,9 +58,8 @@ static __always_inline unsigned long smap_save(void) unsigned long flags; asm volatile ("# smap_save\n\t" - ALTERNATIVE("jmp 1f", "", X86_FEATURE_SMAP) - "pushf; pop %0; " __ASM_CLAC "\n\t" - "1:" + ALTERNATIVE("", "pushf; pop %0; " __ASM_CLAC "\n\t", + X86_FEATURE_SMAP) : "=rm" (flags) : : "memory", "cc"); return flags; @@ -69,9 +68,8 @@ static __always_inline unsigned long smap_save(void) static __always_inline void smap_restore(unsigned long flags) { asm volatile ("# smap_restore\n\t" - ALTERNATIVE("jmp 1f", "", X86_FEATURE_SMAP) - "push %0; popf\n\t" - "1:" + ALTERNATIVE("", "push %0; popf\n\t", + X86_FEATURE_SMAP) : : "g" (flags) : "memory", "cc"); } diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index aa593743acf6..1fc0962c89c0 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -268,21 +268,20 @@ static void __init kvmclock_init_mem(void) static int __init kvm_setup_vsyscall_timeinfo(void) { -#ifdef CONFIG_X86_64 - u8 flags; - - if (!per_cpu(hv_clock_per_cpu, 0) || !kvmclock_vsyscall) - return 0; - - flags = pvclock_read_flags(&hv_clock_boot[0].pvti); - if (!(flags & PVCLOCK_TSC_STABLE_BIT)) - return 0; - - kvm_clock.vdso_clock_mode = VDSO_CLOCKMODE_PVCLOCK; -#endif - kvmclock_init_mem(); +#ifdef CONFIG_X86_64 + if (per_cpu(hv_clock_per_cpu, 0) && kvmclock_vsyscall) { + u8 flags; + + flags = pvclock_read_flags(&hv_clock_boot[0].pvti); + if (!(flags & PVCLOCK_TSC_STABLE_BIT)) + return 0; + + kvm_clock.vdso_clock_mode = VDSO_CLOCKMODE_PVCLOCK; + } +#endif + return 0; } early_initcall(kvm_setup_vsyscall_timeinfo); diff --git a/arch/x86/kernel/sev-es.c b/arch/x86/kernel/sev-es.c index 84c1821819af..04a780abb512 100644 --- a/arch/x86/kernel/sev-es.c +++ b/arch/x86/kernel/sev-es.c @@ -121,8 +121,18 @@ static void __init setup_vc_stacks(int cpu) cea_set_pte((void *)vaddr, pa, PAGE_KERNEL); } -static __always_inline bool on_vc_stack(unsigned long sp) +static __always_inline bool on_vc_stack(struct pt_regs *regs) { + unsigned long sp = regs->sp; + + /* User-mode RSP is not trusted */ + if (user_mode(regs)) + return false; + + /* SYSCALL gap still has user-mode RSP */ + if (ip_within_syscall_gap(regs)) + return false; + return ((sp >= __this_cpu_ist_bottom_va(VC)) && (sp < __this_cpu_ist_top_va(VC))); } @@ -144,7 +154,7 @@ void noinstr __sev_es_ist_enter(struct pt_regs *regs) old_ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]); /* Make room on the IST stack */ - if (on_vc_stack(regs->sp)) + if (on_vc_stack(regs)) new_ist = ALIGN_DOWN(regs->sp, 8) - sizeof(old_ist); else new_ist = old_ist - sizeof(old_ist); @@ -248,7 +258,7 @@ static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt) int res; if (user_mode(ctxt->regs)) { - res = insn_fetch_from_user(ctxt->regs, buffer); + res = insn_fetch_from_user_inatomic(ctxt->regs, buffer); if (!res) { ctxt->fi.vector = X86_TRAP_PF; ctxt->fi.error_code = X86_PF_INSTR | X86_PF_USER; @@ -1248,13 +1258,12 @@ static __always_inline bool on_vc_fallback_stack(struct pt_regs *regs) DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_communication) { struct sev_es_runtime_data *data = this_cpu_read(runtime_data); + irqentry_state_t irq_state; struct ghcb_state state; struct es_em_ctxt ctxt; enum es_result result; struct ghcb *ghcb; - lockdep_assert_irqs_disabled(); - /* * Handle #DB before calling into !noinstr code to avoid recursive #DB. */ @@ -1263,6 +1272,8 @@ DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_communication) return; } + irq_state = irqentry_nmi_enter(regs); + lockdep_assert_irqs_disabled(); instrumentation_begin(); /* @@ -1325,6 +1336,7 @@ DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_communication) out: instrumentation_end(); + irqentry_nmi_exit(regs, irq_state); return; diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 7f5aec758f0e..ac1874a2a70e 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -694,8 +694,7 @@ asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *r * In the SYSCALL entry path the RSP value comes from user-space - don't * trust it and switch to the current kernel stack */ - if (regs->ip >= (unsigned long)entry_SYSCALL_64 && - regs->ip < (unsigned long)entry_SYSCALL_64_safe_stack) { + if (ip_within_syscall_gap(regs)) { sp = this_cpu_read(cpu_current_top_of_stack); goto sync; } diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index 2a1d47f47eee..a1202536fc57 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -13,7 +13,7 @@ #define orc_warn_current(args...) \ ({ \ - if (state->task == current) \ + if (state->task == current && !state->error) \ orc_warn(args); \ }) @@ -367,8 +367,8 @@ static bool deref_stack_regs(struct unwind_state *state, unsigned long addr, if (!stack_access_ok(state, addr, sizeof(struct pt_regs))) return false; - *ip = regs->ip; - *sp = regs->sp; + *ip = READ_ONCE_NOCHECK(regs->ip); + *sp = READ_ONCE_NOCHECK(regs->sp); return true; } @@ -380,8 +380,8 @@ static bool deref_stack_iret_regs(struct unwind_state *state, unsigned long addr if (!stack_access_ok(state, addr, IRET_FRAME_SIZE)) return false; - *ip = regs->ip; - *sp = regs->sp; + *ip = READ_ONCE_NOCHECK(regs->ip); + *sp = READ_ONCE_NOCHECK(regs->sp); return true; } @@ -402,12 +402,12 @@ static bool get_reg(struct unwind_state *state, unsigned int reg_off, return false; if (state->full_regs) { - *val = ((unsigned long *)state->regs)[reg]; + *val = READ_ONCE_NOCHECK(((unsigned long *)state->regs)[reg]); return true; } if (state->prev_regs) { - *val = ((unsigned long *)state->prev_regs)[reg]; + *val = READ_ONCE_NOCHECK(((unsigned long *)state->prev_regs)[reg]); return true; } diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 45d40bfacb7c..cc369b9ad8f1 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1642,7 +1642,16 @@ static void apic_timer_expired(struct kvm_lapic *apic, bool from_timer_fn) } if (kvm_use_posted_timer_interrupt(apic->vcpu)) { - kvm_wait_lapic_expire(vcpu); + /* + * Ensure the guest's timer has truly expired before posting an + * interrupt. Open code the relevant checks to avoid querying + * lapic_timer_int_injected(), which will be false since the + * interrupt isn't yet injected. Waiting until after injecting + * is not an option since that won't help a posted interrupt. + */ + if (vcpu->arch.apic->lapic_timer.expired_tscdeadline && + vcpu->arch.apic->lapic_timer.timer_advance_ns) + __kvm_wait_lapic_expire(vcpu); kvm_apic_inject_pending_timer_irqs(apic); return; } @@ -2595,6 +2604,7 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) apic_update_ppr(apic); hrtimer_cancel(&apic->lapic_timer.timer); + apic->lapic_timer.expired_tscdeadline = 0; apic_update_lvtt(apic); apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0)); update_divide_count(apic); diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index c926c6b899a1..d78915019b08 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -337,7 +337,18 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, u64 *pt, cpu_relax(); } } else { + /* + * If the SPTE is not MMU-present, there is no backing + * page associated with the SPTE and so no side effects + * that need to be recorded, and exclusive ownership of + * mmu_lock ensures the SPTE can't be made present. + * Note, zapping MMIO SPTEs is also unnecessary as they + * are guarded by the memslots generation, not by being + * unreachable. + */ old_child_spte = READ_ONCE(*sptep); + if (!is_shadow_present_pte(old_child_spte)) + continue; /* * Marking the SPTE as a removed SPTE is not diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index baee91c1e936..58a45bb139f8 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -115,13 +115,6 @@ static const struct svm_direct_access_msrs { { .index = MSR_INVALID, .always = false }, }; -/* enable NPT for AMD64 and X86 with PAE */ -#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) -bool npt_enabled = true; -#else -bool npt_enabled; -#endif - /* * These 2 parameters are used to config the controls for Pause-Loop Exiting: * pause_filter_count: On processors that support Pause filtering(indicated @@ -170,9 +163,12 @@ module_param(pause_filter_count_shrink, ushort, 0444); static unsigned short pause_filter_count_max = KVM_SVM_DEFAULT_PLE_WINDOW_MAX; module_param(pause_filter_count_max, ushort, 0444); -/* allow nested paging (virtualized MMU) for all guests */ -static int npt = true; -module_param(npt, int, S_IRUGO); +/* + * Use nested page tables by default. Note, NPT may get forced off by + * svm_hardware_setup() if it's unsupported by hardware or the host kernel. + */ +bool npt_enabled = true; +module_param_named(npt, npt_enabled, bool, 0444); /* allow nested virtualization in KVM/SVM */ static int nested = true; @@ -988,10 +984,15 @@ static __init int svm_hardware_setup(void) goto err; } - if (!boot_cpu_has(X86_FEATURE_NPT)) + /* + * KVM's MMU doesn't support using 2-level paging for itself, and thus + * NPT isn't supported if the host is using 2-level paging since host + * CR4 is unchanged on VMRUN. + */ + if (!IS_ENABLED(CONFIG_X86_64) && !IS_ENABLED(CONFIG_X86_PAE)) npt_enabled = false; - if (npt_enabled && !npt) + if (!boot_cpu_has(X86_FEATURE_NPT)) npt_enabled = false; kvm_configure_mmu(npt_enabled, get_max_npt_level(), PG_LEVEL_1G); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 50810d471462..32cf8287d4a7 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6580,8 +6580,8 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) int i, nr_msrs; struct perf_guest_switch_msr *msrs; + /* Note, nr_msrs may be garbage if perf_guest_get_msrs() returns NULL. */ msrs = perf_guest_get_msrs(&nr_msrs); - if (!msrs) return; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2a20ce60152e..47e021bdcc94 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10601,7 +10601,7 @@ void __user * __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, return (void __user *)hva; } else { if (!slot || !slot->npages) - return 0; + return NULL; old_npages = slot->npages; hva = slot->userspace_addr; diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c index 4229950a5d78..bb0b3fe1e0a0 100644 --- a/arch/x86/lib/insn-eval.c +++ b/arch/x86/lib/insn-eval.c @@ -1415,6 +1415,25 @@ void __user *insn_get_addr_ref(struct insn *insn, struct pt_regs *regs) } } +static unsigned long insn_get_effective_ip(struct pt_regs *regs) +{ + unsigned long seg_base = 0; + + /* + * If not in user-space long mode, a custom code segment could be in + * use. This is true in protected mode (if the process defined a local + * descriptor table), or virtual-8086 mode. In most of the cases + * seg_base will be zero as in USER_CS. + */ + if (!user_64bit_mode(regs)) { + seg_base = insn_get_seg_base(regs, INAT_SEG_REG_CS); + if (seg_base == -1L) + return 0; + } + + return seg_base + regs->ip; +} + /** * insn_fetch_from_user() - Copy instruction bytes from user-space memory * @regs: Structure with register values as seen when entering kernel mode @@ -1431,24 +1450,43 @@ void __user *insn_get_addr_ref(struct insn *insn, struct pt_regs *regs) */ int insn_fetch_from_user(struct pt_regs *regs, unsigned char buf[MAX_INSN_SIZE]) { - unsigned long seg_base = 0; + unsigned long ip; int not_copied; - /* - * If not in user-space long mode, a custom code segment could be in - * use. This is true in protected mode (if the process defined a local - * descriptor table), or virtual-8086 mode. In most of the cases - * seg_base will be zero as in USER_CS. - */ - if (!user_64bit_mode(regs)) { - seg_base = insn_get_seg_base(regs, INAT_SEG_REG_CS); - if (seg_base == -1L) - return 0; - } + ip = insn_get_effective_ip(regs); + if (!ip) + return 0; + not_copied = copy_from_user(buf, (void __user *)ip, MAX_INSN_SIZE); - not_copied = copy_from_user(buf, (void __user *)(seg_base + regs->ip), - MAX_INSN_SIZE); + return MAX_INSN_SIZE - not_copied; +} + +/** + * insn_fetch_from_user_inatomic() - Copy instruction bytes from user-space memory + * while in atomic code + * @regs: Structure with register values as seen when entering kernel mode + * @buf: Array to store the fetched instruction + * + * Gets the linear address of the instruction and copies the instruction bytes + * to the buf. This function must be used in atomic context. + * + * Returns: + * + * Number of instruction bytes copied. + * + * 0 if nothing was copied. + */ +int insn_fetch_from_user_inatomic(struct pt_regs *regs, unsigned char buf[MAX_INSN_SIZE]) +{ + unsigned long ip; + int not_copied; + + ip = insn_get_effective_ip(regs); + if (!ip) + return 0; + + not_copied = __copy_from_user_inatomic(buf, (void __user *)ip, MAX_INSN_SIZE); return MAX_INSN_SIZE - not_copied; } diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index a711a2e2a794..cf8deecc39ef 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -627,7 +627,7 @@ static ssize_t writeback_store(struct device *dev, struct bio_vec bio_vec; struct page *page; ssize_t ret = len; - int mode; + int mode, err; unsigned long blk_idx = 0; if (sysfs_streq(buf, "idle")) @@ -638,8 +638,8 @@ static ssize_t writeback_store(struct device *dev, if (strncmp(buf, PAGE_WB_SIG, sizeof(PAGE_WB_SIG) - 1)) return -EINVAL; - ret = kstrtol(buf + sizeof(PAGE_WB_SIG) - 1, 10, &index); - if (ret || index >= nr_pages) + if (kstrtol(buf + sizeof(PAGE_WB_SIG) - 1, 10, &index) || + index >= nr_pages) return -EINVAL; nr_pages = 1; @@ -663,7 +663,7 @@ static ssize_t writeback_store(struct device *dev, goto release_init_lock; } - while (nr_pages--) { + for (; nr_pages != 0; index++, nr_pages--) { struct bio_vec bvec; bvec.bv_page = page; @@ -728,12 +728,17 @@ static ssize_t writeback_store(struct device *dev, * XXX: A single page IO would be inefficient for write * but it would be not bad as starter. */ - ret = submit_bio_wait(&bio); - if (ret) { + err = submit_bio_wait(&bio); + if (err) { zram_slot_lock(zram, index); zram_clear_flag(zram, index, ZRAM_UNDER_WB); zram_clear_flag(zram, index, ZRAM_IDLE); zram_slot_unlock(zram, index); + /* + * Return last IO error unless every IO were + * not suceeded. + */ + ret = err; continue; } diff --git a/drivers/firmware/efi/libstub/efi-stub.c b/drivers/firmware/efi/libstub/efi-stub.c index ec2f3985bef3..26e69788f27a 100644 --- a/drivers/firmware/efi/libstub/efi-stub.c +++ b/drivers/firmware/efi/libstub/efi-stub.c @@ -96,6 +96,18 @@ static void install_memreserve_table(void) efi_err("Failed to install memreserve config table!\n"); } +static u32 get_supported_rt_services(void) +{ + const efi_rt_properties_table_t *rt_prop_table; + u32 supported = EFI_RT_SUPPORTED_ALL; + + rt_prop_table = get_efi_config_table(EFI_RT_PROPERTIES_TABLE_GUID); + if (rt_prop_table) + supported &= rt_prop_table->runtime_services_supported; + + return supported; +} + /* * EFI entry point for the arm/arm64 EFI stubs. This is the entrypoint * that is described in the PE/COFF header. Most of the code is the same @@ -250,6 +262,10 @@ efi_status_t __efiapi efi_pe_entry(efi_handle_t handle, (prop_tbl->memory_protection_attribute & EFI_PROPERTIES_RUNTIME_MEMORY_PROTECTION_NON_EXECUTABLE_PE_DATA); + /* force efi_novamap if SetVirtualAddressMap() is unsupported */ + efi_novamap |= !(get_supported_rt_services() & + EFI_RT_SUPPORTED_SET_VIRTUAL_ADDRESS_MAP); + /* hibernation expects the runtime regions to stay in the same place */ if (!IS_ENABLED(CONFIG_HIBERNATION) && !efi_nokaslr && !flat_va_mapping) { /* diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c b/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c index 0a900afc66ff..45c9c6a7f1d6 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c @@ -500,8 +500,6 @@ vm_fault_t vmw_bo_vm_huge_fault(struct vm_fault *vmf, vm_fault_t ret; pgoff_t fault_page_size; bool write = vmf->flags & FAULT_FLAG_WRITE; - bool is_cow_mapping = - (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; switch (pe_size) { case PE_SIZE_PMD: @@ -518,7 +516,7 @@ vm_fault_t vmw_bo_vm_huge_fault(struct vm_fault *vmf, } /* Always do write dirty-tracking and COW on PTE level. */ - if (write && (READ_ONCE(vbo->dirty) || is_cow_mapping)) + if (write && (READ_ONCE(vbo->dirty) || is_cow_mapping(vma->vm_flags))) return VM_FAULT_FALLBACK; ret = ttm_bo_vm_reserve(bo, vmf); diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_glue.c b/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_glue.c index 3c03b1746661..cb9975889e2f 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_glue.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_glue.c @@ -49,7 +49,7 @@ int vmw_mmap(struct file *filp, struct vm_area_struct *vma) vma->vm_ops = &vmw_vm_ops; /* Use VM_PFNMAP rather than VM_MIXEDMAP if not a COW mapping */ - if ((vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) != VM_MAYWRITE) + if (!is_cow_mapping(vma->vm_flags)) vma->vm_flags = (vma->vm_flags & ~VM_MIXEDMAP) | VM_PFNMAP; return 0; diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig index 58aee4651648..18494b03a6f3 100644 --- a/drivers/irqchip/Kconfig +++ b/drivers/irqchip/Kconfig @@ -8,7 +8,6 @@ config IRQCHIP config ARM_GIC bool select IRQ_DOMAIN_HIERARCHY - select GENERIC_IRQ_MULTI_HANDLER select GENERIC_IRQ_EFFECTIVE_AFF_MASK config ARM_GIC_PM @@ -33,7 +32,6 @@ config GIC_NON_BANKED config ARM_GIC_V3 bool - select GENERIC_IRQ_MULTI_HANDLER select IRQ_DOMAIN_HIERARCHY select PARTITION_PERCPU select GENERIC_IRQ_EFFECTIVE_AFF_MASK @@ -64,7 +62,6 @@ config ARM_NVIC config ARM_VIC bool select IRQ_DOMAIN - select GENERIC_IRQ_MULTI_HANDLER config ARM_VIC_NR int @@ -99,14 +96,12 @@ config ATMEL_AIC_IRQ bool select GENERIC_IRQ_CHIP select IRQ_DOMAIN - select GENERIC_IRQ_MULTI_HANDLER select SPARSE_IRQ config ATMEL_AIC5_IRQ bool select GENERIC_IRQ_CHIP select IRQ_DOMAIN - select GENERIC_IRQ_MULTI_HANDLER select SPARSE_IRQ config I8259 @@ -153,7 +148,6 @@ config DW_APB_ICTL config FARADAY_FTINTC010 bool select IRQ_DOMAIN - select GENERIC_IRQ_MULTI_HANDLER select SPARSE_IRQ config HISILICON_IRQ_MBIGEN @@ -169,7 +163,6 @@ config IMGPDC_IRQ config IXP4XX_IRQ bool select IRQ_DOMAIN - select GENERIC_IRQ_MULTI_HANDLER select SPARSE_IRQ config MADERA_IRQ @@ -186,7 +179,6 @@ config CLPS711X_IRQCHIP bool depends on ARCH_CLPS711X select IRQ_DOMAIN - select GENERIC_IRQ_MULTI_HANDLER select SPARSE_IRQ default y @@ -205,7 +197,6 @@ config OMAP_IRQCHIP config ORION_IRQCHIP bool select IRQ_DOMAIN - select GENERIC_IRQ_MULTI_HANDLER config PIC32_EVIC bool diff --git a/drivers/irqchip/irq-ingenic-tcu.c b/drivers/irqchip/irq-ingenic-tcu.c index 7a7222d4c19c..b938d1d04d96 100644 --- a/drivers/irqchip/irq-ingenic-tcu.c +++ b/drivers/irqchip/irq-ingenic-tcu.c @@ -179,5 +179,6 @@ static int __init ingenic_tcu_irq_init(struct device_node *np, } IRQCHIP_DECLARE(jz4740_tcu_irq, "ingenic,jz4740-tcu", ingenic_tcu_irq_init); IRQCHIP_DECLARE(jz4725b_tcu_irq, "ingenic,jz4725b-tcu", ingenic_tcu_irq_init); +IRQCHIP_DECLARE(jz4760_tcu_irq, "ingenic,jz4760-tcu", ingenic_tcu_irq_init); IRQCHIP_DECLARE(jz4770_tcu_irq, "ingenic,jz4770-tcu", ingenic_tcu_irq_init); IRQCHIP_DECLARE(x1000_tcu_irq, "ingenic,x1000-tcu", ingenic_tcu_irq_init); diff --git a/drivers/irqchip/irq-ingenic.c b/drivers/irqchip/irq-ingenic.c index b61a8901ef72..ea36bb00be80 100644 --- a/drivers/irqchip/irq-ingenic.c +++ b/drivers/irqchip/irq-ingenic.c @@ -155,6 +155,7 @@ static int __init intc_2chip_of_init(struct device_node *node, { return ingenic_intc_of_init(node, 2); } +IRQCHIP_DECLARE(jz4760_intc, "ingenic,jz4760-intc", intc_2chip_of_init); IRQCHIP_DECLARE(jz4770_intc, "ingenic,jz4770-intc", intc_2chip_of_init); IRQCHIP_DECLARE(jz4775_intc, "ingenic,jz4775-intc", intc_2chip_of_init); IRQCHIP_DECLARE(jz4780_intc, "ingenic,jz4780-intc", intc_2chip_of_init); diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index c457334de43f..e1eae7ea823a 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -649,12 +649,24 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer, struct super_block *sb = file_inode(file)->i_sb; struct dentry *root = sb->s_root, *dentry; int err = 0; + struct file *f = NULL; e = create_entry(buffer, count); if (IS_ERR(e)) return PTR_ERR(e); + if (e->flags & MISC_FMT_OPEN_FILE) { + f = open_exec(e->interpreter); + if (IS_ERR(f)) { + pr_notice("register: failed to install interpreter file %s\n", + e->interpreter); + kfree(e); + return PTR_ERR(f); + } + e->interp_file = f; + } + inode_lock(d_inode(root)); dentry = lookup_one_len(e->name, root, strlen(e->name)); err = PTR_ERR(dentry); @@ -678,21 +690,6 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer, goto out2; } - if (e->flags & MISC_FMT_OPEN_FILE) { - struct file *f; - - f = open_exec(e->interpreter); - if (IS_ERR(f)) { - err = PTR_ERR(f); - pr_notice("register: failed to install interpreter file %s\n", e->interpreter); - simple_release_fs(&bm_mnt, &entry_count); - iput(inode); - inode = NULL; - goto out2; - } - e->interp_file = f; - } - e->dentry = dget(dentry); inode->i_private = e; inode->i_fop = &bm_entry_operations; @@ -709,6 +706,8 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer, inode_unlock(d_inode(root)); if (err) { + if (f) + filp_close(f, NULL); kfree(e); return err; } diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index e440315eb637..480cc95fcb25 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1098,8 +1098,6 @@ struct clear_refs_private { #ifdef CONFIG_MEM_SOFT_DIRTY -#define is_cow_mapping(flags) (((flags) & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE) - static inline bool pte_is_pinned(struct vm_area_struct *vma, unsigned long addr, pte_t pte) { struct page *page; diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h index 8dcb3e1477bc..6fd3cda608e4 100644 --- a/include/kvm/arm_pmu.h +++ b/include/kvm/arm_pmu.h @@ -13,6 +13,13 @@ #define ARMV8_PMU_CYCLE_IDX (ARMV8_PMU_MAX_COUNTERS - 1) #define ARMV8_PMU_MAX_COUNTER_PAIRS ((ARMV8_PMU_MAX_COUNTERS + 1) >> 1) +DECLARE_STATIC_KEY_FALSE(kvm_arm_pmu_available); + +static __always_inline bool kvm_arm_support_pmu_v3(void) +{ + return static_branch_likely(&kvm_arm_pmu_available); +} + #ifdef CONFIG_HW_PERF_EVENTS struct kvm_pmc { @@ -47,7 +54,6 @@ void kvm_pmu_software_increment(struct kvm_vcpu *vcpu, u64 val); void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val); void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data, u64 select_idx); -bool kvm_arm_support_pmu_v3(void); int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr); int kvm_arm_pmu_v3_get_attr(struct kvm_vcpu *vcpu, @@ -87,7 +93,6 @@ static inline void kvm_pmu_software_increment(struct kvm_vcpu *vcpu, u64 val) {} static inline void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val) {} static inline void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data, u64 select_idx) {} -static inline bool kvm_arm_support_pmu_v3(void) { return false; } static inline int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) { diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index 04c0a5a717f7..d217c382b02d 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -31,6 +31,12 @@ #define __no_sanitize_thread #endif +#if defined(CONFIG_ARCH_USE_BUILTIN_BSWAP) +#define __HAVE_BUILTIN_BSWAP32__ +#define __HAVE_BUILTIN_BSWAP64__ +#define __HAVE_BUILTIN_BSWAP16__ +#endif /* CONFIG_ARCH_USE_BUILTIN_BSWAP */ + #if __has_feature(undefined_behavior_sanitizer) /* GCC does not have __SANITIZE_UNDEFINED__ */ #define __no_sanitize_undefined \ diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 42d196805f58..33cacc8af26d 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -150,7 +150,6 @@ struct irq_domain_chip_generic; * setting up one or more generic chips for interrupt controllers * drivers using the generic chip library which uses this pointer. * @parent: Pointer to parent irq_domain to support hierarchy irq_domains - * @debugfs_file: dentry for the domain debugfs file * * Revmap data, used internally by irq_domain * @revmap_direct_max_irq: The largest hwirq that can be set for controllers that @@ -174,9 +173,6 @@ struct irq_domain { #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY struct irq_domain *parent; #endif -#ifdef CONFIG_GENERIC_IRQ_DEBUGFS - struct dentry *debugfs_file; -#endif /* reverse map data. The linear map gets appended to the irq_domain */ irq_hw_number_t hwirq_max; diff --git a/include/linux/memblock.h b/include/linux/memblock.h index c88bc24e31aa..d13e3cd938b4 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -460,7 +460,7 @@ static inline void memblock_free_late(phys_addr_t base, phys_addr_t size) /* * Set the allocation direction to bottom-up or top-down. */ -static inline void memblock_set_bottom_up(bool enable) +static inline __init void memblock_set_bottom_up(bool enable) { memblock.bottom_up = enable; } @@ -470,7 +470,7 @@ static inline void memblock_set_bottom_up(bool enable) * if this is true, that said, memblock will allocate memory * in bottom-up direction. */ -static inline bool memblock_bottom_up(void) +static inline __init bool memblock_bottom_up(void) { return memblock.bottom_up; } diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index e6dc793d587d..0c04d39a7967 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1061,9 +1061,7 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm, rcu_read_unlock(); } -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -void mem_cgroup_split_huge_fixup(struct page *head); -#endif +void split_page_memcg(struct page *head, unsigned int nr); #else /* CONFIG_MEMCG */ @@ -1400,7 +1398,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, return 0; } -static inline void mem_cgroup_split_huge_fixup(struct page *head) +static inline void split_page_memcg(struct page *head, unsigned int nr) { } diff --git a/include/linux/mm.h b/include/linux/mm.h index f82e5cd6bb8a..b2367f148470 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1300,6 +1300,27 @@ static inline bool page_maybe_dma_pinned(struct page *page) GUP_PIN_COUNTING_BIAS; } +static inline bool is_cow_mapping(vm_flags_t flags) +{ + return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; +} + +/* + * This should most likely only be called during fork() to see whether we + * should break the cow immediately for a page on the src mm. + */ +static inline bool page_needs_cow_for_dma(struct vm_area_struct *vma, + struct page *page) +{ + if (!is_cow_mapping(vma->vm_flags)) + return false; + + if (!atomic_read(&vma->vm_mm->has_pinned)) + return false; + + return page_maybe_dma_pinned(page); +} + #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) #define SECTION_IN_PAGE_FLAGS #endif diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 7c31dcb852b9..19e924771b50 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -23,6 +23,7 @@ #endif #define AT_VECTOR_SIZE (2*(AT_VECTOR_SIZE_ARCH + AT_VECTOR_SIZE_BASE + 1)) +#define INIT_PASID 0 struct address_space; struct mem_cgroup; diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index fab42cfbd350..3f7f89ea5e51 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -606,6 +606,7 @@ struct swevent_hlist { #define PERF_ATTACH_TASK 0x04 #define PERF_ATTACH_TASK_DATA 0x08 #define PERF_ATTACH_ITRACE 0x10 +#define PERF_ATTACH_SCHED_CB 0x20 struct perf_cgroup; struct perf_buffer; @@ -872,6 +873,7 @@ struct perf_cpu_context { struct list_head cgrp_cpuctx_entry; #endif + struct list_head sched_cb_entry; int sched_cb_usage; int online; diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 1ae08b8462a4..90b2a0bce11c 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -140,7 +140,8 @@ static inline bool in_vfork(struct task_struct *tsk) * another oom-unkillable task does this it should blame itself. */ rcu_read_lock(); - ret = tsk->vfork_done && tsk->real_parent->mm == tsk->mm; + ret = tsk->vfork_done && + rcu_dereference(tsk->real_parent)->mm == tsk->mm; rcu_read_unlock(); return ret; diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index 2f7bb92b4c9e..f61e34fbaaea 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -664,10 +664,7 @@ typedef struct { * seqcount_latch_init() - runtime initializer for seqcount_latch_t * @s: Pointer to the seqcount_latch_t instance */ -static inline void seqcount_latch_init(seqcount_latch_t *s) -{ - seqcount_init(&s->seqcount); -} +#define seqcount_latch_init(s) seqcount_init(&(s)->seqcount) /** * raw_read_seqcount_latch() - pick even/odd latch data copy diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h index 30577c3aecf8..46fb3ebdd16e 100644 --- a/include/linux/stop_machine.h +++ b/include/linux/stop_machine.h @@ -128,7 +128,7 @@ int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus); #else /* CONFIG_SMP || CONFIG_HOTPLUG_CPU */ -static inline int stop_machine_cpuslocked(cpu_stop_fn_t fn, void *data, +static __always_inline int stop_machine_cpuslocked(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus) { unsigned long flags; @@ -139,14 +139,15 @@ static inline int stop_machine_cpuslocked(cpu_stop_fn_t fn, void *data, return ret; } -static inline int stop_machine(cpu_stop_fn_t fn, void *data, - const struct cpumask *cpus) +static __always_inline int +stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus) { return stop_machine_cpuslocked(fn, data, cpus); } -static inline int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data, - const struct cpumask *cpus) +static __always_inline int +stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data, + const struct cpumask *cpus) { return stop_machine(fn, data, cpus); } diff --git a/include/linux/u64_stats_sync.h b/include/linux/u64_stats_sync.h index c6abb79501b3..e81856c0ba13 100644 --- a/include/linux/u64_stats_sync.h +++ b/include/linux/u64_stats_sync.h @@ -115,12 +115,13 @@ static inline void u64_stats_inc(u64_stats_t *p) } #endif +#if BITS_PER_LONG == 32 && defined(CONFIG_SMP) +#define u64_stats_init(syncp) seqcount_init(&(syncp)->seq) +#else static inline void u64_stats_init(struct u64_stats_sync *syncp) { -#if BITS_PER_LONG == 32 && defined(CONFIG_SMP) - seqcount_init(&syncp->seq); -#endif } +#endif static inline void u64_stats_update_begin(struct u64_stats_sync *syncp) { diff --git a/init/Kconfig b/init/Kconfig index 1cf9d8beb82d..00b1919b68e3 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -119,8 +119,7 @@ config INIT_ENV_ARG_LIMIT config COMPILE_TEST bool "Compile also drivers which will not load" - depends on !UML && !S390 - default n + depends on HAS_IOMEM help Some drivers can be compiled on a different platform than they are intended to be run on. Despite they cannot be loaded there (or even diff --git a/kernel/events/core.c b/kernel/events/core.c index ec99baf58c2d..8986c60be29b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -386,6 +386,7 @@ static DEFINE_MUTEX(perf_sched_mutex); static atomic_t perf_sched_count; static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); +static DEFINE_PER_CPU(int, perf_sched_cb_usages); static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events); static atomic_t nr_mmap_events __read_mostly; @@ -3461,11 +3462,16 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, } } +static DEFINE_PER_CPU(struct list_head, sched_cb_list); + void perf_sched_cb_dec(struct pmu *pmu) { struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); - --cpuctx->sched_cb_usage; + this_cpu_dec(perf_sched_cb_usages); + + if (!--cpuctx->sched_cb_usage) + list_del(&cpuctx->sched_cb_entry); } @@ -3473,7 +3479,10 @@ void perf_sched_cb_inc(struct pmu *pmu) { struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); - cpuctx->sched_cb_usage++; + if (!cpuctx->sched_cb_usage++) + list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list)); + + this_cpu_inc(perf_sched_cb_usages); } /* @@ -3502,6 +3511,24 @@ static void __perf_pmu_sched_task(struct perf_cpu_context *cpuctx, bool sched_in perf_ctx_unlock(cpuctx, cpuctx->task_ctx); } +static void perf_pmu_sched_task(struct task_struct *prev, + struct task_struct *next, + bool sched_in) +{ + struct perf_cpu_context *cpuctx; + + if (prev == next) + return; + + list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) { + /* will be handled in perf_event_context_sched_in/out */ + if (cpuctx->task_ctx) + continue; + + __perf_pmu_sched_task(cpuctx, sched_in); + } +} + static void perf_event_switch(struct task_struct *task, struct task_struct *next_prev, bool sched_in); @@ -3524,6 +3551,9 @@ void __perf_event_task_sched_out(struct task_struct *task, { int ctxn; + if (__this_cpu_read(perf_sched_cb_usages)) + perf_pmu_sched_task(task, next, false); + if (atomic_read(&nr_switch_events)) perf_event_switch(task, next, false); @@ -3832,6 +3862,9 @@ void __perf_event_task_sched_in(struct task_struct *prev, if (atomic_read(&nr_switch_events)) perf_event_switch(task, prev, true); + + if (__this_cpu_read(perf_sched_cb_usages)) + perf_pmu_sched_task(prev, task, true); } static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) @@ -4657,7 +4690,7 @@ static void unaccount_event(struct perf_event *event) if (event->parent) return; - if (event->attach_state & PERF_ATTACH_TASK) + if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB)) dec = true; if (event->attr.mmap || event->attr.mmap_data) atomic_dec(&nr_mmap_events); @@ -11176,7 +11209,7 @@ static void account_event(struct perf_event *event) if (event->parent) return; - if (event->attach_state & PERF_ATTACH_TASK) + if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB)) inc = true; if (event->attr.mmap || event->attr.mmap_data) atomic_inc(&nr_mmap_events); @@ -12973,6 +13006,7 @@ static void __init perf_event_init_all_cpus(void) #ifdef CONFIG_CGROUP_PERF INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu)); #endif + INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu)); } } diff --git a/kernel/fork.c b/kernel/fork.c index 6cd024f16630..66554441f7ca 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -997,6 +997,13 @@ static void mm_init_owner(struct mm_struct *mm, struct task_struct *p) #endif } +static void mm_init_pasid(struct mm_struct *mm) +{ +#ifdef CONFIG_IOMMU_SUPPORT + mm->pasid = INIT_PASID; +#endif +} + static void mm_init_uprobes_state(struct mm_struct *mm) { #ifdef CONFIG_UPROBES @@ -1027,6 +1034,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm_init_cpumask(mm); mm_init_aio(mm); mm_init_owner(mm, p); + mm_init_pasid(mm); RCU_INIT_POINTER(mm->exe_file, NULL); mmu_notifier_subscriptions_init(mm); init_tlb_flush_pending(mm); diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 288151393a06..d10ab1d689d5 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1898,16 +1898,15 @@ DEFINE_SHOW_ATTRIBUTE(irq_domain_debug); static void debugfs_add_domain_dir(struct irq_domain *d) { - if (!d->name || !domain_dir || d->debugfs_file) + if (!d->name || !domain_dir) return; - d->debugfs_file = debugfs_create_file(d->name, 0444, domain_dir, d, - &irq_domain_debug_fops); + debugfs_create_file(d->name, 0444, domain_dir, d, + &irq_domain_debug_fops); } static void debugfs_remove_domain_dir(struct irq_domain *d) { - debugfs_remove(d->debugfs_file); - d->debugfs_file = NULL; + debugfs_remove(debugfs_lookup(d->name, domain_dir)); } void __init irq_domain_debugfs_init(struct dentry *root) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e57809458da0..64353de5c0cd 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1893,8 +1893,13 @@ struct migration_arg { struct set_affinity_pending *pending; }; +/* + * @refs: number of wait_for_completion() + * @stop_pending: is @stop_work in use + */ struct set_affinity_pending { refcount_t refs; + unsigned int stop_pending; struct completion done; struct cpu_stop_work stop_work; struct migration_arg arg; @@ -1929,8 +1934,8 @@ static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf, */ static int migration_cpu_stop(void *data) { - struct set_affinity_pending *pending; struct migration_arg *arg = data; + struct set_affinity_pending *pending = arg->pending; struct task_struct *p = arg->task; int dest_cpu = arg->dest_cpu; struct rq *rq = this_rq(); @@ -1952,7 +1957,6 @@ static int migration_cpu_stop(void *data) raw_spin_lock(&p->pi_lock); rq_lock(rq, &rf); - pending = p->migration_pending; /* * If task_rq(p) != rq, it cannot be migrated here, because we're * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because @@ -1963,21 +1967,14 @@ static int migration_cpu_stop(void *data) goto out; if (pending) { - p->migration_pending = NULL; + if (p->migration_pending == pending) + p->migration_pending = NULL; complete = true; } - /* migrate_enable() -- we must not race against SCA */ if (dest_cpu < 0) { - /* - * When this was migrate_enable() but we no longer - * have a @pending, a concurrent SCA 'fixed' things - * and we should be valid again. Nothing to do. - */ - if (!pending) { - WARN_ON_ONCE(!cpumask_test_cpu(task_cpu(p), &p->cpus_mask)); + if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) goto out; - } dest_cpu = cpumask_any_distribute(&p->cpus_mask); } @@ -1987,7 +1984,14 @@ static int migration_cpu_stop(void *data) else p->wake_cpu = dest_cpu; - } else if (dest_cpu < 0 || pending) { + /* + * XXX __migrate_task() can fail, at which point we might end + * up running on a dodgy CPU, AFAICT this can only happen + * during CPU hotplug, at which point we'll get pushed out + * anyway, so it's probably not a big deal. + */ + + } else if (pending) { /* * This happens when we get migrated between migrate_enable()'s * preempt_enable() and scheduling the stopper task. At that @@ -2002,43 +2006,32 @@ static int migration_cpu_stop(void *data) * ->pi_lock, so the allowed mask is stable - if it got * somewhere allowed, we're done. */ - if (pending && cpumask_test_cpu(task_cpu(p), p->cpus_ptr)) { - p->migration_pending = NULL; + if (cpumask_test_cpu(task_cpu(p), p->cpus_ptr)) { + if (p->migration_pending == pending) + p->migration_pending = NULL; complete = true; goto out; } - /* - * When this was migrate_enable() but we no longer have an - * @pending, a concurrent SCA 'fixed' things and we should be - * valid again. Nothing to do. - */ - if (!pending) { - WARN_ON_ONCE(!cpumask_test_cpu(task_cpu(p), &p->cpus_mask)); - goto out; - } - /* * When migrate_enable() hits a rq mis-match we can't reliably * determine is_migration_disabled() and so have to chase after * it. */ + WARN_ON_ONCE(!pending->stop_pending); task_rq_unlock(rq, p, &rf); stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop, &pending->arg, &pending->stop_work); return 0; } out: + if (pending) + pending->stop_pending = false; task_rq_unlock(rq, p, &rf); if (complete) complete_all(&pending->done); - /* For pending->{arg,stop_work} */ - pending = arg->pending; - if (pending && refcount_dec_and_test(&pending->refs)) - wake_up_var(&pending->refs); - return 0; } @@ -2225,11 +2218,7 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag int dest_cpu, unsigned int flags) { struct set_affinity_pending my_pending = { }, *pending = NULL; - struct migration_arg arg = { - .task = p, - .dest_cpu = dest_cpu, - }; - bool complete = false; + bool stop_pending, complete = false; /* Can the task run on the task's current CPU? If so, we're done */ if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) { @@ -2241,12 +2230,16 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag push_task = get_task_struct(p); } + /* + * If there are pending waiters, but no pending stop_work, + * then complete now. + */ pending = p->migration_pending; - if (pending) { - refcount_inc(&pending->refs); + if (pending && !pending->stop_pending) { p->migration_pending = NULL; complete = true; } + task_rq_unlock(rq, p, rf); if (push_task) { @@ -2255,7 +2248,7 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag } if (complete) - goto do_complete; + complete_all(&pending->done); return 0; } @@ -2266,6 +2259,12 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag /* Install the request */ refcount_set(&my_pending.refs, 1); init_completion(&my_pending.done); + my_pending.arg = (struct migration_arg) { + .task = p, + .dest_cpu = -1, /* any */ + .pending = &my_pending, + }; + p->migration_pending = &my_pending; } else { pending = p->migration_pending; @@ -2290,45 +2289,41 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag return -EINVAL; } - if (flags & SCA_MIGRATE_ENABLE) { - - refcount_inc(&pending->refs); /* pending->{arg,stop_work} */ - p->migration_flags &= ~MDF_PUSH; - task_rq_unlock(rq, p, rf); - - pending->arg = (struct migration_arg) { - .task = p, - .dest_cpu = -1, - .pending = pending, - }; - - stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop, - &pending->arg, &pending->stop_work); - - return 0; - } - if (task_running(rq, p) || p->state == TASK_WAKING) { /* - * Lessen races (and headaches) by delegating - * is_migration_disabled(p) checks to the stopper, which will - * run on the same CPU as said p. + * MIGRATE_ENABLE gets here because 'p == current', but for + * anything else we cannot do is_migration_disabled(), punt + * and have the stopper function handle it all race-free. */ - task_rq_unlock(rq, p, rf); - stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); + stop_pending = pending->stop_pending; + if (!stop_pending) + pending->stop_pending = true; + if (flags & SCA_MIGRATE_ENABLE) + p->migration_flags &= ~MDF_PUSH; + + task_rq_unlock(rq, p, rf); + + if (!stop_pending) { + stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop, + &pending->arg, &pending->stop_work); + } + + if (flags & SCA_MIGRATE_ENABLE) + return 0; } else { if (!is_migration_disabled(p)) { if (task_on_rq_queued(p)) rq = move_queued_task(rq, rf, p, dest_cpu); - p->migration_pending = NULL; - complete = true; + if (!pending->stop_pending) { + p->migration_pending = NULL; + complete = true; + } } task_rq_unlock(rq, p, rf); -do_complete: if (complete) complete_all(&pending->done); } @@ -2336,7 +2331,7 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag wait_for_completion(&pending->done); if (refcount_dec_and_test(&pending->refs)) - wake_up_var(&pending->refs); + wake_up_var(&pending->refs); /* No UaF, just an address */ /* * Block the original owner of &pending until all subsequent callers @@ -2344,6 +2339,9 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag */ wait_var_event(&my_pending.refs, !refcount_read(&my_pending.refs)); + /* ARGH */ + WARN_ON_ONCE(my_pending.stop_pending); + return 0; } diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index acdae625c636..b5add64d9698 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -471,9 +471,7 @@ static int sync_runqueues_membarrier_state(struct mm_struct *mm) } rcu_read_unlock(); - preempt_disable(); - smp_call_function_many(tmpmask, ipi_sync_rq_state, mm, 1); - preempt_enable(); + on_each_cpu_mask(tmpmask, ipi_sync_rq_state, mm, true); free_cpumask_var(tmpmask); cpus_read_unlock(); diff --git a/kernel/static_call.c b/kernel/static_call.c index 6906c6ec4c97..ae825295cf68 100644 --- a/kernel/static_call.c +++ b/kernel/static_call.c @@ -349,7 +349,8 @@ static int static_call_add_module(struct module *mod) struct static_call_site *site; for (site = start; site != stop; site++) { - unsigned long addr = (unsigned long)static_call_key(site); + unsigned long s_key = (long)site->key + (long)&site->key; + unsigned long addr = s_key & ~STATIC_CALL_SITE_FLAGS; unsigned long key; /* @@ -373,8 +374,8 @@ static int static_call_add_module(struct module *mod) return -EINVAL; } - site->key = (key - (long)&site->key) | - (site->key & STATIC_CALL_SITE_FLAGS); + key |= s_key & STATIC_CALL_SITE_FLAGS; + site->key = key - (long)&site->key; } return __static_call_init(mod, start, stop); diff --git a/kernel/sys.c b/kernel/sys.c index 3f87d589f465..cd4c014bc9ab 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2083,7 +2083,7 @@ static int prctl_set_auxv(struct mm_struct *mm, unsigned long addr, * up to the caller to provide sane values here, otherwise userspace * tools which use this vector might be unhappy. */ - unsigned long user_auxv[AT_VECTOR_SIZE]; + unsigned long user_auxv[AT_VECTOR_SIZE] = {}; if (len > sizeof(user_auxv)) return -EINVAL; diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 743c852e10f2..788b9d137de4 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -546,8 +546,11 @@ static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base, } /* - * Recomputes cpu_base::*next_timer and returns the earliest expires_next but - * does not set cpu_base::*expires_next, that is done by hrtimer_reprogram. + * Recomputes cpu_base::*next_timer and returns the earliest expires_next + * but does not set cpu_base::*expires_next, that is done by + * hrtimer[_force]_reprogram and hrtimer_interrupt only. When updating + * cpu_base::*expires_next right away, reprogramming logic would no longer + * work. * * When a softirq is pending, we can ignore the HRTIMER_ACTIVE_SOFT bases, * those timers will get run whenever the softirq gets handled, at the end of @@ -588,6 +591,37 @@ __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_ return expires_next; } +static ktime_t hrtimer_update_next_event(struct hrtimer_cpu_base *cpu_base) +{ + ktime_t expires_next, soft = KTIME_MAX; + + /* + * If the soft interrupt has already been activated, ignore the + * soft bases. They will be handled in the already raised soft + * interrupt. + */ + if (!cpu_base->softirq_activated) { + soft = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT); + /* + * Update the soft expiry time. clock_settime() might have + * affected it. + */ + cpu_base->softirq_expires_next = soft; + } + + expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_HARD); + /* + * If a softirq timer is expiring first, update cpu_base->next_timer + * and program the hardware with the soft expiry time. + */ + if (expires_next > soft) { + cpu_base->next_timer = cpu_base->softirq_next_timer; + expires_next = soft; + } + + return expires_next; +} + static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) { ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset; @@ -628,23 +662,7 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) { ktime_t expires_next; - /* - * Find the current next expiration time. - */ - expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL); - - if (cpu_base->next_timer && cpu_base->next_timer->is_soft) { - /* - * When the softirq is activated, hrtimer has to be - * programmed with the first hard hrtimer because soft - * timer interrupt could occur too late. - */ - if (cpu_base->softirq_activated) - expires_next = __hrtimer_get_next_event(cpu_base, - HRTIMER_ACTIVE_HARD); - else - cpu_base->softirq_expires_next = expires_next; - } + expires_next = hrtimer_update_next_event(cpu_base); if (skip_equal && expires_next == cpu_base->expires_next) return; @@ -1644,8 +1662,8 @@ void hrtimer_interrupt(struct clock_event_device *dev) __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD); - /* Reevaluate the clock bases for the next expiry */ - expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL); + /* Reevaluate the clock bases for the [soft] next expiry */ + expires_next = hrtimer_update_next_event(cpu_base); /* * Store the new expiry value so the migration code can verify * against it. diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan index 624ae1df7984..fba9909e31b7 100644 --- a/lib/Kconfig.kasan +++ b/lib/Kconfig.kasan @@ -156,6 +156,7 @@ config KASAN_STACK_ENABLE config KASAN_STACK int + depends on KASAN_GENERIC || KASAN_SW_TAGS default 1 if KASAN_STACK_ENABLE || CC_IS_GCC default 0 diff --git a/mm/highmem.c b/mm/highmem.c index 874b732b120c..86f2b9495f9c 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -368,20 +368,24 @@ void zero_user_segments(struct page *page, unsigned start1, unsigned end1, BUG_ON(end1 > page_size(page) || end2 > page_size(page)); + if (start1 >= end1) + start1 = end1 = 0; + if (start2 >= end2) + start2 = end2 = 0; + for (i = 0; i < compound_nr(page); i++) { void *kaddr = NULL; - if (start1 < PAGE_SIZE || start2 < PAGE_SIZE) - kaddr = kmap_atomic(page + i); - if (start1 >= PAGE_SIZE) { start1 -= PAGE_SIZE; end1 -= PAGE_SIZE; } else { unsigned this_end = min_t(unsigned, end1, PAGE_SIZE); - if (end1 > start1) + if (end1 > start1) { + kaddr = kmap_atomic(page + i); memset(kaddr + start1, 0, this_end - start1); + } end1 -= this_end; start1 = 0; } @@ -392,8 +396,11 @@ void zero_user_segments(struct page *page, unsigned start1, unsigned end1, } else { unsigned this_end = min_t(unsigned, end2, PAGE_SIZE); - if (end2 > start2) + if (end2 > start2) { + if (!kaddr) + kaddr = kmap_atomic(page + i); memset(kaddr + start2, 0, this_end - start2); + } end2 -= this_end; start2 = 0; } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 395c75111d33..ae907a9c2050 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1100,9 +1100,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, * best effort that the pinned pages won't be replaced by another * random page during the coming copy-on-write. */ - if (unlikely(is_cow_mapping(vma->vm_flags) && - atomic_read(&src_mm->has_pinned) && - page_maybe_dma_pinned(src_page))) { + if (unlikely(page_needs_cow_for_dma(vma, src_page))) { pte_free(dst_mm, pgtable); spin_unlock(src_ptl); spin_unlock(dst_ptl); @@ -1214,9 +1212,7 @@ int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm, } /* Please refer to comments in copy_huge_pmd() */ - if (unlikely(is_cow_mapping(vma->vm_flags) && - atomic_read(&src_mm->has_pinned) && - page_maybe_dma_pinned(pud_page(pud)))) { + if (unlikely(page_needs_cow_for_dma(vma, pud_page(pud)))) { spin_unlock(src_ptl); spin_unlock(dst_ptl); __split_huge_pud(vma, src_pud, addr); @@ -2471,7 +2467,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, int i; /* complete memcg works before add pages to LRU */ - mem_cgroup_split_huge_fixup(head); + split_page_memcg(head, nr); if (PageAnon(head) && PageSwapCache(head)) { swp_entry_t entry = { .val = page_private(head) }; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 8fb42c6dd74b..5b1ab1f427c5 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -331,6 +331,24 @@ static void coalesce_file_region(struct resv_map *resv, struct file_region *rg) } } +static inline long +hugetlb_resv_map_add(struct resv_map *map, struct file_region *rg, long from, + long to, struct hstate *h, struct hugetlb_cgroup *cg, + long *regions_needed) +{ + struct file_region *nrg; + + if (!regions_needed) { + nrg = get_file_region_entry_from_cache(map, from, to); + record_hugetlb_cgroup_uncharge_info(cg, h, map, nrg); + list_add(&nrg->link, rg->link.prev); + coalesce_file_region(map, nrg); + } else + *regions_needed += 1; + + return to - from; +} + /* * Must be called with resv->lock held. * @@ -346,7 +364,7 @@ static long add_reservation_in_range(struct resv_map *resv, long f, long t, long add = 0; struct list_head *head = &resv->regions; long last_accounted_offset = f; - struct file_region *rg = NULL, *trg = NULL, *nrg = NULL; + struct file_region *rg = NULL, *trg = NULL; if (regions_needed) *regions_needed = 0; @@ -369,24 +387,17 @@ static long add_reservation_in_range(struct resv_map *resv, long f, long t, /* When we find a region that starts beyond our range, we've * finished. */ - if (rg->from > t) + if (rg->from >= t) break; /* Add an entry for last_accounted_offset -> rg->from, and * update last_accounted_offset. */ - if (rg->from > last_accounted_offset) { - add += rg->from - last_accounted_offset; - if (!regions_needed) { - nrg = get_file_region_entry_from_cache( - resv, last_accounted_offset, rg->from); - record_hugetlb_cgroup_uncharge_info(h_cg, h, - resv, nrg); - list_add(&nrg->link, rg->link.prev); - coalesce_file_region(resv, nrg); - } else - *regions_needed += 1; - } + if (rg->from > last_accounted_offset) + add += hugetlb_resv_map_add(resv, rg, + last_accounted_offset, + rg->from, h, h_cg, + regions_needed); last_accounted_offset = rg->to; } @@ -394,17 +405,9 @@ static long add_reservation_in_range(struct resv_map *resv, long f, long t, /* Handle the case where our range extends beyond * last_accounted_offset. */ - if (last_accounted_offset < t) { - add += t - last_accounted_offset; - if (!regions_needed) { - nrg = get_file_region_entry_from_cache( - resv, last_accounted_offset, t); - record_hugetlb_cgroup_uncharge_info(h_cg, h, resv, nrg); - list_add(&nrg->link, rg->link.prev); - coalesce_file_region(resv, nrg); - } else - *regions_needed += 1; - } + if (last_accounted_offset < t) + add += hugetlb_resv_map_add(resv, rg, last_accounted_offset, + t, h, h_cg, regions_needed); VM_BUG_ON(add < 0); return add; @@ -3725,21 +3728,32 @@ static bool is_hugetlb_entry_hwpoisoned(pte_t pte) return false; } +static void +hugetlb_install_page(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr, + struct page *new_page) +{ + __SetPageUptodate(new_page); + set_huge_pte_at(vma->vm_mm, addr, ptep, make_huge_pte(vma, new_page, 1)); + hugepage_add_new_anon_rmap(new_page, vma, addr); + hugetlb_count_add(pages_per_huge_page(hstate_vma(vma)), vma->vm_mm); + ClearHPageRestoreReserve(new_page); + SetHPageMigratable(new_page); +} + int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma) { pte_t *src_pte, *dst_pte, entry, dst_entry; struct page *ptepage; unsigned long addr; - int cow; + bool cow = is_cow_mapping(vma->vm_flags); struct hstate *h = hstate_vma(vma); unsigned long sz = huge_page_size(h); + unsigned long npages = pages_per_huge_page(h); struct address_space *mapping = vma->vm_file->f_mapping; struct mmu_notifier_range range; int ret = 0; - cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; - if (cow) { mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, src, vma->vm_start, @@ -3784,6 +3798,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); entry = huge_ptep_get(src_pte); dst_entry = huge_ptep_get(dst_pte); +again: if (huge_pte_none(entry) || !huge_pte_none(dst_entry)) { /* * Skip if src entry none. Also, skip in the @@ -3807,6 +3822,52 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, } set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz); } else { + entry = huge_ptep_get(src_pte); + ptepage = pte_page(entry); + get_page(ptepage); + + /* + * This is a rare case where we see pinned hugetlb + * pages while they're prone to COW. We need to do the + * COW earlier during fork. + * + * When pre-allocating the page or copying data, we + * need to be without the pgtable locks since we could + * sleep during the process. + */ + if (unlikely(page_needs_cow_for_dma(vma, ptepage))) { + pte_t src_pte_old = entry; + struct page *new; + + spin_unlock(src_ptl); + spin_unlock(dst_ptl); + /* Do not use reserve as it's private owned */ + new = alloc_huge_page(vma, addr, 1); + if (IS_ERR(new)) { + put_page(ptepage); + ret = PTR_ERR(new); + break; + } + copy_user_huge_page(new, ptepage, addr, vma, + npages); + put_page(ptepage); + + /* Install the new huge page if src pte stable */ + dst_ptl = huge_pte_lock(h, dst, dst_pte); + src_ptl = huge_pte_lockptr(h, src, src_pte); + spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); + entry = huge_ptep_get(src_pte); + if (!pte_same(src_pte_old, entry)) { + put_page(new); + /* dst_entry won't change as in child */ + goto again; + } + hugetlb_install_page(vma, dst_pte, addr, new); + spin_unlock(src_ptl); + spin_unlock(dst_ptl); + continue; + } + if (cow) { /* * No need to notify as we are downgrading page @@ -3817,12 +3878,10 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, */ huge_ptep_set_wrprotect(src, addr, src_pte); } - entry = huge_ptep_get(src_pte); - ptepage = pte_page(entry); - get_page(ptepage); + page_dup_rmap(ptepage, true); set_huge_pte_at(dst, addr, dst_pte, entry); - hugetlb_count_add(pages_per_huge_page(h), dst); + hugetlb_count_add(npages, dst); } spin_unlock(src_ptl); spin_unlock(dst_ptl); diff --git a/mm/internal.h b/mm/internal.h index 9902648f2206..1432feec62df 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -296,11 +296,6 @@ static inline unsigned int buddy_order(struct page *page) */ #define buddy_order_unsafe(page) READ_ONCE(page_private(page)) -static inline bool is_cow_mapping(vm_flags_t flags) -{ - return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; -} - /* * These three helpers classifies VMAs for virtual memory accounting. */ diff --git a/mm/kfence/report.c b/mm/kfence/report.c index ab83d5a59bb1..e3f71451ad9e 100644 --- a/mm/kfence/report.c +++ b/mm/kfence/report.c @@ -20,6 +20,11 @@ #include "kfence.h" +/* May be overridden by . */ +#ifndef ARCH_FUNC_PREFIX +#define ARCH_FUNC_PREFIX "" +#endif + extern bool no_hash_pointers; /* Helper function to either print to a seq_file or to console. */ @@ -67,8 +72,9 @@ static int get_stack_skipnr(const unsigned long stack_entries[], int num_entries for (skipnr = 0; skipnr < num_entries; skipnr++) { int len = scnprintf(buf, sizeof(buf), "%ps", (void *)stack_entries[skipnr]); - if (str_has_prefix(buf, "kfence_") || str_has_prefix(buf, "__kfence_") || - !strncmp(buf, "__slab_free", len)) { + if (str_has_prefix(buf, ARCH_FUNC_PREFIX "kfence_") || + str_has_prefix(buf, ARCH_FUNC_PREFIX "__kfence_") || + !strncmp(buf, ARCH_FUNC_PREFIX "__slab_free", len)) { /* * In case of tail calls from any of the below * to any of the above. @@ -77,10 +83,10 @@ static int get_stack_skipnr(const unsigned long stack_entries[], int num_entries } /* Also the *_bulk() variants by only checking prefixes. */ - if (str_has_prefix(buf, "kfree") || - str_has_prefix(buf, "kmem_cache_free") || - str_has_prefix(buf, "__kmalloc") || - str_has_prefix(buf, "kmem_cache_alloc")) + if (str_has_prefix(buf, ARCH_FUNC_PREFIX "kfree") || + str_has_prefix(buf, ARCH_FUNC_PREFIX "kmem_cache_free") || + str_has_prefix(buf, ARCH_FUNC_PREFIX "__kmalloc") || + str_has_prefix(buf, ARCH_FUNC_PREFIX "kmem_cache_alloc")) goto found; } if (fallback < num_entries) @@ -116,12 +122,12 @@ void kfence_print_object(struct seq_file *seq, const struct kfence_metadata *met lockdep_assert_held(&meta->lock); if (meta->state == KFENCE_OBJECT_UNUSED) { - seq_con_printf(seq, "kfence-#%zd unused\n", meta - kfence_metadata); + seq_con_printf(seq, "kfence-#%td unused\n", meta - kfence_metadata); return; } seq_con_printf(seq, - "kfence-#%zd [0x%p-0x%p" + "kfence-#%td [0x%p-0x%p" ", size=%d, cache=%s] allocated by task %d:\n", meta - kfence_metadata, (void *)start, (void *)(start + size - 1), size, (cache && cache->name) ? cache->name : "", meta->alloc_track.pid); @@ -204,7 +210,7 @@ void kfence_report_error(unsigned long address, bool is_write, struct pt_regs *r pr_err("BUG: KFENCE: out-of-bounds %s in %pS\n\n", get_access_type(is_write), (void *)stack_entries[skipnr]); - pr_err("Out-of-bounds %s at 0x%p (%luB %s of kfence-#%zd):\n", + pr_err("Out-of-bounds %s at 0x%p (%luB %s of kfence-#%td):\n", get_access_type(is_write), (void *)address, left_of_object ? meta->addr - address : address - meta->addr, left_of_object ? "left" : "right", object_index); @@ -213,14 +219,14 @@ void kfence_report_error(unsigned long address, bool is_write, struct pt_regs *r case KFENCE_ERROR_UAF: pr_err("BUG: KFENCE: use-after-free %s in %pS\n\n", get_access_type(is_write), (void *)stack_entries[skipnr]); - pr_err("Use-after-free %s at 0x%p (in kfence-#%zd):\n", + pr_err("Use-after-free %s at 0x%p (in kfence-#%td):\n", get_access_type(is_write), (void *)address, object_index); break; case KFENCE_ERROR_CORRUPTION: pr_err("BUG: KFENCE: memory corruption in %pS\n\n", (void *)stack_entries[skipnr]); pr_err("Corrupted memory at 0x%p ", (void *)address); print_diff_canary(address, 16, meta); - pr_cont(" (in kfence-#%zd):\n", object_index); + pr_cont(" (in kfence-#%td):\n", object_index); break; case KFENCE_ERROR_INVALID: pr_err("BUG: KFENCE: invalid %s in %pS\n\n", get_access_type(is_write), @@ -230,7 +236,7 @@ void kfence_report_error(unsigned long address, bool is_write, struct pt_regs *r break; case KFENCE_ERROR_INVALID_FREE: pr_err("BUG: KFENCE: invalid free in %pS\n\n", (void *)stack_entries[skipnr]); - pr_err("Invalid free of 0x%p (in kfence-#%zd):\n", (void *)address, + pr_err("Invalid free of 0x%p (in kfence-#%td):\n", (void *)address, object_index); break; } diff --git a/mm/madvise.c b/mm/madvise.c index 36ddd4adc37e..cca578c8a302 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -1198,12 +1198,22 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, goto release_task; } - mm = mm_access(task, PTRACE_MODE_ATTACH_FSCREDS); + /* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */ + mm = mm_access(task, PTRACE_MODE_READ_FSCREDS); if (IS_ERR_OR_NULL(mm)) { ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH; goto release_task; } + /* + * Require CAP_SYS_NICE for influencing process performance. Note that + * only non-destructive hints are currently supported. + */ + if (!capable(CAP_SYS_NICE)) { + ret = -EPERM; + goto release_mm; + } + total_len = iov_iter_count(&iter); while (iov_iter_count(&iter)) { @@ -1218,6 +1228,7 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, if (ret == 0) ret = total_len - iov_iter_count(&iter); +release_mm: mmput(mm); release_task: put_task_struct(task); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 845eec01ef9d..e064ac0d850a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3287,24 +3287,21 @@ void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size) #endif /* CONFIG_MEMCG_KMEM */ -#ifdef CONFIG_TRANSPARENT_HUGEPAGE /* - * Because page_memcg(head) is not set on compound tails, set it now. + * Because page_memcg(head) is not set on tails, set it now. */ -void mem_cgroup_split_huge_fixup(struct page *head) +void split_page_memcg(struct page *head, unsigned int nr) { struct mem_cgroup *memcg = page_memcg(head); int i; - if (mem_cgroup_disabled()) + if (mem_cgroup_disabled() || !memcg) return; - for (i = 1; i < HPAGE_PMD_NR; i++) { - css_get(&memcg->css); - head[i].memcg_data = (unsigned long)memcg; - } + for (i = 1; i < nr; i++) + head[i].memcg_data = head->memcg_data; + css_get_many(&memcg->css, nr - 1); } -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #ifdef CONFIG_MEMCG_SWAP /** diff --git a/mm/memory.c b/mm/memory.c index 350186150ab8..7b43ca36a5fd 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -823,12 +823,8 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss, struct page **prealloc, pte_t pte, struct page *page) { - struct mm_struct *src_mm = src_vma->vm_mm; struct page *new_page; - if (!is_cow_mapping(src_vma->vm_flags)) - return 1; - /* * What we want to do is to check whether this page may * have been pinned by the parent process. If so, @@ -842,9 +838,7 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma * the page count. That might give false positives for * for pinning, but it will work correctly. */ - if (likely(!atomic_read(&src_mm->has_pinned))) - return 1; - if (likely(!page_maybe_dma_pinned(page))) + if (likely(!page_needs_cow_for_dma(src_vma, page))) return 1; new_page = *prealloc; @@ -3117,6 +3111,14 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) return handle_userfault(vmf, VM_UFFD_WP); } + /* + * Userfaultfd write-protect can defer flushes. Ensure the TLB + * is flushed in this case before copying. + */ + if (unlikely(userfaultfd_wp(vmf->vma) && + mm_tlb_flush_pending(vmf->vma->vm_mm))) + flush_tlb_page(vmf->vma, vmf->address); + vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte); if (!vmf->page) { /* diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c9d7d350923b..aded7714034a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1293,6 +1293,12 @@ static __always_inline bool free_pages_prepare(struct page *page, kernel_poison_pages(page, 1 << order); + /* + * With hardware tag-based KASAN, memory tags must be set before the + * page becomes unavailable via debug_pagealloc or arch_free_page. + */ + kasan_free_nondeferred_pages(page, order); + /* * arch_free_page() can make the page's contents inaccessible. s390 * does this. So nothing which can access the page's contents should @@ -1302,8 +1308,6 @@ static __always_inline bool free_pages_prepare(struct page *page, debug_pagealloc_unmap_pages(page, 1 << order); - kasan_free_nondeferred_pages(page, order); - return true; } @@ -3322,6 +3326,7 @@ void split_page(struct page *page, unsigned int order) for (i = 1; i < (1 << order); i++) set_page_refcounted(page + i); split_page_owner(page, 1 << order); + split_page_memcg(page, 1 << order); } EXPORT_SYMBOL_GPL(split_page); @@ -6271,12 +6276,65 @@ static void __meminit zone_init_free_lists(struct zone *zone) } } +#if !defined(CONFIG_FLAT_NODE_MEM_MAP) +/* + * Only struct pages that correspond to ranges defined by memblock.memory + * are zeroed and initialized by going through __init_single_page() during + * memmap_init_zone(). + * + * But, there could be struct pages that correspond to holes in + * memblock.memory. This can happen because of the following reasons: + * - physical memory bank size is not necessarily the exact multiple of the + * arbitrary section size + * - early reserved memory may not be listed in memblock.memory + * - memory layouts defined with memmap= kernel parameter may not align + * nicely with memmap sections + * + * Explicitly initialize those struct pages so that: + * - PG_Reserved is set + * - zone and node links point to zone and node that span the page if the + * hole is in the middle of a zone + * - zone and node links point to adjacent zone/node if the hole falls on + * the zone boundary; the pages in such holes will be prepended to the + * zone/node above the hole except for the trailing pages in the last + * section that will be appended to the zone/node below. + */ +static u64 __meminit init_unavailable_range(unsigned long spfn, + unsigned long epfn, + int zone, int node) +{ + unsigned long pfn; + u64 pgcnt = 0; + + for (pfn = spfn; pfn < epfn; pfn++) { + if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) { + pfn = ALIGN_DOWN(pfn, pageblock_nr_pages) + + pageblock_nr_pages - 1; + continue; + } + __init_single_page(pfn_to_page(pfn), pfn, zone, node); + __SetPageReserved(pfn_to_page(pfn)); + pgcnt++; + } + + return pgcnt; +} +#else +static inline u64 init_unavailable_range(unsigned long spfn, unsigned long epfn, + int zone, int node) +{ + return 0; +} +#endif + void __meminit __weak memmap_init_zone(struct zone *zone) { unsigned long zone_start_pfn = zone->zone_start_pfn; unsigned long zone_end_pfn = zone_start_pfn + zone->spanned_pages; int i, nid = zone_to_nid(zone), zone_id = zone_idx(zone); + static unsigned long hole_pfn; unsigned long start_pfn, end_pfn; + u64 pgcnt = 0; for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { start_pfn = clamp(start_pfn, zone_start_pfn, zone_end_pfn); @@ -6286,7 +6344,29 @@ void __meminit __weak memmap_init_zone(struct zone *zone) memmap_init_range(end_pfn - start_pfn, nid, zone_id, start_pfn, zone_end_pfn, MEMINIT_EARLY, NULL, MIGRATE_MOVABLE); + + if (hole_pfn < start_pfn) + pgcnt += init_unavailable_range(hole_pfn, start_pfn, + zone_id, nid); + hole_pfn = end_pfn; } + +#ifdef CONFIG_SPARSEMEM + /* + * Initialize the hole in the range [zone_end_pfn, section_end]. + * If zone boundary falls in the middle of a section, this hole + * will be re-initialized during the call to this function for the + * higher zone. + */ + end_pfn = round_up(zone_end_pfn, PAGES_PER_SECTION); + if (hole_pfn < end_pfn) + pgcnt += init_unavailable_range(hole_pfn, end_pfn, + zone_id, nid); +#endif + + if (pgcnt) + pr_info(" %s zone: %llu pages in unavailable ranges\n", + zone->name, pgcnt); } static int zone_batchsize(struct zone *zone) @@ -7083,88 +7163,6 @@ void __init free_area_init_memoryless_node(int nid) free_area_init_node(nid); } -#if !defined(CONFIG_FLAT_NODE_MEM_MAP) -/* - * Initialize all valid struct pages in the range [spfn, epfn) and mark them - * PageReserved(). Return the number of struct pages that were initialized. - */ -static u64 __init init_unavailable_range(unsigned long spfn, unsigned long epfn) -{ - unsigned long pfn; - u64 pgcnt = 0; - - for (pfn = spfn; pfn < epfn; pfn++) { - if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) { - pfn = ALIGN_DOWN(pfn, pageblock_nr_pages) - + pageblock_nr_pages - 1; - continue; - } - /* - * Use a fake node/zone (0) for now. Some of these pages - * (in memblock.reserved but not in memblock.memory) will - * get re-initialized via reserve_bootmem_region() later. - */ - __init_single_page(pfn_to_page(pfn), pfn, 0, 0); - __SetPageReserved(pfn_to_page(pfn)); - pgcnt++; - } - - return pgcnt; -} - -/* - * Only struct pages that are backed by physical memory are zeroed and - * initialized by going through __init_single_page(). But, there are some - * struct pages which are reserved in memblock allocator and their fields - * may be accessed (for example page_to_pfn() on some configuration accesses - * flags). We must explicitly initialize those struct pages. - * - * This function also addresses a similar issue where struct pages are left - * uninitialized because the physical address range is not covered by - * memblock.memory or memblock.reserved. That could happen when memblock - * layout is manually configured via memmap=, or when the highest physical - * address (max_pfn) does not end on a section boundary. - */ -static void __init init_unavailable_mem(void) -{ - phys_addr_t start, end; - u64 i, pgcnt; - phys_addr_t next = 0; - - /* - * Loop through unavailable ranges not covered by memblock.memory. - */ - pgcnt = 0; - for_each_mem_range(i, &start, &end) { - if (next < start) - pgcnt += init_unavailable_range(PFN_DOWN(next), - PFN_UP(start)); - next = end; - } - - /* - * Early sections always have a fully populated memmap for the whole - * section - see pfn_valid(). If the last section has holes at the - * end and that section is marked "online", the memmap will be - * considered initialized. Make sure that memmap has a well defined - * state. - */ - pgcnt += init_unavailable_range(PFN_DOWN(next), - round_up(max_pfn, PAGES_PER_SECTION)); - - /* - * Struct pages that do not have backing memory. This could be because - * firmware is using some of this memory, or for some other reasons. - */ - if (pgcnt) - pr_info("Zeroed struct page in unavailable ranges: %lld pages", pgcnt); -} -#else -static inline void __init init_unavailable_mem(void) -{ -} -#endif /* !CONFIG_FLAT_NODE_MEM_MAP */ - #if MAX_NUMNODES > 1 /* * Figure out the number of possible node ids. @@ -7588,7 +7586,6 @@ void __init free_area_init(unsigned long *max_zone_pfn) /* Initialise every node */ mminit_verify_pageflags_layout(); setup_nr_node_ids(); - init_unavailable_mem(); for_each_online_node(nid) { pg_data_t *pgdat = NODE_DATA(nid); free_area_init_node(nid); diff --git a/mm/slab.c b/mm/slab.c index 51fd424e0d6d..ae651bf540b7 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2992,7 +2992,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, gfp_t flags, void *objp, unsigned long caller) { WARN_ON_ONCE(cachep->ctor && (flags & __GFP_ZERO)); - if (!objp) + if (!objp || is_kfence_address(objp)) return objp; if (cachep->flags & SLAB_POISON) { check_poison_obj(cachep, objp); diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 068cdb41f76f..5e5388a38e2a 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -2442,6 +2442,9 @@ static int handle_insn_ops(struct instruction *insn, struct insn_state *state) if (update_cfi_state(insn, &state->cfi, op)) return 1; + if (!insn->alt_group) + continue; + if (op->dest.type == OP_DEST_PUSHF) { if (!state->uaccess_stack) { state->uaccess_stack = 1;