Linux 5.12-rc3

-----BEGIN PGP SIGNATURE-----
 
 iQFSBAABCAA8FiEEq68RxlopcLEwq+PEeb4+QwBBGIYFAmBOgu4eHHRvcnZhbGRz
 QGxpbnV4LWZvdW5kYXRpb24ub3JnAAoJEHm+PkMAQRiGUd0H/3Ey8aWjVAig9Pe+
 VQVZKwG+LXWH6UmUx5qyaTxophhmGnWLvkigJMn63qIg4eQtfp2gNFHK+T4OJNIP
 ybnkjFZ337x4J9zD6m8mt4Wmelq9iW2wNOS+3YZAyYiGlXfMGM7SlYRCQRQznTED
 2O/JCMsOoP+Z8tr5ah/bzs0dANsXmTZ3QqRP2uzb6irKTgFR3/weOhj+Ht1oJ4Aq
 V+bgdcwhtk20hJhlvVeqws+o74LR789tTDCknlz/YNMv9e6VPfyIQ5vJAcFmZATE
 Ezj9yzkZ4IU+Ux6ikAyaFyBU8d1a4Wqye3eHCZBsEo6tcSAhbTZ90eoU86vh6ajS
 LZjwkNw=
 =6y1u
 -----END PGP SIGNATURE-----

Merge 5.12-rc3 into android-mainline

Linux 5.12-rc3

Signed-off-by: Greg Kroah-Hartman <gregkh@google.com>
Change-Id: Ibc683be6a28885f39d477e3f86ec2ecd1a9e7e21
This commit is contained in:
Greg Kroah-Hartman 2021-03-15 08:48:29 +01:00
commit 6bb5c8b522
94 changed files with 844 additions and 476 deletions

View File

@ -23,6 +23,7 @@ properties:
- enum:
- ingenic,jz4775-intc
- ingenic,jz4770-intc
- ingenic,jz4760b-intc
- const: ingenic,jz4760-intc
- items:
- const: ingenic,x1000-intc

View File

@ -182,6 +182,9 @@ is dependent on the CPU capability and the kernel configuration. The limit can
be retrieved using KVM_CAP_ARM_VM_IPA_SIZE of the KVM_CHECK_EXTENSION
ioctl() at run-time.
Creation of the VM will fail if the requested IPA size (whether it is
implicit or explicit) is unsupported on the host.
Please note that configuring the IPA size does not affect the capability
exposed by the guest CPUs in ID_AA64MMFR0_EL1[PARange]. It only affects
size of the address translated by the stage2 level (guest physical to

View File

@ -261,8 +261,8 @@ ABI/API
L: linux-api@vger.kernel.org
F: include/linux/syscalls.h
F: kernel/sys_ni.c
F: include/uapi/
F: arch/*/include/uapi/
X: include/uapi/
X: arch/*/include/uapi/
ABIT UGURU 1,2 HARDWARE MONITOR DRIVER
M: Hans de Goede <hdegoede@redhat.com>

View File

@ -2,7 +2,7 @@
VERSION = 5
PATCHLEVEL = 12
SUBLEVEL = 0
EXTRAVERSION = -rc2
EXTRAVERSION = -rc3
NAME = Frozen Wasteland
# *DOCUMENTATION*

View File

@ -347,6 +347,7 @@ config ARCH_EP93XX
select ARM_AMBA
imply ARM_PATCH_PHYS_VIRT
select ARM_VIC
select GENERIC_IRQ_MULTI_HANDLER
select AUTO_ZRELADDR
select CLKDEV_LOOKUP
select CLKSRC_MMIO

View File

@ -47,10 +47,10 @@
#define __KVM_HOST_SMCCC_FUNC___kvm_flush_vm_context 2
#define __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid_ipa 3
#define __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid 4
#define __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_local_vmid 5
#define __KVM_HOST_SMCCC_FUNC___kvm_flush_cpu_context 5
#define __KVM_HOST_SMCCC_FUNC___kvm_timer_set_cntvoff 6
#define __KVM_HOST_SMCCC_FUNC___kvm_enable_ssbs 7
#define __KVM_HOST_SMCCC_FUNC___vgic_v3_get_ich_vtr_el2 8
#define __KVM_HOST_SMCCC_FUNC___vgic_v3_get_gic_config 8
#define __KVM_HOST_SMCCC_FUNC___vgic_v3_read_vmcr 9
#define __KVM_HOST_SMCCC_FUNC___vgic_v3_write_vmcr 10
#define __KVM_HOST_SMCCC_FUNC___vgic_v3_init_lrs 11
@ -183,16 +183,16 @@ DECLARE_KVM_HYP_SYM(__bp_harden_hyp_vecs);
#define __bp_harden_hyp_vecs CHOOSE_HYP_SYM(__bp_harden_hyp_vecs)
extern void __kvm_flush_vm_context(void);
extern void __kvm_flush_cpu_context(struct kvm_s2_mmu *mmu);
extern void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, phys_addr_t ipa,
int level);
extern void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu);
extern void __kvm_tlb_flush_local_vmid(struct kvm_s2_mmu *mmu);
extern void __kvm_timer_set_cntvoff(u64 cntvoff);
extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu);
extern u64 __vgic_v3_get_ich_vtr_el2(void);
extern u64 __vgic_v3_get_gic_config(void);
extern u64 __vgic_v3_read_vmcr(void);
extern void __vgic_v3_write_vmcr(u32 vmcr);
extern void __vgic_v3_init_lrs(void);

View File

@ -83,6 +83,11 @@ void sysreg_restore_guest_state_vhe(struct kvm_cpu_context *ctxt);
void __debug_switch_to_guest(struct kvm_vcpu *vcpu);
void __debug_switch_to_host(struct kvm_vcpu *vcpu);
#ifdef __KVM_NVHE_HYPERVISOR__
void __debug_save_host_buffers_nvhe(struct kvm_vcpu *vcpu);
void __debug_restore_host_buffers_nvhe(struct kvm_vcpu *vcpu);
#endif
void __fpsimd_save_state(struct user_fpsimd_state *fp_regs);
void __fpsimd_restore_state(struct user_fpsimd_state *fp_regs);
@ -97,7 +102,8 @@ bool kvm_host_psci_handler(struct kvm_cpu_context *host_ctxt);
void __noreturn hyp_panic(void);
#ifdef __KVM_NVHE_HYPERVISOR__
void __noreturn __hyp_do_panic(bool restore_host, u64 spsr, u64 elr, u64 par);
void __noreturn __hyp_do_panic(struct kvm_cpu_context *host_ctxt, u64 spsr,
u64 elr, u64 par);
#endif
#endif /* __ARM64_KVM_HYP_H__ */

View File

@ -101,6 +101,9 @@ KVM_NVHE_ALIAS(__stop___kvm_ex_table);
/* Array containing bases of nVHE per-CPU memory regions. */
KVM_NVHE_ALIAS(kvm_arm_hyp_percpu_base);
/* PMU available static key */
KVM_NVHE_ALIAS(kvm_arm_pmu_available);
#endif /* CONFIG_KVM */
#endif /* __ARM64_KERNEL_IMAGE_VARS_H */

View File

@ -385,11 +385,16 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
last_ran = this_cpu_ptr(mmu->last_vcpu_ran);
/*
* We guarantee that both TLBs and I-cache are private to each
* vcpu. If detecting that a vcpu from the same VM has
* previously run on the same physical CPU, call into the
* hypervisor code to nuke the relevant contexts.
*
* We might get preempted before the vCPU actually runs, but
* over-invalidation doesn't affect correctness.
*/
if (*last_ran != vcpu->vcpu_id) {
kvm_call_hyp(__kvm_tlb_flush_local_vmid, mmu);
kvm_call_hyp(__kvm_flush_cpu_context, mmu);
*last_ran = vcpu->vcpu_id;
}

View File

@ -85,8 +85,10 @@ SYM_INNER_LABEL(__guest_exit_panic, SYM_L_GLOBAL)
// If the hyp context is loaded, go straight to hyp_panic
get_loaded_vcpu x0, x1
cbz x0, hyp_panic
cbnz x0, 1f
b hyp_panic
1:
// The hyp context is saved so make sure it is restored to allow
// hyp_panic to run at hyp and, subsequently, panic to run in the host.
// This makes use of __guest_exit to avoid duplication but sets the
@ -94,7 +96,7 @@ SYM_INNER_LABEL(__guest_exit_panic, SYM_L_GLOBAL)
// current state is saved to the guest context but it will only be
// accurate if the guest had been completely restored.
adr_this_cpu x0, kvm_hyp_ctxt, x1
adr x1, hyp_panic
adr_l x1, hyp_panic
str x1, [x0, #CPU_XREG_OFFSET(30)]
get_vcpu_ptr x1, x0
@ -146,7 +148,7 @@ SYM_INNER_LABEL(__guest_exit, SYM_L_GLOBAL)
// Now restore the hyp regs
restore_callee_saved_regs x2
set_loaded_vcpu xzr, x1, x2
set_loaded_vcpu xzr, x2, x3
alternative_if ARM64_HAS_RAS_EXTN
// If we have the RAS extensions we can consume a pending error

View File

@ -90,15 +90,18 @@ static inline void __activate_traps_common(struct kvm_vcpu *vcpu)
* counter, which could make a PMXEVCNTR_EL0 access UNDEF at
* EL1 instead of being trapped to EL2.
*/
write_sysreg(0, pmselr_el0);
write_sysreg(ARMV8_PMU_USERENR_MASK, pmuserenr_el0);
if (kvm_arm_support_pmu_v3()) {
write_sysreg(0, pmselr_el0);
write_sysreg(ARMV8_PMU_USERENR_MASK, pmuserenr_el0);
}
write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2);
}
static inline void __deactivate_traps_common(void)
{
write_sysreg(0, hstr_el2);
write_sysreg(0, pmuserenr_el0);
if (kvm_arm_support_pmu_v3())
write_sysreg(0, pmuserenr_el0);
}
static inline void ___activate_traps(struct kvm_vcpu *vcpu)

View File

@ -58,16 +58,24 @@ static void __debug_restore_spe(u64 pmscr_el1)
write_sysreg_s(pmscr_el1, SYS_PMSCR_EL1);
}
void __debug_switch_to_guest(struct kvm_vcpu *vcpu)
void __debug_save_host_buffers_nvhe(struct kvm_vcpu *vcpu)
{
/* Disable and flush SPE data generation */
__debug_save_spe(&vcpu->arch.host_debug_state.pmscr_el1);
}
void __debug_switch_to_guest(struct kvm_vcpu *vcpu)
{
__debug_switch_to_guest_common(vcpu);
}
void __debug_restore_host_buffers_nvhe(struct kvm_vcpu *vcpu)
{
__debug_restore_spe(vcpu->arch.host_debug_state.pmscr_el1);
}
void __debug_switch_to_host(struct kvm_vcpu *vcpu)
{
__debug_restore_spe(vcpu->arch.host_debug_state.pmscr_el1);
__debug_switch_to_host_common(vcpu);
}

View File

@ -71,7 +71,8 @@ SYM_FUNC_START(__host_enter)
SYM_FUNC_END(__host_enter)
/*
* void __noreturn __hyp_do_panic(bool restore_host, u64 spsr, u64 elr, u64 par);
* void __noreturn __hyp_do_panic(struct kvm_cpu_context *host_ctxt, u64 spsr,
* u64 elr, u64 par);
*/
SYM_FUNC_START(__hyp_do_panic)
/* Prepare and exit to the host's panic funciton. */
@ -82,9 +83,11 @@ SYM_FUNC_START(__hyp_do_panic)
hyp_kimg_va lr, x6
msr elr_el2, lr
/* Set the panic format string. Use the, now free, LR as scratch. */
ldr lr, =__hyp_panic_string
hyp_kimg_va lr, x6
mov x29, x0
/* Load the format string into x0 and arguments into x1-7 */
ldr x0, =__hyp_panic_string
hyp_kimg_va x0, x6
/* Load the format arguments into x1-7. */
mov x6, x3
@ -94,9 +97,7 @@ SYM_FUNC_START(__hyp_do_panic)
mrs x5, hpfar_el2
/* Enter the host, conditionally restoring the host context. */
cmp x0, xzr
mov x0, lr
b.eq __host_enter_without_restoring
cbz x29, __host_enter_without_restoring
b __host_enter_for_panic
SYM_FUNC_END(__hyp_do_panic)

View File

@ -46,11 +46,11 @@ static void handle___kvm_tlb_flush_vmid(struct kvm_cpu_context *host_ctxt)
__kvm_tlb_flush_vmid(kern_hyp_va(mmu));
}
static void handle___kvm_tlb_flush_local_vmid(struct kvm_cpu_context *host_ctxt)
static void handle___kvm_flush_cpu_context(struct kvm_cpu_context *host_ctxt)
{
DECLARE_REG(struct kvm_s2_mmu *, mmu, host_ctxt, 1);
__kvm_tlb_flush_local_vmid(kern_hyp_va(mmu));
__kvm_flush_cpu_context(kern_hyp_va(mmu));
}
static void handle___kvm_timer_set_cntvoff(struct kvm_cpu_context *host_ctxt)
@ -67,9 +67,9 @@ static void handle___kvm_enable_ssbs(struct kvm_cpu_context *host_ctxt)
write_sysreg_el2(tmp, SYS_SCTLR);
}
static void handle___vgic_v3_get_ich_vtr_el2(struct kvm_cpu_context *host_ctxt)
static void handle___vgic_v3_get_gic_config(struct kvm_cpu_context *host_ctxt)
{
cpu_reg(host_ctxt, 1) = __vgic_v3_get_ich_vtr_el2();
cpu_reg(host_ctxt, 1) = __vgic_v3_get_gic_config();
}
static void handle___vgic_v3_read_vmcr(struct kvm_cpu_context *host_ctxt)
@ -115,10 +115,10 @@ static const hcall_t host_hcall[] = {
HANDLE_FUNC(__kvm_flush_vm_context),
HANDLE_FUNC(__kvm_tlb_flush_vmid_ipa),
HANDLE_FUNC(__kvm_tlb_flush_vmid),
HANDLE_FUNC(__kvm_tlb_flush_local_vmid),
HANDLE_FUNC(__kvm_flush_cpu_context),
HANDLE_FUNC(__kvm_timer_set_cntvoff),
HANDLE_FUNC(__kvm_enable_ssbs),
HANDLE_FUNC(__vgic_v3_get_ich_vtr_el2),
HANDLE_FUNC(__vgic_v3_get_gic_config),
HANDLE_FUNC(__vgic_v3_read_vmcr),
HANDLE_FUNC(__vgic_v3_write_vmcr),
HANDLE_FUNC(__vgic_v3_init_lrs),

View File

@ -192,6 +192,14 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
pmu_switch_needed = __pmu_switch_to_guest(host_ctxt);
__sysreg_save_state_nvhe(host_ctxt);
/*
* We must flush and disable the SPE buffer for nVHE, as
* the translation regime(EL1&0) is going to be loaded with
* that of the guest. And we must do this before we change the
* translation regime to EL2 (via MDCR_EL2_E2PB == 0) and
* before we load guest Stage1.
*/
__debug_save_host_buffers_nvhe(vcpu);
__adjust_pc(vcpu);
@ -234,11 +242,12 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
if (vcpu->arch.flags & KVM_ARM64_FP_ENABLED)
__fpsimd_save_fpexc32(vcpu);
__debug_switch_to_host(vcpu);
/*
* This must come after restoring the host sysregs, since a non-VHE
* system may enable SPE here and make use of the TTBRs.
*/
__debug_switch_to_host(vcpu);
__debug_restore_host_buffers_nvhe(vcpu);
if (pmu_switch_needed)
__pmu_switch_to_host(host_ctxt);
@ -257,7 +266,6 @@ void __noreturn hyp_panic(void)
u64 spsr = read_sysreg_el2(SYS_SPSR);
u64 elr = read_sysreg_el2(SYS_ELR);
u64 par = read_sysreg_par();
bool restore_host = true;
struct kvm_cpu_context *host_ctxt;
struct kvm_vcpu *vcpu;
@ -271,7 +279,7 @@ void __noreturn hyp_panic(void)
__sysreg_restore_state_nvhe(host_ctxt);
}
__hyp_do_panic(restore_host, spsr, elr, par);
__hyp_do_panic(host_ctxt, spsr, elr, par);
unreachable();
}

View File

@ -123,7 +123,7 @@ void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu)
__tlb_switch_to_host(&cxt);
}
void __kvm_tlb_flush_local_vmid(struct kvm_s2_mmu *mmu)
void __kvm_flush_cpu_context(struct kvm_s2_mmu *mmu)
{
struct tlb_inv_context cxt;
@ -131,6 +131,7 @@ void __kvm_tlb_flush_local_vmid(struct kvm_s2_mmu *mmu)
__tlb_switch_to_guest(mmu, &cxt);
__tlbi(vmalle1);
asm volatile("ic iallu");
dsb(nsh);
isb();

View File

@ -223,6 +223,7 @@ static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data,
goto out;
if (!table) {
data->addr = ALIGN_DOWN(data->addr, kvm_granule_size(level));
data->addr += kvm_granule_size(level);
goto out;
}

View File

@ -405,9 +405,45 @@ void __vgic_v3_init_lrs(void)
__gic_v3_set_lr(0, i);
}
u64 __vgic_v3_get_ich_vtr_el2(void)
/*
* Return the GIC CPU configuration:
* - [31:0] ICH_VTR_EL2
* - [62:32] RES0
* - [63] MMIO (GICv2) capable
*/
u64 __vgic_v3_get_gic_config(void)
{
return read_gicreg(ICH_VTR_EL2);
u64 val, sre = read_gicreg(ICC_SRE_EL1);
unsigned long flags = 0;
/*
* To check whether we have a MMIO-based (GICv2 compatible)
* CPU interface, we need to disable the system register
* view. To do that safely, we have to prevent any interrupt
* from firing (which would be deadly).
*
* Note that this only makes sense on VHE, as interrupts are
* already masked for nVHE as part of the exception entry to
* EL2.
*/
if (has_vhe())
flags = local_daif_save();
write_gicreg(0, ICC_SRE_EL1);
isb();
val = read_gicreg(ICC_SRE_EL1);
write_gicreg(sre, ICC_SRE_EL1);
isb();
if (has_vhe())
local_daif_restore(flags);
val = (val & ICC_SRE_EL1_SRE) ? 0 : (1ULL << 63);
val |= read_gicreg(ICH_VTR_EL2);
return val;
}
u64 __vgic_v3_read_vmcr(void)

View File

@ -127,7 +127,7 @@ void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu)
__tlb_switch_to_host(&cxt);
}
void __kvm_tlb_flush_local_vmid(struct kvm_s2_mmu *mmu)
void __kvm_flush_cpu_context(struct kvm_s2_mmu *mmu)
{
struct tlb_inv_context cxt;
@ -135,6 +135,7 @@ void __kvm_tlb_flush_local_vmid(struct kvm_s2_mmu *mmu)
__tlb_switch_to_guest(mmu, &cxt);
__tlbi(vmalle1);
asm volatile("ic iallu");
dsb(nsh);
isb();

View File

@ -1312,8 +1312,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
* Prevent userspace from creating a memory region outside of the IPA
* space addressable by the KVM guest IPA space.
*/
if (memslot->base_gfn + memslot->npages >=
(kvm_phys_size(kvm) >> PAGE_SHIFT))
if ((memslot->base_gfn + memslot->npages) > (kvm_phys_size(kvm) >> PAGE_SHIFT))
return -EFAULT;
mmap_read_lock(current->mm);

View File

@ -11,6 +11,8 @@
#include <asm/kvm_emulate.h>
DEFINE_STATIC_KEY_FALSE(kvm_arm_pmu_available);
static int kvm_is_in_guest(void)
{
return kvm_get_running_vcpu() != NULL;
@ -48,6 +50,14 @@ static struct perf_guest_info_callbacks kvm_guest_cbs = {
int kvm_perf_init(void)
{
/*
* Check if HW_PERF_EVENTS are supported by checking the number of
* hardware performance counters. This could ensure the presence of
* a physical PMU and CONFIG_PERF_EVENT is selected.
*/
if (IS_ENABLED(CONFIG_ARM_PMU) && perf_num_counters() > 0)
static_branch_enable(&kvm_arm_pmu_available);
return perf_register_guest_info_callbacks(&kvm_guest_cbs);
}

View File

@ -823,16 +823,6 @@ u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1)
return val & mask;
}
bool kvm_arm_support_pmu_v3(void)
{
/*
* Check if HW_PERF_EVENTS are supported by checking the number of
* hardware performance counters. This could ensure the presence of
* a physical PMU and CONFIG_PERF_EVENT is selected.
*/
return (perf_num_counters() > 0);
}
int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu)
{
if (!kvm_vcpu_has_pmu(vcpu))

View File

@ -326,10 +326,9 @@ int kvm_set_ipa_limit(void)
}
kvm_ipa_limit = id_aa64mmfr0_parange_to_phys_shift(parange);
WARN(kvm_ipa_limit < KVM_PHYS_SHIFT,
"KVM IPA Size Limit (%d bits) is smaller than default size\n",
kvm_ipa_limit);
kvm_info("IPA Size Limit: %d bits\n", kvm_ipa_limit);
kvm_info("IPA Size Limit: %d bits%s\n", kvm_ipa_limit,
((kvm_ipa_limit < KVM_PHYS_SHIFT) ?
" (Reduced IPA size, limited VM/VMM compatibility)" : ""));
return 0;
}
@ -358,6 +357,11 @@ int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type)
return -EINVAL;
} else {
phys_shift = KVM_PHYS_SHIFT;
if (phys_shift > kvm_ipa_limit) {
pr_warn_once("%s using unsupported default IPA limit, upgrade your VMM\n",
current->comm);
return -EINVAL;
}
}
mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);

View File

@ -574,9 +574,13 @@ early_param("kvm-arm.vgic_v4_enable", early_gicv4_enable);
*/
int vgic_v3_probe(const struct gic_kvm_info *info)
{
u32 ich_vtr_el2 = kvm_call_hyp_ret(__vgic_v3_get_ich_vtr_el2);
u64 ich_vtr_el2 = kvm_call_hyp_ret(__vgic_v3_get_gic_config);
bool has_v2;
int ret;
has_v2 = ich_vtr_el2 >> 63;
ich_vtr_el2 = (u32)ich_vtr_el2;
/*
* The ListRegs field is 5 bits, but there is an architectural
* maximum of 16 list registers. Just ignore bit 4...
@ -594,13 +598,15 @@ int vgic_v3_probe(const struct gic_kvm_info *info)
gicv4_enable ? "en" : "dis");
}
kvm_vgic_global_state.vcpu_base = 0;
if (!info->vcpu.start) {
kvm_info("GICv3: no GICV resource entry\n");
kvm_vgic_global_state.vcpu_base = 0;
} else if (!has_v2) {
pr_warn(FW_BUG "CPU interface incapable of MMIO access\n");
} else if (!PAGE_ALIGNED(info->vcpu.start)) {
pr_warn("GICV physical address 0x%llx not page aligned\n",
(unsigned long long)info->vcpu.start);
kvm_vgic_global_state.vcpu_base = 0;
} else {
kvm_vgic_global_state.vcpu_base = info->vcpu.start;
kvm_vgic_global_state.can_emulate_gicv2 = true;

View File

@ -32,7 +32,7 @@ static inline void syscall_rollback(struct task_struct *task,
static inline long syscall_get_error(struct task_struct *task,
struct pt_regs *regs)
{
return regs->r10 == -1 ? regs->r8:0;
return regs->r10 == -1 ? -regs->r8:0;
}
static inline long syscall_get_return_value(struct task_struct *task,

View File

@ -2013,27 +2013,39 @@ static void syscall_get_set_args_cb(struct unw_frame_info *info, void *data)
{
struct syscall_get_set_args *args = data;
struct pt_regs *pt = args->regs;
unsigned long *krbs, cfm, ndirty;
unsigned long *krbs, cfm, ndirty, nlocals, nouts;
int i, count;
if (unw_unwind_to_user(info) < 0)
return;
/*
* We get here via a few paths:
* - break instruction: cfm is shared with caller.
* syscall args are in out= regs, locals are non-empty.
* - epsinstruction: cfm is set by br.call
* locals don't exist.
*
* For both cases argguments are reachable in cfm.sof - cfm.sol.
* CFM: [ ... | sor: 17..14 | sol : 13..7 | sof : 6..0 ]
*/
cfm = pt->cr_ifs;
nlocals = (cfm >> 7) & 0x7f; /* aka sol */
nouts = (cfm & 0x7f) - nlocals; /* aka sof - sol */
krbs = (unsigned long *)info->task + IA64_RBS_OFFSET/8;
ndirty = ia64_rse_num_regs(krbs, krbs + (pt->loadrs >> 19));
count = 0;
if (in_syscall(pt))
count = min_t(int, args->n, cfm & 0x7f);
count = min_t(int, args->n, nouts);
/* Iterate over outs. */
for (i = 0; i < count; i++) {
int j = ndirty + nlocals + i + args->i;
if (args->rw)
*ia64_rse_skip_regs(krbs, ndirty + i + args->i) =
args->args[i];
*ia64_rse_skip_regs(krbs, j) = args->args[i];
else
args->args[i] = *ia64_rse_skip_regs(krbs,
ndirty + i + args->i);
args->args[i] = *ia64_rse_skip_regs(krbs, j);
}
if (!args->rw) {

View File

@ -73,9 +73,10 @@ void __patch_exception(int exc, unsigned long addr);
#endif
#define OP_RT_RA_MASK 0xffff0000UL
#define LIS_R2 0x3c020000UL
#define ADDIS_R2_R12 0x3c4c0000UL
#define ADDI_R2_R2 0x38420000UL
#define LIS_R2 (PPC_INST_ADDIS | __PPC_RT(R2))
#define ADDIS_R2_R12 (PPC_INST_ADDIS | __PPC_RT(R2) | __PPC_RA(R12))
#define ADDI_R2_R2 (PPC_INST_ADDI | __PPC_RT(R2) | __PPC_RA(R2))
static inline unsigned long ppc_function_entry(void *func)
{

View File

@ -410,7 +410,6 @@ DECLARE_INTERRUPT_HANDLER(altivec_assist_exception);
DECLARE_INTERRUPT_HANDLER(CacheLockingException);
DECLARE_INTERRUPT_HANDLER(SPEFloatingPointException);
DECLARE_INTERRUPT_HANDLER(SPEFloatingPointRoundException);
DECLARE_INTERRUPT_HANDLER(unrecoverable_exception);
DECLARE_INTERRUPT_HANDLER(WatchdogException);
DECLARE_INTERRUPT_HANDLER(kernel_bad_stack);
@ -437,6 +436,8 @@ DECLARE_INTERRUPT_HANDLER_NMI(hmi_exception_realmode);
DECLARE_INTERRUPT_HANDLER_ASYNC(TAUException);
void unrecoverable_exception(struct pt_regs *regs);
void replay_system_reset(void);
void replay_soft_interrupts(void);

View File

@ -195,7 +195,7 @@ static inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc)
#define TRAP_FLAGS_MASK 0x11
#define TRAP(regs) ((regs)->trap & ~TRAP_FLAGS_MASK)
#define FULL_REGS(regs) (((regs)->trap & 1) == 0)
#define SET_FULL_REGS(regs) ((regs)->trap |= 1)
#define SET_FULL_REGS(regs) ((regs)->trap &= ~1)
#endif
#define CHECK_FULL_REGS(regs) BUG_ON(!FULL_REGS(regs))
#define NV_REG_POISON 0xdeadbeefdeadbeefUL
@ -210,7 +210,7 @@ static inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc)
#define TRAP_FLAGS_MASK 0x1F
#define TRAP(regs) ((regs)->trap & ~TRAP_FLAGS_MASK)
#define FULL_REGS(regs) (((regs)->trap & 1) == 0)
#define SET_FULL_REGS(regs) ((regs)->trap |= 1)
#define SET_FULL_REGS(regs) ((regs)->trap &= ~1)
#define IS_CRITICAL_EXC(regs) (((regs)->trap & 2) != 0)
#define IS_MCHECK_EXC(regs) (((regs)->trap & 4) != 0)
#define IS_DEBUG_EXC(regs) (((regs)->trap & 8) != 0)

View File

@ -71,6 +71,16 @@ static inline void disable_kernel_vsx(void)
{
msr_check_and_clear(MSR_FP|MSR_VEC|MSR_VSX);
}
#else
static inline void enable_kernel_vsx(void)
{
BUILD_BUG();
}
static inline void disable_kernel_vsx(void)
{
BUILD_BUG();
}
#endif
#ifdef CONFIG_SPE

View File

@ -466,7 +466,7 @@ DEFINE_FIXED_SYMBOL(\name\()_common_real)
ld r10,PACAKMSR(r13) /* get MSR value for kernel */
/* MSR[RI] is clear iff using SRR regs */
.if IHSRR == EXC_HV_OR_STD
.if IHSRR_IF_HVMODE
BEGIN_FTR_SECTION
xori r10,r10,MSR_RI
END_FTR_SECTION_IFCLR(CPU_FTR_HVMODE)

View File

@ -436,7 +436,6 @@ notrace unsigned long interrupt_exit_user_prepare(struct pt_regs *regs, unsigned
return ret;
}
void unrecoverable_exception(struct pt_regs *regs);
void preempt_schedule_irq(void);
notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs, unsigned long msr)

View File

@ -2170,7 +2170,7 @@ DEFINE_INTERRUPT_HANDLER(SPEFloatingPointRoundException)
* in the MSR is 0. This indicates that SRR0/1 are live, and that
* we therefore lost state by taking this exception.
*/
DEFINE_INTERRUPT_HANDLER(unrecoverable_exception)
void unrecoverable_exception(struct pt_regs *regs)
{
pr_emerg("Unrecoverable exception %lx at %lx (msr=%lx)\n",
regs->trap, regs->nip, regs->msr);

View File

@ -128,7 +128,8 @@ static noinstr bool __do_fast_syscall_32(struct pt_regs *regs)
regs->ax = -EFAULT;
instrumentation_end();
syscall_exit_to_user_mode(regs);
local_irq_disable();
irqentry_exit_to_user_mode(regs);
return false;
}

View File

@ -210,6 +210,8 @@ SYM_CODE_START(entry_SYSCALL_compat)
/* Switch to the kernel stack */
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
SYM_INNER_LABEL(entry_SYSCALL_compat_safe_stack, SYM_L_GLOBAL)
/* Construct struct pt_regs on stack */
pushq $__USER32_DS /* pt_regs->ss */
pushq %r8 /* pt_regs->sp */

View File

@ -81,7 +81,11 @@ DEFINE_STATIC_CALL_NULL(x86_pmu_swap_task_ctx, *x86_pmu.swap_task_ctx);
DEFINE_STATIC_CALL_NULL(x86_pmu_drain_pebs, *x86_pmu.drain_pebs);
DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_aliases, *x86_pmu.pebs_aliases);
DEFINE_STATIC_CALL_NULL(x86_pmu_guest_get_msrs, *x86_pmu.guest_get_msrs);
/*
* This one is magic, it will get called even when PMU init fails (because
* there is no PMU), in which case it should simply return NULL.
*/
DEFINE_STATIC_CALL_RET0(x86_pmu_guest_get_msrs, *x86_pmu.guest_get_msrs);
u64 __read_mostly hw_cache_event_ids
[PERF_COUNT_HW_CACHE_MAX]
@ -1944,13 +1948,6 @@ static void _x86_pmu_read(struct perf_event *event)
x86_perf_event_update(event);
}
static inline struct perf_guest_switch_msr *
perf_guest_get_msrs_nop(int *nr)
{
*nr = 0;
return NULL;
}
static int __init init_hw_perf_events(void)
{
struct x86_pmu_quirk *quirk;
@ -2025,7 +2022,7 @@ static int __init init_hw_perf_events(void)
x86_pmu.read = _x86_pmu_read;
if (!x86_pmu.guest_get_msrs)
x86_pmu.guest_get_msrs = perf_guest_get_msrs_nop;
x86_pmu.guest_get_msrs = (void *)&__static_call_return0;
x86_pmu_static_call_update();

View File

@ -3662,8 +3662,10 @@ static int intel_pmu_hw_config(struct perf_event *event)
if (!(event->attr.freq || (event->attr.wakeup_events && !event->attr.watermark))) {
event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD;
if (!(event->attr.sample_type &
~intel_pmu_large_pebs_flags(event)))
~intel_pmu_large_pebs_flags(event))) {
event->hw.flags |= PERF_X86_EVENT_LARGE_PEBS;
event->attach_state |= PERF_ATTACH_SCHED_CB;
}
}
if (x86_pmu.pebs_aliases)
x86_pmu.pebs_aliases(event);
@ -3676,6 +3678,7 @@ static int intel_pmu_hw_config(struct perf_event *event)
ret = intel_pmu_setup_lbr_filter(event);
if (ret)
return ret;
event->attach_state |= PERF_ATTACH_SCHED_CB;
/*
* BTS is set up earlier in this path, so don't account twice

View File

@ -23,6 +23,8 @@ unsigned long insn_get_seg_base(struct pt_regs *regs, int seg_reg_idx);
int insn_get_code_seg_params(struct pt_regs *regs);
int insn_fetch_from_user(struct pt_regs *regs,
unsigned char buf[MAX_INSN_SIZE]);
int insn_fetch_from_user_inatomic(struct pt_regs *regs,
unsigned char buf[MAX_INSN_SIZE]);
bool insn_decode(struct insn *insn, struct pt_regs *regs,
unsigned char buf[MAX_INSN_SIZE], int buf_size);

View File

@ -963,7 +963,7 @@ struct kvm_arch {
struct kvm_pit *vpit;
atomic_t vapics_in_nmi_mode;
struct mutex apic_map_lock;
struct kvm_apic_map *apic_map;
struct kvm_apic_map __rcu *apic_map;
atomic_t apic_map_dirty;
bool apic_access_page_done;
@ -1036,7 +1036,7 @@ struct kvm_arch {
bool bus_lock_detection_enabled;
struct kvm_pmu_event_filter *pmu_event_filter;
struct kvm_pmu_event_filter __rcu *pmu_event_filter;
struct task_struct *nx_lpage_recovery_thread;
#ifdef CONFIG_X86_64

View File

@ -25,6 +25,7 @@ void __end_SYSENTER_singlestep_region(void);
void entry_SYSENTER_compat(void);
void __end_entry_SYSENTER_compat(void);
void entry_SYSCALL_compat(void);
void entry_SYSCALL_compat_safe_stack(void);
void entry_INT80_compat(void);
#ifdef CONFIG_XEN_PV
void xen_entry_INT80_compat(void);

View File

@ -94,6 +94,8 @@ struct pt_regs {
#include <asm/paravirt_types.h>
#endif
#include <asm/proto.h>
struct cpuinfo_x86;
struct task_struct;
@ -175,6 +177,19 @@ static inline bool any_64bit_mode(struct pt_regs *regs)
#ifdef CONFIG_X86_64
#define current_user_stack_pointer() current_pt_regs()->sp
#define compat_user_stack_pointer() current_pt_regs()->sp
static inline bool ip_within_syscall_gap(struct pt_regs *regs)
{
bool ret = (regs->ip >= (unsigned long)entry_SYSCALL_64 &&
regs->ip < (unsigned long)entry_SYSCALL_64_safe_stack);
#ifdef CONFIG_IA32_EMULATION
ret = ret || (regs->ip >= (unsigned long)entry_SYSCALL_compat &&
regs->ip < (unsigned long)entry_SYSCALL_compat_safe_stack);
#endif
return ret;
}
#endif
static inline unsigned long kernel_stack_pointer(struct pt_regs *regs)

View File

@ -58,9 +58,8 @@ static __always_inline unsigned long smap_save(void)
unsigned long flags;
asm volatile ("# smap_save\n\t"
ALTERNATIVE("jmp 1f", "", X86_FEATURE_SMAP)
"pushf; pop %0; " __ASM_CLAC "\n\t"
"1:"
ALTERNATIVE("", "pushf; pop %0; " __ASM_CLAC "\n\t",
X86_FEATURE_SMAP)
: "=rm" (flags) : : "memory", "cc");
return flags;
@ -69,9 +68,8 @@ static __always_inline unsigned long smap_save(void)
static __always_inline void smap_restore(unsigned long flags)
{
asm volatile ("# smap_restore\n\t"
ALTERNATIVE("jmp 1f", "", X86_FEATURE_SMAP)
"push %0; popf\n\t"
"1:"
ALTERNATIVE("", "push %0; popf\n\t",
X86_FEATURE_SMAP)
: : "g" (flags) : "memory", "cc");
}

View File

@ -268,21 +268,20 @@ static void __init kvmclock_init_mem(void)
static int __init kvm_setup_vsyscall_timeinfo(void)
{
#ifdef CONFIG_X86_64
u8 flags;
if (!per_cpu(hv_clock_per_cpu, 0) || !kvmclock_vsyscall)
return 0;
flags = pvclock_read_flags(&hv_clock_boot[0].pvti);
if (!(flags & PVCLOCK_TSC_STABLE_BIT))
return 0;
kvm_clock.vdso_clock_mode = VDSO_CLOCKMODE_PVCLOCK;
#endif
kvmclock_init_mem();
#ifdef CONFIG_X86_64
if (per_cpu(hv_clock_per_cpu, 0) && kvmclock_vsyscall) {
u8 flags;
flags = pvclock_read_flags(&hv_clock_boot[0].pvti);
if (!(flags & PVCLOCK_TSC_STABLE_BIT))
return 0;
kvm_clock.vdso_clock_mode = VDSO_CLOCKMODE_PVCLOCK;
}
#endif
return 0;
}
early_initcall(kvm_setup_vsyscall_timeinfo);

View File

@ -121,8 +121,18 @@ static void __init setup_vc_stacks(int cpu)
cea_set_pte((void *)vaddr, pa, PAGE_KERNEL);
}
static __always_inline bool on_vc_stack(unsigned long sp)
static __always_inline bool on_vc_stack(struct pt_regs *regs)
{
unsigned long sp = regs->sp;
/* User-mode RSP is not trusted */
if (user_mode(regs))
return false;
/* SYSCALL gap still has user-mode RSP */
if (ip_within_syscall_gap(regs))
return false;
return ((sp >= __this_cpu_ist_bottom_va(VC)) && (sp < __this_cpu_ist_top_va(VC)));
}
@ -144,7 +154,7 @@ void noinstr __sev_es_ist_enter(struct pt_regs *regs)
old_ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]);
/* Make room on the IST stack */
if (on_vc_stack(regs->sp))
if (on_vc_stack(regs))
new_ist = ALIGN_DOWN(regs->sp, 8) - sizeof(old_ist);
else
new_ist = old_ist - sizeof(old_ist);
@ -248,7 +258,7 @@ static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt)
int res;
if (user_mode(ctxt->regs)) {
res = insn_fetch_from_user(ctxt->regs, buffer);
res = insn_fetch_from_user_inatomic(ctxt->regs, buffer);
if (!res) {
ctxt->fi.vector = X86_TRAP_PF;
ctxt->fi.error_code = X86_PF_INSTR | X86_PF_USER;
@ -1248,13 +1258,12 @@ static __always_inline bool on_vc_fallback_stack(struct pt_regs *regs)
DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_communication)
{
struct sev_es_runtime_data *data = this_cpu_read(runtime_data);
irqentry_state_t irq_state;
struct ghcb_state state;
struct es_em_ctxt ctxt;
enum es_result result;
struct ghcb *ghcb;
lockdep_assert_irqs_disabled();
/*
* Handle #DB before calling into !noinstr code to avoid recursive #DB.
*/
@ -1263,6 +1272,8 @@ DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_communication)
return;
}
irq_state = irqentry_nmi_enter(regs);
lockdep_assert_irqs_disabled();
instrumentation_begin();
/*
@ -1325,6 +1336,7 @@ DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_communication)
out:
instrumentation_end();
irqentry_nmi_exit(regs, irq_state);
return;

View File

@ -694,8 +694,7 @@ asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *r
* In the SYSCALL entry path the RSP value comes from user-space - don't
* trust it and switch to the current kernel stack
*/
if (regs->ip >= (unsigned long)entry_SYSCALL_64 &&
regs->ip < (unsigned long)entry_SYSCALL_64_safe_stack) {
if (ip_within_syscall_gap(regs)) {
sp = this_cpu_read(cpu_current_top_of_stack);
goto sync;
}

View File

@ -13,7 +13,7 @@
#define orc_warn_current(args...) \
({ \
if (state->task == current) \
if (state->task == current && !state->error) \
orc_warn(args); \
})
@ -367,8 +367,8 @@ static bool deref_stack_regs(struct unwind_state *state, unsigned long addr,
if (!stack_access_ok(state, addr, sizeof(struct pt_regs)))
return false;
*ip = regs->ip;
*sp = regs->sp;
*ip = READ_ONCE_NOCHECK(regs->ip);
*sp = READ_ONCE_NOCHECK(regs->sp);
return true;
}
@ -380,8 +380,8 @@ static bool deref_stack_iret_regs(struct unwind_state *state, unsigned long addr
if (!stack_access_ok(state, addr, IRET_FRAME_SIZE))
return false;
*ip = regs->ip;
*sp = regs->sp;
*ip = READ_ONCE_NOCHECK(regs->ip);
*sp = READ_ONCE_NOCHECK(regs->sp);
return true;
}
@ -402,12 +402,12 @@ static bool get_reg(struct unwind_state *state, unsigned int reg_off,
return false;
if (state->full_regs) {
*val = ((unsigned long *)state->regs)[reg];
*val = READ_ONCE_NOCHECK(((unsigned long *)state->regs)[reg]);
return true;
}
if (state->prev_regs) {
*val = ((unsigned long *)state->prev_regs)[reg];
*val = READ_ONCE_NOCHECK(((unsigned long *)state->prev_regs)[reg]);
return true;
}

View File

@ -1642,7 +1642,16 @@ static void apic_timer_expired(struct kvm_lapic *apic, bool from_timer_fn)
}
if (kvm_use_posted_timer_interrupt(apic->vcpu)) {
kvm_wait_lapic_expire(vcpu);
/*
* Ensure the guest's timer has truly expired before posting an
* interrupt. Open code the relevant checks to avoid querying
* lapic_timer_int_injected(), which will be false since the
* interrupt isn't yet injected. Waiting until after injecting
* is not an option since that won't help a posted interrupt.
*/
if (vcpu->arch.apic->lapic_timer.expired_tscdeadline &&
vcpu->arch.apic->lapic_timer.timer_advance_ns)
__kvm_wait_lapic_expire(vcpu);
kvm_apic_inject_pending_timer_irqs(apic);
return;
}
@ -2595,6 +2604,7 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
apic_update_ppr(apic);
hrtimer_cancel(&apic->lapic_timer.timer);
apic->lapic_timer.expired_tscdeadline = 0;
apic_update_lvtt(apic);
apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0));
update_divide_count(apic);

View File

@ -337,7 +337,18 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, u64 *pt,
cpu_relax();
}
} else {
/*
* If the SPTE is not MMU-present, there is no backing
* page associated with the SPTE and so no side effects
* that need to be recorded, and exclusive ownership of
* mmu_lock ensures the SPTE can't be made present.
* Note, zapping MMIO SPTEs is also unnecessary as they
* are guarded by the memslots generation, not by being
* unreachable.
*/
old_child_spte = READ_ONCE(*sptep);
if (!is_shadow_present_pte(old_child_spte))
continue;
/*
* Marking the SPTE as a removed SPTE is not

View File

@ -115,13 +115,6 @@ static const struct svm_direct_access_msrs {
{ .index = MSR_INVALID, .always = false },
};
/* enable NPT for AMD64 and X86 with PAE */
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
bool npt_enabled = true;
#else
bool npt_enabled;
#endif
/*
* These 2 parameters are used to config the controls for Pause-Loop Exiting:
* pause_filter_count: On processors that support Pause filtering(indicated
@ -170,9 +163,12 @@ module_param(pause_filter_count_shrink, ushort, 0444);
static unsigned short pause_filter_count_max = KVM_SVM_DEFAULT_PLE_WINDOW_MAX;
module_param(pause_filter_count_max, ushort, 0444);
/* allow nested paging (virtualized MMU) for all guests */
static int npt = true;
module_param(npt, int, S_IRUGO);
/*
* Use nested page tables by default. Note, NPT may get forced off by
* svm_hardware_setup() if it's unsupported by hardware or the host kernel.
*/
bool npt_enabled = true;
module_param_named(npt, npt_enabled, bool, 0444);
/* allow nested virtualization in KVM/SVM */
static int nested = true;
@ -988,10 +984,15 @@ static __init int svm_hardware_setup(void)
goto err;
}
if (!boot_cpu_has(X86_FEATURE_NPT))
/*
* KVM's MMU doesn't support using 2-level paging for itself, and thus
* NPT isn't supported if the host is using 2-level paging since host
* CR4 is unchanged on VMRUN.
*/
if (!IS_ENABLED(CONFIG_X86_64) && !IS_ENABLED(CONFIG_X86_PAE))
npt_enabled = false;
if (npt_enabled && !npt)
if (!boot_cpu_has(X86_FEATURE_NPT))
npt_enabled = false;
kvm_configure_mmu(npt_enabled, get_max_npt_level(), PG_LEVEL_1G);

View File

@ -6580,8 +6580,8 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
int i, nr_msrs;
struct perf_guest_switch_msr *msrs;
/* Note, nr_msrs may be garbage if perf_guest_get_msrs() returns NULL. */
msrs = perf_guest_get_msrs(&nr_msrs);
if (!msrs)
return;

View File

@ -10601,7 +10601,7 @@ void __user * __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa,
return (void __user *)hva;
} else {
if (!slot || !slot->npages)
return 0;
return NULL;
old_npages = slot->npages;
hva = slot->userspace_addr;

View File

@ -1415,6 +1415,25 @@ void __user *insn_get_addr_ref(struct insn *insn, struct pt_regs *regs)
}
}
static unsigned long insn_get_effective_ip(struct pt_regs *regs)
{
unsigned long seg_base = 0;
/*
* If not in user-space long mode, a custom code segment could be in
* use. This is true in protected mode (if the process defined a local
* descriptor table), or virtual-8086 mode. In most of the cases
* seg_base will be zero as in USER_CS.
*/
if (!user_64bit_mode(regs)) {
seg_base = insn_get_seg_base(regs, INAT_SEG_REG_CS);
if (seg_base == -1L)
return 0;
}
return seg_base + regs->ip;
}
/**
* insn_fetch_from_user() - Copy instruction bytes from user-space memory
* @regs: Structure with register values as seen when entering kernel mode
@ -1431,24 +1450,43 @@ void __user *insn_get_addr_ref(struct insn *insn, struct pt_regs *regs)
*/
int insn_fetch_from_user(struct pt_regs *regs, unsigned char buf[MAX_INSN_SIZE])
{
unsigned long seg_base = 0;
unsigned long ip;
int not_copied;
/*
* If not in user-space long mode, a custom code segment could be in
* use. This is true in protected mode (if the process defined a local
* descriptor table), or virtual-8086 mode. In most of the cases
* seg_base will be zero as in USER_CS.
*/
if (!user_64bit_mode(regs)) {
seg_base = insn_get_seg_base(regs, INAT_SEG_REG_CS);
if (seg_base == -1L)
return 0;
}
ip = insn_get_effective_ip(regs);
if (!ip)
return 0;
not_copied = copy_from_user(buf, (void __user *)ip, MAX_INSN_SIZE);
not_copied = copy_from_user(buf, (void __user *)(seg_base + regs->ip),
MAX_INSN_SIZE);
return MAX_INSN_SIZE - not_copied;
}
/**
* insn_fetch_from_user_inatomic() - Copy instruction bytes from user-space memory
* while in atomic code
* @regs: Structure with register values as seen when entering kernel mode
* @buf: Array to store the fetched instruction
*
* Gets the linear address of the instruction and copies the instruction bytes
* to the buf. This function must be used in atomic context.
*
* Returns:
*
* Number of instruction bytes copied.
*
* 0 if nothing was copied.
*/
int insn_fetch_from_user_inatomic(struct pt_regs *regs, unsigned char buf[MAX_INSN_SIZE])
{
unsigned long ip;
int not_copied;
ip = insn_get_effective_ip(regs);
if (!ip)
return 0;
not_copied = __copy_from_user_inatomic(buf, (void __user *)ip, MAX_INSN_SIZE);
return MAX_INSN_SIZE - not_copied;
}

View File

@ -627,7 +627,7 @@ static ssize_t writeback_store(struct device *dev,
struct bio_vec bio_vec;
struct page *page;
ssize_t ret = len;
int mode;
int mode, err;
unsigned long blk_idx = 0;
if (sysfs_streq(buf, "idle"))
@ -638,8 +638,8 @@ static ssize_t writeback_store(struct device *dev,
if (strncmp(buf, PAGE_WB_SIG, sizeof(PAGE_WB_SIG) - 1))
return -EINVAL;
ret = kstrtol(buf + sizeof(PAGE_WB_SIG) - 1, 10, &index);
if (ret || index >= nr_pages)
if (kstrtol(buf + sizeof(PAGE_WB_SIG) - 1, 10, &index) ||
index >= nr_pages)
return -EINVAL;
nr_pages = 1;
@ -663,7 +663,7 @@ static ssize_t writeback_store(struct device *dev,
goto release_init_lock;
}
while (nr_pages--) {
for (; nr_pages != 0; index++, nr_pages--) {
struct bio_vec bvec;
bvec.bv_page = page;
@ -728,12 +728,17 @@ static ssize_t writeback_store(struct device *dev,
* XXX: A single page IO would be inefficient for write
* but it would be not bad as starter.
*/
ret = submit_bio_wait(&bio);
if (ret) {
err = submit_bio_wait(&bio);
if (err) {
zram_slot_lock(zram, index);
zram_clear_flag(zram, index, ZRAM_UNDER_WB);
zram_clear_flag(zram, index, ZRAM_IDLE);
zram_slot_unlock(zram, index);
/*
* Return last IO error unless every IO were
* not suceeded.
*/
ret = err;
continue;
}

View File

@ -96,6 +96,18 @@ static void install_memreserve_table(void)
efi_err("Failed to install memreserve config table!\n");
}
static u32 get_supported_rt_services(void)
{
const efi_rt_properties_table_t *rt_prop_table;
u32 supported = EFI_RT_SUPPORTED_ALL;
rt_prop_table = get_efi_config_table(EFI_RT_PROPERTIES_TABLE_GUID);
if (rt_prop_table)
supported &= rt_prop_table->runtime_services_supported;
return supported;
}
/*
* EFI entry point for the arm/arm64 EFI stubs. This is the entrypoint
* that is described in the PE/COFF header. Most of the code is the same
@ -250,6 +262,10 @@ efi_status_t __efiapi efi_pe_entry(efi_handle_t handle,
(prop_tbl->memory_protection_attribute &
EFI_PROPERTIES_RUNTIME_MEMORY_PROTECTION_NON_EXECUTABLE_PE_DATA);
/* force efi_novamap if SetVirtualAddressMap() is unsupported */
efi_novamap |= !(get_supported_rt_services() &
EFI_RT_SUPPORTED_SET_VIRTUAL_ADDRESS_MAP);
/* hibernation expects the runtime regions to stay in the same place */
if (!IS_ENABLED(CONFIG_HIBERNATION) && !efi_nokaslr && !flat_va_mapping) {
/*

View File

@ -500,8 +500,6 @@ vm_fault_t vmw_bo_vm_huge_fault(struct vm_fault *vmf,
vm_fault_t ret;
pgoff_t fault_page_size;
bool write = vmf->flags & FAULT_FLAG_WRITE;
bool is_cow_mapping =
(vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
switch (pe_size) {
case PE_SIZE_PMD:
@ -518,7 +516,7 @@ vm_fault_t vmw_bo_vm_huge_fault(struct vm_fault *vmf,
}
/* Always do write dirty-tracking and COW on PTE level. */
if (write && (READ_ONCE(vbo->dirty) || is_cow_mapping))
if (write && (READ_ONCE(vbo->dirty) || is_cow_mapping(vma->vm_flags)))
return VM_FAULT_FALLBACK;
ret = ttm_bo_vm_reserve(bo, vmf);

View File

@ -49,7 +49,7 @@ int vmw_mmap(struct file *filp, struct vm_area_struct *vma)
vma->vm_ops = &vmw_vm_ops;
/* Use VM_PFNMAP rather than VM_MIXEDMAP if not a COW mapping */
if ((vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) != VM_MAYWRITE)
if (!is_cow_mapping(vma->vm_flags))
vma->vm_flags = (vma->vm_flags & ~VM_MIXEDMAP) | VM_PFNMAP;
return 0;

View File

@ -8,7 +8,6 @@ config IRQCHIP
config ARM_GIC
bool
select IRQ_DOMAIN_HIERARCHY
select GENERIC_IRQ_MULTI_HANDLER
select GENERIC_IRQ_EFFECTIVE_AFF_MASK
config ARM_GIC_PM
@ -33,7 +32,6 @@ config GIC_NON_BANKED
config ARM_GIC_V3
bool
select GENERIC_IRQ_MULTI_HANDLER
select IRQ_DOMAIN_HIERARCHY
select PARTITION_PERCPU
select GENERIC_IRQ_EFFECTIVE_AFF_MASK
@ -64,7 +62,6 @@ config ARM_NVIC
config ARM_VIC
bool
select IRQ_DOMAIN
select GENERIC_IRQ_MULTI_HANDLER
config ARM_VIC_NR
int
@ -99,14 +96,12 @@ config ATMEL_AIC_IRQ
bool
select GENERIC_IRQ_CHIP
select IRQ_DOMAIN
select GENERIC_IRQ_MULTI_HANDLER
select SPARSE_IRQ
config ATMEL_AIC5_IRQ
bool
select GENERIC_IRQ_CHIP
select IRQ_DOMAIN
select GENERIC_IRQ_MULTI_HANDLER
select SPARSE_IRQ
config I8259
@ -153,7 +148,6 @@ config DW_APB_ICTL
config FARADAY_FTINTC010
bool
select IRQ_DOMAIN
select GENERIC_IRQ_MULTI_HANDLER
select SPARSE_IRQ
config HISILICON_IRQ_MBIGEN
@ -169,7 +163,6 @@ config IMGPDC_IRQ
config IXP4XX_IRQ
bool
select IRQ_DOMAIN
select GENERIC_IRQ_MULTI_HANDLER
select SPARSE_IRQ
config MADERA_IRQ
@ -186,7 +179,6 @@ config CLPS711X_IRQCHIP
bool
depends on ARCH_CLPS711X
select IRQ_DOMAIN
select GENERIC_IRQ_MULTI_HANDLER
select SPARSE_IRQ
default y
@ -205,7 +197,6 @@ config OMAP_IRQCHIP
config ORION_IRQCHIP
bool
select IRQ_DOMAIN
select GENERIC_IRQ_MULTI_HANDLER
config PIC32_EVIC
bool

View File

@ -179,5 +179,6 @@ static int __init ingenic_tcu_irq_init(struct device_node *np,
}
IRQCHIP_DECLARE(jz4740_tcu_irq, "ingenic,jz4740-tcu", ingenic_tcu_irq_init);
IRQCHIP_DECLARE(jz4725b_tcu_irq, "ingenic,jz4725b-tcu", ingenic_tcu_irq_init);
IRQCHIP_DECLARE(jz4760_tcu_irq, "ingenic,jz4760-tcu", ingenic_tcu_irq_init);
IRQCHIP_DECLARE(jz4770_tcu_irq, "ingenic,jz4770-tcu", ingenic_tcu_irq_init);
IRQCHIP_DECLARE(x1000_tcu_irq, "ingenic,x1000-tcu", ingenic_tcu_irq_init);

View File

@ -155,6 +155,7 @@ static int __init intc_2chip_of_init(struct device_node *node,
{
return ingenic_intc_of_init(node, 2);
}
IRQCHIP_DECLARE(jz4760_intc, "ingenic,jz4760-intc", intc_2chip_of_init);
IRQCHIP_DECLARE(jz4770_intc, "ingenic,jz4770-intc", intc_2chip_of_init);
IRQCHIP_DECLARE(jz4775_intc, "ingenic,jz4775-intc", intc_2chip_of_init);
IRQCHIP_DECLARE(jz4780_intc, "ingenic,jz4780-intc", intc_2chip_of_init);

View File

@ -649,12 +649,24 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer,
struct super_block *sb = file_inode(file)->i_sb;
struct dentry *root = sb->s_root, *dentry;
int err = 0;
struct file *f = NULL;
e = create_entry(buffer, count);
if (IS_ERR(e))
return PTR_ERR(e);
if (e->flags & MISC_FMT_OPEN_FILE) {
f = open_exec(e->interpreter);
if (IS_ERR(f)) {
pr_notice("register: failed to install interpreter file %s\n",
e->interpreter);
kfree(e);
return PTR_ERR(f);
}
e->interp_file = f;
}
inode_lock(d_inode(root));
dentry = lookup_one_len(e->name, root, strlen(e->name));
err = PTR_ERR(dentry);
@ -678,21 +690,6 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer,
goto out2;
}
if (e->flags & MISC_FMT_OPEN_FILE) {
struct file *f;
f = open_exec(e->interpreter);
if (IS_ERR(f)) {
err = PTR_ERR(f);
pr_notice("register: failed to install interpreter file %s\n", e->interpreter);
simple_release_fs(&bm_mnt, &entry_count);
iput(inode);
inode = NULL;
goto out2;
}
e->interp_file = f;
}
e->dentry = dget(dentry);
inode->i_private = e;
inode->i_fop = &bm_entry_operations;
@ -709,6 +706,8 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer,
inode_unlock(d_inode(root));
if (err) {
if (f)
filp_close(f, NULL);
kfree(e);
return err;
}

View File

@ -1098,8 +1098,6 @@ struct clear_refs_private {
#ifdef CONFIG_MEM_SOFT_DIRTY
#define is_cow_mapping(flags) (((flags) & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE)
static inline bool pte_is_pinned(struct vm_area_struct *vma, unsigned long addr, pte_t pte)
{
struct page *page;

View File

@ -13,6 +13,13 @@
#define ARMV8_PMU_CYCLE_IDX (ARMV8_PMU_MAX_COUNTERS - 1)
#define ARMV8_PMU_MAX_COUNTER_PAIRS ((ARMV8_PMU_MAX_COUNTERS + 1) >> 1)
DECLARE_STATIC_KEY_FALSE(kvm_arm_pmu_available);
static __always_inline bool kvm_arm_support_pmu_v3(void)
{
return static_branch_likely(&kvm_arm_pmu_available);
}
#ifdef CONFIG_HW_PERF_EVENTS
struct kvm_pmc {
@ -47,7 +54,6 @@ void kvm_pmu_software_increment(struct kvm_vcpu *vcpu, u64 val);
void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val);
void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data,
u64 select_idx);
bool kvm_arm_support_pmu_v3(void);
int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu,
struct kvm_device_attr *attr);
int kvm_arm_pmu_v3_get_attr(struct kvm_vcpu *vcpu,
@ -87,7 +93,6 @@ static inline void kvm_pmu_software_increment(struct kvm_vcpu *vcpu, u64 val) {}
static inline void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val) {}
static inline void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu,
u64 data, u64 select_idx) {}
static inline bool kvm_arm_support_pmu_v3(void) { return false; }
static inline int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu,
struct kvm_device_attr *attr)
{

View File

@ -31,6 +31,12 @@
#define __no_sanitize_thread
#endif
#if defined(CONFIG_ARCH_USE_BUILTIN_BSWAP)
#define __HAVE_BUILTIN_BSWAP32__
#define __HAVE_BUILTIN_BSWAP64__
#define __HAVE_BUILTIN_BSWAP16__
#endif /* CONFIG_ARCH_USE_BUILTIN_BSWAP */
#if __has_feature(undefined_behavior_sanitizer)
/* GCC does not have __SANITIZE_UNDEFINED__ */
#define __no_sanitize_undefined \

View File

@ -150,7 +150,6 @@ struct irq_domain_chip_generic;
* setting up one or more generic chips for interrupt controllers
* drivers using the generic chip library which uses this pointer.
* @parent: Pointer to parent irq_domain to support hierarchy irq_domains
* @debugfs_file: dentry for the domain debugfs file
*
* Revmap data, used internally by irq_domain
* @revmap_direct_max_irq: The largest hwirq that can be set for controllers that
@ -174,9 +173,6 @@ struct irq_domain {
#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
struct irq_domain *parent;
#endif
#ifdef CONFIG_GENERIC_IRQ_DEBUGFS
struct dentry *debugfs_file;
#endif
/* reverse map data. The linear map gets appended to the irq_domain */
irq_hw_number_t hwirq_max;

View File

@ -460,7 +460,7 @@ static inline void memblock_free_late(phys_addr_t base, phys_addr_t size)
/*
* Set the allocation direction to bottom-up or top-down.
*/
static inline void memblock_set_bottom_up(bool enable)
static inline __init void memblock_set_bottom_up(bool enable)
{
memblock.bottom_up = enable;
}
@ -470,7 +470,7 @@ static inline void memblock_set_bottom_up(bool enable)
* if this is true, that said, memblock will allocate memory
* in bottom-up direction.
*/
static inline bool memblock_bottom_up(void)
static inline __init bool memblock_bottom_up(void)
{
return memblock.bottom_up;
}

View File

@ -1061,9 +1061,7 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm,
rcu_read_unlock();
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
void mem_cgroup_split_huge_fixup(struct page *head);
#endif
void split_page_memcg(struct page *head, unsigned int nr);
#else /* CONFIG_MEMCG */
@ -1400,7 +1398,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
return 0;
}
static inline void mem_cgroup_split_huge_fixup(struct page *head)
static inline void split_page_memcg(struct page *head, unsigned int nr)
{
}

View File

@ -1300,6 +1300,27 @@ static inline bool page_maybe_dma_pinned(struct page *page)
GUP_PIN_COUNTING_BIAS;
}
static inline bool is_cow_mapping(vm_flags_t flags)
{
return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
}
/*
* This should most likely only be called during fork() to see whether we
* should break the cow immediately for a page on the src mm.
*/
static inline bool page_needs_cow_for_dma(struct vm_area_struct *vma,
struct page *page)
{
if (!is_cow_mapping(vma->vm_flags))
return false;
if (!atomic_read(&vma->vm_mm->has_pinned))
return false;
return page_maybe_dma_pinned(page);
}
#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
#define SECTION_IN_PAGE_FLAGS
#endif

View File

@ -23,6 +23,7 @@
#endif
#define AT_VECTOR_SIZE (2*(AT_VECTOR_SIZE_ARCH + AT_VECTOR_SIZE_BASE + 1))
#define INIT_PASID 0
struct address_space;
struct mem_cgroup;

View File

@ -606,6 +606,7 @@ struct swevent_hlist {
#define PERF_ATTACH_TASK 0x04
#define PERF_ATTACH_TASK_DATA 0x08
#define PERF_ATTACH_ITRACE 0x10
#define PERF_ATTACH_SCHED_CB 0x20
struct perf_cgroup;
struct perf_buffer;
@ -872,6 +873,7 @@ struct perf_cpu_context {
struct list_head cgrp_cpuctx_entry;
#endif
struct list_head sched_cb_entry;
int sched_cb_usage;
int online;

View File

@ -140,7 +140,8 @@ static inline bool in_vfork(struct task_struct *tsk)
* another oom-unkillable task does this it should blame itself.
*/
rcu_read_lock();
ret = tsk->vfork_done && tsk->real_parent->mm == tsk->mm;
ret = tsk->vfork_done &&
rcu_dereference(tsk->real_parent)->mm == tsk->mm;
rcu_read_unlock();
return ret;

View File

@ -664,10 +664,7 @@ typedef struct {
* seqcount_latch_init() - runtime initializer for seqcount_latch_t
* @s: Pointer to the seqcount_latch_t instance
*/
static inline void seqcount_latch_init(seqcount_latch_t *s)
{
seqcount_init(&s->seqcount);
}
#define seqcount_latch_init(s) seqcount_init(&(s)->seqcount)
/**
* raw_read_seqcount_latch() - pick even/odd latch data copy

View File

@ -128,7 +128,7 @@ int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data,
const struct cpumask *cpus);
#else /* CONFIG_SMP || CONFIG_HOTPLUG_CPU */
static inline int stop_machine_cpuslocked(cpu_stop_fn_t fn, void *data,
static __always_inline int stop_machine_cpuslocked(cpu_stop_fn_t fn, void *data,
const struct cpumask *cpus)
{
unsigned long flags;
@ -139,14 +139,15 @@ static inline int stop_machine_cpuslocked(cpu_stop_fn_t fn, void *data,
return ret;
}
static inline int stop_machine(cpu_stop_fn_t fn, void *data,
const struct cpumask *cpus)
static __always_inline int
stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus)
{
return stop_machine_cpuslocked(fn, data, cpus);
}
static inline int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data,
const struct cpumask *cpus)
static __always_inline int
stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data,
const struct cpumask *cpus)
{
return stop_machine(fn, data, cpus);
}

View File

@ -115,12 +115,13 @@ static inline void u64_stats_inc(u64_stats_t *p)
}
#endif
#if BITS_PER_LONG == 32 && defined(CONFIG_SMP)
#define u64_stats_init(syncp) seqcount_init(&(syncp)->seq)
#else
static inline void u64_stats_init(struct u64_stats_sync *syncp)
{
#if BITS_PER_LONG == 32 && defined(CONFIG_SMP)
seqcount_init(&syncp->seq);
#endif
}
#endif
static inline void u64_stats_update_begin(struct u64_stats_sync *syncp)
{

View File

@ -119,8 +119,7 @@ config INIT_ENV_ARG_LIMIT
config COMPILE_TEST
bool "Compile also drivers which will not load"
depends on !UML && !S390
default n
depends on HAS_IOMEM
help
Some drivers can be compiled on a different platform than they are
intended to be run on. Despite they cannot be loaded there (or even

View File

@ -386,6 +386,7 @@ static DEFINE_MUTEX(perf_sched_mutex);
static atomic_t perf_sched_count;
static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
static DEFINE_PER_CPU(int, perf_sched_cb_usages);
static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
static atomic_t nr_mmap_events __read_mostly;
@ -3461,11 +3462,16 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
}
}
static DEFINE_PER_CPU(struct list_head, sched_cb_list);
void perf_sched_cb_dec(struct pmu *pmu)
{
struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
--cpuctx->sched_cb_usage;
this_cpu_dec(perf_sched_cb_usages);
if (!--cpuctx->sched_cb_usage)
list_del(&cpuctx->sched_cb_entry);
}
@ -3473,7 +3479,10 @@ void perf_sched_cb_inc(struct pmu *pmu)
{
struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
cpuctx->sched_cb_usage++;
if (!cpuctx->sched_cb_usage++)
list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
this_cpu_inc(perf_sched_cb_usages);
}
/*
@ -3502,6 +3511,24 @@ static void __perf_pmu_sched_task(struct perf_cpu_context *cpuctx, bool sched_in
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
}
static void perf_pmu_sched_task(struct task_struct *prev,
struct task_struct *next,
bool sched_in)
{
struct perf_cpu_context *cpuctx;
if (prev == next)
return;
list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
/* will be handled in perf_event_context_sched_in/out */
if (cpuctx->task_ctx)
continue;
__perf_pmu_sched_task(cpuctx, sched_in);
}
}
static void perf_event_switch(struct task_struct *task,
struct task_struct *next_prev, bool sched_in);
@ -3524,6 +3551,9 @@ void __perf_event_task_sched_out(struct task_struct *task,
{
int ctxn;
if (__this_cpu_read(perf_sched_cb_usages))
perf_pmu_sched_task(task, next, false);
if (atomic_read(&nr_switch_events))
perf_event_switch(task, next, false);
@ -3832,6 +3862,9 @@ void __perf_event_task_sched_in(struct task_struct *prev,
if (atomic_read(&nr_switch_events))
perf_event_switch(task, prev, true);
if (__this_cpu_read(perf_sched_cb_usages))
perf_pmu_sched_task(prev, task, true);
}
static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
@ -4657,7 +4690,7 @@ static void unaccount_event(struct perf_event *event)
if (event->parent)
return;
if (event->attach_state & PERF_ATTACH_TASK)
if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))
dec = true;
if (event->attr.mmap || event->attr.mmap_data)
atomic_dec(&nr_mmap_events);
@ -11176,7 +11209,7 @@ static void account_event(struct perf_event *event)
if (event->parent)
return;
if (event->attach_state & PERF_ATTACH_TASK)
if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))
inc = true;
if (event->attr.mmap || event->attr.mmap_data)
atomic_inc(&nr_mmap_events);
@ -12973,6 +13006,7 @@ static void __init perf_event_init_all_cpus(void)
#ifdef CONFIG_CGROUP_PERF
INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu));
#endif
INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));
}
}

View File

@ -997,6 +997,13 @@ static void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
#endif
}
static void mm_init_pasid(struct mm_struct *mm)
{
#ifdef CONFIG_IOMMU_SUPPORT
mm->pasid = INIT_PASID;
#endif
}
static void mm_init_uprobes_state(struct mm_struct *mm)
{
#ifdef CONFIG_UPROBES
@ -1027,6 +1034,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
mm_init_cpumask(mm);
mm_init_aio(mm);
mm_init_owner(mm, p);
mm_init_pasid(mm);
RCU_INIT_POINTER(mm->exe_file, NULL);
mmu_notifier_subscriptions_init(mm);
init_tlb_flush_pending(mm);

View File

@ -1898,16 +1898,15 @@ DEFINE_SHOW_ATTRIBUTE(irq_domain_debug);
static void debugfs_add_domain_dir(struct irq_domain *d)
{
if (!d->name || !domain_dir || d->debugfs_file)
if (!d->name || !domain_dir)
return;
d->debugfs_file = debugfs_create_file(d->name, 0444, domain_dir, d,
&irq_domain_debug_fops);
debugfs_create_file(d->name, 0444, domain_dir, d,
&irq_domain_debug_fops);
}
static void debugfs_remove_domain_dir(struct irq_domain *d)
{
debugfs_remove(d->debugfs_file);
d->debugfs_file = NULL;
debugfs_remove(debugfs_lookup(d->name, domain_dir));
}
void __init irq_domain_debugfs_init(struct dentry *root)

View File

@ -1893,8 +1893,13 @@ struct migration_arg {
struct set_affinity_pending *pending;
};
/*
* @refs: number of wait_for_completion()
* @stop_pending: is @stop_work in use
*/
struct set_affinity_pending {
refcount_t refs;
unsigned int stop_pending;
struct completion done;
struct cpu_stop_work stop_work;
struct migration_arg arg;
@ -1929,8 +1934,8 @@ static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
*/
static int migration_cpu_stop(void *data)
{
struct set_affinity_pending *pending;
struct migration_arg *arg = data;
struct set_affinity_pending *pending = arg->pending;
struct task_struct *p = arg->task;
int dest_cpu = arg->dest_cpu;
struct rq *rq = this_rq();
@ -1952,7 +1957,6 @@ static int migration_cpu_stop(void *data)
raw_spin_lock(&p->pi_lock);
rq_lock(rq, &rf);
pending = p->migration_pending;
/*
* If task_rq(p) != rq, it cannot be migrated here, because we're
* holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
@ -1963,21 +1967,14 @@ static int migration_cpu_stop(void *data)
goto out;
if (pending) {
p->migration_pending = NULL;
if (p->migration_pending == pending)
p->migration_pending = NULL;
complete = true;
}
/* migrate_enable() -- we must not race against SCA */
if (dest_cpu < 0) {
/*
* When this was migrate_enable() but we no longer
* have a @pending, a concurrent SCA 'fixed' things
* and we should be valid again. Nothing to do.
*/
if (!pending) {
WARN_ON_ONCE(!cpumask_test_cpu(task_cpu(p), &p->cpus_mask));
if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask))
goto out;
}
dest_cpu = cpumask_any_distribute(&p->cpus_mask);
}
@ -1987,7 +1984,14 @@ static int migration_cpu_stop(void *data)
else
p->wake_cpu = dest_cpu;
} else if (dest_cpu < 0 || pending) {
/*
* XXX __migrate_task() can fail, at which point we might end
* up running on a dodgy CPU, AFAICT this can only happen
* during CPU hotplug, at which point we'll get pushed out
* anyway, so it's probably not a big deal.
*/
} else if (pending) {
/*
* This happens when we get migrated between migrate_enable()'s
* preempt_enable() and scheduling the stopper task. At that
@ -2002,43 +2006,32 @@ static int migration_cpu_stop(void *data)
* ->pi_lock, so the allowed mask is stable - if it got
* somewhere allowed, we're done.
*/
if (pending && cpumask_test_cpu(task_cpu(p), p->cpus_ptr)) {
p->migration_pending = NULL;
if (cpumask_test_cpu(task_cpu(p), p->cpus_ptr)) {
if (p->migration_pending == pending)
p->migration_pending = NULL;
complete = true;
goto out;
}
/*
* When this was migrate_enable() but we no longer have an
* @pending, a concurrent SCA 'fixed' things and we should be
* valid again. Nothing to do.
*/
if (!pending) {
WARN_ON_ONCE(!cpumask_test_cpu(task_cpu(p), &p->cpus_mask));
goto out;
}
/*
* When migrate_enable() hits a rq mis-match we can't reliably
* determine is_migration_disabled() and so have to chase after
* it.
*/
WARN_ON_ONCE(!pending->stop_pending);
task_rq_unlock(rq, p, &rf);
stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop,
&pending->arg, &pending->stop_work);
return 0;
}
out:
if (pending)
pending->stop_pending = false;
task_rq_unlock(rq, p, &rf);
if (complete)
complete_all(&pending->done);
/* For pending->{arg,stop_work} */
pending = arg->pending;
if (pending && refcount_dec_and_test(&pending->refs))
wake_up_var(&pending->refs);
return 0;
}
@ -2225,11 +2218,7 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
int dest_cpu, unsigned int flags)
{
struct set_affinity_pending my_pending = { }, *pending = NULL;
struct migration_arg arg = {
.task = p,
.dest_cpu = dest_cpu,
};
bool complete = false;
bool stop_pending, complete = false;
/* Can the task run on the task's current CPU? If so, we're done */
if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) {
@ -2241,12 +2230,16 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
push_task = get_task_struct(p);
}
/*
* If there are pending waiters, but no pending stop_work,
* then complete now.
*/
pending = p->migration_pending;
if (pending) {
refcount_inc(&pending->refs);
if (pending && !pending->stop_pending) {
p->migration_pending = NULL;
complete = true;
}
task_rq_unlock(rq, p, rf);
if (push_task) {
@ -2255,7 +2248,7 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
}
if (complete)
goto do_complete;
complete_all(&pending->done);
return 0;
}
@ -2266,6 +2259,12 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
/* Install the request */
refcount_set(&my_pending.refs, 1);
init_completion(&my_pending.done);
my_pending.arg = (struct migration_arg) {
.task = p,
.dest_cpu = -1, /* any */
.pending = &my_pending,
};
p->migration_pending = &my_pending;
} else {
pending = p->migration_pending;
@ -2290,45 +2289,41 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
return -EINVAL;
}
if (flags & SCA_MIGRATE_ENABLE) {
refcount_inc(&pending->refs); /* pending->{arg,stop_work} */
p->migration_flags &= ~MDF_PUSH;
task_rq_unlock(rq, p, rf);
pending->arg = (struct migration_arg) {
.task = p,
.dest_cpu = -1,
.pending = pending,
};
stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop,
&pending->arg, &pending->stop_work);
return 0;
}
if (task_running(rq, p) || p->state == TASK_WAKING) {
/*
* Lessen races (and headaches) by delegating
* is_migration_disabled(p) checks to the stopper, which will
* run on the same CPU as said p.
* MIGRATE_ENABLE gets here because 'p == current', but for
* anything else we cannot do is_migration_disabled(), punt
* and have the stopper function handle it all race-free.
*/
task_rq_unlock(rq, p, rf);
stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
stop_pending = pending->stop_pending;
if (!stop_pending)
pending->stop_pending = true;
if (flags & SCA_MIGRATE_ENABLE)
p->migration_flags &= ~MDF_PUSH;
task_rq_unlock(rq, p, rf);
if (!stop_pending) {
stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop,
&pending->arg, &pending->stop_work);
}
if (flags & SCA_MIGRATE_ENABLE)
return 0;
} else {
if (!is_migration_disabled(p)) {
if (task_on_rq_queued(p))
rq = move_queued_task(rq, rf, p, dest_cpu);
p->migration_pending = NULL;
complete = true;
if (!pending->stop_pending) {
p->migration_pending = NULL;
complete = true;
}
}
task_rq_unlock(rq, p, rf);
do_complete:
if (complete)
complete_all(&pending->done);
}
@ -2336,7 +2331,7 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
wait_for_completion(&pending->done);
if (refcount_dec_and_test(&pending->refs))
wake_up_var(&pending->refs);
wake_up_var(&pending->refs); /* No UaF, just an address */
/*
* Block the original owner of &pending until all subsequent callers
@ -2344,6 +2339,9 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
*/
wait_var_event(&my_pending.refs, !refcount_read(&my_pending.refs));
/* ARGH */
WARN_ON_ONCE(my_pending.stop_pending);
return 0;
}

View File

@ -471,9 +471,7 @@ static int sync_runqueues_membarrier_state(struct mm_struct *mm)
}
rcu_read_unlock();
preempt_disable();
smp_call_function_many(tmpmask, ipi_sync_rq_state, mm, 1);
preempt_enable();
on_each_cpu_mask(tmpmask, ipi_sync_rq_state, mm, true);
free_cpumask_var(tmpmask);
cpus_read_unlock();

View File

@ -349,7 +349,8 @@ static int static_call_add_module(struct module *mod)
struct static_call_site *site;
for (site = start; site != stop; site++) {
unsigned long addr = (unsigned long)static_call_key(site);
unsigned long s_key = (long)site->key + (long)&site->key;
unsigned long addr = s_key & ~STATIC_CALL_SITE_FLAGS;
unsigned long key;
/*
@ -373,8 +374,8 @@ static int static_call_add_module(struct module *mod)
return -EINVAL;
}
site->key = (key - (long)&site->key) |
(site->key & STATIC_CALL_SITE_FLAGS);
key |= s_key & STATIC_CALL_SITE_FLAGS;
site->key = key - (long)&site->key;
}
return __static_call_init(mod, start, stop);

View File

@ -2083,7 +2083,7 @@ static int prctl_set_auxv(struct mm_struct *mm, unsigned long addr,
* up to the caller to provide sane values here, otherwise userspace
* tools which use this vector might be unhappy.
*/
unsigned long user_auxv[AT_VECTOR_SIZE];
unsigned long user_auxv[AT_VECTOR_SIZE] = {};
if (len > sizeof(user_auxv))
return -EINVAL;

View File

@ -546,8 +546,11 @@ static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base,
}
/*
* Recomputes cpu_base::*next_timer and returns the earliest expires_next but
* does not set cpu_base::*expires_next, that is done by hrtimer_reprogram.
* Recomputes cpu_base::*next_timer and returns the earliest expires_next
* but does not set cpu_base::*expires_next, that is done by
* hrtimer[_force]_reprogram and hrtimer_interrupt only. When updating
* cpu_base::*expires_next right away, reprogramming logic would no longer
* work.
*
* When a softirq is pending, we can ignore the HRTIMER_ACTIVE_SOFT bases,
* those timers will get run whenever the softirq gets handled, at the end of
@ -588,6 +591,37 @@ __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_
return expires_next;
}
static ktime_t hrtimer_update_next_event(struct hrtimer_cpu_base *cpu_base)
{
ktime_t expires_next, soft = KTIME_MAX;
/*
* If the soft interrupt has already been activated, ignore the
* soft bases. They will be handled in the already raised soft
* interrupt.
*/
if (!cpu_base->softirq_activated) {
soft = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT);
/*
* Update the soft expiry time. clock_settime() might have
* affected it.
*/
cpu_base->softirq_expires_next = soft;
}
expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_HARD);
/*
* If a softirq timer is expiring first, update cpu_base->next_timer
* and program the hardware with the soft expiry time.
*/
if (expires_next > soft) {
cpu_base->next_timer = cpu_base->softirq_next_timer;
expires_next = soft;
}
return expires_next;
}
static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
{
ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset;
@ -628,23 +662,7 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
{
ktime_t expires_next;
/*
* Find the current next expiration time.
*/
expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL);
if (cpu_base->next_timer && cpu_base->next_timer->is_soft) {
/*
* When the softirq is activated, hrtimer has to be
* programmed with the first hard hrtimer because soft
* timer interrupt could occur too late.
*/
if (cpu_base->softirq_activated)
expires_next = __hrtimer_get_next_event(cpu_base,
HRTIMER_ACTIVE_HARD);
else
cpu_base->softirq_expires_next = expires_next;
}
expires_next = hrtimer_update_next_event(cpu_base);
if (skip_equal && expires_next == cpu_base->expires_next)
return;
@ -1644,8 +1662,8 @@ void hrtimer_interrupt(struct clock_event_device *dev)
__hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD);
/* Reevaluate the clock bases for the next expiry */
expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL);
/* Reevaluate the clock bases for the [soft] next expiry */
expires_next = hrtimer_update_next_event(cpu_base);
/*
* Store the new expiry value so the migration code can verify
* against it.

View File

@ -156,6 +156,7 @@ config KASAN_STACK_ENABLE
config KASAN_STACK
int
depends on KASAN_GENERIC || KASAN_SW_TAGS
default 1 if KASAN_STACK_ENABLE || CC_IS_GCC
default 0

View File

@ -368,20 +368,24 @@ void zero_user_segments(struct page *page, unsigned start1, unsigned end1,
BUG_ON(end1 > page_size(page) || end2 > page_size(page));
if (start1 >= end1)
start1 = end1 = 0;
if (start2 >= end2)
start2 = end2 = 0;
for (i = 0; i < compound_nr(page); i++) {
void *kaddr = NULL;
if (start1 < PAGE_SIZE || start2 < PAGE_SIZE)
kaddr = kmap_atomic(page + i);
if (start1 >= PAGE_SIZE) {
start1 -= PAGE_SIZE;
end1 -= PAGE_SIZE;
} else {
unsigned this_end = min_t(unsigned, end1, PAGE_SIZE);
if (end1 > start1)
if (end1 > start1) {
kaddr = kmap_atomic(page + i);
memset(kaddr + start1, 0, this_end - start1);
}
end1 -= this_end;
start1 = 0;
}
@ -392,8 +396,11 @@ void zero_user_segments(struct page *page, unsigned start1, unsigned end1,
} else {
unsigned this_end = min_t(unsigned, end2, PAGE_SIZE);
if (end2 > start2)
if (end2 > start2) {
if (!kaddr)
kaddr = kmap_atomic(page + i);
memset(kaddr + start2, 0, this_end - start2);
}
end2 -= this_end;
start2 = 0;
}

View File

@ -1100,9 +1100,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
* best effort that the pinned pages won't be replaced by another
* random page during the coming copy-on-write.
*/
if (unlikely(is_cow_mapping(vma->vm_flags) &&
atomic_read(&src_mm->has_pinned) &&
page_maybe_dma_pinned(src_page))) {
if (unlikely(page_needs_cow_for_dma(vma, src_page))) {
pte_free(dst_mm, pgtable);
spin_unlock(src_ptl);
spin_unlock(dst_ptl);
@ -1214,9 +1212,7 @@ int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
}
/* Please refer to comments in copy_huge_pmd() */
if (unlikely(is_cow_mapping(vma->vm_flags) &&
atomic_read(&src_mm->has_pinned) &&
page_maybe_dma_pinned(pud_page(pud)))) {
if (unlikely(page_needs_cow_for_dma(vma, pud_page(pud)))) {
spin_unlock(src_ptl);
spin_unlock(dst_ptl);
__split_huge_pud(vma, src_pud, addr);
@ -2471,7 +2467,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
int i;
/* complete memcg works before add pages to LRU */
mem_cgroup_split_huge_fixup(head);
split_page_memcg(head, nr);
if (PageAnon(head) && PageSwapCache(head)) {
swp_entry_t entry = { .val = page_private(head) };

View File

@ -331,6 +331,24 @@ static void coalesce_file_region(struct resv_map *resv, struct file_region *rg)
}
}
static inline long
hugetlb_resv_map_add(struct resv_map *map, struct file_region *rg, long from,
long to, struct hstate *h, struct hugetlb_cgroup *cg,
long *regions_needed)
{
struct file_region *nrg;
if (!regions_needed) {
nrg = get_file_region_entry_from_cache(map, from, to);
record_hugetlb_cgroup_uncharge_info(cg, h, map, nrg);
list_add(&nrg->link, rg->link.prev);
coalesce_file_region(map, nrg);
} else
*regions_needed += 1;
return to - from;
}
/*
* Must be called with resv->lock held.
*
@ -346,7 +364,7 @@ static long add_reservation_in_range(struct resv_map *resv, long f, long t,
long add = 0;
struct list_head *head = &resv->regions;
long last_accounted_offset = f;
struct file_region *rg = NULL, *trg = NULL, *nrg = NULL;
struct file_region *rg = NULL, *trg = NULL;
if (regions_needed)
*regions_needed = 0;
@ -369,24 +387,17 @@ static long add_reservation_in_range(struct resv_map *resv, long f, long t,
/* When we find a region that starts beyond our range, we've
* finished.
*/
if (rg->from > t)
if (rg->from >= t)
break;
/* Add an entry for last_accounted_offset -> rg->from, and
* update last_accounted_offset.
*/
if (rg->from > last_accounted_offset) {
add += rg->from - last_accounted_offset;
if (!regions_needed) {
nrg = get_file_region_entry_from_cache(
resv, last_accounted_offset, rg->from);
record_hugetlb_cgroup_uncharge_info(h_cg, h,
resv, nrg);
list_add(&nrg->link, rg->link.prev);
coalesce_file_region(resv, nrg);
} else
*regions_needed += 1;
}
if (rg->from > last_accounted_offset)
add += hugetlb_resv_map_add(resv, rg,
last_accounted_offset,
rg->from, h, h_cg,
regions_needed);
last_accounted_offset = rg->to;
}
@ -394,17 +405,9 @@ static long add_reservation_in_range(struct resv_map *resv, long f, long t,
/* Handle the case where our range extends beyond
* last_accounted_offset.
*/
if (last_accounted_offset < t) {
add += t - last_accounted_offset;
if (!regions_needed) {
nrg = get_file_region_entry_from_cache(
resv, last_accounted_offset, t);
record_hugetlb_cgroup_uncharge_info(h_cg, h, resv, nrg);
list_add(&nrg->link, rg->link.prev);
coalesce_file_region(resv, nrg);
} else
*regions_needed += 1;
}
if (last_accounted_offset < t)
add += hugetlb_resv_map_add(resv, rg, last_accounted_offset,
t, h, h_cg, regions_needed);
VM_BUG_ON(add < 0);
return add;
@ -3725,21 +3728,32 @@ static bool is_hugetlb_entry_hwpoisoned(pte_t pte)
return false;
}
static void
hugetlb_install_page(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr,
struct page *new_page)
{
__SetPageUptodate(new_page);
set_huge_pte_at(vma->vm_mm, addr, ptep, make_huge_pte(vma, new_page, 1));
hugepage_add_new_anon_rmap(new_page, vma, addr);
hugetlb_count_add(pages_per_huge_page(hstate_vma(vma)), vma->vm_mm);
ClearHPageRestoreReserve(new_page);
SetHPageMigratable(new_page);
}
int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
struct vm_area_struct *vma)
{
pte_t *src_pte, *dst_pte, entry, dst_entry;
struct page *ptepage;
unsigned long addr;
int cow;
bool cow = is_cow_mapping(vma->vm_flags);
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
unsigned long npages = pages_per_huge_page(h);
struct address_space *mapping = vma->vm_file->f_mapping;
struct mmu_notifier_range range;
int ret = 0;
cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
if (cow) {
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, src,
vma->vm_start,
@ -3784,6 +3798,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
entry = huge_ptep_get(src_pte);
dst_entry = huge_ptep_get(dst_pte);
again:
if (huge_pte_none(entry) || !huge_pte_none(dst_entry)) {
/*
* Skip if src entry none. Also, skip in the
@ -3807,6 +3822,52 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
}
set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz);
} else {
entry = huge_ptep_get(src_pte);
ptepage = pte_page(entry);
get_page(ptepage);
/*
* This is a rare case where we see pinned hugetlb
* pages while they're prone to COW. We need to do the
* COW earlier during fork.
*
* When pre-allocating the page or copying data, we
* need to be without the pgtable locks since we could
* sleep during the process.
*/
if (unlikely(page_needs_cow_for_dma(vma, ptepage))) {
pte_t src_pte_old = entry;
struct page *new;
spin_unlock(src_ptl);
spin_unlock(dst_ptl);
/* Do not use reserve as it's private owned */
new = alloc_huge_page(vma, addr, 1);
if (IS_ERR(new)) {
put_page(ptepage);
ret = PTR_ERR(new);
break;
}
copy_user_huge_page(new, ptepage, addr, vma,
npages);
put_page(ptepage);
/* Install the new huge page if src pte stable */
dst_ptl = huge_pte_lock(h, dst, dst_pte);
src_ptl = huge_pte_lockptr(h, src, src_pte);
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
entry = huge_ptep_get(src_pte);
if (!pte_same(src_pte_old, entry)) {
put_page(new);
/* dst_entry won't change as in child */
goto again;
}
hugetlb_install_page(vma, dst_pte, addr, new);
spin_unlock(src_ptl);
spin_unlock(dst_ptl);
continue;
}
if (cow) {
/*
* No need to notify as we are downgrading page
@ -3817,12 +3878,10 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
*/
huge_ptep_set_wrprotect(src, addr, src_pte);
}
entry = huge_ptep_get(src_pte);
ptepage = pte_page(entry);
get_page(ptepage);
page_dup_rmap(ptepage, true);
set_huge_pte_at(dst, addr, dst_pte, entry);
hugetlb_count_add(pages_per_huge_page(h), dst);
hugetlb_count_add(npages, dst);
}
spin_unlock(src_ptl);
spin_unlock(dst_ptl);

View File

@ -296,11 +296,6 @@ static inline unsigned int buddy_order(struct page *page)
*/
#define buddy_order_unsafe(page) READ_ONCE(page_private(page))
static inline bool is_cow_mapping(vm_flags_t flags)
{
return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
}
/*
* These three helpers classifies VMAs for virtual memory accounting.
*/

View File

@ -20,6 +20,11 @@
#include "kfence.h"
/* May be overridden by <asm/kfence.h>. */
#ifndef ARCH_FUNC_PREFIX
#define ARCH_FUNC_PREFIX ""
#endif
extern bool no_hash_pointers;
/* Helper function to either print to a seq_file or to console. */
@ -67,8 +72,9 @@ static int get_stack_skipnr(const unsigned long stack_entries[], int num_entries
for (skipnr = 0; skipnr < num_entries; skipnr++) {
int len = scnprintf(buf, sizeof(buf), "%ps", (void *)stack_entries[skipnr]);
if (str_has_prefix(buf, "kfence_") || str_has_prefix(buf, "__kfence_") ||
!strncmp(buf, "__slab_free", len)) {
if (str_has_prefix(buf, ARCH_FUNC_PREFIX "kfence_") ||
str_has_prefix(buf, ARCH_FUNC_PREFIX "__kfence_") ||
!strncmp(buf, ARCH_FUNC_PREFIX "__slab_free", len)) {
/*
* In case of tail calls from any of the below
* to any of the above.
@ -77,10 +83,10 @@ static int get_stack_skipnr(const unsigned long stack_entries[], int num_entries
}
/* Also the *_bulk() variants by only checking prefixes. */
if (str_has_prefix(buf, "kfree") ||
str_has_prefix(buf, "kmem_cache_free") ||
str_has_prefix(buf, "__kmalloc") ||
str_has_prefix(buf, "kmem_cache_alloc"))
if (str_has_prefix(buf, ARCH_FUNC_PREFIX "kfree") ||
str_has_prefix(buf, ARCH_FUNC_PREFIX "kmem_cache_free") ||
str_has_prefix(buf, ARCH_FUNC_PREFIX "__kmalloc") ||
str_has_prefix(buf, ARCH_FUNC_PREFIX "kmem_cache_alloc"))
goto found;
}
if (fallback < num_entries)
@ -116,12 +122,12 @@ void kfence_print_object(struct seq_file *seq, const struct kfence_metadata *met
lockdep_assert_held(&meta->lock);
if (meta->state == KFENCE_OBJECT_UNUSED) {
seq_con_printf(seq, "kfence-#%zd unused\n", meta - kfence_metadata);
seq_con_printf(seq, "kfence-#%td unused\n", meta - kfence_metadata);
return;
}
seq_con_printf(seq,
"kfence-#%zd [0x%p-0x%p"
"kfence-#%td [0x%p-0x%p"
", size=%d, cache=%s] allocated by task %d:\n",
meta - kfence_metadata, (void *)start, (void *)(start + size - 1), size,
(cache && cache->name) ? cache->name : "<destroyed>", meta->alloc_track.pid);
@ -204,7 +210,7 @@ void kfence_report_error(unsigned long address, bool is_write, struct pt_regs *r
pr_err("BUG: KFENCE: out-of-bounds %s in %pS\n\n", get_access_type(is_write),
(void *)stack_entries[skipnr]);
pr_err("Out-of-bounds %s at 0x%p (%luB %s of kfence-#%zd):\n",
pr_err("Out-of-bounds %s at 0x%p (%luB %s of kfence-#%td):\n",
get_access_type(is_write), (void *)address,
left_of_object ? meta->addr - address : address - meta->addr,
left_of_object ? "left" : "right", object_index);
@ -213,14 +219,14 @@ void kfence_report_error(unsigned long address, bool is_write, struct pt_regs *r
case KFENCE_ERROR_UAF:
pr_err("BUG: KFENCE: use-after-free %s in %pS\n\n", get_access_type(is_write),
(void *)stack_entries[skipnr]);
pr_err("Use-after-free %s at 0x%p (in kfence-#%zd):\n",
pr_err("Use-after-free %s at 0x%p (in kfence-#%td):\n",
get_access_type(is_write), (void *)address, object_index);
break;
case KFENCE_ERROR_CORRUPTION:
pr_err("BUG: KFENCE: memory corruption in %pS\n\n", (void *)stack_entries[skipnr]);
pr_err("Corrupted memory at 0x%p ", (void *)address);
print_diff_canary(address, 16, meta);
pr_cont(" (in kfence-#%zd):\n", object_index);
pr_cont(" (in kfence-#%td):\n", object_index);
break;
case KFENCE_ERROR_INVALID:
pr_err("BUG: KFENCE: invalid %s in %pS\n\n", get_access_type(is_write),
@ -230,7 +236,7 @@ void kfence_report_error(unsigned long address, bool is_write, struct pt_regs *r
break;
case KFENCE_ERROR_INVALID_FREE:
pr_err("BUG: KFENCE: invalid free in %pS\n\n", (void *)stack_entries[skipnr]);
pr_err("Invalid free of 0x%p (in kfence-#%zd):\n", (void *)address,
pr_err("Invalid free of 0x%p (in kfence-#%td):\n", (void *)address,
object_index);
break;
}

View File

@ -1198,12 +1198,22 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
goto release_task;
}
mm = mm_access(task, PTRACE_MODE_ATTACH_FSCREDS);
/* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */
mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
if (IS_ERR_OR_NULL(mm)) {
ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
goto release_task;
}
/*
* Require CAP_SYS_NICE for influencing process performance. Note that
* only non-destructive hints are currently supported.
*/
if (!capable(CAP_SYS_NICE)) {
ret = -EPERM;
goto release_mm;
}
total_len = iov_iter_count(&iter);
while (iov_iter_count(&iter)) {
@ -1218,6 +1228,7 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
if (ret == 0)
ret = total_len - iov_iter_count(&iter);
release_mm:
mmput(mm);
release_task:
put_task_struct(task);

View File

@ -3287,24 +3287,21 @@ void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)
#endif /* CONFIG_MEMCG_KMEM */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
* Because page_memcg(head) is not set on compound tails, set it now.
* Because page_memcg(head) is not set on tails, set it now.
*/
void mem_cgroup_split_huge_fixup(struct page *head)
void split_page_memcg(struct page *head, unsigned int nr)
{
struct mem_cgroup *memcg = page_memcg(head);
int i;
if (mem_cgroup_disabled())
if (mem_cgroup_disabled() || !memcg)
return;
for (i = 1; i < HPAGE_PMD_NR; i++) {
css_get(&memcg->css);
head[i].memcg_data = (unsigned long)memcg;
}
for (i = 1; i < nr; i++)
head[i].memcg_data = head->memcg_data;
css_get_many(&memcg->css, nr - 1);
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#ifdef CONFIG_MEMCG_SWAP
/**

View File

@ -823,12 +823,8 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss,
struct page **prealloc, pte_t pte, struct page *page)
{
struct mm_struct *src_mm = src_vma->vm_mm;
struct page *new_page;
if (!is_cow_mapping(src_vma->vm_flags))
return 1;
/*
* What we want to do is to check whether this page may
* have been pinned by the parent process. If so,
@ -842,9 +838,7 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
* the page count. That might give false positives for
* for pinning, but it will work correctly.
*/
if (likely(!atomic_read(&src_mm->has_pinned)))
return 1;
if (likely(!page_maybe_dma_pinned(page)))
if (likely(!page_needs_cow_for_dma(src_vma, page)))
return 1;
new_page = *prealloc;
@ -3117,6 +3111,14 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
return handle_userfault(vmf, VM_UFFD_WP);
}
/*
* Userfaultfd write-protect can defer flushes. Ensure the TLB
* is flushed in this case before copying.
*/
if (unlikely(userfaultfd_wp(vmf->vma) &&
mm_tlb_flush_pending(vmf->vma->vm_mm)))
flush_tlb_page(vmf->vma, vmf->address);
vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte);
if (!vmf->page) {
/*

View File

@ -1293,6 +1293,12 @@ static __always_inline bool free_pages_prepare(struct page *page,
kernel_poison_pages(page, 1 << order);
/*
* With hardware tag-based KASAN, memory tags must be set before the
* page becomes unavailable via debug_pagealloc or arch_free_page.
*/
kasan_free_nondeferred_pages(page, order);
/*
* arch_free_page() can make the page's contents inaccessible. s390
* does this. So nothing which can access the page's contents should
@ -1302,8 +1308,6 @@ static __always_inline bool free_pages_prepare(struct page *page,
debug_pagealloc_unmap_pages(page, 1 << order);
kasan_free_nondeferred_pages(page, order);
return true;
}
@ -3322,6 +3326,7 @@ void split_page(struct page *page, unsigned int order)
for (i = 1; i < (1 << order); i++)
set_page_refcounted(page + i);
split_page_owner(page, 1 << order);
split_page_memcg(page, 1 << order);
}
EXPORT_SYMBOL_GPL(split_page);
@ -6271,12 +6276,65 @@ static void __meminit zone_init_free_lists(struct zone *zone)
}
}
#if !defined(CONFIG_FLAT_NODE_MEM_MAP)
/*
* Only struct pages that correspond to ranges defined by memblock.memory
* are zeroed and initialized by going through __init_single_page() during
* memmap_init_zone().
*
* But, there could be struct pages that correspond to holes in
* memblock.memory. This can happen because of the following reasons:
* - physical memory bank size is not necessarily the exact multiple of the
* arbitrary section size
* - early reserved memory may not be listed in memblock.memory
* - memory layouts defined with memmap= kernel parameter may not align
* nicely with memmap sections
*
* Explicitly initialize those struct pages so that:
* - PG_Reserved is set
* - zone and node links point to zone and node that span the page if the
* hole is in the middle of a zone
* - zone and node links point to adjacent zone/node if the hole falls on
* the zone boundary; the pages in such holes will be prepended to the
* zone/node above the hole except for the trailing pages in the last
* section that will be appended to the zone/node below.
*/
static u64 __meminit init_unavailable_range(unsigned long spfn,
unsigned long epfn,
int zone, int node)
{
unsigned long pfn;
u64 pgcnt = 0;
for (pfn = spfn; pfn < epfn; pfn++) {
if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) {
pfn = ALIGN_DOWN(pfn, pageblock_nr_pages)
+ pageblock_nr_pages - 1;
continue;
}
__init_single_page(pfn_to_page(pfn), pfn, zone, node);
__SetPageReserved(pfn_to_page(pfn));
pgcnt++;
}
return pgcnt;
}
#else
static inline u64 init_unavailable_range(unsigned long spfn, unsigned long epfn,
int zone, int node)
{
return 0;
}
#endif
void __meminit __weak memmap_init_zone(struct zone *zone)
{
unsigned long zone_start_pfn = zone->zone_start_pfn;
unsigned long zone_end_pfn = zone_start_pfn + zone->spanned_pages;
int i, nid = zone_to_nid(zone), zone_id = zone_idx(zone);
static unsigned long hole_pfn;
unsigned long start_pfn, end_pfn;
u64 pgcnt = 0;
for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
start_pfn = clamp(start_pfn, zone_start_pfn, zone_end_pfn);
@ -6286,7 +6344,29 @@ void __meminit __weak memmap_init_zone(struct zone *zone)
memmap_init_range(end_pfn - start_pfn, nid,
zone_id, start_pfn, zone_end_pfn,
MEMINIT_EARLY, NULL, MIGRATE_MOVABLE);
if (hole_pfn < start_pfn)
pgcnt += init_unavailable_range(hole_pfn, start_pfn,
zone_id, nid);
hole_pfn = end_pfn;
}
#ifdef CONFIG_SPARSEMEM
/*
* Initialize the hole in the range [zone_end_pfn, section_end].
* If zone boundary falls in the middle of a section, this hole
* will be re-initialized during the call to this function for the
* higher zone.
*/
end_pfn = round_up(zone_end_pfn, PAGES_PER_SECTION);
if (hole_pfn < end_pfn)
pgcnt += init_unavailable_range(hole_pfn, end_pfn,
zone_id, nid);
#endif
if (pgcnt)
pr_info(" %s zone: %llu pages in unavailable ranges\n",
zone->name, pgcnt);
}
static int zone_batchsize(struct zone *zone)
@ -7083,88 +7163,6 @@ void __init free_area_init_memoryless_node(int nid)
free_area_init_node(nid);
}
#if !defined(CONFIG_FLAT_NODE_MEM_MAP)
/*
* Initialize all valid struct pages in the range [spfn, epfn) and mark them
* PageReserved(). Return the number of struct pages that were initialized.
*/
static u64 __init init_unavailable_range(unsigned long spfn, unsigned long epfn)
{
unsigned long pfn;
u64 pgcnt = 0;
for (pfn = spfn; pfn < epfn; pfn++) {
if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) {
pfn = ALIGN_DOWN(pfn, pageblock_nr_pages)
+ pageblock_nr_pages - 1;
continue;
}
/*
* Use a fake node/zone (0) for now. Some of these pages
* (in memblock.reserved but not in memblock.memory) will
* get re-initialized via reserve_bootmem_region() later.
*/
__init_single_page(pfn_to_page(pfn), pfn, 0, 0);
__SetPageReserved(pfn_to_page(pfn));
pgcnt++;
}
return pgcnt;
}
/*
* Only struct pages that are backed by physical memory are zeroed and
* initialized by going through __init_single_page(). But, there are some
* struct pages which are reserved in memblock allocator and their fields
* may be accessed (for example page_to_pfn() on some configuration accesses
* flags). We must explicitly initialize those struct pages.
*
* This function also addresses a similar issue where struct pages are left
* uninitialized because the physical address range is not covered by
* memblock.memory or memblock.reserved. That could happen when memblock
* layout is manually configured via memmap=, or when the highest physical
* address (max_pfn) does not end on a section boundary.
*/
static void __init init_unavailable_mem(void)
{
phys_addr_t start, end;
u64 i, pgcnt;
phys_addr_t next = 0;
/*
* Loop through unavailable ranges not covered by memblock.memory.
*/
pgcnt = 0;
for_each_mem_range(i, &start, &end) {
if (next < start)
pgcnt += init_unavailable_range(PFN_DOWN(next),
PFN_UP(start));
next = end;
}
/*
* Early sections always have a fully populated memmap for the whole
* section - see pfn_valid(). If the last section has holes at the
* end and that section is marked "online", the memmap will be
* considered initialized. Make sure that memmap has a well defined
* state.
*/
pgcnt += init_unavailable_range(PFN_DOWN(next),
round_up(max_pfn, PAGES_PER_SECTION));
/*
* Struct pages that do not have backing memory. This could be because
* firmware is using some of this memory, or for some other reasons.
*/
if (pgcnt)
pr_info("Zeroed struct page in unavailable ranges: %lld pages", pgcnt);
}
#else
static inline void __init init_unavailable_mem(void)
{
}
#endif /* !CONFIG_FLAT_NODE_MEM_MAP */
#if MAX_NUMNODES > 1
/*
* Figure out the number of possible node ids.
@ -7588,7 +7586,6 @@ void __init free_area_init(unsigned long *max_zone_pfn)
/* Initialise every node */
mminit_verify_pageflags_layout();
setup_nr_node_ids();
init_unavailable_mem();
for_each_online_node(nid) {
pg_data_t *pgdat = NODE_DATA(nid);
free_area_init_node(nid);

View File

@ -2992,7 +2992,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
gfp_t flags, void *objp, unsigned long caller)
{
WARN_ON_ONCE(cachep->ctor && (flags & __GFP_ZERO));
if (!objp)
if (!objp || is_kfence_address(objp))
return objp;
if (cachep->flags & SLAB_POISON) {
check_poison_obj(cachep, objp);

View File

@ -2442,6 +2442,9 @@ static int handle_insn_ops(struct instruction *insn, struct insn_state *state)
if (update_cfi_state(insn, &state->cfi, op))
return 1;
if (!insn->alt_group)
continue;
if (op->dest.type == OP_DEST_PUSHF) {
if (!state->uaccess_stack) {
state->uaccess_stack = 1;