From abd82e533d88df1521e3da6799b83ce88852ab88 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 18 Dec 2020 23:12:05 +0900 Subject: [PATCH 001/108] x86/kprobes: Do not decode opcode in resume_execution() Currently, kprobes decodes the opcode right after single-stepping in resume_execution(). But the opcode was already decoded while preparing arch_specific_insn in arch_copy_kprobe(). Decode the opcode in arch_copy_kprobe() instead of in resume_execution() and set some flags which classify the opcode for the resuming process. [ bp: Massage commit message. ] Signed-off-by: Masami Hiramatsu Signed-off-by: Borislav Petkov Acked-by: Steven Rostedt (VMware) Link: https://lkml.kernel.org/r/160830072561.349576.3014979564448023213.stgit@devnote2 --- arch/x86/include/asm/kprobes.h | 11 ++- arch/x86/kernel/kprobes/core.c | 168 +++++++++++++++------------------ 2 files changed, 81 insertions(+), 98 deletions(-) diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h index 991a7ad540c7..d20a3d6be36e 100644 --- a/arch/x86/include/asm/kprobes.h +++ b/arch/x86/include/asm/kprobes.h @@ -58,14 +58,17 @@ struct arch_specific_insn { /* copy of the original instruction */ kprobe_opcode_t *insn; /* - * boostable = false: This instruction type is not boostable. - * boostable = true: This instruction has been boosted: we have + * boostable = 0: This instruction type is not boostable. + * boostable = 1: This instruction has been boosted: we have * added a relative jump after the instruction copy in insn, * so no single-step and fixup are needed (unless there's * a post_handler). */ - bool boostable; - bool if_modifier; + unsigned boostable:1; + unsigned if_modifier:1; + unsigned is_call:1; + unsigned is_pushf:1; + unsigned is_abs_ip:1; /* Number of bytes of text poked */ int tp_len; }; diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index a65e9e97857f..df776cdca327 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -132,26 +132,6 @@ void synthesize_relcall(void *dest, void *from, void *to) } NOKPROBE_SYMBOL(synthesize_relcall); -/* - * Skip the prefixes of the instruction. - */ -static kprobe_opcode_t *skip_prefixes(kprobe_opcode_t *insn) -{ - insn_attr_t attr; - - attr = inat_get_opcode_attribute((insn_byte_t)*insn); - while (inat_is_legacy_prefix(attr)) { - insn++; - attr = inat_get_opcode_attribute((insn_byte_t)*insn); - } -#ifdef CONFIG_X86_64 - if (inat_is_rex_prefix(attr)) - insn++; -#endif - return insn; -} -NOKPROBE_SYMBOL(skip_prefixes); - /* * Returns non-zero if INSN is boostable. * RIP relative instructions are adjusted at copying time in 64 bits mode @@ -311,25 +291,6 @@ static int can_probe(unsigned long paddr) return (addr == paddr); } -/* - * Returns non-zero if opcode modifies the interrupt flag. - */ -static int is_IF_modifier(kprobe_opcode_t *insn) -{ - /* Skip prefixes */ - insn = skip_prefixes(insn); - - switch (*insn) { - case 0xfa: /* cli */ - case 0xfb: /* sti */ - case 0xcf: /* iret/iretd */ - case 0x9d: /* popf/popfd */ - return 1; - } - - return 0; -} - /* * Copy an instruction with recovering modified instruction by kprobes * and adjust the displacement if the instruction uses the %rip-relative @@ -411,9 +372,9 @@ static int prepare_boost(kprobe_opcode_t *buf, struct kprobe *p, synthesize_reljump(buf + len, p->ainsn.insn + len, p->addr + insn->length); len += JMP32_INSN_SIZE; - p->ainsn.boostable = true; + p->ainsn.boostable = 1; } else { - p->ainsn.boostable = false; + p->ainsn.boostable = 0; } return len; @@ -450,6 +411,67 @@ void free_insn_page(void *page) module_memfree(page); } +static void set_resume_flags(struct kprobe *p, struct insn *insn) +{ + insn_byte_t opcode = insn->opcode.bytes[0]; + + switch (opcode) { + case 0xfa: /* cli */ + case 0xfb: /* sti */ + case 0x9d: /* popf/popfd */ + /* Check whether the instruction modifies Interrupt Flag or not */ + p->ainsn.if_modifier = 1; + break; + case 0x9c: /* pushfl */ + p->ainsn.is_pushf = 1; + break; + case 0xcf: /* iret */ + p->ainsn.if_modifier = 1; + fallthrough; + case 0xc2: /* ret/lret */ + case 0xc3: + case 0xca: + case 0xcb: + case 0xea: /* jmp absolute -- ip is correct */ + /* ip is already adjusted, no more changes required */ + p->ainsn.is_abs_ip = 1; + /* Without resume jump, this is boostable */ + p->ainsn.boostable = 1; + break; + case 0xe8: /* call relative - Fix return addr */ + p->ainsn.is_call = 1; + break; +#ifdef CONFIG_X86_32 + case 0x9a: /* call absolute -- same as call absolute, indirect */ + p->ainsn.is_call = 1; + p->ainsn.is_abs_ip = 1; + break; +#endif + case 0xff: + opcode = insn->opcode.bytes[1]; + if ((opcode & 0x30) == 0x10) { + /* + * call absolute, indirect + * Fix return addr; ip is correct. + * But this is not boostable + */ + p->ainsn.is_call = 1; + p->ainsn.is_abs_ip = 1; + break; + } else if (((opcode & 0x31) == 0x20) || + ((opcode & 0x31) == 0x21)) { + /* + * jmp near and far, absolute indirect + * ip is correct. + */ + p->ainsn.is_abs_ip = 1; + /* Without resume jump, this is boostable */ + p->ainsn.boostable = 1; + } + break; + } +} + static int arch_copy_kprobe(struct kprobe *p) { struct insn insn; @@ -467,8 +489,8 @@ static int arch_copy_kprobe(struct kprobe *p) */ len = prepare_boost(buf, p, &insn); - /* Check whether the instruction modifies Interrupt Flag or not */ - p->ainsn.if_modifier = is_IF_modifier(buf); + /* Analyze the opcode and set resume flags */ + set_resume_flags(p, &insn); /* Also, displacement change doesn't affect the first byte */ p->opcode = buf[0]; @@ -491,6 +513,9 @@ int arch_prepare_kprobe(struct kprobe *p) if (!can_probe((unsigned long)p->addr)) return -EILSEQ; + + memset(&p->ainsn, 0, sizeof(p->ainsn)); + /* insn: must be on special executable page on x86. */ p->ainsn.insn = get_insn_slot(); if (!p->ainsn.insn) @@ -806,11 +831,6 @@ NOKPROBE_SYMBOL(trampoline_handler); * 2) If the single-stepped instruction was a call, the return address * that is atop the stack is the address following the copied instruction. * We need to make it the address following the original instruction. - * - * If this is the first time we've single-stepped the instruction at - * this probepoint, and the instruction is boostable, boost it: add a - * jump instruction after the copied instruction, that jumps to the next - * instruction after the probepoint. */ static void resume_execution(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb) @@ -818,60 +838,20 @@ static void resume_execution(struct kprobe *p, struct pt_regs *regs, unsigned long *tos = stack_addr(regs); unsigned long copy_ip = (unsigned long)p->ainsn.insn; unsigned long orig_ip = (unsigned long)p->addr; - kprobe_opcode_t *insn = p->ainsn.insn; - - /* Skip prefixes */ - insn = skip_prefixes(insn); regs->flags &= ~X86_EFLAGS_TF; - switch (*insn) { - case 0x9c: /* pushfl */ + + /* Fixup the contents of top of stack */ + if (p->ainsn.is_pushf) { *tos &= ~(X86_EFLAGS_TF | X86_EFLAGS_IF); *tos |= kcb->kprobe_old_flags; - break; - case 0xc2: /* iret/ret/lret */ - case 0xc3: - case 0xca: - case 0xcb: - case 0xcf: - case 0xea: /* jmp absolute -- ip is correct */ - /* ip is already adjusted, no more changes required */ - p->ainsn.boostable = true; - goto no_change; - case 0xe8: /* call relative - Fix return addr */ + } else if (p->ainsn.is_call) { *tos = orig_ip + (*tos - copy_ip); - break; -#ifdef CONFIG_X86_32 - case 0x9a: /* call absolute -- same as call absolute, indirect */ - *tos = orig_ip + (*tos - copy_ip); - goto no_change; -#endif - case 0xff: - if ((insn[1] & 0x30) == 0x10) { - /* - * call absolute, indirect - * Fix return addr; ip is correct. - * But this is not boostable - */ - *tos = orig_ip + (*tos - copy_ip); - goto no_change; - } else if (((insn[1] & 0x31) == 0x20) || - ((insn[1] & 0x31) == 0x21)) { - /* - * jmp near and far, absolute indirect - * ip is correct. And this is boostable - */ - p->ainsn.boostable = true; - goto no_change; - } - break; - default: - break; } - regs->ip += orig_ip - copy_ip; + if (!p->ainsn.is_abs_ip) + regs->ip += orig_ip - copy_ip; -no_change: restore_btf(); } NOKPROBE_SYMBOL(resume_execution); From ba9506be4e402ee597b8f41204008b97989b5eef Mon Sep 17 00:00:00 2001 From: Steve Wahl Date: Fri, 8 Jan 2021 09:35:48 -0600 Subject: [PATCH 002/108] perf/x86/intel/uncore: Store the logical die id instead of the physical die id. The phys_id isn't really used other than to map to a logical die id. Calculate the logical die id earlier, and store that instead of the phys_id. Signed-off-by: Steve Wahl Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Kan Liang Link: https://lkml.kernel.org/r/20210108153549.108989-2-steve.wahl@hpe.com --- arch/x86/events/intel/uncore.c | 58 ++++++++++------------------ arch/x86/events/intel/uncore.h | 5 +-- arch/x86/events/intel/uncore_snb.c | 2 +- arch/x86/events/intel/uncore_snbep.c | 31 +++++++-------- 4 files changed, 39 insertions(+), 57 deletions(-) diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 357258f82dc8..33c8180d5a87 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -31,21 +31,21 @@ struct event_constraint uncore_constraint_empty = MODULE_LICENSE("GPL"); -int uncore_pcibus_to_physid(struct pci_bus *bus) +int uncore_pcibus_to_dieid(struct pci_bus *bus) { struct pci2phy_map *map; - int phys_id = -1; + int die_id = -1; raw_spin_lock(&pci2phy_map_lock); list_for_each_entry(map, &pci2phy_map_head, list) { if (map->segment == pci_domain_nr(bus)) { - phys_id = map->pbus_to_physid[bus->number]; + die_id = map->pbus_to_dieid[bus->number]; break; } } raw_spin_unlock(&pci2phy_map_lock); - return phys_id; + return die_id; } static void uncore_free_pcibus_map(void) @@ -86,7 +86,7 @@ struct pci2phy_map *__find_pci2phy_map(int segment) alloc = NULL; map->segment = segment; for (i = 0; i < 256; i++) - map->pbus_to_physid[i] = -1; + map->pbus_to_dieid[i] = -1; list_add_tail(&map->list, &pci2phy_map_head); end: @@ -332,7 +332,6 @@ static struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type, uncore_pmu_init_hrtimer(box); box->cpu = -1; - box->pci_phys_id = -1; box->dieid = -1; /* set default hrtimer timeout */ @@ -993,18 +992,11 @@ uncore_types_init(struct intel_uncore_type **types, bool setid) /* * Get the die information of a PCI device. * @pdev: The PCI device. - * @phys_id: The physical socket id which the device maps to. * @die: The die id which the device maps to. */ -static int uncore_pci_get_dev_die_info(struct pci_dev *pdev, - int *phys_id, int *die) +static int uncore_pci_get_dev_die_info(struct pci_dev *pdev, int *die) { - *phys_id = uncore_pcibus_to_physid(pdev->bus); - if (*phys_id < 0) - return -ENODEV; - - *die = (topology_max_die_per_package() > 1) ? *phys_id : - topology_phys_to_logical_pkg(*phys_id); + *die = uncore_pcibus_to_dieid(pdev->bus); if (*die < 0) return -EINVAL; @@ -1046,13 +1038,12 @@ uncore_pci_find_dev_pmu(struct pci_dev *pdev, const struct pci_device_id *ids) * @pdev: The PCI device. * @type: The corresponding PMU type of the device. * @pmu: The corresponding PMU of the device. - * @phys_id: The physical socket id which the device maps to. * @die: The die id which the device maps to. */ static int uncore_pci_pmu_register(struct pci_dev *pdev, struct intel_uncore_type *type, struct intel_uncore_pmu *pmu, - int phys_id, int die) + int die) { struct intel_uncore_box *box; int ret; @@ -1070,7 +1061,6 @@ static int uncore_pci_pmu_register(struct pci_dev *pdev, WARN_ON_ONCE(pmu->func_id != pdev->devfn); atomic_inc(&box->refcnt); - box->pci_phys_id = phys_id; box->dieid = die; box->pci_dev = pdev; box->pmu = pmu; @@ -1097,9 +1087,9 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id { struct intel_uncore_type *type; struct intel_uncore_pmu *pmu = NULL; - int phys_id, die, ret; + int die, ret; - ret = uncore_pci_get_dev_die_info(pdev, &phys_id, &die); + ret = uncore_pci_get_dev_die_info(pdev, &die); if (ret) return ret; @@ -1132,7 +1122,7 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id pmu = &type->pmus[UNCORE_PCI_DEV_IDX(id->driver_data)]; } - ret = uncore_pci_pmu_register(pdev, type, pmu, phys_id, die); + ret = uncore_pci_pmu_register(pdev, type, pmu, die); pci_set_drvdata(pdev, pmu->boxes[die]); @@ -1142,17 +1132,12 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id /* * Unregister the PMU of a PCI device * @pmu: The corresponding PMU is unregistered. - * @phys_id: The physical socket id which the device maps to. * @die: The die id which the device maps to. */ -static void uncore_pci_pmu_unregister(struct intel_uncore_pmu *pmu, - int phys_id, int die) +static void uncore_pci_pmu_unregister(struct intel_uncore_pmu *pmu, int die) { struct intel_uncore_box *box = pmu->boxes[die]; - if (WARN_ON_ONCE(phys_id != box->pci_phys_id)) - return; - pmu->boxes[die] = NULL; if (atomic_dec_return(&pmu->activeboxes) == 0) uncore_pmu_unregister(pmu); @@ -1164,9 +1149,9 @@ static void uncore_pci_remove(struct pci_dev *pdev) { struct intel_uncore_box *box; struct intel_uncore_pmu *pmu; - int i, phys_id, die; + int i, die; - if (uncore_pci_get_dev_die_info(pdev, &phys_id, &die)) + if (uncore_pci_get_dev_die_info(pdev, &die)) return; box = pci_get_drvdata(pdev); @@ -1185,7 +1170,7 @@ static void uncore_pci_remove(struct pci_dev *pdev) pci_set_drvdata(pdev, NULL); - uncore_pci_pmu_unregister(pmu, phys_id, die); + uncore_pci_pmu_unregister(pmu, die); } static int uncore_bus_notify(struct notifier_block *nb, @@ -1194,7 +1179,7 @@ static int uncore_bus_notify(struct notifier_block *nb, struct device *dev = data; struct pci_dev *pdev = to_pci_dev(dev); struct intel_uncore_pmu *pmu; - int phys_id, die; + int die; /* Unregister the PMU when the device is going to be deleted. */ if (action != BUS_NOTIFY_DEL_DEVICE) @@ -1204,10 +1189,10 @@ static int uncore_bus_notify(struct notifier_block *nb, if (!pmu) return NOTIFY_DONE; - if (uncore_pci_get_dev_die_info(pdev, &phys_id, &die)) + if (uncore_pci_get_dev_die_info(pdev, &die)) return NOTIFY_DONE; - uncore_pci_pmu_unregister(pmu, phys_id, die); + uncore_pci_pmu_unregister(pmu, die); return NOTIFY_OK; } @@ -1224,7 +1209,7 @@ static void uncore_pci_sub_driver_init(void) struct pci_dev *pci_sub_dev; bool notify = false; unsigned int devfn; - int phys_id, die; + int die; while (ids && ids->vendor) { pci_sub_dev = NULL; @@ -1244,12 +1229,11 @@ static void uncore_pci_sub_driver_init(void) if (!pmu) continue; - if (uncore_pci_get_dev_die_info(pci_sub_dev, - &phys_id, &die)) + if (uncore_pci_get_dev_die_info(pci_sub_dev, &die)) continue; if (!uncore_pci_pmu_register(pci_sub_dev, type, pmu, - phys_id, die)) + die)) notify = true; } ids++; diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index 9efea154349d..a3c6e1643ad2 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -124,7 +124,6 @@ struct intel_uncore_extra_reg { }; struct intel_uncore_box { - int pci_phys_id; int dieid; /* Logical die ID */ int n_active; /* number of active events */ int n_events; @@ -173,11 +172,11 @@ struct freerunning_counters { struct pci2phy_map { struct list_head list; int segment; - int pbus_to_physid[256]; + int pbus_to_dieid[256]; }; struct pci2phy_map *__find_pci2phy_map(int segment); -int uncore_pcibus_to_physid(struct pci_bus *bus); +int uncore_pcibus_to_dieid(struct pci_bus *bus); ssize_t uncore_event_show(struct device *dev, struct device_attribute *attr, char *buf); diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c index 098f893e2e22..51271288499e 100644 --- a/arch/x86/events/intel/uncore_snb.c +++ b/arch/x86/events/intel/uncore_snb.c @@ -657,7 +657,7 @@ int snb_pci2phy_map_init(int devid) pci_dev_put(dev); return -ENOMEM; } - map->pbus_to_physid[bus] = 0; + map->pbus_to_dieid[bus] = 0; raw_spin_unlock(&pci2phy_map_lock); pci_dev_put(dev); diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 7bdb1821215d..2d7014dc46f6 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -1359,7 +1359,7 @@ static struct pci_driver snbep_uncore_pci_driver = { static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool reverse) { struct pci_dev *ubox_dev = NULL; - int i, bus, nodeid, segment; + int i, bus, nodeid, segment, die_id; struct pci2phy_map *map; int err = 0; u32 config = 0; @@ -1395,7 +1395,11 @@ static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool */ for (i = 0; i < 8; i++) { if (nodeid == ((config >> (3 * i)) & 0x7)) { - map->pbus_to_physid[bus] = i; + if (topology_max_die_per_package() > 1) + die_id = i; + else + die_id = topology_phys_to_logical_pkg(i); + map->pbus_to_dieid[bus] = die_id; break; } } @@ -1412,17 +1416,17 @@ static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool i = -1; if (reverse) { for (bus = 255; bus >= 0; bus--) { - if (map->pbus_to_physid[bus] >= 0) - i = map->pbus_to_physid[bus]; + if (map->pbus_to_dieid[bus] >= 0) + i = map->pbus_to_dieid[bus]; else - map->pbus_to_physid[bus] = i; + map->pbus_to_dieid[bus] = i; } } else { for (bus = 0; bus <= 255; bus++) { - if (map->pbus_to_physid[bus] >= 0) - i = map->pbus_to_physid[bus]; + if (map->pbus_to_dieid[bus] >= 0) + i = map->pbus_to_dieid[bus]; else - map->pbus_to_physid[bus] = i; + map->pbus_to_dieid[bus] = i; } } } @@ -4646,19 +4650,14 @@ int snr_uncore_pci_init(void) static struct pci_dev *snr_uncore_get_mc_dev(int id) { struct pci_dev *mc_dev = NULL; - int phys_id, pkg; + int pkg; while (1) { mc_dev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3451, mc_dev); if (!mc_dev) break; - phys_id = uncore_pcibus_to_physid(mc_dev->bus); - if (phys_id < 0) - continue; - pkg = topology_phys_to_logical_pkg(phys_id); - if (pkg < 0) - continue; - else if (pkg == id) + pkg = uncore_pcibus_to_dieid(mc_dev->bus); + if (pkg == id) break; } return mc_dev; From 9a7832ce3d920426a36cdd78eda4b3568d4d09e3 Mon Sep 17 00:00:00 2001 From: Steve Wahl Date: Fri, 8 Jan 2021 09:35:49 -0600 Subject: [PATCH 003/108] perf/x86/intel/uncore: With > 8 nodes, get pci bus die id from NUMA info The registers used to determine which die a pci bus belongs to don't contain enough information to uniquely specify more than 8 dies, so when more than 8 dies are present, use NUMA information instead. Continue to use the previous method for 8 or fewer because it works there, and covers cases of NUMA being disabled. Signed-off-by: Steve Wahl Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Kan Liang Link: https://lkml.kernel.org/r/20210108153549.108989-3-steve.wahl@hpe.com --- arch/x86/events/intel/uncore_snbep.c | 95 +++++++++++++++++++--------- 1 file changed, 66 insertions(+), 29 deletions(-) diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 2d7014dc46f6..b79951d0707c 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -1370,40 +1370,77 @@ static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool if (!ubox_dev) break; bus = ubox_dev->bus->number; - /* get the Node ID of the local register */ - err = pci_read_config_dword(ubox_dev, nodeid_loc, &config); - if (err) - break; - nodeid = config & NODE_ID_MASK; - /* get the Node ID mapping */ - err = pci_read_config_dword(ubox_dev, idmap_loc, &config); - if (err) - break; - - segment = pci_domain_nr(ubox_dev->bus); - raw_spin_lock(&pci2phy_map_lock); - map = __find_pci2phy_map(segment); - if (!map) { - raw_spin_unlock(&pci2phy_map_lock); - err = -ENOMEM; - break; - } - /* - * every three bits in the Node ID mapping register maps - * to a particular node. + * The nodeid and idmap registers only contain enough + * information to handle 8 nodes. On systems with more + * than 8 nodes, we need to rely on NUMA information, + * filled in from BIOS supplied information, to determine + * the topology. */ - for (i = 0; i < 8; i++) { - if (nodeid == ((config >> (3 * i)) & 0x7)) { - if (topology_max_die_per_package() > 1) - die_id = i; - else - die_id = topology_phys_to_logical_pkg(i); - map->pbus_to_dieid[bus] = die_id; + if (nr_node_ids <= 8) { + /* get the Node ID of the local register */ + err = pci_read_config_dword(ubox_dev, nodeid_loc, &config); + if (err) + break; + nodeid = config & NODE_ID_MASK; + /* get the Node ID mapping */ + err = pci_read_config_dword(ubox_dev, idmap_loc, &config); + if (err) + break; + + segment = pci_domain_nr(ubox_dev->bus); + raw_spin_lock(&pci2phy_map_lock); + map = __find_pci2phy_map(segment); + if (!map) { + raw_spin_unlock(&pci2phy_map_lock); + err = -ENOMEM; + break; + } + + /* + * every three bits in the Node ID mapping register maps + * to a particular node. + */ + for (i = 0; i < 8; i++) { + if (nodeid == ((config >> (3 * i)) & 0x7)) { + if (topology_max_die_per_package() > 1) + die_id = i; + else + die_id = topology_phys_to_logical_pkg(i); + map->pbus_to_dieid[bus] = die_id; + break; + } + } + raw_spin_unlock(&pci2phy_map_lock); + } else { + int node = pcibus_to_node(ubox_dev->bus); + int cpu; + + segment = pci_domain_nr(ubox_dev->bus); + raw_spin_lock(&pci2phy_map_lock); + map = __find_pci2phy_map(segment); + if (!map) { + raw_spin_unlock(&pci2phy_map_lock); + err = -ENOMEM; + break; + } + + die_id = -1; + for_each_cpu(cpu, cpumask_of_pcibus(ubox_dev->bus)) { + struct cpuinfo_x86 *c = &cpu_data(cpu); + + if (c->initialized && cpu_to_node(cpu) == node) { + map->pbus_to_dieid[bus] = die_id = c->logical_die_id; + break; + } + } + raw_spin_unlock(&pci2phy_map_lock); + + if (WARN_ON_ONCE(die_id == -1)) { + err = -EINVAL; break; } } - raw_spin_unlock(&pci2phy_map_lock); } if (!err) { From 7001d4af926b26469a0604eca80dc73b80c5faa8 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 18 Jan 2021 13:01:29 +0000 Subject: [PATCH 004/108] arm64: Drop workaround for broken 'S' constraint with GCC 4.9 Since GCC < 5.1 has been shown to be unsuitable for the arm64 kernel, let's drop the workaround for the 'S' asm constraint that GCC 4.9 doesn't always grok. This is effectively a revert of 9fd339a45be5 ("arm64: Work around broken GCC 4.9 handling of "S" constraint"). Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210118130129.2875949-1-maz@kernel.org Signed-off-by: Will Deacon --- arch/arm64/include/asm/kvm_asm.h | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 8a33d83ea843..7ccf770c53d9 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -199,12 +199,6 @@ extern void __vgic_v3_init_lrs(void); extern u32 __kvm_get_mdcr_el2(void); -#if defined(GCC_VERSION) && GCC_VERSION < 50000 -#define SYM_CONSTRAINT "i" -#else -#define SYM_CONSTRAINT "S" -#endif - /* * Obtain the PC-relative address of a kernel symbol * s: symbol @@ -221,7 +215,7 @@ extern u32 __kvm_get_mdcr_el2(void); typeof(s) *addr; \ asm("adrp %0, %1\n" \ "add %0, %0, :lo12:%1\n" \ - : "=r" (addr) : SYM_CONSTRAINT (&s)); \ + : "=r" (addr) : "S" (&s)); \ addr; \ }) From a5b8ca97fbf8300a5e21c393df25ce6f521e7939 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 18 Dec 2020 11:45:40 +0900 Subject: [PATCH 005/108] arm64: do not descend to vdso directories twice arm64 descends into each vdso directory twice; first in vdso_prepare, second during the ordinary build process. PPC mimicked it and uncovered a problem [1]. In the first descend, Kbuild directly visits the vdso directories, therefore it does not inherit subdir-ccflags-y from upper directories. This means the command line parameters may differ between the two. If it happens, the offset values in the generated headers might be different from real offsets of vdso.so in the kernel. This potential danger should be avoided. The vdso directories are built in the vdso_prepare stage, so the second descend is unneeded. [1]: https://lore.kernel.org/linux-kbuild/CAK7LNARAkJ3_-4gX0VA2UkapbOftuzfSTVMBbgbw=HD8n7N+7w@mail.gmail.com/T/#ma10dcb961fda13f36d42d58fa6cb2da988b7e73a Signed-off-by: Masahiro Yamada Link: https://lore.kernel.org/r/20201218024540.1102650-1-masahiroy@kernel.org Signed-off-by: Will Deacon --- arch/arm64/Makefile | 10 ++++++---- arch/arm64/kernel/Makefile | 5 +++-- arch/arm64/kernel/{vdso/vdso.S => vdso-wrap.S} | 0 arch/arm64/kernel/vdso/Makefile | 1 - arch/arm64/kernel/{vdso32/vdso.S => vdso32-wrap.S} | 0 arch/arm64/kernel/vdso32/Makefile | 1 - 6 files changed, 9 insertions(+), 8 deletions(-) rename arch/arm64/kernel/{vdso/vdso.S => vdso-wrap.S} (100%) rename arch/arm64/kernel/{vdso32/vdso.S => vdso32-wrap.S} (100%) diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile index 90309208bb28..5b84aec31ed3 100644 --- a/arch/arm64/Makefile +++ b/arch/arm64/Makefile @@ -188,10 +188,12 @@ ifeq ($(KBUILD_EXTMOD),) # this hack. prepare: vdso_prepare vdso_prepare: prepare0 - $(Q)$(MAKE) $(build)=arch/arm64/kernel/vdso include/generated/vdso-offsets.h - $(if $(CONFIG_COMPAT_VDSO),$(Q)$(MAKE) \ - $(build)=arch/arm64/kernel/vdso32 \ - include/generated/vdso32-offsets.h) + $(Q)$(MAKE) $(build)=arch/arm64/kernel/vdso \ + include/generated/vdso-offsets.h arch/arm64/kernel/vdso/vdso.so +ifdef CONFIG_COMPAT_VDSO + $(Q)$(MAKE) $(build)=arch/arm64/kernel/vdso32 \ + include/generated/vdso32-offsets.h arch/arm64/kernel/vdso32/vdso.so +endif endif define archhelp diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index 86364ab6f13f..42f6ad2c7eac 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -59,9 +59,10 @@ obj-$(CONFIG_CRASH_CORE) += crash_core.o obj-$(CONFIG_ARM_SDE_INTERFACE) += sdei.o obj-$(CONFIG_ARM64_PTR_AUTH) += pointer_auth.o obj-$(CONFIG_ARM64_MTE) += mte.o +obj-y += vdso-wrap.o +obj-$(CONFIG_COMPAT_VDSO) += vdso32-wrap.o -obj-y += vdso/ probes/ -obj-$(CONFIG_COMPAT_VDSO) += vdso32/ +obj-y += probes/ head-y := head.o extra-y += $(head-y) vmlinux.lds diff --git a/arch/arm64/kernel/vdso/vdso.S b/arch/arm64/kernel/vdso-wrap.S similarity index 100% rename from arch/arm64/kernel/vdso/vdso.S rename to arch/arm64/kernel/vdso-wrap.S diff --git a/arch/arm64/kernel/vdso/Makefile b/arch/arm64/kernel/vdso/Makefile index cd9c3fa25902..76c0255ecc91 100644 --- a/arch/arm64/kernel/vdso/Makefile +++ b/arch/arm64/kernel/vdso/Makefile @@ -44,7 +44,6 @@ endif # Disable gcov profiling for VDSO code GCOV_PROFILE := n -obj-y += vdso.o targets += vdso.lds CPPFLAGS_vdso.lds += -P -C -U$(ARCH) diff --git a/arch/arm64/kernel/vdso32/vdso.S b/arch/arm64/kernel/vdso32-wrap.S similarity index 100% rename from arch/arm64/kernel/vdso32/vdso.S rename to arch/arm64/kernel/vdso32-wrap.S diff --git a/arch/arm64/kernel/vdso32/Makefile b/arch/arm64/kernel/vdso32/Makefile index a1e0f91e6cea..789ad420f16b 100644 --- a/arch/arm64/kernel/vdso32/Makefile +++ b/arch/arm64/kernel/vdso32/Makefile @@ -155,7 +155,6 @@ c-obj-vdso-gettimeofday := $(addprefix $(obj)/, $(c-obj-vdso-gettimeofday)) asm-obj-vdso := $(addprefix $(obj)/, $(asm-obj-vdso)) obj-vdso := $(c-obj-vdso) $(c-obj-vdso-gettimeofday) $(asm-obj-vdso) -obj-y += vdso.o targets += vdso.lds CPPFLAGS_vdso.lds += -P -C -U$(ARCH) From f3cb097ad8888019e6d8777cb17bcee18ac6c2f6 Mon Sep 17 00:00:00 2001 From: John Millikin Date: Wed, 23 Dec 2020 14:10:56 +0900 Subject: [PATCH 006/108] arm64: Support running gen_vdso_offsets.sh with BSD userland. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BSD sed ignores whitespace character escape sequences such as '\t' in the replacement string, causing this script to produce the following incorrect output:   #define vdso_offset_sigtrampt0x089c Changing the hard tab to ' ' causes both BSD and GNU dialects of sed to produce equivalent output. Signed-off-by: John Millikin Link: https://lore.kernel.org/r/15147ffb-7e67-b607-266d-f56599ecafd1@john-millikin.com Signed-off-by: Will Deacon --- arch/arm64/kernel/vdso/gen_vdso_offsets.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kernel/vdso/gen_vdso_offsets.sh b/arch/arm64/kernel/vdso/gen_vdso_offsets.sh index 8b806eacd0a6..0387209d65b1 100755 --- a/arch/arm64/kernel/vdso/gen_vdso_offsets.sh +++ b/arch/arm64/kernel/vdso/gen_vdso_offsets.sh @@ -13,4 +13,4 @@ LC_ALL=C sed -n -e 's/^00*/0/' -e \ -'s/^\([0-9a-fA-F]*\) . VDSO_\([a-zA-Z0-9_]*\)$/\#define vdso_offset_\2\t0x\1/p' +'s/^\([0-9a-fA-F]*\) . VDSO_\([a-zA-Z0-9_]*\)$/\#define vdso_offset_\2 0x\1/p' From edb739eed8f37e2846641313ff1308e872a30d98 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Tue, 5 Jan 2021 16:54:11 +0530 Subject: [PATCH 007/108] arm64/mm: Add warning for outside range requests in vmemmap_populate() vmemmap_populate() does not validate the requested vmemmap address range to be inside the platform assigned space i.e [VMEMMAP_START..VMEMMAP_END] for vmemmap. Instead it would just go ahead and create the mapping which might then overlap with other sections in the kernel virtual address space. Just adding an warning here for range overrun which would help detect the problem earlier on, before a potential struct page corruption. This also makes vmemmap_populate() symmetrical with vmemmap_free() which already has a similar warning. Cc: Catalin Marinas Cc: Will Deacon Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Anshuman Khandual Link: https://lore.kernel.org/r/1609845851-25064-1-git-send-email-anshuman.khandual@arm.com Signed-off-by: Will Deacon --- arch/arm64/mm/mmu.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index ae0c3d023824..f938e8020523 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -1094,6 +1094,7 @@ static void free_empty_tables(unsigned long addr, unsigned long end, int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap) { + WARN_ON((start < VMEMMAP_START) || (end > VMEMMAP_END)); return vmemmap_populate_basepages(start, end, node, altmap); } #else /* !ARM64_SWAPPER_USES_SECTION_MAPS */ @@ -1107,6 +1108,7 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, pud_t *pudp; pmd_t *pmdp; + WARN_ON((start < VMEMMAP_START) || (end > VMEMMAP_END)); do { next = pmd_addr_end(addr, end); From 67c6bb56b649590a3f59c2a92331aa4e83d4534c Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 6 Jan 2021 10:34:49 +0000 Subject: [PATCH 008/108] firmware: smccc: Add SMCCC TRNG function call IDs The ARM architected TRNG firmware interface, described in ARM spec DEN0098, define an ARM SMCCC based interface to a true random number generator, provided by firmware. Add the definitions of the SMCCC functions as defined by the spec. Signed-off-by: Ard Biesheuvel Signed-off-by: Andre Przywara Reviewed-by: Linus Walleij Reviewed-by: Sudeep Holla Link: https://lore.kernel.org/r/20210106103453.152275-2-andre.przywara@arm.com Signed-off-by: Will Deacon --- include/linux/arm-smccc.h | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h index f860645f6512..62c54234576c 100644 --- a/include/linux/arm-smccc.h +++ b/include/linux/arm-smccc.h @@ -102,6 +102,37 @@ ARM_SMCCC_OWNER_STANDARD_HYP, \ 0x21) +/* TRNG entropy source calls (defined by ARM DEN0098) */ +#define ARM_SMCCC_TRNG_VERSION \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_32, \ + ARM_SMCCC_OWNER_STANDARD, \ + 0x50) + +#define ARM_SMCCC_TRNG_FEATURES \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_32, \ + ARM_SMCCC_OWNER_STANDARD, \ + 0x51) + +#define ARM_SMCCC_TRNG_GET_UUID \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_32, \ + ARM_SMCCC_OWNER_STANDARD, \ + 0x52) + +#define ARM_SMCCC_TRNG_RND32 \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_32, \ + ARM_SMCCC_OWNER_STANDARD, \ + 0x53) + +#define ARM_SMCCC_TRNG_RND64 \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_64, \ + ARM_SMCCC_OWNER_STANDARD, \ + 0x53) + /* * Return codes defined in ARM DEN 0070A * ARM DEN 0070A is now merged/consolidated into ARM DEN 0028 C From dd313a2653d442f015e82706f6b185a881772101 Mon Sep 17 00:00:00 2001 From: YANG LI Date: Mon, 11 Jan 2021 17:35:37 +0800 Subject: [PATCH 009/108] arm64: mte: style: Simplify bool comparison Fix the following coccicheck warning: ./tools/testing/selftests/arm64/mte/check_buffer_fill.c:84:12-35: WARNING: Comparison to bool Signed-off-by: YANG LI Reported-by: Abaci Robot Link: https://lore.kernel.org/r/1610357737-68678-1-git-send-email-abaci-bugfix@linux.alibaba.com Signed-off-by: Will Deacon --- tools/testing/selftests/arm64/mte/check_buffer_fill.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/arm64/mte/check_buffer_fill.c b/tools/testing/selftests/arm64/mte/check_buffer_fill.c index c9fa141ebdcc..75fc482d63b6 100644 --- a/tools/testing/selftests/arm64/mte/check_buffer_fill.c +++ b/tools/testing/selftests/arm64/mte/check_buffer_fill.c @@ -81,7 +81,7 @@ static int check_buffer_underflow_by_byte(int mem_type, int mode, last_index = 0; /* Set some value in tagged memory and make the buffer underflow */ for (j = sizes[i] - 1; (j >= -underflow_range) && - (cur_mte_cxt.fault_valid == false); j--) { + (!cur_mte_cxt.fault_valid); j--) { ptr[j] = '1'; last_index = j; } From 6106e1112cc69a367f495da2e66f13e2bca369fb Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 13 Jan 2021 17:31:55 +0000 Subject: [PATCH 010/108] arm64: remove EL0 exception frame record When entering an exception from EL0, the entry code creates a synthetic frame record with a NULL PC. This was used by the code introduced in commit: 7326749801396105 ("arm64: unwind: reference pt_regs via embedded stack frame") ... to discover exception entries on the stack and dump the associated pt_regs. Since the NULL PC was undesirable for the stacktrace, we added a special case to unwind_frame() to prevent the NULL PC from being logged. Since commit: a25ffd3a6302a678 ("arm64: traps: Don't print stack or raw PC/LR values in backtraces") ... we no longer try to dump the pt_regs as part of a stacktrace, and hence no longer need the synthetic exception record. This patch removes the synthetic exception record and the associated special case in unwind_frame(). Instead, EL0 exceptions set the FP to NULL, as is the case for other terminal records (e.g. when a kernel thread starts). The synthetic record for exceptions from EL1 is retrained as this has useful unwind information for the interrupted context. To make the terminal case a bit clearer, an explicit check is added to the start of unwind_frame(). This would otherwise be caught implicitly by the on_accessible_stack() checks. Reported-by: Mark Brown Signed-off-by: Mark Rutland Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20210113173155.43063-1-broonie@kernel.org Signed-off-by: Will Deacon --- arch/arm64/kernel/entry.S | 10 +++++----- arch/arm64/kernel/stacktrace.c | 13 ++++--------- 2 files changed, 9 insertions(+), 14 deletions(-) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index c9bae73f2621..89fd3907816e 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -261,16 +261,16 @@ alternative_else_nop_endif stp lr, x21, [sp, #S_LR] /* - * In order to be able to dump the contents of struct pt_regs at the - * time the exception was taken (in case we attempt to walk the call - * stack later), chain it together with the stack frames. + * For exceptions from EL0, terminate the callchain here. + * For exceptions from EL1, create a synthetic frame record so the + * interrupted code shows up in the backtrace. */ .if \el == 0 - stp xzr, xzr, [sp, #S_STACKFRAME] + mov x29, xzr .else stp x29, x22, [sp, #S_STACKFRAME] - .endif add x29, sp, #S_STACKFRAME + .endif #ifdef CONFIG_ARM64_SW_TTBR0_PAN alternative_if_not ARM64_HAS_PAN diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index fa56af1a59c3..0fb42129b469 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -44,6 +44,10 @@ int notrace unwind_frame(struct task_struct *tsk, struct stackframe *frame) unsigned long fp = frame->fp; struct stack_info info; + /* Terminal record; nothing to unwind */ + if (!fp) + return -EINVAL; + if (fp & 0xf) return -EINVAL; @@ -104,15 +108,6 @@ int notrace unwind_frame(struct task_struct *tsk, struct stackframe *frame) frame->pc = ptrauth_strip_insn_pac(frame->pc); - /* - * Frames created upon entry from EL0 have NULL FP and PC values, so - * don't bother reporting these. Frames created by __noreturn functions - * might have a valid FP even if PC is bogus, so only terminate where - * both are NULL. - */ - if (!frame->fp && !frame->pc) - return -EINVAL; - return 0; } NOKPROBE_SYMBOL(unwind_frame); From 384e5699e101b1ee184590a39ee60f10ec0d36b6 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Fri, 8 Jan 2021 16:46:51 +0530 Subject: [PATCH 011/108] arm64: topology: Avoid the have_policy check Every time I have stumbled upon this routine, I get confused with the way 'have_policy' is used and I have to dig in to understand why is it so. Here is an attempt to make it easier to understand, and hopefully it is an improvement. The 'have_policy' check was just an optimization to avoid writing to amu_fie_cpus in case we don't have to, but that optimization itself is creating more confusion than the real work. Lets just do that if all the CPUs support AMUs. It is much cleaner that way. Reviewed-by: Ionela Voinescu Signed-off-by: Viresh Kumar Tested-by: Ionela Voinescu Link: https://lore.kernel.org/r/c125766c4be93461772015ac7c9a6ae45d5756f6.1610104461.git.viresh.kumar@linaro.org Signed-off-by: Will Deacon --- arch/arm64/kernel/topology.c | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index f6faa697e83e..ebadc73449f9 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -199,14 +199,14 @@ static int freq_inv_set_max_ratio(int cpu, u64 max_rate, u64 ref_rate) return 0; } -static inline bool +static inline void enable_policy_freq_counters(int cpu, cpumask_var_t valid_cpus) { struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); if (!policy) { pr_debug("CPU%d: No cpufreq policy found.\n", cpu); - return false; + return; } if (cpumask_subset(policy->related_cpus, valid_cpus)) @@ -214,8 +214,6 @@ enable_policy_freq_counters(int cpu, cpumask_var_t valid_cpus) amu_fie_cpus); cpufreq_cpu_put(policy); - - return true; } static DEFINE_STATIC_KEY_FALSE(amu_fie_key); @@ -225,7 +223,6 @@ static int __init init_amu_fie(void) { bool invariance_status = topology_scale_freq_invariant(); cpumask_var_t valid_cpus; - bool have_policy = false; int ret = 0; int cpu; @@ -245,17 +242,12 @@ static int __init init_amu_fie(void) continue; cpumask_set_cpu(cpu, valid_cpus); - have_policy |= enable_policy_freq_counters(cpu, valid_cpus); + enable_policy_freq_counters(cpu, valid_cpus); } - /* - * If we are not restricted by cpufreq policies, we only enable - * the use of the AMU feature for FIE if all CPUs support AMU. - * Otherwise, enable_policy_freq_counters has already enabled - * policy cpus. - */ - if (!have_policy && cpumask_equal(valid_cpus, cpu_present_mask)) - cpumask_or(amu_fie_cpus, amu_fie_cpus, valid_cpus); + /* Overwrite amu_fie_cpus if all CPUs support AMU */ + if (cpumask_equal(valid_cpus, cpu_present_mask)) + cpumask_copy(amu_fie_cpus, cpu_present_mask); if (!cpumask_empty(amu_fie_cpus)) { pr_info("CPUs[%*pbl]: counters will be used for FIE.", From 47b10b737c0794c2fa584d7c8103b485e274ed51 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Fri, 8 Jan 2021 16:46:52 +0530 Subject: [PATCH 012/108] arm64: topology: Reorder init_amu_fie() a bit This patch does a couple of optimizations in init_amu_fie(), like early exits from paths where we don't need to continue any further, avoid the enable/disable dance, moving the calls to topology_scale_freq_invariant() just when we need them, instead of at the top of the routine, and avoiding calling it for the third time. Reviewed-by: Ionela Voinescu Signed-off-by: Viresh Kumar Tested-by: Ionela Voinescu Link: https://lore.kernel.org/r/a732e71ab9ec28c354eb28dd898c9b47d490863f.1610104461.git.viresh.kumar@linaro.org Signed-off-by: Will Deacon --- arch/arm64/kernel/topology.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index ebadc73449f9..57267d694495 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -221,8 +221,8 @@ static DEFINE_STATIC_KEY_FALSE(amu_fie_key); static int __init init_amu_fie(void) { - bool invariance_status = topology_scale_freq_invariant(); cpumask_var_t valid_cpus; + bool invariant; int ret = 0; int cpu; @@ -249,18 +249,19 @@ static int __init init_amu_fie(void) if (cpumask_equal(valid_cpus, cpu_present_mask)) cpumask_copy(amu_fie_cpus, cpu_present_mask); - if (!cpumask_empty(amu_fie_cpus)) { - pr_info("CPUs[%*pbl]: counters will be used for FIE.", - cpumask_pr_args(amu_fie_cpus)); - static_branch_enable(&amu_fie_key); - } + if (cpumask_empty(amu_fie_cpus)) + goto free_valid_mask; - /* - * If the system is not fully invariant after AMU init, disable - * partial use of counters for frequency invariance. - */ - if (!topology_scale_freq_invariant()) - static_branch_disable(&amu_fie_key); + invariant = topology_scale_freq_invariant(); + + /* We aren't fully invariant yet */ + if (!invariant && !cpumask_equal(amu_fie_cpus, cpu_present_mask)) + goto free_valid_mask; + + static_branch_enable(&amu_fie_key); + + pr_info("CPUs[%*pbl]: counters will be used for FIE.", + cpumask_pr_args(amu_fie_cpus)); /* * Task scheduler behavior depends on frequency invariance support, @@ -268,7 +269,7 @@ static int __init init_amu_fie(void) * a result of counter initialisation and use, retrigger the build of * scheduling domains to ensure the information is propagated properly. */ - if (invariance_status != topology_scale_freq_invariant()) + if (!invariant) rebuild_sched_domains_energy(); free_valid_mask: From a5f1b187cd24f7da35eb7a48fc50839c554d52a2 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Fri, 8 Jan 2021 16:46:53 +0530 Subject: [PATCH 013/108] arm64: topology: Make AMUs work with modular cpufreq drivers The AMU counters won't get used today if the cpufreq driver is built as a module as the amu core requires everything to be ready by late init. Fix that properly by registering for cpufreq policy notifier. Note that the amu core don't have any cpufreq dependency after the first time CPUFREQ_CREATE_POLICY notifier is called for all the CPUs. And so we don't need to do anything on the CPUFREQ_REMOVE_POLICY notifier. And for the same reason we check if the CPUs are already parsed in the beginning of amu_fie_setup() and skip if that is true. Alternatively we can shoot a work from there to unregister the notifier instead, but that seemed too much instead of this simple check. While at it, convert the print message to pr_debug instead of pr_info. Signed-off-by: Viresh Kumar Reviewed-by: Ionela Voinescu Tested-by: Ionela Voinescu Link: https://lore.kernel.org/r/89c1921334443e133c9c8791b4693607d65ed9f5.1610104461.git.viresh.kumar@linaro.org Signed-off-by: Will Deacon --- arch/arm64/kernel/topology.c | 92 +++++++++++++++++++----------------- 1 file changed, 48 insertions(+), 44 deletions(-) diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index 57267d694495..e08a4126453a 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -199,69 +199,38 @@ static int freq_inv_set_max_ratio(int cpu, u64 max_rate, u64 ref_rate) return 0; } -static inline void -enable_policy_freq_counters(int cpu, cpumask_var_t valid_cpus) -{ - struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); - - if (!policy) { - pr_debug("CPU%d: No cpufreq policy found.\n", cpu); - return; - } - - if (cpumask_subset(policy->related_cpus, valid_cpus)) - cpumask_or(amu_fie_cpus, policy->related_cpus, - amu_fie_cpus); - - cpufreq_cpu_put(policy); -} - static DEFINE_STATIC_KEY_FALSE(amu_fie_key); #define amu_freq_invariant() static_branch_unlikely(&amu_fie_key) -static int __init init_amu_fie(void) +static void amu_fie_setup(const struct cpumask *cpus) { - cpumask_var_t valid_cpus; bool invariant; - int ret = 0; int cpu; - if (!zalloc_cpumask_var(&valid_cpus, GFP_KERNEL)) - return -ENOMEM; + /* We are already set since the last insmod of cpufreq driver */ + if (unlikely(cpumask_subset(cpus, amu_fie_cpus))) + return; - if (!zalloc_cpumask_var(&amu_fie_cpus, GFP_KERNEL)) { - ret = -ENOMEM; - goto free_valid_mask; - } - - for_each_present_cpu(cpu) { + for_each_cpu(cpu, cpus) { if (!freq_counters_valid(cpu) || freq_inv_set_max_ratio(cpu, cpufreq_get_hw_max_freq(cpu) * 1000, arch_timer_get_rate())) - continue; - - cpumask_set_cpu(cpu, valid_cpus); - enable_policy_freq_counters(cpu, valid_cpus); + return; } - /* Overwrite amu_fie_cpus if all CPUs support AMU */ - if (cpumask_equal(valid_cpus, cpu_present_mask)) - cpumask_copy(amu_fie_cpus, cpu_present_mask); - - if (cpumask_empty(amu_fie_cpus)) - goto free_valid_mask; + cpumask_or(amu_fie_cpus, amu_fie_cpus, cpus); invariant = topology_scale_freq_invariant(); /* We aren't fully invariant yet */ if (!invariant && !cpumask_equal(amu_fie_cpus, cpu_present_mask)) - goto free_valid_mask; + return; static_branch_enable(&amu_fie_key); - pr_info("CPUs[%*pbl]: counters will be used for FIE.", - cpumask_pr_args(amu_fie_cpus)); + pr_debug("CPUs[%*pbl]: counters will be used for FIE.", + cpumask_pr_args(cpus)); /* * Task scheduler behavior depends on frequency invariance support, @@ -271,13 +240,48 @@ static int __init init_amu_fie(void) */ if (!invariant) rebuild_sched_domains_energy(); +} -free_valid_mask: - free_cpumask_var(valid_cpus); +static int init_amu_fie_callback(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct cpufreq_policy *policy = data; + + if (val == CPUFREQ_CREATE_POLICY) + amu_fie_setup(policy->related_cpus); + + /* + * We don't need to handle CPUFREQ_REMOVE_POLICY event as the AMU + * counters don't have any dependency on cpufreq driver once we have + * initialized AMU support and enabled invariance. The AMU counters will + * keep on working just fine in the absence of the cpufreq driver, and + * for the CPUs for which there are no counters available, the last set + * value of freq_scale will remain valid as that is the frequency those + * CPUs are running at. + */ + + return 0; +} + +static struct notifier_block init_amu_fie_notifier = { + .notifier_call = init_amu_fie_callback, +}; + +static int __init init_amu_fie(void) +{ + int ret; + + if (!zalloc_cpumask_var(&amu_fie_cpus, GFP_KERNEL)) + return -ENOMEM; + + ret = cpufreq_register_notifier(&init_amu_fie_notifier, + CPUFREQ_POLICY_NOTIFIER); + if (ret) + free_cpumask_var(amu_fie_cpus); return ret; } -late_initcall_sync(init_amu_fie); +core_initcall(init_amu_fie); bool arch_freq_counters_available(const struct cpumask *cpus) { From f9ce0be71d1fbb038ada15ced83474b0e63f264d Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Sat, 19 Dec 2020 15:19:23 +0300 Subject: [PATCH 014/108] mm: Cleanup faultaround and finish_fault() codepaths alloc_set_pte() has two users with different requirements: in the faultaround code, it called from an atomic context and PTE page table has to be preallocated. finish_fault() can sleep and allocate page table as needed. PTL locking rules are also strange, hard to follow and overkill for finish_fault(). Let's untangle the mess. alloc_set_pte() has gone now. All locking is explicit. The price is some code duplication to handle huge pages in faultaround path, but it should be fine, having overall improvement in readability. Link: https://lore.kernel.org/r/20201229132819.najtavneutnf7ajp@box Signed-off-by: Kirill A. Shutemov [will: s/from from/from/ in comment; spotted by willy] Signed-off-by: Will Deacon --- fs/xfs/xfs_file.c | 6 +- include/linux/mm.h | 12 ++- include/linux/pgtable.h | 11 +++ mm/filemap.c | 177 ++++++++++++++++++++++++++--------- mm/memory.c | 199 ++++++++++++---------------------------- 5 files changed, 213 insertions(+), 192 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 5b0f93f73837..111fe73bb8a7 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1319,17 +1319,19 @@ xfs_filemap_pfn_mkwrite( return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true); } -static void +static vm_fault_t xfs_filemap_map_pages( struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff) { struct inode *inode = file_inode(vmf->vma->vm_file); + vm_fault_t ret; xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - filemap_map_pages(vmf, start_pgoff, end_pgoff); + ret = filemap_map_pages(vmf, start_pgoff, end_pgoff); xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); + return ret; } static const struct vm_operations_struct xfs_file_vm_ops = { diff --git a/include/linux/mm.h b/include/linux/mm.h index ecdf8a8cd6ae..4572a9bc5862 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -542,8 +542,8 @@ struct vm_fault { * is not NULL, otherwise pmd. */ pgtable_t prealloc_pte; /* Pre-allocated pte page table. - * vm_ops->map_pages() calls - * alloc_set_pte() from atomic context. + * vm_ops->map_pages() sets up a page + * table from atomic context. * do_fault_around() pre-allocates * page table to avoid allocation from * atomic context. @@ -578,7 +578,7 @@ struct vm_operations_struct { vm_fault_t (*fault)(struct vm_fault *vmf); vm_fault_t (*huge_fault)(struct vm_fault *vmf, enum page_entry_size pe_size); - void (*map_pages)(struct vm_fault *vmf, + vm_fault_t (*map_pages)(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); unsigned long (*pagesize)(struct vm_area_struct * area); @@ -988,7 +988,9 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) return pte; } -vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page); +vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page); +void do_set_pte(struct vm_fault *vmf, struct page *page); + vm_fault_t finish_fault(struct vm_fault *vmf); vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf); #endif @@ -2622,7 +2624,7 @@ extern void truncate_inode_pages_final(struct address_space *); /* generic vm_area_ops exported for stackable file systems */ extern vm_fault_t filemap_fault(struct vm_fault *vmf); -extern void filemap_map_pages(struct vm_fault *vmf, +extern vm_fault_t filemap_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); extern vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf); diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 8fcdfa52eb4b..36eb748f3c97 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1314,6 +1314,17 @@ static inline int pmd_trans_unstable(pmd_t *pmd) #endif } +/* + * the ordering of these checks is important for pmds with _page_devmap set. + * if we check pmd_trans_unstable() first we will trip the bad_pmd() check + * inside of pmd_none_or_trans_huge_or_clear_bad(). this will end up correctly + * returning 1 but not before it spams dmesg with the pmd_clear_bad() output. + */ +static inline int pmd_devmap_trans_unstable(pmd_t *pmd) +{ + return pmd_devmap(*pmd) || pmd_trans_unstable(pmd); +} + #ifndef CONFIG_NUMA_BALANCING /* * Technically a PTE can be PROTNONE even when not doing NUMA balancing but diff --git a/mm/filemap.c b/mm/filemap.c index 5c9d564317a5..c1f2dc89b8a7 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -42,6 +42,7 @@ #include #include #include +#include #include "internal.h" #define CREATE_TRACE_POINTS @@ -2911,74 +2912,164 @@ vm_fault_t filemap_fault(struct vm_fault *vmf) } EXPORT_SYMBOL(filemap_fault); -void filemap_map_pages(struct vm_fault *vmf, - pgoff_t start_pgoff, pgoff_t end_pgoff) +static bool filemap_map_pmd(struct vm_fault *vmf, struct page *page) { - struct file *file = vmf->vma->vm_file; + struct mm_struct *mm = vmf->vma->vm_mm; + + /* Huge page is mapped? No need to proceed. */ + if (pmd_trans_huge(*vmf->pmd)) { + unlock_page(page); + put_page(page); + return true; + } + + if (pmd_none(*vmf->pmd) && PageTransHuge(page)) { + vm_fault_t ret = do_set_pmd(vmf, page); + if (!ret) { + /* The page is mapped successfully, reference consumed. */ + unlock_page(page); + return true; + } + } + + if (pmd_none(*vmf->pmd)) { + vmf->ptl = pmd_lock(mm, vmf->pmd); + if (likely(pmd_none(*vmf->pmd))) { + mm_inc_nr_ptes(mm); + pmd_populate(mm, vmf->pmd, vmf->prealloc_pte); + vmf->prealloc_pte = NULL; + } + spin_unlock(vmf->ptl); + } + + /* See comment in handle_pte_fault() */ + if (pmd_devmap_trans_unstable(vmf->pmd)) { + unlock_page(page); + put_page(page); + return true; + } + + return false; +} + +static struct page *next_uptodate_page(struct page *page, + struct address_space *mapping, + struct xa_state *xas, pgoff_t end_pgoff) +{ + unsigned long max_idx; + + do { + if (!page) + return NULL; + if (xas_retry(xas, page)) + continue; + if (xa_is_value(page)) + continue; + if (PageLocked(page)) + continue; + if (!page_cache_get_speculative(page)) + continue; + /* Has the page moved or been split? */ + if (unlikely(page != xas_reload(xas))) + goto skip; + if (!PageUptodate(page) || PageReadahead(page)) + goto skip; + if (PageHWPoison(page)) + goto skip; + if (!trylock_page(page)) + goto skip; + if (page->mapping != mapping) + goto unlock; + if (!PageUptodate(page)) + goto unlock; + max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE); + if (xas->xa_index >= max_idx) + goto unlock; + return page; +unlock: + unlock_page(page); +skip: + put_page(page); + } while ((page = xas_next_entry(xas, end_pgoff)) != NULL); + + return NULL; +} + +static inline struct page *first_map_page(struct address_space *mapping, + struct xa_state *xas, + pgoff_t end_pgoff) +{ + return next_uptodate_page(xas_find(xas, end_pgoff), + mapping, xas, end_pgoff); +} + +static inline struct page *next_map_page(struct address_space *mapping, + struct xa_state *xas, + pgoff_t end_pgoff) +{ + return next_uptodate_page(xas_next_entry(xas, end_pgoff), + mapping, xas, end_pgoff); +} + +vm_fault_t filemap_map_pages(struct vm_fault *vmf, + pgoff_t start_pgoff, pgoff_t end_pgoff) +{ + struct vm_area_struct *vma = vmf->vma; + struct file *file = vma->vm_file; struct address_space *mapping = file->f_mapping; pgoff_t last_pgoff = start_pgoff; - unsigned long max_idx; + unsigned long address = vmf->address; XA_STATE(xas, &mapping->i_pages, start_pgoff); struct page *head, *page; unsigned int mmap_miss = READ_ONCE(file->f_ra.mmap_miss); + vm_fault_t ret = 0; rcu_read_lock(); - xas_for_each(&xas, head, end_pgoff) { - if (xas_retry(&xas, head)) - continue; - if (xa_is_value(head)) - goto next; + head = first_map_page(mapping, &xas, end_pgoff); + if (!head) + goto out; - /* - * Check for a locked page first, as a speculative - * reference may adversely influence page migration. - */ - if (PageLocked(head)) - goto next; - if (!page_cache_get_speculative(head)) - goto next; + if (filemap_map_pmd(vmf, head)) { + ret = VM_FAULT_NOPAGE; + goto out; + } - /* Has the page moved or been split? */ - if (unlikely(head != xas_reload(&xas))) - goto skip; + vmf->address = vma->vm_start + ((start_pgoff - vma->vm_pgoff) << PAGE_SHIFT); + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); + do { page = find_subpage(head, xas.xa_index); - - if (!PageUptodate(head) || - PageReadahead(page) || - PageHWPoison(page)) - goto skip; - if (!trylock_page(head)) - goto skip; - - if (head->mapping != mapping || !PageUptodate(head)) - goto unlock; - - max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE); - if (xas.xa_index >= max_idx) + if (PageHWPoison(page)) goto unlock; if (mmap_miss > 0) mmap_miss--; vmf->address += (xas.xa_index - last_pgoff) << PAGE_SHIFT; - if (vmf->pte) - vmf->pte += xas.xa_index - last_pgoff; + vmf->pte += xas.xa_index - last_pgoff; last_pgoff = xas.xa_index; - if (alloc_set_pte(vmf, page)) + + if (!pte_none(*vmf->pte)) goto unlock; + + do_set_pte(vmf, page); + /* no need to invalidate: a not-present page won't be cached */ + update_mmu_cache(vma, vmf->address, vmf->pte); unlock_page(head); - goto next; + + /* The fault is handled */ + if (vmf->address == address) + ret = VM_FAULT_NOPAGE; + continue; unlock: unlock_page(head); -skip: put_page(head); -next: - /* Huge page is mapped? No need to proceed. */ - if (pmd_trans_huge(*vmf->pmd)) - break; - } + } while ((head = next_map_page(mapping, &xas, end_pgoff)) != NULL); + pte_unmap_unlock(vmf->pte, vmf->ptl); +out: rcu_read_unlock(); + vmf->address = address; WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss); + return ret; } EXPORT_SYMBOL(filemap_map_pages); diff --git a/mm/memory.c b/mm/memory.c index feff48e1465a..3e2fc2950ad7 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3503,7 +3503,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) if (pte_alloc(vma->vm_mm, vmf->pmd)) return VM_FAULT_OOM; - /* See the comment in pte_alloc_one_map() */ + /* See comment in handle_pte_fault() */ if (unlikely(pmd_trans_unstable(vmf->pmd))) return 0; @@ -3643,66 +3643,6 @@ static vm_fault_t __do_fault(struct vm_fault *vmf) return ret; } -/* - * The ordering of these checks is important for pmds with _PAGE_DEVMAP set. - * If we check pmd_trans_unstable() first we will trip the bad_pmd() check - * inside of pmd_none_or_trans_huge_or_clear_bad(). This will end up correctly - * returning 1 but not before it spams dmesg with the pmd_clear_bad() output. - */ -static int pmd_devmap_trans_unstable(pmd_t *pmd) -{ - return pmd_devmap(*pmd) || pmd_trans_unstable(pmd); -} - -static vm_fault_t pte_alloc_one_map(struct vm_fault *vmf) -{ - struct vm_area_struct *vma = vmf->vma; - - if (!pmd_none(*vmf->pmd)) - goto map_pte; - if (vmf->prealloc_pte) { - vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); - if (unlikely(!pmd_none(*vmf->pmd))) { - spin_unlock(vmf->ptl); - goto map_pte; - } - - mm_inc_nr_ptes(vma->vm_mm); - pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte); - spin_unlock(vmf->ptl); - vmf->prealloc_pte = NULL; - } else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd))) { - return VM_FAULT_OOM; - } -map_pte: - /* - * If a huge pmd materialized under us just retry later. Use - * pmd_trans_unstable() via pmd_devmap_trans_unstable() instead of - * pmd_trans_huge() to ensure the pmd didn't become pmd_trans_huge - * under us and then back to pmd_none, as a result of MADV_DONTNEED - * running immediately after a huge pmd fault in a different thread of - * this mm, in turn leading to a misleading pmd_trans_huge() retval. - * All we have to ensure is that it is a regular pmd that we can walk - * with pte_offset_map() and we can do that through an atomic read in - * C, which is what pmd_trans_unstable() provides. - */ - if (pmd_devmap_trans_unstable(vmf->pmd)) - return VM_FAULT_NOPAGE; - - /* - * At this point we know that our vmf->pmd points to a page of ptes - * and it cannot become pmd_none(), pmd_devmap() or pmd_trans_huge() - * for the duration of the fault. If a racing MADV_DONTNEED runs and - * we zap the ptes pointed to by our vmf->pmd, the vmf->ptl will still - * be valid and we will re-check to make sure the vmf->pte isn't - * pte_none() under vmf->ptl protection when we return to - * alloc_set_pte(). - */ - vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, - &vmf->ptl); - return 0; -} - #ifdef CONFIG_TRANSPARENT_HUGEPAGE static void deposit_prealloc_pte(struct vm_fault *vmf) { @@ -3717,7 +3657,7 @@ static void deposit_prealloc_pte(struct vm_fault *vmf) vmf->prealloc_pte = NULL; } -static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) +vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) { struct vm_area_struct *vma = vmf->vma; bool write = vmf->flags & FAULT_FLAG_WRITE; @@ -3775,52 +3715,17 @@ static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) return ret; } #else -static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) +vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) { - BUILD_BUG(); - return 0; + return VM_FAULT_FALLBACK; } #endif -/** - * alloc_set_pte - setup new PTE entry for given page and add reverse page - * mapping. If needed, the function allocates page table or use pre-allocated. - * - * @vmf: fault environment - * @page: page to map - * - * Caller must take care of unlocking vmf->ptl, if vmf->pte is non-NULL on - * return. - * - * Target users are page handler itself and implementations of - * vm_ops->map_pages. - * - * Return: %0 on success, %VM_FAULT_ code in case of error. - */ -vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page) +void do_set_pte(struct vm_fault *vmf, struct page *page) { struct vm_area_struct *vma = vmf->vma; bool write = vmf->flags & FAULT_FLAG_WRITE; pte_t entry; - vm_fault_t ret; - - if (pmd_none(*vmf->pmd) && PageTransCompound(page)) { - ret = do_set_pmd(vmf, page); - if (ret != VM_FAULT_FALLBACK) - return ret; - } - - if (!vmf->pte) { - ret = pte_alloc_one_map(vmf); - if (ret) - return ret; - } - - /* Re-check under ptl */ - if (unlikely(!pte_none(*vmf->pte))) { - update_mmu_tlb(vma, vmf->address, vmf->pte); - return VM_FAULT_NOPAGE; - } flush_icache_page(vma, page); entry = mk_pte(page, vma->vm_page_prot); @@ -3837,14 +3742,8 @@ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page) page_add_file_rmap(page, false); } set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry); - - /* no need to invalidate: a not-present page won't be cached */ - update_mmu_cache(vma, vmf->address, vmf->pte); - - return 0; } - /** * finish_fault - finish page fault once we have prepared the page to fault * @@ -3862,12 +3761,12 @@ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page) */ vm_fault_t finish_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct page *page; - vm_fault_t ret = 0; + vm_fault_t ret; /* Did we COW the page? */ - if ((vmf->flags & FAULT_FLAG_WRITE) && - !(vmf->vma->vm_flags & VM_SHARED)) + if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) page = vmf->cow_page; else page = vmf->page; @@ -3876,12 +3775,38 @@ vm_fault_t finish_fault(struct vm_fault *vmf) * check even for read faults because we might have lost our CoWed * page */ - if (!(vmf->vma->vm_flags & VM_SHARED)) - ret = check_stable_address_space(vmf->vma->vm_mm); - if (!ret) - ret = alloc_set_pte(vmf, page); - if (vmf->pte) - pte_unmap_unlock(vmf->pte, vmf->ptl); + if (!(vma->vm_flags & VM_SHARED)) { + ret = check_stable_address_space(vma->vm_mm); + if (ret) + return ret; + } + + if (pmd_none(*vmf->pmd)) { + if (PageTransCompound(page)) { + ret = do_set_pmd(vmf, page); + if (ret != VM_FAULT_FALLBACK) + return ret; + } + + if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd))) + return VM_FAULT_OOM; + } + + /* See comment in handle_pte_fault() */ + if (pmd_devmap_trans_unstable(vmf->pmd)) + return 0; + + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, + vmf->address, &vmf->ptl); + ret = 0; + /* Re-check under ptl */ + if (likely(pte_none(*vmf->pte))) + do_set_pte(vmf, page); + else + ret = VM_FAULT_NOPAGE; + + update_mmu_tlb(vma, vmf->address, vmf->pte); + pte_unmap_unlock(vmf->pte, vmf->ptl); return ret; } @@ -3951,13 +3876,12 @@ static vm_fault_t do_fault_around(struct vm_fault *vmf) pgoff_t start_pgoff = vmf->pgoff; pgoff_t end_pgoff; int off; - vm_fault_t ret = 0; nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT; mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK; - vmf->address = max(address & mask, vmf->vma->vm_start); - off = ((address - vmf->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); + address = max(address & mask, vmf->vma->vm_start); + off = ((vmf->address - address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); start_pgoff -= off; /* @@ -3965,7 +3889,7 @@ static vm_fault_t do_fault_around(struct vm_fault *vmf) * the vma or nr_pages from start_pgoff, depending what is nearest. */ end_pgoff = start_pgoff - - ((vmf->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + + ((address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + PTRS_PER_PTE - 1; end_pgoff = min3(end_pgoff, vma_pages(vmf->vma) + vmf->vma->vm_pgoff - 1, start_pgoff + nr_pages - 1); @@ -3973,31 +3897,11 @@ static vm_fault_t do_fault_around(struct vm_fault *vmf) if (pmd_none(*vmf->pmd)) { vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm); if (!vmf->prealloc_pte) - goto out; + return VM_FAULT_OOM; smp_wmb(); /* See comment in __pte_alloc() */ } - vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff); - - /* Huge page is mapped? Page fault is solved */ - if (pmd_trans_huge(*vmf->pmd)) { - ret = VM_FAULT_NOPAGE; - goto out; - } - - /* ->map_pages() haven't done anything useful. Cold page cache? */ - if (!vmf->pte) - goto out; - - /* check if the page fault is solved */ - vmf->pte -= (vmf->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT); - if (!pte_none(*vmf->pte)) - ret = VM_FAULT_NOPAGE; - pte_unmap_unlock(vmf->pte, vmf->ptl); -out: - vmf->address = address; - vmf->pte = NULL; - return ret; + return vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff); } static vm_fault_t do_read_fault(struct vm_fault *vmf) @@ -4353,7 +4257,18 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf) */ vmf->pte = NULL; } else { - /* See comment in pte_alloc_one_map() */ + /* + * If a huge pmd materialized under us just retry later. Use + * pmd_trans_unstable() via pmd_devmap_trans_unstable() instead + * of pmd_trans_huge() to ensure the pmd didn't become + * pmd_trans_huge under us and then back to pmd_none, as a + * result of MADV_DONTNEED running immediately after a huge pmd + * fault in a different thread of this mm, in turn leading to a + * misleading pmd_trans_huge() retval. All we have to ensure is + * that it is a regular pmd that we can walk with + * pte_offset_map() and we can do that through an atomic read + * in C, which is what pmd_trans_unstable() provides. + */ if (pmd_devmap_trans_unstable(vmf->pmd)) return 0; /* From 46bdb4277f98e70d0c91f4289897ade533fe9e80 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 24 Nov 2020 18:48:26 +0000 Subject: [PATCH 015/108] mm: Allow architectures to request 'old' entries when prefaulting Commit 5c0a85fad949 ("mm: make faultaround produce old ptes") changed the "faultaround" behaviour to initialise prefaulted PTEs as 'old', since this avoids vmscan wrongly assuming that they are hot, despite having never been explicitly accessed by userspace. The change has been shown to benefit numerous arm64 micro-architectures (with hardware access flag) running Android, where both application launch latency and direct reclaim time are significantly reduced (by 10%+ and ~80% respectively). Unfortunately, commit 315d09bf30c2 ("Revert "mm: make faultaround produce old ptes"") reverted the change due to it being identified as the cause of a ~6% regression in unixbench on x86. Experiments on a variety of recent arm64 micro-architectures indicate that unixbench is not affected by the original commit, which appears to yield a 0-1% performance improvement. Since one size does not fit all for the initial state of prefaulted PTEs, introduce arch_wants_old_prefaulted_pte(), which allows an architecture to opt-in to 'old' prefaulted PTEs at runtime based on whatever criteria it may have. Cc: Jan Kara Cc: Minchan Kim Cc: Andrew Morton Cc: Kirill A. Shutemov Cc: Linus Torvalds Reported-by: Vinayak Menon Signed-off-by: Will Deacon --- include/linux/mm.h | 5 ++++- mm/filemap.c | 14 ++++++++++---- mm/memory.c | 20 +++++++++++++++++++- 3 files changed, 33 insertions(+), 6 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 4572a9bc5862..251a2339befb 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -434,6 +434,7 @@ extern pgprot_t protection_map[16]; * @FAULT_FLAG_REMOTE: The fault is not for current task/mm. * @FAULT_FLAG_INSTRUCTION: The fault was during an instruction fetch. * @FAULT_FLAG_INTERRUPTIBLE: The fault can be interrupted by non-fatal signals. + * @FAULT_FLAG_PREFAULT: Fault was a prefault. * * About @FAULT_FLAG_ALLOW_RETRY and @FAULT_FLAG_TRIED: we can specify * whether we would allow page faults to retry by specifying these two @@ -464,6 +465,7 @@ extern pgprot_t protection_map[16]; #define FAULT_FLAG_REMOTE 0x80 #define FAULT_FLAG_INSTRUCTION 0x100 #define FAULT_FLAG_INTERRUPTIBLE 0x200 +#define FAULT_FLAG_PREFAULT 0x400 /* * The default fault flags that should be used by most of the @@ -501,7 +503,8 @@ static inline bool fault_flag_allow_retry_first(unsigned int flags) { FAULT_FLAG_USER, "USER" }, \ { FAULT_FLAG_REMOTE, "REMOTE" }, \ { FAULT_FLAG_INSTRUCTION, "INSTRUCTION" }, \ - { FAULT_FLAG_INTERRUPTIBLE, "INTERRUPTIBLE" } + { FAULT_FLAG_INTERRUPTIBLE, "INTERRUPTIBLE" }, \ + { FAULT_FLAG_PREFAULT, "PREFAULT" } /* * vm_fault is filled by the pagefault handler and passed to the vma's diff --git a/mm/filemap.c b/mm/filemap.c index c1f2dc89b8a7..a6dc97906c8e 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3019,6 +3019,7 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, struct address_space *mapping = file->f_mapping; pgoff_t last_pgoff = start_pgoff; unsigned long address = vmf->address; + unsigned long flags = vmf->flags; XA_STATE(xas, &mapping->i_pages, start_pgoff); struct page *head, *page; unsigned int mmap_miss = READ_ONCE(file->f_ra.mmap_miss); @@ -3051,14 +3052,18 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, if (!pte_none(*vmf->pte)) goto unlock; + /* We're about to handle the fault */ + if (vmf->address == address) { + vmf->flags &= ~FAULT_FLAG_PREFAULT; + ret = VM_FAULT_NOPAGE; + } else { + vmf->flags |= FAULT_FLAG_PREFAULT; + } + do_set_pte(vmf, page); /* no need to invalidate: a not-present page won't be cached */ update_mmu_cache(vma, vmf->address, vmf->pte); unlock_page(head); - - /* The fault is handled */ - if (vmf->address == address) - ret = VM_FAULT_NOPAGE; continue; unlock: unlock_page(head); @@ -3067,6 +3072,7 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, pte_unmap_unlock(vmf->pte, vmf->ptl); out: rcu_read_unlock(); + vmf->flags = flags; vmf->address = address; WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss); return ret; diff --git a/mm/memory.c b/mm/memory.c index 3e2fc2950ad7..f0e7c589ca9d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -134,6 +134,18 @@ static inline bool arch_faults_on_old_pte(void) } #endif +#ifndef arch_wants_old_prefaulted_pte +static inline bool arch_wants_old_prefaulted_pte(void) +{ + /* + * Transitioning a PTE from 'old' to 'young' can be expensive on + * some architectures, even if it's performed in hardware. By + * default, "false" means prefaulted entries will be 'young'. + */ + return false; +} +#endif + static int __init disable_randmaps(char *s) { randomize_va_space = 0; @@ -3725,11 +3737,17 @@ void do_set_pte(struct vm_fault *vmf, struct page *page) { struct vm_area_struct *vma = vmf->vma; bool write = vmf->flags & FAULT_FLAG_WRITE; + bool prefault = vmf->flags & FAULT_FLAG_PREFAULT; pte_t entry; flush_icache_page(vma, page); entry = mk_pte(page, vma->vm_page_prot); - entry = pte_sw_mkyoung(entry); + + if (prefault && arch_wants_old_prefaulted_pte()) + entry = pte_mkold(entry); + else + entry = pte_sw_mkyoung(entry); + if (write) entry = maybe_mkwrite(pte_mkdirty(entry), vma); /* copy-on-write page */ From 0388f9c7433024ccf3909b3404a745100f006a17 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 24 Nov 2020 18:49:26 +0000 Subject: [PATCH 016/108] arm64: mm: Implement arch_wants_old_prefaulted_pte() On CPUs with hardware AF/DBM, initialising prefaulted PTEs as 'old' improves vmscan behaviour and does not appear to introduce any overhead elsewhere. Implement arch_wants_old_prefaulted_pte() to return 'true' if we detect hardware access flag support at runtime. This can be extended in future based on MIDR matching if necessary. Cc: Catalin Marinas Signed-off-by: Will Deacon --- arch/arm64/include/asm/pgtable.h | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 501562793ce2..e17b96d0e4b5 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -980,7 +980,17 @@ static inline bool arch_faults_on_old_pte(void) return !cpu_has_hw_af(); } -#define arch_faults_on_old_pte arch_faults_on_old_pte +#define arch_faults_on_old_pte arch_faults_on_old_pte + +/* + * Experimentally, it's cheap to set the access flag in hardware and we + * benefit from prefaulting mappings as 'old' to start with. + */ +static inline bool arch_wants_old_prefaulted_pte(void) +{ + return !arch_faults_on_old_pte(); +} +#define arch_wants_old_prefaulted_pte arch_wants_old_prefaulted_pte #endif /* !__ASSEMBLY__ */ From 4a669e2432fce9c01522a8453460e89f877dccd4 Mon Sep 17 00:00:00 2001 From: Wei Li Date: Thu, 3 Dec 2020 22:16:09 +0800 Subject: [PATCH 017/108] drivers/perf: Add support for ARMv8.3-SPE Armv8.3 extends the SPE by adding: - Alignment field in the Events packet, and filtering on this event using PMSEVFR_EL1. - Support for the Scalable Vector Extension (SVE). The main additions for SVE are: - Recording the vector length for SVE operations in the Operation Type packet. It is not possible to filter on vector length. - Incomplete predicate and empty predicate fields in the Events packet, and filtering on these events using PMSEVFR_EL1. Update the check of pmsevfr for empty/partial predicated SVE and alignment event in SPE driver. Signed-off-by: Wei Li Link: https://lore.kernel.org/r/20201203141609.14148-1-liwei391@huawei.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/sysreg.h | 9 ++++++++- drivers/perf/arm_spe_pmu.c | 17 +++++++++++++++-- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 8b5e7e5c3cc8..767bb2d47be9 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -291,7 +291,11 @@ #define SYS_PMSFCR_EL1_ST_SHIFT 18 #define SYS_PMSEVFR_EL1 sys_reg(3, 0, 9, 9, 5) -#define SYS_PMSEVFR_EL1_RES0 0x0000ffff00ff0f55UL +#define SYS_PMSEVFR_EL1_RES0_8_2 \ + (GENMASK_ULL(47, 32) | GENMASK_ULL(23, 16) | GENMASK_ULL(11, 8) |\ + BIT_ULL(6) | BIT_ULL(4) | BIT_ULL(2) | BIT_ULL(0)) +#define SYS_PMSEVFR_EL1_RES0_8_3 \ + (SYS_PMSEVFR_EL1_RES0_8_2 & ~(BIT_ULL(18) | BIT_ULL(17) | BIT_ULL(11))) #define SYS_PMSLATFR_EL1 sys_reg(3, 0, 9, 9, 6) #define SYS_PMSLATFR_EL1_MINLAT_SHIFT 0 @@ -844,6 +848,9 @@ #define ID_AA64DFR0_PMUVER_8_5 0x6 #define ID_AA64DFR0_PMUVER_IMP_DEF 0xf +#define ID_AA64DFR0_PMSVER_8_2 0x1 +#define ID_AA64DFR0_PMSVER_8_3 0x2 + #define ID_DFR0_PERFMON_SHIFT 24 #define ID_DFR0_PERFMON_8_1 0x4 diff --git a/drivers/perf/arm_spe_pmu.c b/drivers/perf/arm_spe_pmu.c index cc00915ad6d1..bce9aff9f546 100644 --- a/drivers/perf/arm_spe_pmu.c +++ b/drivers/perf/arm_spe_pmu.c @@ -54,7 +54,7 @@ struct arm_spe_pmu { struct hlist_node hotplug_node; int irq; /* PPI */ - + u16 pmsver; u16 min_period; u16 counter_sz; @@ -655,6 +655,18 @@ static irqreturn_t arm_spe_pmu_irq_handler(int irq, void *dev) return IRQ_HANDLED; } +static u64 arm_spe_pmsevfr_res0(u16 pmsver) +{ + switch (pmsver) { + case ID_AA64DFR0_PMSVER_8_2: + return SYS_PMSEVFR_EL1_RES0_8_2; + case ID_AA64DFR0_PMSVER_8_3: + /* Return the highest version we support in default */ + default: + return SYS_PMSEVFR_EL1_RES0_8_3; + } +} + /* Perf callbacks */ static int arm_spe_pmu_event_init(struct perf_event *event) { @@ -670,7 +682,7 @@ static int arm_spe_pmu_event_init(struct perf_event *event) !cpumask_test_cpu(event->cpu, &spe_pmu->supported_cpus)) return -ENOENT; - if (arm_spe_event_to_pmsevfr(event) & SYS_PMSEVFR_EL1_RES0) + if (arm_spe_event_to_pmsevfr(event) & arm_spe_pmsevfr_res0(spe_pmu->pmsver)) return -EOPNOTSUPP; if (attr->exclude_idle) @@ -937,6 +949,7 @@ static void __arm_spe_pmu_dev_probe(void *info) fld, smp_processor_id()); return; } + spe_pmu->pmsver = (u16)fld; /* Read PMBIDR first to determine whether or not we have access */ reg = read_sysreg_s(SYS_PMBIDR_EL1); From 30b34c4833ea7a1a48132d957052d79b6dcb1ebb Mon Sep 17 00:00:00 2001 From: Rikard Falkeborn Date: Sun, 17 Jan 2021 22:28:44 +0100 Subject: [PATCH 018/108] perf: qcom: Constify static struct attribute_group The only usage is to put their addresses in an array of pointers to const struct attribute group. Make them const to allow the compiler to put them in read-only memory. Signed-off-by: Rikard Falkeborn Link: https://lore.kernel.org/r/20210117212847.21319-2-rikard.falkeborn@gmail.com Signed-off-by: Will Deacon --- drivers/perf/qcom_l2_pmu.c | 6 +++--- drivers/perf/qcom_l3_pmu.c | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/perf/qcom_l2_pmu.c b/drivers/perf/qcom_l2_pmu.c index 23a0e008dafa..8883af955a2a 100644 --- a/drivers/perf/qcom_l2_pmu.c +++ b/drivers/perf/qcom_l2_pmu.c @@ -649,7 +649,7 @@ static struct attribute *l2_cache_pmu_cpumask_attrs[] = { NULL, }; -static struct attribute_group l2_cache_pmu_cpumask_group = { +static const struct attribute_group l2_cache_pmu_cpumask_group = { .attrs = l2_cache_pmu_cpumask_attrs, }; @@ -665,7 +665,7 @@ static struct attribute *l2_cache_pmu_formats[] = { NULL, }; -static struct attribute_group l2_cache_pmu_format_group = { +static const struct attribute_group l2_cache_pmu_format_group = { .name = "format", .attrs = l2_cache_pmu_formats, }; @@ -700,7 +700,7 @@ static struct attribute *l2_cache_pmu_events[] = { NULL }; -static struct attribute_group l2_cache_pmu_events_group = { +static const struct attribute_group l2_cache_pmu_events_group = { .name = "events", .attrs = l2_cache_pmu_events, }; diff --git a/drivers/perf/qcom_l3_pmu.c b/drivers/perf/qcom_l3_pmu.c index 9ddb577c542b..fb34b87b9471 100644 --- a/drivers/perf/qcom_l3_pmu.c +++ b/drivers/perf/qcom_l3_pmu.c @@ -630,7 +630,7 @@ static struct attribute *qcom_l3_cache_pmu_formats[] = { NULL, }; -static struct attribute_group qcom_l3_cache_pmu_format_group = { +static const struct attribute_group qcom_l3_cache_pmu_format_group = { .name = "format", .attrs = qcom_l3_cache_pmu_formats, }; @@ -663,7 +663,7 @@ static struct attribute *qcom_l3_cache_pmu_events[] = { NULL }; -static struct attribute_group qcom_l3_cache_pmu_events_group = { +static const struct attribute_group qcom_l3_cache_pmu_events_group = { .name = "events", .attrs = qcom_l3_cache_pmu_events, }; @@ -685,7 +685,7 @@ static struct attribute *qcom_l3_cache_pmu_cpumask_attrs[] = { NULL, }; -static struct attribute_group qcom_l3_cache_pmu_cpumask_attr_group = { +static const struct attribute_group qcom_l3_cache_pmu_cpumask_attr_group = { .attrs = qcom_l3_cache_pmu_cpumask_attrs, }; From 3cb7d2da183fec42974fa3fa795cc33d1e81322d Mon Sep 17 00:00:00 2001 From: Rikard Falkeborn Date: Sun, 17 Jan 2021 22:28:45 +0100 Subject: [PATCH 019/108] perf/imx_ddr: Constify static struct attribute_group The only usage is to put their addresses in an array of pointers to const struct attribute group. Make them const to allow the compiler to put them in read-only memory. Signed-off-by: Rikard Falkeborn Link: https://lore.kernel.org/r/20210117212847.21319-3-rikard.falkeborn@gmail.com Signed-off-by: Will Deacon --- drivers/perf/fsl_imx8_ddr_perf.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/perf/fsl_imx8_ddr_perf.c b/drivers/perf/fsl_imx8_ddr_perf.c index a11bfd8a0823..be1f26b62ddb 100644 --- a/drivers/perf/fsl_imx8_ddr_perf.c +++ b/drivers/perf/fsl_imx8_ddr_perf.c @@ -133,7 +133,7 @@ static struct attribute *ddr_perf_identifier_attrs[] = { NULL, }; -static struct attribute_group ddr_perf_identifier_attr_group = { +static const struct attribute_group ddr_perf_identifier_attr_group = { .attrs = ddr_perf_identifier_attrs, .is_visible = ddr_perf_identifier_attr_visible, }; @@ -188,7 +188,7 @@ static struct attribute *ddr_perf_filter_cap_attr[] = { NULL, }; -static struct attribute_group ddr_perf_filter_cap_attr_group = { +static const struct attribute_group ddr_perf_filter_cap_attr_group = { .name = "caps", .attrs = ddr_perf_filter_cap_attr, }; @@ -209,7 +209,7 @@ static struct attribute *ddr_perf_cpumask_attrs[] = { NULL, }; -static struct attribute_group ddr_perf_cpumask_attr_group = { +static const struct attribute_group ddr_perf_cpumask_attr_group = { .attrs = ddr_perf_cpumask_attrs, }; @@ -265,7 +265,7 @@ static struct attribute *ddr_perf_events_attrs[] = { NULL, }; -static struct attribute_group ddr_perf_events_attr_group = { +static const struct attribute_group ddr_perf_events_attr_group = { .name = "events", .attrs = ddr_perf_events_attrs, }; @@ -281,7 +281,7 @@ static struct attribute *ddr_perf_format_attrs[] = { NULL, }; -static struct attribute_group ddr_perf_format_attr_group = { +static const struct attribute_group ddr_perf_format_attr_group = { .name = "format", .attrs = ddr_perf_format_attrs, }; From c2c4d5c051b23dd79ff82d6232347245e11d7c9c Mon Sep 17 00:00:00 2001 From: Rikard Falkeborn Date: Sun, 17 Jan 2021 22:28:46 +0100 Subject: [PATCH 020/108] perf: hisi: Constify static struct attribute_group The only usage is to put their addresses in an array of pointers to const struct attribute group. Make them const to allow the compiler to put them in read-only memory. Signed-off-by: Rikard Falkeborn Link: https://lore.kernel.org/r/20210117212847.21319-4-rikard.falkeborn@gmail.com Signed-off-by: Will Deacon --- drivers/perf/hisilicon/hisi_uncore_ddrc_pmu.c | 2 +- drivers/perf/hisilicon/hisi_uncore_hha_pmu.c | 2 +- drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/perf/hisilicon/hisi_uncore_ddrc_pmu.c b/drivers/perf/hisilicon/hisi_uncore_ddrc_pmu.c index 5ac6c9113767..ac1a8c120a00 100644 --- a/drivers/perf/hisilicon/hisi_uncore_ddrc_pmu.c +++ b/drivers/perf/hisilicon/hisi_uncore_ddrc_pmu.c @@ -319,7 +319,7 @@ static struct attribute *hisi_ddrc_pmu_identifier_attrs[] = { NULL }; -static struct attribute_group hisi_ddrc_pmu_identifier_group = { +static const struct attribute_group hisi_ddrc_pmu_identifier_group = { .attrs = hisi_ddrc_pmu_identifier_attrs, }; diff --git a/drivers/perf/hisilicon/hisi_uncore_hha_pmu.c b/drivers/perf/hisilicon/hisi_uncore_hha_pmu.c index 41b2dceb5f26..3402f1a395a8 100644 --- a/drivers/perf/hisilicon/hisi_uncore_hha_pmu.c +++ b/drivers/perf/hisilicon/hisi_uncore_hha_pmu.c @@ -331,7 +331,7 @@ static struct attribute *hisi_hha_pmu_identifier_attrs[] = { NULL }; -static struct attribute_group hisi_hha_pmu_identifier_group = { +static const struct attribute_group hisi_hha_pmu_identifier_group = { .attrs = hisi_hha_pmu_identifier_attrs, }; diff --git a/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c b/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c index 705501d18d03..7d792435c2aa 100644 --- a/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c +++ b/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c @@ -321,7 +321,7 @@ static struct attribute *hisi_l3c_pmu_identifier_attrs[] = { NULL }; -static struct attribute_group hisi_l3c_pmu_identifier_group = { +static const struct attribute_group hisi_l3c_pmu_identifier_group = { .attrs = hisi_l3c_pmu_identifier_attrs, }; From f0c140481d1b807217cacdcf11d24cfa407a7a53 Mon Sep 17 00:00:00 2001 From: Rikard Falkeborn Date: Sun, 17 Jan 2021 22:28:47 +0100 Subject: [PATCH 021/108] perf: Constify static struct attribute_group The only usage is to put their addresses in an array of pointers to const struct attribute group. Make them const to allow the compiler to put them in read-only memory. Signed-off-by: Rikard Falkeborn Link: https://lore.kernel.org/r/20210117212847.21319-5-rikard.falkeborn@gmail.com Signed-off-by: Will Deacon --- drivers/perf/arm-cci.c | 2 +- drivers/perf/arm-cmn.c | 2 +- drivers/perf/arm_dmc620_pmu.c | 4 ++-- drivers/perf/arm_pmu.c | 2 +- drivers/perf/arm_smmuv3_pmu.c | 8 ++++---- drivers/perf/arm_spe_pmu.c | 6 +++--- 6 files changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/perf/arm-cci.c b/drivers/perf/arm-cci.c index 87c4be9dd412..a75cf77c4de4 100644 --- a/drivers/perf/arm-cci.c +++ b/drivers/perf/arm-cci.c @@ -1376,7 +1376,7 @@ static struct attribute *pmu_attrs[] = { NULL, }; -static struct attribute_group pmu_attr_group = { +static const struct attribute_group pmu_attr_group = { .attrs = pmu_attrs, }; diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c index a76ff594f3ca..f30fcd330899 100644 --- a/drivers/perf/arm-cmn.c +++ b/drivers/perf/arm-cmn.c @@ -616,7 +616,7 @@ static struct attribute *arm_cmn_cpumask_attrs[] = { NULL, }; -static struct attribute_group arm_cmn_cpumask_attr_group = { +static const struct attribute_group arm_cmn_cpumask_attr_group = { .attrs = arm_cmn_cpumask_attrs, }; diff --git a/drivers/perf/arm_dmc620_pmu.c b/drivers/perf/arm_dmc620_pmu.c index 004930eb4bbb..27f54c0afc3b 100644 --- a/drivers/perf/arm_dmc620_pmu.c +++ b/drivers/perf/arm_dmc620_pmu.c @@ -159,7 +159,7 @@ static struct attribute *dmc620_pmu_events_attrs[] = { NULL, }; -static struct attribute_group dmc620_pmu_events_attr_group = { +static const struct attribute_group dmc620_pmu_events_attr_group = { .name = "events", .attrs = dmc620_pmu_events_attrs, }; @@ -222,7 +222,7 @@ static struct attribute *dmc620_pmu_formats_attrs[] = { NULL, }; -static struct attribute_group dmc620_pmu_format_attr_group = { +static const struct attribute_group dmc620_pmu_format_attr_group = { .name = "format", .attrs = dmc620_pmu_formats_attrs, }; diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c index cb2f55f450e4..2d10d84fb79c 100644 --- a/drivers/perf/arm_pmu.c +++ b/drivers/perf/arm_pmu.c @@ -577,7 +577,7 @@ static struct attribute *armpmu_common_attrs[] = { NULL, }; -static struct attribute_group armpmu_common_attr_group = { +static const struct attribute_group armpmu_common_attr_group = { .attrs = armpmu_common_attrs, }; diff --git a/drivers/perf/arm_smmuv3_pmu.c b/drivers/perf/arm_smmuv3_pmu.c index 74474bb322c3..8ff7a67f691c 100644 --- a/drivers/perf/arm_smmuv3_pmu.c +++ b/drivers/perf/arm_smmuv3_pmu.c @@ -493,7 +493,7 @@ static struct attribute *smmu_pmu_cpumask_attrs[] = { NULL }; -static struct attribute_group smmu_pmu_cpumask_group = { +static const struct attribute_group smmu_pmu_cpumask_group = { .attrs = smmu_pmu_cpumask_attrs, }; @@ -548,7 +548,7 @@ static umode_t smmu_pmu_event_is_visible(struct kobject *kobj, return 0; } -static struct attribute_group smmu_pmu_events_group = { +static const struct attribute_group smmu_pmu_events_group = { .name = "events", .attrs = smmu_pmu_events, .is_visible = smmu_pmu_event_is_visible, @@ -583,7 +583,7 @@ static struct attribute *smmu_pmu_identifier_attrs[] = { NULL }; -static struct attribute_group smmu_pmu_identifier_group = { +static const struct attribute_group smmu_pmu_identifier_group = { .attrs = smmu_pmu_identifier_attrs, .is_visible = smmu_pmu_identifier_attr_visible, }; @@ -602,7 +602,7 @@ static struct attribute *smmu_pmu_formats[] = { NULL }; -static struct attribute_group smmu_pmu_format_group = { +static const struct attribute_group smmu_pmu_format_group = { .name = "format", .attrs = smmu_pmu_formats, }; diff --git a/drivers/perf/arm_spe_pmu.c b/drivers/perf/arm_spe_pmu.c index bce9aff9f546..d3929ccebfd2 100644 --- a/drivers/perf/arm_spe_pmu.c +++ b/drivers/perf/arm_spe_pmu.c @@ -146,7 +146,7 @@ static struct attribute *arm_spe_pmu_cap_attr[] = { NULL, }; -static struct attribute_group arm_spe_pmu_cap_group = { +static const struct attribute_group arm_spe_pmu_cap_group = { .name = "caps", .attrs = arm_spe_pmu_cap_attr, }; @@ -227,7 +227,7 @@ static struct attribute *arm_spe_pmu_formats_attr[] = { NULL, }; -static struct attribute_group arm_spe_pmu_format_group = { +static const struct attribute_group arm_spe_pmu_format_group = { .name = "format", .attrs = arm_spe_pmu_formats_attr, }; @@ -247,7 +247,7 @@ static struct attribute *arm_spe_pmu_attrs[] = { NULL, }; -static struct attribute_group arm_spe_pmu_group = { +static const struct attribute_group arm_spe_pmu_group = { .attrs = arm_spe_pmu_attrs, }; From 742d33729a0df11c9d8d4625dbf21dd20cdefd44 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 20 Jan 2021 14:34:23 +0000 Subject: [PATCH 022/108] mm: Move immutable fields of 'struct vm_fault' into anonymous struct 'struct vm_fault' contains both information about the fault being serviced alongside mutable fields contributing to the state of the fault-handling logic. Unfortunately, the distinction between the two is not clear-cut, and a number of callers end up manipulating the structure temporarily before restoring it when returning. Try to clean this up by moving the immutable fault information into an anonymous struct, which will later be marked as 'const'. Ideally, the 'flags' field would be part of the new structure too, but it seems as though the ->page_mkwrite() path is not ready for this yet. Cc: Kirill A. Shutemov Suggested-by: Linus Torvalds Link: https://lore.kernel.org/r/CAHk-=whYs9XsO88iqJzN6NC=D-dp2m0oYXuOoZ=eWnvv=5OA+w@mail.gmail.com Signed-off-by: Will Deacon --- include/linux/mm.h | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 251a2339befb..b4a5cb9bff7d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -517,11 +517,14 @@ static inline bool fault_flag_allow_retry_first(unsigned int flags) * pgoff should be used in favour of virtual_address, if possible. */ struct vm_fault { - struct vm_area_struct *vma; /* Target VMA */ - unsigned int flags; /* FAULT_FLAG_xxx flags */ - gfp_t gfp_mask; /* gfp mask to be used for allocations */ - pgoff_t pgoff; /* Logical page offset based on vma */ - unsigned long address; /* Faulting virtual address */ + struct { + struct vm_area_struct *vma; /* Target VMA */ + gfp_t gfp_mask; /* gfp mask to be used for allocations */ + pgoff_t pgoff; /* Logical page offset based on vma */ + unsigned long address; /* Faulting virtual address */ + }; + unsigned int flags; /* FAULT_FLAG_xxx flags + * XXX: should really be 'const' */ pmd_t *pmd; /* Pointer to pmd entry matching * the 'address' */ pud_t *pud; /* Pointer to pud entry matching From 9d3af4b448a119ac81378d3bc775f1c4a2a7ff36 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 14 Jan 2021 15:24:19 +0000 Subject: [PATCH 023/108] mm: Pass 'address' to map to do_set_pte() and drop FAULT_FLAG_PREFAULT Rather than modifying the 'address' field of the 'struct vm_fault' passed to do_set_pte(), leave that to identify the real faulting address and pass in the virtual address to be mapped by the new pte as a separate argument. This makes FAULT_FLAG_PREFAULT redundant, as a prefault entry can be identified simply by comparing the new address parameter with the faulting address, so remove the redundant flag at the same time. Cc: Kirill A. Shutemov Cc: Linus Torvalds Signed-off-by: Will Deacon --- include/linux/mm.h | 7 ++----- mm/filemap.c | 21 +++++++-------------- mm/memory.c | 10 +++++----- 3 files changed, 14 insertions(+), 24 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index b4a5cb9bff7d..e0f056753bef 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -434,7 +434,6 @@ extern pgprot_t protection_map[16]; * @FAULT_FLAG_REMOTE: The fault is not for current task/mm. * @FAULT_FLAG_INSTRUCTION: The fault was during an instruction fetch. * @FAULT_FLAG_INTERRUPTIBLE: The fault can be interrupted by non-fatal signals. - * @FAULT_FLAG_PREFAULT: Fault was a prefault. * * About @FAULT_FLAG_ALLOW_RETRY and @FAULT_FLAG_TRIED: we can specify * whether we would allow page faults to retry by specifying these two @@ -465,7 +464,6 @@ extern pgprot_t protection_map[16]; #define FAULT_FLAG_REMOTE 0x80 #define FAULT_FLAG_INSTRUCTION 0x100 #define FAULT_FLAG_INTERRUPTIBLE 0x200 -#define FAULT_FLAG_PREFAULT 0x400 /* * The default fault flags that should be used by most of the @@ -503,8 +501,7 @@ static inline bool fault_flag_allow_retry_first(unsigned int flags) { FAULT_FLAG_USER, "USER" }, \ { FAULT_FLAG_REMOTE, "REMOTE" }, \ { FAULT_FLAG_INSTRUCTION, "INSTRUCTION" }, \ - { FAULT_FLAG_INTERRUPTIBLE, "INTERRUPTIBLE" }, \ - { FAULT_FLAG_PREFAULT, "PREFAULT" } + { FAULT_FLAG_INTERRUPTIBLE, "INTERRUPTIBLE" } /* * vm_fault is filled by the pagefault handler and passed to the vma's @@ -995,7 +992,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) } vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page); -void do_set_pte(struct vm_fault *vmf, struct page *page); +void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr); vm_fault_t finish_fault(struct vm_fault *vmf); vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf); diff --git a/mm/filemap.c b/mm/filemap.c index a6dc97906c8e..fb7a8d9b5603 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3018,8 +3018,7 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, struct file *file = vma->vm_file; struct address_space *mapping = file->f_mapping; pgoff_t last_pgoff = start_pgoff; - unsigned long address = vmf->address; - unsigned long flags = vmf->flags; + unsigned long addr; XA_STATE(xas, &mapping->i_pages, start_pgoff); struct page *head, *page; unsigned int mmap_miss = READ_ONCE(file->f_ra.mmap_miss); @@ -3035,8 +3034,8 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, goto out; } - vmf->address = vma->vm_start + ((start_pgoff - vma->vm_pgoff) << PAGE_SHIFT); - vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); + addr = vma->vm_start + ((start_pgoff - vma->vm_pgoff) << PAGE_SHIFT); + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl); do { page = find_subpage(head, xas.xa_index); if (PageHWPoison(page)) @@ -3045,7 +3044,7 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, if (mmap_miss > 0) mmap_miss--; - vmf->address += (xas.xa_index - last_pgoff) << PAGE_SHIFT; + addr += (xas.xa_index - last_pgoff) << PAGE_SHIFT; vmf->pte += xas.xa_index - last_pgoff; last_pgoff = xas.xa_index; @@ -3053,16 +3052,12 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, goto unlock; /* We're about to handle the fault */ - if (vmf->address == address) { - vmf->flags &= ~FAULT_FLAG_PREFAULT; + if (vmf->address == addr) ret = VM_FAULT_NOPAGE; - } else { - vmf->flags |= FAULT_FLAG_PREFAULT; - } - do_set_pte(vmf, page); + do_set_pte(vmf, page, addr); /* no need to invalidate: a not-present page won't be cached */ - update_mmu_cache(vma, vmf->address, vmf->pte); + update_mmu_cache(vma, addr, vmf->pte); unlock_page(head); continue; unlock: @@ -3072,8 +3067,6 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, pte_unmap_unlock(vmf->pte, vmf->ptl); out: rcu_read_unlock(); - vmf->flags = flags; - vmf->address = address; WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss); return ret; } diff --git a/mm/memory.c b/mm/memory.c index f0e7c589ca9d..7b1307873325 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3733,11 +3733,11 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) } #endif -void do_set_pte(struct vm_fault *vmf, struct page *page) +void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr) { struct vm_area_struct *vma = vmf->vma; bool write = vmf->flags & FAULT_FLAG_WRITE; - bool prefault = vmf->flags & FAULT_FLAG_PREFAULT; + bool prefault = vmf->address != addr; pte_t entry; flush_icache_page(vma, page); @@ -3753,13 +3753,13 @@ void do_set_pte(struct vm_fault *vmf, struct page *page) /* copy-on-write page */ if (write && !(vma->vm_flags & VM_SHARED)) { inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); - page_add_new_anon_rmap(page, vma, vmf->address, false); + page_add_new_anon_rmap(page, vma, addr, false); lru_cache_add_inactive_or_unevictable(page, vma); } else { inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); page_add_file_rmap(page, false); } - set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry); + set_pte_at(vma->vm_mm, addr, vmf->pte, entry); } /** @@ -3819,7 +3819,7 @@ vm_fault_t finish_fault(struct vm_fault *vmf) ret = 0; /* Re-check under ptl */ if (likely(pte_none(*vmf->pte))) - do_set_pte(vmf, page); + do_set_pte(vmf, page, vmf->address); else ret = VM_FAULT_NOPAGE; From 2b635dd372f6c8f27644c662bb48d10376ce561a Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 14 Jan 2021 15:33:49 +0000 Subject: [PATCH 024/108] mm: Avoid modifying vmf.address in __collapse_huge_page_swapin() In preparation for const-ifying the anonymous struct field of 'struct vm_fault', rework __collapse_huge_page_swapin() to avoid continuously updating vmf.address and instead populate a new 'struct vm_fault' on the stack for each page being processed. Cc: Kirill A. Shutemov Cc: Linus Torvalds Signed-off-by: Will Deacon --- mm/khugepaged.c | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 67ab391a5373..fb0fdaec34d5 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -991,38 +991,41 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, static bool __collapse_huge_page_swapin(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pmd_t *pmd, + unsigned long haddr, pmd_t *pmd, int referenced) { int swapped_in = 0; vm_fault_t ret = 0; - struct vm_fault vmf = { - .vma = vma, - .address = address, - .flags = FAULT_FLAG_ALLOW_RETRY, - .pmd = pmd, - .pgoff = linear_page_index(vma, address), - }; + unsigned long address, end = haddr + (HPAGE_PMD_NR * PAGE_SIZE); - vmf.pte = pte_offset_map(pmd, address); - for (; vmf.address < address + HPAGE_PMD_NR*PAGE_SIZE; - vmf.pte++, vmf.address += PAGE_SIZE) { + for (address = haddr; address < end; address += PAGE_SIZE) { + struct vm_fault vmf = { + .vma = vma, + .address = address, + .pgoff = linear_page_index(vma, haddr), + .flags = FAULT_FLAG_ALLOW_RETRY, + .pmd = pmd, + }; + + vmf.pte = pte_offset_map(pmd, address); vmf.orig_pte = *vmf.pte; - if (!is_swap_pte(vmf.orig_pte)) + if (!is_swap_pte(vmf.orig_pte)) { + pte_unmap(vmf.pte); continue; + } swapped_in++; ret = do_swap_page(&vmf); /* do_swap_page returns VM_FAULT_RETRY with released mmap_lock */ if (ret & VM_FAULT_RETRY) { mmap_read_lock(mm); - if (hugepage_vma_revalidate(mm, address, &vmf.vma)) { + if (hugepage_vma_revalidate(mm, haddr, &vma)) { /* vma is no longer available, don't continue to swapin */ trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); return false; } /* check if the pmd is still valid */ - if (mm_find_pmd(mm, address) != pmd) { + if (mm_find_pmd(mm, haddr) != pmd) { trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); return false; } @@ -1031,11 +1034,7 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); return false; } - /* pte is unmapped now, we need to map it */ - vmf.pte = pte_offset_map(pmd, vmf.address); } - vmf.pte--; - pte_unmap(vmf.pte); /* Drain LRU add pagevec to remove extra pin on the swapped in pages */ if (swapped_in) From 8c63ca5bc3e19f11128e8e285dcf20aac6768f97 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 14 Jan 2021 15:42:14 +0000 Subject: [PATCH 025/108] mm: Use static initialisers for immutable fields of 'struct vm_fault' In preparation for const-ifying the anonymous struct field of 'struct vm_fault', ensure that it is initialised using designated initialisers. Cc: Kirill A. Shutemov Cc: Linus Torvalds Reviewed-by: Nick Desaulniers Signed-off-by: Will Deacon --- mm/shmem.c | 6 +++--- mm/swapfile.c | 11 ++++++----- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 7c6b6d8f6c39..1b254fbfdf52 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1520,11 +1520,11 @@ static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, { struct vm_area_struct pvma; struct page *page; - struct vm_fault vmf; + struct vm_fault vmf = { + .vma = &pvma, + }; shmem_pseudo_vma_init(&pvma, info, index); - vmf.vma = &pvma; - vmf.address = 0; page = swap_cluster_readahead(swap, gfp, &vmf); shmem_pseudo_vma_destroy(&pvma); diff --git a/mm/swapfile.c b/mm/swapfile.c index 9fffc5af29d1..2b570a566276 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1951,8 +1951,6 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, si = swap_info[type]; pte = pte_offset_map(pmd, addr); do { - struct vm_fault vmf; - if (!is_swap_pte(*pte)) continue; @@ -1968,9 +1966,12 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, swap_map = &si->swap_map[offset]; page = lookup_swap_cache(entry, vma, addr); if (!page) { - vmf.vma = vma; - vmf.address = addr; - vmf.pmd = pmd; + struct vm_fault vmf = { + .vma = vma, + .address = addr, + .pmd = pmd, + }; + page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, &vmf); } From 5857c9209ce58f8e262889539ccdf63e73ad7a93 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 14 Jan 2021 15:44:09 +0000 Subject: [PATCH 026/108] mm: Mark anonymous struct field of 'struct vm_fault' as 'const' The fields of this struct are only ever read after being initialised, so mark it 'const' before somebody tries to modify it again. GCC will then complain (with an error) about modification of these fields after they have been initialised, although LLVM currently allows them without even a warning: https://bugs.llvm.org/show_bug.cgi?id=48755 Hopefully, future versions of LLVM will emit a warning. Cc: Kirill A. Shutemov Cc: Linus Torvalds Signed-off-by: Will Deacon --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index e0f056753bef..7ff3d9817d38 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -514,7 +514,7 @@ static inline bool fault_flag_allow_retry_first(unsigned int flags) * pgoff should be used in favour of virtual_address, if possible. */ struct vm_fault { - struct { + const struct { struct vm_area_struct *vma; /* Target VMA */ gfp_t gfp_mask; /* gfp mask to be used for allocations */ pgoff_t pgoff; /* Logical page offset based on vma */ From 390596c9959c2a4f5b456df339f0604df3d55fe0 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 5 Nov 2020 16:29:44 +0100 Subject: [PATCH 027/108] random: avoid arch_get_random_seed_long() when collecting IRQ randomness When reseeding the CRNG periodically, arch_get_random_seed_long() is called to obtain entropy from an architecture specific source if one is implemented. In most cases, these are special instructions, but in some cases, such as on ARM, we may want to back this using firmware calls, which are considerably more expensive. Another call to arch_get_random_seed_long() exists in the CRNG driver, in add_interrupt_randomness(), which collects entropy by capturing inter-interrupt timing and relying on interrupt jitter to provide random bits. This is done by keeping a per-CPU state, and mixing in the IRQ number, the cycle counter and the return address every time an interrupt is taken, and mixing this per-CPU state into the entropy pool every 64 invocations, or at least once per second. The entropy that is gathered this way is credited as 1 bit of entropy. Every time this happens, arch_get_random_seed_long() is invoked, and the result is mixed in as well, and also credited with 1 bit of entropy. This means that arch_get_random_seed_long() is called at least once per second on every CPU, which seems excessive, and doesn't really scale, especially in a virtualization scenario where CPUs may be oversubscribed: in cases where arch_get_random_seed_long() is backed by an instruction that actually goes back to a shared hardware entropy source (such as RNDRRS on ARM), we will end up hitting it hundreds of times per second. So let's drop the call to arch_get_random_seed_long() from add_interrupt_randomness(), and instead, rely on crng_reseed() to call the arch hook to get random seed material from the platform. Signed-off-by: Ard Biesheuvel Reviewed-by: Andre Przywara Tested-by: Andre Przywara Reviewed-by: Eric Biggers Acked-by: Marc Zyngier Reviewed-by: Jason A. Donenfeld Link: https://lore.kernel.org/r/20201105152944.16953-1-ardb@kernel.org Signed-off-by: Will Deacon --- drivers/char/random.c | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/drivers/char/random.c b/drivers/char/random.c index 5f3b8ac9d97b..84e24986a97a 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -1261,8 +1261,6 @@ void add_interrupt_randomness(int irq, int irq_flags) cycles_t cycles = random_get_entropy(); __u32 c_high, j_high; __u64 ip; - unsigned long seed; - int credit = 0; if (cycles == 0) cycles = get_reg(fast_pool, regs); @@ -1298,23 +1296,12 @@ void add_interrupt_randomness(int irq, int irq_flags) fast_pool->last = now; __mix_pool_bytes(r, &fast_pool->pool, sizeof(fast_pool->pool)); - - /* - * If we have architectural seed generator, produce a seed and - * add it to the pool. For the sake of paranoia don't let the - * architectural seed generator dominate the input from the - * interrupt noise. - */ - if (arch_get_random_seed_long(&seed)) { - __mix_pool_bytes(r, &seed, sizeof(seed)); - credit = 1; - } spin_unlock(&r->lock); fast_pool->count = 0; /* award one bit for the contents of the fast pool */ - credit_entropy_bits(r, credit + 1); + credit_entropy_bits(r, 1); } EXPORT_SYMBOL_GPL(add_interrupt_randomness); From a37e31fc97efe7f7c68cb381cf4390e472c09061 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Wed, 6 Jan 2021 10:34:50 +0000 Subject: [PATCH 028/108] firmware: smccc: Introduce SMCCC TRNG framework The ARM DEN0098 document describe an SMCCC based firmware service to deliver hardware generated random numbers. Its existence is advertised according to the SMCCC v1.1 specification. Add a (dummy) call to probe functions implemented in each architecture (ARM and arm64), to determine the existence of this interface. For now this return false, but this will be overwritten by each architecture's support patch. Signed-off-by: Andre Przywara Reviewed-by: Linus Walleij Reviewed-by: Sudeep Holla Signed-off-by: Will Deacon --- arch/arm/include/asm/archrandom.h | 10 ++++++++++ arch/arm64/include/asm/archrandom.h | 12 ++++++++++++ drivers/firmware/smccc/smccc.c | 6 ++++++ 3 files changed, 28 insertions(+) create mode 100644 arch/arm/include/asm/archrandom.h diff --git a/arch/arm/include/asm/archrandom.h b/arch/arm/include/asm/archrandom.h new file mode 100644 index 000000000000..a8e84ca5c2ee --- /dev/null +++ b/arch/arm/include/asm/archrandom.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_ARCHRANDOM_H +#define _ASM_ARCHRANDOM_H + +static inline bool __init smccc_probe_trng(void) +{ + return false; +} + +#endif /* _ASM_ARCHRANDOM_H */ diff --git a/arch/arm64/include/asm/archrandom.h b/arch/arm64/include/asm/archrandom.h index ffb1a40d5475..abe07c21da8e 100644 --- a/arch/arm64/include/asm/archrandom.h +++ b/arch/arm64/include/asm/archrandom.h @@ -8,6 +8,11 @@ #include #include +static inline bool __init smccc_probe_trng(void) +{ + return false; +} + static inline bool __arm64_rndr(unsigned long *v) { bool ok; @@ -79,5 +84,12 @@ arch_get_random_seed_long_early(unsigned long *v) } #define arch_get_random_seed_long_early arch_get_random_seed_long_early +#else /* !CONFIG_ARCH_RANDOM */ + +static inline bool __init smccc_probe_trng(void) +{ + return false; +} + #endif /* CONFIG_ARCH_RANDOM */ #endif /* _ASM_ARCHRANDOM_H */ diff --git a/drivers/firmware/smccc/smccc.c b/drivers/firmware/smccc/smccc.c index 00c88b809c0c..d52bfc5ed5e4 100644 --- a/drivers/firmware/smccc/smccc.c +++ b/drivers/firmware/smccc/smccc.c @@ -5,16 +5,22 @@ #define pr_fmt(fmt) "smccc: " fmt +#include #include #include +#include static u32 smccc_version = ARM_SMCCC_VERSION_1_0; static enum arm_smccc_conduit smccc_conduit = SMCCC_CONDUIT_NONE; +bool __ro_after_init smccc_trng_available = false; + void __init arm_smccc_version_init(u32 version, enum arm_smccc_conduit conduit) { smccc_version = version; smccc_conduit = conduit; + + smccc_trng_available = smccc_probe_trng(); } enum arm_smccc_conduit arm_smccc_1_1_get_conduit(void) From 38db987316a38a3fe55ff7f5f4653fcb520a9d26 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Wed, 6 Jan 2021 10:34:52 +0000 Subject: [PATCH 029/108] arm64: Add support for SMCCC TRNG entropy source The ARM architected TRNG firmware interface, described in ARM spec DEN0098, defines an ARM SMCCC based interface to a true random number generator, provided by firmware. This can be discovered via the SMCCC >=v1.1 interface, and provides up to 192 bits of entropy per call. Hook this SMC call into arm64's arch_get_random_*() implementation, coming to the rescue when the CPU does not implement the ARM v8.5 RNG system registers. For the detection, we piggy back on the PSCI/SMCCC discovery (which gives us the conduit to use (hvc/smc)), then try to call the ARM_SMCCC_TRNG_VERSION function, which returns -1 if this interface is not implemented. Reviewed-by: Mark Brown Signed-off-by: Andre Przywara Signed-off-by: Will Deacon --- arch/arm64/include/asm/archrandom.h | 72 ++++++++++++++++++++++++----- 1 file changed, 61 insertions(+), 11 deletions(-) diff --git a/arch/arm64/include/asm/archrandom.h b/arch/arm64/include/asm/archrandom.h index abe07c21da8e..09e43272ccb0 100644 --- a/arch/arm64/include/asm/archrandom.h +++ b/arch/arm64/include/asm/archrandom.h @@ -4,13 +4,24 @@ #ifdef CONFIG_ARCH_RANDOM +#include #include #include #include +#define ARM_SMCCC_TRNG_MIN_VERSION 0x10000UL + +extern bool smccc_trng_available; + static inline bool __init smccc_probe_trng(void) { - return false; + struct arm_smccc_res res; + + arm_smccc_1_1_invoke(ARM_SMCCC_TRNG_VERSION, &res); + if ((s32)res.a0 < 0) + return false; + + return res.a0 >= ARM_SMCCC_TRNG_MIN_VERSION; } static inline bool __arm64_rndr(unsigned long *v) @@ -43,26 +54,55 @@ static inline bool __must_check arch_get_random_int(unsigned int *v) static inline bool __must_check arch_get_random_seed_long(unsigned long *v) { + struct arm_smccc_res res; + + /* + * We prefer the SMCCC call, since its semantics (return actual + * hardware backed entropy) is closer to the idea behind this + * function here than what even the RNDRSS register provides + * (the output of a pseudo RNG freshly seeded by a TRNG). + */ + if (smccc_trng_available) { + arm_smccc_1_1_invoke(ARM_SMCCC_TRNG_RND64, 64, &res); + if ((int)res.a0 >= 0) { + *v = res.a3; + return true; + } + } + /* * Only support the generic interface after we have detected * the system wide capability, avoiding complexity with the * cpufeature code and with potential scheduling between CPUs * with and without the feature. */ - if (!cpus_have_const_cap(ARM64_HAS_RNG)) - return false; + if (cpus_have_const_cap(ARM64_HAS_RNG) && __arm64_rndr(v)) + return true; - return __arm64_rndr(v); + return false; } - static inline bool __must_check arch_get_random_seed_int(unsigned int *v) { + struct arm_smccc_res res; unsigned long val; - bool ok = arch_get_random_seed_long(&val); - *v = val; - return ok; + if (smccc_trng_available) { + arm_smccc_1_1_invoke(ARM_SMCCC_TRNG_RND64, 32, &res); + if ((int)res.a0 >= 0) { + *v = res.a3 & GENMASK(31, 0); + return true; + } + } + + if (cpus_have_const_cap(ARM64_HAS_RNG)) { + if (__arm64_rndr(&val)) { + *v = val; + return true; + } + } + + return false; } static inline bool __init __early_cpu_has_rndr(void) @@ -77,10 +117,20 @@ arch_get_random_seed_long_early(unsigned long *v) { WARN_ON(system_state != SYSTEM_BOOTING); - if (!__early_cpu_has_rndr()) - return false; + if (smccc_trng_available) { + struct arm_smccc_res res; - return __arm64_rndr(v); + arm_smccc_1_1_invoke(ARM_SMCCC_TRNG_RND64, 64, &res); + if ((int)res.a0 >= 0) { + *v = res.a3; + return true; + } + } + + if (__early_cpu_has_rndr() && __arm64_rndr(v)) + return true; + + return false; } #define arch_get_random_seed_long_early arch_get_random_seed_long_early From f0b13ee23241846f6f6cd0d119a8ac8059416ec4 Mon Sep 17 00:00:00 2001 From: Sudarshan Rajagopalan Date: Wed, 20 Jan 2021 21:29:13 -0800 Subject: [PATCH 030/108] arm64/sparsemem: reduce SECTION_SIZE_BITS memory_block_size_bytes() determines the memory hotplug granularity i.e the amount of memory which can be hot added or hot removed from the kernel. The generic value here being MIN_MEMORY_BLOCK_SIZE (1UL << SECTION_SIZE_BITS) for memory_block_size_bytes() on platforms like arm64 that does not override. Current SECTION_SIZE_BITS is 30 i.e 1GB which is large and a reduction here increases memory hotplug granularity, thus improving its agility. A reduced section size also reduces memory wastage in vmemmmap mapping for sections with large memory holes. So we try to set the least section size as possible. A section size bits selection must follow: (MAX_ORDER - 1 + PAGE_SHIFT) <= SECTION_SIZE_BITS CONFIG_FORCE_MAX_ZONEORDER is always defined on arm64 and so just following it would help achieve the smallest section size. SECTION_SIZE_BITS = (CONFIG_FORCE_MAX_ZONEORDER - 1 + PAGE_SHIFT) SECTION_SIZE_BITS = 22 (11 - 1 + 12) i.e 4MB for 4K pages SECTION_SIZE_BITS = 24 (11 - 1 + 14) i.e 16MB for 16K pages without THP SECTION_SIZE_BITS = 25 (12 - 1 + 14) i.e 32MB for 16K pages with THP SECTION_SIZE_BITS = 26 (11 - 1 + 16) i.e 64MB for 64K pages without THP SECTION_SIZE_BITS = 29 (14 - 1 + 16) i.e 512MB for 64K pages with THP But there are other problems in reducing SECTION_SIZE_BIT. Reducing it by too much would over populate /sys/devices/system/memory/ and also consume too many page->flags bits in the !vmemmap case. Also section size needs to be multiple of 128MB to have PMD based vmemmap mapping with CONFIG_ARM64_4K_PAGES. Given these constraints, lets just reduce the section size to 128MB for 4K and 16K base page size configs, and to 512MB for 64K base page size config. Signed-off-by: Sudarshan Rajagopalan Suggested-by: Anshuman Khandual Suggested-by: David Hildenbrand Cc: Catalin Marinas Cc: Will Deacon Cc: Anshuman Khandual Cc: David Hildenbrand Cc: Mike Rapoport Cc: Mark Rutland Cc: Logan Gunthorpe Cc: Andrew Morton Cc: Steven Price Cc: Suren Baghdasaryan Reviewed-by: David Hildenbrand Acked-by: Mike Rapoport Reviewed-by: Catalin Marinas Link: https://lore.kernel.org/r/43843c5e092bfe3ec4c41e3c8c78a7ee35b69bb0.1611206601.git.sudaraja@codeaurora.org Signed-off-by: Will Deacon --- arch/arm64/include/asm/sparsemem.h | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/sparsemem.h b/arch/arm64/include/asm/sparsemem.h index 1f43fcc79738..eb4a75d720ed 100644 --- a/arch/arm64/include/asm/sparsemem.h +++ b/arch/arm64/include/asm/sparsemem.h @@ -7,7 +7,26 @@ #ifdef CONFIG_SPARSEMEM #define MAX_PHYSMEM_BITS CONFIG_ARM64_PA_BITS -#define SECTION_SIZE_BITS 30 -#endif + +/* + * Section size must be at least 512MB for 64K base + * page size config. Otherwise it will be less than + * (MAX_ORDER - 1) and the build process will fail. + */ +#ifdef CONFIG_ARM64_64K_PAGES +#define SECTION_SIZE_BITS 29 + +#else + +/* + * Section size must be at least 128MB for 4K base + * page size config. Otherwise PMD based huge page + * entries could not be created for vmemmap mappings. + * 16K follows 4K for simplicity. + */ +#define SECTION_SIZE_BITS 27 +#endif /* CONFIG_ARM64_64K_PAGES */ + +#endif /* CONFIG_SPARSEMEM*/ #endif From 507d664450f84ab2d5601aed55c6a4b3af9ec4c2 Mon Sep 17 00:00:00 2001 From: Shaokun Zhang Date: Tue, 26 Jan 2021 20:24:44 +0800 Subject: [PATCH 031/108] arm64: mm: Remove unused header file Many header files are never used, let's remove them directly. Cc: Catalin Marinas Cc: Will Deacon Signed-off-by: Shaokun Zhang Link: https://lore.kernel.org/r/1611663884-43329-1-git-send-email-zhangshaokun@hisilicon.com Signed-off-by: Will Deacon --- arch/arm64/mm/mmap.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c index 07937b49cb88..c66b3878712c 100644 --- a/arch/arm64/mm/mmap.c +++ b/arch/arm64/mm/mmap.c @@ -5,20 +5,10 @@ * Copyright (C) 2012 ARM Ltd. */ -#include -#include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include -#include +#include /* * You really shouldn't be using read() or write() on /dev/mem. This might go From 1e193c70f5bba8fc66b9da19be64e9a9229a37d6 Mon Sep 17 00:00:00 2001 From: Shaokun Zhang Date: Mon, 25 Jan 2021 19:55:53 +0800 Subject: [PATCH 032/108] arm64: cacheflush: Remove stale comment Remove stale comment since commit a7ba121215fa ("arm64: use asm-generic/cacheflush.h") Cc: Christoph Hellwig Cc: Andrew Morton Cc: Catalin Marinas Cc: Will Deacon Signed-off-by: Shaokun Zhang Link: https://lore.kernel.org/r/1611575753-36435-1-git-send-email-zhangshaokun@hisilicon.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/cacheflush.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/arch/arm64/include/asm/cacheflush.h b/arch/arm64/include/asm/cacheflush.h index 45217f21f1fe..52e5c1623224 100644 --- a/arch/arm64/include/asm/cacheflush.h +++ b/arch/arm64/include/asm/cacheflush.h @@ -30,11 +30,6 @@ * the implementation assumes non-aliasing VIPT D-cache and (aliasing) * VIPT I-cache. * - * flush_cache_mm(mm) - * - * Clean and invalidate all user space cache entries - * before a change of page tables. - * * flush_icache_range(start, end) * * Ensure coherency between the I-cache and the D-cache in the From c7b9095e87bf81f029fbae2e5d6a8354b6f11987 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 27 Jan 2021 12:52:16 +0000 Subject: [PATCH 033/108] arm64: Include linux/io.h in mm/mmap.c Commit 507d664450f8 ("arm64: mm: Remove unused header file") removed a bunch of apparently "unused" header inclusions from our mm/mmap.c implementation, but in doing so introduced the following warning when building with W=1: >> arch/arm64/mm/mmap.c:17:5: warning: no previous prototype for 'valid_phys_addr_range' [-Wmissing-prototypes] 17 | int valid_phys_addr_range(phys_addr_t addr, size_t size) | ^~~~~~~~~~~~~~~~~~~~~ >> arch/arm64/mm/mmap.c:36:5: warning: no previous prototype for 'valid_mmap_phys_addr_range' [-Wmissing-prototypes] 36 | int valid_mmap_phys_addr_range(unsigned long pfn, size_t size) | ^~~~~~~~~~~~~~~~~~~~~~~~~~ Add back the linux/io.h header inclusion to pull in the missing prototypes. Reported-by: kernel test robot Link: https://lore.kernel.org/r/202101271438.V9TmBC31-lkp@intel.com Signed-off-by: Will Deacon --- arch/arm64/mm/mmap.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c index c66b3878712c..a38f54cd638c 100644 --- a/arch/arm64/mm/mmap.c +++ b/arch/arm64/mm/mmap.c @@ -5,6 +5,7 @@ * Copyright (C) 2012 ARM Ltd. */ +#include #include #include From 117cda9a7847286484d2353af64728a9875effd4 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Mon, 25 Jan 2021 14:19:06 -0500 Subject: [PATCH 034/108] arm64: kexec: make dtb_mem always enabled Currently, dtb_mem is enabled only when CONFIG_KEXEC_FILE is enabled. This adds ugly ifdefs to c files. Always enabled dtb_mem, when it is not used, it is NULL. Change the dtb_mem to phys_addr_t, as it is a physical address. Signed-off-by: Pavel Tatashin Reviewed-by: James Morse Link: https://lore.kernel.org/r/20210125191923.1060122-2-pasha.tatashin@soleen.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/kexec.h | 4 ++-- arch/arm64/kernel/machine_kexec.c | 6 +----- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/arch/arm64/include/asm/kexec.h b/arch/arm64/include/asm/kexec.h index d24b527e8c00..61530ec3a9b1 100644 --- a/arch/arm64/include/asm/kexec.h +++ b/arch/arm64/include/asm/kexec.h @@ -90,18 +90,18 @@ static inline void crash_prepare_suspend(void) {} static inline void crash_post_resume(void) {} #endif -#ifdef CONFIG_KEXEC_FILE #define ARCH_HAS_KIMAGE_ARCH struct kimage_arch { void *dtb; - unsigned long dtb_mem; + phys_addr_t dtb_mem; /* Core ELF header buffer */ void *elf_headers; unsigned long elf_headers_mem; unsigned long elf_headers_sz; }; +#ifdef CONFIG_KEXEC_FILE extern const struct kexec_file_ops kexec_image_ops; struct kimage; diff --git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c index a0b144cfaea7..8096a6aa1d49 100644 --- a/arch/arm64/kernel/machine_kexec.c +++ b/arch/arm64/kernel/machine_kexec.c @@ -204,11 +204,7 @@ void machine_kexec(struct kimage *kimage) * In kexec_file case, the kernel starts directly without purgatory. */ cpu_soft_restart(reboot_code_buffer_phys, kimage->head, kimage->start, -#ifdef CONFIG_KEXEC_FILE - kimage->arch.dtb_mem); -#else - 0); -#endif + kimage->arch.dtb_mem); BUG(); /* Should never get here. */ } From 41f67d40a31d6b007d372461f171875fd940f17d Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Mon, 25 Jan 2021 14:19:07 -0500 Subject: [PATCH 035/108] arm64: hibernate: variable pudp is used instead of pd4dp There should be p4dp used when p4d page is allocated. This is not a functional issue, but for the logical correctness this should be fixed. Fixes: e9f6376858b9 ("arm64: add support for folded p4d page tables") Signed-off-by: Pavel Tatashin Link: https://lore.kernel.org/r/20210125191923.1060122-3-pasha.tatashin@soleen.com Signed-off-by: Will Deacon --- arch/arm64/kernel/hibernate.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c index 9c9f47e9f7f4..0a54d81c90f9 100644 --- a/arch/arm64/kernel/hibernate.c +++ b/arch/arm64/kernel/hibernate.c @@ -190,10 +190,10 @@ static int trans_pgd_map_page(pgd_t *trans_pgd, void *page, pgdp = pgd_offset_pgd(trans_pgd, dst_addr); if (pgd_none(READ_ONCE(*pgdp))) { - pudp = (void *)get_safe_page(GFP_ATOMIC); - if (!pudp) + p4dp = (void *)get_safe_page(GFP_ATOMIC); + if (!pgdp) return -ENOMEM; - pgd_populate(&init_mm, pgdp, pudp); + pgd_populate(&init_mm, pgdp, p4dp); } p4dp = p4d_offset(pgdp, dst_addr); From 072e3d96a79a5876691c6532f91f930157144a2a Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Mon, 25 Jan 2021 14:19:08 -0500 Subject: [PATCH 036/108] arm64: hibernate: move page handling function to new trans_pgd.c Now, that we abstracted the required functions move them to a new home. Later, we will generalize these function in order to be useful outside of hibernation. Signed-off-by: Pavel Tatashin Reviewed-by: James Morse Link: https://lore.kernel.org/r/20210125191923.1060122-4-pasha.tatashin@soleen.com Signed-off-by: Will Deacon --- arch/arm64/Kconfig | 4 + arch/arm64/include/asm/trans_pgd.h | 21 +++ arch/arm64/kernel/hibernate.c | 228 +------------------------- arch/arm64/mm/Makefile | 1 + arch/arm64/mm/trans_pgd.c | 250 +++++++++++++++++++++++++++++ 5 files changed, 277 insertions(+), 227 deletions(-) create mode 100644 arch/arm64/include/asm/trans_pgd.h create mode 100644 arch/arm64/mm/trans_pgd.c diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index f39568b28ec1..fc0ed9d6e011 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1132,6 +1132,10 @@ config CRASH_DUMP For more details see Documentation/admin-guide/kdump/kdump.rst +config TRANS_TABLE + def_bool y + depends on HIBERNATION + config XEN_DOM0 def_bool y depends on XEN diff --git a/arch/arm64/include/asm/trans_pgd.h b/arch/arm64/include/asm/trans_pgd.h new file mode 100644 index 000000000000..23153c13d1ce --- /dev/null +++ b/arch/arm64/include/asm/trans_pgd.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Copyright (c) 2020, Microsoft Corporation. + * Pavel Tatashin + */ + +#ifndef _ASM_TRANS_TABLE_H +#define _ASM_TRANS_TABLE_H + +#include +#include +#include + +int trans_pgd_create_copy(pgd_t **dst_pgdp, unsigned long start, + unsigned long end); + +int trans_pgd_map_page(pgd_t *trans_pgd, void *page, unsigned long dst_addr, + pgprot_t pgprot); + +#endif /* _ASM_TRANS_TABLE_H */ diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c index 0a54d81c90f9..4a38662f0d90 100644 --- a/arch/arm64/kernel/hibernate.c +++ b/arch/arm64/kernel/hibernate.c @@ -16,7 +16,6 @@ #define pr_fmt(x) "hibernate: " x #include #include -#include #include #include #include @@ -31,13 +30,12 @@ #include #include #include -#include -#include #include #include #include #include #include +#include #include /* @@ -178,54 +176,6 @@ int arch_hibernation_header_restore(void *addr) } EXPORT_SYMBOL(arch_hibernation_header_restore); -static int trans_pgd_map_page(pgd_t *trans_pgd, void *page, - unsigned long dst_addr, - pgprot_t pgprot) -{ - pgd_t *pgdp; - p4d_t *p4dp; - pud_t *pudp; - pmd_t *pmdp; - pte_t *ptep; - - pgdp = pgd_offset_pgd(trans_pgd, dst_addr); - if (pgd_none(READ_ONCE(*pgdp))) { - p4dp = (void *)get_safe_page(GFP_ATOMIC); - if (!pgdp) - return -ENOMEM; - pgd_populate(&init_mm, pgdp, p4dp); - } - - p4dp = p4d_offset(pgdp, dst_addr); - if (p4d_none(READ_ONCE(*p4dp))) { - pudp = (void *)get_safe_page(GFP_ATOMIC); - if (!pudp) - return -ENOMEM; - p4d_populate(&init_mm, p4dp, pudp); - } - - pudp = pud_offset(p4dp, dst_addr); - if (pud_none(READ_ONCE(*pudp))) { - pmdp = (void *)get_safe_page(GFP_ATOMIC); - if (!pmdp) - return -ENOMEM; - pud_populate(&init_mm, pudp, pmdp); - } - - pmdp = pmd_offset(pudp, dst_addr); - if (pmd_none(READ_ONCE(*pmdp))) { - ptep = (void *)get_safe_page(GFP_ATOMIC); - if (!ptep) - return -ENOMEM; - pmd_populate_kernel(&init_mm, pmdp, ptep); - } - - ptep = pte_offset_kernel(pmdp, dst_addr); - set_pte(ptep, pfn_pte(virt_to_pfn(page), PAGE_KERNEL_EXEC)); - - return 0; -} - /* * Copies length bytes, starting at src_start into an new page, * perform cache maintenance, then maps it at the specified address low @@ -462,182 +412,6 @@ int swsusp_arch_suspend(void) return ret; } -static void _copy_pte(pte_t *dst_ptep, pte_t *src_ptep, unsigned long addr) -{ - pte_t pte = READ_ONCE(*src_ptep); - - if (pte_valid(pte)) { - /* - * Resume will overwrite areas that may be marked - * read only (code, rodata). Clear the RDONLY bit from - * the temporary mappings we use during restore. - */ - set_pte(dst_ptep, pte_mkwrite(pte)); - } else if (debug_pagealloc_enabled() && !pte_none(pte)) { - /* - * debug_pagealloc will removed the PTE_VALID bit if - * the page isn't in use by the resume kernel. It may have - * been in use by the original kernel, in which case we need - * to put it back in our copy to do the restore. - * - * Before marking this entry valid, check the pfn should - * be mapped. - */ - BUG_ON(!pfn_valid(pte_pfn(pte))); - - set_pte(dst_ptep, pte_mkpresent(pte_mkwrite(pte))); - } -} - -static int copy_pte(pmd_t *dst_pmdp, pmd_t *src_pmdp, unsigned long start, - unsigned long end) -{ - pte_t *src_ptep; - pte_t *dst_ptep; - unsigned long addr = start; - - dst_ptep = (pte_t *)get_safe_page(GFP_ATOMIC); - if (!dst_ptep) - return -ENOMEM; - pmd_populate_kernel(&init_mm, dst_pmdp, dst_ptep); - dst_ptep = pte_offset_kernel(dst_pmdp, start); - - src_ptep = pte_offset_kernel(src_pmdp, start); - do { - _copy_pte(dst_ptep, src_ptep, addr); - } while (dst_ptep++, src_ptep++, addr += PAGE_SIZE, addr != end); - - return 0; -} - -static int copy_pmd(pud_t *dst_pudp, pud_t *src_pudp, unsigned long start, - unsigned long end) -{ - pmd_t *src_pmdp; - pmd_t *dst_pmdp; - unsigned long next; - unsigned long addr = start; - - if (pud_none(READ_ONCE(*dst_pudp))) { - dst_pmdp = (pmd_t *)get_safe_page(GFP_ATOMIC); - if (!dst_pmdp) - return -ENOMEM; - pud_populate(&init_mm, dst_pudp, dst_pmdp); - } - dst_pmdp = pmd_offset(dst_pudp, start); - - src_pmdp = pmd_offset(src_pudp, start); - do { - pmd_t pmd = READ_ONCE(*src_pmdp); - - next = pmd_addr_end(addr, end); - if (pmd_none(pmd)) - continue; - if (pmd_table(pmd)) { - if (copy_pte(dst_pmdp, src_pmdp, addr, next)) - return -ENOMEM; - } else { - set_pmd(dst_pmdp, - __pmd(pmd_val(pmd) & ~PMD_SECT_RDONLY)); - } - } while (dst_pmdp++, src_pmdp++, addr = next, addr != end); - - return 0; -} - -static int copy_pud(p4d_t *dst_p4dp, p4d_t *src_p4dp, unsigned long start, - unsigned long end) -{ - pud_t *dst_pudp; - pud_t *src_pudp; - unsigned long next; - unsigned long addr = start; - - if (p4d_none(READ_ONCE(*dst_p4dp))) { - dst_pudp = (pud_t *)get_safe_page(GFP_ATOMIC); - if (!dst_pudp) - return -ENOMEM; - p4d_populate(&init_mm, dst_p4dp, dst_pudp); - } - dst_pudp = pud_offset(dst_p4dp, start); - - src_pudp = pud_offset(src_p4dp, start); - do { - pud_t pud = READ_ONCE(*src_pudp); - - next = pud_addr_end(addr, end); - if (pud_none(pud)) - continue; - if (pud_table(pud)) { - if (copy_pmd(dst_pudp, src_pudp, addr, next)) - return -ENOMEM; - } else { - set_pud(dst_pudp, - __pud(pud_val(pud) & ~PUD_SECT_RDONLY)); - } - } while (dst_pudp++, src_pudp++, addr = next, addr != end); - - return 0; -} - -static int copy_p4d(pgd_t *dst_pgdp, pgd_t *src_pgdp, unsigned long start, - unsigned long end) -{ - p4d_t *dst_p4dp; - p4d_t *src_p4dp; - unsigned long next; - unsigned long addr = start; - - dst_p4dp = p4d_offset(dst_pgdp, start); - src_p4dp = p4d_offset(src_pgdp, start); - do { - next = p4d_addr_end(addr, end); - if (p4d_none(READ_ONCE(*src_p4dp))) - continue; - if (copy_pud(dst_p4dp, src_p4dp, addr, next)) - return -ENOMEM; - } while (dst_p4dp++, src_p4dp++, addr = next, addr != end); - - return 0; -} - -static int copy_page_tables(pgd_t *dst_pgdp, unsigned long start, - unsigned long end) -{ - unsigned long next; - unsigned long addr = start; - pgd_t *src_pgdp = pgd_offset_k(start); - - dst_pgdp = pgd_offset_pgd(dst_pgdp, start); - do { - next = pgd_addr_end(addr, end); - if (pgd_none(READ_ONCE(*src_pgdp))) - continue; - if (copy_p4d(dst_pgdp, src_pgdp, addr, next)) - return -ENOMEM; - } while (dst_pgdp++, src_pgdp++, addr = next, addr != end); - - return 0; -} - -static int trans_pgd_create_copy(pgd_t **dst_pgdp, unsigned long start, - unsigned long end) -{ - int rc; - pgd_t *trans_pgd = (pgd_t *)get_safe_page(GFP_ATOMIC); - - if (!trans_pgd) { - pr_err("Failed to allocate memory for temporary page tables.\n"); - return -ENOMEM; - } - - rc = copy_page_tables(trans_pgd, start, end); - if (!rc) - *dst_pgdp = trans_pgd; - - return rc; -} - /* * Setup then Resume from the hibernate image using swsusp_arch_suspend_exit(). * diff --git a/arch/arm64/mm/Makefile b/arch/arm64/mm/Makefile index 5ead3c3de3b6..77222d92667a 100644 --- a/arch/arm64/mm/Makefile +++ b/arch/arm64/mm/Makefile @@ -6,6 +6,7 @@ obj-y := dma-mapping.o extable.o fault.o init.o \ obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o obj-$(CONFIG_PTDUMP_CORE) += ptdump.o obj-$(CONFIG_PTDUMP_DEBUGFS) += ptdump_debugfs.o +obj-$(CONFIG_TRANS_TABLE) += trans_pgd.o obj-$(CONFIG_NUMA) += numa.o obj-$(CONFIG_DEBUG_VIRTUAL) += physaddr.o obj-$(CONFIG_ARM64_MTE) += mteswap.o diff --git a/arch/arm64/mm/trans_pgd.c b/arch/arm64/mm/trans_pgd.c new file mode 100644 index 000000000000..e048d1f5c912 --- /dev/null +++ b/arch/arm64/mm/trans_pgd.c @@ -0,0 +1,250 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Transitional page tables for kexec and hibernate + * + * This file derived from: arch/arm64/kernel/hibernate.c + * + * Copyright (c) 2020, Microsoft Corporation. + * Pavel Tatashin + * + */ + +/* + * Transitional tables are used during system transferring from one world to + * another: such as during hibernate restore, and kexec reboots. During these + * phases one cannot rely on page table not being overwritten. This is because + * hibernate and kexec can overwrite the current page tables during transition. + */ + +#include +#include +#include +#include +#include +#include +#include + +static void _copy_pte(pte_t *dst_ptep, pte_t *src_ptep, unsigned long addr) +{ + pte_t pte = READ_ONCE(*src_ptep); + + if (pte_valid(pte)) { + /* + * Resume will overwrite areas that may be marked + * read only (code, rodata). Clear the RDONLY bit from + * the temporary mappings we use during restore. + */ + set_pte(dst_ptep, pte_mkwrite(pte)); + } else if (debug_pagealloc_enabled() && !pte_none(pte)) { + /* + * debug_pagealloc will removed the PTE_VALID bit if + * the page isn't in use by the resume kernel. It may have + * been in use by the original kernel, in which case we need + * to put it back in our copy to do the restore. + * + * Before marking this entry valid, check the pfn should + * be mapped. + */ + BUG_ON(!pfn_valid(pte_pfn(pte))); + + set_pte(dst_ptep, pte_mkpresent(pte_mkwrite(pte))); + } +} + +static int copy_pte(pmd_t *dst_pmdp, pmd_t *src_pmdp, unsigned long start, + unsigned long end) +{ + pte_t *src_ptep; + pte_t *dst_ptep; + unsigned long addr = start; + + dst_ptep = (pte_t *)get_safe_page(GFP_ATOMIC); + if (!dst_ptep) + return -ENOMEM; + pmd_populate_kernel(&init_mm, dst_pmdp, dst_ptep); + dst_ptep = pte_offset_kernel(dst_pmdp, start); + + src_ptep = pte_offset_kernel(src_pmdp, start); + do { + _copy_pte(dst_ptep, src_ptep, addr); + } while (dst_ptep++, src_ptep++, addr += PAGE_SIZE, addr != end); + + return 0; +} + +static int copy_pmd(pud_t *dst_pudp, pud_t *src_pudp, unsigned long start, + unsigned long end) +{ + pmd_t *src_pmdp; + pmd_t *dst_pmdp; + unsigned long next; + unsigned long addr = start; + + if (pud_none(READ_ONCE(*dst_pudp))) { + dst_pmdp = (pmd_t *)get_safe_page(GFP_ATOMIC); + if (!dst_pmdp) + return -ENOMEM; + pud_populate(&init_mm, dst_pudp, dst_pmdp); + } + dst_pmdp = pmd_offset(dst_pudp, start); + + src_pmdp = pmd_offset(src_pudp, start); + do { + pmd_t pmd = READ_ONCE(*src_pmdp); + + next = pmd_addr_end(addr, end); + if (pmd_none(pmd)) + continue; + if (pmd_table(pmd)) { + if (copy_pte(dst_pmdp, src_pmdp, addr, next)) + return -ENOMEM; + } else { + set_pmd(dst_pmdp, + __pmd(pmd_val(pmd) & ~PMD_SECT_RDONLY)); + } + } while (dst_pmdp++, src_pmdp++, addr = next, addr != end); + + return 0; +} + +static int copy_pud(p4d_t *dst_p4dp, p4d_t *src_p4dp, unsigned long start, + unsigned long end) +{ + pud_t *dst_pudp; + pud_t *src_pudp; + unsigned long next; + unsigned long addr = start; + + if (p4d_none(READ_ONCE(*dst_p4dp))) { + dst_pudp = (pud_t *)get_safe_page(GFP_ATOMIC); + if (!dst_pudp) + return -ENOMEM; + p4d_populate(&init_mm, dst_p4dp, dst_pudp); + } + dst_pudp = pud_offset(dst_p4dp, start); + + src_pudp = pud_offset(src_p4dp, start); + do { + pud_t pud = READ_ONCE(*src_pudp); + + next = pud_addr_end(addr, end); + if (pud_none(pud)) + continue; + if (pud_table(pud)) { + if (copy_pmd(dst_pudp, src_pudp, addr, next)) + return -ENOMEM; + } else { + set_pud(dst_pudp, + __pud(pud_val(pud) & ~PUD_SECT_RDONLY)); + } + } while (dst_pudp++, src_pudp++, addr = next, addr != end); + + return 0; +} + +static int copy_p4d(pgd_t *dst_pgdp, pgd_t *src_pgdp, unsigned long start, + unsigned long end) +{ + p4d_t *dst_p4dp; + p4d_t *src_p4dp; + unsigned long next; + unsigned long addr = start; + + dst_p4dp = p4d_offset(dst_pgdp, start); + src_p4dp = p4d_offset(src_pgdp, start); + do { + next = p4d_addr_end(addr, end); + if (p4d_none(READ_ONCE(*src_p4dp))) + continue; + if (copy_pud(dst_p4dp, src_p4dp, addr, next)) + return -ENOMEM; + } while (dst_p4dp++, src_p4dp++, addr = next, addr != end); + + return 0; +} + +static int copy_page_tables(pgd_t *dst_pgdp, unsigned long start, + unsigned long end) +{ + unsigned long next; + unsigned long addr = start; + pgd_t *src_pgdp = pgd_offset_k(start); + + dst_pgdp = pgd_offset_pgd(dst_pgdp, start); + do { + next = pgd_addr_end(addr, end); + if (pgd_none(READ_ONCE(*src_pgdp))) + continue; + if (copy_p4d(dst_pgdp, src_pgdp, addr, next)) + return -ENOMEM; + } while (dst_pgdp++, src_pgdp++, addr = next, addr != end); + + return 0; +} + +int trans_pgd_create_copy(pgd_t **dst_pgdp, unsigned long start, + unsigned long end) +{ + int rc; + pgd_t *trans_pgd = (pgd_t *)get_safe_page(GFP_ATOMIC); + + if (!trans_pgd) { + pr_err("Failed to allocate memory for temporary page tables.\n"); + return -ENOMEM; + } + + rc = copy_page_tables(trans_pgd, start, end); + if (!rc) + *dst_pgdp = trans_pgd; + + return rc; +} + +int trans_pgd_map_page(pgd_t *trans_pgd, void *page, + unsigned long dst_addr, + pgprot_t pgprot) +{ + pgd_t *pgdp; + p4d_t *p4dp; + pud_t *pudp; + pmd_t *pmdp; + pte_t *ptep; + + pgdp = pgd_offset_pgd(trans_pgd, dst_addr); + if (pgd_none(READ_ONCE(*pgdp))) { + p4dp = (void *)get_safe_page(GFP_ATOMIC); + if (!pgdp) + return -ENOMEM; + pgd_populate(&init_mm, pgdp, p4dp); + } + + p4dp = p4d_offset(pgdp, dst_addr); + if (p4d_none(READ_ONCE(*p4dp))) { + pudp = (void *)get_safe_page(GFP_ATOMIC); + if (!pudp) + return -ENOMEM; + p4d_populate(&init_mm, p4dp, pudp); + } + + pudp = pud_offset(p4dp, dst_addr); + if (pud_none(READ_ONCE(*pudp))) { + pmdp = (void *)get_safe_page(GFP_ATOMIC); + if (!pmdp) + return -ENOMEM; + pud_populate(&init_mm, pudp, pmdp); + } + + pmdp = pmd_offset(pudp, dst_addr); + if (pmd_none(READ_ONCE(*pmdp))) { + ptep = (void *)get_safe_page(GFP_ATOMIC); + if (!ptep) + return -ENOMEM; + pmd_populate_kernel(&init_mm, pmdp, ptep); + } + + ptep = pte_offset_kernel(pmdp, dst_addr); + set_pte(ptep, pfn_pte(virt_to_pfn(page), PAGE_KERNEL_EXEC)); + + return 0; +} From 50f53fb721817a6efa541cca24f1b7caa84801c1 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Mon, 25 Jan 2021 14:19:09 -0500 Subject: [PATCH 037/108] arm64: trans_pgd: make trans_pgd_map_page generic kexec is going to use a different allocator, so make trans_pgd_map_page to accept allocator as an argument, and also kexec is going to use a different map protection, so also pass it via argument. Signed-off-by: Pavel Tatashin Reviewed-by: Matthias Brugger Link: https://lore.kernel.org/r/20210125191923.1060122-5-pasha.tatashin@soleen.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/trans_pgd.h | 19 +++++++++++++++++-- arch/arm64/kernel/hibernate.c | 12 +++++++++++- arch/arm64/mm/trans_pgd.c | 30 ++++++++++++++++++++++-------- 3 files changed, 50 insertions(+), 11 deletions(-) diff --git a/arch/arm64/include/asm/trans_pgd.h b/arch/arm64/include/asm/trans_pgd.h index 23153c13d1ce..b46409b25234 100644 --- a/arch/arm64/include/asm/trans_pgd.h +++ b/arch/arm64/include/asm/trans_pgd.h @@ -12,10 +12,25 @@ #include #include +/* + * trans_alloc_page + * - Allocator that should return exactly one zeroed page, if this + * allocator fails, trans_pgd_create_copy() and trans_pgd_map_page() + * return -ENOMEM error. + * + * trans_alloc_arg + * - Passed to trans_alloc_page as an argument + */ + +struct trans_pgd_info { + void * (*trans_alloc_page)(void *arg); + void *trans_alloc_arg; +}; + int trans_pgd_create_copy(pgd_t **dst_pgdp, unsigned long start, unsigned long end); -int trans_pgd_map_page(pgd_t *trans_pgd, void *page, unsigned long dst_addr, - pgprot_t pgprot); +int trans_pgd_map_page(struct trans_pgd_info *info, pgd_t *trans_pgd, + void *page, unsigned long dst_addr, pgprot_t pgprot); #endif /* _ASM_TRANS_TABLE_H */ diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c index 4a38662f0d90..c173f280bfea 100644 --- a/arch/arm64/kernel/hibernate.c +++ b/arch/arm64/kernel/hibernate.c @@ -176,6 +176,11 @@ int arch_hibernation_header_restore(void *addr) } EXPORT_SYMBOL(arch_hibernation_header_restore); +static void *hibernate_page_alloc(void *arg) +{ + return (void *)get_safe_page((gfp_t)(unsigned long)arg); +} + /* * Copies length bytes, starting at src_start into an new page, * perform cache maintenance, then maps it at the specified address low @@ -192,6 +197,11 @@ static int create_safe_exec_page(void *src_start, size_t length, unsigned long dst_addr, phys_addr_t *phys_dst_addr) { + struct trans_pgd_info trans_info = { + .trans_alloc_page = hibernate_page_alloc, + .trans_alloc_arg = (void *)GFP_ATOMIC, + }; + void *page = (void *)get_safe_page(GFP_ATOMIC); pgd_t *trans_pgd; int rc; @@ -206,7 +216,7 @@ static int create_safe_exec_page(void *src_start, size_t length, if (!trans_pgd) return -ENOMEM; - rc = trans_pgd_map_page(trans_pgd, page, dst_addr, + rc = trans_pgd_map_page(&trans_info, trans_pgd, page, dst_addr, PAGE_KERNEL_EXEC); if (rc) return rc; diff --git a/arch/arm64/mm/trans_pgd.c b/arch/arm64/mm/trans_pgd.c index e048d1f5c912..f28eceba2242 100644 --- a/arch/arm64/mm/trans_pgd.c +++ b/arch/arm64/mm/trans_pgd.c @@ -25,6 +25,11 @@ #include #include +static void *trans_alloc(struct trans_pgd_info *info) +{ + return info->trans_alloc_page(info->trans_alloc_arg); +} + static void _copy_pte(pte_t *dst_ptep, pte_t *src_ptep, unsigned long addr) { pte_t pte = READ_ONCE(*src_ptep); @@ -201,9 +206,18 @@ int trans_pgd_create_copy(pgd_t **dst_pgdp, unsigned long start, return rc; } -int trans_pgd_map_page(pgd_t *trans_pgd, void *page, - unsigned long dst_addr, - pgprot_t pgprot) +/* + * Add map entry to trans_pgd for a base-size page at PTE level. + * info: contains allocator and its argument + * trans_pgd: page table in which new map is added. + * page: page to be mapped. + * dst_addr: new VA address for the page + * pgprot: protection for the page. + * + * Returns 0 on success, and -ENOMEM on failure. + */ +int trans_pgd_map_page(struct trans_pgd_info *info, pgd_t *trans_pgd, + void *page, unsigned long dst_addr, pgprot_t pgprot) { pgd_t *pgdp; p4d_t *p4dp; @@ -213,7 +227,7 @@ int trans_pgd_map_page(pgd_t *trans_pgd, void *page, pgdp = pgd_offset_pgd(trans_pgd, dst_addr); if (pgd_none(READ_ONCE(*pgdp))) { - p4dp = (void *)get_safe_page(GFP_ATOMIC); + p4dp = trans_alloc(info); if (!pgdp) return -ENOMEM; pgd_populate(&init_mm, pgdp, p4dp); @@ -221,7 +235,7 @@ int trans_pgd_map_page(pgd_t *trans_pgd, void *page, p4dp = p4d_offset(pgdp, dst_addr); if (p4d_none(READ_ONCE(*p4dp))) { - pudp = (void *)get_safe_page(GFP_ATOMIC); + pudp = trans_alloc(info); if (!pudp) return -ENOMEM; p4d_populate(&init_mm, p4dp, pudp); @@ -229,7 +243,7 @@ int trans_pgd_map_page(pgd_t *trans_pgd, void *page, pudp = pud_offset(p4dp, dst_addr); if (pud_none(READ_ONCE(*pudp))) { - pmdp = (void *)get_safe_page(GFP_ATOMIC); + pmdp = trans_alloc(info); if (!pmdp) return -ENOMEM; pud_populate(&init_mm, pudp, pmdp); @@ -237,14 +251,14 @@ int trans_pgd_map_page(pgd_t *trans_pgd, void *page, pmdp = pmd_offset(pudp, dst_addr); if (pmd_none(READ_ONCE(*pmdp))) { - ptep = (void *)get_safe_page(GFP_ATOMIC); + ptep = trans_alloc(info); if (!ptep) return -ENOMEM; pmd_populate_kernel(&init_mm, pmdp, ptep); } ptep = pte_offset_kernel(pmdp, dst_addr); - set_pte(ptep, pfn_pte(virt_to_pfn(page), PAGE_KERNEL_EXEC)); + set_pte(ptep, pfn_pte(virt_to_pfn(page), pgprot)); return 0; } From 89d1410f4af55c621395548355f3520415d2bcff Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Mon, 25 Jan 2021 14:19:10 -0500 Subject: [PATCH 038/108] arm64: trans_pgd: pass allocator trans_pgd_create_copy Make trans_pgd_create_copy and its subroutines to use allocator that is passed as an argument Signed-off-by: Pavel Tatashin Reviewed-by: James Morse Link: https://lore.kernel.org/r/20210125191923.1060122-6-pasha.tatashin@soleen.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/trans_pgd.h | 4 +-- arch/arm64/kernel/hibernate.c | 7 ++++- arch/arm64/mm/trans_pgd.c | 49 ++++++++++++++++++------------ 3 files changed, 38 insertions(+), 22 deletions(-) diff --git a/arch/arm64/include/asm/trans_pgd.h b/arch/arm64/include/asm/trans_pgd.h index b46409b25234..7fbf6a3ccff7 100644 --- a/arch/arm64/include/asm/trans_pgd.h +++ b/arch/arm64/include/asm/trans_pgd.h @@ -27,8 +27,8 @@ struct trans_pgd_info { void *trans_alloc_arg; }; -int trans_pgd_create_copy(pgd_t **dst_pgdp, unsigned long start, - unsigned long end); +int trans_pgd_create_copy(struct trans_pgd_info *info, pgd_t **trans_pgd, + unsigned long start, unsigned long end); int trans_pgd_map_page(struct trans_pgd_info *info, pgd_t *trans_pgd, void *page, unsigned long dst_addr, pgprot_t pgprot); diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c index c173f280bfea..94fc275cdd21 100644 --- a/arch/arm64/kernel/hibernate.c +++ b/arch/arm64/kernel/hibernate.c @@ -437,13 +437,18 @@ int swsusp_arch_resume(void) phys_addr_t phys_hibernate_exit; void __noreturn (*hibernate_exit)(phys_addr_t, phys_addr_t, void *, void *, phys_addr_t, phys_addr_t); + struct trans_pgd_info trans_info = { + .trans_alloc_page = hibernate_page_alloc, + .trans_alloc_arg = (void *)GFP_ATOMIC, + }; /* * Restoring the memory image will overwrite the ttbr1 page tables. * Create a second copy of just the linear map, and use this when * restoring. */ - rc = trans_pgd_create_copy(&tmp_pg_dir, PAGE_OFFSET, PAGE_END); + rc = trans_pgd_create_copy(&trans_info, &tmp_pg_dir, PAGE_OFFSET, + PAGE_END); if (rc) return rc; diff --git a/arch/arm64/mm/trans_pgd.c b/arch/arm64/mm/trans_pgd.c index f28eceba2242..47b6b7029907 100644 --- a/arch/arm64/mm/trans_pgd.c +++ b/arch/arm64/mm/trans_pgd.c @@ -57,14 +57,14 @@ static void _copy_pte(pte_t *dst_ptep, pte_t *src_ptep, unsigned long addr) } } -static int copy_pte(pmd_t *dst_pmdp, pmd_t *src_pmdp, unsigned long start, - unsigned long end) +static int copy_pte(struct trans_pgd_info *info, pmd_t *dst_pmdp, + pmd_t *src_pmdp, unsigned long start, unsigned long end) { pte_t *src_ptep; pte_t *dst_ptep; unsigned long addr = start; - dst_ptep = (pte_t *)get_safe_page(GFP_ATOMIC); + dst_ptep = trans_alloc(info); if (!dst_ptep) return -ENOMEM; pmd_populate_kernel(&init_mm, dst_pmdp, dst_ptep); @@ -78,8 +78,8 @@ static int copy_pte(pmd_t *dst_pmdp, pmd_t *src_pmdp, unsigned long start, return 0; } -static int copy_pmd(pud_t *dst_pudp, pud_t *src_pudp, unsigned long start, - unsigned long end) +static int copy_pmd(struct trans_pgd_info *info, pud_t *dst_pudp, + pud_t *src_pudp, unsigned long start, unsigned long end) { pmd_t *src_pmdp; pmd_t *dst_pmdp; @@ -87,7 +87,7 @@ static int copy_pmd(pud_t *dst_pudp, pud_t *src_pudp, unsigned long start, unsigned long addr = start; if (pud_none(READ_ONCE(*dst_pudp))) { - dst_pmdp = (pmd_t *)get_safe_page(GFP_ATOMIC); + dst_pmdp = trans_alloc(info); if (!dst_pmdp) return -ENOMEM; pud_populate(&init_mm, dst_pudp, dst_pmdp); @@ -102,7 +102,7 @@ static int copy_pmd(pud_t *dst_pudp, pud_t *src_pudp, unsigned long start, if (pmd_none(pmd)) continue; if (pmd_table(pmd)) { - if (copy_pte(dst_pmdp, src_pmdp, addr, next)) + if (copy_pte(info, dst_pmdp, src_pmdp, addr, next)) return -ENOMEM; } else { set_pmd(dst_pmdp, @@ -113,7 +113,8 @@ static int copy_pmd(pud_t *dst_pudp, pud_t *src_pudp, unsigned long start, return 0; } -static int copy_pud(p4d_t *dst_p4dp, p4d_t *src_p4dp, unsigned long start, +static int copy_pud(struct trans_pgd_info *info, p4d_t *dst_p4dp, + p4d_t *src_p4dp, unsigned long start, unsigned long end) { pud_t *dst_pudp; @@ -122,7 +123,7 @@ static int copy_pud(p4d_t *dst_p4dp, p4d_t *src_p4dp, unsigned long start, unsigned long addr = start; if (p4d_none(READ_ONCE(*dst_p4dp))) { - dst_pudp = (pud_t *)get_safe_page(GFP_ATOMIC); + dst_pudp = trans_alloc(info); if (!dst_pudp) return -ENOMEM; p4d_populate(&init_mm, dst_p4dp, dst_pudp); @@ -137,7 +138,7 @@ static int copy_pud(p4d_t *dst_p4dp, p4d_t *src_p4dp, unsigned long start, if (pud_none(pud)) continue; if (pud_table(pud)) { - if (copy_pmd(dst_pudp, src_pudp, addr, next)) + if (copy_pmd(info, dst_pudp, src_pudp, addr, next)) return -ENOMEM; } else { set_pud(dst_pudp, @@ -148,7 +149,8 @@ static int copy_pud(p4d_t *dst_p4dp, p4d_t *src_p4dp, unsigned long start, return 0; } -static int copy_p4d(pgd_t *dst_pgdp, pgd_t *src_pgdp, unsigned long start, +static int copy_p4d(struct trans_pgd_info *info, pgd_t *dst_pgdp, + pgd_t *src_pgdp, unsigned long start, unsigned long end) { p4d_t *dst_p4dp; @@ -162,15 +164,15 @@ static int copy_p4d(pgd_t *dst_pgdp, pgd_t *src_pgdp, unsigned long start, next = p4d_addr_end(addr, end); if (p4d_none(READ_ONCE(*src_p4dp))) continue; - if (copy_pud(dst_p4dp, src_p4dp, addr, next)) + if (copy_pud(info, dst_p4dp, src_p4dp, addr, next)) return -ENOMEM; } while (dst_p4dp++, src_p4dp++, addr = next, addr != end); return 0; } -static int copy_page_tables(pgd_t *dst_pgdp, unsigned long start, - unsigned long end) +static int copy_page_tables(struct trans_pgd_info *info, pgd_t *dst_pgdp, + unsigned long start, unsigned long end) { unsigned long next; unsigned long addr = start; @@ -181,25 +183,34 @@ static int copy_page_tables(pgd_t *dst_pgdp, unsigned long start, next = pgd_addr_end(addr, end); if (pgd_none(READ_ONCE(*src_pgdp))) continue; - if (copy_p4d(dst_pgdp, src_pgdp, addr, next)) + if (copy_p4d(info, dst_pgdp, src_pgdp, addr, next)) return -ENOMEM; } while (dst_pgdp++, src_pgdp++, addr = next, addr != end); return 0; } -int trans_pgd_create_copy(pgd_t **dst_pgdp, unsigned long start, - unsigned long end) +/* + * Create trans_pgd and copy linear map. + * info: contains allocator and its argument + * dst_pgdp: new page table that is created, and to which map is copied. + * start: Start of the interval (inclusive). + * end: End of the interval (exclusive). + * + * Returns 0 on success, and -ENOMEM on failure. + */ +int trans_pgd_create_copy(struct trans_pgd_info *info, pgd_t **dst_pgdp, + unsigned long start, unsigned long end) { int rc; - pgd_t *trans_pgd = (pgd_t *)get_safe_page(GFP_ATOMIC); + pgd_t *trans_pgd = trans_alloc(info); if (!trans_pgd) { pr_err("Failed to allocate memory for temporary page tables.\n"); return -ENOMEM; } - rc = copy_page_tables(trans_pgd, start, end); + rc = copy_page_tables(info, trans_pgd, start, end); if (!rc) *dst_pgdp = trans_pgd; From 5de59884ac0ec56007e4aa8226ee259aa669be80 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Mon, 25 Jan 2021 14:19:11 -0500 Subject: [PATCH 039/108] arm64: trans_pgd: pass NULL instead of init_mm to *_populate functions trans_pgd_* should be independent from mm context because the tables that are created by this code are used when there are no mm context around, as it is between kernels. Simply replace mm_init's with NULL. Signed-off-by: Pavel Tatashin Acked-by: James Morse Link: https://lore.kernel.org/r/20210125191923.1060122-7-pasha.tatashin@soleen.com Signed-off-by: Will Deacon --- arch/arm64/mm/trans_pgd.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/arm64/mm/trans_pgd.c b/arch/arm64/mm/trans_pgd.c index 47b6b7029907..ded8e2ba0308 100644 --- a/arch/arm64/mm/trans_pgd.c +++ b/arch/arm64/mm/trans_pgd.c @@ -67,7 +67,7 @@ static int copy_pte(struct trans_pgd_info *info, pmd_t *dst_pmdp, dst_ptep = trans_alloc(info); if (!dst_ptep) return -ENOMEM; - pmd_populate_kernel(&init_mm, dst_pmdp, dst_ptep); + pmd_populate_kernel(NULL, dst_pmdp, dst_ptep); dst_ptep = pte_offset_kernel(dst_pmdp, start); src_ptep = pte_offset_kernel(src_pmdp, start); @@ -90,7 +90,7 @@ static int copy_pmd(struct trans_pgd_info *info, pud_t *dst_pudp, dst_pmdp = trans_alloc(info); if (!dst_pmdp) return -ENOMEM; - pud_populate(&init_mm, dst_pudp, dst_pmdp); + pud_populate(NULL, dst_pudp, dst_pmdp); } dst_pmdp = pmd_offset(dst_pudp, start); @@ -126,7 +126,7 @@ static int copy_pud(struct trans_pgd_info *info, p4d_t *dst_p4dp, dst_pudp = trans_alloc(info); if (!dst_pudp) return -ENOMEM; - p4d_populate(&init_mm, dst_p4dp, dst_pudp); + p4d_populate(NULL, dst_p4dp, dst_pudp); } dst_pudp = pud_offset(dst_p4dp, start); @@ -241,7 +241,7 @@ int trans_pgd_map_page(struct trans_pgd_info *info, pgd_t *trans_pgd, p4dp = trans_alloc(info); if (!pgdp) return -ENOMEM; - pgd_populate(&init_mm, pgdp, p4dp); + pgd_populate(NULL, pgdp, p4dp); } p4dp = p4d_offset(pgdp, dst_addr); @@ -249,7 +249,7 @@ int trans_pgd_map_page(struct trans_pgd_info *info, pgd_t *trans_pgd, pudp = trans_alloc(info); if (!pudp) return -ENOMEM; - p4d_populate(&init_mm, p4dp, pudp); + p4d_populate(NULL, p4dp, pudp); } pudp = pud_offset(p4dp, dst_addr); @@ -257,7 +257,7 @@ int trans_pgd_map_page(struct trans_pgd_info *info, pgd_t *trans_pgd, pmdp = trans_alloc(info); if (!pmdp) return -ENOMEM; - pud_populate(&init_mm, pudp, pmdp); + pud_populate(NULL, pudp, pmdp); } pmdp = pmd_offset(pudp, dst_addr); @@ -265,7 +265,7 @@ int trans_pgd_map_page(struct trans_pgd_info *info, pgd_t *trans_pgd, ptep = trans_alloc(info); if (!ptep) return -ENOMEM; - pmd_populate_kernel(&init_mm, pmdp, ptep); + pmd_populate_kernel(NULL, pmdp, ptep); } ptep = pte_offset_kernel(pmdp, dst_addr); From 1401bef703a48cf79c674215f702a1435362beae Mon Sep 17 00:00:00 2001 From: James Morse Date: Mon, 25 Jan 2021 14:19:12 -0500 Subject: [PATCH 040/108] arm64: mm: Always update TCR_EL1 from __cpu_set_tcr_t0sz() Because only the idmap sets a non-standard T0SZ, __cpu_set_tcr_t0sz() can check for platforms that need to do this using __cpu_uses_extended_idmap() before doing its work. The idmap is only built with enough levels, (and T0SZ bits) to map its single page. To allow hibernate, and then kexec to idmap their single page copy routines, __cpu_set_tcr_t0sz() needs to consider additional users, who may need a different number of levels/T0SZ-bits to the idmap. (i.e. VA_BITS may be enough for the idmap, but not hibernate/kexec) Always read TCR_EL1, and check whether any work needs doing for this request. __cpu_uses_extended_idmap() remains as it is used by KVM, whose idmap is also part of the kernel image. This mostly affects the cpuidle path, where we now get an extra system register read . CC: Lorenzo Pieralisi CC: Sudeep Holla Signed-off-by: James Morse Signed-off-by: Pavel Tatashin Link: https://lore.kernel.org/r/20210125191923.1060122-8-pasha.tatashin@soleen.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/mmu_context.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index 0b3079fd28eb..70ce8c1d2b07 100644 --- a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -81,16 +81,15 @@ static inline bool __cpu_uses_extended_idmap_level(void) } /* - * Set TCR.T0SZ to its default value (based on VA_BITS) + * Ensure TCR.T0SZ is set to the provided value. */ static inline void __cpu_set_tcr_t0sz(unsigned long t0sz) { - unsigned long tcr; + unsigned long tcr = read_sysreg(tcr_el1); - if (!__cpu_uses_extended_idmap()) + if ((tcr & TCR_T0SZ_MASK) >> TCR_T0SZ_OFFSET == t0sz) return; - tcr = read_sysreg(tcr_el1); tcr &= ~TCR_T0SZ_MASK; tcr |= t0sz << TCR_T0SZ_OFFSET; write_sysreg(tcr, tcr_el1); From 7018d467ff2d6a6d3c820a7ad4e897fe2430b040 Mon Sep 17 00:00:00 2001 From: James Morse Date: Mon, 25 Jan 2021 14:19:13 -0500 Subject: [PATCH 041/108] arm64: trans_pgd: hibernate: idmap the single page that holds the copy page routines To resume from hibernate, the contents of memory are restored from the swap image. This may overwrite any page, including the running kernel and its page tables. Hibernate copies the code it uses to do the restore into a single page that it knows won't be overwritten, and maps it with page tables built from pages that won't be overwritten. Today the address it uses for this mapping is arbitrary, but to allow kexec to reuse this code, it needs to be idmapped. To idmap the page we must avoid the kernel helpers that have VA_BITS baked in. Convert create_single_mapping() to take a single PA, and idmap it. The page tables are built in the reverse order to normal using pfn_pte() to stir in any bits between 52:48. T0SZ is always increased to cover 48bits, or 52 if the copy code has bits 52:48 in its PA. Signed-off-by: James Morse [Adopted the original patch from James to trans_pgd interface, so it can be commonly used by both Kexec and Hibernate. Some minor clean-ups.] Signed-off-by: Pavel Tatashin Link: https://lore.kernel.org/linux-arm-kernel/20200115143322.214247-4-james.morse@arm.com/ Link: https://lore.kernel.org/r/20210125191923.1060122-9-pasha.tatashin@soleen.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/trans_pgd.h | 3 ++ arch/arm64/kernel/hibernate.c | 32 +++++++------------ arch/arm64/mm/trans_pgd.c | 49 ++++++++++++++++++++++++++++++ 3 files changed, 63 insertions(+), 21 deletions(-) diff --git a/arch/arm64/include/asm/trans_pgd.h b/arch/arm64/include/asm/trans_pgd.h index 7fbf6a3ccff7..5d08e5adf3d5 100644 --- a/arch/arm64/include/asm/trans_pgd.h +++ b/arch/arm64/include/asm/trans_pgd.h @@ -33,4 +33,7 @@ int trans_pgd_create_copy(struct trans_pgd_info *info, pgd_t **trans_pgd, int trans_pgd_map_page(struct trans_pgd_info *info, pgd_t *trans_pgd, void *page, unsigned long dst_addr, pgprot_t pgprot); +int trans_pgd_idmap_page(struct trans_pgd_info *info, phys_addr_t *trans_ttbr0, + unsigned long *t0sz, void *page); + #endif /* _ASM_TRANS_TABLE_H */ diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c index 94fc275cdd21..9df32ba0d574 100644 --- a/arch/arm64/kernel/hibernate.c +++ b/arch/arm64/kernel/hibernate.c @@ -194,7 +194,6 @@ static void *hibernate_page_alloc(void *arg) * page system. */ static int create_safe_exec_page(void *src_start, size_t length, - unsigned long dst_addr, phys_addr_t *phys_dst_addr) { struct trans_pgd_info trans_info = { @@ -203,7 +202,8 @@ static int create_safe_exec_page(void *src_start, size_t length, }; void *page = (void *)get_safe_page(GFP_ATOMIC); - pgd_t *trans_pgd; + phys_addr_t trans_ttbr0; + unsigned long t0sz; int rc; if (!page) @@ -211,13 +211,7 @@ static int create_safe_exec_page(void *src_start, size_t length, memcpy(page, src_start, length); __flush_icache_range((unsigned long)page, (unsigned long)page + length); - - trans_pgd = (void *)get_safe_page(GFP_ATOMIC); - if (!trans_pgd) - return -ENOMEM; - - rc = trans_pgd_map_page(&trans_info, trans_pgd, page, dst_addr, - PAGE_KERNEL_EXEC); + rc = trans_pgd_idmap_page(&trans_info, &trans_ttbr0, &t0sz, page); if (rc) return rc; @@ -230,12 +224,15 @@ static int create_safe_exec_page(void *src_start, size_t length, * page, but TLBs may contain stale ASID-tagged entries (e.g. for EFI * runtime services), while for a userspace-driven test_resume cycle it * points to userspace page tables (and we must point it at a zero page - * ourselves). Elsewhere we only (un)install the idmap with preemption - * disabled, so T0SZ should be as required regardless. + * ourselves). + * + * We change T0SZ as part of installing the idmap. This is undone by + * cpu_uninstall_idmap() in __cpu_suspend_exit(). */ cpu_set_reserved_ttbr0(); local_flush_tlb_all(); - write_sysreg(phys_to_ttbr(virt_to_phys(trans_pgd)), ttbr0_el1); + __cpu_set_tcr_t0sz(t0sz); + write_sysreg(trans_ttbr0, ttbr0_el1); isb(); *phys_dst_addr = virt_to_phys(page); @@ -434,7 +431,6 @@ int swsusp_arch_resume(void) void *zero_page; size_t exit_size; pgd_t *tmp_pg_dir; - phys_addr_t phys_hibernate_exit; void __noreturn (*hibernate_exit)(phys_addr_t, phys_addr_t, void *, void *, phys_addr_t, phys_addr_t); struct trans_pgd_info trans_info = { @@ -462,19 +458,13 @@ int swsusp_arch_resume(void) return -ENOMEM; } - /* - * Locate the exit code in the bottom-but-one page, so that *NULL - * still has disastrous affects. - */ - hibernate_exit = (void *)PAGE_SIZE; exit_size = __hibernate_exit_text_end - __hibernate_exit_text_start; /* * Copy swsusp_arch_suspend_exit() to a safe page. This will generate * a new set of ttbr0 page tables and load them. */ rc = create_safe_exec_page(__hibernate_exit_text_start, exit_size, - (unsigned long)hibernate_exit, - &phys_hibernate_exit); + (phys_addr_t *)&hibernate_exit); if (rc) { pr_err("Failed to create safe executable page for hibernate_exit code.\n"); return rc; @@ -493,7 +483,7 @@ int swsusp_arch_resume(void) * We can skip this step if we booted at EL1, or are running with VHE. */ if (el2_reset_needed()) { - phys_addr_t el2_vectors = phys_hibernate_exit; /* base */ + phys_addr_t el2_vectors = (phys_addr_t)hibernate_exit; el2_vectors += hibernate_el2_vectors - __hibernate_exit_text_start; /* offset */ diff --git a/arch/arm64/mm/trans_pgd.c b/arch/arm64/mm/trans_pgd.c index ded8e2ba0308..527f0a39c3da 100644 --- a/arch/arm64/mm/trans_pgd.c +++ b/arch/arm64/mm/trans_pgd.c @@ -273,3 +273,52 @@ int trans_pgd_map_page(struct trans_pgd_info *info, pgd_t *trans_pgd, return 0; } + +/* + * The page we want to idmap may be outside the range covered by VA_BITS that + * can be built using the kernel's p?d_populate() helpers. As a one off, for a + * single page, we build these page tables bottom up and just assume that will + * need the maximum T0SZ. + * + * Returns 0 on success, and -ENOMEM on failure. + * On success trans_ttbr0 contains page table with idmapped page, t0sz is set to + * maximum T0SZ for this page. + */ +int trans_pgd_idmap_page(struct trans_pgd_info *info, phys_addr_t *trans_ttbr0, + unsigned long *t0sz, void *page) +{ + phys_addr_t dst_addr = virt_to_phys(page); + unsigned long pfn = __phys_to_pfn(dst_addr); + int max_msb = (dst_addr & GENMASK(52, 48)) ? 51 : 47; + int bits_mapped = PAGE_SHIFT - 4; + unsigned long level_mask, prev_level_entry, *levels[4]; + int this_level, index, level_lsb, level_msb; + + dst_addr &= PAGE_MASK; + prev_level_entry = pte_val(pfn_pte(pfn, PAGE_KERNEL_EXEC)); + + for (this_level = 3; this_level >= 0; this_level--) { + levels[this_level] = trans_alloc(info); + if (!levels[this_level]) + return -ENOMEM; + + level_lsb = ARM64_HW_PGTABLE_LEVEL_SHIFT(this_level); + level_msb = min(level_lsb + bits_mapped, max_msb); + level_mask = GENMASK_ULL(level_msb, level_lsb); + + index = (dst_addr & level_mask) >> level_lsb; + *(levels[this_level] + index) = prev_level_entry; + + pfn = virt_to_pfn(levels[this_level]); + prev_level_entry = pte_val(pfn_pte(pfn, + __pgprot(PMD_TYPE_TABLE))); + + if (level_msb == max_msb) + break; + } + + *trans_ttbr0 = phys_to_ttbr(__pfn_to_phys(pfn)); + *t0sz = TCR_T0SZ(max_msb + 1); + + return 0; +} From 4c3c31230c912d8f2e49c775555aadf79a43d418 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Mon, 25 Jan 2021 14:19:14 -0500 Subject: [PATCH 042/108] arm64: kexec: move relocation function setup Currently, kernel relocation function is configured in machine_kexec() at the time of kexec reboot by using control_code_page. This operation, however, is more logical to be done during kexec_load, and thus remove from reboot time. Move, setup of this function to newly added machine_kexec_post_load(). Because once MMU is enabled, kexec control page will contain more than relocation kernel, but also vector table, add pointer to the actual function within this page arch.kern_reloc. Currently, it equals to the beginning of page, we will add offsets later, when vector table is added. Signed-off-by: Pavel Tatashin Reviewed-by: James Morse Link: https://lore.kernel.org/r/20210125191923.1060122-10-pasha.tatashin@soleen.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/kexec.h | 1 + arch/arm64/kernel/machine_kexec.c | 46 +++++++++++++------------------ 2 files changed, 20 insertions(+), 27 deletions(-) diff --git a/arch/arm64/include/asm/kexec.h b/arch/arm64/include/asm/kexec.h index 61530ec3a9b1..9befcd87e9a8 100644 --- a/arch/arm64/include/asm/kexec.h +++ b/arch/arm64/include/asm/kexec.h @@ -95,6 +95,7 @@ static inline void crash_post_resume(void) {} struct kimage_arch { void *dtb; phys_addr_t dtb_mem; + phys_addr_t kern_reloc; /* Core ELF header buffer */ void *elf_headers; unsigned long elf_headers_mem; diff --git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c index 8096a6aa1d49..a8aaa6562429 100644 --- a/arch/arm64/kernel/machine_kexec.c +++ b/arch/arm64/kernel/machine_kexec.c @@ -42,6 +42,7 @@ static void _kexec_image_info(const char *func, int line, pr_debug(" start: %lx\n", kimage->start); pr_debug(" head: %lx\n", kimage->head); pr_debug(" nr_segments: %lu\n", kimage->nr_segments); + pr_debug(" kern_reloc: %pa\n", &kimage->arch.kern_reloc); for (i = 0; i < kimage->nr_segments; i++) { pr_debug(" segment[%lu]: %016lx - %016lx, 0x%lx bytes, %lu pages\n", @@ -58,6 +59,22 @@ void machine_kexec_cleanup(struct kimage *kimage) /* Empty routine needed to avoid build errors. */ } +int machine_kexec_post_load(struct kimage *kimage) +{ + void *reloc_code = page_to_virt(kimage->control_code_page); + + memcpy(reloc_code, arm64_relocate_new_kernel, + arm64_relocate_new_kernel_size); + kimage->arch.kern_reloc = __pa(reloc_code); + + /* Flush the reloc_code in preparation for its execution. */ + __flush_dcache_area(reloc_code, arm64_relocate_new_kernel_size); + flush_icache_range((uintptr_t)reloc_code, (uintptr_t)reloc_code + + arm64_relocate_new_kernel_size); + + return 0; +} + /** * machine_kexec_prepare - Prepare for a kexec reboot. * @@ -143,8 +160,6 @@ static void kexec_segment_flush(const struct kimage *kimage) */ void machine_kexec(struct kimage *kimage) { - phys_addr_t reboot_code_buffer_phys; - void *reboot_code_buffer; bool in_kexec_crash = (kimage == kexec_crash_image); bool stuck_cpus = cpus_are_stuck_in_kernel(); @@ -155,31 +170,8 @@ void machine_kexec(struct kimage *kimage) WARN(in_kexec_crash && (stuck_cpus || smp_crash_stop_failed()), "Some CPUs may be stale, kdump will be unreliable.\n"); - reboot_code_buffer_phys = page_to_phys(kimage->control_code_page); - reboot_code_buffer = phys_to_virt(reboot_code_buffer_phys); - kexec_image_info(kimage); - /* - * Copy arm64_relocate_new_kernel to the reboot_code_buffer for use - * after the kernel is shut down. - */ - memcpy(reboot_code_buffer, arm64_relocate_new_kernel, - arm64_relocate_new_kernel_size); - - /* Flush the reboot_code_buffer in preparation for its execution. */ - __flush_dcache_area(reboot_code_buffer, arm64_relocate_new_kernel_size); - - /* - * Although we've killed off the secondary CPUs, we don't update - * the online mask if we're handling a crash kernel and consequently - * need to avoid flush_icache_range(), which will attempt to IPI - * the offline CPUs. Therefore, we must use the __* variant here. - */ - __flush_icache_range((uintptr_t)reboot_code_buffer, - (uintptr_t)reboot_code_buffer + - arm64_relocate_new_kernel_size); - /* Flush the kimage list and its buffers. */ kexec_list_flush(kimage); @@ -193,7 +185,7 @@ void machine_kexec(struct kimage *kimage) /* * cpu_soft_restart will shutdown the MMU, disable data caches, then - * transfer control to the reboot_code_buffer which contains a copy of + * transfer control to the kern_reloc which contains a copy of * the arm64_relocate_new_kernel routine. arm64_relocate_new_kernel * uses physical addressing to relocate the new image to its final * position and transfers control to the image entry point when the @@ -203,7 +195,7 @@ void machine_kexec(struct kimage *kimage) * userspace (kexec-tools). * In kexec_file case, the kernel starts directly without purgatory. */ - cpu_soft_restart(reboot_code_buffer_phys, kimage->head, kimage->start, + cpu_soft_restart(kimage->arch.kern_reloc, kimage->head, kimage->start, kimage->arch.dtb_mem); BUG(); /* Should never get here. */ From 77a43be1164852be4b761e4acaf651f986c4e8d7 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Mon, 25 Jan 2021 14:19:15 -0500 Subject: [PATCH 043/108] arm64: kexec: call kexec_image_info only once Currently, kexec_image_info() is called during load time, and right before kernel is being kexec'ed. There is no need to do both. So, call it only once when segments are loaded and the physical location of page with copy of arm64_relocate_new_kernel is known. Signed-off-by: Pavel Tatashin Acked-by: James Morse Link: https://lore.kernel.org/r/20210125191923.1060122-11-pasha.tatashin@soleen.com Signed-off-by: Will Deacon --- arch/arm64/kernel/machine_kexec.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c index a8aaa6562429..90a335c74442 100644 --- a/arch/arm64/kernel/machine_kexec.c +++ b/arch/arm64/kernel/machine_kexec.c @@ -66,6 +66,7 @@ int machine_kexec_post_load(struct kimage *kimage) memcpy(reloc_code, arm64_relocate_new_kernel, arm64_relocate_new_kernel_size); kimage->arch.kern_reloc = __pa(reloc_code); + kexec_image_info(kimage); /* Flush the reloc_code in preparation for its execution. */ __flush_dcache_area(reloc_code, arm64_relocate_new_kernel_size); @@ -84,8 +85,6 @@ int machine_kexec_post_load(struct kimage *kimage) */ int machine_kexec_prepare(struct kimage *kimage) { - kexec_image_info(kimage); - if (kimage->type != KEXEC_TYPE_CRASH && cpus_are_stuck_in_kernel()) { pr_err("Can't kexec: CPUs are stuck in the kernel.\n"); return -EBUSY; @@ -170,8 +169,6 @@ void machine_kexec(struct kimage *kimage) WARN(in_kexec_crash && (stuck_cpus || smp_crash_stop_failed()), "Some CPUs may be stale, kdump will be unreliable.\n"); - kexec_image_info(kimage); - /* Flush the kimage list and its buffers. */ kexec_list_flush(kimage); From dbd82fee0f258739272349bc87f5841fc1fb982a Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Mon, 25 Jan 2021 14:19:16 -0500 Subject: [PATCH 044/108] arm64: kexec: arm64_relocate_new_kernel clean-ups and optimizations In preparation to bigger changes to arm64_relocate_new_kernel that would enable this function to do MMU backed memory copy, do few clean-ups and optimizations. These include: 1. Call raw_dcache_line_size() only when relocation is actually going to happen. i.e. kdump type kexec, does not need it. 2. copy_page(dest, src, tmps...) increments dest and src by PAGE_SIZE, so no need to store dest prior to calling copy_page and increment it after. Also, src is not used after a copy, not need to copy either. 3. For consistency use comment on the same line with instruction when it describes the instruction itself. 4. Some comment corrections Signed-off-by: Pavel Tatashin Link: https://lore.kernel.org/r/20210125191923.1060122-12-pasha.tatashin@soleen.com Signed-off-by: Will Deacon --- arch/arm64/kernel/relocate_kernel.S | 36 +++++++---------------------- 1 file changed, 8 insertions(+), 28 deletions(-) diff --git a/arch/arm64/kernel/relocate_kernel.S b/arch/arm64/kernel/relocate_kernel.S index 84eec95ec06c..462ffbc37071 100644 --- a/arch/arm64/kernel/relocate_kernel.S +++ b/arch/arm64/kernel/relocate_kernel.S @@ -17,28 +17,24 @@ /* * arm64_relocate_new_kernel - Put a 2nd stage image in place and boot it. * - * The memory that the old kernel occupies may be overwritten when coping the + * The memory that the old kernel occupies may be overwritten when copying the * new image to its final location. To assure that the * arm64_relocate_new_kernel routine which does that copy is not overwritten, * all code and data needed by arm64_relocate_new_kernel must be between the * symbols arm64_relocate_new_kernel and arm64_relocate_new_kernel_end. The * machine_kexec() routine will copy arm64_relocate_new_kernel to the kexec - * control_code_page, a special page which has been set up to be preserved - * during the copy operation. + * safe memory that has been set up to be preserved during the copy operation. */ SYM_CODE_START(arm64_relocate_new_kernel) - /* Setup the list loop variables. */ mov x18, x2 /* x18 = dtb address */ mov x17, x1 /* x17 = kimage_start */ mov x16, x0 /* x16 = kimage_head */ - raw_dcache_line_size x15, x0 /* x15 = dcache line size */ mov x14, xzr /* x14 = entry ptr */ mov x13, xzr /* x13 = copy dest */ - /* Check if the new image needs relocation. */ tbnz x16, IND_DONE_BIT, .Ldone - + raw_dcache_line_size x15, x0 /* x15 = dcache line size */ .Lloop: and x12, x16, PAGE_MASK /* x12 = addr */ @@ -57,34 +53,18 @@ SYM_CODE_START(arm64_relocate_new_kernel) b.lo 2b dsb sy - mov x20, x13 - mov x21, x12 - copy_page x20, x21, x0, x1, x2, x3, x4, x5, x6, x7 - - /* dest += PAGE_SIZE */ - add x13, x13, PAGE_SIZE + copy_page x13, x12, x0, x1, x2, x3, x4, x5, x6, x7 b .Lnext - .Ltest_indirection: tbz x16, IND_INDIRECTION_BIT, .Ltest_destination - - /* ptr = addr */ - mov x14, x12 + mov x14, x12 /* ptr = addr */ b .Lnext - .Ltest_destination: tbz x16, IND_DESTINATION_BIT, .Lnext - - /* dest = addr */ - mov x13, x12 - + mov x13, x12 /* dest = addr */ .Lnext: - /* entry = *ptr++ */ - ldr x16, [x14], #8 - - /* while (!(entry & DONE)) */ - tbz x16, IND_DONE_BIT, .Lloop - + ldr x16, [x14], #8 /* entry = *ptr++ */ + tbz x16, IND_DONE_BIT, .Lloop /* while (!(entry & DONE)) */ .Ldone: /* wait for writes from copy_page to finish */ dsb nsh From a360190e8a42d47ea80355f286939ba82b02405a Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Mon, 25 Jan 2021 14:19:17 -0500 Subject: [PATCH 045/108] arm64: kexec: arm64_relocate_new_kernel don't use x0 as temp x0 will contain the only argument to arm64_relocate_new_kernel; don't use it as a temp. Reassigned registers to free-up x0 so we won't need to copy argument, and can use it at the beginning and at the end of the function. Signed-off-by: Pavel Tatashin Reviewed-by: James Morse Link: https://lore.kernel.org/r/20210125191923.1060122-13-pasha.tatashin@soleen.com Signed-off-by: Will Deacon --- arch/arm64/kernel/relocate_kernel.S | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/arm64/kernel/relocate_kernel.S b/arch/arm64/kernel/relocate_kernel.S index 462ffbc37071..b78ea5de97a4 100644 --- a/arch/arm64/kernel/relocate_kernel.S +++ b/arch/arm64/kernel/relocate_kernel.S @@ -34,7 +34,7 @@ SYM_CODE_START(arm64_relocate_new_kernel) mov x13, xzr /* x13 = copy dest */ /* Check if the new image needs relocation. */ tbnz x16, IND_DONE_BIT, .Ldone - raw_dcache_line_size x15, x0 /* x15 = dcache line size */ + raw_dcache_line_size x15, x1 /* x15 = dcache line size */ .Lloop: and x12, x16, PAGE_MASK /* x12 = addr */ @@ -43,17 +43,17 @@ SYM_CODE_START(arm64_relocate_new_kernel) tbz x16, IND_SOURCE_BIT, .Ltest_indirection /* Invalidate dest page to PoC. */ - mov x0, x13 - add x20, x0, #PAGE_SIZE + mov x2, x13 + add x20, x2, #PAGE_SIZE sub x1, x15, #1 - bic x0, x0, x1 -2: dc ivac, x0 - add x0, x0, x15 - cmp x0, x20 + bic x2, x2, x1 +2: dc ivac, x2 + add x2, x2, x15 + cmp x2, x20 b.lo 2b dsb sy - copy_page x13, x12, x0, x1, x2, x3, x4, x5, x6, x7 + copy_page x13, x12, x1, x2, x3, x4, x5, x6, x7, x8 b .Lnext .Ltest_indirection: tbz x16, IND_INDIRECTION_BIT, .Ltest_destination From abd562df94d19d0a9769971a35801b3f4991715d Mon Sep 17 00:00:00 2001 From: Like Xu Date: Mon, 25 Jan 2021 20:14:58 +0800 Subject: [PATCH 046/108] x86/perf: Use static_call for x86_pmu.guest_get_msrs Clean up that CONFIG_RETPOLINE crud and replace the indirect call x86_pmu.guest_get_msrs with static_call(). Reported-by: kernel test robot Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Like Xu Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210125121458.181635-1-like.xu@linux.intel.com --- arch/x86/events/core.c | 20 ++++++++++++++++++++ arch/x86/events/intel/core.c | 20 -------------------- arch/x86/include/asm/perf_event.h | 6 +----- 3 files changed, 21 insertions(+), 25 deletions(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index e37de298a495..cf0a52c80408 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -81,6 +81,8 @@ DEFINE_STATIC_CALL_NULL(x86_pmu_swap_task_ctx, *x86_pmu.swap_task_ctx); DEFINE_STATIC_CALL_NULL(x86_pmu_drain_pebs, *x86_pmu.drain_pebs); DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_aliases, *x86_pmu.pebs_aliases); +DEFINE_STATIC_CALL_NULL(x86_pmu_guest_get_msrs, *x86_pmu.guest_get_msrs); + u64 __read_mostly hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] @@ -665,6 +667,12 @@ void x86_pmu_disable_all(void) } } +struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr) +{ + return static_call(x86_pmu_guest_get_msrs)(nr); +} +EXPORT_SYMBOL_GPL(perf_guest_get_msrs); + /* * There may be PMI landing after enabled=0. The PMI hitting could be before or * after disable_all. @@ -1923,6 +1931,8 @@ static void x86_pmu_static_call_update(void) static_call_update(x86_pmu_drain_pebs, x86_pmu.drain_pebs); static_call_update(x86_pmu_pebs_aliases, x86_pmu.pebs_aliases); + + static_call_update(x86_pmu_guest_get_msrs, x86_pmu.guest_get_msrs); } static void _x86_pmu_read(struct perf_event *event) @@ -1930,6 +1940,13 @@ static void _x86_pmu_read(struct perf_event *event) x86_perf_event_update(event); } +static inline struct perf_guest_switch_msr * +perf_guest_get_msrs_nop(int *nr) +{ + *nr = 0; + return NULL; +} + static int __init init_hw_perf_events(void) { struct x86_pmu_quirk *quirk; @@ -2001,6 +2018,9 @@ static int __init init_hw_perf_events(void) if (!x86_pmu.read) x86_pmu.read = _x86_pmu_read; + if (!x86_pmu.guest_get_msrs) + x86_pmu.guest_get_msrs = perf_guest_get_msrs_nop; + x86_pmu_static_call_update(); /* diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index d4569bfa83e3..93adf53cce5f 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3680,26 +3680,6 @@ static int intel_pmu_hw_config(struct perf_event *event) return 0; } -#ifdef CONFIG_RETPOLINE -static struct perf_guest_switch_msr *core_guest_get_msrs(int *nr); -static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr); -#endif - -struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr) -{ -#ifdef CONFIG_RETPOLINE - if (x86_pmu.guest_get_msrs == intel_guest_get_msrs) - return intel_guest_get_msrs(nr); - else if (x86_pmu.guest_get_msrs == core_guest_get_msrs) - return core_guest_get_msrs(nr); -#endif - if (x86_pmu.guest_get_msrs) - return x86_pmu.guest_get_msrs(nr); - *nr = 0; - return NULL; -} -EXPORT_SYMBOL_GPL(perf_guest_get_msrs); - static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index b9a7fd0a27e2..e2a4c785e4e3 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -483,11 +483,7 @@ static inline void perf_check_microcode(void) { } extern struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr); extern int x86_perf_get_lbr(struct x86_pmu_lbr *lbr); #else -static inline struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr) -{ - *nr = 0; - return NULL; -} +struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr); static inline int x86_perf_get_lbr(struct x86_pmu_lbr *lbr) { return -1; From 3daa96d67274653b7c461b30ef9581d68e905fe1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 10 Nov 2020 16:37:51 +0100 Subject: [PATCH 047/108] perf/intel: Remove Perfmon-v4 counter_freezing support Perfmon-v4 counter freezing is fundamentally broken; remove this default disabled code to make sure nobody uses it. The feature is called Freeze-on-PMI in the SDM, and if it would do that, there wouldn't actually be a problem, *however* it does something subtly different. It globally disables the whole PMU when it raises the PMI, not when the PMI hits. This means there's a window between the PMI getting raised and the PMI actually getting served where we loose events and this violates the perf counter independence. That is, a counting event should not result in a different event count when there is a sampling event co-scheduled. This is known to break existing software (RR). Signed-off-by: Peter Zijlstra (Intel) --- .../admin-guide/kernel-parameters.txt | 6 - arch/x86/events/intel/core.c | 152 ------------------ arch/x86/events/perf_event.h | 3 +- 3 files changed, 1 insertion(+), 160 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index c722ec19cd00..628493901b02 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -944,12 +944,6 @@ causing system reset or hang due to sending INIT from AP to BSP. - perf_v4_pmi= [X86,INTEL] - Format: - Disable Intel PMU counter freezing feature. - The feature only exists starting from - Arch Perfmon v4 (Skylake and newer). - disable_ddw [PPC/PSERIES] Disable Dynamic DMA Window support. Use this to workaround buggy firmware. diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 93adf53cce5f..fe940082d49a 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2134,18 +2134,6 @@ static void intel_tfa_pmu_enable_all(int added) intel_pmu_enable_all(added); } -static void enable_counter_freeze(void) -{ - update_debugctlmsr(get_debugctlmsr() | - DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI); -} - -static void disable_counter_freeze(void) -{ - update_debugctlmsr(get_debugctlmsr() & - ~DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI); -} - static inline u64 intel_pmu_get_status(void) { u64 status; @@ -2709,95 +2697,6 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) return handled; } -static bool disable_counter_freezing = true; -static int __init intel_perf_counter_freezing_setup(char *s) -{ - bool res; - - if (kstrtobool(s, &res)) - return -EINVAL; - - disable_counter_freezing = !res; - return 1; -} -__setup("perf_v4_pmi=", intel_perf_counter_freezing_setup); - -/* - * Simplified handler for Arch Perfmon v4: - * - We rely on counter freezing/unfreezing to enable/disable the PMU. - * This is done automatically on PMU ack. - * - Ack the PMU only after the APIC. - */ - -static int intel_pmu_handle_irq_v4(struct pt_regs *regs) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - int handled = 0; - bool bts = false; - u64 status; - int pmu_enabled = cpuc->enabled; - int loops = 0; - - /* PMU has been disabled because of counter freezing */ - cpuc->enabled = 0; - if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { - bts = true; - intel_bts_disable_local(); - handled = intel_pmu_drain_bts_buffer(); - handled += intel_bts_interrupt(); - } - status = intel_pmu_get_status(); - if (!status) - goto done; -again: - intel_pmu_lbr_read(); - if (++loops > 100) { - static bool warned; - - if (!warned) { - WARN(1, "perfevents: irq loop stuck!\n"); - perf_event_print_debug(); - warned = true; - } - intel_pmu_reset(); - goto done; - } - - - handled += handle_pmi_common(regs, status); -done: - /* Ack the PMI in the APIC */ - apic_write(APIC_LVTPC, APIC_DM_NMI); - - /* - * The counters start counting immediately while ack the status. - * Make it as close as possible to IRET. This avoids bogus - * freezing on Skylake CPUs. - */ - if (status) { - intel_pmu_ack_status(status); - } else { - /* - * CPU may issues two PMIs very close to each other. - * When the PMI handler services the first one, the - * GLOBAL_STATUS is already updated to reflect both. - * When it IRETs, the second PMI is immediately - * handled and it sees clear status. At the meantime, - * there may be a third PMI, because the freezing bit - * isn't set since the ack in first PMI handlers. - * Double check if there is more work to be done. - */ - status = intel_pmu_get_status(); - if (status) - goto again; - } - - if (bts) - intel_bts_enable_local(); - cpuc->enabled = pmu_enabled; - return handled; -} - /* * This handler is triggered by the local APIC, so the APIC IRQ handling * rules apply: @@ -4074,9 +3973,6 @@ static void intel_pmu_cpu_starting(int cpu) if (x86_pmu.version > 1) flip_smm_bit(&x86_pmu.attr_freeze_on_smi); - if (x86_pmu.counter_freezing) - enable_counter_freeze(); - /* Disable perf metrics if any added CPU doesn't support it. */ if (x86_pmu.intel_cap.perf_metrics) { union perf_capabilities perf_cap; @@ -4147,9 +4043,6 @@ static void free_excl_cntrs(struct cpu_hw_events *cpuc) static void intel_pmu_cpu_dying(int cpu) { fini_debug_store_on_cpu(cpu); - - if (x86_pmu.counter_freezing) - disable_counter_freeze(); } void intel_cpuc_finish(struct cpu_hw_events *cpuc) @@ -4541,39 +4434,6 @@ static __init void intel_nehalem_quirk(void) } } -static const struct x86_cpu_desc counter_freezing_ucodes[] = { - INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT, 2, 0x0000000e), - INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT, 9, 0x0000002e), - INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT, 10, 0x00000008), - INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT_D, 1, 0x00000028), - INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT_PLUS, 1, 0x00000028), - INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT_PLUS, 8, 0x00000006), - {} -}; - -static bool intel_counter_freezing_broken(void) -{ - return !x86_cpu_has_min_microcode_rev(counter_freezing_ucodes); -} - -static __init void intel_counter_freezing_quirk(void) -{ - /* Check if it's already disabled */ - if (disable_counter_freezing) - return; - - /* - * If the system starts with the wrong ucode, leave the - * counter-freezing feature permanently disabled. - */ - if (intel_counter_freezing_broken()) { - pr_info("PMU counter freezing disabled due to CPU errata," - "please upgrade microcode\n"); - x86_pmu.counter_freezing = false; - x86_pmu.handle_irq = intel_pmu_handle_irq; - } -} - /* * enable software workaround for errata: * SNB: BJ122 @@ -4959,9 +4819,6 @@ __init int intel_pmu_init(void) max((int)edx.split.num_counters_fixed, assume); } - if (version >= 4) - x86_pmu.counter_freezing = !disable_counter_freezing; - if (boot_cpu_has(X86_FEATURE_PDCM)) { u64 capabilities; @@ -5089,7 +4946,6 @@ __init int intel_pmu_init(void) case INTEL_FAM6_ATOM_GOLDMONT: case INTEL_FAM6_ATOM_GOLDMONT_D: - x86_add_quirk(intel_counter_freezing_quirk); memcpy(hw_cache_event_ids, glm_hw_cache_event_ids, sizeof(hw_cache_event_ids)); memcpy(hw_cache_extra_regs, glm_hw_cache_extra_regs, @@ -5116,7 +4972,6 @@ __init int intel_pmu_init(void) break; case INTEL_FAM6_ATOM_GOLDMONT_PLUS: - x86_add_quirk(intel_counter_freezing_quirk); memcpy(hw_cache_event_ids, glp_hw_cache_event_ids, sizeof(hw_cache_event_ids)); memcpy(hw_cache_extra_regs, glp_hw_cache_extra_regs, @@ -5581,13 +5436,6 @@ __init int intel_pmu_init(void) pr_cont("full-width counters, "); } - /* - * For arch perfmon 4 use counter freezing to avoid - * several MSR accesses in the PMI. - */ - if (x86_pmu.counter_freezing) - x86_pmu.handle_irq = intel_pmu_handle_irq_v4; - if (x86_pmu.intel_cap.perf_metrics) x86_pmu.intel_ctrl |= 1ULL << GLOBAL_CTRL_EN_PERF_METRICS; diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 7895cf4c59a7..978a16e7a8d0 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -682,8 +682,7 @@ struct x86_pmu { /* PMI handler bits */ unsigned int late_ack :1, - enabled_ack :1, - counter_freezing :1; + enabled_ack :1; /* * sysfs attrs */ From 3f98a28cc37253269b4104cf95a51f7716a2eb97 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 28 Jan 2021 11:06:26 +0100 Subject: [PATCH 048/108] mm/nommu: Fix return type of filemap_map_pages() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If CONFIG_MMU is not set (e.g. m68k/m5272c3_defconfig): mm/nommu.c:1671:6: error: conflicting types for ‘filemap_map_pages’ 1671 | void filemap_map_pages(struct vm_fault *vmf, | ^~~~~~~~~~~~~~~~~ In file included from mm/nommu.c:20: ./include/linux/mm.h:2578:19: note: previous declaration of ‘filemap_map_pages’ was here 2578 | extern vm_fault_t filemap_map_pages(struct vm_fault *vmf, | ^~~~~~~~~~~~~~~~~ The signature of filemap_map_pages() was changed, but the nommu implementation wasn't updated. Reported-by: noreply@ellerman.id.au Fixes: f9ce0be71d1f ("mm: Cleanup faultaround and finish_fault() codepaths") Signed-off-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20210128100626.2257638-1-geert@linux-m68k.org Signed-off-by: Will Deacon --- mm/nommu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/nommu.c b/mm/nommu.c index 870fea12823e..5c9ab799c0e6 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1668,10 +1668,11 @@ vm_fault_t filemap_fault(struct vm_fault *vmf) } EXPORT_SYMBOL(filemap_fault); -void filemap_map_pages(struct vm_fault *vmf, +vm_fault_t filemap_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff) { BUG(); + return 0; } EXPORT_SYMBOL(filemap_map_pages); From e30be1455bd3b0626602f42725c49200b2b871b4 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Thu, 28 Jan 2021 17:38:50 +0000 Subject: [PATCH 049/108] KVM: arm64: Move __hyp_set_vectors out of .hyp.text The .hyp.text section is supposed to be reserved for the nVHE EL2 code. However, there is currently one occurrence of EL1 executing code located in .hyp.text when calling __hyp_{re}set_vectors(), which happen to sit next to the EL2 stub vectors. While not a problem yet, such patterns will cause issues when removing the host kernel from the TCB, so a cleaner split would be preferable. Fix this by delimiting the end of the .hyp.text section in hyp-stub.S. Acked-by: Marc Zyngier Signed-off-by: Quentin Perret Link: https://lore.kernel.org/r/20210128173850.2478161-1-qperret@google.com Signed-off-by: Will Deacon --- arch/arm64/kernel/hyp-stub.S | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/kernel/hyp-stub.S b/arch/arm64/kernel/hyp-stub.S index 160f5881a0b7..8a60f9c586bb 100644 --- a/arch/arm64/kernel/hyp-stub.S +++ b/arch/arm64/kernel/hyp-stub.S @@ -85,6 +85,8 @@ SYM_CODE_END(\label) invalid_vector el1_fiq_invalid invalid_vector el1_error_invalid + .popsection + /* * __hyp_set_vectors: Call this after boot to set the initial hypervisor * vectors as part of hypervisor installation. On an SMP system, this should From 79d7c3dca99fa96033695ddf5d495b775a3a137b Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Thu, 28 Jan 2021 13:12:43 +0000 Subject: [PATCH 050/108] perf/arm-cmn: Fix PMU instance naming Although it's neat to avoid the suffix for the typical case of a single PMU, it means systems with multiple CMN instances end up with inconsistent naming. I think it also breaks perf tool's "uncore alias" logic if the common instance prefix is also the full name of one. Avoid any surprises by not trying to be clever and simply numbering every instance, even when it might technically prove redundant. Fixes: 0ba64770a2f2 ("perf: Add Arm CMN-600 PMU driver") Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/649a2281233f193d59240b13ed91b57337c77b32.1611839564.git.robin.murphy@arm.com Signed-off-by: Will Deacon --- Documentation/admin-guide/perf/arm-cmn.rst | 2 +- drivers/perf/arm-cmn.c | 13 ++++--------- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/Documentation/admin-guide/perf/arm-cmn.rst b/Documentation/admin-guide/perf/arm-cmn.rst index 0e4809346014..796e25b7027b 100644 --- a/Documentation/admin-guide/perf/arm-cmn.rst +++ b/Documentation/admin-guide/perf/arm-cmn.rst @@ -17,7 +17,7 @@ PMU events ---------- The PMU driver registers a single PMU device for the whole interconnect, -see /sys/bus/event_source/devices/arm_cmn. Multi-chip systems may link +see /sys/bus/event_source/devices/arm_cmn_0. Multi-chip systems may link more than one CMN together via external CCIX links - in this situation, each mesh counts its own events entirely independently, and additional PMU devices will be named arm_cmn_{1..n}. diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c index f30fcd330899..6bdf13f65875 100644 --- a/drivers/perf/arm-cmn.c +++ b/drivers/perf/arm-cmn.c @@ -1502,7 +1502,7 @@ static int arm_cmn_probe(struct platform_device *pdev) struct arm_cmn *cmn; const char *name; static atomic_t id; - int err, rootnode, this_id; + int err, rootnode; cmn = devm_kzalloc(&pdev->dev, sizeof(*cmn), GFP_KERNEL); if (!cmn) @@ -1549,14 +1549,9 @@ static int arm_cmn_probe(struct platform_device *pdev) .cancel_txn = arm_cmn_end_txn, }; - this_id = atomic_fetch_inc(&id); - if (this_id == 0) { - name = "arm_cmn"; - } else { - name = devm_kasprintf(cmn->dev, GFP_KERNEL, "arm_cmn_%d", this_id); - if (!name) - return -ENOMEM; - } + name = devm_kasprintf(cmn->dev, GFP_KERNEL, "arm_cmn_%d", atomic_fetch_inc(&id)); + if (!name) + return -ENOMEM; err = cpuhp_state_add_instance(arm_cmn_hp_state, &cmn->cpuhp_node); if (err) From 1c8147ea89c883d1f4e20f1b1d9c879291430102 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Thu, 28 Jan 2021 13:12:44 +0000 Subject: [PATCH 051/108] perf/arm-cmn: Move IRQs when migrating context If we migrate the PMU context to another CPU, we need to remember to retarget the IRQs as well. Fixes: 0ba64770a2f2 ("perf: Add Arm CMN-600 PMU driver") Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/e080640aea4ed8dfa870b8549dfb31221803eb6b.1611839564.git.robin.murphy@arm.com Signed-off-by: Will Deacon --- drivers/perf/arm-cmn.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c index 6bdf13f65875..1328159fe564 100644 --- a/drivers/perf/arm-cmn.c +++ b/drivers/perf/arm-cmn.c @@ -1150,7 +1150,7 @@ static int arm_cmn_commit_txn(struct pmu *pmu) static int arm_cmn_pmu_offline_cpu(unsigned int cpu, struct hlist_node *node) { struct arm_cmn *cmn; - unsigned int target; + unsigned int i, target; cmn = hlist_entry_safe(node, struct arm_cmn, cpuhp_node); if (cpu != cmn->cpu) @@ -1161,6 +1161,8 @@ static int arm_cmn_pmu_offline_cpu(unsigned int cpu, struct hlist_node *node) return 0; perf_pmu_migrate_context(&cmn->pmu, cpu, target); + for (i = 0; i < cmn->num_dtcs; i++) + irq_set_affinity_hint(cmn->dtc[i].irq, cpumask_of(target)); cmn->cpu = target; return 0; } From 2a6c6b7d7ad346f0679d0963cb19b3f0ea7ef32c Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 28 Jan 2021 14:40:07 -0800 Subject: [PATCH 052/108] perf/core: Add PERF_SAMPLE_WEIGHT_STRUCT Current PERF_SAMPLE_WEIGHT sample type is very useful to expresses the cost of an action represented by the sample. This allows the profiler to scale the samples to be more informative to the programmer. It could also help to locate a hotspot, e.g., when profiling by memory latencies, the expensive load appear higher up in the histograms. But current PERF_SAMPLE_WEIGHT sample type is solely determined by one factor. This could be a problem, if users want two or more factors to contribute to the weight. For example, Golden Cove core PMU can provide both the instruction latency and the cache Latency information as factors for the memory profiling. For current X86 platforms, although meminfo::latency is defined as a u64, only the lower 32 bits include the valid data in practice (No memory access could last than 4G cycles). The higher 32 bits can be used to store new factors. Add a new sample type, PERF_SAMPLE_WEIGHT_STRUCT, to indicate the new sample weight structure. It shares the same space as the PERF_SAMPLE_WEIGHT sample type. Users can apply either the PERF_SAMPLE_WEIGHT sample type or the PERF_SAMPLE_WEIGHT_STRUCT sample type to retrieve the sample weight, but they cannot apply both sample types simultaneously. Currently, only X86 and PowerPC use the PERF_SAMPLE_WEIGHT sample type. - For PowerPC, there is nothing changed for the PERF_SAMPLE_WEIGHT sample type. There is no effect for the new PERF_SAMPLE_WEIGHT_STRUCT sample type. PowerPC can re-struct the weight field similarly later. - For X86, the same value will be dumped for the PERF_SAMPLE_WEIGHT sample type or the PERF_SAMPLE_WEIGHT_STRUCT sample type for now. The following patches will apply the new factors for the PERF_SAMPLE_WEIGHT_STRUCT sample type. The field in the union perf_sample_weight should be shared among different architectures. A generic name is required, but it's hard to abstract a name that applies to all architectures. For example, on X86, the fields are to store all kinds of latency. While on PowerPC, it stores MMCRA[TECX/TECM], which should not be latency. So a general name prefix 'var$NUM' is used here. Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1611873611-156687-2-git-send-email-kan.liang@linux.intel.com --- arch/powerpc/perf/core-book3s.c | 2 +- arch/x86/events/intel/ds.c | 17 ++++++------- include/linux/perf_event.h | 4 ++-- include/uapi/linux/perf_event.h | 42 +++++++++++++++++++++++++++++++-- kernel/events/core.c | 11 +++++---- 5 files changed, 59 insertions(+), 17 deletions(-) diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index 28206b1fe172..869d999a836e 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -2195,7 +2195,7 @@ static void record_and_restart(struct perf_event *event, unsigned long val, if (event->attr.sample_type & PERF_SAMPLE_WEIGHT && ppmu->get_mem_weight) - ppmu->get_mem_weight(&data.weight); + ppmu->get_mem_weight(&data.weight.full); if (perf_event_overflow(event, &data, regs)) power_pmu_stop(event, 0); diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 67dbc91bccfe..2f54b1fbb895 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -960,7 +960,8 @@ static void adaptive_pebs_record_size_update(void) } #define PERF_PEBS_MEMINFO_TYPE (PERF_SAMPLE_ADDR | PERF_SAMPLE_DATA_SRC | \ - PERF_SAMPLE_PHYS_ADDR | PERF_SAMPLE_WEIGHT | \ + PERF_SAMPLE_PHYS_ADDR | \ + PERF_SAMPLE_WEIGHT_TYPE | \ PERF_SAMPLE_TRANSACTION | \ PERF_SAMPLE_DATA_PAGE_SIZE) @@ -987,7 +988,7 @@ static u64 pebs_update_adaptive_cfg(struct perf_event *event) gprs = (sample_type & PERF_SAMPLE_REGS_INTR) && (attr->sample_regs_intr & PEBS_GP_REGS); - tsx_weight = (sample_type & PERF_SAMPLE_WEIGHT) && + tsx_weight = (sample_type & PERF_SAMPLE_WEIGHT_TYPE) && ((attr->config & INTEL_ARCH_EVENT_MASK) == x86_pmu.rtm_abort_event); @@ -1369,8 +1370,8 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event, /* * Use latency for weight (only avail with PEBS-LL) */ - if (fll && (sample_type & PERF_SAMPLE_WEIGHT)) - data->weight = pebs->lat; + if (fll && (sample_type & PERF_SAMPLE_WEIGHT_TYPE)) + data->weight.full = pebs->lat; /* * data.data_src encodes the data source @@ -1462,8 +1463,8 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event, if (x86_pmu.intel_cap.pebs_format >= 2) { /* Only set the TSX weight when no memory weight. */ - if ((sample_type & PERF_SAMPLE_WEIGHT) && !fll) - data->weight = intel_get_tsx_weight(pebs->tsx_tuning); + if ((sample_type & PERF_SAMPLE_WEIGHT_TYPE) && !fll) + data->weight.full = intel_get_tsx_weight(pebs->tsx_tuning); if (sample_type & PERF_SAMPLE_TRANSACTION) data->txn = intel_get_tsx_transaction(pebs->tsx_tuning, @@ -1577,8 +1578,8 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, } if (format_size & PEBS_DATACFG_MEMINFO) { - if (sample_type & PERF_SAMPLE_WEIGHT) - data->weight = meminfo->latency ?: + if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) + data->weight.full = meminfo->latency ?: intel_get_tsx_weight(meminfo->tsx_tuning); if (sample_type & PERF_SAMPLE_DATA_SRC) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 9a38f579bc76..fab42cfbd350 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -998,7 +998,7 @@ struct perf_sample_data { struct perf_raw_record *raw; struct perf_branch_stack *br_stack; u64 period; - u64 weight; + union perf_sample_weight weight; u64 txn; union perf_mem_data_src data_src; @@ -1047,7 +1047,7 @@ static inline void perf_sample_data_init(struct perf_sample_data *data, data->raw = NULL; data->br_stack = NULL; data->period = period; - data->weight = 0; + data->weight.full = 0; data->data_src.val = PERF_MEM_NA; data->txn = 0; } diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index b15e3447cd9f..b2cc246ec119 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -145,12 +145,14 @@ enum perf_event_sample_format { PERF_SAMPLE_CGROUP = 1U << 21, PERF_SAMPLE_DATA_PAGE_SIZE = 1U << 22, PERF_SAMPLE_CODE_PAGE_SIZE = 1U << 23, + PERF_SAMPLE_WEIGHT_STRUCT = 1U << 24, - PERF_SAMPLE_MAX = 1U << 24, /* non-ABI */ + PERF_SAMPLE_MAX = 1U << 25, /* non-ABI */ __PERF_SAMPLE_CALLCHAIN_EARLY = 1ULL << 63, /* non-ABI; internal use */ }; +#define PERF_SAMPLE_WEIGHT_TYPE (PERF_SAMPLE_WEIGHT | PERF_SAMPLE_WEIGHT_STRUCT) /* * values to program into branch_sample_type when PERF_SAMPLE_BRANCH is set * @@ -890,7 +892,24 @@ enum perf_event_type { * char data[size]; * u64 dyn_size; } && PERF_SAMPLE_STACK_USER * - * { u64 weight; } && PERF_SAMPLE_WEIGHT + * { union perf_sample_weight + * { + * u64 full; && PERF_SAMPLE_WEIGHT + * #if defined(__LITTLE_ENDIAN_BITFIELD) + * struct { + * u32 var1_dw; + * u16 var2_w; + * u16 var3_w; + * } && PERF_SAMPLE_WEIGHT_STRUCT + * #elif defined(__BIG_ENDIAN_BITFIELD) + * struct { + * u16 var3_w; + * u16 var2_w; + * u32 var1_dw; + * } && PERF_SAMPLE_WEIGHT_STRUCT + * #endif + * } + * } * { u64 data_src; } && PERF_SAMPLE_DATA_SRC * { u64 transaction; } && PERF_SAMPLE_TRANSACTION * { u64 abi; # enum perf_sample_regs_abi @@ -1248,4 +1267,23 @@ struct perf_branch_entry { reserved:40; }; +union perf_sample_weight { + __u64 full; +#if defined(__LITTLE_ENDIAN_BITFIELD) + struct { + __u32 var1_dw; + __u16 var2_w; + __u16 var3_w; + }; +#elif defined(__BIG_ENDIAN_BITFIELD) + struct { + __u16 var3_w; + __u16 var2_w; + __u32 var1_dw; + }; +#else +#error "Unknown endianness" +#endif +}; + #endif /* _UAPI_LINUX_PERF_EVENT_H */ diff --git a/kernel/events/core.c b/kernel/events/core.c index 55d18791a72d..5206097d4d3d 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1879,8 +1879,8 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type) if (sample_type & PERF_SAMPLE_PERIOD) size += sizeof(data->period); - if (sample_type & PERF_SAMPLE_WEIGHT) - size += sizeof(data->weight); + if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) + size += sizeof(data->weight.full); if (sample_type & PERF_SAMPLE_READ) size += event->read_size; @@ -6907,8 +6907,8 @@ void perf_output_sample(struct perf_output_handle *handle, data->regs_user.regs); } - if (sample_type & PERF_SAMPLE_WEIGHT) - perf_output_put(handle, data->weight); + if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) + perf_output_put(handle, data->weight.full); if (sample_type & PERF_SAMPLE_DATA_SRC) perf_output_put(handle, data->data_src.val); @@ -11564,6 +11564,9 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, if (attr->sample_type & PERF_SAMPLE_CGROUP) return -EINVAL; #endif + if ((attr->sample_type & PERF_SAMPLE_WEIGHT) && + (attr->sample_type & PERF_SAMPLE_WEIGHT_STRUCT)) + return -EINVAL; out: return ret; From 628d923a3c464db98c1c98bb1e0cd50804caf681 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 28 Jan 2021 14:40:08 -0800 Subject: [PATCH 053/108] perf/x86/intel: Factor out intel_update_topdown_event() Similar to Ice Lake, Intel Sapphire Rapids server also supports the topdown performance metrics feature. The difference is that Intel Sapphire Rapids server extends the PERF_METRICS MSR to feature TMA method level two metrics, which will introduce 8 metrics events. Current icl_update_topdown_event() only check 4 level one metrics events. Factor out intel_update_topdown_event() to facilitate the code sharing between Ice Lake and Sapphire Rapids. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1611873611-156687-3-git-send-email-kan.liang@linux.intel.com --- arch/x86/events/intel/core.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index fe940082d49a..d07408da32fb 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2325,8 +2325,8 @@ static void __icl_update_topdown_event(struct perf_event *event, } } -static void update_saved_topdown_regs(struct perf_event *event, - u64 slots, u64 metrics) +static void update_saved_topdown_regs(struct perf_event *event, u64 slots, + u64 metrics, int metric_end) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct perf_event *other; @@ -2335,7 +2335,7 @@ static void update_saved_topdown_regs(struct perf_event *event, event->hw.saved_slots = slots; event->hw.saved_metric = metrics; - for_each_set_bit(idx, cpuc->active_mask, INTEL_PMC_IDX_TD_BE_BOUND + 1) { + for_each_set_bit(idx, cpuc->active_mask, metric_end + 1) { if (!is_topdown_idx(idx)) continue; other = cpuc->events[idx]; @@ -2350,7 +2350,8 @@ static void update_saved_topdown_regs(struct perf_event *event, * The PERF_METRICS and Fixed counter 3 are read separately. The values may be * modify by a NMI. PMU has to be disabled before calling this function. */ -static u64 icl_update_topdown_event(struct perf_event *event) + +static u64 intel_update_topdown_event(struct perf_event *event, int metric_end) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct perf_event *other; @@ -2366,7 +2367,7 @@ static u64 icl_update_topdown_event(struct perf_event *event) /* read PERF_METRICS */ rdpmcl(INTEL_PMC_FIXED_RDPMC_METRICS, metrics); - for_each_set_bit(idx, cpuc->active_mask, INTEL_PMC_IDX_TD_BE_BOUND + 1) { + for_each_set_bit(idx, cpuc->active_mask, metric_end + 1) { if (!is_topdown_idx(idx)) continue; other = cpuc->events[idx]; @@ -2392,7 +2393,7 @@ static u64 icl_update_topdown_event(struct perf_event *event) * Don't need to reset the PERF_METRICS and Fixed counter 3. * Because the values will be restored in next schedule in. */ - update_saved_topdown_regs(event, slots, metrics); + update_saved_topdown_regs(event, slots, metrics, metric_end); reset = false; } @@ -2401,12 +2402,17 @@ static u64 icl_update_topdown_event(struct perf_event *event) wrmsrl(MSR_CORE_PERF_FIXED_CTR3, 0); wrmsrl(MSR_PERF_METRICS, 0); if (event) - update_saved_topdown_regs(event, 0, 0); + update_saved_topdown_regs(event, 0, 0, metric_end); } return slots; } +static u64 icl_update_topdown_event(struct perf_event *event) +{ + return intel_update_topdown_event(event, INTEL_PMC_IDX_TD_BE_BOUND); +} + static void intel_pmu_read_topdown_event(struct perf_event *event) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); From 1ab5f235c176e93adc4f75000aae6c50fea9db00 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 28 Jan 2021 14:40:09 -0800 Subject: [PATCH 054/108] perf/x86/intel: Filter unsupported Topdown metrics event Intel Sapphire Rapids server will introduce 8 metrics events. Intel Ice Lake only supports 4 metrics events. A perf tool user may mistakenly use the unsupported events via RAW format on Ice Lake. The user can still get a value from the unsupported Topdown metrics event once the following Sapphire Rapids enabling patch is applied. To enable the 8 metrics events on Intel Sapphire Rapids, the INTEL_TD_METRIC_MAX has to be updated, which impacts the is_metric_event(). The is_metric_event() is a generic function. On Ice Lake, the newly added SPR metrics events will be mistakenly accepted as metric events on creation. At runtime, the unsupported Topdown metrics events will be updated. Add a variable num_topdown_events in x86_pmu to indicate the available number of the Topdown metrics event on the platform. Apply the number into is_metric_event(). Only the supported Topdown metrics events should be created as metrics events. Apply the num_topdown_events in icl_update_topdown_event() as well. The function can be reused by the following patch. Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1611873611-156687-4-git-send-email-kan.liang@linux.intel.com --- arch/x86/events/intel/core.c | 15 +++++++++++++-- arch/x86/events/perf_event.h | 1 + arch/x86/include/asm/perf_event.h | 10 ++++++++-- 3 files changed, 22 insertions(+), 4 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index d07408da32fb..37830ac34ae1 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2410,7 +2410,8 @@ static u64 intel_update_topdown_event(struct perf_event *event, int metric_end) static u64 icl_update_topdown_event(struct perf_event *event) { - return intel_update_topdown_event(event, INTEL_PMC_IDX_TD_BE_BOUND); + return intel_update_topdown_event(event, INTEL_PMC_IDX_METRIC_BASE + + x86_pmu.num_topdown_events - 1); } static void intel_pmu_read_topdown_event(struct perf_event *event) @@ -3468,6 +3469,15 @@ static int core_pmu_hw_config(struct perf_event *event) return intel_pmu_bts_config(event); } +#define INTEL_TD_METRIC_AVAILABLE_MAX (INTEL_TD_METRIC_RETIRING + \ + ((x86_pmu.num_topdown_events - 1) << 8)) + +static bool is_available_metric_event(struct perf_event *event) +{ + return is_metric_event(event) && + event->attr.config <= INTEL_TD_METRIC_AVAILABLE_MAX; +} + static int intel_pmu_hw_config(struct perf_event *event) { int ret = x86_pmu_hw_config(event); @@ -3541,7 +3551,7 @@ static int intel_pmu_hw_config(struct perf_event *event) if (event->attr.config & X86_ALL_EVENT_FLAGS) return -EINVAL; - if (is_metric_event(event)) { + if (is_available_metric_event(event)) { struct perf_event *leader = event->group_leader; /* The metric events don't support sampling. */ @@ -5324,6 +5334,7 @@ __init int intel_pmu_init(void) x86_pmu.rtm_abort_event = X86_CONFIG(.event=0xc9, .umask=0x04); x86_pmu.lbr_pt_coexist = true; intel_pmu_pebs_data_source_skl(pmem); + x86_pmu.num_topdown_events = 4; x86_pmu.update_topdown_event = icl_update_topdown_event; x86_pmu.set_topdown_event_period = icl_set_topdown_event_period; pr_cont("Icelake events, "); diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 978a16e7a8d0..15343cc25d47 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -775,6 +775,7 @@ struct x86_pmu { /* * Intel perf metrics */ + int num_topdown_events; u64 (*update_topdown_event)(struct perf_event *event); int (*set_topdown_event_period)(struct perf_event *event); diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index e2a4c785e4e3..7c2c3023532a 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -280,8 +280,14 @@ struct x86_pmu_capability { #define INTEL_TD_METRIC_BAD_SPEC 0x8100 /* Bad speculation metric */ #define INTEL_TD_METRIC_FE_BOUND 0x8200 /* FE bound metric */ #define INTEL_TD_METRIC_BE_BOUND 0x8300 /* BE bound metric */ -#define INTEL_TD_METRIC_MAX INTEL_TD_METRIC_BE_BOUND -#define INTEL_TD_METRIC_NUM 4 +/* Level 2 metrics */ +#define INTEL_TD_METRIC_HEAVY_OPS 0x8400 /* Heavy Operations metric */ +#define INTEL_TD_METRIC_BR_MISPREDICT 0x8500 /* Branch Mispredict metric */ +#define INTEL_TD_METRIC_FETCH_LAT 0x8600 /* Fetch Latency metric */ +#define INTEL_TD_METRIC_MEM_BOUND 0x8700 /* Memory bound metric */ + +#define INTEL_TD_METRIC_MAX INTEL_TD_METRIC_MEM_BOUND +#define INTEL_TD_METRIC_NUM 8 static inline bool is_metric_idx(int idx) { From 61b985e3e775a3a75fda04ce7ef1b1aefc4758bc Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 28 Jan 2021 14:40:10 -0800 Subject: [PATCH 055/108] perf/x86/intel: Add perf core PMU support for Sapphire Rapids Add perf core PMU support for the Intel Sapphire Rapids server, which is the successor of the Intel Ice Lake server. The enabling code is based on Ice Lake, but there are several new features introduced. The event encoding is changed and simplified, e.g., the event codes which are below 0x90 are restricted to counters 0-3. The event codes which above 0x90 are likely to have no restrictions. The event constraints, extra_regs(), and hardware cache events table are changed accordingly. A new Precise Distribution (PDist) facility is introduced, which further minimizes the skid when a precise event is programmed on the GP counter 0. Enable the Precise Distribution (PDist) facility with :ppp event. For this facility to work, the period must be initialized with a value larger than 127. Add spr_limit_period() to apply the limit for :ppp event. Two new data source fields, data block & address block, are added in the PEBS Memory Info Record for the load latency event. To enable the feature, - An auxiliary event has to be enabled together with the load latency event on Sapphire Rapids. A new flag PMU_FL_MEM_LOADS_AUX is introduced to indicate the case. A new event, mem-loads-aux, is exposed to sysfs for the user tool. Add a check in hw_config(). If the auxiliary event is not detected, return an unique error -ENODATA. - The union perf_mem_data_src is extended to support the new fields. - Ice Lake and earlier models do not support block information, but the fields may be set by HW on some machines. Add pebs_no_block to explicitly indicate the previous platforms which don't support the new block fields. Accessing the new block fields are ignored on those platforms. A new store Latency facility is introduced, which leverages the PEBS facility where it can provide additional information about sampled stores. The additional information includes the data address, memory auxiliary info (e.g. Data Source, STLB miss) and the latency of the store access. To enable the facility, the new event (0x02cd) has to be programed on the GP counter 0. A new flag PERF_X86_EVENT_PEBS_STLAT is introduced to indicate the event. The store_latency_data() is introduced to parse the memory auxiliary info. The layout of access latency field of PEBS Memory Info Record has been changed. Two latency, instruction latency (bit 15:0) and cache access latency (bit 47:32) are recorded. - The cache access latency is similar to previous memory access latency. For loads, the latency starts by the actual cache access until the data is returned by the memory subsystem. For stores, the latency starts when the demand write accesses the L1 data cache and lasts until the cacheline write is completed in the memory subsystem. The cache access latency is stored in low 32bits of the sample type PERF_SAMPLE_WEIGHT_STRUCT. - The instruction latency starts by the dispatch of the load operation for execution and lasts until completion of the instruction it belongs to. Add a new flag PMU_FL_INSTR_LATENCY to indicate the instruction latency support. The instruction latency is stored in the bit 47:32 of the sample type PERF_SAMPLE_WEIGHT_STRUCT. Extends the PERF_METRICS MSR to feature TMA method level 2 metrics. The lower half of the register is the TMA level 1 metrics (legacy). The upper half is also divided into four 8-bit fields for the new level 2 metrics. Expose all eight Topdown metrics events to user space. The full description for the SPR features can be found at Intel Architecture Instruction Set Extensions and Future Features Programming Reference, 319433-041. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1611873611-156687-5-git-send-email-kan.liang@linux.intel.com --- arch/x86/events/intel/core.c | 307 +++++++++++++++++++++++++++++- arch/x86/events/intel/ds.c | 118 +++++++++++- arch/x86/events/perf_event.h | 12 +- arch/x86/include/asm/perf_event.h | 8 +- include/uapi/linux/perf_event.h | 12 +- 5 files changed, 443 insertions(+), 14 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 37830ac34ae1..58cd64e1fc4d 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -275,6 +275,55 @@ static struct extra_reg intel_icl_extra_regs[] __read_mostly = { EVENT_EXTRA_END }; +static struct extra_reg intel_spr_extra_regs[] __read_mostly = { + INTEL_UEVENT_EXTRA_REG(0x012a, MSR_OFFCORE_RSP_0, 0x3fffffffffull, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x012b, MSR_OFFCORE_RSP_1, 0x3fffffffffull, RSP_1), + INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd), + INTEL_UEVENT_EXTRA_REG(0x01c6, MSR_PEBS_FRONTEND, 0x7fff17, FE), + EVENT_EXTRA_END +}; + +static struct event_constraint intel_spr_event_constraints[] = { + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x01c0, 0), /* INST_RETIRED.PREC_DIST */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ + FIXED_EVENT_CONSTRAINT(0x0400, 3), /* SLOTS */ + METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_RETIRING, 0), + METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_BAD_SPEC, 1), + METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_FE_BOUND, 2), + METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_BE_BOUND, 3), + METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_HEAVY_OPS, 4), + METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_BR_MISPREDICT, 5), + METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_FETCH_LAT, 6), + METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_MEM_BOUND, 7), + + INTEL_EVENT_CONSTRAINT(0x2e, 0xff), + INTEL_EVENT_CONSTRAINT(0x3c, 0xff), + /* + * Generally event codes < 0x90 are restricted to counters 0-3. + * The 0x2E and 0x3C are exception, which has no restriction. + */ + INTEL_EVENT_CONSTRAINT_RANGE(0x01, 0x8f, 0xf), + + INTEL_UEVENT_CONSTRAINT(0x01a3, 0xf), + INTEL_UEVENT_CONSTRAINT(0x02a3, 0xf), + INTEL_UEVENT_CONSTRAINT(0x08a3, 0xf), + INTEL_UEVENT_CONSTRAINT(0x04a4, 0x1), + INTEL_UEVENT_CONSTRAINT(0x08a4, 0x1), + INTEL_UEVENT_CONSTRAINT(0x02cd, 0x1), + INTEL_EVENT_CONSTRAINT(0xce, 0x1), + INTEL_EVENT_CONSTRAINT_RANGE(0xd0, 0xdf, 0xf), + /* + * Generally event codes >= 0x90 are likely to have no restrictions. + * The exception are defined as above. + */ + INTEL_EVENT_CONSTRAINT_RANGE(0x90, 0xfe, 0xff), + + EVENT_CONSTRAINT_END +}; + + EVENT_ATTR_STR(mem-loads, mem_ld_nhm, "event=0x0b,umask=0x10,ldlat=3"); EVENT_ATTR_STR(mem-loads, mem_ld_snb, "event=0xcd,umask=0x1,ldlat=3"); EVENT_ATTR_STR(mem-stores, mem_st_snb, "event=0xcd,umask=0x2"); @@ -314,11 +363,15 @@ EVENT_ATTR_STR_HT(topdown-recovery-bubbles, td_recovery_bubbles, EVENT_ATTR_STR_HT(topdown-recovery-bubbles.scale, td_recovery_bubbles_scale, "4", "2"); -EVENT_ATTR_STR(slots, slots, "event=0x00,umask=0x4"); -EVENT_ATTR_STR(topdown-retiring, td_retiring, "event=0x00,umask=0x80"); -EVENT_ATTR_STR(topdown-bad-spec, td_bad_spec, "event=0x00,umask=0x81"); -EVENT_ATTR_STR(topdown-fe-bound, td_fe_bound, "event=0x00,umask=0x82"); -EVENT_ATTR_STR(topdown-be-bound, td_be_bound, "event=0x00,umask=0x83"); +EVENT_ATTR_STR(slots, slots, "event=0x00,umask=0x4"); +EVENT_ATTR_STR(topdown-retiring, td_retiring, "event=0x00,umask=0x80"); +EVENT_ATTR_STR(topdown-bad-spec, td_bad_spec, "event=0x00,umask=0x81"); +EVENT_ATTR_STR(topdown-fe-bound, td_fe_bound, "event=0x00,umask=0x82"); +EVENT_ATTR_STR(topdown-be-bound, td_be_bound, "event=0x00,umask=0x83"); +EVENT_ATTR_STR(topdown-heavy-ops, td_heavy_ops, "event=0x00,umask=0x84"); +EVENT_ATTR_STR(topdown-br-mispredict, td_br_mispredict, "event=0x00,umask=0x85"); +EVENT_ATTR_STR(topdown-fetch-lat, td_fetch_lat, "event=0x00,umask=0x86"); +EVENT_ATTR_STR(topdown-mem-bound, td_mem_bound, "event=0x00,umask=0x87"); static struct attribute *snb_events_attrs[] = { EVENT_PTR(td_slots_issued), @@ -384,6 +437,108 @@ static u64 intel_pmu_event_map(int hw_event) return intel_perfmon_event_map[hw_event]; } +static __initconst const u64 spr_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x81d0, + [ C(RESULT_MISS) ] = 0xe124, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x82d0, + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_MISS) ] = 0xe424, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x12a, + [ C(RESULT_MISS) ] = 0x12a, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x12a, + [ C(RESULT_MISS) ] = 0x12a, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x81d0, + [ C(RESULT_MISS) ] = 0xe12, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x82d0, + [ C(RESULT_MISS) ] = 0xe13, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = 0xe11, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x4c4, + [ C(RESULT_MISS) ] = 0x4c5, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x12a, + [ C(RESULT_MISS) ] = 0x12a, + }, + }, +}; + +static __initconst const u64 spr_hw_cache_extra_regs + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x10001, + [ C(RESULT_MISS) ] = 0x3fbfc00001, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x3f3ffc0002, + [ C(RESULT_MISS) ] = 0x3f3fc00002, + }, + }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x10c000001, + [ C(RESULT_MISS) ] = 0x3fb3000001, + }, + }, +}; + /* * Notes on the events: * - data reads do not include code reads (comparable to earlier tables) @@ -3478,6 +3633,17 @@ static bool is_available_metric_event(struct perf_event *event) event->attr.config <= INTEL_TD_METRIC_AVAILABLE_MAX; } +static inline bool is_mem_loads_event(struct perf_event *event) +{ + return (event->attr.config & INTEL_ARCH_EVENT_MASK) == X86_CONFIG(.event=0xcd, .umask=0x01); +} + +static inline bool is_mem_loads_aux_event(struct perf_event *event) +{ + return (event->attr.config & INTEL_ARCH_EVENT_MASK) == X86_CONFIG(.event=0x03, .umask=0x82); +} + + static int intel_pmu_hw_config(struct perf_event *event) { int ret = x86_pmu_hw_config(event); @@ -3580,6 +3746,33 @@ static int intel_pmu_hw_config(struct perf_event *event) } } + /* + * The load latency event X86_CONFIG(.event=0xcd, .umask=0x01) on SPR + * doesn't function quite right. As a work-around it needs to always be + * co-scheduled with a auxiliary event X86_CONFIG(.event=0x03, .umask=0x82). + * The actual count of this second event is irrelevant it just needs + * to be active to make the first event function correctly. + * + * In a group, the auxiliary event must be in front of the load latency + * event. The rule is to simplify the implementation of the check. + * That's because perf cannot have a complete group at the moment. + */ + if (x86_pmu.flags & PMU_FL_MEM_LOADS_AUX && + (event->attr.sample_type & PERF_SAMPLE_DATA_SRC) && + is_mem_loads_event(event)) { + struct perf_event *leader = event->group_leader; + struct perf_event *sibling = NULL; + + if (!is_mem_loads_aux_event(leader)) { + for_each_sibling_event(sibling, leader) { + if (is_mem_loads_aux_event(sibling)) + break; + } + if (list_entry_is_head(sibling, &leader->sibling_list, sibling_list)) + return -ENODATA; + } + } + if (!(event->attr.config & ARCH_PERFMON_EVENTSEL_ANY)) return 0; @@ -3759,6 +3952,29 @@ icl_get_event_constraints(struct cpu_hw_events *cpuc, int idx, return hsw_get_event_constraints(cpuc, idx, event); } +static struct event_constraint * +spr_get_event_constraints(struct cpu_hw_events *cpuc, int idx, + struct perf_event *event) +{ + struct event_constraint *c; + + c = icl_get_event_constraints(cpuc, idx, event); + + /* + * The :ppp indicates the Precise Distribution (PDist) facility, which + * is only supported on the GP counter 0. If a :ppp event which is not + * available on the GP counter 0, error out. + */ + if (event->attr.precise_ip == 3) { + if (c->idxmsk64 & BIT_ULL(0)) + return &counter0_constraint; + + return &emptyconstraint; + } + + return c; +} + static struct event_constraint * glp_get_event_constraints(struct cpu_hw_events *cpuc, int idx, struct perf_event *event) @@ -3848,6 +4064,14 @@ static u64 nhm_limit_period(struct perf_event *event, u64 left) return max(left, 32ULL); } +static u64 spr_limit_period(struct perf_event *event, u64 left) +{ + if (event->attr.precise_ip == 3) + return max(left, 128ULL); + + return left; +} + PMU_FORMAT_ATTR(event, "config:0-7" ); PMU_FORMAT_ATTR(umask, "config:8-15" ); PMU_FORMAT_ATTR(edge, "config:18" ); @@ -4559,6 +4783,42 @@ static struct attribute *icl_tsx_events_attrs[] = { NULL, }; + +EVENT_ATTR_STR(mem-stores, mem_st_spr, "event=0xcd,umask=0x2"); +EVENT_ATTR_STR(mem-loads-aux, mem_ld_aux, "event=0x03,umask=0x82"); + +static struct attribute *spr_events_attrs[] = { + EVENT_PTR(mem_ld_hsw), + EVENT_PTR(mem_st_spr), + EVENT_PTR(mem_ld_aux), + NULL, +}; + +static struct attribute *spr_td_events_attrs[] = { + EVENT_PTR(slots), + EVENT_PTR(td_retiring), + EVENT_PTR(td_bad_spec), + EVENT_PTR(td_fe_bound), + EVENT_PTR(td_be_bound), + EVENT_PTR(td_heavy_ops), + EVENT_PTR(td_br_mispredict), + EVENT_PTR(td_fetch_lat), + EVENT_PTR(td_mem_bound), + NULL, +}; + +static struct attribute *spr_tsx_events_attrs[] = { + EVENT_PTR(tx_start), + EVENT_PTR(tx_abort), + EVENT_PTR(tx_commit), + EVENT_PTR(tx_capacity_read), + EVENT_PTR(tx_capacity_write), + EVENT_PTR(tx_conflict), + EVENT_PTR(cycles_t), + EVENT_PTR(cycles_ct), + NULL, +}; + static ssize_t freeze_on_smi_show(struct device *cdev, struct device_attribute *attr, char *buf) @@ -5341,6 +5601,43 @@ __init int intel_pmu_init(void) name = "icelake"; break; + case INTEL_FAM6_SAPPHIRERAPIDS_X: + pmem = true; + x86_pmu.late_ack = true; + memcpy(hw_cache_event_ids, spr_hw_cache_event_ids, sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, spr_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); + + x86_pmu.event_constraints = intel_spr_event_constraints; + x86_pmu.pebs_constraints = intel_spr_pebs_event_constraints; + x86_pmu.extra_regs = intel_spr_extra_regs; + x86_pmu.limit_period = spr_limit_period; + x86_pmu.pebs_aliases = NULL; + x86_pmu.pebs_prec_dist = true; + x86_pmu.pebs_block = true; + x86_pmu.flags |= PMU_FL_HAS_RSP_1; + x86_pmu.flags |= PMU_FL_NO_HT_SHARING; + x86_pmu.flags |= PMU_FL_PEBS_ALL; + x86_pmu.flags |= PMU_FL_INSTR_LATENCY; + x86_pmu.flags |= PMU_FL_MEM_LOADS_AUX; + + x86_pmu.hw_config = hsw_hw_config; + x86_pmu.get_event_constraints = spr_get_event_constraints; + extra_attr = boot_cpu_has(X86_FEATURE_RTM) ? + hsw_format_attr : nhm_format_attr; + extra_skl_attr = skl_format_attr; + mem_attr = spr_events_attrs; + td_attr = spr_td_events_attrs; + tsx_attr = spr_tsx_events_attrs; + x86_pmu.rtm_abort_event = X86_CONFIG(.event=0xc9, .umask=0x04); + x86_pmu.lbr_pt_coexist = true; + intel_pmu_pebs_data_source_skl(pmem); + x86_pmu.num_topdown_events = 8; + x86_pmu.update_topdown_event = icl_update_topdown_event; + x86_pmu.set_topdown_event_period = icl_set_topdown_event_period; + pr_cont("Sapphire Rapids events, "); + name = "sapphire_rapids"; + break; + default: switch (x86_pmu.version) { case 1: diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 2f54b1fbb895..7ebae1826403 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -36,7 +36,9 @@ union intel_x86_pebs_dse { unsigned int ld_dse:4; unsigned int ld_stlb_miss:1; unsigned int ld_locked:1; - unsigned int ld_reserved:26; + unsigned int ld_data_blk:1; + unsigned int ld_addr_blk:1; + unsigned int ld_reserved:24; }; struct { unsigned int st_l1d_hit:1; @@ -45,6 +47,12 @@ union intel_x86_pebs_dse { unsigned int st_locked:1; unsigned int st_reserved2:26; }; + struct { + unsigned int st_lat_dse:4; + unsigned int st_lat_stlb_miss:1; + unsigned int st_lat_locked:1; + unsigned int ld_reserved3:26; + }; }; @@ -198,6 +206,63 @@ static u64 load_latency_data(u64 status) if (dse.ld_locked) val |= P(LOCK, LOCKED); + /* + * Ice Lake and earlier models do not support block infos. + */ + if (!x86_pmu.pebs_block) { + val |= P(BLK, NA); + return val; + } + /* + * bit 6: load was blocked since its data could not be forwarded + * from a preceding store + */ + if (dse.ld_data_blk) + val |= P(BLK, DATA); + + /* + * bit 7: load was blocked due to potential address conflict with + * a preceding store + */ + if (dse.ld_addr_blk) + val |= P(BLK, ADDR); + + if (!dse.ld_data_blk && !dse.ld_addr_blk) + val |= P(BLK, NA); + + return val; +} + +static u64 store_latency_data(u64 status) +{ + union intel_x86_pebs_dse dse; + u64 val; + + dse.val = status; + + /* + * use the mapping table for bit 0-3 + */ + val = pebs_data_source[dse.st_lat_dse]; + + /* + * bit 4: TLB access + * 0 = did not miss 2nd level TLB + * 1 = missed 2nd level TLB + */ + if (dse.st_lat_stlb_miss) + val |= P(TLB, MISS) | P(TLB, L2); + else + val |= P(TLB, HIT) | P(TLB, L1) | P(TLB, L2); + + /* + * bit 5: locked prefix + */ + if (dse.st_lat_locked) + val |= P(LOCK, LOCKED); + + val |= P(BLK, NA); + return val; } @@ -870,6 +935,28 @@ struct event_constraint intel_icl_pebs_event_constraints[] = { EVENT_CONSTRAINT_END }; +struct event_constraint intel_spr_pebs_event_constraints[] = { + INTEL_FLAGS_UEVENT_CONSTRAINT(0x1c0, 0x100000000ULL), + INTEL_FLAGS_UEVENT_CONSTRAINT(0x0400, 0x800000000ULL), + + INTEL_FLAGS_EVENT_CONSTRAINT(0xc0, 0xfe), + INTEL_PLD_CONSTRAINT(0x1cd, 0xfe), + INTEL_PSD_CONSTRAINT(0x2cd, 0x1), + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x1d0, 0xf), + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x2d0, 0xf), + + INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD_RANGE(0xd1, 0xd4, 0xf), + + INTEL_FLAGS_EVENT_CONSTRAINT(0xd0, 0xf), + + /* + * Everything else is handled by PMU_FL_PEBS_ALL, because we + * need the full constraints from the main table. + */ + + EVENT_CONSTRAINT_END +}; + struct event_constraint *intel_pebs_constraints(struct perf_event *event) { struct event_constraint *c; @@ -1332,6 +1419,8 @@ static u64 get_data_src(struct perf_event *event, u64 aux) if (fl & PERF_X86_EVENT_PEBS_LDLAT) val = load_latency_data(aux); + else if (fl & PERF_X86_EVENT_PEBS_STLAT) + val = store_latency_data(aux); else if (fst && (fl & PERF_X86_EVENT_PEBS_HSW_PREC)) val = precise_datala_hsw(event, aux); else if (fst) @@ -1508,6 +1597,9 @@ static void adaptive_pebs_save_regs(struct pt_regs *regs, #endif } +#define PEBS_LATENCY_MASK 0xffff +#define PEBS_CACHE_LATENCY_OFFSET 32 + /* * With adaptive PEBS the layout depends on what fields are configured. */ @@ -1578,9 +1670,27 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, } if (format_size & PEBS_DATACFG_MEMINFO) { - if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) - data->weight.full = meminfo->latency ?: - intel_get_tsx_weight(meminfo->tsx_tuning); + if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) { + u64 weight = meminfo->latency; + + if (x86_pmu.flags & PMU_FL_INSTR_LATENCY) { + data->weight.var2_w = weight & PEBS_LATENCY_MASK; + weight >>= PEBS_CACHE_LATENCY_OFFSET; + } + + /* + * Although meminfo::latency is defined as a u64, + * only the lower 32 bits include the valid data + * in practice on Ice Lake and earlier platforms. + */ + if (sample_type & PERF_SAMPLE_WEIGHT) { + data->weight.full = weight ?: + intel_get_tsx_weight(meminfo->tsx_tuning); + } else { + data->weight.var1_dw = (u32)(weight & PEBS_LATENCY_MASK) ?: + intel_get_tsx_weight(meminfo->tsx_tuning); + } + } if (sample_type & PERF_SAMPLE_DATA_SRC) data->data_src.val = get_data_src(event, meminfo->aux); diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 15343cc25d47..b8bf2ce23913 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -80,6 +80,7 @@ static inline bool constraint_match(struct event_constraint *c, u64 ecode) #define PERF_X86_EVENT_PAIR 0x1000 /* Large Increment per Cycle */ #define PERF_X86_EVENT_LBR_SELECT 0x2000 /* Save/Restore MSR_LBR_SELECT */ #define PERF_X86_EVENT_TOPDOWN 0x4000 /* Count Topdown slots/metrics events */ +#define PERF_X86_EVENT_PEBS_STLAT 0x8000 /* st+stlat data address sampling */ static inline bool is_topdown_count(struct perf_event *event) { @@ -443,6 +444,10 @@ struct cpu_hw_events { __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LDLAT) +#define INTEL_PSD_CONSTRAINT(c, n) \ + __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ + HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_STLAT) + #define INTEL_PST_CONSTRAINT(c, n) \ __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST) @@ -723,7 +728,8 @@ struct x86_pmu { pebs_broken :1, pebs_prec_dist :1, pebs_no_tlb :1, - pebs_no_isolation :1; + pebs_no_isolation :1, + pebs_block :1; int pebs_record_size; int pebs_buffer_size; int max_pebs_events; @@ -871,6 +877,8 @@ do { \ #define PMU_FL_PEBS_ALL 0x10 /* all events are valid PEBS events */ #define PMU_FL_TFA 0x20 /* deal with TSX force abort */ #define PMU_FL_PAIR 0x40 /* merge counters for large incr. events */ +#define PMU_FL_INSTR_LATENCY 0x80 /* Support Instruction Latency in PEBS Memory Info Record */ +#define PMU_FL_MEM_LOADS_AUX 0x100 /* Require an auxiliary event for the complete memory info */ #define EVENT_VAR(_id) event_attr_##_id #define EVENT_PTR(_id) &event_attr_##_id.attr.attr @@ -1157,6 +1165,8 @@ extern struct event_constraint intel_skl_pebs_event_constraints[]; extern struct event_constraint intel_icl_pebs_event_constraints[]; +extern struct event_constraint intel_spr_pebs_event_constraints[]; + struct event_constraint *intel_pebs_constraints(struct perf_event *event); void intel_pmu_pebs_add(struct perf_event *event); diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 7c2c3023532a..544f41a179fb 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -261,8 +261,12 @@ struct x86_pmu_capability { #define INTEL_PMC_IDX_TD_BAD_SPEC (INTEL_PMC_IDX_METRIC_BASE + 1) #define INTEL_PMC_IDX_TD_FE_BOUND (INTEL_PMC_IDX_METRIC_BASE + 2) #define INTEL_PMC_IDX_TD_BE_BOUND (INTEL_PMC_IDX_METRIC_BASE + 3) -#define INTEL_PMC_IDX_METRIC_END INTEL_PMC_IDX_TD_BE_BOUND -#define INTEL_PMC_MSK_TOPDOWN ((0xfull << INTEL_PMC_IDX_METRIC_BASE) | \ +#define INTEL_PMC_IDX_TD_HEAVY_OPS (INTEL_PMC_IDX_METRIC_BASE + 4) +#define INTEL_PMC_IDX_TD_BR_MISPREDICT (INTEL_PMC_IDX_METRIC_BASE + 5) +#define INTEL_PMC_IDX_TD_FETCH_LAT (INTEL_PMC_IDX_METRIC_BASE + 6) +#define INTEL_PMC_IDX_TD_MEM_BOUND (INTEL_PMC_IDX_METRIC_BASE + 7) +#define INTEL_PMC_IDX_METRIC_END INTEL_PMC_IDX_TD_MEM_BOUND +#define INTEL_PMC_MSK_TOPDOWN ((0xffull << INTEL_PMC_IDX_METRIC_BASE) | \ INTEL_PMC_MSK_FIXED_SLOTS) /* diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index b2cc246ec119..7d292de51410 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -1146,14 +1146,16 @@ union perf_mem_data_src { mem_lvl_num:4, /* memory hierarchy level number */ mem_remote:1, /* remote */ mem_snoopx:2, /* snoop mode, ext */ - mem_rsvd:24; + mem_blk:3, /* access blocked */ + mem_rsvd:21; }; }; #elif defined(__BIG_ENDIAN_BITFIELD) union perf_mem_data_src { __u64 val; struct { - __u64 mem_rsvd:24, + __u64 mem_rsvd:21, + mem_blk:3, /* access blocked */ mem_snoopx:2, /* snoop mode, ext */ mem_remote:1, /* remote */ mem_lvl_num:4, /* memory hierarchy level number */ @@ -1236,6 +1238,12 @@ union perf_mem_data_src { #define PERF_MEM_TLB_OS 0x40 /* OS fault handler */ #define PERF_MEM_TLB_SHIFT 26 +/* Access blocked */ +#define PERF_MEM_BLK_NA 0x01 /* not available */ +#define PERF_MEM_BLK_DATA 0x02 /* data could not be forwarded */ +#define PERF_MEM_BLK_ADDR 0x04 /* address conflict */ +#define PERF_MEM_BLK_SHIFT 40 + #define PERF_MEM_S(a, s) \ (((__u64)PERF_MEM_##a##_##s) << PERF_MEM_##a##_SHIFT) From 32451614da2a9cf4296f90d3606ac77814fb519d Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 28 Jan 2021 14:40:11 -0800 Subject: [PATCH 056/108] perf/x86/intel: Support CPUID 10.ECX to disable fixed counters With Architectural Performance Monitoring Version 5, CPUID 10.ECX cpu leaf indicates the fixed counter enumeration. This extends the previous count to a bitmap which allows disabling even lower fixed counters. It could be used by a Hypervisor. The existing intel_ctrl variable is used to remember the bitmask of the counters. All code that reads all counters is fixed to check this extra bitmask. Suggested-by: Peter Zijlstra (Intel) Originally-by: Andi Kleen Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1611873611-156687-6-git-send-email-kan.liang@linux.intel.com --- arch/x86/events/core.c | 8 +++++++- arch/x86/events/intel/core.c | 34 ++++++++++++++++++++++++---------- arch/x86/events/perf_event.h | 5 +++++ 3 files changed, 36 insertions(+), 11 deletions(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index cf0a52c80408..6ddeed3cd2ac 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -255,6 +255,8 @@ static bool check_hw_exists(void) if (ret) goto msr_fail; for (i = 0; i < x86_pmu.num_counters_fixed; i++) { + if (fixed_counter_disabled(i)) + continue; if (val & (0x03 << i*4)) { bios_fail = 1; val_fail = val; @@ -1531,6 +1533,8 @@ void perf_event_print_debug(void) cpu, idx, prev_left); } for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { + if (fixed_counter_disabled(idx)) + continue; rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count); pr_info("CPU#%d: fixed-PMC%d count: %016llx\n", @@ -2012,7 +2016,9 @@ static int __init init_hw_perf_events(void) pr_info("... generic registers: %d\n", x86_pmu.num_counters); pr_info("... value mask: %016Lx\n", x86_pmu.cntval_mask); pr_info("... max period: %016Lx\n", x86_pmu.max_period); - pr_info("... fixed-purpose events: %d\n", x86_pmu.num_counters_fixed); + pr_info("... fixed-purpose events: %lu\n", + hweight64((((1ULL << x86_pmu.num_counters_fixed) - 1) + << INTEL_PMC_IDX_FIXED) & x86_pmu.intel_ctrl)); pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl); if (!x86_pmu.read) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 58cd64e1fc4d..67a724657f60 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2723,8 +2723,11 @@ static void intel_pmu_reset(void) wrmsrl_safe(x86_pmu_config_addr(idx), 0ull); wrmsrl_safe(x86_pmu_event_addr(idx), 0ull); } - for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) + for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { + if (fixed_counter_disabled(idx)) + continue; wrmsrl_safe(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); + } if (ds) ds->bts_index = ds->bts_buffer_base; @@ -5042,7 +5045,7 @@ __init int intel_pmu_init(void) union cpuid10_eax eax; union cpuid10_ebx ebx; struct event_constraint *c; - unsigned int unused; + unsigned int fixed_mask; struct extra_reg *er; bool pmem = false; int version, i; @@ -5064,7 +5067,7 @@ __init int intel_pmu_init(void) * Check whether the Architectural PerfMon supports * Branch Misses Retired hw_event or not. */ - cpuid(10, &eax.full, &ebx.full, &unused, &edx.full); + cpuid(10, &eax.full, &ebx.full, &fixed_mask, &edx.full); if (eax.split.mask_length < ARCH_PERFMON_EVENTS_COUNT) return -ENODEV; @@ -5088,12 +5091,15 @@ __init int intel_pmu_init(void) * Quirk: v2 perfmon does not report fixed-purpose events, so * assume at least 3 events, when not running in a hypervisor: */ - if (version > 1) { + if (version > 1 && version < 5) { int assume = 3 * !boot_cpu_has(X86_FEATURE_HYPERVISOR); x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, assume); - } + + fixed_mask = (1L << x86_pmu.num_counters_fixed) - 1; + } else if (version >= 5) + x86_pmu.num_counters_fixed = fls(fixed_mask); if (boot_cpu_has(X86_FEATURE_PDCM)) { u64 capabilities; @@ -5680,8 +5686,7 @@ __init int intel_pmu_init(void) x86_pmu.num_counters_fixed = INTEL_PMC_MAX_FIXED; } - x86_pmu.intel_ctrl |= - ((1LL << x86_pmu.num_counters_fixed)-1) << INTEL_PMC_IDX_FIXED; + x86_pmu.intel_ctrl |= (u64)fixed_mask << INTEL_PMC_IDX_FIXED; /* AnyThread may be deprecated on arch perfmon v5 or later */ if (x86_pmu.intel_cap.anythread_deprecated) @@ -5698,13 +5703,22 @@ __init int intel_pmu_init(void) * events to the generic counters. */ if (c->idxmsk64 & INTEL_PMC_MSK_TOPDOWN) { + /* + * Disable topdown slots and metrics events, + * if slots event is not in CPUID. + */ + if (!(INTEL_PMC_MSK_FIXED_SLOTS & x86_pmu.intel_ctrl)) + c->idxmsk64 = 0; c->weight = hweight64(c->idxmsk64); continue; } - if (c->cmask == FIXED_EVENT_FLAGS - && c->idxmsk64 != INTEL_PMC_MSK_FIXED_REF_CYCLES) { - c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1; + if (c->cmask == FIXED_EVENT_FLAGS) { + /* Disabled fixed counters which are not in CPUID */ + c->idxmsk64 &= x86_pmu.intel_ctrl; + + if (c->idxmsk64 != INTEL_PMC_MSK_FIXED_REF_CYCLES) + c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1; } c->idxmsk64 &= ~(~0ULL << (INTEL_PMC_IDX_FIXED + x86_pmu.num_counters_fixed)); diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index b8bf2ce23913..53b2b5fc23bc 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -1068,6 +1068,11 @@ ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, ssize_t events_ht_sysfs_show(struct device *dev, struct device_attribute *attr, char *page); +static inline bool fixed_counter_disabled(int i) +{ + return !(x86_pmu.intel_ctrl >> (i + INTEL_PMC_IDX_FIXED)); +} + #ifdef CONFIG_CPU_SUP_AMD int amd_pmu_init(void); From d1bbc35fcab28668c8992c4d5777234b794d7306 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Mon, 1 Feb 2021 10:03:06 -0500 Subject: [PATCH 057/108] arm64: hibernate: add __force attribute to gfp_t casting Two new warnings are reported by sparse: "sparse warnings: (new ones prefixed by >>)" >> arch/arm64/kernel/hibernate.c:181:39: sparse: sparse: cast to restricted gfp_t >> arch/arm64/kernel/hibernate.c:202:44: sparse: sparse: cast from restricted gfp_t gfp_t has __bitwise type attribute and requires __force added to casting in order to avoid these warnings. Fixes: 50f53fb72181 ("arm64: trans_pgd: make trans_pgd_map_page generic") Reported-by: kernel test robot Signed-off-by: Pavel Tatashin Link: https://lore.kernel.org/r/20210201150306.54099-2-pasha.tatashin@soleen.com Signed-off-by: Will Deacon --- arch/arm64/kernel/hibernate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c index 9df32ba0d574..b1cef371df2b 100644 --- a/arch/arm64/kernel/hibernate.c +++ b/arch/arm64/kernel/hibernate.c @@ -178,7 +178,7 @@ EXPORT_SYMBOL(arch_hibernation_header_restore); static void *hibernate_page_alloc(void *arg) { - return (void *)get_safe_page((gfp_t)(unsigned long)arg); + return (void *)get_safe_page((__force gfp_t)(unsigned long)arg); } /* @@ -198,7 +198,7 @@ static int create_safe_exec_page(void *src_start, size_t length, { struct trans_pgd_info trans_info = { .trans_alloc_page = hibernate_page_alloc, - .trans_alloc_arg = (void *)GFP_ATOMIC, + .trans_alloc_arg = (__force void *)GFP_ATOMIC, }; void *page = (void *)get_safe_page(GFP_ATOMIC); From 20116dd93f4d0b2e84a25ee83e3238586dbb79ec Mon Sep 17 00:00:00 2001 From: Qi Liu Date: Tue, 2 Feb 2021 15:58:06 +0800 Subject: [PATCH 058/108] drivers/perf: Prevent forced unbinding of ARM_DMC620_PMU drivers Set "suppress_bind_attrs" to true, so that bind/unbind can be disabled via sysfs and prevent unbinding ARM_DMC620_PMU drivers during perf sampling. Signed-off-by: Qi Liu Link: https://lore.kernel.org/r/1612252686-50329-1-git-send-email-liuqi115@huawei.com Signed-off-by: Will Deacon --- drivers/perf/arm_dmc620_pmu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/perf/arm_dmc620_pmu.c b/drivers/perf/arm_dmc620_pmu.c index 27f54c0afc3b..66ad5b3ece19 100644 --- a/drivers/perf/arm_dmc620_pmu.c +++ b/drivers/perf/arm_dmc620_pmu.c @@ -717,6 +717,7 @@ static struct platform_driver dmc620_pmu_driver = { .driver = { .name = DMC620_DRVNAME, .acpi_match_table = dmc620_acpi_match, + .suppress_bind_attrs = true, }, .probe = dmc620_pmu_device_probe, .remove = dmc620_pmu_device_remove, From 2ceee7ed4c6c9e3eec1004aee43608bc98b10603 Mon Sep 17 00:00:00 2001 From: Rikard Falkeborn Date: Sun, 31 Jan 2021 15:36:15 +0100 Subject: [PATCH 059/108] arm64: perf: Constify static attribute_group structs The only usage of these is to put their addresses in an array of pointers to const attribute_group structs. Make them const to allow the compiler to put them in read-only memory. Signed-off-by: Rikard Falkeborn Signed-off-by: Will Deacon --- arch/arm64/kernel/perf_event.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c index 3605f77ad4df..59b1eed52236 100644 --- a/arch/arm64/kernel/perf_event.c +++ b/arch/arm64/kernel/perf_event.c @@ -280,7 +280,7 @@ armv8pmu_event_attr_is_visible(struct kobject *kobj, return 0; } -static struct attribute_group armv8_pmuv3_events_attr_group = { +static const struct attribute_group armv8_pmuv3_events_attr_group = { .name = "events", .attrs = armv8_pmuv3_event_attrs, .is_visible = armv8pmu_event_attr_is_visible, @@ -300,7 +300,7 @@ static struct attribute *armv8_pmuv3_format_attrs[] = { NULL, }; -static struct attribute_group armv8_pmuv3_format_attr_group = { +static const struct attribute_group armv8_pmuv3_format_attr_group = { .name = "format", .attrs = armv8_pmuv3_format_attrs, }; @@ -322,7 +322,7 @@ static struct attribute *armv8_pmuv3_caps_attrs[] = { NULL, }; -static struct attribute_group armv8_pmuv3_caps_attr_group = { +static const struct attribute_group armv8_pmuv3_caps_attr_group = { .name = "caps", .attrs = armv8_pmuv3_caps_attrs, }; From 12fc4288408a8799409f7fa62a526b60e92da334 Mon Sep 17 00:00:00 2001 From: Keno Fischer Date: Mon, 1 Feb 2021 19:21:09 -0500 Subject: [PATCH 060/108] arm64: ptrace: Fix missing return in hw breakpoint code When delivering a hw-breakpoint SIGTRAP to a compat task via ptrace, the lack of a 'return' statement means we fallthrough to the native case, which differs in its handling of 'si_errno'. Although this looks to be harmless because the subsequent signal is effectively ignored, it's confusing and unintentional, so add the missing 'return'. Signed-off-by: Keno Fischer Link: https://lore.kernel.org/r/20210202002109.GA624440@juliacomputing.com Signed-off-by: Will Deacon --- arch/arm64/kernel/ptrace.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index 8ac487c84e37..3d5c8afca75b 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -194,6 +194,7 @@ static void ptrace_hbptriggered(struct perf_event *bp, } arm64_force_sig_ptrace_errno_trap(si_errno, bkpt->trigger, desc); + return; } #endif arm64_force_sig_fault(SIGTRAP, TRAP_HWBKPT, bkpt->trigger, desc); From b9ba680969d1016888fed4e03d8ecb4b97726f34 Mon Sep 17 00:00:00 2001 From: Hailong Liu Date: Tue, 2 Feb 2021 23:07:49 +0800 Subject: [PATCH 061/108] arm64/ptdump:display the Linear Mapping start marker The current /sys/kernel/debug/kernel_page_tables does not display the *Linear Mapping start* marker on arm64, which I think should be paired with the *Linear Mapping end* marker. Since *Linear Mapping start* is the first marker, use initialise 'level' to -1 in order to display it. Signed-off-by: Hailong Liu Link: https://lore.kernel.org/r/20210202150749.10104-1-liuhailongg6@163.com Signed-off-by: Will Deacon --- arch/arm64/mm/ptdump.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/mm/ptdump.c b/arch/arm64/mm/ptdump.c index 04137a8f3d2d..0e050d76b83a 100644 --- a/arch/arm64/mm/ptdump.c +++ b/arch/arm64/mm/ptdump.c @@ -324,6 +324,7 @@ void ptdump_walk(struct seq_file *s, struct ptdump_info *info) st = (struct pg_state){ .seq = s, .marker = info->markers, + .level = -1, .ptdump = { .note_page = note_page, .range = (struct ptdump_range[]){ From db2bb91f2e8e73d85876d1665608e834d91d21ee Mon Sep 17 00:00:00 2001 From: Seiya Wang Date: Wed, 3 Feb 2021 13:53:47 +0800 Subject: [PATCH 062/108] arm64: perf: add support for Cortex-A78 Add support for Cortex-A78 using generic PMUv3 for now. Signed-off-by: Seiya Wang Acked-by: Mark Rutland Link: https://lore.kernel.org/r/20210203055348.4935-2-seiya.wang@mediatek.com Signed-off-by: Will Deacon --- arch/arm64/kernel/perf_event.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c index 59b1eed52236..b6d0b2cbdc79 100644 --- a/arch/arm64/kernel/perf_event.c +++ b/arch/arm64/kernel/perf_event.c @@ -1188,6 +1188,12 @@ static int armv8_a77_pmu_init(struct arm_pmu *cpu_pmu) armv8_pmuv3_map_event); } +static int armv8_a78_pmu_init(struct arm_pmu *cpu_pmu) +{ + return armv8_pmu_init_nogroups(cpu_pmu, "armv8_cortex_a78", + armv8_pmuv3_map_event); +} + static int armv8_e1_pmu_init(struct arm_pmu *cpu_pmu) { return armv8_pmu_init_nogroups(cpu_pmu, "armv8_neoverse_e1", @@ -1225,6 +1231,7 @@ static const struct of_device_id armv8_pmu_of_device_ids[] = { {.compatible = "arm,cortex-a75-pmu", .data = armv8_a75_pmu_init}, {.compatible = "arm,cortex-a76-pmu", .data = armv8_a76_pmu_init}, {.compatible = "arm,cortex-a77-pmu", .data = armv8_a77_pmu_init}, + {.compatible = "arm,cortex-a78-pmu", .data = armv8_a78_pmu_init}, {.compatible = "arm,neoverse-e1-pmu", .data = armv8_e1_pmu_init}, {.compatible = "arm,neoverse-n1-pmu", .data = armv8_n1_pmu_init}, {.compatible = "cavium,thunder-pmu", .data = armv8_thunder_pmu_init}, From 750d43b4a79e5f3767ee1db933b42abdf967ce1e Mon Sep 17 00:00:00 2001 From: Seiya Wang Date: Wed, 3 Feb 2021 13:53:48 +0800 Subject: [PATCH 063/108] dt-bindings: arm: add Cortex-A78 binding Add compatible for Cortex-A78 PMU Signed-off-by: Seiya Wang Acked-by: Mark Rutland Link: https://lore.kernel.org/r/20210203055348.4935-3-seiya.wang@mediatek.com Signed-off-by: Will Deacon --- Documentation/devicetree/bindings/arm/pmu.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/arm/pmu.yaml b/Documentation/devicetree/bindings/arm/pmu.yaml index 693ef3f185a8..e17ac049e890 100644 --- a/Documentation/devicetree/bindings/arm/pmu.yaml +++ b/Documentation/devicetree/bindings/arm/pmu.yaml @@ -43,6 +43,7 @@ properties: - arm,cortex-a75-pmu - arm,cortex-a76-pmu - arm,cortex-a77-pmu + - arm,cortex-a78-pmu - arm,neoverse-e1-pmu - arm,neoverse-n1-pmu - brcm,vulcan-pmu From 00ef543419366e8b742435992d08e0d5a87fd561 Mon Sep 17 00:00:00 2001 From: Joey Gouly Date: Tue, 2 Feb 2021 12:36:57 +0000 Subject: [PATCH 064/108] arm64: vmlinux.ld.S: add assertion for reserved_pg_dir offset Add RESERVED_SWAPPER_OFFSET and use that instead of hardcoding the offset between swapper_pg_dir and reserved_pg_dir. Then use RESERVED_SWAPPER_OFFSET to assert that the offset is correct at link time. Signed-off-by: Joey Gouly Reviewed-by: Mark Rutland Tested-by: Mark Rutland Link: https://lore.kernel.org/r/20210202123658.22308-2-joey.gouly@arm.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/asm-uaccess.h | 4 ++-- arch/arm64/include/asm/memory.h | 6 ++++++ arch/arm64/include/asm/uaccess.h | 2 +- arch/arm64/kernel/vmlinux.lds.S | 3 +++ 4 files changed, 12 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/asm-uaccess.h b/arch/arm64/include/asm/asm-uaccess.h index 9990059be106..ccedf548dac9 100644 --- a/arch/arm64/include/asm/asm-uaccess.h +++ b/arch/arm64/include/asm/asm-uaccess.h @@ -15,10 +15,10 @@ .macro __uaccess_ttbr0_disable, tmp1 mrs \tmp1, ttbr1_el1 // swapper_pg_dir bic \tmp1, \tmp1, #TTBR_ASID_MASK - sub \tmp1, \tmp1, #PAGE_SIZE // reserved_pg_dir just before swapper_pg_dir + sub \tmp1, \tmp1, #RESERVED_SWAPPER_OFFSET // reserved_pg_dir msr ttbr0_el1, \tmp1 // set reserved TTBR0_EL1 isb - add \tmp1, \tmp1, #PAGE_SIZE + add \tmp1, \tmp1, #RESERVED_SWAPPER_OFFSET msr ttbr1_el1, \tmp1 // set reserved ASID isb .endm diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index 18fce223b67b..f6bf8a972a16 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -159,6 +159,12 @@ #define IOREMAP_MAX_ORDER (PMD_SHIFT) #endif +/* + * Open-coded (swapper_pg_dir - reserved_pg_dir) as this cannot be calculated + * until link time. + */ +#define RESERVED_SWAPPER_OFFSET (PAGE_SIZE) + #ifndef __ASSEMBLY__ #include diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index f0fe0cc6abe0..0deb88467111 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -87,7 +87,7 @@ static inline void __uaccess_ttbr0_disable(void) ttbr = read_sysreg(ttbr1_el1); ttbr &= ~TTBR_ASID_MASK; /* reserved_pg_dir placed before swapper_pg_dir */ - write_sysreg(ttbr - PAGE_SIZE, ttbr0_el1); + write_sysreg(ttbr - RESERVED_SWAPPER_OFFSET, ttbr0_el1); isb(); /* Set reserved ASID */ write_sysreg(ttbr, ttbr1_el1); diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 4c0b0c89ad59..a03a5300bce9 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -316,3 +316,6 @@ ASSERT((__entry_tramp_text_end - __entry_tramp_text_start) == PAGE_SIZE, * If padding is applied before .head.text, virt<->phys conversions will fail. */ ASSERT(_text == KIMAGE_VADDR, "HEAD is misaligned") + +ASSERT(swapper_pg_dir - reserved_pg_dir == RESERVED_SWAPPER_OFFSET, + "RESERVED_SWAPPER_OFFSET is wrong!") From 0188a894c390e51475274ece76b4d601782d709e Mon Sep 17 00:00:00 2001 From: Joey Gouly Date: Tue, 2 Feb 2021 12:36:58 +0000 Subject: [PATCH 065/108] arm64: vmlinux.ld.S: add assertion for tramp_pg_dir offset Add TRAMP_SWAPPER_OFFSET and use that instead of hardcoding the offset between swapper_pg_dir and tramp_pg_dir. Then use TRAMP_SWAPPER_OFFSET to assert that the offset is correct at link time. Signed-off-by: Joey Gouly Reviewed-by: Mark Rutland Tested-by: Mark Rutland Link: https://lore.kernel.org/r/20210202123658.22308-3-joey.gouly@arm.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/memory.h | 6 ++++++ arch/arm64/kernel/entry.S | 4 ++-- arch/arm64/kernel/vmlinux.lds.S | 5 +++++ 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index f6bf8a972a16..6c2674937665 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -165,6 +165,12 @@ */ #define RESERVED_SWAPPER_OFFSET (PAGE_SIZE) +/* + * Open-coded (swapper_pg_dir - tramp_pg_dir) as this cannot be calculated + * until link time. + */ +#define TRAMP_SWAPPER_OFFSET (2 * PAGE_SIZE) + #ifndef __ASSEMBLY__ #include diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index c9bae73f2621..c6aee646eb6b 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -805,7 +805,7 @@ SYM_CODE_END(ret_to_user) // Move from tramp_pg_dir to swapper_pg_dir .macro tramp_map_kernel, tmp mrs \tmp, ttbr1_el1 - add \tmp, \tmp, #(2 * PAGE_SIZE) + add \tmp, \tmp, #TRAMP_SWAPPER_OFFSET bic \tmp, \tmp, #USER_ASID_FLAG msr ttbr1_el1, \tmp #ifdef CONFIG_QCOM_FALKOR_ERRATUM_1003 @@ -825,7 +825,7 @@ alternative_else_nop_endif // Move from swapper_pg_dir to tramp_pg_dir .macro tramp_unmap_kernel, tmp mrs \tmp, ttbr1_el1 - sub \tmp, \tmp, #(2 * PAGE_SIZE) + sub \tmp, \tmp, #TRAMP_SWAPPER_OFFSET orr \tmp, \tmp, #USER_ASID_FLAG msr ttbr1_el1, \tmp /* diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index a03a5300bce9..68f76a96c60b 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -319,3 +319,8 @@ ASSERT(_text == KIMAGE_VADDR, "HEAD is misaligned") ASSERT(swapper_pg_dir - reserved_pg_dir == RESERVED_SWAPPER_OFFSET, "RESERVED_SWAPPER_OFFSET is wrong!") + +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 +ASSERT(swapper_pg_dir - tramp_pg_dir == TRAMP_SWAPPER_OFFSET, + "TRAMP_SWAPPER_OFFSET is wrong!") +#endif From d13c613f136c9090f3863c49b2306d57ab59feba Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 3 Feb 2021 12:36:18 +0100 Subject: [PATCH 066/108] arm64: assembler: add cond_yield macro Add a macro cond_yield that branches to a specified label when called if the TIF_NEED_RESCHED flag is set and decreasing the preempt count would make the task preemptible again, resulting in a schedule to occur. This can be used by kernel mode SIMD code that keeps a lot of state in SIMD registers, which would make chunking the input in order to perform the cond_resched() check from C code disproportionately costly. Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20210203113626.220151-2-ardb@kernel.org Signed-off-by: Will Deacon --- arch/arm64/include/asm/assembler.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index bf125c591116..27b1ea721c2d 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -745,6 +745,22 @@ USER(\label, ic ivau, \tmp2) // invalidate I line PoU .Lyield_out_\@ : .endm + /* + * Check whether preempt-disabled code should yield as soon as it + * is able. This is the case if re-enabling preemption a single + * time results in a preempt count of zero, and the TIF_NEED_RESCHED + * flag is set. (Note that the latter is stored negated in the + * top word of the thread_info::preempt_count field) + */ + .macro cond_yield, lbl:req, tmp:req +#ifdef CONFIG_PREEMPTION + get_current_task \tmp + ldr \tmp, [\tmp, #TSK_TI_PREEMPT] + sub \tmp, \tmp, #PREEMPT_DISABLE_OFFSET + cbz \tmp, \lbl +#endif + .endm + /* * This macro emits a program property note section identifying * architecture features which require special handling, mainly for From d9f1b52afa4012974b3c726ca89ae311f194e83f Mon Sep 17 00:00:00 2001 From: Zhiyuan Dai Date: Thu, 4 Feb 2021 09:43:49 +0800 Subject: [PATCH 067/108] arm64: improve whitespace In a few places we don't have whitespace between macro parameters, which makes them hard to read. This patch adds whitespace to clearly separate the parameters. In a few places we have unnecessary whitespace around unary operators, which is confusing, This patch removes the unnecessary whitespace. Signed-off-by: Zhiyuan Dai Link: https://lore.kernel.org/r/1612403029-5011-1-git-send-email-daizhiyuan@phytium.com.cn Signed-off-by: Will Deacon --- arch/arm64/kernel/alternative.c | 2 +- arch/arm64/kernel/module-plts.c | 2 +- arch/arm64/kernel/perf_event.c | 2 +- arch/arm64/kernel/process.c | 4 ++-- arch/arm64/kernel/traps.c | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kernel/alternative.c b/arch/arm64/kernel/alternative.c index a57cffb752e8..1184c44ea2c7 100644 --- a/arch/arm64/kernel/alternative.c +++ b/arch/arm64/kernel/alternative.c @@ -17,7 +17,7 @@ #include #include -#define __ALT_PTR(a,f) ((void *)&(a)->f + (a)->f) +#define __ALT_PTR(a, f) ((void *)&(a)->f + (a)->f) #define ALT_ORIG_PTR(a) __ALT_PTR(a, orig_offset) #define ALT_REPL_PTR(a) __ALT_PTR(a, alt_offset) diff --git a/arch/arm64/kernel/module-plts.c b/arch/arm64/kernel/module-plts.c index 2e224435c024..e53493d8b208 100644 --- a/arch/arm64/kernel/module-plts.c +++ b/arch/arm64/kernel/module-plts.c @@ -131,7 +131,7 @@ u64 module_emit_veneer_for_adrp(struct module *mod, Elf64_Shdr *sechdrs, } #endif -#define cmp_3way(a,b) ((a) < (b) ? -1 : (a) > (b)) +#define cmp_3way(a, b) ((a) < (b) ? -1 : (a) > (b)) static int cmp_rela(const void *a, const void *b) { diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c index 3605f77ad4df..d1fec4ab4bcf 100644 --- a/arch/arm64/kernel/perf_event.c +++ b/arch/arm64/kernel/perf_event.c @@ -810,7 +810,7 @@ static int armv8pmu_get_single_idx(struct pmu_hw_events *cpuc, { int idx; - for (idx = ARMV8_IDX_COUNTER0; idx < cpu_pmu->num_events; idx ++) { + for (idx = ARMV8_IDX_COUNTER0; idx < cpu_pmu->num_events; idx++) { if (!test_and_set_bit(idx, cpuc->used_mask)) return idx; } diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 6616486a58fe..4cc1ccc8d6ab 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -304,7 +304,7 @@ void __show_regs(struct pt_regs *regs) } } -void show_regs(struct pt_regs * regs) +void show_regs(struct pt_regs *regs) { __show_regs(regs); dump_backtrace(regs, NULL, KERN_DEFAULT); @@ -587,7 +587,7 @@ unsigned long get_wchan(struct task_struct *p) ret = frame.pc; goto out; } - } while (count ++ < 16); + } while (count++ < 16); out: put_task_stack(p); diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 6895ce777e7f..a05d34f0e82a 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -45,7 +45,7 @@ #include #include -static const char *handler[]= { +static const char *handler[] = { "Synchronous Abort", "IRQ", "FIQ", From abd4737f67d75563d1d0cc57bd5daab026e8c2d1 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 5 Feb 2021 04:09:19 -0500 Subject: [PATCH 068/108] mm/arm64: Correct obsolete comment in do_page_fault() commit d8ed45c5dcd4 ("mmap locking API: use coccinelle to convert mmap_sem rwsem call sites") has convertd down_read_trylock() to mmap_read_trylock(). But it forgot to update the relevant comment. Signed-off-by: Miaohe Lin Link: https://lore.kernel.org/r/20210205090919.63382-1-linmiaohe@huawei.com Signed-off-by: Will Deacon --- arch/arm64/mm/fault.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 3c40da479899..86a3877ea86f 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -564,7 +564,7 @@ static int __kprobes do_page_fault(unsigned long far, unsigned int esr, mmap_read_lock(mm); } else { /* - * The above down_read_trylock() might have succeeded in which + * The above mmap_read_trylock() might have succeeded in which * case, we'll have missed the might_sleep() from down_read(). */ might_sleep(); From c0b15c25d25171db4b70cc0b7dbc1130ee94017d Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Wed, 3 Feb 2021 23:00:57 +0000 Subject: [PATCH 069/108] arm64: Extend workaround for erratum 1024718 to all versions of Cortex-A55 The erratum 1024718 affects Cortex-A55 r0p0 to r2p0. However we apply the work around for r0p0 - r1p0. Unfortunately this won't be fixed for the future revisions for the CPU. Thus extend the work around for all versions of A55, to cover for r2p0 and any future revisions. Cc: stable@vger.kernel.org Cc: Catalin Marinas Cc: Will Deacon Cc: James Morse Cc: Kunihiko Hayashi Signed-off-by: Suzuki K Poulose Link: https://lore.kernel.org/r/20210203230057.3961239-1-suzuki.poulose@arm.com [will: Update Kconfig help text] Signed-off-by: Will Deacon --- arch/arm64/Kconfig | 2 +- arch/arm64/kernel/cpufeature.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index f39568b28ec1..3dfb25afa616 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -522,7 +522,7 @@ config ARM64_ERRATUM_1024718 help This option adds a workaround for ARM Cortex-A55 Erratum 1024718. - Affected Cortex-A55 cores (r0p0, r0p1, r1p0) could cause incorrect + Affected Cortex-A55 cores (all revisions) could cause incorrect update of the hardware dirty bit when the DBM/AP bits are updated without a break-before-make. The workaround is to disable the usage of hardware DBM locally on the affected cores. CPUs not affected by diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index e99eddec0a46..db400ca77427 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -1455,7 +1455,7 @@ static bool cpu_has_broken_dbm(void) /* List of CPUs which have broken DBM support. */ static const struct midr_range cpus[] = { #ifdef CONFIG_ARM64_ERRATUM_1024718 - MIDR_RANGE(MIDR_CORTEX_A55, 0, 0, 1, 0), // A55 r0p0 -r1p0 + MIDR_ALL_VERSIONS(MIDR_CORTEX_A55), /* Kryo4xx Silver (rdpe => r1p0) */ MIDR_REV(MIDR_QCOM_KRYO_4XX_SILVER, 0xd, 0xe), #endif From 114945d84a30a5feba8ec24d854257c78c89abd1 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 8 Feb 2021 09:57:10 +0000 Subject: [PATCH 070/108] arm64: Fix labels in el2_setup macros If someone happens to write the following code: b 1f init_el2_state vhe 1: [...] they will be in for a long debugging session, as the label "1f" will be resolved *inside* the init_el2_state macro instead of after it. Not really what one expects. Instead, rewite the EL2 setup macros to use unambiguous labels, thanks to the usual macro counter trick. Acked-by: Catalin Marinas Signed-off-by: Marc Zyngier Acked-by: David Brazdil Link: https://lore.kernel.org/r/20210208095732.3267263-2-maz@kernel.org Signed-off-by: Will Deacon --- arch/arm64/include/asm/el2_setup.h | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h index a7f5a1bbc8ac..540116de80bf 100644 --- a/arch/arm64/include/asm/el2_setup.h +++ b/arch/arm64/include/asm/el2_setup.h @@ -45,24 +45,24 @@ mrs x1, id_aa64dfr0_el1 sbfx x0, x1, #ID_AA64DFR0_PMUVER_SHIFT, #4 cmp x0, #1 - b.lt 1f // Skip if no PMU present + b.lt .Lskip_pmu_\@ // Skip if no PMU present mrs x0, pmcr_el0 // Disable debug access traps ubfx x0, x0, #11, #5 // to EL2 and allow access to -1: +.Lskip_pmu_\@: csel x2, xzr, x0, lt // all PMU counters from EL1 /* Statistical profiling */ ubfx x0, x1, #ID_AA64DFR0_PMSVER_SHIFT, #4 - cbz x0, 3f // Skip if SPE not present + cbz x0, .Lskip_spe_\@ // Skip if SPE not present .ifeqs "\mode", "nvhe" mrs_s x0, SYS_PMBIDR_EL1 // If SPE available at EL2, and x0, x0, #(1 << SYS_PMBIDR_EL1_P_SHIFT) - cbnz x0, 2f // then permit sampling of physical + cbnz x0, .Lskip_spe_el2_\@ // then permit sampling of physical mov x0, #(1 << SYS_PMSCR_EL2_PCT_SHIFT | \ 1 << SYS_PMSCR_EL2_PA_SHIFT) msr_s SYS_PMSCR_EL2, x0 // addresses and physical counter -2: +.Lskip_spe_el2_\@: mov x0, #(MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT) orr x2, x2, x0 // If we don't have VHE, then // use EL1&0 translation. @@ -71,7 +71,7 @@ // and disable access from EL1 .endif -3: +.Lskip_spe_\@: msr mdcr_el2, x2 // Configure debug traps .endm @@ -79,9 +79,9 @@ .macro __init_el2_lor mrs x1, id_aa64mmfr1_el1 ubfx x0, x1, #ID_AA64MMFR1_LOR_SHIFT, 4 - cbz x0, 1f + cbz x0, .Lskip_lor_\@ msr_s SYS_LORC_EL1, xzr -1: +.Lskip_lor_\@: .endm /* Stage-2 translation */ @@ -93,7 +93,7 @@ .macro __init_el2_gicv3 mrs x0, id_aa64pfr0_el1 ubfx x0, x0, #ID_AA64PFR0_GIC_SHIFT, #4 - cbz x0, 1f + cbz x0, .Lskip_gicv3_\@ mrs_s x0, SYS_ICC_SRE_EL2 orr x0, x0, #ICC_SRE_EL2_SRE // Set ICC_SRE_EL2.SRE==1 @@ -103,7 +103,7 @@ mrs_s x0, SYS_ICC_SRE_EL2 // Read SRE back, tbz x0, #0, 1f // and check that it sticks msr_s SYS_ICH_HCR_EL2, xzr // Reset ICC_HCR_EL2 to defaults -1: +.Lskip_gicv3_\@: .endm .macro __init_el2_hstr @@ -128,14 +128,14 @@ .macro __init_el2_nvhe_sve mrs x1, id_aa64pfr0_el1 ubfx x1, x1, #ID_AA64PFR0_SVE_SHIFT, #4 - cbz x1, 1f + cbz x1, .Lskip_sve_\@ bic x0, x0, #CPTR_EL2_TZ // Also disable SVE traps msr cptr_el2, x0 // Disable copro. traps to EL2 isb mov x1, #ZCR_ELx_LEN_MASK // SVE: Enable full vector msr_s SYS_ZCR_EL2, x1 // length for EL1. -1: +.Lskip_sve_\@: .endm .macro __init_el2_nvhe_prepare_eret From b161f92482426a7323884d57cbae683812909988 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 8 Feb 2021 09:57:11 +0000 Subject: [PATCH 071/108] arm64: Fix outdated TCR setup comment The arm64 kernel has long be able to use more than 39bit VAs. Since day one, actually. Let's rewrite the offending comment. Signed-off-by: Marc Zyngier Acked-by: Catalin Marinas Acked-by: David Brazdil Link: https://lore.kernel.org/r/20210208095732.3267263-3-maz@kernel.org Signed-off-by: Will Deacon --- arch/arm64/mm/proc.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index 1f7ee8c8b7b8..ece785477bdc 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -464,8 +464,8 @@ SYM_FUNC_START(__cpu_setup) #endif msr mair_el1, x5 /* - * Set/prepare TCR and TTBR. We use 512GB (39-bit) address range for - * both user and kernel. + * Set/prepare TCR and TTBR. TCR_EL1.T1SZ gets further + * adjusted if the kernel is compiled with 52bit VA support. */ mov_q x10, TCR_TxSZ(VA_BITS) | TCR_CACHE_FLAGS | TCR_SMP_FLAGS | \ TCR_TG_FLAGS | TCR_KASLR_FLAGS | TCR_ASID16 | \ From 8cc8a32415364e475c25277b507f06f67c47ca9a Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 8 Feb 2021 09:57:12 +0000 Subject: [PATCH 072/108] arm64: Turn the MMU-on sequence into a macro Turning the MMU on is a popular sport in the arm64 kernel, and we do it more than once, or even twice. As we are about to add even more, let's turn it into a macro. No expected functional change. Signed-off-by: Marc Zyngier Acked-by: Catalin Marinas Acked-by: David Brazdil Link: https://lore.kernel.org/r/20210208095732.3267263-4-maz@kernel.org Signed-off-by: Will Deacon --- arch/arm64/include/asm/assembler.h | 17 +++++++++++++++++ arch/arm64/kernel/head.S | 19 ++++--------------- arch/arm64/mm/proc.S | 12 +----------- 3 files changed, 22 insertions(+), 26 deletions(-) diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index bf125c591116..8cded93f99c3 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -675,6 +675,23 @@ USER(\label, ic ivau, \tmp2) // invalidate I line PoU .endif .endm +/* + * Set SCTLR_EL1 to the passed value, and invalidate the local icache + * in the process. This is called when setting the MMU on. + */ +.macro set_sctlr_el1, reg + msr sctlr_el1, \reg + isb + /* + * Invalidate the local I-cache so that any instructions fetched + * speculatively from the PoC are discarded, since they may have + * been dynamically patched at the PoU. + */ + ic iallu + dsb nsh + isb +.endm + /* * Check whether to yield to another runnable task from kernel mode NEON code * (which runs with preemption disabled). diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index a0dc987724ed..28e9735302df 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -703,16 +703,9 @@ SYM_FUNC_START(__enable_mmu) offset_ttbr1 x1, x3 msr ttbr1_el1, x1 // load TTBR1 isb - msr sctlr_el1, x0 - isb - /* - * Invalidate the local I-cache so that any instructions fetched - * speculatively from the PoC are discarded, since they may have - * been dynamically patched at the PoU. - */ - ic iallu - dsb nsh - isb + + set_sctlr_el1 x0 + ret SYM_FUNC_END(__enable_mmu) @@ -883,11 +876,7 @@ SYM_FUNC_START_LOCAL(__primary_switch) tlbi vmalle1 // Remove any stale TLB entries dsb nsh - msr sctlr_el1, x19 // re-enable the MMU - isb - ic iallu // flush instructions fetched - dsb nsh // via old mapping - isb + set_sctlr_el1 x19 // re-enable the MMU bl __relocate_kernel #endif diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index ece785477bdc..c967bfd30d2b 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -291,17 +291,7 @@ skip_pgd: /* We're done: fire up the MMU again */ mrs x17, sctlr_el1 orr x17, x17, #SCTLR_ELx_M - msr sctlr_el1, x17 - isb - - /* - * Invalidate the local I-cache so that any instructions fetched - * speculatively from the PoC are discarded, since they may have - * been dynamically patched at the PoU. - */ - ic iallu - dsb nsh - isb + set_sctlr_el1 x17 /* Set the flag to zero to indicate that we're all done */ str wzr, [flag_ptr] From f359182291c757cdf77bcd014c025d1ed6b87662 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 8 Feb 2021 09:57:13 +0000 Subject: [PATCH 073/108] arm64: Provide an 'upgrade to VHE' stub hypercall As we are about to change the way a VHE system boots, let's provide the core helper, in the form of a stub hypercall that enables VHE and replicates the full EL1 context at EL2, thanks to EL1 and VHE-EL2 being extremely similar. On exception return, the kernel carries on at EL2. Fancy! Nothing calls this new hypercall yet, so no functional change. Signed-off-by: Marc Zyngier Acked-by: David Brazdil Acked-by: Catalin Marinas Link: https://lore.kernel.org/r/20210208095732.3267263-5-maz@kernel.org Signed-off-by: Will Deacon --- arch/arm64/include/asm/virt.h | 7 +++- arch/arm64/kernel/hyp-stub.S | 76 ++++++++++++++++++++++++++++++++++- 2 files changed, 80 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/virt.h b/arch/arm64/include/asm/virt.h index ee6a48df89d9..7379f35ae2c6 100644 --- a/arch/arm64/include/asm/virt.h +++ b/arch/arm64/include/asm/virt.h @@ -35,8 +35,13 @@ */ #define HVC_RESET_VECTORS 2 +/* + * HVC_VHE_RESTART - Upgrade the CPU from EL1 to EL2, if possible + */ +#define HVC_VHE_RESTART 3 + /* Max number of HYP stub hypercalls */ -#define HVC_STUB_HCALL_NR 3 +#define HVC_STUB_HCALL_NR 4 /* Error returned when an invalid stub number is passed into x0 */ #define HVC_STUB_ERR 0xbadca11 diff --git a/arch/arm64/kernel/hyp-stub.S b/arch/arm64/kernel/hyp-stub.S index 160f5881a0b7..3f3dbbe8914d 100644 --- a/arch/arm64/kernel/hyp-stub.S +++ b/arch/arm64/kernel/hyp-stub.S @@ -8,9 +8,9 @@ #include #include -#include #include +#include #include #include #include @@ -47,10 +47,13 @@ SYM_CODE_END(__hyp_stub_vectors) SYM_CODE_START_LOCAL(el1_sync) cmp x0, #HVC_SET_VECTORS - b.ne 2f + b.ne 1f msr vbar_el2, x1 b 9f +1: cmp x0, #HVC_VHE_RESTART + b.eq mutate_to_vhe + 2: cmp x0, #HVC_SOFT_RESTART b.ne 3f mov x0, x2 @@ -70,6 +73,75 @@ SYM_CODE_START_LOCAL(el1_sync) eret SYM_CODE_END(el1_sync) +// nVHE? No way! Give me the real thing! +SYM_CODE_START_LOCAL(mutate_to_vhe) + // Be prepared to fail + mov_q x0, HVC_STUB_ERR + + // Sanity check: MMU *must* be off + mrs x1, sctlr_el2 + tbnz x1, #0, 1f + + // Needs to be VHE capable, obviously + mrs x1, id_aa64mmfr1_el1 + ubfx x1, x1, #ID_AA64MMFR1_VHE_SHIFT, #4 + cbz x1, 1f + + // Engage the VHE magic! + mov_q x0, HCR_HOST_VHE_FLAGS + msr hcr_el2, x0 + isb + + // Doesn't do much on VHE, but still, worth a shot + init_el2_state vhe + + // Use the EL1 allocated stack, per-cpu offset + mrs x0, sp_el1 + mov sp, x0 + mrs x0, tpidr_el1 + msr tpidr_el2, x0 + + // FP configuration, vectors + mrs_s x0, SYS_CPACR_EL12 + msr cpacr_el1, x0 + mrs_s x0, SYS_VBAR_EL12 + msr vbar_el1, x0 + + // Transfer the MM state from EL1 to EL2 + mrs_s x0, SYS_TCR_EL12 + msr tcr_el1, x0 + mrs_s x0, SYS_TTBR0_EL12 + msr ttbr0_el1, x0 + mrs_s x0, SYS_TTBR1_EL12 + msr ttbr1_el1, x0 + mrs_s x0, SYS_MAIR_EL12 + msr mair_el1, x0 + isb + + // Invalidate TLBs before enabling the MMU + tlbi vmalle1 + dsb nsh + + // Enable the EL2 S1 MMU, as set up from EL1 + mrs_s x0, SYS_SCTLR_EL12 + set_sctlr_el1 x0 + + // Disable the EL1 S1 MMU for a good measure + mov_q x0, INIT_SCTLR_EL1_MMU_OFF + msr_s SYS_SCTLR_EL12, x0 + + // Hack the exception return to stay at EL2 + mrs x0, spsr_el1 + and x0, x0, #~PSR_MODE_MASK + mov x1, #PSR_MODE_EL2h + orr x0, x0, x1 + msr spsr_el1, x0 + + mov x0, xzr + +1: eret +SYM_CODE_END(mutate_to_vhe) + .macro invalid_vector label SYM_CODE_START_LOCAL(\label) b \label From 6459b8469753e9feaa8b34691d097cffad905931 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 2 Feb 2021 12:03:41 +0000 Subject: [PATCH 074/108] arm64: entry: consolidate Cortex-A76 erratum 1463225 workaround The workaround for Cortex-A76 erratum 1463225 is split across the syscall and debug handlers in separate files. This structure currently forces us to do some redundant work for debug exceptions from EL0, is a little difficult to follow, and gets in the way of some future rework of the exception entry code as it requires exceptions to be unmasked late in the syscall handling path. To simplify things, and as a preparatory step for future rework of exception entry, this patch moves all the workaround logic into entry-common.c. As the debug handler only needs to run for EL1 debug exceptions, we no longer call it for EL0 debug exceptions, and no longer need to check user_mode(regs) as this is always false. For clarity cortex_a76_erratum_1463225_debug_handler() is changed to return bool. In the SVC path, the workaround is applied earlier, but this should have no functional impact as exceptions are still masked. In the debug path we run the fixup before explicitly disabling preemption, but we will not attempt to preempt before returning from the exception. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland Cc: Catalin Marinas Cc: James Morse Cc: Will Deacon Link: https://lore.kernel.org/r/20210202120341.28858-1-mark.rutland@arm.com Signed-off-by: Will Deacon --- arch/arm64/kernel/cpu_errata.c | 2 -- arch/arm64/kernel/entry-common.c | 54 +++++++++++++++++++++++++++++++- arch/arm64/kernel/syscall.c | 30 ------------------ arch/arm64/mm/fault.c | 32 ------------------- 4 files changed, 53 insertions(+), 65 deletions(-) diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index a63428301f42..506a1cd37973 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -107,8 +107,6 @@ cpu_enable_trap_ctr_access(const struct arm64_cpu_capabilities *cap) } #ifdef CONFIG_ARM64_ERRATUM_1463225 -DEFINE_PER_CPU(int, __in_cortex_a76_erratum_1463225_wa); - static bool has_cortex_a76_erratum_1463225(const struct arm64_cpu_capabilities *entry, int scope) diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c index 5346953e4382..9d3588450473 100644 --- a/arch/arm64/kernel/entry-common.c +++ b/arch/arm64/kernel/entry-common.c @@ -109,6 +109,55 @@ asmlinkage void noinstr exit_el1_irq_or_nmi(struct pt_regs *regs) exit_to_kernel_mode(regs); } +#ifdef CONFIG_ARM64_ERRATUM_1463225 +static DEFINE_PER_CPU(int, __in_cortex_a76_erratum_1463225_wa); + +static void cortex_a76_erratum_1463225_svc_handler(void) +{ + u32 reg, val; + + if (!unlikely(test_thread_flag(TIF_SINGLESTEP))) + return; + + if (!unlikely(this_cpu_has_cap(ARM64_WORKAROUND_1463225))) + return; + + __this_cpu_write(__in_cortex_a76_erratum_1463225_wa, 1); + reg = read_sysreg(mdscr_el1); + val = reg | DBG_MDSCR_SS | DBG_MDSCR_KDE; + write_sysreg(val, mdscr_el1); + asm volatile("msr daifclr, #8"); + isb(); + + /* We will have taken a single-step exception by this point */ + + write_sysreg(reg, mdscr_el1); + __this_cpu_write(__in_cortex_a76_erratum_1463225_wa, 0); +} + +static bool cortex_a76_erratum_1463225_debug_handler(struct pt_regs *regs) +{ + if (!__this_cpu_read(__in_cortex_a76_erratum_1463225_wa)) + return false; + + /* + * We've taken a dummy step exception from the kernel to ensure + * that interrupts are re-enabled on the syscall path. Return back + * to cortex_a76_erratum_1463225_svc_handler() with debug exceptions + * masked so that we can safely restore the mdscr and get on with + * handling the syscall. + */ + regs->pstate |= PSR_D_BIT; + return true; +} +#else /* CONFIG_ARM64_ERRATUM_1463225 */ +static void cortex_a76_erratum_1463225_svc_handler(void) { } +static bool cortex_a76_erratum_1463225_debug_handler(struct pt_regs *regs) +{ + return false; +} +#endif /* CONFIG_ARM64_ERRATUM_1463225 */ + static void noinstr el1_abort(struct pt_regs *regs, unsigned long esr) { unsigned long far = read_sysreg(far_el1); @@ -186,7 +235,8 @@ static void noinstr el1_dbg(struct pt_regs *regs, unsigned long esr) gic_write_pmr(GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET); arm64_enter_el1_dbg(regs); - do_debug_exception(far, esr, regs); + if (!cortex_a76_erratum_1463225_debug_handler(regs)) + do_debug_exception(far, esr, regs); arm64_exit_el1_dbg(regs); } @@ -362,6 +412,7 @@ static void noinstr el0_svc(struct pt_regs *regs) gic_write_pmr(GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET); enter_from_user_mode(); + cortex_a76_erratum_1463225_svc_handler(); do_el0_svc(regs); } @@ -439,6 +490,7 @@ static void noinstr el0_svc_compat(struct pt_regs *regs) gic_write_pmr(GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET); enter_from_user_mode(); + cortex_a76_erratum_1463225_svc_handler(); do_el0_svc_compat(regs); } diff --git a/arch/arm64/kernel/syscall.c b/arch/arm64/kernel/syscall.c index c2877c332f2d..b9cf12b271d7 100644 --- a/arch/arm64/kernel/syscall.c +++ b/arch/arm64/kernel/syscall.c @@ -65,35 +65,6 @@ static inline bool has_syscall_work(unsigned long flags) int syscall_trace_enter(struct pt_regs *regs); void syscall_trace_exit(struct pt_regs *regs); -#ifdef CONFIG_ARM64_ERRATUM_1463225 -DECLARE_PER_CPU(int, __in_cortex_a76_erratum_1463225_wa); - -static void cortex_a76_erratum_1463225_svc_handler(void) -{ - u32 reg, val; - - if (!unlikely(test_thread_flag(TIF_SINGLESTEP))) - return; - - if (!unlikely(this_cpu_has_cap(ARM64_WORKAROUND_1463225))) - return; - - __this_cpu_write(__in_cortex_a76_erratum_1463225_wa, 1); - reg = read_sysreg(mdscr_el1); - val = reg | DBG_MDSCR_SS | DBG_MDSCR_KDE; - write_sysreg(val, mdscr_el1); - asm volatile("msr daifclr, #8"); - isb(); - - /* We will have taken a single-step exception by this point */ - - write_sysreg(reg, mdscr_el1); - __this_cpu_write(__in_cortex_a76_erratum_1463225_wa, 0); -} -#else -static void cortex_a76_erratum_1463225_svc_handler(void) { } -#endif /* CONFIG_ARM64_ERRATUM_1463225 */ - static void el0_svc_common(struct pt_regs *regs, int scno, int sc_nr, const syscall_fn_t syscall_table[]) { @@ -120,7 +91,6 @@ static void el0_svc_common(struct pt_regs *regs, int scno, int sc_nr, * (Similarly for HVC and SMC elsewhere.) */ - cortex_a76_erratum_1463225_svc_handler(); local_daif_restore(DAIF_PROCCTX); if (flags & _TIF_MTE_ASYNC_FAULT) { diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 3c40da479899..d0c23d206aab 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -874,44 +874,12 @@ static void debug_exception_exit(struct pt_regs *regs) } NOKPROBE_SYMBOL(debug_exception_exit); -#ifdef CONFIG_ARM64_ERRATUM_1463225 -DECLARE_PER_CPU(int, __in_cortex_a76_erratum_1463225_wa); - -static int cortex_a76_erratum_1463225_debug_handler(struct pt_regs *regs) -{ - if (user_mode(regs)) - return 0; - - if (!__this_cpu_read(__in_cortex_a76_erratum_1463225_wa)) - return 0; - - /* - * We've taken a dummy step exception from the kernel to ensure - * that interrupts are re-enabled on the syscall path. Return back - * to cortex_a76_erratum_1463225_svc_handler() with debug exceptions - * masked so that we can safely restore the mdscr and get on with - * handling the syscall. - */ - regs->pstate |= PSR_D_BIT; - return 1; -} -#else -static int cortex_a76_erratum_1463225_debug_handler(struct pt_regs *regs) -{ - return 0; -} -#endif /* CONFIG_ARM64_ERRATUM_1463225 */ -NOKPROBE_SYMBOL(cortex_a76_erratum_1463225_debug_handler); - void do_debug_exception(unsigned long addr_if_watchpoint, unsigned int esr, struct pt_regs *regs) { const struct fault_info *inf = esr_to_debug_fault_info(esr); unsigned long pc = instruction_pointer(regs); - if (cortex_a76_erratum_1463225_debug_handler(regs)) - return; - debug_exception_enter(regs); if (user_mode(regs) && !is_ttbr0_addr(pc)) From 0c93df9622d4d921bcd0dc83f71fed9e98f5119f Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 8 Feb 2021 09:57:14 +0000 Subject: [PATCH 075/108] arm64: Initialise as nVHE before switching to VHE As we are aiming to be able to control whether we enable VHE or not, let's always drop down to EL1 first, and only then upgrade to VHE if at all possible. This means that if the kernel is booted at EL2, we always start with a nVHE init, drop to EL1 to initialise the the kernel, and only then upgrade the kernel EL to EL2 if possible (the process is obviously shortened for secondary CPUs). The resume path is handled similarly to a secondary CPU boot. Signed-off-by: Marc Zyngier Acked-by: David Brazdil Acked-by: Catalin Marinas Link: https://lore.kernel.org/r/20210208095732.3267263-6-maz@kernel.org [will: Avoid calling switch_to_vhe twice on kaslr path] Signed-off-by: Will Deacon --- arch/arm64/kernel/head.S | 38 ++---------------------------------- arch/arm64/kernel/hyp-stub.S | 24 +++++++++++++++++++++++ arch/arm64/kernel/sleep.S | 1 + 3 files changed, 27 insertions(+), 36 deletions(-) diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 28e9735302df..ec66dc061b0c 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -447,6 +447,7 @@ SYM_FUNC_START_LOCAL(__primary_switched) ret // to __primary_switch() 0: #endif + bl switch_to_vhe // Prefer VHE if possible add sp, sp, #16 mov x29, #0 mov x30, #0 @@ -493,42 +494,6 @@ SYM_INNER_LABEL(init_el1, SYM_L_LOCAL) eret SYM_INNER_LABEL(init_el2, SYM_L_LOCAL) -#ifdef CONFIG_ARM64_VHE - /* - * Check for VHE being present. x2 being non-zero indicates that we - * do have VHE, and that the kernel is intended to run at EL2. - */ - mrs x2, id_aa64mmfr1_el1 - ubfx x2, x2, #ID_AA64MMFR1_VHE_SHIFT, #4 -#else - mov x2, xzr -#endif - cbz x2, init_el2_nvhe - - /* - * When VHE _is_ in use, EL1 will not be used in the host and - * requires no configuration, and all non-hyp-specific EL2 setup - * will be done via the _EL1 system register aliases in __cpu_setup. - */ - mov_q x0, HCR_HOST_VHE_FLAGS - msr hcr_el2, x0 - isb - - init_el2_state vhe - - isb - - mov_q x0, INIT_PSTATE_EL2 - msr spsr_el2, x0 - msr elr_el2, lr - mov w0, #BOOT_CPU_MODE_EL2 - eret - -SYM_INNER_LABEL(init_el2_nvhe, SYM_L_LOCAL) - /* - * When VHE is not in use, early init of EL2 and EL1 needs to be - * done here. - */ mov_q x0, INIT_SCTLR_EL1_MMU_OFF msr sctlr_el1, x0 @@ -623,6 +588,7 @@ SYM_FUNC_START_LOCAL(secondary_startup) /* * Common entry point for secondary CPUs. */ + bl switch_to_vhe bl __cpu_secondary_check52bitva bl __cpu_setup // initialise processor adrp x1, swapper_pg_dir diff --git a/arch/arm64/kernel/hyp-stub.S b/arch/arm64/kernel/hyp-stub.S index 3f3dbbe8914d..373ed2213e1d 100644 --- a/arch/arm64/kernel/hyp-stub.S +++ b/arch/arm64/kernel/hyp-stub.S @@ -190,3 +190,27 @@ SYM_FUNC_START(__hyp_reset_vectors) hvc #0 ret SYM_FUNC_END(__hyp_reset_vectors) + +/* + * Entry point to switch to VHE if deemed capable + */ +SYM_FUNC_START(switch_to_vhe) +#ifdef CONFIG_ARM64_VHE + // Need to have booted at EL2 + adr_l x1, __boot_cpu_mode + ldr w0, [x1] + cmp w0, #BOOT_CPU_MODE_EL2 + b.ne 1f + + // and still be at EL1 + mrs x0, CurrentEL + cmp x0, #CurrentEL_EL1 + b.ne 1f + + // Turn the world upside down + mov x0, #HVC_VHE_RESTART + hvc #0 +1: +#endif + ret +SYM_FUNC_END(switch_to_vhe) diff --git a/arch/arm64/kernel/sleep.S b/arch/arm64/kernel/sleep.S index 6bdef7362c0e..5bfd9b87f85d 100644 --- a/arch/arm64/kernel/sleep.S +++ b/arch/arm64/kernel/sleep.S @@ -100,6 +100,7 @@ SYM_FUNC_END(__cpu_suspend_enter) .pushsection ".idmap.text", "awx" SYM_CODE_START(cpu_resume) bl init_kernel_el + bl switch_to_vhe bl __cpu_setup /* enable the MMU early - so we can access sleep_save_stash by va */ adrp x1, swapper_pg_dir From c6f8c92f3f368d345c38aea5cc0e60515bcb159e Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 8 Feb 2021 09:57:15 +0000 Subject: [PATCH 076/108] arm64: Drop early setting of MDSCR_EL2.TPMS When running VHE, we set MDSCR_EL2.TPMS very early on to force the trapping of EL1 SPE accesses to EL2. However: - we are running with HCR_EL2.{E2H,TGE}={1,1}, meaning that there is no EL1 to trap from - before entering a guest, we call kvm_arm_setup_debug(), which sets MDCR_EL2_TPMS in the per-vcpu shadow mdscr_el2, which gets applied on entry by __activate_traps_common(). The early setting of MDSCR_EL2.TPMS is therefore useless and can be dropped. Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210208095732.3267263-7-maz@kernel.org Signed-off-by: Will Deacon --- arch/arm64/include/asm/el2_setup.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h index 540116de80bf..56c9e1cef180 100644 --- a/arch/arm64/include/asm/el2_setup.h +++ b/arch/arm64/include/asm/el2_setup.h @@ -66,9 +66,6 @@ mov x0, #(MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT) orr x2, x2, x0 // If we don't have VHE, then // use EL1&0 translation. -.else - orr x2, x2, #MDCR_EL2_TPMS // For VHE, use EL2 translation - // and disable access from EL1 .endif .Lskip_spe_\@: From 19e87e131915a2389a08874092a82fe5aa0f8952 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 8 Feb 2021 09:57:16 +0000 Subject: [PATCH 077/108] arm64: Move VHE-specific SPE setup to mutate_to_vhe() There isn't much that a VHE kernel needs on top of whatever has been done for nVHE, so let's move the little we need to the VHE stub (the SPE setup), and drop the init_el2_state macro. No expected functional change. Signed-off-by: Marc Zyngier Acked-by: David Brazdil Acked-by: Catalin Marinas Link: https://lore.kernel.org/r/20210208095732.3267263-8-maz@kernel.org Signed-off-by: Will Deacon --- arch/arm64/kernel/hyp-stub.S | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kernel/hyp-stub.S b/arch/arm64/kernel/hyp-stub.S index 373ed2213e1d..6229315d533d 100644 --- a/arch/arm64/kernel/hyp-stub.S +++ b/arch/arm64/kernel/hyp-stub.S @@ -92,9 +92,6 @@ SYM_CODE_START_LOCAL(mutate_to_vhe) msr hcr_el2, x0 isb - // Doesn't do much on VHE, but still, worth a shot - init_el2_state vhe - // Use the EL1 allocated stack, per-cpu offset mrs x0, sp_el1 mov sp, x0 @@ -107,6 +104,11 @@ SYM_CODE_START_LOCAL(mutate_to_vhe) mrs_s x0, SYS_VBAR_EL12 msr vbar_el1, x0 + // Use EL2 translations for SPE and disable access from EL1 + mrs x0, mdcr_el2 + bic x0, x0, #(MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT) + msr mdcr_el2, x0 + // Transfer the MM state from EL1 to EL2 mrs_s x0, SYS_TCR_EL12 msr tcr_el1, x0 From e2df464173f0b585adb958a09536eae2cd1dbefd Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 8 Feb 2021 09:57:17 +0000 Subject: [PATCH 078/108] arm64: Simplify init_el2_state to be non-VHE only As init_el2_state is now nVHE only, let's simplify it and drop the VHE setup. Signed-off-by: Marc Zyngier Acked-by: David Brazdil Acked-by: Catalin Marinas Link: https://lore.kernel.org/r/20210208095732.3267263-9-maz@kernel.org Signed-off-by: Will Deacon --- arch/arm64/include/asm/el2_setup.h | 33 ++++++++---------------------- arch/arm64/kernel/head.S | 2 +- arch/arm64/kvm/hyp/nvhe/hyp-init.S | 2 +- 3 files changed, 10 insertions(+), 27 deletions(-) diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h index 56c9e1cef180..d77d358f9395 100644 --- a/arch/arm64/include/asm/el2_setup.h +++ b/arch/arm64/include/asm/el2_setup.h @@ -32,16 +32,14 @@ * to transparently mess with the EL0 bits via CNTKCTL_EL1 access in * EL2. */ -.macro __init_el2_timers mode -.ifeqs "\mode", "nvhe" +.macro __init_el2_timers mrs x0, cnthctl_el2 orr x0, x0, #3 // Enable EL1 physical timers msr cnthctl_el2, x0 -.endif msr cntvoff_el2, xzr // Clear virtual offset .endm -.macro __init_el2_debug mode +.macro __init_el2_debug mrs x1, id_aa64dfr0_el1 sbfx x0, x1, #ID_AA64DFR0_PMUVER_SHIFT, #4 cmp x0, #1 @@ -55,7 +53,6 @@ ubfx x0, x1, #ID_AA64DFR0_PMSVER_SHIFT, #4 cbz x0, .Lskip_spe_\@ // Skip if SPE not present -.ifeqs "\mode", "nvhe" mrs_s x0, SYS_PMBIDR_EL1 // If SPE available at EL2, and x0, x0, #(1 << SYS_PMBIDR_EL1_P_SHIFT) cbnz x0, .Lskip_spe_el2_\@ // then permit sampling of physical @@ -66,7 +63,6 @@ mov x0, #(MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT) orr x2, x2, x0 // If we don't have VHE, then // use EL1&0 translation. -.endif .Lskip_spe_\@: msr mdcr_el2, x2 // Configure debug traps @@ -142,37 +138,24 @@ /** * Initialize EL2 registers to sane values. This should be called early on all - * cores that were booted in EL2. + * cores that were booted in EL2. Note that everything gets initialised as + * if VHE was not evailable. The kernel context will be upgraded to VHE + * if possible later on in the boot process * * Regs: x0, x1 and x2 are clobbered. */ -.macro init_el2_state mode -.ifnes "\mode", "vhe" -.ifnes "\mode", "nvhe" -.error "Invalid 'mode' argument" -.endif -.endif - +.macro init_el2_state __init_el2_sctlr - __init_el2_timers \mode - __init_el2_debug \mode + __init_el2_timers + __init_el2_debug __init_el2_lor __init_el2_stage2 __init_el2_gicv3 __init_el2_hstr - - /* - * When VHE is not in use, early init of EL2 needs to be done here. - * When VHE _is_ in use, EL1 will not be used in the host and - * requires no configuration, and all non-hyp-specific EL2 setup - * will be done via the _EL1 system register aliases in __cpu_setup. - */ -.ifeqs "\mode", "nvhe" __init_el2_nvhe_idregs __init_el2_nvhe_cptr __init_el2_nvhe_sve __init_el2_nvhe_prepare_eret -.endif .endm #endif /* __ARM_KVM_INIT_H__ */ diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index ec66dc061b0c..0a46f722f051 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -501,7 +501,7 @@ SYM_INNER_LABEL(init_el2, SYM_L_LOCAL) msr hcr_el2, x0 isb - init_el2_state nvhe + init_el2_state /* Hypervisor stub */ adr_l x0, __hyp_stub_vectors diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S b/arch/arm64/kvm/hyp/nvhe/hyp-init.S index 31b060a44045..222cfc3e7190 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-init.S +++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S @@ -189,7 +189,7 @@ SYM_CODE_START_LOCAL(__kvm_hyp_init_cpu) 2: msr SPsel, #1 // We want to use SP_EL{1,2} /* Initialize EL2 CPU state to sane values. */ - init_el2_state nvhe // Clobbers x0..x2 + init_el2_state // Clobbers x0..x2 /* Enable MMU, set vectors and stack. */ mov x0, x28 From d077cb3cb90470f8bd7dbe357a474e13589390b9 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 8 Feb 2021 09:57:18 +0000 Subject: [PATCH 079/108] arm64: Move SCTLR_EL1 initialisation to EL-agnostic code We can now move the initial SCTLR_EL1 setup to be used for both EL1 and EL2 setup. Signed-off-by: Marc Zyngier Acked-by: Catalin Marinas Acked-by: David Brazdil Link: https://lore.kernel.org/r/20210208095732.3267263-10-maz@kernel.org Signed-off-by: Will Deacon --- arch/arm64/kernel/head.S | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 0a46f722f051..2c1b8ffa3d3e 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -479,13 +479,14 @@ EXPORT_SYMBOL(kimage_vaddr) * booted in EL1 or EL2 respectively. */ SYM_FUNC_START(init_kernel_el) + mov_q x0, INIT_SCTLR_EL1_MMU_OFF + msr sctlr_el1, x0 + mrs x0, CurrentEL cmp x0, #CurrentEL_EL2 b.eq init_el2 SYM_INNER_LABEL(init_el1, SYM_L_LOCAL) - mov_q x0, INIT_SCTLR_EL1_MMU_OFF - msr sctlr_el1, x0 isb mov_q x0, INIT_PSTATE_EL1 msr spsr_el1, x0 @@ -494,9 +495,6 @@ SYM_INNER_LABEL(init_el1, SYM_L_LOCAL) eret SYM_INNER_LABEL(init_el2, SYM_L_LOCAL) - mov_q x0, INIT_SCTLR_EL1_MMU_OFF - msr sctlr_el1, x0 - mov_q x0, HCR_HOST_NVHE_FLAGS msr hcr_el2, x0 isb From 8f266a5d878ad38fbd43e41e22847650f51d4734 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 8 Feb 2021 09:57:19 +0000 Subject: [PATCH 080/108] arm64: cpufeature: Add global feature override facility Add a facility to globally override a feature, no matter what the HW says. Yes, this sounds dangerous, but we do respect the "safe" value for a given feature. This doesn't mean the user doesn't need to know what they are doing. Nothing uses this yet, so we are pretty safe. For now. Signed-off-by: Marc Zyngier Reviewed-by: Suzuki K Poulose Acked-by: David Brazdil Reviewed-by: Catalin Marinas Link: https://lore.kernel.org/r/20210208095732.3267263-11-maz@kernel.org Signed-off-by: Will Deacon --- arch/arm64/include/asm/cpufeature.h | 6 ++++ arch/arm64/kernel/cpufeature.c | 45 +++++++++++++++++++++++++---- 2 files changed, 45 insertions(+), 6 deletions(-) diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 9a555809b89c..b1f53147e2b2 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -63,6 +63,11 @@ struct arm64_ftr_bits { s64 safe_val; /* safe value for FTR_EXACT features */ }; +struct arm64_ftr_override { + u64 val; + u64 mask; +}; + /* * @arm64_ftr_reg - Feature register * @strict_mask Bits which should match across all CPUs for sanity. @@ -74,6 +79,7 @@ struct arm64_ftr_reg { u64 user_mask; u64 sys_val; u64 user_val; + struct arm64_ftr_override *override; const struct arm64_ftr_bits *ftr_bits; }; diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index e99eddec0a46..a4e5c619a516 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -352,9 +352,12 @@ static const struct arm64_ftr_bits ftr_ctr[] = { ARM64_FTR_END, }; +static struct arm64_ftr_override __ro_after_init no_override = { }; + struct arm64_ftr_reg arm64_ftr_reg_ctrel0 = { .name = "SYS_CTR_EL0", - .ftr_bits = ftr_ctr + .ftr_bits = ftr_ctr, + .override = &no_override, }; static const struct arm64_ftr_bits ftr_id_mmfr0[] = { @@ -544,13 +547,16 @@ static const struct arm64_ftr_bits ftr_raz[] = { ARM64_FTR_END, }; -#define ARM64_FTR_REG(id, table) { \ - .sys_id = id, \ - .reg = &(struct arm64_ftr_reg){ \ - .name = #id, \ - .ftr_bits = &((table)[0]), \ +#define ARM64_FTR_REG_OVERRIDE(id, table, ovr) { \ + .sys_id = id, \ + .reg = &(struct arm64_ftr_reg){ \ + .name = #id, \ + .override = (ovr), \ + .ftr_bits = &((table)[0]), \ }} +#define ARM64_FTR_REG(id, table) ARM64_FTR_REG_OVERRIDE(id, table, &no_override) + static const struct __ftr_reg_entry { u32 sys_id; struct arm64_ftr_reg *reg; @@ -770,6 +776,33 @@ static void __init init_cpu_ftr_reg(u32 sys_reg, u64 new) for (ftrp = reg->ftr_bits; ftrp->width; ftrp++) { u64 ftr_mask = arm64_ftr_mask(ftrp); s64 ftr_new = arm64_ftr_value(ftrp, new); + s64 ftr_ovr = arm64_ftr_value(ftrp, reg->override->val); + + if ((ftr_mask & reg->override->mask) == ftr_mask) { + s64 tmp = arm64_ftr_safe_value(ftrp, ftr_ovr, ftr_new); + char *str = NULL; + + if (ftr_ovr != tmp) { + /* Unsafe, remove the override */ + reg->override->mask &= ~ftr_mask; + reg->override->val &= ~ftr_mask; + tmp = ftr_ovr; + str = "ignoring override"; + } else if (ftr_new != tmp) { + /* Override was valid */ + ftr_new = tmp; + str = "forced"; + } else if (ftr_ovr == tmp) { + /* Override was the safe value */ + str = "already set"; + } + + if (str) + pr_warn("%s[%d:%d]: %s to %llx\n", + reg->name, + ftrp->shift + ftrp->width - 1, + ftrp->shift, str, tmp); + } val = arm64_ftr_set_value(ftrp, val, ftr_new); From b3341ae0efa235726ad69e53ce83c6a3c445bda8 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 8 Feb 2021 09:57:20 +0000 Subject: [PATCH 081/108] arm64: cpufeature: Use IDreg override in __read_sysreg_by_encoding() __read_sysreg_by_encoding() is used by a bunch of cpufeature helpers, which should take the feature override into account. Let's do that. For a good measure (and because we are likely to need to further down the line), make this helper available to the rest of the non-modular kernel. Code that needs to know the *real* features of a CPU can still use read_sysreg_s(), and find the bare, ugly truth. Signed-off-by: Marc Zyngier Reviewed-by: Suzuki K Poulose Acked-by: David Brazdil Reviewed-by: Catalin Marinas Link: https://lore.kernel.org/r/20210208095732.3267263-12-maz@kernel.org Signed-off-by: Will Deacon --- arch/arm64/include/asm/cpufeature.h | 1 + arch/arm64/kernel/cpufeature.c | 15 +++++++++++++-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index b1f53147e2b2..b5bf7af68691 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -606,6 +606,7 @@ void __init setup_cpu_features(void); void check_local_cpu_capabilities(void); u64 read_sanitised_ftr_reg(u32 id); +u64 __read_sysreg_by_encoding(u32 sys_id); static inline bool cpu_supports_mixed_endian_el0(void) { diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index a4e5c619a516..97da9ed4b79d 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -1148,14 +1148,17 @@ u64 read_sanitised_ftr_reg(u32 id) EXPORT_SYMBOL_GPL(read_sanitised_ftr_reg); #define read_sysreg_case(r) \ - case r: return read_sysreg_s(r) + case r: val = read_sysreg_s(r); break; /* * __read_sysreg_by_encoding() - Used by a STARTING cpu before cpuinfo is populated. * Read the system register on the current CPU */ -static u64 __read_sysreg_by_encoding(u32 sys_id) +u64 __read_sysreg_by_encoding(u32 sys_id) { + struct arm64_ftr_reg *regp; + u64 val; + switch (sys_id) { read_sysreg_case(SYS_ID_PFR0_EL1); read_sysreg_case(SYS_ID_PFR1_EL1); @@ -1198,6 +1201,14 @@ static u64 __read_sysreg_by_encoding(u32 sys_id) BUG(); return 0; } + + regp = get_arm64_ftr_reg(sys_id); + if (regp) { + val &= ~regp->override->mask; + val |= (regp->override->val & regp->override->mask); + } + + return val; } #include From f6f0c4362f070cab4a0cec432e82428d702ce0a6 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 8 Feb 2021 09:57:21 +0000 Subject: [PATCH 082/108] arm64: Extract early FDT mapping from kaslr_early_init() As we want to parse more options very early in the kernel lifetime, let's always map the FDT early. This is achieved by moving that code out of kaslr_early_init(). No functional change expected. Signed-off-by: Marc Zyngier Acked-by: Catalin Marinas Acked-by: David Brazdil Link: https://lore.kernel.org/r/20210208095732.3267263-13-maz@kernel.org [will: Ensue KASAN is enabled before running C code] Signed-off-by: Will Deacon --- arch/arm64/include/asm/setup.h | 11 +++++++++++ arch/arm64/kernel/head.S | 3 ++- arch/arm64/kernel/kaslr.c | 7 +++---- arch/arm64/kernel/setup.c | 15 +++++++++++++++ 4 files changed, 31 insertions(+), 5 deletions(-) create mode 100644 arch/arm64/include/asm/setup.h diff --git a/arch/arm64/include/asm/setup.h b/arch/arm64/include/asm/setup.h new file mode 100644 index 000000000000..d3320618ed14 --- /dev/null +++ b/arch/arm64/include/asm/setup.h @@ -0,0 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0 + +#ifndef __ARM64_ASM_SETUP_H +#define __ARM64_ASM_SETUP_H + +#include + +void *get_early_fdt_ptr(void); +void early_fdt_map(u64 dt_phys); + +#endif diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 2c1b8ffa3d3e..070d1c6ec9eb 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -436,10 +436,11 @@ SYM_FUNC_START_LOCAL(__primary_switched) #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) bl kasan_early_init #endif + mov x0, x21 // pass FDT address in x0 + bl early_fdt_map // Try mapping the FDT early #ifdef CONFIG_RANDOMIZE_BASE tst x23, ~(MIN_KIMG_ALIGN - 1) // already running randomized? b.ne 0f - mov x0, x21 // pass FDT address in x0 bl kaslr_early_init // parse FDT for KASLR options cbz x0, 0f // KASLR disabled? just proceed orr x23, x23, x0 // record KASLR offset diff --git a/arch/arm64/kernel/kaslr.c b/arch/arm64/kernel/kaslr.c index 1c74c45b9494..5fc86e7d01a1 100644 --- a/arch/arm64/kernel/kaslr.c +++ b/arch/arm64/kernel/kaslr.c @@ -19,6 +19,7 @@ #include #include #include +#include enum kaslr_status { KASLR_ENABLED, @@ -92,12 +93,11 @@ static __init bool is_kaslr_disabled_cmdline(void *fdt) * containing function pointers) to be reinitialized, and zero-initialized * .bss variables will be reset to 0. */ -u64 __init kaslr_early_init(u64 dt_phys) +u64 __init kaslr_early_init(void) { void *fdt; u64 seed, offset, mask, module_range; unsigned long raw; - int size; /* * Set a reasonable default for module_alloc_base in case @@ -111,8 +111,7 @@ u64 __init kaslr_early_init(u64 dt_phys) * and proceed with KASLR disabled. We will make another * attempt at mapping the FDT in setup_machine() */ - early_fixmap_init(); - fdt = fixmap_remap_fdt(dt_phys, &size, PAGE_KERNEL); + fdt = get_early_fdt_ptr(); if (!fdt) { kaslr_status = KASLR_DISABLED_FDT_REMAP; return 0; diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index c18aacde8bb0..61845c0821d9 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -168,6 +168,21 @@ static void __init smp_build_mpidr_hash(void) pr_warn("Large number of MPIDR hash buckets detected\n"); } +static void *early_fdt_ptr __initdata; + +void __init *get_early_fdt_ptr(void) +{ + return early_fdt_ptr; +} + +asmlinkage void __init early_fdt_map(u64 dt_phys) +{ + int fdt_size; + + early_fixmap_init(); + early_fdt_ptr = fixmap_remap_fdt(dt_phys, &fdt_size, PAGE_KERNEL); +} + static void __init setup_machine_fdt(phys_addr_t dt_phys) { int size; From 33200303553d3d74e7b980493cf363da545f887d Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 8 Feb 2021 09:57:22 +0000 Subject: [PATCH 083/108] arm64: cpufeature: Add an early command-line cpufeature override facility In order to be able to override CPU features at boot time, let's add a command line parser that matches options of the form "cpureg.feature=value", and store the corresponding value into the override val/mask pair. No features are currently defined, so no expected change in functionality. Signed-off-by: Marc Zyngier Acked-by: David Brazdil Reviewed-by: Catalin Marinas Link: https://lore.kernel.org/r/20210208095732.3267263-14-maz@kernel.org Signed-off-by: Will Deacon --- arch/arm64/kernel/Makefile | 2 +- arch/arm64/kernel/head.S | 1 + arch/arm64/kernel/idreg-override.c | 150 +++++++++++++++++++++++++++++ 3 files changed, 152 insertions(+), 1 deletion(-) create mode 100644 arch/arm64/kernel/idreg-override.c diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index 86364ab6f13f..2262f0392857 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -17,7 +17,7 @@ obj-y := debug-monitors.o entry.o irq.o fpsimd.o \ return_address.o cpuinfo.o cpu_errata.o \ cpufeature.o alternative.o cacheinfo.o \ smp.o smp_spin_table.o topology.o smccc-call.o \ - syscall.o proton-pack.o + syscall.o proton-pack.o idreg-override.o targets += efi-entry.o diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 070d1c6ec9eb..e7f12bf49730 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -438,6 +438,7 @@ SYM_FUNC_START_LOCAL(__primary_switched) #endif mov x0, x21 // pass FDT address in x0 bl early_fdt_map // Try mapping the FDT early + bl init_feature_override // Parse cpu feature overrides #ifdef CONFIG_RANDOMIZE_BASE tst x23, ~(MIN_KIMG_ALIGN - 1) // already running randomized? b.ne 0f diff --git a/arch/arm64/kernel/idreg-override.c b/arch/arm64/kernel/idreg-override.c new file mode 100644 index 000000000000..3a347b42d07e --- /dev/null +++ b/arch/arm64/kernel/idreg-override.c @@ -0,0 +1,150 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Early cpufeature override framework + * + * Copyright (C) 2020 Google LLC + * Author: Marc Zyngier + */ + +#include +#include +#include + +#include +#include + +#define FTR_DESC_NAME_LEN 20 +#define FTR_DESC_FIELD_LEN 10 + +struct ftr_set_desc { + char name[FTR_DESC_NAME_LEN]; + struct arm64_ftr_override *override; + struct { + char name[FTR_DESC_FIELD_LEN]; + u8 shift; + } fields[]; +}; + +static const struct ftr_set_desc * const regs[] __initconst = { +}; + +static int __init find_field(const char *cmdline, + const struct ftr_set_desc *reg, int f, u64 *v) +{ + char opt[FTR_DESC_NAME_LEN + FTR_DESC_FIELD_LEN + 2]; + int len; + + len = snprintf(opt, ARRAY_SIZE(opt), "%s.%s=", + reg->name, reg->fields[f].name); + + if (!parameqn(cmdline, opt, len)) + return -1; + + return kstrtou64(cmdline + len, 0, v); +} + +static void __init match_options(const char *cmdline) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(regs); i++) { + int f; + + if (!regs[i]->override) + continue; + + for (f = 0; strlen(regs[i]->fields[f].name); f++) { + u64 shift = regs[i]->fields[f].shift; + u64 mask = 0xfUL << shift; + u64 v; + + if (find_field(cmdline, regs[i], f, &v)) + continue; + + regs[i]->override->val &= ~mask; + regs[i]->override->val |= (v << shift) & mask; + regs[i]->override->mask |= mask; + + return; + } + } +} + +static __init void __parse_cmdline(const char *cmdline) +{ + do { + char buf[256]; + size_t len; + int i; + + cmdline = skip_spaces(cmdline); + + for (len = 0; cmdline[len] && !isspace(cmdline[len]); len++); + if (!len) + return; + + len = min(len, ARRAY_SIZE(buf) - 1); + strncpy(buf, cmdline, len); + buf[len] = 0; + + if (strcmp(buf, "--") == 0) + return; + + cmdline += len; + + match_options(buf); + + } while (1); +} + +static __init void parse_cmdline(void) +{ + if (!IS_ENABLED(CONFIG_CMDLINE_FORCE)) { + const u8 *prop; + void *fdt; + int node; + + fdt = get_early_fdt_ptr(); + if (!fdt) + goto out; + + node = fdt_path_offset(fdt, "/chosen"); + if (node < 0) + goto out; + + prop = fdt_getprop(fdt, node, "bootargs", NULL); + if (!prop) + goto out; + + __parse_cmdline(prop); + + if (!IS_ENABLED(CONFIG_CMDLINE_EXTEND)) + return; + } + +out: + __parse_cmdline(CONFIG_CMDLINE); +} + +/* Keep checkers quiet */ +void init_feature_override(void); + +asmlinkage void __init init_feature_override(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(regs); i++) { + if (regs[i]->override) { + regs[i]->override->val = 0; + regs[i]->override->mask = 0; + } + } + + parse_cmdline(); + + for (i = 0; i < ARRAY_SIZE(regs); i++) { + if (regs[i]->override) + __flush_dcache_area(regs[i]->override, + sizeof(*regs[i]->override)); + } +} From 361db0fca7affafa920f7d91bf93b9d9da44712f Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 8 Feb 2021 09:57:23 +0000 Subject: [PATCH 084/108] arm64: Allow ID_AA64MMFR1_EL1.VH to be overridden from the command line As we want to be able to disable VHE at runtime, let's match "id_aa64mmfr1.vh=" from the command line as an override. This doesn't have much effect yet as our boot code doesn't look at the cpufeature, but only at the HW registers. Signed-off-by: Marc Zyngier Acked-by: David Brazdil Acked-by: Suzuki K Poulose Reviewed-by: Catalin Marinas Link: https://lore.kernel.org/r/20210208095732.3267263-15-maz@kernel.org Signed-off-by: Will Deacon --- arch/arm64/include/asm/cpufeature.h | 2 ++ arch/arm64/kernel/cpufeature.c | 5 ++++- arch/arm64/kernel/idreg-override.c | 11 +++++++++++ 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index b5bf7af68691..570f1b4ba3cc 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -818,6 +818,8 @@ static inline unsigned int get_vmid_bits(u64 mmfr1) return 8; } +extern struct arm64_ftr_override id_aa64mmfr1_override; + u32 get_kvm_ipa_limit(void); void dump_cpu_features(void); diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 97da9ed4b79d..faada5d8bea6 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -557,6 +557,8 @@ static const struct arm64_ftr_bits ftr_raz[] = { #define ARM64_FTR_REG(id, table) ARM64_FTR_REG_OVERRIDE(id, table, &no_override) +struct arm64_ftr_override __ro_after_init id_aa64mmfr1_override; + static const struct __ftr_reg_entry { u32 sys_id; struct arm64_ftr_reg *reg; @@ -604,7 +606,8 @@ static const struct __ftr_reg_entry { /* Op1 = 0, CRn = 0, CRm = 7 */ ARM64_FTR_REG(SYS_ID_AA64MMFR0_EL1, ftr_id_aa64mmfr0), - ARM64_FTR_REG(SYS_ID_AA64MMFR1_EL1, ftr_id_aa64mmfr1), + ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64MMFR1_EL1, ftr_id_aa64mmfr1, + &id_aa64mmfr1_override), ARM64_FTR_REG(SYS_ID_AA64MMFR2_EL1, ftr_id_aa64mmfr2), /* Op1 = 0, CRn = 1, CRm = 2 */ diff --git a/arch/arm64/kernel/idreg-override.c b/arch/arm64/kernel/idreg-override.c index 3a347b42d07e..2da11bf60195 100644 --- a/arch/arm64/kernel/idreg-override.c +++ b/arch/arm64/kernel/idreg-override.c @@ -11,6 +11,7 @@ #include #include +#include #include #define FTR_DESC_NAME_LEN 20 @@ -25,7 +26,17 @@ struct ftr_set_desc { } fields[]; }; +static const struct ftr_set_desc mmfr1 __initconst = { + .name = "id_aa64mmfr1", + .override = &id_aa64mmfr1_override, + .fields = { + { "vh", ID_AA64MMFR1_VHE_SHIFT }, + {} + }, +}; + static const struct ftr_set_desc * const regs[] __initconst = { + &mmfr1, }; static int __init find_field(const char *cmdline, From 41fac42c25338f4ea295b58106c26683d893a1c6 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 8 Feb 2021 09:57:24 +0000 Subject: [PATCH 085/108] arm64: Honor VHE being disabled from the command-line Finally we can check whether VHE is disabled on the command line, and not enable it if that's the user's wish. Signed-off-by: Marc Zyngier Acked-by: David Brazdil Acked-by: Catalin Marinas Link: https://lore.kernel.org/r/20210208095732.3267263-16-maz@kernel.org Signed-off-by: Will Deacon --- arch/arm64/kernel/asm-offsets.c | 3 +++ arch/arm64/kernel/hyp-stub.S | 11 +++++++++++ 2 files changed, 14 insertions(+) diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 301784463587..a36e2fc330d4 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -99,6 +99,9 @@ int main(void) DEFINE(CPU_BOOT_STACK, offsetof(struct secondary_data, stack)); DEFINE(CPU_BOOT_TASK, offsetof(struct secondary_data, task)); BLANK(); + DEFINE(FTR_OVR_VAL_OFFSET, offsetof(struct arm64_ftr_override, val)); + DEFINE(FTR_OVR_MASK_OFFSET, offsetof(struct arm64_ftr_override, mask)); + BLANK(); #ifdef CONFIG_KVM DEFINE(VCPU_CONTEXT, offsetof(struct kvm_vcpu, arch.ctxt)); DEFINE(VCPU_FAULT_DISR, offsetof(struct kvm_vcpu, arch.fault.disr_el1)); diff --git a/arch/arm64/kernel/hyp-stub.S b/arch/arm64/kernel/hyp-stub.S index 6229315d533d..3e08dcc924b5 100644 --- a/arch/arm64/kernel/hyp-stub.S +++ b/arch/arm64/kernel/hyp-stub.S @@ -87,6 +87,17 @@ SYM_CODE_START_LOCAL(mutate_to_vhe) ubfx x1, x1, #ID_AA64MMFR1_VHE_SHIFT, #4 cbz x1, 1f + // Check whether VHE is disabled from the command line + adr_l x1, id_aa64mmfr1_override + ldr x2, [x1, FTR_OVR_VAL_OFFSET] + ldr x1, [x1, FTR_OVR_MASK_OFFSET] + ubfx x2, x2, #ID_AA64MMFR1_VHE_SHIFT, #4 + ubfx x1, x1, #ID_AA64MMFR1_VHE_SHIFT, #4 + cmp x1, xzr + and x2, x2, x1 + csinv x2, x2, xzr, ne + cbz x2, 1f + // Engage the VHE magic! mov_q x0, HCR_HOST_VHE_FLAGS msr hcr_el2, x0 From 863ace77e9ff85c06d57e9491faffae8512070de Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 8 Feb 2021 09:57:25 +0000 Subject: [PATCH 086/108] arm64: Add an aliasing facility for the idreg override In order to map the override of idregs to options that a user can easily understand, let's introduce yet another option array, which maps an option to the corresponding idreg options. Signed-off-by: Marc Zyngier Reviewed-by: Catalin Marinas Acked-by: David Brazdil Link: https://lore.kernel.org/r/20210208095732.3267263-17-maz@kernel.org Signed-off-by: Will Deacon --- arch/arm64/kernel/idreg-override.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kernel/idreg-override.c b/arch/arm64/kernel/idreg-override.c index 2da11bf60195..226bac544e20 100644 --- a/arch/arm64/kernel/idreg-override.c +++ b/arch/arm64/kernel/idreg-override.c @@ -16,6 +16,8 @@ #define FTR_DESC_NAME_LEN 20 #define FTR_DESC_FIELD_LEN 10 +#define FTR_ALIAS_NAME_LEN 30 +#define FTR_ALIAS_OPTION_LEN 80 struct ftr_set_desc { char name[FTR_DESC_NAME_LEN]; @@ -39,6 +41,12 @@ static const struct ftr_set_desc * const regs[] __initconst = { &mmfr1, }; +static const struct { + char alias[FTR_ALIAS_NAME_LEN]; + char feature[FTR_ALIAS_OPTION_LEN]; +} aliases[] __initconst = { +}; + static int __init find_field(const char *cmdline, const struct ftr_set_desc *reg, int f, u64 *v) { @@ -81,7 +89,7 @@ static void __init match_options(const char *cmdline) } } -static __init void __parse_cmdline(const char *cmdline) +static __init void __parse_cmdline(const char *cmdline, bool parse_aliases) { do { char buf[256]; @@ -105,6 +113,9 @@ static __init void __parse_cmdline(const char *cmdline) match_options(buf); + for (i = 0; parse_aliases && i < ARRAY_SIZE(aliases); i++) + if (parameq(buf, aliases[i].alias)) + __parse_cmdline(aliases[i].feature, false); } while (1); } @@ -127,14 +138,14 @@ static __init void parse_cmdline(void) if (!prop) goto out; - __parse_cmdline(prop); + __parse_cmdline(prop, true); if (!IS_ENABLED(CONFIG_CMDLINE_EXTEND)) return; } out: - __parse_cmdline(CONFIG_CMDLINE); + __parse_cmdline(CONFIG_CMDLINE, true); } /* Keep checkers quiet */ From 1945a067f351debcd2518d9f6039b1835de08dfd Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 8 Feb 2021 09:57:26 +0000 Subject: [PATCH 087/108] arm64: Make kvm-arm.mode={nvhe, protected} an alias of id_aa64mmfr1.vh=0 Admitedly, passing id_aa64mmfr1.vh=0 on the command-line isn't that easy to understand, and it is likely that users would much prefer write "kvm-arm.mode=nvhe", or "...=protected". So here you go. This has the added advantage that we can now always honor the "kvm-arm.mode=protected" option, even when booting on a VHE system. Signed-off-by: Marc Zyngier Acked-by: Catalin Marinas Acked-by: David Brazdil Link: https://lore.kernel.org/r/20210208095732.3267263-18-maz@kernel.org Signed-off-by: Will Deacon --- Documentation/admin-guide/kernel-parameters.txt | 3 +++ arch/arm64/kernel/idreg-override.c | 2 ++ arch/arm64/kvm/arm.c | 3 +++ 3 files changed, 8 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index a10b545c2070..41786d205e0d 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2257,6 +2257,9 @@ kvm-arm.mode= [KVM,ARM] Select one of KVM/arm64's modes of operation. + nvhe: Standard nVHE-based mode, without support for + protected guests. + protected: nVHE-based mode with support for guests whose state is kept private from the host. Not valid if the kernel is running in EL2. diff --git a/arch/arm64/kernel/idreg-override.c b/arch/arm64/kernel/idreg-override.c index 226bac544e20..b994d689d6fb 100644 --- a/arch/arm64/kernel/idreg-override.c +++ b/arch/arm64/kernel/idreg-override.c @@ -45,6 +45,8 @@ static const struct { char alias[FTR_ALIAS_NAME_LEN]; char feature[FTR_ALIAS_OPTION_LEN]; } aliases[] __initconst = { + { "kvm-arm.mode=nvhe", "id_aa64mmfr1.vh=0" }, + { "kvm-arm.mode=protected", "id_aa64mmfr1.vh=0" }, }; static int __init find_field(const char *cmdline, diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 04c44853b103..597565a65ca2 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1966,6 +1966,9 @@ static int __init early_kvm_mode_cfg(char *arg) return 0; } + if (strcmp(arg, "nvhe") == 0 && !WARN_ON(is_kernel_in_hyp_mode())) + return 0; + return -EINVAL; } early_param("kvm-arm.mode", early_kvm_mode_cfg); From 166cc2a4be0d80075d379b30d3e84895c878a1a8 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 8 Feb 2021 09:57:27 +0000 Subject: [PATCH 088/108] KVM: arm64: Document HVC_VHE_RESTART stub hypercall For completeness, let's document the HVC_VHE_RESTART stub. Signed-off-by: Marc Zyngier Acked-by: David Brazdil Link: https://lore.kernel.org/r/20210208095732.3267263-19-maz@kernel.org Signed-off-by: Will Deacon --- Documentation/virt/kvm/arm/hyp-abi.rst | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/Documentation/virt/kvm/arm/hyp-abi.rst b/Documentation/virt/kvm/arm/hyp-abi.rst index 83cadd8186fa..4d43fbc25195 100644 --- a/Documentation/virt/kvm/arm/hyp-abi.rst +++ b/Documentation/virt/kvm/arm/hyp-abi.rst @@ -58,6 +58,15 @@ these functions (see arch/arm{,64}/include/asm/virt.h): into place (arm64 only), and jump to the restart address while at HYP/EL2. This hypercall is not expected to return to its caller. +* :: + + x0 = HVC_VHE_RESTART (arm64 only) + + Attempt to upgrade the kernel's exception level from EL1 to EL2 by enabling + the VHE mode. This is conditioned by the CPU supporting VHE, the EL2 MMU + being off, and VHE not being disabled by any other means (command line + option, for example). + Any other value of r0/x0 triggers a hypervisor-specific handling, which is not documented here. From a762f4ffc3c8a434da1b712e57a80d8d10404198 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 8 Feb 2021 09:57:28 +0000 Subject: [PATCH 089/108] arm64: Move "nokaslr" over to the early cpufeature infrastructure Given that the early cpufeature infrastructure has borrowed quite a lot of code from the kaslr implementation, let's reimplement the matching of the "nokaslr" option with it. Signed-off-by: Marc Zyngier Acked-by: Catalin Marinas Acked-by: David Brazdil Link: https://lore.kernel.org/r/20210208095732.3267263-20-maz@kernel.org Signed-off-by: Will Deacon --- arch/arm64/kernel/idreg-override.c | 15 +++++++++++++ arch/arm64/kernel/kaslr.c | 36 ++---------------------------- 2 files changed, 17 insertions(+), 34 deletions(-) diff --git a/arch/arm64/kernel/idreg-override.c b/arch/arm64/kernel/idreg-override.c index b994d689d6fb..70dd70eee7a2 100644 --- a/arch/arm64/kernel/idreg-override.c +++ b/arch/arm64/kernel/idreg-override.c @@ -37,8 +37,22 @@ static const struct ftr_set_desc mmfr1 __initconst = { }, }; +extern struct arm64_ftr_override kaslr_feature_override; + +static const struct ftr_set_desc kaslr __initconst = { + .name = "kaslr", +#ifdef CONFIG_RANDOMIZE_BASE + .override = &kaslr_feature_override, +#endif + .fields = { + { "disabled", 0 }, + {} + }, +}; + static const struct ftr_set_desc * const regs[] __initconst = { &mmfr1, + &kaslr, }; static const struct { @@ -47,6 +61,7 @@ static const struct { } aliases[] __initconst = { { "kvm-arm.mode=nvhe", "id_aa64mmfr1.vh=0" }, { "kvm-arm.mode=protected", "id_aa64mmfr1.vh=0" }, + { "nokaslr", "kaslr.disabled=1" }, }; static int __init find_field(const char *cmdline, diff --git a/arch/arm64/kernel/kaslr.c b/arch/arm64/kernel/kaslr.c index 5fc86e7d01a1..27f8939deb1b 100644 --- a/arch/arm64/kernel/kaslr.c +++ b/arch/arm64/kernel/kaslr.c @@ -51,39 +51,7 @@ static __init u64 get_kaslr_seed(void *fdt) return ret; } -static __init bool cmdline_contains_nokaslr(const u8 *cmdline) -{ - const u8 *str; - - str = strstr(cmdline, "nokaslr"); - return str == cmdline || (str > cmdline && *(str - 1) == ' '); -} - -static __init bool is_kaslr_disabled_cmdline(void *fdt) -{ - if (!IS_ENABLED(CONFIG_CMDLINE_FORCE)) { - int node; - const u8 *prop; - - node = fdt_path_offset(fdt, "/chosen"); - if (node < 0) - goto out; - - prop = fdt_getprop(fdt, node, "bootargs", NULL); - if (!prop) - goto out; - - if (cmdline_contains_nokaslr(prop)) - return true; - - if (IS_ENABLED(CONFIG_CMDLINE_EXTEND)) - goto out; - - return false; - } -out: - return cmdline_contains_nokaslr(CONFIG_CMDLINE); -} +struct arm64_ftr_override kaslr_feature_override __initdata; /* * This routine will be executed with the kernel mapped at its default virtual @@ -126,7 +94,7 @@ u64 __init kaslr_early_init(void) * Check if 'nokaslr' appears on the command line, and * return 0 if that is the case. */ - if (is_kaslr_disabled_cmdline(fdt)) { + if (kaslr_feature_override.val & kaslr_feature_override.mask & 0xf) { kaslr_status = KASLR_DISABLED_CMDLINE; return 0; } From 93ad55b7852b324a3fd7d46910b88c81deb62357 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 8 Feb 2021 09:57:29 +0000 Subject: [PATCH 090/108] arm64: cpufeatures: Allow disabling of BTI from the command-line In order to be able to disable BTI at runtime, whether it is for testing purposes, or to work around HW issues, let's add support for overriding the ID_AA64PFR1_EL1.BTI field. This is further mapped on the arm64.nobti command-line alias. Signed-off-by: Marc Zyngier Reviewed-by: Catalin Marinas Acked-by: David Brazdil Tested-by: Srinivas Ramana Link: https://lore.kernel.org/r/20210208095732.3267263-21-maz@kernel.org Signed-off-by: Will Deacon --- Documentation/admin-guide/kernel-parameters.txt | 3 +++ arch/arm64/include/asm/cpufeature.h | 1 + arch/arm64/kernel/cpufeature.c | 4 +++- arch/arm64/kernel/idreg-override.c | 11 +++++++++++ arch/arm64/mm/mmu.c | 2 +- 5 files changed, 19 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 41786d205e0d..c00236b22c6e 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -373,6 +373,9 @@ arcrimi= [HW,NET] ARCnet - "RIM I" (entirely mem-mapped) cards Format: ,, + arm64.nobti [ARM64] Unconditionally disable Branch Target + Identification support + ataflop= [HW,M68k] atarimouse= [HW,MOUSE] Atari Mouse diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 570f1b4ba3cc..30917b9a760b 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -819,6 +819,7 @@ static inline unsigned int get_vmid_bits(u64 mmfr1) } extern struct arm64_ftr_override id_aa64mmfr1_override; +extern struct arm64_ftr_override id_aa64pfr1_override; u32 get_kvm_ipa_limit(void); void dump_cpu_features(void); diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index faada5d8bea6..7fbeab497adb 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -558,6 +558,7 @@ static const struct arm64_ftr_bits ftr_raz[] = { #define ARM64_FTR_REG(id, table) ARM64_FTR_REG_OVERRIDE(id, table, &no_override) struct arm64_ftr_override __ro_after_init id_aa64mmfr1_override; +struct arm64_ftr_override __ro_after_init id_aa64pfr1_override; static const struct __ftr_reg_entry { u32 sys_id; @@ -593,7 +594,8 @@ static const struct __ftr_reg_entry { /* Op1 = 0, CRn = 0, CRm = 4 */ ARM64_FTR_REG(SYS_ID_AA64PFR0_EL1, ftr_id_aa64pfr0), - ARM64_FTR_REG(SYS_ID_AA64PFR1_EL1, ftr_id_aa64pfr1), + ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64PFR1_EL1, ftr_id_aa64pfr1, + &id_aa64pfr1_override), ARM64_FTR_REG(SYS_ID_AA64ZFR0_EL1, ftr_id_aa64zfr0), /* Op1 = 0, CRn = 0, CRm = 5 */ diff --git a/arch/arm64/kernel/idreg-override.c b/arch/arm64/kernel/idreg-override.c index 70dd70eee7a2..d691e9015c62 100644 --- a/arch/arm64/kernel/idreg-override.c +++ b/arch/arm64/kernel/idreg-override.c @@ -37,6 +37,15 @@ static const struct ftr_set_desc mmfr1 __initconst = { }, }; +static const struct ftr_set_desc pfr1 __initconst = { + .name = "id_aa64pfr1", + .override = &id_aa64pfr1_override, + .fields = { + { "bt", ID_AA64PFR1_BT_SHIFT }, + {} + }, +}; + extern struct arm64_ftr_override kaslr_feature_override; static const struct ftr_set_desc kaslr __initconst = { @@ -52,6 +61,7 @@ static const struct ftr_set_desc kaslr __initconst = { static const struct ftr_set_desc * const regs[] __initconst = { &mmfr1, + &pfr1, &kaslr, }; @@ -61,6 +71,7 @@ static const struct { } aliases[] __initconst = { { "kvm-arm.mode=nvhe", "id_aa64mmfr1.vh=0" }, { "kvm-arm.mode=protected", "id_aa64mmfr1.vh=0" }, + { "arm64.nobti", "id_aa64pfr1.bt=0" }, { "nokaslr", "kaslr.disabled=1" }, }; diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index ae0c3d023824..617e704c980b 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -628,7 +628,7 @@ static bool arm64_early_this_cpu_has_bti(void) if (!IS_ENABLED(CONFIG_ARM64_BTI_KERNEL)) return false; - pfr1 = read_sysreg_s(SYS_ID_AA64PFR1_EL1); + pfr1 = __read_sysreg_by_encoding(SYS_ID_AA64PFR1_EL1); return cpuid_feature_extract_unsigned_field(pfr1, ID_AA64PFR1_BT_SHIFT); } From 7f6240858cf3abb75237c9ba63ec70d232573ae8 Mon Sep 17 00:00:00 2001 From: Srinivas Ramana Date: Mon, 8 Feb 2021 09:57:30 +0000 Subject: [PATCH 091/108] arm64: Defer enabling pointer authentication on boot core Defer enabling pointer authentication on boot core until after its required to be enabled by cpufeature framework. This will help in controlling the feature dynamically with a boot parameter. Signed-off-by: Ajay Patil Signed-off-by: Prasad Sodagudi Signed-off-by: Srinivas Ramana Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/1610152163-16554-2-git-send-email-sramana@codeaurora.org Reviewed-by: Catalin Marinas Acked-by: David Brazdil Link: https://lore.kernel.org/r/20210208095732.3267263-22-maz@kernel.org Signed-off-by: Will Deacon --- arch/arm64/include/asm/pointer_auth.h | 10 ++++++++++ arch/arm64/include/asm/stackprotector.h | 1 + arch/arm64/kernel/head.S | 4 ---- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/arch/arm64/include/asm/pointer_auth.h b/arch/arm64/include/asm/pointer_auth.h index c6b4f0603024..b112a11e9302 100644 --- a/arch/arm64/include/asm/pointer_auth.h +++ b/arch/arm64/include/asm/pointer_auth.h @@ -76,6 +76,15 @@ static inline unsigned long ptrauth_strip_insn_pac(unsigned long ptr) return ptrauth_clear_pac(ptr); } +static __always_inline void ptrauth_enable(void) +{ + if (!system_supports_address_auth()) + return; + sysreg_clear_set(sctlr_el1, 0, (SCTLR_ELx_ENIA | SCTLR_ELx_ENIB | + SCTLR_ELx_ENDA | SCTLR_ELx_ENDB)); + isb(); +} + #define ptrauth_thread_init_user(tsk) \ ptrauth_keys_init_user(&(tsk)->thread.keys_user) #define ptrauth_thread_init_kernel(tsk) \ @@ -84,6 +93,7 @@ static inline unsigned long ptrauth_strip_insn_pac(unsigned long ptr) ptrauth_keys_switch_kernel(&(tsk)->thread.keys_kernel) #else /* CONFIG_ARM64_PTR_AUTH */ +#define ptrauth_enable() #define ptrauth_prctl_reset_keys(tsk, arg) (-EINVAL) #define ptrauth_strip_insn_pac(lr) (lr) #define ptrauth_thread_init_user(tsk) diff --git a/arch/arm64/include/asm/stackprotector.h b/arch/arm64/include/asm/stackprotector.h index 7263e0bac680..33f1bb453150 100644 --- a/arch/arm64/include/asm/stackprotector.h +++ b/arch/arm64/include/asm/stackprotector.h @@ -41,6 +41,7 @@ static __always_inline void boot_init_stack_canary(void) #endif ptrauth_thread_init_kernel(current); ptrauth_thread_switch_kernel(current); + ptrauth_enable(); } #endif /* _ASM_STACKPROTECTOR_H */ diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index e7f12bf49730..1e30b5550d2a 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -404,10 +404,6 @@ SYM_FUNC_START_LOCAL(__primary_switched) adr_l x5, init_task msr sp_el0, x5 // Save thread_info -#ifdef CONFIG_ARM64_PTR_AUTH - __ptrauth_keys_init_cpu x5, x6, x7, x8 -#endif - adr_l x8, vectors // load VBAR_EL1 with virtual msr vbar_el1, x8 // vector table address isb From f8da5752fd1b25f1ecf78a79013e2dfd2b860589 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 8 Feb 2021 09:57:31 +0000 Subject: [PATCH 092/108] arm64: cpufeatures: Allow disabling of Pointer Auth from the command-line In order to be able to disable Pointer Authentication at runtime, whether it is for testing purposes, or to work around HW issues, let's add support for overriding the ID_AA64ISAR1_EL1.{GPI,GPA,API,APA} fields. This is further mapped on the arm64.nopauth command-line alias. Signed-off-by: Marc Zyngier Reviewed-by: Catalin Marinas Acked-by: David Brazdil Tested-by: Srinivas Ramana Link: https://lore.kernel.org/r/20210208095732.3267263-23-maz@kernel.org Signed-off-by: Will Deacon --- Documentation/admin-guide/kernel-parameters.txt | 3 +++ arch/arm64/include/asm/cpufeature.h | 1 + arch/arm64/kernel/cpufeature.c | 4 +++- arch/arm64/kernel/idreg-override.c | 16 ++++++++++++++++ 4 files changed, 23 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index c00236b22c6e..cbb51ab2be93 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -376,6 +376,9 @@ arm64.nobti [ARM64] Unconditionally disable Branch Target Identification support + arm64.nopauth [ARM64] Unconditionally disable Pointer Authentication + support + ataflop= [HW,M68k] atarimouse= [HW,MOUSE] Atari Mouse diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 30917b9a760b..61177bac49fa 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -820,6 +820,7 @@ static inline unsigned int get_vmid_bits(u64 mmfr1) extern struct arm64_ftr_override id_aa64mmfr1_override; extern struct arm64_ftr_override id_aa64pfr1_override; +extern struct arm64_ftr_override id_aa64isar1_override; u32 get_kvm_ipa_limit(void); void dump_cpu_features(void); diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 7fbeab497adb..3bce87a03717 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -559,6 +559,7 @@ static const struct arm64_ftr_bits ftr_raz[] = { struct arm64_ftr_override __ro_after_init id_aa64mmfr1_override; struct arm64_ftr_override __ro_after_init id_aa64pfr1_override; +struct arm64_ftr_override __ro_after_init id_aa64isar1_override; static const struct __ftr_reg_entry { u32 sys_id; @@ -604,7 +605,8 @@ static const struct __ftr_reg_entry { /* Op1 = 0, CRn = 0, CRm = 6 */ ARM64_FTR_REG(SYS_ID_AA64ISAR0_EL1, ftr_id_aa64isar0), - ARM64_FTR_REG(SYS_ID_AA64ISAR1_EL1, ftr_id_aa64isar1), + ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64ISAR1_EL1, ftr_id_aa64isar1, + &id_aa64isar1_override), /* Op1 = 0, CRn = 0, CRm = 7 */ ARM64_FTR_REG(SYS_ID_AA64MMFR0_EL1, ftr_id_aa64mmfr0), diff --git a/arch/arm64/kernel/idreg-override.c b/arch/arm64/kernel/idreg-override.c index d691e9015c62..dffb16682330 100644 --- a/arch/arm64/kernel/idreg-override.c +++ b/arch/arm64/kernel/idreg-override.c @@ -46,6 +46,18 @@ static const struct ftr_set_desc pfr1 __initconst = { }, }; +static const struct ftr_set_desc isar1 __initconst = { + .name = "id_aa64isar1", + .override = &id_aa64isar1_override, + .fields = { + { "gpi", ID_AA64ISAR1_GPI_SHIFT }, + { "gpa", ID_AA64ISAR1_GPA_SHIFT }, + { "api", ID_AA64ISAR1_API_SHIFT }, + { "apa", ID_AA64ISAR1_APA_SHIFT }, + {} + }, +}; + extern struct arm64_ftr_override kaslr_feature_override; static const struct ftr_set_desc kaslr __initconst = { @@ -62,6 +74,7 @@ static const struct ftr_set_desc kaslr __initconst = { static const struct ftr_set_desc * const regs[] __initconst = { &mmfr1, &pfr1, + &isar1, &kaslr, }; @@ -72,6 +85,9 @@ static const struct { { "kvm-arm.mode=nvhe", "id_aa64mmfr1.vh=0" }, { "kvm-arm.mode=protected", "id_aa64mmfr1.vh=0" }, { "arm64.nobti", "id_aa64pfr1.bt=0" }, + { "arm64.nopauth", + "id_aa64isar1.gpi=0 id_aa64isar1.gpa=0 " + "id_aa64isar1.api=0 id_aa64isar1.apa=0" }, { "nokaslr", "kaslr.disabled=1" }, }; From e9c6deee00e9197e75cd6aa0d265d3d45bd7cc28 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Mon, 8 Feb 2021 17:57:20 -0700 Subject: [PATCH 093/108] arm64: Make CPU_BIG_ENDIAN depend on ld.bfd or ld.lld 13.0.0+ Similar to commit 28187dc8ebd9 ("ARM: 9025/1: Kconfig: CPU_BIG_ENDIAN depends on !LD_IS_LLD"), ld.lld prior to 13.0.0 does not properly support aarch64 big endian, leading to the following build error when CONFIG_CPU_BIG_ENDIAN is selected: ld.lld: error: unknown emulation: aarch64linuxb This has been resolved in LLVM 13. To avoid errors like this, only allow CONFIG_CPU_BIG_ENDIAN to be selected if using ld.bfd or ld.lld 13.0.0 and newer. While we are here, the indentation of this symbol used spaces since its introduction in commit a872013d6d03 ("arm64: kconfig: allow CPU_BIG_ENDIAN to be selected"). Change it to tabs to be consistent with kernel coding style. Link: https://github.com/ClangBuiltLinux/linux/issues/380 Link: https://github.com/ClangBuiltLinux/linux/issues/1288 Link: https://github.com/llvm/llvm-project/commit/7605a9a009b5fa3bdac07e3131c8d82f6d08feb7 Link: https://github.com/llvm/llvm-project/commit/eea34aae2e74e9b6fbdd5b95f479bc7f397bf387 Reported-by: Arnd Bergmann Signed-off-by: Nathan Chancellor Reviewed-by: Nick Desaulniers Link: https://lore.kernel.org/r/20210209005719.803608-1-nathan@kernel.org Signed-off-by: Will Deacon --- arch/arm64/Kconfig | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index f39568b28ec1..912da4e2ab59 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -952,8 +952,9 @@ choice that is selected here. config CPU_BIG_ENDIAN - bool "Build big-endian kernel" - help + bool "Build big-endian kernel" + depends on !LD_IS_LLD || LLD_VERSION >= 130000 + help Say Y if you plan on running a kernel with a big-endian userspace. config CPU_LITTLE_ENDIAN From b3c3361fe325074d4144c29d46daae4fc5a268d5 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Fri, 5 Feb 2021 11:13:24 -0800 Subject: [PATCH 094/108] perf/x86/kvm: Add Cascade Lake Xeon steppings to isolation_ucodes[] Cascade Lake Xeon parts have the same model number as Skylake Xeon parts, so they are tagged with the intel_pebs_isolation quirk. However, as with Skylake Xeon H0 stepping parts, the PEBS isolation issue is fixed in all microcode versions. Add the Cascade Lake Xeon steppings (5, 6, and 7) to the isolation_ucodes[] table so that these parts benefit from Andi's optimization in commit 9b545c04abd4f ("perf/x86/kvm: Avoid unnecessary work in guest filtering"). Signed-off-by: Jim Mattson Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andi Kleen Link: https://lkml.kernel.org/r/20210205191324.2889006-1-jmattson@google.com --- arch/x86/events/intel/core.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 67a724657f60..5bac48d5c18e 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -4513,6 +4513,9 @@ static const struct x86_cpu_desc isolation_ucodes[] = { INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_X, 2, 0x0b000014), INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 3, 0x00000021), INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 4, 0x00000000), + INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 5, 0x00000000), + INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 6, 0x00000000), + INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 7, 0x00000000), INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_L, 3, 0x0000007c), INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE, 3, 0x0000007c), INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE, 9, 0x0000004e), From ffb20c2e52e8709b5fc9951e8863e31efb1f2cba Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 5 Feb 2021 00:18:14 +0800 Subject: [PATCH 095/108] perf/x86/rapl: Add msr mask support In some cases, when probing a perf MSR, we're probing certain bits of the MSR instead of the whole register, thus only these bits should be checked. For example, for RAPL ENERGY_STATUS MSR, only the lower 32 bits represents the energy counter, and the higher 32bits are reserved. Introduce a new mask field in struct perf_msr to allow probing certain bits of a MSR. This change is transparent to the current perf_msr_probe() users. Signed-off-by: Zhang Rui Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andi Kleen Link: https://lkml.kernel.org/r/20210204161816.12649-1-rui.zhang@intel.com --- arch/x86/events/probe.c | 7 ++++++- arch/x86/events/probe.h | 7 ++++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/arch/x86/events/probe.c b/arch/x86/events/probe.c index 136a1e847254..600bf8d15c0c 100644 --- a/arch/x86/events/probe.c +++ b/arch/x86/events/probe.c @@ -28,6 +28,7 @@ perf_msr_probe(struct perf_msr *msr, int cnt, bool zero, void *data) for (bit = 0; bit < cnt; bit++) { if (!msr[bit].no_check) { struct attribute_group *grp = msr[bit].grp; + u64 mask; /* skip entry with no group */ if (!grp) @@ -44,8 +45,12 @@ perf_msr_probe(struct perf_msr *msr, int cnt, bool zero, void *data) /* Virt sucks; you cannot tell if a R/O MSR is present :/ */ if (rdmsrl_safe(msr[bit].msr, &val)) continue; + + mask = msr[bit].mask; + if (!mask) + mask = ~0ULL; /* Disable zero counters if requested. */ - if (!zero && !val) + if (!zero && !(val & mask)) continue; grp->is_visible = NULL; diff --git a/arch/x86/events/probe.h b/arch/x86/events/probe.h index 4c8e0afc5fb5..261b9bda24e3 100644 --- a/arch/x86/events/probe.h +++ b/arch/x86/events/probe.h @@ -4,10 +4,11 @@ #include struct perf_msr { - u64 msr; - struct attribute_group *grp; + u64 msr; + struct attribute_group *grp; bool (*test)(int idx, void *data); - bool no_check; + bool no_check; + u64 mask; }; unsigned long From b6f78d3fba7f605f673185d7292d84af7576fdc1 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 5 Feb 2021 00:18:15 +0800 Subject: [PATCH 096/108] perf/x86/rapl: Only check lower 32bits for RAPL energy counters In the RAPL ENERGY_COUNTER MSR, only the lower 32bits represent the energy counter. On previous platforms, the higher 32bits are reverved and always return Zero. But on Intel SapphireRapids platform, the higher 32bits are reused for other purpose and return non-zero value. Thus check the lower 32bits only for these ENERGY_COUTNER MSRs, to make sure the RAPL PMU events are not added erroneously when higher 32bits contain non-zero value. Signed-off-by: Zhang Rui Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andi Kleen Link: https://lkml.kernel.org/r/20210204161816.12649-2-rui.zhang@intel.com --- arch/x86/events/rapl.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c index 7dbbeaacd995..7ed25b2ba05f 100644 --- a/arch/x86/events/rapl.c +++ b/arch/x86/events/rapl.c @@ -523,12 +523,15 @@ static bool test_msr(int idx, void *data) return test_bit(idx, (unsigned long *) data); } +/* Only lower 32bits of the MSR represents the energy counter */ +#define RAPL_MSR_MASK 0xFFFFFFFF + static struct perf_msr intel_rapl_msrs[] = { - [PERF_RAPL_PP0] = { MSR_PP0_ENERGY_STATUS, &rapl_events_cores_group, test_msr }, - [PERF_RAPL_PKG] = { MSR_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr }, - [PERF_RAPL_RAM] = { MSR_DRAM_ENERGY_STATUS, &rapl_events_ram_group, test_msr }, - [PERF_RAPL_PP1] = { MSR_PP1_ENERGY_STATUS, &rapl_events_gpu_group, test_msr }, - [PERF_RAPL_PSYS] = { MSR_PLATFORM_ENERGY_STATUS, &rapl_events_psys_group, test_msr }, + [PERF_RAPL_PP0] = { MSR_PP0_ENERGY_STATUS, &rapl_events_cores_group, test_msr, false, RAPL_MSR_MASK }, + [PERF_RAPL_PKG] = { MSR_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr, false, RAPL_MSR_MASK }, + [PERF_RAPL_RAM] = { MSR_DRAM_ENERGY_STATUS, &rapl_events_ram_group, test_msr, false, RAPL_MSR_MASK }, + [PERF_RAPL_PP1] = { MSR_PP1_ENERGY_STATUS, &rapl_events_gpu_group, test_msr, false, RAPL_MSR_MASK }, + [PERF_RAPL_PSYS] = { MSR_PLATFORM_ENERGY_STATUS, &rapl_events_psys_group, test_msr, false, RAPL_MSR_MASK }, }; /* From 838342a6d6b7ecc475dc052d4a405c4ffb3ad1b5 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 5 Feb 2021 00:18:16 +0800 Subject: [PATCH 097/108] perf/x86/rapl: Fix psys-energy event on Intel SPR platform There are several things special for the RAPL Psys energy counter, on Intel Sapphire Rapids platform. 1. it contains one Psys master package, and only CPUs on the master package can read valid value of the Psys energy counter, reading the MSR on CPUs in the slave package returns 0. 2. The master package does not have to be Physical package 0. And when all the CPUs on the Psys master package are offlined, we lose the Psys energy counter, at runtime. 3. The Psys energy counter can be disabled by BIOS, while all the other energy counters are not affected. It is not easy to handle all of these in the current RAPL PMU design because a) perf_msr_probe() validates the MSR on some random CPU, which may either be in the Psys master package or in the Psys slave package. b) all the RAPL events share the same PMU, and there is not API to remove the psys-energy event cleanly, without affecting the other events in the same PMU. This patch addresses the problems in a simple way. First, by setting .no_check bit for RAPL Psys MSR, the psys-energy event is always added, so we don't have to check the Psys ENERGY_STATUS MSR on master package. Then, by removing rapl_not_visible(), the psys-energy event is always available in sysfs. This does not affect the previous code because, for the RAPL MSRs with .no_check cleared, the .is_visible() callback is always overriden in the perf_msr_probe() function. Note, although RAPL PMU is die-based, and the Psys energy counter MSR on Intel SPR is package scope, this is not a problem because there is only one die in each package on SPR. Signed-off-by: Zhang Rui Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andi Kleen Link: https://lkml.kernel.org/r/20210204161816.12649-3-rui.zhang@intel.com --- arch/x86/events/rapl.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c index 7ed25b2ba05f..f42a70496a24 100644 --- a/arch/x86/events/rapl.c +++ b/arch/x86/events/rapl.c @@ -454,16 +454,9 @@ static struct attribute *rapl_events_cores[] = { NULL, }; -static umode_t -rapl_not_visible(struct kobject *kobj, struct attribute *attr, int i) -{ - return 0; -} - static struct attribute_group rapl_events_cores_group = { .name = "events", .attrs = rapl_events_cores, - .is_visible = rapl_not_visible, }; static struct attribute *rapl_events_pkg[] = { @@ -476,7 +469,6 @@ static struct attribute *rapl_events_pkg[] = { static struct attribute_group rapl_events_pkg_group = { .name = "events", .attrs = rapl_events_pkg, - .is_visible = rapl_not_visible, }; static struct attribute *rapl_events_ram[] = { @@ -489,7 +481,6 @@ static struct attribute *rapl_events_ram[] = { static struct attribute_group rapl_events_ram_group = { .name = "events", .attrs = rapl_events_ram, - .is_visible = rapl_not_visible, }; static struct attribute *rapl_events_gpu[] = { @@ -502,7 +493,6 @@ static struct attribute *rapl_events_gpu[] = { static struct attribute_group rapl_events_gpu_group = { .name = "events", .attrs = rapl_events_gpu, - .is_visible = rapl_not_visible, }; static struct attribute *rapl_events_psys[] = { @@ -515,7 +505,6 @@ static struct attribute *rapl_events_psys[] = { static struct attribute_group rapl_events_psys_group = { .name = "events", .attrs = rapl_events_psys, - .is_visible = rapl_not_visible, }; static bool test_msr(int idx, void *data) @@ -534,6 +523,14 @@ static struct perf_msr intel_rapl_msrs[] = { [PERF_RAPL_PSYS] = { MSR_PLATFORM_ENERGY_STATUS, &rapl_events_psys_group, test_msr, false, RAPL_MSR_MASK }, }; +static struct perf_msr intel_rapl_spr_msrs[] = { + [PERF_RAPL_PP0] = { MSR_PP0_ENERGY_STATUS, &rapl_events_cores_group, test_msr, false, RAPL_MSR_MASK }, + [PERF_RAPL_PKG] = { MSR_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr, false, RAPL_MSR_MASK }, + [PERF_RAPL_RAM] = { MSR_DRAM_ENERGY_STATUS, &rapl_events_ram_group, test_msr, false, RAPL_MSR_MASK }, + [PERF_RAPL_PP1] = { MSR_PP1_ENERGY_STATUS, &rapl_events_gpu_group, test_msr, false, RAPL_MSR_MASK }, + [PERF_RAPL_PSYS] = { MSR_PLATFORM_ENERGY_STATUS, &rapl_events_psys_group, test_msr, true, RAPL_MSR_MASK }, +}; + /* * Force to PERF_RAPL_MAX size due to: * - perf_msr_probe(PERF_RAPL_MAX) @@ -764,7 +761,7 @@ static struct rapl_model model_spr = { BIT(PERF_RAPL_PSYS), .unit_quirk = RAPL_UNIT_QUIRK_INTEL_SPR, .msr_power_unit = MSR_RAPL_POWER_UNIT, - .rapl_msrs = intel_rapl_msrs, + .rapl_msrs = intel_rapl_spr_msrs, }; static struct rapl_model model_amd_fam17h = { From de591a82f41b61af4a8fce49d21b43105c5c2186 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 10 Feb 2021 11:15:11 +0000 Subject: [PATCH 098/108] mm: filemap: Fix microblaze build failure with 'mmu_defconfig' Commit f9ce0be71d1f ("mm: Cleanup faultaround and finish_fault() codepaths") added a call to 'update_mmu_cache()' in mm/filemap.c, which breaks the build for microblaze: | mm/filemap.c: In function 'filemap_map_pages': | mm/filemap.c:3153:3: error: implicit declaration of function 'update_mmu_cache'; did you mean 'update_mmu_tlb'? Include asm/tlbflush.h in mm/filemap.c to make sure that the function (or indeed, macro) is available. Reported-by: Guenter Roeck Tested-by: Guenter Roeck Link: https://lore.kernel.org/r/20210209202449.GA104837@roeck-us.net Signed-off-by: Will Deacon --- mm/filemap.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/filemap.c b/mm/filemap.c index fb7a8d9b5603..2ca13227747b 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -43,6 +43,7 @@ #include #include #include +#include #include "internal.h" #define CREATE_TRACE_POINTS From 8ee37e0f97ec66b953d202257293670efaab1daa Mon Sep 17 00:00:00 2001 From: Qi Liu Date: Tue, 9 Feb 2021 17:42:22 +0800 Subject: [PATCH 099/108] drivers/perf: Replace spin_lock_irqsave to spin_lock There is no need to do spin_lock_irqsave in context of hard IRQ, so replace them with spin_lock. Signed-off-by: Qi Liu Link: https://lore.kernel.org/r/1612863742-1551-1-git-send-email-liuqi115@huawei.com Signed-off-by: Will Deacon --- drivers/perf/arm-cci.c | 5 ++--- drivers/perf/xgene_pmu.c | 5 ++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/perf/arm-cci.c b/drivers/perf/arm-cci.c index a75cf77c4de4..f81e2ec90005 100644 --- a/drivers/perf/arm-cci.c +++ b/drivers/perf/arm-cci.c @@ -1026,12 +1026,11 @@ static void pmu_event_set_period(struct perf_event *event) static irqreturn_t pmu_handle_irq(int irq_num, void *dev) { - unsigned long flags; struct cci_pmu *cci_pmu = dev; struct cci_pmu_hw_events *events = &cci_pmu->hw_events; int idx, handled = IRQ_NONE; - raw_spin_lock_irqsave(&events->pmu_lock, flags); + raw_spin_lock(&events->pmu_lock); /* Disable the PMU while we walk through the counters */ __cci_pmu_disable(cci_pmu); @@ -1061,7 +1060,7 @@ static irqreturn_t pmu_handle_irq(int irq_num, void *dev) /* Enable the PMU and sync possibly overflowed counters */ __cci_pmu_enable_sync(cci_pmu); - raw_spin_unlock_irqrestore(&events->pmu_lock, flags); + raw_spin_unlock(&events->pmu_lock); return IRQ_RETVAL(handled); } diff --git a/drivers/perf/xgene_pmu.c b/drivers/perf/xgene_pmu.c index 633cf07ba672..44faa51ba799 100644 --- a/drivers/perf/xgene_pmu.c +++ b/drivers/perf/xgene_pmu.c @@ -1234,10 +1234,9 @@ static irqreturn_t xgene_pmu_isr(int irq, void *dev_id) u32 intr_mcu, intr_mcb, intr_l3c, intr_iob; struct xgene_pmu_dev_ctx *ctx; struct xgene_pmu *xgene_pmu = dev_id; - unsigned long flags; u32 val; - raw_spin_lock_irqsave(&xgene_pmu->lock, flags); + raw_spin_lock(&xgene_pmu->lock); /* Get Interrupt PMU source */ val = readl(xgene_pmu->pcppmu_csr + PCPPMU_INTSTATUS_REG); @@ -1273,7 +1272,7 @@ static irqreturn_t xgene_pmu_isr(int irq, void *dev_id) } } - raw_spin_unlock_irqrestore(&xgene_pmu->lock, flags); + raw_spin_unlock(&xgene_pmu->lock); return IRQ_HANDLED; } From a35f2ef3b7376bfd0a57f7844bd7454389aae1fc Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 15 Feb 2021 08:49:34 +0100 Subject: [PATCH 100/108] Xen/x86: don't bail early from clear_foreign_p2m_mapping() Its sibling (set_foreign_p2m_mapping()) as well as the sibling of its only caller (gnttab_map_refs()) don't clean up after themselves in case of error. Higher level callers are expected to do so. However, in order for that to really clean up any partially set up state, the operation should not terminate upon encountering an entry in unexpected state. It is particularly relevant to notice here that set_foreign_p2m_mapping() would skip setting up a p2m entry if its grant mapping failed, but it would continue to set up further p2m entries as long as their mappings succeeded. Arguably down the road set_foreign_p2m_mapping() may want its page state related WARN_ON() also converted to an error return. This is part of XSA-361. Signed-off-by: Jan Beulich Cc: stable@vger.kernel.org Reviewed-by: Juergen Gross Signed-off-by: Juergen Gross --- arch/x86/xen/p2m.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 3301875dd196..ac579c44dd28 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -750,17 +750,15 @@ int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops, unsigned long mfn = __pfn_to_mfn(page_to_pfn(pages[i])); unsigned long pfn = page_to_pfn(pages[i]); - if (mfn == INVALID_P2M_ENTRY || !(mfn & FOREIGN_FRAME_BIT)) { + if (mfn != INVALID_P2M_ENTRY && (mfn & FOREIGN_FRAME_BIT)) + set_phys_to_machine(pfn, INVALID_P2M_ENTRY); + else ret = -EINVAL; - goto out; - } - - set_phys_to_machine(pfn, INVALID_P2M_ENTRY); } if (kunmap_ops) ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, - kunmap_ops, count); -out: + kunmap_ops, count) ?: ret; + return ret; } EXPORT_SYMBOL_GPL(clear_foreign_p2m_mapping); From b512e1b077e5ccdbd6e225b15d934ab12453b70a Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 15 Feb 2021 08:50:08 +0100 Subject: [PATCH 101/108] Xen/x86: also check kernel mapping in set_foreign_p2m_mapping() We should not set up further state if either mapping failed; paying attention to just the user mapping's status isn't enough. Also use GNTST_okay instead of implying its value (zero). This is part of XSA-361. Signed-off-by: Jan Beulich Cc: stable@vger.kernel.org Reviewed-by: Juergen Gross Signed-off-by: Juergen Gross --- arch/x86/xen/p2m.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index ac579c44dd28..b5949e5a83ec 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -712,7 +712,8 @@ int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, unsigned long mfn, pfn; /* Do not add to override if the map failed. */ - if (map_ops[i].status) + if (map_ops[i].status != GNTST_okay || + (kmap_ops && kmap_ops[i].status != GNTST_okay)) continue; if (map_ops[i].flags & GNTMAP_contains_pte) { From dbe5283605b3bc12ca45def09cc721a0a5c853a2 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 15 Feb 2021 08:51:07 +0100 Subject: [PATCH 102/108] Xen/gntdev: correct dev_bus_addr handling in gntdev_map_grant_pages() We may not skip setting the field in the unmap structure when GNTMAP_device_map is in use - such an unmap would fail to release the respective resources (a page ref in the hypervisor). Otoh the field doesn't need setting at all when GNTMAP_device_map is not in use. To record the value for unmapping, we also better don't use our local p2m: In particular after a subsequent change it may not have got updated for all the batch elements. Instead it can simply be taken from the respective map's results. We can additionally avoid playing this game altogether for the kernel part of the mappings in (x86) PV mode. This is part of XSA-361. Signed-off-by: Jan Beulich Cc: stable@vger.kernel.org Reviewed-by: Stefano Stabellini Signed-off-by: Juergen Gross --- drivers/xen/gntdev.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index a36b71286bcf..aece224a4e70 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -309,18 +309,25 @@ int gntdev_map_grant_pages(struct gntdev_grant_map *map) * to the kernel linear addresses of the struct pages. * These ptes are completely different from the user ptes dealt * with find_grant_ptes. + * Note that GNTMAP_device_map isn't needed here: The + * dev_bus_addr output field gets consumed only from ->map_ops, + * and by not requesting it when mapping we also avoid needing + * to mirror dev_bus_addr into ->unmap_ops (and holding an extra + * reference to the page in the hypervisor). */ + unsigned int flags = (map->flags & ~GNTMAP_device_map) | + GNTMAP_host_map; + for (i = 0; i < map->count; i++) { unsigned long address = (unsigned long) pfn_to_kaddr(page_to_pfn(map->pages[i])); BUG_ON(PageHighMem(map->pages[i])); - gnttab_set_map_op(&map->kmap_ops[i], address, - map->flags | GNTMAP_host_map, + gnttab_set_map_op(&map->kmap_ops[i], address, flags, map->grants[i].ref, map->grants[i].domid); gnttab_set_unmap_op(&map->kunmap_ops[i], address, - map->flags | GNTMAP_host_map, -1); + flags, -1); } } @@ -336,17 +343,12 @@ int gntdev_map_grant_pages(struct gntdev_grant_map *map) continue; } + if (map->flags & GNTMAP_device_map) + map->unmap_ops[i].dev_bus_addr = map->map_ops[i].dev_bus_addr; + map->unmap_ops[i].handle = map->map_ops[i].handle; if (use_ptemod) map->kunmap_ops[i].handle = map->kmap_ops[i].handle; -#ifdef CONFIG_XEN_GRANT_DMA_ALLOC - else if (map->dma_vaddr) { - unsigned long bfn; - - bfn = pfn_to_bfn(page_to_pfn(map->pages[i])); - map->unmap_ops[i].dev_bus_addr = __pfn_to_phys(bfn); - } -#endif } return err; } From ebee0eab08594b2bd5db716288a4f1ae5936e9bc Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 15 Feb 2021 08:52:27 +0100 Subject: [PATCH 103/108] Xen/gntdev: correct error checking in gntdev_map_grant_pages() Failure of the kernel part of the mapping operation should also be indicated as an error to the caller, or else it may assume the respective kernel VA is okay to access. Furthermore gnttab_map_refs() failing still requires recording successfully mapped handles, so they can be unmapped subsequently. This in turn requires there to be a way to tell full hypercall failure from partial success - preset map_op status fields such that they won't "happen" to look as if the operation succeeded. Also again use GNTST_okay instead of implying its value (zero). This is part of XSA-361. Signed-off-by: Jan Beulich Cc: stable@vger.kernel.org Reviewed-by: Juergen Gross Signed-off-by: Juergen Gross --- drivers/xen/gntdev.c | 17 +++++++++-------- include/xen/grant_table.h | 1 + 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index aece224a4e70..5447c5156b2e 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -334,21 +334,22 @@ int gntdev_map_grant_pages(struct gntdev_grant_map *map) pr_debug("map %d+%d\n", map->index, map->count); err = gnttab_map_refs(map->map_ops, use_ptemod ? map->kmap_ops : NULL, map->pages, map->count); - if (err) - return err; for (i = 0; i < map->count; i++) { - if (map->map_ops[i].status) { + if (map->map_ops[i].status == GNTST_okay) + map->unmap_ops[i].handle = map->map_ops[i].handle; + else if (!err) err = -EINVAL; - continue; - } if (map->flags & GNTMAP_device_map) map->unmap_ops[i].dev_bus_addr = map->map_ops[i].dev_bus_addr; - map->unmap_ops[i].handle = map->map_ops[i].handle; - if (use_ptemod) - map->kunmap_ops[i].handle = map->kmap_ops[i].handle; + if (use_ptemod) { + if (map->kmap_ops[i].status == GNTST_okay) + map->kunmap_ops[i].handle = map->kmap_ops[i].handle; + else if (!err) + err = -EINVAL; + } } return err; } diff --git a/include/xen/grant_table.h b/include/xen/grant_table.h index b9c937b3a149..0b1182a3cf41 100644 --- a/include/xen/grant_table.h +++ b/include/xen/grant_table.h @@ -157,6 +157,7 @@ gnttab_set_map_op(struct gnttab_map_grant_ref *map, phys_addr_t addr, map->flags = flags; map->ref = ref; map->dom = domid; + map->status = 1; /* arbitrary positive value */ } static inline void From 36bf1dfb8b266e089afa9b7b984217f17027bf35 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Mon, 15 Feb 2021 08:53:44 +0100 Subject: [PATCH 104/108] xen/arm: don't ignore return errors from set_phys_to_machine set_phys_to_machine can fail due to lack of memory, see the kzalloc call in arch/arm/xen/p2m.c:__set_phys_to_machine_multi. Don't ignore the potential return error in set_foreign_p2m_mapping, returning it to the caller instead. This is part of XSA-361. Signed-off-by: Stefano Stabellini Cc: stable@vger.kernel.org Reviewed-by: Julien Grall Signed-off-by: Juergen Gross --- arch/arm/xen/p2m.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/arm/xen/p2m.c b/arch/arm/xen/p2m.c index e52950a43f2e..fd6e3aafe272 100644 --- a/arch/arm/xen/p2m.c +++ b/arch/arm/xen/p2m.c @@ -95,8 +95,10 @@ int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, for (i = 0; i < count; i++) { if (map_ops[i].status) continue; - set_phys_to_machine(map_ops[i].host_addr >> XEN_PAGE_SHIFT, - map_ops[i].dev_bus_addr >> XEN_PAGE_SHIFT); + if (unlikely(!set_phys_to_machine(map_ops[i].host_addr >> XEN_PAGE_SHIFT, + map_ops[i].dev_bus_addr >> XEN_PAGE_SHIFT))) { + return -ENOMEM; + } } return 0; From 5a264285ed1cd32e26d9de4f3c8c6855e467fd63 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 15 Feb 2021 08:54:51 +0100 Subject: [PATCH 105/108] xen-blkback: don't "handle" error by BUG() In particular -ENOMEM may come back here, from set_foreign_p2m_mapping(). Don't make problems worse, the more that handling elsewhere (together with map's status fields now indicating whether a mapping wasn't even attempted, and hence has to be considered failed) doesn't require this odd way of dealing with errors. This is part of XSA-362. Signed-off-by: Jan Beulich Cc: stable@vger.kernel.org Reviewed-by: Juergen Gross Signed-off-by: Juergen Gross --- drivers/block/xen-blkback/blkback.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index 9ebf53903d7b..ee62b3c0b55a 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -811,10 +811,8 @@ static int xen_blkbk_map(struct xen_blkif_ring *ring, break; } - if (segs_to_map) { + if (segs_to_map) ret = gnttab_map_refs(map, NULL, pages_to_gnt, segs_to_map); - BUG_ON(ret); - } /* * Now swizzle the MFN in our domain with the MFN from the other domain @@ -830,7 +828,7 @@ static int xen_blkbk_map(struct xen_blkif_ring *ring, gnttab_page_cache_put(&ring->free_pages, &pages[seg_idx]->page, 1); pages[seg_idx]->handle = BLKBACK_INVALID_HANDLE; - ret |= 1; + ret |= !ret; goto next; } pages[seg_idx]->handle = map[new_map_idx].handle; From 3194a1746e8aabe86075fd3c5e7cf1f4632d7f16 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 15 Feb 2021 08:55:31 +0100 Subject: [PATCH 106/108] xen-netback: don't "handle" error by BUG() In particular -ENOMEM may come back here, from set_foreign_p2m_mapping(). Don't make problems worse, the more that handling elsewhere (together with map's status fields now indicating whether a mapping wasn't even attempted, and hence has to be considered failed) doesn't require this odd way of dealing with errors. This is part of XSA-362. Signed-off-by: Jan Beulich Cc: stable@vger.kernel.org Reviewed-by: Juergen Gross Signed-off-by: Juergen Gross --- drivers/net/xen-netback/netback.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c index bc3421d14576..423667b83751 100644 --- a/drivers/net/xen-netback/netback.c +++ b/drivers/net/xen-netback/netback.c @@ -1342,13 +1342,11 @@ int xenvif_tx_action(struct xenvif_queue *queue, int budget) return 0; gnttab_batch_copy(queue->tx_copy_ops, nr_cops); - if (nr_mops != 0) { + if (nr_mops != 0) ret = gnttab_map_refs(queue->tx_map_ops, NULL, queue->pages_to_map, nr_mops); - BUG_ON(ret); - } work_done = xenvif_tx_submit(queue); From 7c77474b2d22176d2bfb592ec74e0f2cb71352c9 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 15 Feb 2021 08:55:57 +0100 Subject: [PATCH 107/108] xen-scsiback: don't "handle" error by BUG() In particular -ENOMEM may come back here, from set_foreign_p2m_mapping(). Don't make problems worse, the more that handling elsewhere (together with map's status fields now indicating whether a mapping wasn't even attempted, and hence has to be considered failed) doesn't require this odd way of dealing with errors. This is part of XSA-362. Signed-off-by: Jan Beulich Cc: stable@vger.kernel.org Reviewed-by: Juergen Gross Signed-off-by: Juergen Gross --- drivers/xen/xen-scsiback.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/xen/xen-scsiback.c b/drivers/xen/xen-scsiback.c index 862162dca33c..9cd4fe8ce680 100644 --- a/drivers/xen/xen-scsiback.c +++ b/drivers/xen/xen-scsiback.c @@ -386,12 +386,12 @@ static int scsiback_gnttab_data_map_batch(struct gnttab_map_grant_ref *map, return 0; err = gnttab_map_refs(map, NULL, pg, cnt); - BUG_ON(err); for (i = 0; i < cnt; i++) { if (unlikely(map[i].status != GNTST_okay)) { pr_err("invalid buffer -- could not remap it\n"); map[i].handle = SCSIBACK_INVALID_HANDLE; - err = -ENOMEM; + if (!err) + err = -ENOMEM; } else { get_page(pg[i]); } From 871997bc9e423f05c7da7c9178e62dde5df2a7f8 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 15 Feb 2021 08:56:44 +0100 Subject: [PATCH 108/108] xen-blkback: fix error handling in xen_blkbk_map() The function uses a goto-based loop, which may lead to an earlier error getting discarded by a later iteration. Exit this ad-hoc loop when an error was encountered. The out-of-memory error path additionally fails to fill a structure field looked at by xen_blkbk_unmap_prepare() before inspecting the handle which does get properly set (to BLKBACK_INVALID_HANDLE). Since the earlier exiting from the ad-hoc loop requires the same field filling (invalidation) as that on the out-of-memory path, fold both paths. While doing so, drop the pr_alert(), as extra log messages aren't going to help the situation (the kernel will log oom conditions already anyway). This is XSA-365. Signed-off-by: Jan Beulich Reviewed-by: Juergen Gross Reviewed-by: Julien Grall Signed-off-by: Juergen Gross --- drivers/block/xen-blkback/blkback.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index ee62b3c0b55a..da16121140ca 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -794,8 +794,13 @@ static int xen_blkbk_map(struct xen_blkif_ring *ring, pages[i]->persistent_gnt = persistent_gnt; } else { if (gnttab_page_cache_get(&ring->free_pages, - &pages[i]->page)) - goto out_of_memory; + &pages[i]->page)) { + gnttab_page_cache_put(&ring->free_pages, + pages_to_gnt, + segs_to_map); + ret = -ENOMEM; + goto out; + } addr = vaddr(pages[i]->page); pages_to_gnt[segs_to_map] = pages[i]->page; pages[i]->persistent_gnt = NULL; @@ -880,17 +885,18 @@ static int xen_blkbk_map(struct xen_blkif_ring *ring, } segs_to_map = 0; last_map = map_until; - if (map_until != num) + if (!ret && map_until != num) goto again; - return ret; - -out_of_memory: - pr_alert("%s: out of memory\n", __func__); - gnttab_page_cache_put(&ring->free_pages, pages_to_gnt, segs_to_map); - for (i = last_map; i < num; i++) +out: + for (i = last_map; i < num; i++) { + /* Don't zap current batch's valid persistent grants. */ + if(i >= last_map + segs_to_map) + pages[i]->persistent_gnt = NULL; pages[i]->handle = BLKBACK_INVALID_HANDLE; - return -ENOMEM; + } + + return ret; } static int xen_blkbk_map_seg(struct pending_req *pending_req)